summaryrefslogtreecommitdiffstats
path: root/third_party/aom/av1
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/aom/av1')
-rw-r--r--third_party/aom/av1/av1.cmake469
-rw-r--r--third_party/aom/av1/av1_cx_iface.c1908
-rw-r--r--third_party/aom/av1/av1_dx_iface.c1328
-rw-r--r--third_party/aom/av1/av1_iface_common.h136
-rw-r--r--third_party/aom/av1/common/alloccommon.c300
-rw-r--r--third_party/aom/av1/common/alloccommon.h48
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.c3231
-rw-r--r--third_party/aom/av1/common/arm/av1_inv_txfm_neon.h154
-rw-r--r--third_party/aom/av1/common/arm/av1_txfm_neon.c28
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_hmask_neon.c134
-rw-r--r--third_party/aom/av1/common/arm/blend_a64_vmask_neon.c141
-rw-r--r--third_party/aom/av1/common/arm/cfl_neon.c584
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.c1455
-rw-r--r--third_party/aom/av1/common/arm/convolve_neon.h228
-rw-r--r--third_party/aom/av1/common/arm/jnt_convolve_neon.c1740
-rw-r--r--third_party/aom/av1/common/arm/mem_neon.h494
-rw-r--r--third_party/aom/av1/common/arm/reconinter_neon.c86
-rw-r--r--third_party/aom/av1/common/arm/selfguided_neon.c1508
-rw-r--r--third_party/aom/av1/common/arm/transpose_neon.h537
-rw-r--r--third_party/aom/av1/common/arm/warp_plane_neon.c714
-rw-r--r--third_party/aom/av1/common/arm/wiener_convolve_neon.c530
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.c1846
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d.h61
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm1d_cfg.h47
-rw-r--r--third_party/aom/av1/common/av1_inv_txfm2d.c505
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.c2377
-rw-r--r--third_party/aom/av1/common/av1_loopfilter.h227
-rw-r--r--third_party/aom/av1/common/av1_rtcd.c22
-rwxr-xr-xthird_party/aom/av1/common/av1_rtcd_defs.pl398
-rw-r--r--third_party/aom/av1/common/av1_txfm.c160
-rw-r--r--third_party/aom/av1/common/av1_txfm.h232
-rw-r--r--third_party/aom/av1/common/blockd.c140
-rw-r--r--third_party/aom/av1/common/blockd.h1176
-rw-r--r--third_party/aom/av1/common/cdef.c403
-rw-r--r--third_party/aom/av1/common/cdef.h51
-rw-r--r--third_party/aom/av1/common/cdef_block.c257
-rw-r--r--third_party/aom/av1/common/cdef_block.h59
-rw-r--r--third_party/aom/av1/common/cdef_block_avx2.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_neon.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_simd.h920
-rw-r--r--third_party/aom/av1/common/cdef_block_sse2.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_sse4.c14
-rw-r--r--third_party/aom/av1/common/cdef_block_ssse3.c14
-rw-r--r--third_party/aom/av1/common/cfl.c448
-rw-r--r--third_party/aom/av1/common/cfl.h302
-rw-r--r--third_party/aom/av1/common/common.h63
-rw-r--r--third_party/aom/av1/common/common_data.h446
-rw-r--r--third_party/aom/av1/common/convolve.c1295
-rw-r--r--third_party/aom/av1/common/convolve.h125
-rw-r--r--third_party/aom/av1/common/debugmodes.c107
-rw-r--r--third_party/aom/av1/common/entropy.c178
-rw-r--r--third_party/aom/av1/common/entropy.h181
-rw-r--r--third_party/aom/av1/common/entropymode.c1103
-rw-r--r--third_party/aom/av1/common/entropymode.h212
-rw-r--r--third_party/aom/av1/common/entropymv.c67
-rw-r--r--third_party/aom/av1/common/entropymv.h104
-rw-r--r--third_party/aom/av1/common/enums.h619
-rw-r--r--third_party/aom/av1/common/filter.h214
-rw-r--r--third_party/aom/av1/common/frame_buffers.c91
-rw-r--r--third_party/aom/av1/common/frame_buffers.h60
-rw-r--r--third_party/aom/av1/common/idct.c322
-rw-r--r--third_party/aom/av1/common/idct.h67
-rw-r--r--third_party/aom/av1/common/mv.h301
-rw-r--r--third_party/aom/av1/common/mvref_common.c1523
-rw-r--r--third_party/aom/av1/common/mvref_common.h361
-rw-r--r--third_party/aom/av1/common/obmc.h91
-rw-r--r--third_party/aom/av1/common/obu_util.c147
-rw-r--r--third_party/aom/av1/common/obu_util.h47
-rw-r--r--third_party/aom/av1/common/odintrin.c541
-rw-r--r--third_party/aom/av1/common/odintrin.h96
-rw-r--r--third_party/aom/av1/common/onyxc_int.h1342
-rw-r--r--third_party/aom/av1/common/ppc/cfl_ppc.c152
-rw-r--r--third_party/aom/av1/common/pred_common.c501
-rw-r--r--third_party/aom/av1/common/pred_common.h360
-rw-r--r--third_party/aom/av1/common/quant_common.c13676
-rw-r--r--third_party/aom/av1/common/quant_common.h63
-rw-r--r--third_party/aom/av1/common/reconinter.c1162
-rw-r--r--third_party/aom/av1/common/reconinter.h365
-rw-r--r--third_party/aom/av1/common/reconintra.c1640
-rw-r--r--third_party/aom/av1/common/reconintra.h119
-rw-r--r--third_party/aom/av1/common/resize.c1280
-rw-r--r--third_party/aom/av1/common/resize.h112
-rw-r--r--third_party/aom/av1/common/restoration.c1556
-rw-r--r--third_party/aom/av1/common/restoration.h377
-rw-r--r--third_party/aom/av1/common/scale.c126
-rw-r--r--third_party/aom/av1/common/scale.h67
-rw-r--r--third_party/aom/av1/common/scan.c3735
-rw-r--r--third_party/aom/av1/common/scan.h55
-rw-r--r--third_party/aom/av1/common/seg_common.c84
-rw-r--r--third_party/aom/av1/common/seg_common.h104
-rw-r--r--third_party/aom/av1/common/thread_common.c786
-rw-r--r--third_party/aom/av1/common/thread_common.h119
-rw-r--r--third_party/aom/av1/common/tile_common.c207
-rw-r--r--third_party/aom/av1/common/tile_common.h72
-rw-r--r--third_party/aom/av1/common/timing.c79
-rw-r--r--third_party/aom/av1/common/timing.h59
-rw-r--r--third_party/aom/av1/common/token_cdfs.h3555
-rw-r--r--third_party/aom/av1/common/txb_common.c475
-rw-r--r--third_party/aom/av1/common/txb_common.h424
-rw-r--r--third_party/aom/av1/common/warped_motion.c1148
-rw-r--r--third_party/aom/av1/common/warped_motion.h95
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c228
-rw-r--r--third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c499
-rw-r--r--third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c205
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c1945
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h71
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c2923
-rw-r--r--third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h232
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse2.h317
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.c21
-rw-r--r--third_party/aom/av1/common/x86/av1_txfm_sse4.h72
-rw-r--r--third_party/aom/av1/common/x86/cfl_avx2.c491
-rw-r--r--third_party/aom/av1/common/x86/cfl_simd.h243
-rw-r--r--third_party/aom/av1/common/x86/cfl_sse2.c89
-rw-r--r--third_party/aom/av1/common/x86/cfl_ssse3.c393
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_avx2.c283
-rw-r--r--third_party/aom/av1/common/x86/convolve_2d_sse2.c472
-rw-r--r--third_party/aom/av1/common/x86/convolve_avx2.c277
-rw-r--r--third_party/aom/av1/common/x86/convolve_sse2.c338
-rw-r--r--third_party/aom/av1/common/x86/filterintra_sse4.c75
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c326
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c191
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c420
-rw-r--r--third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c217
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c1349
-rw-r--r--third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c5348
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c846
-rw-r--r--third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c383
-rw-r--r--third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h125
-rw-r--r--third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c624
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c245
-rw-r--r--third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c202
-rw-r--r--third_party/aom/av1/common/x86/intra_edge_sse4.c318
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_avx2.c633
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_sse2.c385
-rw-r--r--third_party/aom/av1/common/x86/jnt_convolve_ssse3.c232
-rw-r--r--third_party/aom/av1/common/x86/reconinter_avx2.c620
-rw-r--r--third_party/aom/av1/common/x86/reconinter_sse4.c153
-rw-r--r--third_party/aom/av1/common/x86/reconinter_ssse3.c116
-rw-r--r--third_party/aom/av1/common/x86/selfguided_avx2.c724
-rw-r--r--third_party/aom/av1/common/x86/selfguided_sse4.c660
-rw-r--r--third_party/aom/av1/common/x86/warp_plane_sse4.c942
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_avx2.c261
-rw-r--r--third_party/aom/av1/common/x86/wiener_convolve_sse2.c199
-rw-r--r--third_party/aom/av1/decoder/accounting.c138
-rw-r--r--third_party/aom/av1/decoder/accounting.h82
-rw-r--r--third_party/aom/av1/decoder/decodeframe.c5567
-rw-r--r--third_party/aom/av1/decoder/decodeframe.h85
-rw-r--r--third_party/aom/av1/decoder/decodemv.c1560
-rw-r--r--third_party/aom/av1/decoder/decodemv.h35
-rw-r--r--third_party/aom/av1/decoder/decoder.c575
-rw-r--r--third_party/aom/av1/decoder/decoder.h317
-rw-r--r--third_party/aom/av1/decoder/decodetxb.c362
-rw-r--r--third_party/aom/av1/decoder/decodetxb.h32
-rw-r--r--third_party/aom/av1/decoder/detokenize.c78
-rw-r--r--third_party/aom/av1/decoder/detokenize.h29
-rw-r--r--third_party/aom/av1/decoder/dthread.c192
-rw-r--r--third_party/aom/av1/decoder/dthread.h82
-rw-r--r--third_party/aom/av1/decoder/inspection.c117
-rw-r--r--third_party/aom/av1/decoder/inspection.h84
-rw-r--r--third_party/aom/av1/decoder/obu.c839
-rw-r--r--third_party/aom/av1/decoder/obu.h31
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.c172
-rw-r--r--third_party/aom/av1/encoder/aq_complexity.h37
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.c580
-rw-r--r--third_party/aom/av1/encoder/aq_cyclicrefresh.h98
-rw-r--r--third_party/aom/av1/encoder/aq_variance.c202
-rw-r--r--third_party/aom/av1/encoder/aq_variance.h33
-rw-r--r--third_party/aom/av1/encoder/arm/neon/quantize_neon.c118
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.c1885
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d.h49
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h19
-rw-r--r--third_party/aom/av1/encoder/av1_fwd_txfm2d.c431
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.c738
-rw-r--r--third_party/aom/av1/encoder/av1_quantize.h148
-rw-r--r--third_party/aom/av1/encoder/bitstream.c3999
-rw-r--r--third_party/aom/av1/encoder/bitstream.h51
-rw-r--r--third_party/aom/av1/encoder/block.h452
-rw-r--r--third_party/aom/av1/encoder/blockiness.c142
-rw-r--r--third_party/aom/av1/encoder/context_tree.c215
-rw-r--r--third_party/aom/av1/encoder/context_tree.h114
-rw-r--r--third_party/aom/av1/encoder/corner_detect.c37
-rw-r--r--third_party/aom/av1/encoder/corner_detect.h22
-rw-r--r--third_party/aom/av1/encoder/corner_match.c191
-rw-r--r--third_party/aom/av1/encoder/corner_match.h33
-rw-r--r--third_party/aom/av1/encoder/cost.c46
-rw-r--r--third_party/aom/av1/encoder/cost.h47
-rw-r--r--third_party/aom/av1/encoder/dwt.c155
-rw-r--r--third_party/aom/av1/encoder/dwt.h25
-rw-r--r--third_party/aom/av1/encoder/encodeframe.c5739
-rw-r--r--third_party/aom/av1/encoder/encodeframe.h47
-rw-r--r--third_party/aom/av1/encoder/encodemb.c649
-rw-r--r--third_party/aom/av1/encoder/encodemb.h96
-rw-r--r--third_party/aom/av1/encoder/encodemv.c239
-rw-r--r--third_party/aom/av1/encoder/encodemv.h55
-rw-r--r--third_party/aom/av1/encoder/encoder.c6437
-rw-r--r--third_party/aom/av1/encoder/encoder.h985
-rw-r--r--third_party/aom/av1/encoder/encodetxb.c2062
-rw-r--r--third_party/aom/av1/encoder/encodetxb.h87
-rw-r--r--third_party/aom/av1/encoder/ethread.c261
-rw-r--r--third_party/aom/av1/encoder/ethread.h37
-rw-r--r--third_party/aom/av1/encoder/extend.c188
-rw-r--r--third_party/aom/av1/encoder/extend.h32
-rw-r--r--third_party/aom/av1/encoder/firstpass.c3480
-rw-r--r--third_party/aom/av1/encoder/firstpass.h208
-rw-r--r--third_party/aom/av1/encoder/global_motion.c298
-rw-r--r--third_party/aom/av1/encoder/global_motion.h64
-rw-r--r--third_party/aom/av1/encoder/grain_test_vectors.h781
-rw-r--r--third_party/aom/av1/encoder/hash.c125
-rw-r--r--third_party/aom/av1/encoder/hash.h52
-rw-r--r--third_party/aom/av1/encoder/hash_motion.c482
-rw-r--r--third_party/aom/av1/encoder/hash_motion.h78
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.c390
-rw-r--r--third_party/aom/av1/encoder/hybrid_fwd_txfm.h31
-rw-r--r--third_party/aom/av1/encoder/k_means_template.h123
-rw-r--r--third_party/aom/av1/encoder/lookahead.c210
-rw-r--r--third_party/aom/av1/encoder/lookahead.h106
-rw-r--r--third_party/aom/av1/encoder/mathutils.h359
-rw-r--r--third_party/aom/av1/encoder/mbgraph.c401
-rw-r--r--third_party/aom/av1/encoder/mbgraph.h41
-rw-r--r--third_party/aom/av1/encoder/mcomp.c2885
-rw-r--r--third_party/aom/av1/encoder/mcomp.h161
-rw-r--r--third_party/aom/av1/encoder/mips/msa/error_msa.c109
-rw-r--r--third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c46
-rw-r--r--third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c285
-rw-r--r--third_party/aom/av1/encoder/ml.c73
-rw-r--r--third_party/aom/av1/encoder/ml.h49
-rw-r--r--third_party/aom/av1/encoder/palette.c154
-rw-r--r--third_party/aom/av1/encoder/palette.h96
-rw-r--r--third_party/aom/av1/encoder/partition_model_weights.h2448
-rw-r--r--third_party/aom/av1/encoder/pickcdef.c526
-rw-r--r--third_party/aom/av1/encoder/picklpf.c263
-rw-r--r--third_party/aom/av1/encoder/picklpf.h30
-rw-r--r--third_party/aom/av1/encoder/pickrst.c1362
-rw-r--r--third_party/aom/av1/encoder/pickrst.h46
-rw-r--r--third_party/aom/av1/encoder/pustats.h198
-rw-r--r--third_party/aom/av1/encoder/random.h29
-rw-r--r--third_party/aom/av1/encoder/ransac.c603
-rw-r--r--third_party/aom/av1/encoder/ransac.h35
-rw-r--r--third_party/aom/av1/encoder/rate_distortion_model_params.h591
-rw-r--r--third_party/aom/av1/encoder/ratectrl.c1776
-rw-r--r--third_party/aom/av1/encoder/ratectrl.h295
-rw-r--r--third_party/aom/av1/encoder/rd.c1512
-rw-r--r--third_party/aom/av1/encoder/rd.h464
-rw-r--r--third_party/aom/av1/encoder/rdopt.c12199
-rw-r--r--third_party/aom/av1/encoder/rdopt.h138
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.c627
-rw-r--r--third_party/aom/av1/encoder/reconinter_enc.h127
-rw-r--r--third_party/aom/av1/encoder/segmentation.c244
-rw-r--r--third_party/aom/av1/encoder/segmentation.h38
-rw-r--r--third_party/aom/av1/encoder/speed_features.c564
-rw-r--r--third_party/aom/av1/encoder/speed_features.h568
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.c602
-rw-r--r--third_party/aom/av1/encoder/temporal_filter.h25
-rw-r--r--third_party/aom/av1/encoder/tokenize.c248
-rw-r--r--third_party/aom/av1/encoder/tokenize.h73
-rw-r--r--third_party/aom/av1/encoder/tx_prune_model_weights.h1944
-rw-r--r--third_party/aom/av1/encoder/wedge_utils.c125
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c1217
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c2068
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c365
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h103
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c2889
-rw-r--r--third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h117
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c137
-rw-r--r--third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c195
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_avx2.c330
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_sse2.c189
-rw-r--r--third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm204
-rw-r--r--third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm222
-rw-r--r--third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h142
-rw-r--r--third_party/aom/av1/encoder/x86/corner_match_sse4.c103
-rw-r--r--third_party/aom/av1/encoder/x86/dct_sse2.asm82
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_avx2.c130
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse2.c505
-rw-r--r--third_party/aom/av1/encoder/x86/encodetxb_sse4.c92
-rw-r--r--third_party/aom/av1/encoder/x86/error_intrin_avx2.c88
-rw-r--r--third_party/aom/av1/encoder/x86/error_sse2.asm79
-rw-r--r--third_party/aom/av1/encoder/x86/hash_sse42.c51
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c72
-rw-r--r--third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c1783
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_avx2.c403
-rw-r--r--third_party/aom/av1/encoder/x86/pickrst_sse4.c389
-rw-r--r--third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm217
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_avx2.c215
-rw-r--r--third_party/aom/av1/encoder/x86/wedge_utils_sse2.c254
-rw-r--r--third_party/aom/av1/exports_com2
-rw-r--r--third_party/aom/av1/exports_dec3
-rw-r--r--third_party/aom/av1/exports_enc2
-rw-r--r--third_party/aom/av1/exports_test2
290 files changed, 180998 insertions, 0 deletions
diff --git a/third_party/aom/av1/av1.cmake b/third_party/aom/av1/av1.cmake
new file mode 100644
index 0000000000..3a7cd7ee13
--- /dev/null
+++ b/third_party/aom/av1/av1.cmake
@@ -0,0 +1,469 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AV1_AV1_CMAKE_) # Include guard: stop here if this file already ran.
+ return()
+endif() # AOM_AV1_AV1_CMAKE_
+set(AOM_AV1_AV1_CMAKE_ 1) # Mark this file as processed for later includes.
+
+list(APPEND AOM_AV1_COMMON_SOURCES
+ "${AOM_ROOT}/av1/av1_iface_common.h"
+ "${AOM_ROOT}/av1/common/alloccommon.c"
+ "${AOM_ROOT}/av1/common/alloccommon.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.c"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/common/av1_inv_txfm2d.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.c"
+ "${AOM_ROOT}/av1/common/av1_loopfilter.h"
+ "${AOM_ROOT}/av1/common/av1_txfm.c"
+ "${AOM_ROOT}/av1/common/av1_txfm.h"
+ "${AOM_ROOT}/av1/common/blockd.c"
+ "${AOM_ROOT}/av1/common/blockd.h"
+ "${AOM_ROOT}/av1/common/cdef.c"
+ "${AOM_ROOT}/av1/common/cdef.h"
+ "${AOM_ROOT}/av1/common/cdef_block.c"
+ "${AOM_ROOT}/av1/common/cdef_block.h"
+ "${AOM_ROOT}/av1/common/cfl.c"
+ "${AOM_ROOT}/av1/common/cfl.h"
+ "${AOM_ROOT}/av1/common/common.h"
+ "${AOM_ROOT}/av1/common/common_data.h"
+ "${AOM_ROOT}/av1/common/convolve.c"
+ "${AOM_ROOT}/av1/common/convolve.h"
+ "${AOM_ROOT}/av1/common/debugmodes.c"
+ "${AOM_ROOT}/av1/common/entropy.c"
+ "${AOM_ROOT}/av1/common/entropy.h"
+ "${AOM_ROOT}/av1/common/entropymode.c"
+ "${AOM_ROOT}/av1/common/entropymode.h"
+ "${AOM_ROOT}/av1/common/entropymv.c"
+ "${AOM_ROOT}/av1/common/entropymv.h"
+ "${AOM_ROOT}/av1/common/enums.h"
+ "${AOM_ROOT}/av1/common/filter.h"
+ "${AOM_ROOT}/av1/common/frame_buffers.c"
+ "${AOM_ROOT}/av1/common/frame_buffers.h"
+ "${AOM_ROOT}/av1/common/idct.c"
+ "${AOM_ROOT}/av1/common/idct.h"
+ "${AOM_ROOT}/av1/common/mv.h"
+ "${AOM_ROOT}/av1/common/mvref_common.c"
+ "${AOM_ROOT}/av1/common/mvref_common.h"
+ "${AOM_ROOT}/av1/common/obu_util.c"
+ "${AOM_ROOT}/av1/common/obu_util.h"
+ "${AOM_ROOT}/av1/common/odintrin.c"
+ "${AOM_ROOT}/av1/common/odintrin.h"
+ "${AOM_ROOT}/av1/common/onyxc_int.h"
+ "${AOM_ROOT}/av1/common/pred_common.c"
+ "${AOM_ROOT}/av1/common/pred_common.h"
+ "${AOM_ROOT}/av1/common/quant_common.c"
+ "${AOM_ROOT}/av1/common/quant_common.h"
+ "${AOM_ROOT}/av1/common/reconinter.c"
+ "${AOM_ROOT}/av1/common/reconinter.h"
+ "${AOM_ROOT}/av1/common/reconintra.c"
+ "${AOM_ROOT}/av1/common/reconintra.h"
+ "${AOM_ROOT}/av1/common/resize.c"
+ "${AOM_ROOT}/av1/common/resize.h"
+ "${AOM_ROOT}/av1/common/restoration.c"
+ "${AOM_ROOT}/av1/common/restoration.h"
+ "${AOM_ROOT}/av1/common/scale.c"
+ "${AOM_ROOT}/av1/common/scale.h"
+ "${AOM_ROOT}/av1/common/scan.c"
+ "${AOM_ROOT}/av1/common/scan.h"
+ "${AOM_ROOT}/av1/common/seg_common.c"
+ "${AOM_ROOT}/av1/common/seg_common.h"
+ "${AOM_ROOT}/av1/common/thread_common.c"
+ "${AOM_ROOT}/av1/common/thread_common.h"
+ "${AOM_ROOT}/av1/common/tile_common.c"
+ "${AOM_ROOT}/av1/common/tile_common.h"
+ "${AOM_ROOT}/av1/common/timing.c"
+ "${AOM_ROOT}/av1/common/timing.h"
+ "${AOM_ROOT}/av1/common/token_cdfs.h"
+ "${AOM_ROOT}/av1/common/txb_common.c"
+ "${AOM_ROOT}/av1/common/txb_common.h"
+ "${AOM_ROOT}/av1/common/warped_motion.c"
+ "${AOM_ROOT}/av1/common/warped_motion.h")
+
+list(APPEND AOM_AV1_DECODER_SOURCES
+ "${AOM_ROOT}/av1/av1_dx_iface.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.c"
+ "${AOM_ROOT}/av1/decoder/decodeframe.h"
+ "${AOM_ROOT}/av1/decoder/decodemv.c"
+ "${AOM_ROOT}/av1/decoder/decodemv.h"
+ "${AOM_ROOT}/av1/decoder/decoder.c"
+ "${AOM_ROOT}/av1/decoder/decoder.h"
+ "${AOM_ROOT}/av1/decoder/decodetxb.c"
+ "${AOM_ROOT}/av1/decoder/decodetxb.h"
+ "${AOM_ROOT}/av1/decoder/detokenize.c"
+ "${AOM_ROOT}/av1/decoder/detokenize.h"
+ "${AOM_ROOT}/av1/decoder/dthread.c"
+ "${AOM_ROOT}/av1/decoder/dthread.h"
+ "${AOM_ROOT}/av1/decoder/obu.h"
+ "${AOM_ROOT}/av1/decoder/obu.c")
+
+list(APPEND AOM_AV1_ENCODER_SOURCES
+ "${AOM_ROOT}/av1/av1_cx_iface.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.c"
+ "${AOM_ROOT}/av1/encoder/aq_complexity.h"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.c"
+ "${AOM_ROOT}/av1/encoder/aq_cyclicrefresh.h"
+ "${AOM_ROOT}/av1/encoder/aq_variance.c"
+ "${AOM_ROOT}/av1/encoder/aq_variance.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.c"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm1d_cfg.h"
+ "${AOM_ROOT}/av1/encoder/av1_fwd_txfm2d.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.c"
+ "${AOM_ROOT}/av1/encoder/av1_quantize.h"
+ "${AOM_ROOT}/av1/encoder/bitstream.c"
+ "${AOM_ROOT}/av1/encoder/bitstream.h"
+ "${AOM_ROOT}/av1/encoder/block.h"
+ "${AOM_ROOT}/av1/encoder/context_tree.c"
+ "${AOM_ROOT}/av1/encoder/context_tree.h"
+ "${AOM_ROOT}/av1/encoder/corner_detect.c"
+ "${AOM_ROOT}/av1/encoder/corner_detect.h"
+ "${AOM_ROOT}/av1/encoder/corner_match.c"
+ "${AOM_ROOT}/av1/encoder/corner_match.h"
+ "${AOM_ROOT}/av1/encoder/cost.c"
+ "${AOM_ROOT}/av1/encoder/cost.h"
+ "${AOM_ROOT}/av1/encoder/encodeframe.c"
+ "${AOM_ROOT}/av1/encoder/encodeframe.h"
+ "${AOM_ROOT}/av1/encoder/encodemb.c"
+ "${AOM_ROOT}/av1/encoder/encodemb.h"
+ "${AOM_ROOT}/av1/encoder/encodemv.c"
+ "${AOM_ROOT}/av1/encoder/encodemv.h"
+ "${AOM_ROOT}/av1/encoder/encoder.c"
+ "${AOM_ROOT}/av1/encoder/encoder.h"
+ "${AOM_ROOT}/av1/encoder/encodetxb.c"
+ "${AOM_ROOT}/av1/encoder/encodetxb.h"
+ "${AOM_ROOT}/av1/encoder/ethread.c"
+ "${AOM_ROOT}/av1/encoder/ethread.h"
+ "${AOM_ROOT}/av1/encoder/extend.c"
+ "${AOM_ROOT}/av1/encoder/extend.h"
+ "${AOM_ROOT}/av1/encoder/firstpass.c"
+ "${AOM_ROOT}/av1/encoder/firstpass.h"
+ "${AOM_ROOT}/av1/encoder/global_motion.c"
+ "${AOM_ROOT}/av1/encoder/global_motion.h"
+ "${AOM_ROOT}/av1/encoder/grain_test_vectors.h"
+ "${AOM_ROOT}/av1/encoder/hash.c"
+ "${AOM_ROOT}/av1/encoder/hash.h"
+ "${AOM_ROOT}/av1/encoder/hash_motion.c"
+ "${AOM_ROOT}/av1/encoder/hash_motion.h"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.c"
+ "${AOM_ROOT}/av1/encoder/hybrid_fwd_txfm.h"
+ "${AOM_ROOT}/av1/encoder/lookahead.c"
+ "${AOM_ROOT}/av1/encoder/lookahead.h"
+ "${AOM_ROOT}/av1/encoder/mbgraph.c"
+ "${AOM_ROOT}/av1/encoder/mbgraph.h"
+ "${AOM_ROOT}/av1/encoder/mcomp.c"
+ "${AOM_ROOT}/av1/encoder/mcomp.h"
+ "${AOM_ROOT}/av1/encoder/ml.c"
+ "${AOM_ROOT}/av1/encoder/ml.h"
+ "${AOM_ROOT}/av1/encoder/palette.c"
+ "${AOM_ROOT}/av1/encoder/palette.h"
+ "${AOM_ROOT}/av1/encoder/pickcdef.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.c"
+ "${AOM_ROOT}/av1/encoder/picklpf.h"
+ "${AOM_ROOT}/av1/encoder/pickrst.c"
+ "${AOM_ROOT}/av1/encoder/pickrst.h"
+ "${AOM_ROOT}/av1/encoder/ransac.c"
+ "${AOM_ROOT}/av1/encoder/ransac.h"
+ "${AOM_ROOT}/av1/encoder/ratectrl.c"
+ "${AOM_ROOT}/av1/encoder/ratectrl.h"
+ "${AOM_ROOT}/av1/encoder/rd.c"
+ "${AOM_ROOT}/av1/encoder/rd.h"
+ "${AOM_ROOT}/av1/encoder/rdopt.c"
+ "${AOM_ROOT}/av1/encoder/rdopt.h"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.c"
+ "${AOM_ROOT}/av1/encoder/reconinter_enc.h"
+ "${AOM_ROOT}/av1/encoder/segmentation.c"
+ "${AOM_ROOT}/av1/encoder/segmentation.h"
+ "${AOM_ROOT}/av1/encoder/speed_features.c"
+ "${AOM_ROOT}/av1/encoder/speed_features.h"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.c"
+ "${AOM_ROOT}/av1/encoder/temporal_filter.h"
+ "${AOM_ROOT}/av1/encoder/tokenize.c"
+ "${AOM_ROOT}/av1/encoder/tokenize.h"
+ "${AOM_ROOT}/av1/encoder/wedge_utils.c"
+ "${AOM_ROOT}/third_party/fastfeat/fast.c"
+ "${AOM_ROOT}/third_party/fastfeat/fast.h"
+ "${AOM_ROOT}/third_party/fastfeat/fast_9.c"
+ "${AOM_ROOT}/third_party/fastfeat/nonmax.c"
+ "${AOM_ROOT}/third_party/vector/vector.c"
+ "${AOM_ROOT}/third_party/vector/vector.h"
+ "${AOM_ROOT}/av1/encoder/dwt.c"
+ "${AOM_ROOT}/av1/encoder/dwt.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/av1/common/cdef_block_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/cfl_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_sse2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/av1/common/cdef_block_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h"
+ "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_ssse3.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/common/cdef_block_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_highbd_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/av1_txfm_sse4.h"
+ "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_warp_plane_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_sse4.c"
+ "${AOM_ROOT}/av1/common/x86/warp_plane_sse4.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/av1/common/cdef_block_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_avx2.h"
+ "${AOM_ROOT}/av1/common/x86/cfl_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_convolve_2d_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/highbd_wiener_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/jnt_convolve_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/reconinter_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/selfguided_avx2.c"
+ "${AOM_ROOT}/av1/common/x86/wiener_convolve_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSE2 "${AOM_ROOT}/av1/encoder/x86/dct_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/error_sse2.asm"
+ "${AOM_ROOT}/av1/encoder/x86/temporal_filter_apply_sse2.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_sse2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
+
+list(APPEND AOM_AV1_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm1d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/av1/encoder/x86/av1_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_highbd_quantize_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/error_intrin_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h"
+ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c"
+ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
+ "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
+ "${AOM_ROOT}/av1/encoder/mips/msa/temporal_filter_msa.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/av1/common/arm/av1_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/cfl_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/convolve_neon.h"
+ "${AOM_ROOT}/av1/common/arm/jnt_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/mem_neon.h"
+ "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
+ "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
+ "${AOM_ROOT}/av1/common/arm/selfguided_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.c"
+ "${AOM_ROOT}/av1/common/arm/av1_inv_txfm_neon.h"
+ "${AOM_ROOT}/av1/common/arm/warp_plane_neon.c"
+ "${AOM_ROOT}/av1/common/cdef_block_neon.c")
+
+list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2
+ "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
+list(APPEND AOM_AV1_COMMON_INTRIN_VSX "${AOM_ROOT}/av1/common/ppc/cfl_ppc.c")
+
+if(CONFIG_ACCOUNTING) # Accounting sources join the decoder list only if enabled.
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/accounting.c"
+ "${AOM_ROOT}/av1/decoder/accounting.h")
+endif()
+
+if(CONFIG_INSPECTION) # Inspection sources join the decoder list only if enabled.
+ list(APPEND AOM_AV1_DECODER_SOURCES "${AOM_ROOT}/av1/decoder/inspection.c"
+ "${AOM_ROOT}/av1/decoder/inspection.h")
+endif()
+
+if(CONFIG_INTERNAL_STATS) # blockiness.c joins the encoder list only if enabled.
+ list(APPEND AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/blockiness.c")
+endif()
+
+# Sets up the AV1 common/decoder/encoder object libraries and the per-ISA asm and
+# intrinsics libraries for target "aom", which must exist before this is called.
+function(setup_av1_targets)
+ add_library(aom_av1_common OBJECT ${AOM_AV1_COMMON_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_av1_common)
+
+ create_dummy_source_file("aom_av1" "c" "dummy_source_file")
+ add_library(aom_av1 OBJECT "${dummy_source_file}") # aom_av1 holds only the dummy file.
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_common>)
+ list(APPEND AOM_LIB_TARGETS aom_av1)
+
+ # Not all generators support libraries consisting only of object files. Add a
+ # dummy source file to the aom_av1 target.
+ add_dummy_source_file_to_target("aom_av1" "c")
+
+ if(CONFIG_AV1_DECODER) # Decoder objects are built and added to aom only if enabled.
+ add_library(aom_av1_decoder OBJECT ${AOM_AV1_DECODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_decoder>)
+ endif()
+
+ if(CONFIG_AV1_ENCODER) # Encoder objects are built and added to aom only if enabled.
+ add_library(aom_av1_encoder OBJECT ${AOM_AV1_ENCODER_SOURCES})
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} aom_av1_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_av1_encoder>)
+ endif()
+
+ if(HAVE_SSE2)
+ require_compiler_flag_nomsvc("-msse2" NO)
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE2" "aom")
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_ASM_SSE2) # Not populated in this file; skipped unless set elsewhere.
+ add_asm_library("aom_av1_decoder_sse2" "AOM_AV1_DECODER_ASM_SSE2" "aom")
+ endif()
+
+ if(AOM_AV1_DECODER_INTRIN_SSE2) # Not populated in this file; skipped unless set elsewhere.
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSE2" "aom")
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_asm_library("aom_av1_encoder_sse2" "AOM_AV1_ENCODER_ASM_SSE2" "aom")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3)
+ require_compiler_flag_nomsvc("-mssse3" NO)
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSSE3" "aom")
+
+ if(CONFIG_AV1_DECODER)
+ if(AOM_AV1_DECODER_INTRIN_SSSE3) # Not populated in this file; skipped unless set elsewhere.
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_decoder"
+ "AOM_AV1_DECODER_INTRIN_SSSE3" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1)
+ require_compiler_flag_nomsvc("-msse4.1" NO)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_SSE4_1" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64") # NOTE(review): SSSE3 asm is gated on HAVE_SSE4_1, not HAVE_SSSE3 - confirm intended.
+ add_asm_library("aom_av1_encoder_ssse3"
+ "AOM_AV1_ENCODER_ASM_SSSE3_X86_64" "aom")
+ endif()
+
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_1" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_SSE4_2) # SSE4.2 has encoder sources only; there is no common list for it here.
+ require_compiler_flag_nomsvc("-msse4.2" NO)
+ if(CONFIG_AV1_ENCODER)
+ if(AOM_AV1_ENCODER_INTRIN_SSE4_2)
+ add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+ endif()
+ endif()
+ endif()
+
+ if(HAVE_AVX2)
+ require_compiler_flag_nomsvc("-mavx2" NO)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_AVX2" "aom")
+
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_AVX2" "aom")
+ endif()
+ endif()
+
+ if(HAVE_NEON) # The NEON compile flag comes from AOM_INTRIN_NEON_FLAG, which is set outside this file.
+ if(AOM_AV1_COMMON_INTRIN_NEON)
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_NEON" "aom")
+ endif()
+
+ if(AOM_AV1_ENCODER_INTRIN_NEON) # NOTE(review): not gated on CONFIG_AV1_ENCODER, unlike the x86 encoder paths - confirm.
+ add_intrinsics_object_library("${AOM_INTRIN_NEON_FLAG}" "neon"
+ "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_NEON" "aom")
+ endif()
+ endif()
+
+ if(HAVE_VSX)
+ if(AOM_AV1_COMMON_INTRIN_VSX)
+ add_intrinsics_object_library("-mvsx -maltivec" "vsx" "aom_av1_common"
+ "AOM_AV1_COMMON_INTRIN_VSX" "aom")
+ endif()
+ endif()
+
+ if(HAVE_MSA) # NOTE(review): encoder-only sources, empty flag string, no CONFIG_AV1_ENCODER check - confirm.
+ add_intrinsics_object_library("" "msa" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_MSA" "aom")
+ endif()
+
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>) # aom_dsp is not created in this function.
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_scale>) # aom_scale is not created in this function.
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
+
+function(setup_av1_test_targets) # Empty body: this function defines no targets.
+endfunction()
diff --git a/third_party/aom/av1/av1_cx_iface.c b/third_party/aom/av1/av1_cx_iface.c
new file mode 100644
index 0000000000..3295f618aa
--- /dev/null
+++ b/third_party/aom/av1/av1_cx_iface.c
@@ -0,0 +1,1908 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom_ports/aom_once.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+
+#include "aom/aom_encoder.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#include "av1/av1_iface_common.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/firstpass.h"
+
+// NOTE(review): neither constant is referenced in the portion of this file
+// reviewed here; confirm the remaining users before changing or removing them.
+#define MAG_SIZE (4)
+#define MAX_NUM_ENHANCEMENT_LAYERS 3
+
+// Encoder settings that live outside aom_codec_enc_cfg_t. The defaults are in
+// default_extra_cfg; the ctrl_set_*() handlers edit a copy and commit it
+// through update_extra_cfg(), which runs validate_config() and then
+// set_encoder_config(). Ranges quoted below are the ones validate_config()
+// enforces.
+struct av1_extracfg {
+ int cpu_used; // range-checked to [0, 8]; copied to oxcf->speed
+ unsigned int enable_auto_alt_ref; // at most 2
+ unsigned int enable_auto_bwd_ref; // at most 2
+ unsigned int noise_sensitivity; // at most 6
+ unsigned int sharpness; // at most 7
+ unsigned int static_thresh;
+ unsigned int row_mt; // 0 or 1
+ unsigned int tile_columns; // log2 number of tile columns, at most 6
+ unsigned int tile_rows; // log2 number of tile rows, at most 6
+ unsigned int arnr_max_frames; // at most 15
+ unsigned int arnr_strength; // at most 6
+ unsigned int min_gf_interval; // 0 -> default decision
+ unsigned int max_gf_interval; // 0 -> default decision
+ aom_tune_metric tuning;
+ unsigned int cq_level; // constrained quality level, at most 63
+ unsigned int rc_max_intra_bitrate_pct;
+ unsigned int rc_max_inter_bitrate_pct;
+ unsigned int gf_cbr_boost_pct;
+ unsigned int lossless; // boolean; pins best/worst allowed q to 0
+ unsigned int enable_cdef;
+ unsigned int enable_restoration;
+ unsigned int disable_trellis_quant;
+ unsigned int enable_qm;
+ unsigned int qm_y;
+ unsigned int qm_u;
+ unsigned int qm_v;
+ unsigned int qm_min;
+ unsigned int qm_max;
+#if CONFIG_DIST_8X8
+ unsigned int enable_dist_8x8; // rejected together with lossless
+#endif
+ unsigned int num_tg; // max number of tile groups
+ unsigned int mtu_size;
+
+ aom_timing_info_type_t timing_info_type;
+ unsigned int frame_parallel_decoding_mode;
+ int use_dual_filter;
+ AQ_MODE aq_mode;
+ DELTAQ_MODE deltaq_mode;
+ unsigned int frame_periodic_boost; // 0 or 1
+ aom_bit_depth_t bit_depth;
+ aom_tune_content content;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int color_range; // 0 or 1
+ int render_width;
+ int render_height;
+ aom_superblock_size_t superblock_size;
+ unsigned int single_tile_decoding; // 0 or 1; used only with large_scale_tile
+ int error_resilient_mode;
+ int s_frame_mode;
+
+ int film_grain_test_vector; // range-checked to [0, 16]
+ const char *film_grain_table_filename;
+ unsigned int motion_vector_unit_test; // at most 2
+ unsigned int cdf_update_mode; // at most 2
+ int enable_order_hint;
+ int enable_jnt_comp; // effective only together with enable_order_hint
+ int enable_ref_frame_mvs; // sequence level
+ int allow_ref_frame_mvs; // frame level
+ int enable_warped_motion; // sequence level
+ int allow_warped_motion; // frame level
+ int enable_superres;
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
+
+ unsigned int chroma_subsampling_x; // 0 or 1
+ unsigned int chroma_subsampling_y; // 0 or 1
+};
+
+// Default value of every av1_extracfg field.
+//
+// Designated initializers tie each value to its field name, so the
+// conditionally compiled members (CONFIG_DIST_8X8, CONFIG_DENOISE) cannot
+// shift the entries that follow them, and a reordering of the struct cannot
+// silently reassign defaults.
+static struct av1_extracfg default_extra_cfg = {
+  .cpu_used = 0,
+  .enable_auto_alt_ref = 1,
+  .enable_auto_bwd_ref = 0,
+  .noise_sensitivity = 0,
+  .sharpness = CONFIG_SHARP_SETTINGS,
+  .static_thresh = 0,
+  .row_mt = 0,
+  .tile_columns = 0,
+  .tile_rows = 0,
+  .arnr_max_frames = 7,
+  .arnr_strength = 5,
+  .min_gf_interval = 0,  // 0 -> default decision
+  .max_gf_interval = 0,  // 0 -> default decision
+  .tuning = AOM_TUNE_PSNR,
+  .cq_level = 10,
+  .rc_max_intra_bitrate_pct = 0,
+  .rc_max_inter_bitrate_pct = 0,
+  .gf_cbr_boost_pct = 0,
+  .lossless = 0,
+  .enable_cdef = !CONFIG_SHARP_SETTINGS,
+  .enable_restoration = 1,
+  .disable_trellis_quant = 0,
+  .enable_qm = 0,
+  .qm_y = DEFAULT_QM_Y,
+  .qm_u = DEFAULT_QM_U,
+  .qm_v = DEFAULT_QM_V,
+  .qm_min = DEFAULT_QM_FIRST,
+  .qm_max = DEFAULT_QM_LAST,
+#if CONFIG_DIST_8X8
+  .enable_dist_8x8 = 0,
+#endif
+  .num_tg = 1,  // max number of tile groups
+  .mtu_size = 0,
+  // No picture timing signaling in bitstream.
+  .timing_info_type = AOM_TIMING_UNSPECIFIED,
+  .frame_parallel_decoding_mode = 1,
+  .use_dual_filter = 1,
+  .aq_mode = NO_AQ,
+  .deltaq_mode = NO_DELTA_Q,
+  .frame_periodic_boost = 0,
+  .bit_depth = AOM_BITS_8,
+  .content = AOM_CONTENT_DEFAULT,
+  .color_primaries = AOM_CICP_CP_UNSPECIFIED,
+  .transfer_characteristics = AOM_CICP_TC_UNSPECIFIED,
+  .matrix_coefficients = AOM_CICP_MC_UNSPECIFIED,
+  .chroma_sample_position = AOM_CSP_UNKNOWN,
+  .color_range = 0,
+  .render_width = 0,
+  .render_height = 0,
+  .superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC,
+  .single_tile_decoding = 1,  // this depends on large_scale_tile.
+  .error_resilient_mode = 0,  // off by default.
+  .s_frame_mode = 0,          // off by default.
+  .film_grain_test_vector = 0,
+  .film_grain_table_filename = NULL,
+  .motion_vector_unit_test = 0,
+  .cdf_update_mode = 1,
+  .enable_order_hint = 1,
+  .enable_jnt_comp = 1,
+  .enable_ref_frame_mvs = 1,  // sequence level
+  .allow_ref_frame_mvs = 1,   // frame level
+  .enable_warped_motion = 1,  // sequence level
+  .allow_warped_motion = 1,   // frame level
+  .enable_superres = 1,
+#if CONFIG_DENOISE
+  .noise_level = 0,
+  .noise_block_size = 32,
+#endif
+  .chroma_subsampling_x = 0,
+  .chroma_subsampling_y = 0,
+};
+
+// Private per-instance state of the AV1 encoder interface.
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base; // base.err_detail receives error text
+ aom_codec_enc_cfg_t cfg; // last configuration that passed validate_config()
+ struct av1_extracfg extra_cfg; // options changed through ctrl_set_*()
+ AV1EncoderConfig oxcf; // internal config filled by set_encoder_config()
+ AV1_COMP *cpi; // encoder that receives oxcf via av1_change_config()
+ unsigned char *cx_data;
+ size_t cx_data_sz;
+ unsigned char *pending_cx_data;
+ size_t pending_cx_data_sz;
+ int pending_frame_count;
+ size_t pending_frame_sizes[8];
+ aom_image_t preview_img;
+ // Flags queued for the next frame; encoder_set_config() ORs in
+ // AOM_EFLAG_FORCE_KF here.
+ aom_enc_frame_flags_t next_frame_flags;
+ aom_postproc_cfg_t preview_ppcfg;
+ aom_codec_pkt_list_decl(256) pkt_list;
+ unsigned int fixed_kf_cntr;
+ // BufferPool that holds all reference frames.
+ BufferPool *buffer_pool;
+};
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ const aom_codec_err_t res = error->error_code;
+
+ if (res != AOM_CODEC_OK)
+ ctx->base.err_detail = error->has_detail ? error->detail : NULL;
+
+ return res;
+}
+
+// Validation helpers. All of them expand to code that uses a variable named
+// `ctx` from the enclosing function and that RETURNS from that function, so
+// they may only be used inside functions returning aom_codec_err_t that have
+// an aom_codec_alg_priv_t *ctx in scope.
+
+// Records `str` as the error detail and returns AOM_CODEC_INVALID_PARAM.
+#undef ERROR
+#define ERROR(str) \
+ do { \
+ ctx->base.err_detail = str; \
+ return AOM_CODEC_INVALID_PARAM; \
+ } while (0)
+
+// Fails unless lo <= (p)->memb <= hi. Note that (p)->memb is evaluated twice
+// and that the message is built from the literal macro arguments.
+#define RANGE_CHECK(p, memb, lo, hi) \
+ do { \
+ if (!((p)->memb >= (lo) && (p)->memb <= (hi))) \
+ ERROR(#memb " out of range [" #lo ".." #hi "]"); \
+ } while (0)
+
+// Fails unless (p)->memb <= hi; there is no lower bound.
+#define RANGE_CHECK_HI(p, memb, hi) \
+ do { \
+ if (!((p)->memb <= (hi))) ERROR(#memb " out of range [.." #hi "]"); \
+ } while (0)
+
+// Fails unless (p)->memb is exactly 0 or 1.
+#define RANGE_CHECK_BOOL(p, memb) \
+ do { \
+ if (!!((p)->memb) != (p)->memb) ERROR(#memb " expected boolean"); \
+ } while (0)
+
+// Checks a public encoder configuration together with the extra
+// (control-settable) configuration for values this interface accepts.
+//
+// ctx: only ctx->base.err_detail is written, by the ERROR-based macros.
+// cfg, extra_cfg: candidate settings; neither is modified.
+//
+// Returns AOM_CODEC_OK when every check passes. The first failing check
+// returns AOM_CODEC_INVALID_PARAM immediately, so the order of the checks
+// decides which message the caller sees.
+static aom_codec_err_t validate_config(aom_codec_alg_priv_t *ctx,
+ const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
+ RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den);
+ RANGE_CHECK_HI(cfg, g_profile, MAX_PROFILES - 1);
+
+ RANGE_CHECK_HI(cfg, rc_max_quantizer, 63);
+ RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer);
+ RANGE_CHECK_BOOL(extra_cfg, lossless);
+ RANGE_CHECK_HI(extra_cfg, aq_mode, AQ_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, deltaq_mode, DELTAQ_MODE_COUNT - 1);
+ RANGE_CHECK_HI(extra_cfg, frame_periodic_boost, 1);
+ RANGE_CHECK_HI(cfg, g_threads, 64);
+ RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+ RANGE_CHECK(cfg, rc_end_usage, AOM_VBR, AOM_Q);
+ RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100);
+ RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
+ RANGE_CHECK(cfg, kf_mode, AOM_KF_DISABLED, AOM_KF_AUTO);
+ RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100);
+ RANGE_CHECK(cfg, g_pass, AOM_RC_ONE_PASS, AOM_RC_LAST_PASS);
+ RANGE_CHECK_HI(extra_cfg, min_gf_interval, MAX_LAG_BUFFERS - 1);
+ RANGE_CHECK_HI(extra_cfg, max_gf_interval, MAX_LAG_BUFFERS - 1);
+ // A non-zero max_gf_interval must additionally be at least 2 and not below
+ // min_gf_interval; zero is left alone.
+ if (extra_cfg->max_gf_interval > 0) {
+ RANGE_CHECK(extra_cfg, max_gf_interval, MAX(2, extra_cfg->min_gf_interval),
+ (MAX_LAG_BUFFERS - 1));
+ }
+
+ RANGE_CHECK_HI(cfg, rc_resize_mode, RESIZE_MODES - 1);
+ RANGE_CHECK(cfg, rc_resize_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_resize_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK_HI(cfg, rc_superres_mode, SUPERRES_MODES - 1);
+ RANGE_CHECK(cfg, rc_superres_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_kf_denominator, SCALE_NUMERATOR,
+ SCALE_NUMERATOR << 1);
+ RANGE_CHECK(cfg, rc_superres_qthresh, 1, 63);
+ RANGE_CHECK(cfg, rc_superres_kf_qthresh, 1, 63);
+ RANGE_CHECK_HI(extra_cfg, cdf_update_mode, 2);
+
+ // AV1 does not support a lower bound on the keyframe interval in
+ // automatic keyframe placement mode.
+ if (cfg->kf_mode != AOM_KF_DISABLED && cfg->kf_min_dist != cfg->kf_max_dist &&
+ cfg->kf_min_dist > 0)
+ ERROR(
+ "kf_min_dist not supported in auto mode, use 0 "
+ "or kf_max_dist instead.");
+
+ RANGE_CHECK_HI(extra_cfg, motion_vector_unit_test, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_alt_ref, 2);
+ RANGE_CHECK_HI(extra_cfg, enable_auto_bwd_ref, 2);
+ RANGE_CHECK(extra_cfg, cpu_used, 0, 8);
+ RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+ RANGE_CHECK(extra_cfg, superblock_size, AOM_SUPERBLOCK_SIZE_64X64,
+ AOM_SUPERBLOCK_SIZE_DYNAMIC);
+ RANGE_CHECK_HI(cfg, large_scale_tile, 1);
+ RANGE_CHECK_HI(extra_cfg, single_tile_decoding, 1);
+
+ RANGE_CHECK_HI(extra_cfg, row_mt, 1);
+
+ RANGE_CHECK_HI(extra_cfg, tile_columns, 6);
+ RANGE_CHECK_HI(extra_cfg, tile_rows, 6);
+
+ RANGE_CHECK_HI(cfg, monochrome, 1);
+
+ // Any aq_mode other than 0 is rejected in large-scale-tile coding.
+ if (cfg->large_scale_tile && extra_cfg->aq_mode)
+ ERROR(
+ "Adaptive quantization are not supported in large scale tile "
+ "coding.");
+
+ RANGE_CHECK_HI(extra_cfg, sharpness, 7);
+ RANGE_CHECK_HI(extra_cfg, arnr_max_frames, 15);
+ RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
+ RANGE_CHECK_HI(extra_cfg, cq_level, 63);
+ RANGE_CHECK(cfg, g_bit_depth, AOM_BITS_8, AOM_BITS_12);
+ RANGE_CHECK(cfg, g_input_bit_depth, 8, 12);
+ RANGE_CHECK(extra_cfg, content, AOM_CONTENT_DEFAULT, AOM_CONTENT_INVALID - 1);
+
+ // TODO(yaowu): remove this when ssim tuning is implemented for av1
+ if (extra_cfg->tuning == AOM_TUNE_SSIM)
+ ERROR("Option --tune=ssim is not currently supported in AV1.");
+
+ // The last pass of a two-pass encode needs first-pass stats: a non-NULL
+ // buffer holding a whole number of FIRSTPASS_STATS packets, at least two of
+ // them, whose final packet has a count equal to the number of packets that
+ // precede it.
+ if (cfg->g_pass == AOM_RC_LAST_PASS) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ // Computed up front, but only used after the buffer and size checks below.
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ const FIRSTPASS_STATS *stats;
+
+ if (cfg->rc_twopass_stats_in.buf == NULL)
+ ERROR("rc_twopass_stats_in.buf not set.");
+
+ if (cfg->rc_twopass_stats_in.sz % packet_sz)
+ ERROR("rc_twopass_stats_in.sz indicates truncated packet.");
+
+ if (cfg->rc_twopass_stats_in.sz < 2 * packet_sz)
+ ERROR("rc_twopass_stats_in requires at least two packets.");
+
+ // Points at the final packet in the buffer.
+ stats =
+ (const FIRSTPASS_STATS *)cfg->rc_twopass_stats_in.buf + n_packets - 1;
+
+ if ((int)(stats->count + 0.5) != n_packets - 1)
+ ERROR("rc_twopass_stats_in missing EOS stats packet");
+ }
+
+ // Profiles up to PROFILE_1 are limited to 10 bits, for both the coded and
+ // the source bit depth.
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_bit_depth > AOM_BITS_10) {
+ ERROR("Codec bit-depth 12 not supported in profile < 2");
+ }
+ if (cfg->g_profile <= (unsigned int)PROFILE_1 &&
+ cfg->g_input_bit_depth > 10) {
+ ERROR("Source bit-depth 12 not supported in profile < 2");
+ }
+
+ RANGE_CHECK(extra_cfg, color_primaries, AOM_CICP_CP_BT_709,
+ AOM_CICP_CP_EBU_3213); // Need to check range more precisely to
+ // check for reserved values?
+ RANGE_CHECK(extra_cfg, transfer_characteristics, AOM_CICP_TC_BT_709,
+ AOM_CICP_TC_HLG);
+ RANGE_CHECK(extra_cfg, matrix_coefficients, AOM_CICP_MC_IDENTITY,
+ AOM_CICP_MC_ICTCP);
+ RANGE_CHECK(extra_cfg, color_range, 0, 1);
+
+#if CONFIG_DIST_8X8
+ RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_DAALA_DIST);
+#else
+ // AOM_TUNE_SSIM is the upper bound here but has already been rejected above.
+ RANGE_CHECK(extra_cfg, tuning, AOM_TUNE_PSNR, AOM_TUNE_SSIM);
+#endif
+
+ RANGE_CHECK(extra_cfg, timing_info_type, AOM_TIMING_UNSPECIFIED,
+ AOM_TIMING_DEC_MODEL);
+
+ RANGE_CHECK(extra_cfg, film_grain_test_vector, 0, 16);
+
+ // Lossless excludes adaptive quantization (and dist-8x8 when compiled in).
+ if (extra_cfg->lossless) {
+ if (extra_cfg->aq_mode != 0)
+ ERROR("Only --aq_mode=0 can be used with --lossless=1.");
+#if CONFIG_DIST_8X8
+ if (extra_cfg->enable_dist_8x8)
+ ERROR("dist-8x8 cannot be used with lossless compression.");
+#endif
+ }
+
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_x, 1);
+ RANGE_CHECK_HI(extra_cfg, chroma_subsampling_y, 1);
+
+ return AOM_CODEC_OK;
+}
+
+// Checks that an input image can be encoded with the current configuration:
+// its pixel format must be allowed by the configured profile and its display
+// size must equal the configured frame size. On failure the ERROR macro sets
+// ctx->base.err_detail and returns AOM_CODEC_INVALID_PARAM.
+static aom_codec_err_t validate_img(aom_codec_alg_priv_t *ctx,
+                                    const aom_image_t *img) {
+  switch (img->fmt) {
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_I420:
+    case AOM_IMG_FMT_I42016:
+      // 4:2:0 input is accepted in every profile.
+      break;
+
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I44416:
+      // In profile 0, 4:4:4 input is only accepted for monochrome encodes.
+      if (!ctx->cfg.monochrome &&
+          ctx->cfg.g_profile == (unsigned int)PROFILE_0) {
+        ERROR("Invalid image format. I444 images not supported in profile.");
+      }
+      break;
+
+    case AOM_IMG_FMT_I422:
+    case AOM_IMG_FMT_I42216:
+      // 4:2:2 input requires profile 2.
+      if (!(ctx->cfg.g_profile == (unsigned int)PROFILE_2)) {
+        ERROR("Invalid image format. I422 images not supported in profile.");
+      }
+      break;
+
+    default:
+      ERROR(
+          "Invalid image format. Only YV12, I420, I422, I444 images are "
+          "supported.");
+      break;
+  }
+
+  if (!(img->d_w == ctx->cfg.g_w && img->d_h == ctx->cfg.g_h)) {
+    ERROR("Image size must match encoder init configuration size");
+  }
+
+  return AOM_CODEC_OK;
+}
+
+// Returns the number of bits per pixel implied by the image format. For a
+// format that is not listed the function asserts, and returns 0 when
+// assertions are compiled out.
+static int get_image_bps(const aom_image_t *img) {
+  int bits_per_pixel = 0;
+
+  switch (img->fmt) {
+    case AOM_IMG_FMT_YV12:
+    case AOM_IMG_FMT_I420: bits_per_pixel = 12; break;
+    case AOM_IMG_FMT_I422: bits_per_pixel = 16; break;
+    case AOM_IMG_FMT_I444:
+    case AOM_IMG_FMT_I42016: bits_per_pixel = 24; break;
+    case AOM_IMG_FMT_I42216: bits_per_pixel = 32; break;
+    case AOM_IMG_FMT_I44416: bits_per_pixel = 48; break;
+    default: assert(0 && "Invalid image format"); break;
+  }
+
+  return bits_per_pixel;
+}
+
+// Puts every super-resolution field of oxcf into its "off" state: mode
+// SUPERRES_NONE, both scale denominators equal to SCALE_NUMERATOR, and both
+// q thresholds at 255.
+static void disable_superres(AV1EncoderConfig *const oxcf) {
+  oxcf->superres_kf_qthresh = 255;
+  oxcf->superres_qthresh = 255;
+
+  oxcf->superres_kf_scale_denominator = SCALE_NUMERATOR;
+  oxcf->superres_scale_denominator = SCALE_NUMERATOR;
+
+  oxcf->superres_mode = SUPERRES_NONE;
+}
+
+// Translates the public configuration (cfg) and the extra configuration
+// (extra_cfg) into the encoder's internal AV1EncoderConfig (oxcf).
+//
+// Nothing is validated here; encoder_set_config() and update_extra_cfg() run
+// validate_config() before calling this. oxcf->cfg is set to point into *cfg,
+// so cfg must stay alive for as long as that pointer is used.
+// Always returns AOM_CODEC_OK.
+static aom_codec_err_t set_encoder_config(
+ AV1EncoderConfig *oxcf, const aom_codec_enc_cfg_t *cfg,
+ const struct av1_extracfg *extra_cfg) {
+ const int is_vbr = cfg->rc_end_usage == AOM_VBR;
+ oxcf->profile = cfg->g_profile;
+ oxcf->fwd_kf_enabled = cfg->fwd_kf_enabled;
+ oxcf->max_threads = (int)cfg->g_threads;
+ oxcf->width = cfg->g_w;
+ oxcf->height = cfg->g_h;
+ oxcf->forced_max_frame_width = cfg->g_forced_max_frame_width;
+ oxcf->forced_max_frame_height = cfg->g_forced_max_frame_height;
+ oxcf->bit_depth = cfg->g_bit_depth;
+ oxcf->input_bit_depth = cfg->g_input_bit_depth;
+ // The initial frame rate is the reciprocal of the timebase.
+ oxcf->init_framerate = (double)cfg->g_timebase.den / cfg->g_timebase.num;
+ // Both AOM_TIMING_EQUAL and AOM_TIMING_DEC_MODEL signal timing info.
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL ||
+ extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ oxcf->timing_info_present = 1;
+ oxcf->timing_info.num_units_in_display_tick = cfg->g_timebase.num;
+ oxcf->timing_info.time_scale = cfg->g_timebase.den;
+ oxcf->timing_info.num_ticks_per_picture = 1;
+ } else {
+ oxcf->timing_info_present = 0;
+ }
+ if (extra_cfg->timing_info_type == AOM_TIMING_EQUAL) {
+ oxcf->timing_info.equal_picture_interval = 1;
+ oxcf->decoder_model_info_present_flag = 0;
+ oxcf->display_model_info_present_flag = 1;
+ } else if (extra_cfg->timing_info_type == AOM_TIMING_DEC_MODEL) {
+ // if( extra_cfg->arnr_strength > 0 )
+ // {
+ // printf("Only --arnr-strength=0 can currently be used with
+ // --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+ // }
+ // if( extra_cfg->enable_superres)
+ // {
+ // printf("Only --superres-mode=0 can currently be used with
+ // --timing-info=model."); return AOM_CODEC_INVALID_PARAM;
+ // }
+ oxcf->buffer_model.num_units_in_decoding_tick = cfg->g_timebase.num;
+ oxcf->timing_info.equal_picture_interval = 0;
+ oxcf->decoder_model_info_present_flag = 1;
+ oxcf->buffer_removal_time_present = 1;
+ oxcf->display_model_info_present_flag = 1;
+ }
+ // Guess a frame rate if out of whack: above 180 fps, use 30 and drop the
+ // timing info.
+ if (oxcf->init_framerate > 180) {
+ oxcf->init_framerate = 30;
+ oxcf->timing_info_present = 0;
+ }
+ oxcf->mode = GOOD;
+ oxcf->cfg = &cfg->cfg;
+
+ // No default case: oxcf->pass is left unchanged for any other g_pass value.
+ switch (cfg->g_pass) {
+ case AOM_RC_ONE_PASS: oxcf->pass = 0; break;
+ case AOM_RC_FIRST_PASS: oxcf->pass = 1; break;
+ case AOM_RC_LAST_PASS: oxcf->pass = 2; break;
+ }
+
+ // The first pass runs without lag.
+ oxcf->lag_in_frames =
+ cfg->g_pass == AOM_RC_FIRST_PASS ? 0 : cfg->g_lag_in_frames;
+ oxcf->rc_mode = cfg->rc_end_usage;
+
+ // Convert target bandwidth from Kbit/s to Bit/s
+ oxcf->target_bandwidth = 1000 * cfg->rc_target_bitrate;
+ oxcf->rc_max_intra_bitrate_pct = extra_cfg->rc_max_intra_bitrate_pct;
+ oxcf->rc_max_inter_bitrate_pct = extra_cfg->rc_max_inter_bitrate_pct;
+ oxcf->gf_cbr_boost_pct = extra_cfg->gf_cbr_boost_pct;
+
+ // Lossless pins both quantizer bounds to qindex 0.
+ oxcf->best_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_min_quantizer);
+ oxcf->worst_allowed_q =
+ extra_cfg->lossless ? 0 : av1_quantizer_to_qindex(cfg->rc_max_quantizer);
+ oxcf->cq_level = av1_quantizer_to_qindex(extra_cfg->cq_level);
+ oxcf->fixed_q = -1;
+
+ oxcf->enable_cdef = extra_cfg->enable_cdef;
+ oxcf->enable_restoration = extra_cfg->enable_restoration;
+ oxcf->disable_trellis_quant = extra_cfg->disable_trellis_quant;
+ oxcf->using_qm = extra_cfg->enable_qm;
+ oxcf->qm_y = extra_cfg->qm_y;
+ oxcf->qm_u = extra_cfg->qm_u;
+ oxcf->qm_v = extra_cfg->qm_v;
+ oxcf->qm_minlevel = extra_cfg->qm_min;
+ oxcf->qm_maxlevel = extra_cfg->qm_max;
+#if CONFIG_DIST_8X8
+ // The two *_DIST tuning metrics switch dist-8x8 on regardless of the flag.
+ oxcf->using_dist_8x8 = extra_cfg->enable_dist_8x8;
+ if (extra_cfg->tuning == AOM_TUNE_CDEF_DIST ||
+ extra_cfg->tuning == AOM_TUNE_DAALA_DIST)
+ oxcf->using_dist_8x8 = 1;
+#endif
+ oxcf->num_tile_groups = extra_cfg->num_tg;
+ // In large-scale tile encoding mode, num_tile_groups is always 1.
+ if (cfg->large_scale_tile) oxcf->num_tile_groups = 1;
+ oxcf->mtu = extra_cfg->mtu_size;
+
+ // FIXME(debargha): Should this be:
+ // oxcf->allow_ref_frame_mvs = extra_cfg->allow_ref_frame_mvs &
+ // extra_cfg->enable_order_hint ?
+ // Disallow using temporal MVs while large_scale_tile = 1.
+ oxcf->allow_ref_frame_mvs =
+ extra_cfg->allow_ref_frame_mvs && !cfg->large_scale_tile;
+ oxcf->under_shoot_pct = cfg->rc_undershoot_pct;
+ oxcf->over_shoot_pct = cfg->rc_overshoot_pct;
+
+ oxcf->resize_mode = (RESIZE_MODE)cfg->rc_resize_mode;
+ oxcf->resize_scale_denominator = (uint8_t)cfg->rc_resize_denominator;
+ oxcf->resize_kf_scale_denominator = (uint8_t)cfg->rc_resize_kf_denominator;
+ // Fixed resizing with both denominators equal to the numerator scales
+ // nothing, so it is treated as resizing off.
+ if (oxcf->resize_mode == RESIZE_FIXED &&
+ oxcf->resize_scale_denominator == SCALE_NUMERATOR &&
+ oxcf->resize_kf_scale_denominator == SCALE_NUMERATOR)
+ oxcf->resize_mode = RESIZE_NONE;
+
+ // Superres is switched off for lossless and large-scale-tile encodes.
+ if (extra_cfg->lossless || cfg->large_scale_tile) {
+ disable_superres(oxcf);
+ } else {
+ oxcf->superres_mode = (SUPERRES_MODE)cfg->rc_superres_mode;
+ oxcf->superres_scale_denominator = (uint8_t)cfg->rc_superres_denominator;
+ oxcf->superres_kf_scale_denominator =
+ (uint8_t)cfg->rc_superres_kf_denominator;
+ oxcf->superres_qthresh = av1_quantizer_to_qindex(cfg->rc_superres_qthresh);
+ oxcf->superres_kf_qthresh =
+ av1_quantizer_to_qindex(cfg->rc_superres_kf_qthresh);
+ // Fixed mode with identity denominators, or q-threshold mode with both
+ // thresholds at 255, is also reset to the disabled state.
+ if (oxcf->superres_mode == SUPERRES_FIXED &&
+ oxcf->superres_scale_denominator == SCALE_NUMERATOR &&
+ oxcf->superres_kf_scale_denominator == SCALE_NUMERATOR) {
+ disable_superres(oxcf);
+ }
+ if (oxcf->superres_mode == SUPERRES_QTHRESH &&
+ oxcf->superres_qthresh == 255 && oxcf->superres_kf_qthresh == 255) {
+ disable_superres(oxcf);
+ }
+ }
+
+ // VBR uses fixed buffer sizes; every other mode takes them from cfg.
+ oxcf->maximum_buffer_size_ms = is_vbr ? 240000 : cfg->rc_buf_sz;
+ oxcf->starting_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_initial_sz;
+ oxcf->optimal_buffer_level_ms = is_vbr ? 60000 : cfg->rc_buf_optimal_sz;
+
+ oxcf->drop_frames_water_mark = cfg->rc_dropframe_thresh;
+
+ oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct;
+ oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct;
+ oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct;
+
+ // Automatic keyframes need AOM_KF_AUTO and kf_min_dist != kf_max_dist.
+ oxcf->auto_key =
+ cfg->kf_mode == AOM_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist;
+
+ oxcf->key_freq = cfg->kf_max_dist;
+ oxcf->sframe_dist = cfg->sframe_dist;
+ oxcf->sframe_mode = cfg->sframe_mode;
+ oxcf->sframe_enabled = cfg->sframe_dist != 0;
+ oxcf->speed = extra_cfg->cpu_used;
+ oxcf->enable_auto_arf = extra_cfg->enable_auto_alt_ref;
+ oxcf->enable_auto_brf = extra_cfg->enable_auto_bwd_ref;
+ oxcf->noise_sensitivity = extra_cfg->noise_sensitivity;
+ oxcf->sharpness = extra_cfg->sharpness;
+
+ oxcf->two_pass_stats_in = cfg->rc_twopass_stats_in;
+
+#if CONFIG_FP_MB_STATS
+ oxcf->firstpass_mb_stats_in = cfg->rc_firstpass_mb_stats_in;
+#endif
+
+ oxcf->color_primaries = extra_cfg->color_primaries;
+ oxcf->transfer_characteristics = extra_cfg->transfer_characteristics;
+ oxcf->matrix_coefficients = extra_cfg->matrix_coefficients;
+ oxcf->chroma_sample_position = extra_cfg->chroma_sample_position;
+
+ oxcf->color_range = extra_cfg->color_range;
+ oxcf->render_width = extra_cfg->render_width;
+ oxcf->render_height = extra_cfg->render_height;
+ oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
+ // Adjust g_lag_in_frames down if not needed
+ oxcf->lag_in_frames =
+ AOMMIN(MAX_GF_INTERVAL + oxcf->arnr_max_frames / 2, oxcf->lag_in_frames);
+ oxcf->arnr_strength = extra_cfg->arnr_strength;
+ oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+ oxcf->max_gf_interval = extra_cfg->max_gf_interval;
+
+ oxcf->tuning = extra_cfg->tuning;
+ oxcf->content = extra_cfg->content;
+ oxcf->cdf_update_mode = (uint8_t)extra_cfg->cdf_update_mode;
+ oxcf->superblock_size = extra_cfg->superblock_size;
+ // Film grain settings are ignored in large-scale-tile mode.
+ if (cfg->large_scale_tile) {
+ oxcf->film_grain_test_vector = 0;
+ oxcf->film_grain_table_filename = NULL;
+ } else {
+ oxcf->film_grain_test_vector = extra_cfg->film_grain_test_vector;
+ oxcf->film_grain_table_filename = extra_cfg->film_grain_table_filename;
+ }
+#if CONFIG_DENOISE
+ oxcf->noise_level = extra_cfg->noise_level;
+ oxcf->noise_block_size = extra_cfg->noise_block_size;
+#endif
+ oxcf->large_scale_tile = cfg->large_scale_tile;
+ oxcf->single_tile_decoding =
+ (oxcf->large_scale_tile) ? extra_cfg->single_tile_decoding : 0;
+ if (oxcf->large_scale_tile) {
+ // The superblock_size can only be AOM_SUPERBLOCK_SIZE_64X64 or
+ // AOM_SUPERBLOCK_SIZE_128X128 while oxcf->large_scale_tile = 1. If
+ // superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC, hard set it to
+ // AOM_SUPERBLOCK_SIZE_64X64(default value in large_scale_tile).
+ if (extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_64X64 &&
+ extra_cfg->superblock_size != AOM_SUPERBLOCK_SIZE_128X128)
+ oxcf->superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+ }
+
+ oxcf->row_mt = extra_cfg->row_mt;
+
+ oxcf->tile_columns = extra_cfg->tile_columns;
+ oxcf->tile_rows = extra_cfg->tile_rows;
+
+ oxcf->monochrome = cfg->monochrome;
+ oxcf->full_still_picture_hdr = cfg->full_still_picture_hdr;
+ oxcf->enable_dual_filter = extra_cfg->use_dual_filter;
+ oxcf->enable_order_hint = extra_cfg->enable_order_hint;
+ // These combine the flags with a bitwise AND of the stored ints.
+ oxcf->enable_jnt_comp =
+ extra_cfg->enable_jnt_comp & extra_cfg->enable_order_hint;
+ oxcf->enable_ref_frame_mvs =
+ extra_cfg->enable_ref_frame_mvs & extra_cfg->enable_order_hint;
+
+ oxcf->enable_warped_motion = extra_cfg->enable_warped_motion;
+ oxcf->allow_warped_motion =
+ extra_cfg->allow_warped_motion & extra_cfg->enable_warped_motion;
+
+ // Superres also needs the extra_cfg enable flag; otherwise every superres
+ // field is reset.
+ oxcf->enable_superres =
+ (oxcf->superres_mode != SUPERRES_NONE) && extra_cfg->enable_superres;
+ if (!oxcf->enable_superres) {
+ disable_superres(oxcf);
+ }
+
+ // Copy the explicit tile sizes: counts are capped at the maximum and each
+ // size is raised to at least 1.
+ oxcf->tile_width_count = AOMMIN(cfg->tile_width_count, MAX_TILE_COLS);
+ oxcf->tile_height_count = AOMMIN(cfg->tile_height_count, MAX_TILE_ROWS);
+ for (int i = 0; i < oxcf->tile_width_count; i++) {
+ oxcf->tile_widths[i] = AOMMAX(cfg->tile_widths[i], 1);
+ }
+ for (int i = 0; i < oxcf->tile_height_count; i++) {
+ oxcf->tile_heights[i] = AOMMAX(cfg->tile_heights[i], 1);
+ }
+ oxcf->error_resilient_mode =
+ cfg->g_error_resilient | extra_cfg->error_resilient_mode;
+ oxcf->s_frame_mode = extra_cfg->s_frame_mode;
+ oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
+ // In the last pass the frame limit comes from the stats buffer: every
+ // packet except the final one.
+ if (cfg->g_pass == AOM_RC_LAST_PASS) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz);
+ oxcf->limit = n_packets - 1;
+ } else {
+ oxcf->limit = cfg->g_limit;
+ }
+
+ if (oxcf->limit == 1) {
+ // still picture mode, display model and timing is meaningless
+ oxcf->display_model_info_present_flag = 0;
+ oxcf->timing_info_present = 0;
+ }
+
+ oxcf->aq_mode = extra_cfg->aq_mode;
+ oxcf->deltaq_mode = extra_cfg->deltaq_mode;
+
+ oxcf->save_as_annexb = cfg->save_as_annexb;
+
+ oxcf->frame_periodic_boost = extra_cfg->frame_periodic_boost;
+ oxcf->motion_vector_unit_test = extra_cfg->motion_vector_unit_test;
+
+#if CONFIG_REDUCED_ENCODER_BORDER
+ if (oxcf->superres_mode != SUPERRES_NONE ||
+ oxcf->resize_mode != RESIZE_NONE) {
+ warn(
+ "Superres / resize cannot be used with CONFIG_REDUCED_ENCODER_BORDER. "
+ "Disabling superres/resize.\n");
+ // return AOM_CODEC_INVALID_PARAM;
+ disable_superres(oxcf);
+ oxcf->resize_mode = RESIZE_NONE;
+ oxcf->resize_scale_denominator = SCALE_NUMERATOR;
+ oxcf->resize_kf_scale_denominator = SCALE_NUMERATOR;
+ }
+#endif // CONFIG_REDUCED_ENCODER_BORDER
+
+ oxcf->chroma_subsampling_x = extra_cfg->chroma_subsampling_x;
+ oxcf->chroma_subsampling_y = extra_cfg->chroma_subsampling_y;
+
+ return AOM_CODEC_OK;
+}
+
+// Applies a new public configuration to a running encoder.
+//
+// A frame-size change is refused unless the new config is one-pass with
+// g_lag_in_frames <= 1, and g_lag_in_frames may never grow; both refusals
+// return AOM_CODEC_INVALID_PARAM through ERROR. Otherwise the config is
+// validated and, when valid, stored in ctx->cfg, translated into ctx->oxcf
+// and handed to the encoder. Returns the result of validate_config().
+//
+// AOM_EFLAG_FORCE_KF is queued in ctx->next_frame_flags when the new size is
+// not a valid reference size or exceeds the encoder's initial size, or when
+// the profile changes.
+static aom_codec_err_t encoder_set_config(aom_codec_alg_priv_t *ctx,
+                                          const aom_codec_enc_cfg_t *cfg) {
+  int need_key_frame = 0;
+  const int size_changed =
+      cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h;
+
+  if (size_changed) {
+    if (cfg->g_lag_in_frames > 1 || cfg->g_pass != AOM_RC_ONE_PASS) {
+      ERROR("Cannot change width or height after initialization");
+    }
+
+    // Checked in this order so later tests are skipped once one matches.
+    if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w,
+                              cfg->g_h)) {
+      need_key_frame = 1;
+    } else if (ctx->cpi->initial_width &&
+               (int)cfg->g_w > ctx->cpi->initial_width) {
+      need_key_frame = 1;
+    } else if (ctx->cpi->initial_height &&
+               (int)cfg->g_h > ctx->cpi->initial_height) {
+      need_key_frame = 1;
+    }
+  }
+
+  // Prevent increasing lag_in_frames. This check is stricter than it needs
+  // to be -- the limit is not increasing past the first lag_in_frames
+  // value, but we don't track the initial config, only the last successful
+  // config.
+  if (cfg->g_lag_in_frames > ctx->cfg.g_lag_in_frames) {
+    ERROR("Cannot increase lag_in_frames");
+  }
+
+  const aom_codec_err_t status = validate_config(ctx, cfg, &ctx->extra_cfg);
+
+  if (status == AOM_CODEC_OK) {
+    ctx->cfg = *cfg;
+    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+    // On profile change, request a key frame. This comparison has to happen
+    // before av1_change_config() is given the new configuration.
+    if (ctx->cpi->common.seq_params.profile != ctx->oxcf.profile) {
+      need_key_frame = 1;
+    }
+    av1_change_config(ctx->cpi, &ctx->oxcf);
+  }
+
+  if (need_key_frame) {
+    ctx->next_frame_flags |= AOM_EFLAG_FORCE_KF;
+  }
+
+  return status;
+}
+
+// Returns whatever av1_get_global_headers() yields for this instance's
+// encoder.
+static aom_fixed_buf_t *encoder_get_global_headers(aom_codec_alg_priv_t *ctx) {
+  aom_fixed_buf_t *const headers = av1_get_global_headers(ctx->cpi);
+  return headers;
+}
+
+// Control handler: writes the value of av1_get_quantizer() through the int
+// pointer passed in args. Returns AOM_CODEC_INVALID_PARAM when that pointer
+// is NULL.
+static aom_codec_err_t ctrl_get_quantizer(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  int *const out = va_arg(args, int *);
+
+  if (out != NULL) {
+    *out = av1_get_quantizer(ctx->cpi);
+    return AOM_CODEC_OK;
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Control handler: like ctrl_get_quantizer(), but the value is first passed
+// through av1_qindex_to_quantizer() before being written through the int
+// pointer in args. Returns AOM_CODEC_INVALID_PARAM when that pointer is NULL.
+static aom_codec_err_t ctrl_get_quantizer64(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const out = va_arg(args, int *);
+
+  if (out != NULL) {
+    const int qindex = av1_get_quantizer(ctx->cpi);
+    *out = av1_qindex_to_quantizer(qindex);
+    return AOM_CODEC_OK;
+  }
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Validates `extra_cfg` against the current public configuration. Only when
+// it is valid is it copied into ctx->extra_cfg, translated into ctx->oxcf and
+// passed to the encoder; otherwise the context is left as it was (apart from
+// the error detail set by validate_config()). Returns the validation result.
+static aom_codec_err_t update_extra_cfg(aom_codec_alg_priv_t *ctx,
+                                        const struct av1_extracfg *extra_cfg) {
+  const aom_codec_err_t status = validate_config(ctx, &ctx->cfg, extra_cfg);
+
+  if (status != AOM_CODEC_OK) return status;
+
+  ctx->extra_cfg = *extra_cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  av1_change_config(ctx->cpi, &ctx->oxcf);
+  return status;
+}
+
+// AOME_SET_CPUUSED handler: puts the requested value into cpu_used on a
+// scratch copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_cpuused(aom_codec_alg_priv_t *ctx,
+                                        va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.cpu_used = CAST(AOME_SET_CPUUSED, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_ENABLEAUTOALTREF handler: puts the requested value into
+// enable_auto_alt_ref on a scratch copy of the extra config and commits it
+// via update_extra_cfg().
+static aom_codec_err_t ctrl_set_enable_auto_alt_ref(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.enable_auto_alt_ref = CAST(AOME_SET_ENABLEAUTOALTREF, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_ENABLEAUTOBWDREF handler: puts the requested value into
+// enable_auto_bwd_ref on a scratch copy of the extra config and commits it
+// via update_extra_cfg().
+static aom_codec_err_t ctrl_set_enable_auto_bwd_ref(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.enable_auto_bwd_ref = CAST(AOME_SET_ENABLEAUTOBWDREF, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_NOISE_SENSITIVITY handler: puts the requested value into
+// noise_sensitivity on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_noise_sensitivity(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.noise_sensitivity = CAST(AV1E_SET_NOISE_SENSITIVITY, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_SHARPNESS handler: puts the requested value into sharpness on a
+// scratch copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_sharpness(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.sharpness = CAST(AOME_SET_SHARPNESS, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_STATIC_THRESHOLD handler: puts the requested value into
+// static_thresh on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_static_thresh(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.static_thresh = CAST(AOME_SET_STATIC_THRESHOLD, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_ROW_MT handler: puts the requested value into row_mt on a scratch
+// copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.row_mt = CAST(AV1E_SET_ROW_MT, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_TILE_COLUMNS handler: puts the requested value into tile_columns
+// on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_tile_columns(aom_codec_alg_priv_t *ctx,
+                                             va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.tile_columns = CAST(AV1E_SET_TILE_COLUMNS, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_TILE_ROWS handler: puts the requested value into tile_rows on a
+// scratch copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_tile_rows(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.tile_rows = CAST(AV1E_SET_TILE_ROWS, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_ARNR_MAXFRAMES handler: puts the requested value into
+// arnr_max_frames on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_arnr_max_frames(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.arnr_max_frames = CAST(AOME_SET_ARNR_MAXFRAMES, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_ARNR_STRENGTH handler: puts the requested value into arnr_strength
+// on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_arnr_strength(aom_codec_alg_priv_t *ctx,
+                                              va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.arnr_strength = CAST(AOME_SET_ARNR_STRENGTH, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_TUNING handler: puts the requested value into tuning on a scratch
+// copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_tuning(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.tuning = CAST(AOME_SET_TUNING, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_CQ_LEVEL handler: puts the requested value into cq_level on a
+// scratch copy of the extra config and commits it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_cq_level(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.cq_level = CAST(AOME_SET_CQ_LEVEL, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_MAX_INTRA_BITRATE_PCT handler: puts the requested value into
+// rc_max_intra_bitrate_pct on a scratch copy of the extra config and commits
+// it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_rc_max_intra_bitrate_pct(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.rc_max_intra_bitrate_pct = CAST(AOME_SET_MAX_INTRA_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AOME_SET_MAX_INTER_BITRATE_PCT handler: puts the requested value into
+// rc_max_inter_bitrate_pct on a scratch copy of the extra config and commits
+// it via update_extra_cfg().
+static aom_codec_err_t ctrl_set_rc_max_inter_bitrate_pct(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.rc_max_inter_bitrate_pct = CAST(AOME_SET_MAX_INTER_BITRATE_PCT, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_GF_CBR_BOOST_PCT handler: puts the requested value into
+// gf_cbr_boost_pct on a scratch copy of the extra config and commits it via
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct av1_extracfg pending = ctx->extra_cfg;
+
+  pending.gf_cbr_boost_pct = CAST(AV1E_SET_GF_CBR_BOOST_PCT, args);
+  return update_extra_cfg(ctx, &pending);
+}
+
+// AV1E_SET_LOSSLESS control: sets lossless on a copy of ctx->extra_cfg and
+// returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_lossless(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.lossless = CAST(AV1E_SET_LOSSLESS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_CDEF control: sets enable_cdef on a copy of ctx->extra_cfg
+// and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_cdef(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_cdef = CAST(AV1E_SET_ENABLE_CDEF, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_RESTORATION control: sets enable_restoration on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_restoration(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_restoration = CAST(AV1E_SET_ENABLE_RESTORATION, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_DISABLE_TRELLIS_QUANT control: sets disable_trellis_quant on a copy
+// of ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_disable_trellis_quant(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.disable_trellis_quant = CAST(AV1E_SET_DISABLE_TRELLIS_QUANT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_QM control: sets enable_qm on a copy of ctx->extra_cfg and
+// returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_qm(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_QM_Y control: sets qm_y on a copy of ctx->extra_cfg and returns the
+// status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_y = CAST(AV1E_SET_QM_Y, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_QM_U control: sets qm_u on a copy of ctx->extra_cfg and returns the
+// status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_qm_u(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_u = CAST(AV1E_SET_QM_U, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_QM_V control: sets qm_v on a copy of ctx->extra_cfg and returns the
+// status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_qm_v(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_v = CAST(AV1E_SET_QM_V, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_QM_MIN control: sets qm_min on a copy of ctx->extra_cfg and returns
+// the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_qm_min(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_min = CAST(AV1E_SET_QM_MIN, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_QM_MAX control: sets qm_max on a copy of ctx->extra_cfg and returns
+// the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_qm_max(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.qm_max = CAST(AV1E_SET_QM_MAX, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+#if CONFIG_DIST_8X8
+// AV1E_SET_ENABLE_DIST_8X8 control (compiled only under CONFIG_DIST_8X8): sets
+// enable_dist_8x8 on a copy of ctx->extra_cfg and returns the status of
+// update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_dist_8x8(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_dist_8x8 = CAST(AV1E_SET_ENABLE_DIST_8X8, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+#endif
+
+// AV1E_SET_NUM_TG control: sets num_tg on a copy of ctx->extra_cfg and returns
+// the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_num_tg(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.num_tg = CAST(AV1E_SET_NUM_TG, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_MTU control: sets mtu_size on a copy of ctx->extra_cfg and returns
+// the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_mtu(aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.mtu_size = CAST(AV1E_SET_MTU, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_TIMING_INFO_TYPE control: sets timing_info_type on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_timing_info_type(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.timing_info_type = CAST(AV1E_SET_TIMING_INFO_TYPE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_DF control: sets use_dual_filter on a copy of ctx->extra_cfg
+// and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_df(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.use_dual_filter = CAST(AV1E_SET_ENABLE_DF, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_ORDER_HINT control: sets enable_order_hint on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_order_hint(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_order_hint = CAST(AV1E_SET_ENABLE_ORDER_HINT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_JNT_COMP control: sets enable_jnt_comp on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_jnt_comp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_jnt_comp = CAST(AV1E_SET_ENABLE_JNT_COMP, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_REF_FRAME_MVS control: sets enable_ref_frame_mvs on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_ref_frame_mvs(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_ref_frame_mvs = CAST(AV1E_SET_ENABLE_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ALLOW_REF_FRAME_MVS control: sets allow_ref_frame_mvs on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_allow_ref_frame_mvs(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.allow_ref_frame_mvs = CAST(AV1E_SET_ALLOW_REF_FRAME_MVS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_WARPED_MOTION control: sets enable_warped_motion on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_warped_motion(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_warped_motion = CAST(AV1E_SET_ENABLE_WARPED_MOTION, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ALLOW_WARPED_MOTION control: sets allow_warped_motion on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_allow_warped_motion(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.allow_warped_motion = CAST(AV1E_SET_ALLOW_WARPED_MOTION, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ENABLE_SUPERRES control: sets enable_superres on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_enable_superres(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.enable_superres = CAST(AV1E_SET_ENABLE_SUPERRES, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_ERROR_RESILIENT_MODE control: sets error_resilient_mode on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_error_resilient_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.error_resilient_mode = CAST(AV1E_SET_ERROR_RESILIENT_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_S_FRAME_MODE control: sets s_frame_mode on a copy of ctx->extra_cfg
+// and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_s_frame_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.s_frame_mode = CAST(AV1E_SET_S_FRAME_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_FRAME_PARALLEL_DECODING control: sets frame_parallel_decoding_mode
+// on a copy of ctx->extra_cfg and returns the status of update_extra_cfg() for
+// that copy.
+static aom_codec_err_t ctrl_set_frame_parallel_decoding_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.frame_parallel_decoding_mode =
+      CAST(AV1E_SET_FRAME_PARALLEL_DECODING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_SINGLE_TILE_DECODING control: sets single_tile_decoding on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_single_tile_decoding(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.single_tile_decoding = CAST(AV1E_SET_SINGLE_TILE_DECODING, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_AQ_MODE control: sets aq_mode on a copy of ctx->extra_cfg and
+// returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_aq_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.aq_mode = CAST(AV1E_SET_AQ_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_FILM_GRAIN_TEST_VECTOR control: sets film_grain_test_vector on a
+// copy of ctx->extra_cfg and returns the status of update_extra_cfg() for that
+// copy.
+static aom_codec_err_t ctrl_set_film_grain_test_vector(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.film_grain_test_vector = CAST(AV1E_SET_FILM_GRAIN_TEST_VECTOR, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_FILM_GRAIN_TABLE control: stores the control argument in
+// film_grain_table_filename on a copy of ctx->extra_cfg and returns the status
+// of update_extra_cfg() for that copy. This function assigns the argument as
+// passed; nothing is duplicated here.
+static aom_codec_err_t ctrl_set_film_grain_table(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.film_grain_table_filename = CAST(AV1E_SET_FILM_GRAIN_TABLE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+#if CONFIG_DENOISE
+// AV1E_SET_DENOISE_NOISE_LEVEL control (compiled only under CONFIG_DENOISE):
+// converts the control argument to float, divides it by 10 and stores the
+// result as noise_level on a copy of ctx->extra_cfg. Returns the status of
+// update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_denoise_noise_level(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  const float raw_level = (float)CAST(AV1E_SET_DENOISE_NOISE_LEVEL, args);
+  new_cfg.noise_level = raw_level / 10.0f;
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_DENOISE_BLOCK_SIZE control (compiled only under CONFIG_DENOISE):
+// sets noise_block_size on a copy of ctx->extra_cfg and returns the status of
+// update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_denoise_block_size(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.noise_block_size = CAST(AV1E_SET_DENOISE_BLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+#endif
+
+// AV1E_SET_DELTAQ_MODE control: sets deltaq_mode on a copy of ctx->extra_cfg
+// and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_deltaq_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.deltaq_mode = CAST(AV1E_SET_DELTAQ_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_MIN_GF_INTERVAL control: sets min_gf_interval on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_min_gf_interval(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.min_gf_interval = CAST(AV1E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_MAX_GF_INTERVAL control: sets max_gf_interval on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_max_gf_interval(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.max_gf_interval = CAST(AV1E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_FRAME_PERIODIC_BOOST control: sets frame_periodic_boost on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_frame_periodic_boost(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.frame_periodic_boost = CAST(AV1E_SET_FRAME_PERIODIC_BOOST, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST control: sets motion_vector_unit_test on
+// a copy of ctx->extra_cfg and returns the status of update_extra_cfg() for
+// that copy.
+static aom_codec_err_t ctrl_enable_motion_vector_unit_test(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.motion_vector_unit_test =
+      CAST(AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// Init hook of the encoder interface.
+//
+// Does nothing (returns AOM_CODEC_OK) when ctx->priv is already set. Otherwise
+// it allocates the private encoder state and its buffer pool, takes an internal
+// copy of the caller's encoder config (when one is attached), installs the
+// default extra config, validates the result and creates the compressor.
+//
+// The private state is attached to ctx->priv only after the buffer pool (and,
+// with CONFIG_MULTITHREAD, its mutex) has been set up. If either step fails,
+// everything allocated so far is released and ctx->priv stays NULL, which is
+// the same state the caller sees when the very first allocation fails. This
+// way no teardown code is ever handed a context with a missing pool or an
+// uninitialised mutex.
+//
+// ctx:  codec context being initialised.
+// data: unused.
+//
+// Returns AOM_CODEC_OK on success, AOM_CODEC_MEM_ERROR when an allocation, the
+// mutex initialisation or av1_create_compressor() fails, or the error reported
+// by validate_config().
+static aom_codec_err_t encoder_init(aom_codec_ctx_t *ctx,
+                                    aom_codec_priv_enc_mr_cfg_t *data) {
+  aom_codec_err_t res = AOM_CODEC_OK;
+  (void)data;
+
+  if (ctx->priv == NULL) {
+    aom_codec_alg_priv_t *const priv = aom_calloc(1, sizeof(*priv));
+    if (priv == NULL) return AOM_CODEC_MEM_ERROR;
+
+    priv->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+    if (priv->buffer_pool == NULL) {
+      aom_free(priv);
+      return AOM_CODEC_MEM_ERROR;
+    }
+
+#if CONFIG_MULTITHREAD
+    if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) {
+      aom_free(priv->buffer_pool);
+      aom_free(priv);
+      return AOM_CODEC_MEM_ERROR;
+    }
+#endif
+
+    // From here on the object is complete enough to be torn down normally.
+    ctx->priv = (aom_codec_priv_t *)priv;
+    ctx->priv->init_flags = ctx->init_flags;
+    ctx->priv->enc.total_encoders = 1;
+
+    if (ctx->config.enc) {
+      // Update the reference to the config structure to an internal copy.
+      priv->cfg = *ctx->config.enc;
+      ctx->config.enc = &priv->cfg;
+    }
+
+    priv->extra_cfg = default_extra_cfg;
+    aom_once(av1_initialize_enc);
+
+    res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
+
+    if (res == AOM_CODEC_OK) {
+      set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+      priv->cpi = av1_create_compressor(&priv->oxcf, priv->buffer_pool);
+      if (priv->cpi == NULL)
+        res = AOM_CODEC_MEM_ERROR;
+      else
+        priv->cpi->output_pkt_list = &priv->pkt_list.head;
+    }
+  }
+
+  return res;
+}
+
+// Destroy hook of the encoder interface: releases the compressed-data buffer,
+// the compressor, the buffer pool (destroying its mutex first when built with
+// CONFIG_MULTITHREAD) and finally the private state itself.
+//
+// A context whose buffer_pool is NULL is tolerated: taking the address of
+// pool_mutex through a NULL pool and passing it to pthread_mutex_destroy()
+// would be invalid, so the pool teardown is skipped in that case.
+//
+// Always returns AOM_CODEC_OK.
+static aom_codec_err_t encoder_destroy(aom_codec_alg_priv_t *ctx) {
+  free(ctx->cx_data);
+  av1_remove_compressor(ctx->cpi);
+  if (ctx->buffer_pool != NULL) {
+#if CONFIG_MULTITHREAD
+    pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+    aom_free(ctx->buffer_pool);
+  }
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+// Builds the flag word stored in an output frame packet.
+//
+// The encoder's lib_flags are placed in the upper part of the word (shifted
+// left by 16). AOM_FRAME_IS_KEY is added when FRAMEFLAGS_KEY is set in
+// lib_flags, and AOM_FRAME_IS_DROPPABLE when cpi->droppable is non-zero.
+static aom_codec_frame_flags_t get_frame_pkt_flags(const AV1_COMP *cpi,
+                                                   unsigned int lib_flags) {
+  aom_codec_frame_flags_t pkt_flags = lib_flags << 16;
+  if ((lib_flags & FRAMEFLAGS_KEY) != 0) pkt_flags |= AOM_FRAME_IS_KEY;
+  if (cpi->droppable) pkt_flags |= AOM_FRAME_IS_DROPPABLE;
+  return pkt_flags;
+}
+
+static aom_codec_err_t encoder_encode(aom_codec_alg_priv_t *ctx,
+ const aom_image_t *img,  // input frame; may be NULL (no new frame this call)
+ aom_codec_pts_t pts,  // presentation time, in timebase units
+ unsigned long duration,  // frame duration, in timebase units
+ aom_enc_frame_flags_t enc_flags) {  // per-frame encoding flags
+ const size_t kMinCompressedSize = 8192;  // lower bound for the output buffer size
+ volatile aom_codec_err_t res = AOM_CODEC_OK;  // presumably volatile because it is used across setjmp()
+ AV1_COMP *const cpi = ctx->cpi;
+ const aom_rational_t *const timebase = &ctx->cfg.g_timebase;
+
+ if (cpi == NULL) return AOM_CODEC_INVALID_PARAM;  // no compressor instance to encode with
+
+ if (img != NULL) {
+ res = validate_img(ctx, img);
+ // TODO(jzern) the checks related to cpi's validity should be treated as a
+ // failure condition, encoder setup is done fully in init() currently.
+ if (res == AOM_CODEC_OK) {
+ size_t data_sz = ALIGN_POWER_OF_TWO(ctx->cfg.g_w, 5) *
+ ALIGN_POWER_OF_TWO(ctx->cfg.g_h, 5) * get_image_bps(img);
+ if (data_sz < kMinCompressedSize) data_sz = kMinCompressedSize;
+ if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {  // (re)allocate only when missing or too small
+ ctx->cx_data_sz = data_sz;
+ free(ctx->cx_data);  // NOTE(review): ctx->pending_cx_data may still point into this buffer - confirm
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return AOM_CODEC_MEM_ERROR;  // safe to return directly: the setjmp handler is not armed yet
+ }
+ }
+ }
+ }
+
+ if (ctx->oxcf.mode != GOOD) {  // force GOOD mode before encoding
+ ctx->oxcf.mode = GOOD;
+ av1_change_config(ctx->cpi, &ctx->oxcf);
+ }
+
+ aom_codec_pkt_list_init(&ctx->pkt_list);  // each call starts with an empty output packet list
+
+ volatile aom_enc_frame_flags_t flags = enc_flags;  // local copy; FORCE_KF may be OR-ed in below
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cpi->common.error.jmp)) {  // non-zero: control came back here through longjmp()
+ cpi->common.error.setjmp = 0;
+ res = update_error_state(ctx, &cpi->common.error);
+ aom_clear_system_state();
+ return res;
+ }
+ cpi->common.error.setjmp = 1;  // mark the jump buffer as armed
+
+ // Note(yunqing): While applying encoding flags, always start from enabling
+ // all, and then modifying according to the flags. Previous frame's flags are
+ // overwritten.
+ av1_apply_encoding_flags(cpi, flags);
+
+ // Handle fixed keyframe intervals
+ if (ctx->cfg.kf_mode == AOM_KF_AUTO &&
+ ctx->cfg.kf_min_dist == ctx->cfg.kf_max_dist) {
+ if (++ctx->fixed_kf_cntr > ctx->cfg.kf_min_dist) {  // interval elapsed: force a key frame and restart the count
+ flags |= AOM_EFLAG_FORCE_KF;
+ ctx->fixed_kf_cntr = 1;
+ }
+ }
+
+ if (res == AOM_CODEC_OK) {  // skipped entirely when validate_img() rejected the frame
+ int64_t dst_time_stamp = timebase_units_to_ticks(timebase, pts);
+ int64_t dst_end_time_stamp =
+ timebase_units_to_ticks(timebase, pts + duration);
+
+ // Set up internal flags
+ if (ctx->base.init_flags & AOM_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
+
+ if (img != NULL) {
+ YV12_BUFFER_CONFIG sd;
+ res = image2yuvconfig(img, &sd);  // NOTE(review): res is not checked before the frame is submitted below
+
+ // Store the original flags into the frame buffer. Will extract the
+ // key frame flag when we actually encode this frame.
+ if (av1_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+ dst_time_stamp, dst_end_time_stamp)) {
+ res = update_error_state(ctx, &cpi->common.error);
+ }
+ ctx->next_frame_flags = 0;
+ }
+
+ unsigned char *cx_data = ctx->cx_data;  // write cursor into the output buffer
+ size_t cx_data_sz = ctx->cx_data_sz;  // bytes remaining after the cursor
+
+ /* Any pending invisible frames? */
+ if (ctx->pending_cx_data) {
+ memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);  // carry earlier output to the buffer start
+ ctx->pending_cx_data = cx_data;
+ cx_data += ctx->pending_cx_data_sz;
+ cx_data_sz -= ctx->pending_cx_data_sz;
+
+ /* TODO: this is a minimal check, the underlying codec doesn't respect
+ * the buffer size anyway.
+ */
+ if (cx_data_sz < ctx->cx_data_sz / 2) {  // less than half of the buffer is left
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Compressed data buffer too small");
+ }
+ }
+
+ size_t frame_size = 0;
+ unsigned int lib_flags = 0;
+ int is_frame_visible = 0;
+ int index_size = 0;
+ // invisible frames get packed with the next visible frame
+ while (cx_data_sz - index_size >= ctx->cx_data_sz / 2 &&
+ !is_frame_visible &&  // stop as soon as a shown frame has been produced
+ -1 != av1_get_compressed_data(cpi, &lib_flags, &frame_size, cx_data,
+ &dst_time_stamp, &dst_end_time_stamp,
+ !img, timebase)) {
+ if (cpi->common.seq_params.frame_id_numbers_present_flag) {
+ if (cpi->common.invalid_delta_frame_id_minus_1) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR,
+ "Invalid delta_frame_id_minus_1");
+ }
+ }
+ cpi->seq_params_locked = 1;
+ if (frame_size) {
+ if (ctx->pending_cx_data == 0) ctx->pending_cx_data = cx_data;  // first frame of this packet
+
+ const int write_temporal_delimiter =
+ !cpi->common.spatial_layer_id && !ctx->pending_frame_count;  // only for the first pending frame of spatial layer 0
+
+ if (write_temporal_delimiter) {
+ uint32_t obu_header_size = 1;
+ const uint32_t obu_payload_size = 0;
+ const size_t length_field_size =
+ aom_uleb_size_in_bytes(obu_payload_size);
+
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size + 1;  // room for the length field plus one header byte
+ memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+ frame_size);
+ }
+ const uint32_t obu_header_offset = 0;
+ obu_header_size = write_obu_header(
+ OBU_TEMPORAL_DELIMITER, 0,
+ (uint8_t *)(ctx->pending_cx_data + obu_header_offset));
+
+ // OBUs are preceded/succeeded by an unsigned leb128 coded integer.
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size,
+ ctx->pending_cx_data) != AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+
+ frame_size += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ if (ctx->oxcf.save_as_annexb) {
+ size_t curr_frame_size = frame_size;
+ if (av1_convert_sect5obus_to_annexb(cx_data, &curr_frame_size) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ frame_size = curr_frame_size;
+
+ // B_PRIME (add frame size)
+ const size_t length_field_size = aom_uleb_size_in_bytes(frame_size);
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size;
+ memmove(cx_data + move_offset, cx_data, frame_size);  // make room in front of the frame for its size field
+ }
+ if (write_uleb_obu_size(0, (uint32_t)frame_size, cx_data) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ frame_size += length_field_size;
+ }
+
+ ctx->pending_frame_sizes[ctx->pending_frame_count++] = frame_size;  // record this frame's size
+ ctx->pending_cx_data_sz += frame_size;
+
+ cx_data += frame_size;  // advance the write cursor past this frame
+ cx_data_sz -= frame_size;
+
+ index_size = MAG_SIZE * (ctx->pending_frame_count - 1) + 2;
+
+ is_frame_visible = cpi->common.show_frame;
+ }
+ }
+ if (is_frame_visible) {
+ // Add the frame packet to the list of returned packets.
+ aom_codec_cx_pkt_t pkt;
+
+ if (ctx->oxcf.save_as_annexb) {
+ // B_PRIME (add TU size)
+ size_t tu_size = ctx->pending_cx_data_sz;
+ const size_t length_field_size = aom_uleb_size_in_bytes(tu_size);
+ if (ctx->pending_cx_data) {
+ const size_t move_offset = length_field_size;
+ memmove(ctx->pending_cx_data + move_offset, ctx->pending_cx_data,
+ tu_size);
+ }
+ if (write_uleb_obu_size(0, (uint32_t)tu_size, ctx->pending_cx_data) !=
+ AOM_CODEC_OK) {
+ aom_internal_error(&cpi->common.error, AOM_CODEC_ERROR, NULL);
+ }
+ ctx->pending_cx_data_sz += length_field_size;
+ }
+
+ pkt.kind = AOM_CODEC_CX_FRAME_PKT;
+
+ pkt.data.frame.buf = ctx->pending_cx_data;  // all pending frames are returned as one packet
+ pkt.data.frame.sz = ctx->pending_cx_data_sz;
+ pkt.data.frame.partition_id = -1;
+ pkt.data.frame.vis_frame_size = frame_size;  // size of the last (shown) frame only
+
+ pkt.data.frame.pts = ticks_to_timebase_units(timebase, dst_time_stamp);
+ pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
+ pkt.data.frame.duration = (uint32_t)ticks_to_timebase_units(
+ timebase, dst_end_time_stamp - dst_time_stamp);
+
+ aom_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+
+ ctx->pending_cx_data = NULL;  // nothing is carried over to the next call
+ ctx->pending_cx_data_sz = 0;
+ ctx->pending_frame_count = 0;
+ }
+ }
+
+ cpi->common.error.setjmp = 0;  // disarm before returning; the jmp_buf dies with this frame
+ return res;
+}
+
+// Packet iterator hook: returns whatever aom_codec_pkt_list_get() yields for
+// the context's packet list and the caller-owned iterator.
+static const aom_codec_cx_pkt_t *encoder_get_cxdata(aom_codec_alg_priv_t *ctx,
+                                                    aom_codec_iter_t *iter) {
+  const aom_codec_cx_pkt_t *const next_pkt =
+      aom_codec_pkt_list_get(&ctx->pkt_list.head, iter);
+  return next_pkt;
+}
+
+// AV1_SET_REFERENCE control. The argument is an av1_ref_frame_t pointer; its
+// image is wrapped in a YV12_BUFFER_CONFIG and handed to
+// av1_set_reference_enc() together with the requested index.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_OK otherwise
+// (the results of the two helper calls are not inspected).
+static aom_codec_err_t ctrl_set_reference(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+  if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  YV12_BUFFER_CONFIG yv12;
+  image2yuvconfig(&ref->img, &yv12);
+  av1_set_reference_enc(ctx->cpi, ref->idx, &yv12);
+  return AOM_CODEC_OK;
+}
+
+// AV1_COPY_REFERENCE control. The argument is an av1_ref_frame_t pointer; its
+// image is wrapped in a YV12_BUFFER_CONFIG and handed to
+// av1_copy_reference_enc() together with the requested index.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_OK otherwise
+// (the results of the two helper calls are not inspected).
+static aom_codec_err_t ctrl_copy_reference(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+  if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  YV12_BUFFER_CONFIG yv12;
+  image2yuvconfig(&ref->img, &yv12);
+  av1_copy_reference_enc(ctx->cpi, ref->idx, &yv12);
+  return AOM_CODEC_OK;
+}
+
+// AV1_GET_REFERENCE control. Looks up the reference frame with the index given
+// in the av1_ref_frame_t argument and describes it in the argument's image via
+// yuvconfig2image().
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// get_ref_frame() finds no buffer, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_reference(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  av1_ref_frame_t *const ref = va_arg(args, av1_ref_frame_t *);
+  if (ref == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  YV12_BUFFER_CONFIG *ref_buf = get_ref_frame(&ctx->cpi->common, ref->idx);
+  if (ref_buf == NULL) return AOM_CODEC_ERROR;
+
+  yuvconfig2image(&ref->img, ref_buf, NULL);
+  return AOM_CODEC_OK;
+}
+
+// AV1_GET_NEW_FRAME_IMAGE control. Fetches the last shown frame from the
+// encoder and describes it in the caller's aom_image_t via yuvconfig2image().
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// av1_get_last_show_frame() reports a non-zero status, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_new_frame_image(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_image_t *const out_img = va_arg(args, aom_image_t *);
+  if (out_img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  YV12_BUFFER_CONFIG last_shown;
+  if (av1_get_last_show_frame(ctx->cpi, &last_shown) != 0)
+    return AOM_CODEC_ERROR;
+
+  yuvconfig2image(out_img, &last_shown, NULL);
+  return AOM_CODEC_OK;
+}
+
+// AV1_COPY_NEW_FRAME_IMAGE control. Fetches the last shown frame from the
+// encoder, wraps the caller's aom_image_t in a YV12_BUFFER_CONFIG and passes
+// both to av1_copy_new_frame_enc().
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// av1_get_last_show_frame() reports a non-zero status, otherwise the value
+// returned by av1_copy_new_frame_enc().
+static aom_codec_err_t ctrl_copy_new_frame_image(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_image_t *const out_img = va_arg(args, aom_image_t *);
+  if (out_img == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  YV12_BUFFER_CONFIG last_shown;
+  if (av1_get_last_show_frame(ctx->cpi, &last_shown) != 0)
+    return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG dst;
+  image2yuvconfig(out_img, &dst);
+  return av1_copy_new_frame_enc(&ctx->cpi->common, &last_shown, &dst);
+}
+
+// AOM_SET_POSTPROC control: not supported by this encoder. Both arguments are
+// ignored and AOM_CODEC_INCAPABLE is always returned.
+static aom_codec_err_t ctrl_set_previewpp(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+}
+
+// Preview hook: asks the encoder for its preview frame and, on success (status
+// 0), describes it in ctx->preview_img and returns a pointer to that member.
+// Returns NULL when av1_get_preview_raw_frame() reports a non-zero status.
+static aom_image_t *encoder_get_preview(aom_codec_alg_priv_t *ctx) {
+  YV12_BUFFER_CONFIG preview;
+  if (av1_get_preview_raw_frame(ctx->cpi, &preview) != 0) return NULL;
+
+  yuvconfig2image(&ctx->preview_img, &preview, NULL);
+  return &ctx->preview_img;
+}
+
+// AOME_USE_REFERENCE control: forwards the int control argument to
+// av1_use_as_reference() and always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_use_reference(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  const int ref_flags = va_arg(args, int);
+  av1_use_as_reference(ctx->cpi, ref_flags);
+  return AOM_CODEC_OK;
+}
+
+// AOME_SET_ROI_MAP control: not implemented. Both arguments are ignored and
+// AOM_CODEC_INVALID_PARAM is always returned.
+static aom_codec_err_t ctrl_set_roi_map(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  (void)args;
+  (void)ctx;
+
+  // TODO(yaowu): Need to re-implement and test for AV1.
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// AOME_SET_ACTIVEMAP control. Passes the map buffer and its rows/cols (cast to
+// int) from the aom_active_map_t argument to av1_set_active_map().
+//
+// Returns AOM_CODEC_OK when that call returns 0, AOM_CODEC_INVALID_PARAM when
+// it returns anything else or when the argument is NULL.
+static aom_codec_err_t ctrl_set_active_map(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+  if (!map) return AOM_CODEC_INVALID_PARAM;
+
+  if (av1_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                         (int)map->cols) != 0)
+    return AOM_CODEC_INVALID_PARAM;
+  return AOM_CODEC_OK;
+}
+
+// AV1E_GET_ACTIVEMAP control. Passes the map buffer and its rows/cols (cast to
+// int) from the aom_active_map_t argument to av1_get_active_map().
+//
+// Returns AOM_CODEC_OK when that call returns 0, AOM_CODEC_INVALID_PARAM when
+// it returns anything else or when the argument is NULL.
+static aom_codec_err_t ctrl_get_active_map(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_active_map_t *const map = va_arg(args, aom_active_map_t *);
+  if (!map) return AOM_CODEC_INVALID_PARAM;
+
+  if (av1_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
+                         (int)map->cols) != 0)
+    return AOM_CODEC_INVALID_PARAM;
+  return AOM_CODEC_OK;
+}
+
+// AOME_SET_SCALEMODE control. Forwards the horizontal and vertical scaling
+// modes from the aom_scaling_mode_t argument to av1_set_internal_size().
+//
+// Returns AOM_CODEC_OK when that call returns 0, AOM_CODEC_INVALID_PARAM when
+// it returns anything else or when the argument is NULL.
+static aom_codec_err_t ctrl_set_scale_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  aom_scaling_mode_t *const mode = va_arg(args, aom_scaling_mode_t *);
+  if (!mode) return AOM_CODEC_INVALID_PARAM;
+
+  const int status =
+      av1_set_internal_size(ctx->cpi, (AOM_SCALING)mode->h_scaling_mode,
+                            (AOM_SCALING)mode->v_scaling_mode);
+  if (status != 0) return AOM_CODEC_INVALID_PARAM;
+  return AOM_CODEC_OK;
+}
+
+// AOME_SET_SPATIAL_LAYER_ID control: stores the int control argument as the
+// encoder's current spatial layer id.
+//
+// Values above MAX_NUM_ENHANCEMENT_LAYERS are rejected, and so are negative
+// values: a layer id is an index, and a negative one would otherwise be stored
+// unchecked.
+//
+// Returns AOM_CODEC_INVALID_PARAM for an out-of-range id, AOM_CODEC_OK
+// otherwise.
+static aom_codec_err_t ctrl_set_spatial_layer_id(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  const int spatial_layer_id = va_arg(args, int);
+  if (spatial_layer_id < 0 || spatial_layer_id > MAX_NUM_ENHANCEMENT_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.spatial_layer_id = spatial_layer_id;
+  return AOM_CODEC_OK;
+}
+
+// AOME_SET_NUMBER_SPATIAL_LAYERS control: stores the int control argument as
+// the encoder's number of spatial layers.
+//
+// Values above MAX_NUM_ENHANCEMENT_LAYERS are rejected, and so are negative
+// values: a count cannot be negative, and one would otherwise be stored
+// unchecked.
+//
+// Returns AOM_CODEC_INVALID_PARAM for an out-of-range count, AOM_CODEC_OK
+// otherwise.
+static aom_codec_err_t ctrl_set_number_spatial_layers(aom_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  const int number_spatial_layers = va_arg(args, int);
+  if (number_spatial_layers < 0 ||
+      number_spatial_layers > MAX_NUM_ENHANCEMENT_LAYERS)
+    return AOM_CODEC_INVALID_PARAM;
+  ctx->cpi->common.number_spatial_layers = number_spatial_layers;
+  return AOM_CODEC_OK;
+}
+
+// AV1E_SET_TUNE_CONTENT control: sets content on a copy of ctx->extra_cfg and
+// returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_tune_content(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.content = CAST(AV1E_SET_TUNE_CONTENT, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_CDF_UPDATE_MODE control: sets cdf_update_mode on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_cdf_update_mode(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.cdf_update_mode = CAST(AV1E_SET_CDF_UPDATE_MODE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_COLOR_PRIMARIES control: sets color_primaries on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_color_primaries(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.color_primaries = CAST(AV1E_SET_COLOR_PRIMARIES, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_TRANSFER_CHARACTERISTICS control: sets transfer_characteristics on a
+// copy of ctx->extra_cfg and returns the status of update_extra_cfg() for that
+// copy.
+static aom_codec_err_t ctrl_set_transfer_characteristics(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.transfer_characteristics =
+      CAST(AV1E_SET_TRANSFER_CHARACTERISTICS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_MATRIX_COEFFICIENTS control: sets matrix_coefficients on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_matrix_coefficients(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.matrix_coefficients = CAST(AV1E_SET_MATRIX_COEFFICIENTS, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_CHROMA_SAMPLE_POSITION control: sets chroma_sample_position on a
+// copy of ctx->extra_cfg and returns the status of update_extra_cfg() for that
+// copy.
+static aom_codec_err_t ctrl_set_chroma_sample_position(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.chroma_sample_position = CAST(AV1E_SET_CHROMA_SAMPLE_POSITION, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_COLOR_RANGE control: sets color_range on a copy of ctx->extra_cfg
+// and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_color_range(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.color_range = CAST(AV1E_SET_COLOR_RANGE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_RENDER_SIZE control. The argument is a pointer to two ints:
+// element 0 is stored as render_width and element 1 as render_height on a copy
+// of ctx->extra_cfg, which is then passed to update_extra_cfg().
+//
+// A NULL pointer is rejected with AOM_CODEC_INVALID_PARAM, matching the other
+// pointer-taking control handlers in this file, instead of being dereferenced.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, otherwise the status of
+// update_extra_cfg().
+static aom_codec_err_t ctrl_set_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  struct av1_extracfg extra_cfg = ctx->extra_cfg;
+  int *const render_size = va_arg(args, int *);
+  if (render_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  extra_cfg.render_width = render_size[0];
+  extra_cfg.render_height = render_size[1];
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+// AV1E_SET_SUPERBLOCK_SIZE control: sets superblock_size on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_superblock_size(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.superblock_size = CAST(AV1E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_CHROMA_SUBSAMPLING_X control: sets chroma_subsampling_x on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_chroma_subsampling_x(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.chroma_subsampling_x = CAST(AV1E_SET_CHROMA_SUBSAMPLING_X, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+// AV1E_SET_CHROMA_SUBSAMPLING_Y control: sets chroma_subsampling_y on a copy of
+// ctx->extra_cfg and returns the status of update_extra_cfg() for that copy.
+static aom_codec_err_t ctrl_set_chroma_subsampling_y(
+    aom_codec_alg_priv_t *ctx, va_list args) {
+  struct av1_extracfg new_cfg = ctx->extra_cfg;
+  new_cfg.chroma_subsampling_y = CAST(AV1E_SET_CHROMA_SUBSAMPLING_Y, args);
+  return update_extra_cfg(ctx, &new_cfg);
+}
+
+static aom_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {  // control id -> handler pairs; the { -1, NULL } entry ends the table
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+ { AOME_USE_REFERENCE, ctrl_use_reference },
+
+ // Setters
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_previewpp },  // handler always returns AOM_CODEC_INCAPABLE
+ { AOME_SET_ROI_MAP, ctrl_set_roi_map },  // handler is a stub: always AOM_CODEC_INVALID_PARAM
+ { AOME_SET_ACTIVEMAP, ctrl_set_active_map },
+ { AOME_SET_SCALEMODE, ctrl_set_scale_mode },
+ { AOME_SET_SPATIAL_LAYER_ID, ctrl_set_spatial_layer_id },
+ { AOME_SET_CPUUSED, ctrl_set_cpuused },
+ { AOME_SET_ENABLEAUTOALTREF, ctrl_set_enable_auto_alt_ref },
+ { AOME_SET_ENABLEAUTOBWDREF, ctrl_set_enable_auto_bwd_ref },
+ { AOME_SET_SHARPNESS, ctrl_set_sharpness },
+ { AOME_SET_STATIC_THRESHOLD, ctrl_set_static_thresh },
+ { AV1E_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1E_SET_TILE_COLUMNS, ctrl_set_tile_columns },
+ { AV1E_SET_TILE_ROWS, ctrl_set_tile_rows },
+ { AOME_SET_ARNR_MAXFRAMES, ctrl_set_arnr_max_frames },
+ { AOME_SET_ARNR_STRENGTH, ctrl_set_arnr_strength },
+ { AOME_SET_TUNING, ctrl_set_tuning },
+ { AOME_SET_CQ_LEVEL, ctrl_set_cq_level },
+ { AOME_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_rc_max_intra_bitrate_pct },
+ { AOME_SET_NUMBER_SPATIAL_LAYERS, ctrl_set_number_spatial_layers },
+ { AV1E_SET_MAX_INTER_BITRATE_PCT, ctrl_set_rc_max_inter_bitrate_pct },  // handler reads its argument type via the AOME_-prefixed id
+ { AV1E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+ { AV1E_SET_LOSSLESS, ctrl_set_lossless },
+ { AV1E_SET_ENABLE_CDEF, ctrl_set_enable_cdef },
+ { AV1E_SET_ENABLE_RESTORATION, ctrl_set_enable_restoration },
+ { AV1E_SET_DISABLE_TRELLIS_QUANT, ctrl_set_disable_trellis_quant },
+ { AV1E_SET_ENABLE_QM, ctrl_set_enable_qm },
+ { AV1E_SET_QM_Y, ctrl_set_qm_y },
+ { AV1E_SET_QM_U, ctrl_set_qm_u },
+ { AV1E_SET_QM_V, ctrl_set_qm_v },
+ { AV1E_SET_QM_MIN, ctrl_set_qm_min },
+ { AV1E_SET_QM_MAX, ctrl_set_qm_max },
+#if CONFIG_DIST_8X8
+ { AV1E_SET_ENABLE_DIST_8X8, ctrl_set_enable_dist_8x8 },
+#endif
+ { AV1E_SET_NUM_TG, ctrl_set_num_tg },
+ { AV1E_SET_MTU, ctrl_set_mtu },
+ { AV1E_SET_TIMING_INFO_TYPE, ctrl_set_timing_info_type },
+ { AV1E_SET_FRAME_PARALLEL_DECODING, ctrl_set_frame_parallel_decoding_mode },
+ { AV1E_SET_ERROR_RESILIENT_MODE, ctrl_set_error_resilient_mode },
+ { AV1E_SET_S_FRAME_MODE, ctrl_set_s_frame_mode },
+ { AV1E_SET_ENABLE_DF, ctrl_set_enable_df },
+ { AV1E_SET_ENABLE_ORDER_HINT, ctrl_set_enable_order_hint },
+ { AV1E_SET_ENABLE_JNT_COMP, ctrl_set_enable_jnt_comp },
+ { AV1E_SET_ENABLE_REF_FRAME_MVS, ctrl_set_enable_ref_frame_mvs },
+ { AV1E_SET_ALLOW_REF_FRAME_MVS, ctrl_set_allow_ref_frame_mvs },
+ { AV1E_SET_ENABLE_WARPED_MOTION, ctrl_set_enable_warped_motion },
+ { AV1E_SET_ALLOW_WARPED_MOTION, ctrl_set_allow_warped_motion },
+ { AV1E_SET_ENABLE_SUPERRES, ctrl_set_enable_superres },
+ { AV1E_SET_AQ_MODE, ctrl_set_aq_mode },
+ { AV1E_SET_DELTAQ_MODE, ctrl_set_deltaq_mode },
+ { AV1E_SET_FRAME_PERIODIC_BOOST, ctrl_set_frame_periodic_boost },
+ { AV1E_SET_TUNE_CONTENT, ctrl_set_tune_content },
+ { AV1E_SET_CDF_UPDATE_MODE, ctrl_set_cdf_update_mode },
+ { AV1E_SET_COLOR_PRIMARIES, ctrl_set_color_primaries },
+ { AV1E_SET_TRANSFER_CHARACTERISTICS, ctrl_set_transfer_characteristics },
+ { AV1E_SET_MATRIX_COEFFICIENTS, ctrl_set_matrix_coefficients },
+ { AV1E_SET_CHROMA_SAMPLE_POSITION, ctrl_set_chroma_sample_position },
+ { AV1E_SET_COLOR_RANGE, ctrl_set_color_range },
+ { AV1E_SET_NOISE_SENSITIVITY, ctrl_set_noise_sensitivity },
+ { AV1E_SET_MIN_GF_INTERVAL, ctrl_set_min_gf_interval },
+ { AV1E_SET_MAX_GF_INTERVAL, ctrl_set_max_gf_interval },
+ { AV1E_SET_RENDER_SIZE, ctrl_set_render_size },
+ { AV1E_SET_SUPERBLOCK_SIZE, ctrl_set_superblock_size },
+ { AV1E_SET_SINGLE_TILE_DECODING, ctrl_set_single_tile_decoding },
+ { AV1E_SET_FILM_GRAIN_TEST_VECTOR, ctrl_set_film_grain_test_vector },
+ { AV1E_SET_FILM_GRAIN_TABLE, ctrl_set_film_grain_table },
+#if CONFIG_DENOISE
+ { AV1E_SET_DENOISE_NOISE_LEVEL, ctrl_set_denoise_noise_level },
+ { AV1E_SET_DENOISE_BLOCK_SIZE, ctrl_set_denoise_block_size },
+#endif // CONFIG_DENOISE
+ { AV1E_ENABLE_MOTION_VECTOR_UNIT_TEST, ctrl_enable_motion_vector_unit_test },
+
+ // Getters (the two AV1E_SET_CHROMA_SUBSAMPLING_* entries at the end are setters)
+ { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer },
+ { AOME_GET_LAST_QUANTIZER_64, ctrl_get_quantizer64 },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1E_GET_ACTIVEMAP, ctrl_get_active_map },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1E_SET_CHROMA_SUBSAMPLING_X, ctrl_set_chroma_subsampling_x },
+ { AV1E_SET_CHROMA_SUBSAMPLING_Y, ctrl_set_chroma_subsampling_y },
+ { -1, NULL },  // end-of-table marker
+};
+
+static aom_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {  // default encoder configuration; one entry only
+ { 0,  // presumably the usage value this entry applies to - confirm against aom_codec_enc_cfg_map_t
+ {
+ // NOLINT
+ 0, // g_usage
+ 0, // g_threads
+ 0, // g_profile
+
+ 320, // g_w
+ 240, // g_h
+ 0, // g_limit
+ 0, // g_forced_max_frame_width
+ 0, // g_forced_max_frame_height
+ AOM_BITS_8, // g_bit_depth
+ 8, // g_input_bit_depth
+
+ { 1, 30 }, // g_timebase
+
+ 0, // g_error_resilient
+
+ AOM_RC_ONE_PASS, // g_pass
+
+ 19, // g_lag_in_frames
+
+ 0, // rc_dropframe_thresh
+ RESIZE_NONE, // rc_resize_mode
+ SCALE_NUMERATOR, // rc_resize_denominator
+ SCALE_NUMERATOR, // rc_resize_kf_denominator
+
+ 0, // rc_superres_mode
+ SCALE_NUMERATOR, // rc_superres_denominator
+ SCALE_NUMERATOR, // rc_superres_kf_denominator
+ 63, // rc_superres_qthresh
+ 63, // rc_superres_kf_qthresh
+
+ AOM_VBR, // rc_end_usage
+ { NULL, 0 }, // rc_twopass_stats_in
+ { NULL, 0 }, // rc_firstpass_mb_stats_in
+ 256, // rc_target_bandwidth
+ 0, // rc_min_quantizer
+ 63, // rc_max_quantizer
+ 25, // rc_undershoot_pct
+ 25, // rc_overshoot_pct
+
+ 6000, // rc_max_buffer_size
+ 4000, // rc_buffer_initial_size
+ 5000, // rc_buffer_optimal_size
+
+ 50, // rc_two_pass_vbrbias
+ 0, // rc_two_pass_vbrmin_section
+ 2000, // rc_two_pass_vbrmax_section
+
+ // keyframing settings (kf)
+ 0, // fwd_kf_enabled
+ AOM_KF_AUTO, // kf_mode
+ 0, // kf_min_dist
+ 9999, // kf_max_dist
+ 0, // sframe_dist
+ 1, // sframe_mode
+ 0, // large_scale_tile
+ 0, // monochrome
+ 0, // full_still_picture_hdr
+ 0, // save_as_annexb
+ 0, // tile_width_count
+ 0, // tile_height_count
+ { 0 }, // tile_widths
+ { 0 }, // tile_heights
+ { 1 }, // config file
+ } },
+};
+
+#ifndef VERSION_STRING
+#define VERSION_STRING  // empty fallback so the string concatenation below still compiles
+#endif
+CODEC_INTERFACE(aom_codec_av1_cx) = {  // interface descriptor for the AV1 encoder
+ "AOMedia Project AV1 Encoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_HIGHBITDEPTH | AOM_CODEC_CAP_ENCODER |
+ AOM_CODEC_CAP_PSNR, // aom_codec_caps_t
+ encoder_init, // aom_codec_init_fn_t
+ encoder_destroy, // aom_codec_destroy_fn_t
+ encoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {  // decoder hooks: all NULL for the encoder interface
+ // NOLINT
+ NULL, // aom_codec_peek_si_fn_t
+ NULL, // aom_codec_get_si_fn_t
+ NULL, // aom_codec_decode_fn_t
+ NULL, // aom_codec_get_frame_fn_t
+ NULL // aom_codec_set_fb_fn_t
+ },
+ {  // encoder hooks
+ // NOLINT
+ 1, // number of entries in encoder_usage_cfg_map
+ encoder_usage_cfg_map, // aom_codec_enc_cfg_map_t
+ encoder_encode, // aom_codec_encode_fn_t
+ encoder_get_cxdata, // aom_codec_get_cx_data_fn_t
+ encoder_set_config, // aom_codec_enc_config_set_fn_t
+ encoder_get_global_headers, // aom_codec_get_global_headers_fn_t
+ encoder_get_preview, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
diff --git a/third_party/aom/av1/av1_dx_iface.c b/third_party/aom/av1/av1_dx_iface.c
new file mode 100644
index 0000000000..4a66310471
--- /dev/null
+++ b/third_party/aom/av1/av1_dx_iface.c
@@ -0,0 +1,1328 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_version.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom/aomdx.h"
+#include "aom/aom_decoder.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/enums.h"
+#include "av1/common/obu_util.h"
+
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+#include "av1/av1_iface_common.h"
+/*
+ * Private state of the AV1 decoder interface. It holds the settings supplied
+ * through the control functions in this file, the frame worker bookkeeping
+ * and the output image handed back by decoder_get_frame(). |base| is the
+ * first member; decoder_init() stores this structure in ctx->priv by casting
+ * it to aom_codec_priv_t.
+ */
+struct aom_codec_alg_priv {
+ aom_codec_priv_t base;
+ aom_codec_dec_cfg_t cfg; // private copy of the caller's config (see decoder_init)
+ aom_codec_stream_info_t si; // filled by decoder_peek_si_internal() in decode_one()
+ int postproc_cfg_set;
+ aom_postproc_cfg_t postproc_cfg;
+ aom_image_t img; // image descriptor filled and returned by decoder_get_frame()
+ int img_avail;
+ int flushed; // set to 1 when decoder_decode() is called with NULL data and zero size
+ int invert_tile_order;
+ int last_show_frame; // Index of last output frame.
+ int byte_alignment;
+ int skip_loop_filter;
+ int skip_film_grain;
+ int decode_tile_row; // -1 by default (see decoder_init)
+ int decode_tile_col; // -1 by default (see decoder_init)
+ unsigned int tile_mode; // 0: normal tile coding, 1: large scale tile coding
+ unsigned int ext_tile_debug;
+ unsigned int row_mt; // set to 1 by decoder_init()
+ EXTERNAL_REFERENCES ext_refs;
+ unsigned int is_annexb; // non-zero: decoder_decode() parses length-prefixed units
+ int operating_point;
+ int output_all_layers;
+
+ AVxWorker *frame_workers; // NULL until init_decoder() has run
+ int num_frame_workers;
+ int next_submit_worker_id;
+ int last_submit_worker_id;
+ int next_output_worker_id;
+ int available_threads;
+ aom_image_t *image_with_grain[MAX_NUM_SPATIAL_LAYERS]; // film grain output buffers, freed in decoder_destroy()
+ int need_resync; // wait for key/intra-only frame
+ // BufferPool that holds all reference frames. Shared by all the FrameWorkers.
+ BufferPool *buffer_pool;
+
+ // External frame buffer info to save for AV1 common.
+ void *ext_priv; // Private data associated with the external frame buffers.
+ aom_get_frame_buffer_cb_fn_t get_ext_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_ext_fb_cb;
+
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+};
+
+// Allocates the zero-initialized private decoder context, attaches it to |ctx|
+// and applies the default settings. Does nothing when |ctx| already owns a
+// private context. |data| is unused.
+// Returns AOM_CODEC_MEM_ERROR when the allocation fails, AOM_CODEC_OK
+// otherwise.
+static aom_codec_err_t decoder_init(aom_codec_ctx_t *ctx,
+                                    aom_codec_priv_enc_mr_cfg_t *data) {
+  (void)data;
+
+  // Only the aom_codec_alg_priv_t structure is allocated here. More memory
+  // may be required once the stream information becomes known.
+  if (ctx->priv != NULL) return AOM_CODEC_OK;
+
+  aom_codec_alg_priv_t *const alg_priv =
+      (aom_codec_alg_priv_t *)aom_calloc(1, sizeof(*alg_priv));
+  if (!alg_priv) return AOM_CODEC_MEM_ERROR;
+
+  ctx->priv = (aom_codec_priv_t *)alg_priv;
+  ctx->priv->init_flags = ctx->init_flags;
+  alg_priv->flushed = 0;
+
+  // TODO(tdaede): this should not be exposed to the API
+  alg_priv->cfg.allow_lowbitdepth = CONFIG_LOWBITDEPTH;
+  if (ctx->config.dec != NULL) {
+    // Keep a private copy of the caller's configuration and make the context
+    // point at that copy.
+    alg_priv->cfg = *ctx->config.dec;
+    ctx->config.dec = &alg_priv->cfg;
+    alg_priv->cfg.cfg.ext_partition = 1;  // default value
+  }
+  av1_zero(alg_priv->image_with_grain);
+
+  // Row multi-threading is on by default.
+  alg_priv->row_mt = 1;
+
+  // Normal tile coding mode by default: 0 selects normal tile coding and 1
+  // selects large scale tile coding (refer to the lightfield example).
+  alg_priv->tile_mode = 0;
+  alg_priv->decode_tile_row = -1;
+  alg_priv->decode_tile_col = -1;
+
+  return AOM_CODEC_OK;
+}
+
+// Releases everything owned by the decoder context: the frame workers and
+// their decoder instances, the shared buffer pool, the film grain output
+// images and finally |ctx| itself. Always returns AOM_CODEC_OK.
+//
+// init_decoder() can return early after a failed allocation, leaving a worker
+// whose data1 is NULL or whose decoder instance (pbi) is NULL. Those cases
+// are skipped here instead of being dereferenced.
+static aom_codec_err_t decoder_destroy(aom_codec_alg_priv_t *ctx) {
+  if (ctx->frame_workers != NULL) {
+    for (int i = 0; i < ctx->num_frame_workers; ++i) {
+      AVxWorker *const worker = &ctx->frame_workers[i];
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      aom_get_worker_interface()->end(worker);
+      if (frame_worker_data == NULL) continue;
+      // The scratch buffer and the stats mutex/condition are only set up by
+      // init_decoder() after the decoder instance was created, so they are
+      // torn down only when that instance exists.
+      if (frame_worker_data->pbi != NULL) {
+        aom_free(frame_worker_data->pbi->common.tpl_mvs);
+        frame_worker_data->pbi->common.tpl_mvs = NULL;
+        av1_remove_common(&frame_worker_data->pbi->common);
+        av1_free_restoration_buffers(&frame_worker_data->pbi->common);
+        av1_decoder_remove(frame_worker_data->pbi);
+        aom_free(frame_worker_data->scratch_buffer);
+#if CONFIG_MULTITHREAD
+        pthread_mutex_destroy(&frame_worker_data->stats_mutex);
+        pthread_cond_destroy(&frame_worker_data->stats_cond);
+#endif
+      }
+      aom_free(frame_worker_data);
+    }
+#if CONFIG_MULTITHREAD
+    if (ctx->buffer_pool != NULL)
+      pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex);
+#endif
+  }
+
+  if (ctx->buffer_pool) {
+    av1_free_ref_frame_buffers(ctx->buffer_pool);
+    av1_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers);
+  }
+
+  aom_free(ctx->frame_workers);
+  aom_free(ctx->buffer_pool);
+  for (int i = 0; i < MAX_NUM_SPATIAL_LAYERS; i++) {
+    if (ctx->image_with_grain[i]) aom_img_free(ctx->image_with_grain[i]);
+  }
+  aom_free(ctx);
+  return AOM_CODEC_OK;
+}
+
+// Consumes the operating point fields from |rb|. With a reduced header only a
+// level is read. Otherwise the operating point count is read, followed by
+// operating_point_idc, seq_level_idx and (for levels above 7) a tier bit for
+// every operating point. si->number_spatial_layers and
+// si->number_temporal_layers are then derived from operating_point_idc[0],
+// which stays 0 for a reduced header.
+// Returns AOM_CODEC_ERROR if the layer counts cannot be derived,
+// AOM_CODEC_OK otherwise.
+static aom_codec_err_t parse_operating_points(struct aom_read_bit_buffer *rb,
+                                              int is_reduced_header,
+                                              aom_codec_stream_info_t *si) {
+  int first_idc = 0;
+
+  if (!is_reduced_header) {
+    const uint8_t cnt_minus_1 =
+        aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+    for (int op = 0; op <= cnt_minus_1; ++op) {
+      const int idc = aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+      if (op == 0) first_idc = idc;
+      const int level_idx = aom_rb_read_literal(rb, LEVEL_BITS);  // level
+      if (level_idx > 7) aom_rb_read_bit(rb);                     // tier
+    }
+  } else {
+    aom_rb_read_literal(rb, LEVEL_BITS);  // level
+  }
+
+  return (aom_get_num_layers_from_operating_point_idc(
+              first_idc, &si->number_spatial_layers,
+              &si->number_temporal_layers) == AOM_CODEC_OK)
+             ? AOM_CODEC_OK
+             : AOM_CODEC_ERROR;
+}
+/*
+ * Scans the OBUs at the start of |data| and fills in |si|. si->w, si->h and
+ * si->is_kf are reset first. si->w / si->h are then taken from the max frame
+ * width / height fields of each sequence header OBU encountered. si->is_kf
+ * is set to 1 only when a sequence header was seen and a frame header or
+ * frame OBU is either a key frame or follows a reduced still picture header.
+ * A leading temporal delimiter OBU is skipped. si->is_annexb is read, so the
+ * caller must set it before the call.
+ *
+ * Returns AOM_CODEC_INVALID_PARAM for empty input, AOM_CODEC_CORRUPT_FRAME
+ * when an OBU claims more bytes than remain, AOM_CODEC_UNSUP_BITSTREAM for a
+ * reduced still picture header without still_picture, AOM_CODEC_ERROR when
+ * the operating points cannot be parsed, any error reported by
+ * aom_read_obu_header_and_size(), and AOM_CODEC_OK otherwise.
+ * When |is_intra_only| is non-NULL it receives intra_only_flag, which this
+ * function never sets to a non-zero value.
+ */
+static aom_codec_err_t decoder_peek_si_internal(const uint8_t *data,
+ size_t data_sz,
+ aom_codec_stream_info_t *si,
+ int *is_intra_only) {
+ int intra_only_flag = 0; // never modified below
+ int got_sequence_header = 0;
+ int found_keyframe = 0;
+
+ if (data + data_sz <= data || data_sz < 1) return AOM_CODEC_INVALID_PARAM; // empty input or wrapped end pointer
+
+ si->w = 0;
+ si->h = 0;
+ si->is_kf = 0; // is_kf indicates whether the current packet contains a RAP
+
+ ObuHeader obu_header;
+ memset(&obu_header, 0, sizeof(obu_header));
+ size_t payload_size = 0;
+ size_t bytes_read = 0;
+ int reduced_still_picture_hdr = 0;
+ aom_codec_err_t status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+
+ // If the first OBU is a temporal delimiter, skip over it and look at the next
+ // OBU in the bitstream
+ if (obu_header.type == OBU_TEMPORAL_DELIMITER) {
+ // Skip any associated payload (there shouldn't be one, but just in case)
+ if (data_sz < bytes_read + payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ data += bytes_read + payload_size;
+ data_sz -= bytes_read + payload_size;
+
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ while (1) {
+ data += bytes_read; // step over the OBU header that was just parsed
+ data_sz -= bytes_read;
+ if (data_sz < payload_size) return AOM_CODEC_CORRUPT_FRAME;
+ // Check that the selected OBU is a sequence header
+ if (obu_header.type == OBU_SEQUENCE_HEADER) {
+ // Sanity check on sequence header size
+ if (data_sz < 2) return AOM_CODEC_CORRUPT_FRAME;
+ // Read a few values from the sequence header payload
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+
+ av1_read_profile(&rb); // profile
+ const int still_picture = aom_rb_read_bit(&rb);
+ reduced_still_picture_hdr = aom_rb_read_bit(&rb);
+
+ if (!still_picture && reduced_still_picture_hdr) {
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (parse_operating_points(&rb, reduced_still_picture_hdr, si) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ int num_bits_width = aom_rb_read_literal(&rb, 4) + 1;
+ int num_bits_height = aom_rb_read_literal(&rb, 4) + 1;
+ int max_frame_width = aom_rb_read_literal(&rb, num_bits_width) + 1;
+ int max_frame_height = aom_rb_read_literal(&rb, num_bits_height) + 1;
+ si->w = max_frame_width;
+ si->h = max_frame_height;
+ got_sequence_header = 1;
+ } else if (obu_header.type == OBU_FRAME_HEADER ||
+ obu_header.type == OBU_FRAME) {
+ if (got_sequence_header && reduced_still_picture_hdr) {
+ found_keyframe = 1;
+ break;
+ } else {
+ // make sure we have enough bits to get the frame type out
+ if (data_sz < 1) return AOM_CODEC_CORRUPT_FRAME;
+ struct aom_read_bit_buffer rb = { data, data + data_sz, 0, NULL, NULL };
+ const int show_existing_frame = aom_rb_read_bit(&rb);
+ if (!show_existing_frame) {
+ const FRAME_TYPE frame_type = (FRAME_TYPE)aom_rb_read_literal(&rb, 2);
+ if (frame_type == KEY_FRAME) {
+ found_keyframe = 1;
+ break; // Stop here as no further OBUs will change the outcome.
+ }
+ }
+ }
+ }
+ // skip past this OBU's payload to reach the next OBU header
+ data += payload_size;
+ data_sz -= payload_size;
+ if (data_sz == 0) break; // exit if we're out of OBUs
+ status = aom_read_obu_header_and_size(
+ data, data_sz, si->is_annexb, &obu_header, &payload_size, &bytes_read);
+ if (status != AOM_CODEC_OK) return status;
+ }
+ if (got_sequence_header && found_keyframe) si->is_kf = 1;
+ if (is_intra_only != NULL) *is_intra_only = intra_only_flag;
+ return AOM_CODEC_OK;
+}
+
+// Public peek entry point: fills |si| from the start of |data| without
+// requesting the intra-only flag. Returns the status of
+// decoder_peek_si_internal().
+static aom_codec_err_t decoder_peek_si(const uint8_t *data, size_t data_sz,
+                                       aom_codec_stream_info_t *si) {
+  int *const no_intra_only_out = NULL;
+  return decoder_peek_si_internal(data, data_sz, si, no_intra_only_out);
+}
+
+// Copies the stream information cached in |ctx| into |si|.
+// Always returns AOM_CODEC_OK.
+static aom_codec_err_t decoder_get_si(aom_codec_alg_priv_t *ctx,
+                                      aom_codec_stream_info_t *si) {
+  const aom_codec_stream_info_t *const cached = &ctx->si;
+  memcpy(si, cached, sizeof(*si));
+  return AOM_CODEC_OK;
+}
+
+static void set_error_detail(aom_codec_alg_priv_t *ctx,
+ const char *const error) {
+ ctx->base.err_detail = error;
+}
+
+static aom_codec_err_t update_error_state(
+ aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) {
+ if (error->error_code)
+ set_error_detail(ctx, error->has_detail ? error->detail : NULL);
+
+ return error->error_code;
+}
+
+// Copies the per-decoder settings (byte alignment, loop filter and film grain
+// skipping) into every frame worker and installs the frame buffer callbacks
+// on its buffer pool. The application's callbacks are used when both were
+// supplied. Otherwise the internal frame buffer list is allocated and the
+// built-in callbacks are installed.
+static void init_buffer_callbacks(aom_codec_alg_priv_t *ctx) {
+  const int use_ext_fb =
+      (ctx->get_ext_fb_cb != NULL) && (ctx->release_ext_fb_cb != NULL);
+
+  for (int w = 0; w < ctx->num_frame_workers; ++w) {
+    FrameWorkerData *const fwd =
+        (FrameWorkerData *)ctx->frame_workers[w].data1;
+    AV1_COMMON *const cm = &fwd->pbi->common;
+    BufferPool *const pool = cm->buffer_pool;
+
+    cm->new_fb_idx = INVALID_IDX;
+    cm->byte_alignment = ctx->byte_alignment;
+    cm->skip_loop_filter = ctx->skip_loop_filter;
+    cm->skip_film_grain = ctx->skip_film_grain;
+
+    if (!use_ext_fb) {
+      pool->get_fb_cb = av1_get_frame_buffer;
+      pool->release_fb_cb = av1_release_frame_buffer;
+
+      if (av1_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+        aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                           "Failed to initialize internal frame buffers");
+
+      pool->cb_priv = &pool->int_frame_buffers;
+      continue;
+    }
+
+    pool->get_fb_cb = ctx->get_ext_fb_cb;
+    pool->release_fb_cb = ctx->release_ext_fb_cb;
+    pool->cb_priv = ctx->ext_priv;
+  }
+}
+
+// Fills |cfg| with the default post-processing configuration: deblocking and
+// demacroblocking enabled, deblocking level 4, noise level 0.
+static void set_default_ppflags(aom_postproc_cfg_t *cfg) {
+  cfg->noise_level = 0;
+  cfg->deblocking_level = 4;
+  cfg->post_proc_flag = AOM_DEBLOCK | AOM_DEMACROBLOCK;
+}
+
+// Worker hook: decodes the compressed data stored in the FrameWorkerData
+// passed as |arg1| and records where decoding stopped in data_end. On failure
+// the current buffer is marked corrupted and a resync is requested.
+// |arg2| is unused. Returns 1 on success and 0 on failure.
+static int frame_worker_hook(void *arg1, void *arg2) {
+  (void)arg2;
+  FrameWorkerData *const fwd = (FrameWorkerData *)arg1;
+  const uint8_t *cursor = fwd->data;
+
+  const int status =
+      av1_receive_compressed_data(fwd->pbi, fwd->data_size, &cursor);
+  fwd->data_end = cursor;
+
+  if (status == 0) return 1;
+
+  // Check decode result in serial decode.
+  fwd->pbi->cur_buf->buf.corrupted = 1;
+  fwd->pbi->need_resync = 1;
+  return 0;
+}
+/*
+ * One-time decoder setup, called from decoder_decode() when no frame workers
+ * exist yet. Resets the worker bookkeeping, allocates the shared BufferPool
+ * and the frame worker array (a single worker), creates each worker's
+ * decoder instance and copies the settings stored in |ctx| into it, then
+ * installs the frame buffer callbacks.
+ *
+ * Returns AOM_CODEC_MEM_ERROR on any allocation or initialization failure and
+ * AOM_CODEC_OK otherwise.
+ * NOTE(review): the failure paths return without releasing what this function
+ * already allocated, and worker->data1 comes from aom_memalign() without
+ * being cleared -- presumably decoder_destroy() is expected to clean up;
+ * confirm it copes with a partially initialized worker.
+ */
+static aom_codec_err_t init_decoder(aom_codec_alg_priv_t *ctx) {
+ int i;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ ctx->last_show_frame = -1;
+ ctx->next_submit_worker_id = 0;
+ ctx->last_submit_worker_id = 0;
+ ctx->next_output_worker_id = 0;
+ ctx->need_resync = 1;
+ ctx->num_frame_workers = 1; // a single frame worker is used
+ if (ctx->num_frame_workers > MAX_DECODE_THREADS)
+ ctx->num_frame_workers = MAX_DECODE_THREADS;
+ ctx->available_threads = ctx->num_frame_workers;
+ ctx->flushed = 0;
+
+ ctx->buffer_pool = (BufferPool *)aom_calloc(1, sizeof(BufferPool));
+ if (ctx->buffer_pool == NULL) return AOM_CODEC_MEM_ERROR;
+
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate buffer pool mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+
+ ctx->frame_workers = (AVxWorker *)aom_malloc(ctx->num_frame_workers *
+ sizeof(*ctx->frame_workers));
+ if (ctx->frame_workers == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_workers");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ for (i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *frame_worker_data = NULL;
+ winterface->init(worker);
+ worker->data1 = aom_memalign(32, sizeof(FrameWorkerData));
+ if (worker->data1 == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data = (FrameWorkerData *)worker->data1;
+ frame_worker_data->pbi = av1_decoder_create(ctx->buffer_pool);
+ if (frame_worker_data->pbi == NULL) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data"); // same text as the data1 failure above
+ return AOM_CODEC_MEM_ERROR;
+ }
+ frame_worker_data->pbi->common.options = &ctx->cfg.cfg;
+ frame_worker_data->pbi->frame_worker_owner = worker;
+ frame_worker_data->worker_id = i;
+ frame_worker_data->scratch_buffer = NULL;
+ frame_worker_data->scratch_buffer_size = 0;
+ frame_worker_data->frame_context_ready = 0;
+ frame_worker_data->received_frame = 0;
+#if CONFIG_MULTITHREAD
+ if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data mutex");
+ return AOM_CODEC_MEM_ERROR;
+ }
+
+ if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) {
+ set_error_detail(ctx, "Failed to allocate frame_worker_data cond");
+ return AOM_CODEC_MEM_ERROR;
+ }
+#endif
+ frame_worker_data->pbi->allow_lowbitdepth = ctx->cfg.allow_lowbitdepth;
+
+ // If decoding in serial mode, FrameWorker thread could create tile worker
+ // thread or loopfilter thread.
+ frame_worker_data->pbi->max_threads = ctx->cfg.threads;
+ frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order;
+ frame_worker_data->pbi->common.large_scale_tile = ctx->tile_mode;
+ frame_worker_data->pbi->common.is_annexb = ctx->is_annexb;
+ frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+ frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+ frame_worker_data->pbi->operating_point = ctx->operating_point;
+ frame_worker_data->pbi->output_all_layers = ctx->output_all_layers;
+ frame_worker_data->pbi->ext_tile_debug = ctx->ext_tile_debug;
+ frame_worker_data->pbi->row_mt = ctx->row_mt;
+
+ worker->hook = frame_worker_hook;
+ if (!winterface->reset(worker)) {
+ set_error_detail(ctx, "Frame Worker thread creation failed");
+ return AOM_CODEC_MEM_ERROR;
+ }
+ }
+
+ // If postprocessing was enabled by the application and a
+ // configuration has not been provided, default it.
+ if (!ctx->postproc_cfg_set && (ctx->base.init_flags & AOM_CODEC_USE_POSTPROC))
+ set_default_ppflags(&ctx->postproc_cfg);
+
+ init_buffer_callbacks(ctx);
+
+ return AOM_CODEC_OK;
+}
+
+// Clears the context's resync flag once the worker's decoder no longer needs
+// a resync and its current frame is a key frame or an intra-only frame.
+static INLINE void check_resync(aom_codec_alg_priv_t *const ctx,
+                                const AV1Decoder *const pbi) {
+  if (ctx->need_resync != 1 || pbi->need_resync != 0) return;
+
+  if (pbi->common.frame_type == KEY_FRAME || pbi->common.intra_only)
+    ctx->need_resync = 0;
+}
+
+// Decodes one unit of |data_sz| bytes starting at |*data| on the first frame
+// worker. While the stream height is still unknown, the stream information is
+// peeked first and decoding is refused (AOM_CODEC_ERROR) unless the data
+// starts with a key frame. On return |*data| points to where the decoder
+// stopped reading.
+// Returns the peek status on a peek failure, the decoder's recorded error
+// when the worker reports one, and AOM_CODEC_OK otherwise.
+static aom_codec_err_t decode_one(aom_codec_alg_priv_t *ctx,
+                                  const uint8_t **data, size_t data_sz,
+                                  void *user_priv) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+  // Determine the stream parameters. Note that we rely on peek_si to
+  // validate that we have a buffer that does not wrap around the top
+  // of the heap.
+  if (ctx->si.h == 0) {
+    int intra_only = 0;
+    ctx->si.is_annexb = ctx->is_annexb;
+    const aom_codec_err_t peek_status =
+        decoder_peek_si_internal(*data, data_sz, &ctx->si, &intra_only);
+    if (peek_status != AOM_CODEC_OK) return peek_status;
+
+    if (!(ctx->si.is_kf || intra_only)) return AOM_CODEC_ERROR;
+  }
+
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const fwd = (FrameWorkerData *)worker->data1;
+
+  // Describe the input to the worker.
+  fwd->data = *data;
+  fwd->data_size = data_sz;
+  fwd->user_priv = user_priv;
+  fwd->received_frame = 1;
+
+#if CONFIG_INSPECTION
+  fwd->pbi->inspect_cb = ctx->inspect_cb;
+  fwd->pbi->inspect_ctx = ctx->inspect_ctx;
+#endif
+
+  // Refresh the settings that may have been changed through controls since
+  // the previous call.
+  fwd->pbi->common.large_scale_tile = ctx->tile_mode;
+  fwd->pbi->dec_tile_row = ctx->decode_tile_row;
+  fwd->pbi->dec_tile_col = ctx->decode_tile_col;
+  fwd->pbi->ext_tile_debug = ctx->ext_tile_debug;
+  fwd->pbi->row_mt = ctx->row_mt;
+  fwd->pbi->ext_refs = ctx->ext_refs;
+
+  fwd->pbi->common.is_annexb = ctx->is_annexb;
+
+  worker->had_error = 0;
+  winterface->execute(worker);
+
+  // Update data pointer after decode.
+  *data = fwd->data_end;
+
+  if (worker->had_error)
+    return update_error_state(ctx, &fwd->pbi->common.error);
+
+  check_resync(ctx, fwd->pbi);
+
+  return AOM_CODEC_OK;
+}
+/*
+ * Decodes the data in [data, data + data_sz). The references held on the
+ * output frames of the previous call are dropped first. The arguments are
+ * then validated, the decoder is initialized on first use, and the buffer is
+ * fed to decode_one() until it is consumed. With ctx->is_annexb set, the
+ * buffer starts with a ULEB-encoded temporal unit size and every frame unit
+ * is preceded by its own ULEB-encoded size.
+ *
+ * A NULL |data| together with a zero |data_sz| marks the decoder as flushed
+ * and returns AOM_CODEC_OK. If only one of them is NULL/zero the result is
+ * AOM_CODEC_INVALID_PARAM. Malformed size fields return
+ * AOM_CODEC_CORRUPT_FRAME. Otherwise the first failing status from
+ * init_decoder() or decode_one() is returned.
+ */
+static aom_codec_err_t decoder_decode(aom_codec_alg_priv_t *ctx,
+ const uint8_t *data, size_t data_sz,
+ void *user_priv) {
+ aom_codec_err_t res = AOM_CODEC_OK;
+
+ // Release any pending output frames from the previous decoder_decode call.
+ // We need to do this even if the decoder is being flushed or the input
+ // arguments are invalid.
+ if (ctx->frame_workers) {
+ BufferPool *const pool = ctx->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+ lock_buffer_pool(pool);
+ for (int i = 0; i < ctx->num_frame_workers; ++i) {
+ AVxWorker *const worker = &ctx->frame_workers[i];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ struct AV1Decoder *pbi = frame_worker_data->pbi;
+ for (size_t j = 0; j < pbi->num_output_frames; j++) {
+ decrease_ref_count((int)pbi->output_frame_index[j], frame_bufs, pool);
+ }
+ pbi->num_output_frames = 0;
+ }
+ unlock_buffer_pool(ctx->buffer_pool);
+ }
+
+ /* Sanity checks */
+ /* NULL data ptr allowed if data_sz is 0 too */
+ if (data == NULL && data_sz == 0) {
+ ctx->flushed = 1;
+ return AOM_CODEC_OK;
+ }
+ if (data == NULL || data_sz == 0) return AOM_CODEC_INVALID_PARAM;
+
+ // Reset flushed when receiving a valid frame.
+ ctx->flushed = 0;
+
+ // Initialize the decoder workers on the first frame.
+ if (ctx->frame_workers == NULL) {
+ res = init_decoder(ctx);
+ if (res != AOM_CODEC_OK) return res;
+ }
+
+ const uint8_t *data_start = data;
+ const uint8_t *data_end = data + data_sz;
+
+ if (ctx->is_annexb) {
+ // read the size of this temporal unit
+ size_t length_of_size;
+ uint64_t temporal_unit_size;
+ if (aom_uleb_decode(data_start, data_sz, &temporal_unit_size,
+ &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (temporal_unit_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ data_end = data_start + temporal_unit_size; // ignore anything after the temporal unit
+ }
+
+ // Decode in serial mode.
+ while (data_start < data_end) {
+ uint64_t frame_size;
+ if (ctx->is_annexb) {
+ // read the size of this frame unit
+ size_t length_of_size;
+ if (aom_uleb_decode(data_start, (size_t)(data_end - data_start),
+ &frame_size, &length_of_size) != 0) {
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ data_start += length_of_size;
+ if (frame_size > (size_t)(data_end - data_start))
+ return AOM_CODEC_CORRUPT_FRAME;
+ } else {
+ frame_size = (uint64_t)(data_end - data_start); // everything that is left
+ }
+
+ res = decode_one(ctx, &data_start, (size_t)frame_size, user_priv); // advances data_start
+ if (res != AOM_CODEC_OK) return res;
+
+ // Allow extra zero bytes after the frame end
+ while (data_start < data_end) {
+ const uint8_t marker = data_start[0];
+ if (marker) break;
+ ++data_start;
+ }
+ }
+
+ return res;
+}
+
+// Applies film grain to |img| when grain_params->apply_grain is set.
+// Without grain, |img| itself is returned and |*grain_img_ptr| is untouched.
+// With grain, the result is written to the image in |*grain_img_ptr|, which
+// is (re)allocated whenever it is missing or its even-aligned size or format
+// no longer matches |img|, and that image is returned.
+// Returns NULL when the allocation or the grain synthesis fails. In both
+// cases |*grain_img_ptr| is left NULL.
+static aom_image_t *add_grain_if_needed(aom_image_t *img,
+                                        aom_image_t **grain_img_ptr,
+                                        aom_film_grain_t *grain_params) {
+  if (!grain_params->apply_grain) return img;
+
+  const int w_even = ALIGN_POWER_OF_TWO(img->d_w, 1);
+  const int h_even = ALIGN_POWER_OF_TWO(img->d_h, 1);
+  aom_image_t *out = *grain_img_ptr;
+
+  // Drop a cached buffer that does not fit the current image.
+  if (out != NULL) {
+    const int cached_w = ALIGN_POWER_OF_TWO(out->d_w, 1);
+    const int cached_h = ALIGN_POWER_OF_TWO(out->d_h, 1);
+    const int reusable =
+        (w_even == cached_w) && (h_even == cached_h) && (img->fmt == out->fmt);
+    if (!reusable) {
+      aom_img_free(out);
+      out = NULL;
+      *grain_img_ptr = NULL;
+    }
+  }
+
+  if (out == NULL) {
+    out = aom_img_alloc(NULL, img->fmt, w_even, h_even, 16);
+    *grain_img_ptr = out;
+    if (out == NULL) return NULL;
+  }
+
+  out->user_priv = img->user_priv;
+  if (av1_add_film_grain(grain_params, img, out)) {
+    aom_img_free(out);
+    *grain_img_ptr = NULL;
+    return NULL;
+  }
+
+  return out;
+}
+/*
+ * Returns the next decoded image, or NULL when there is none. |iter| is
+ * treated as a pointer to an integer index that is passed to
+ * av1_get_raw_frame() and advanced by one for every image returned.
+ *
+ * NULL is returned when |iter| is NULL, when no frame workers exist yet,
+ * when the context still needs a resync, or when no raw frame is available.
+ * In large scale tile mode without ext_tile_debug the returned image points
+ * at the decoder's tile list output. Otherwise the image describes the
+ * decoded frame (optionally cropped to a single tile row/column when
+ * ext_tile_debug and single tile decoding are active) and film grain is
+ * applied through add_grain_if_needed() unless cm->skip_film_grain is set.
+ */
+static aom_image_t *decoder_get_frame(aom_codec_alg_priv_t *ctx,
+ aom_codec_iter_t *iter) {
+ aom_image_t *img = NULL;
+
+ if (!iter) {
+ return NULL;
+ }
+
+ // To avoid having to allocate any extra storage, treat 'iter' as
+ // simply a pointer to an integer index
+ uintptr_t *index = (uintptr_t *)iter;
+
+ if (ctx->frame_workers != NULL) {
+ do {
+ YV12_BUFFER_CONFIG *sd;
+ // NOTE(david.barker): This code does not support multiple worker threads
+ // yet. We should probably move the iteration over threads into *iter
+ // instead of using ctx->next_output_worker_id.
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+ FrameWorkerData *const frame_worker_data =
+ (FrameWorkerData *)worker->data1;
+ ctx->next_output_worker_id =
+ (ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ // Wait for the frame from worker thread.
+ if (winterface->sync(worker)) {
+ // Check if worker has received any frames.
+ if (frame_worker_data->received_frame == 1) {
+ ++ctx->available_threads;
+ frame_worker_data->received_frame = 0;
+ check_resync(ctx, frame_worker_data->pbi);
+ }
+ aom_film_grain_t *grain_params;
+ if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd,
+ &grain_params) == 0) {
+ AV1Decoder *const pbi = frame_worker_data->pbi;
+ AV1_COMMON *const cm = &pbi->common;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ ctx->last_show_frame = cm->new_fb_idx;
+ if (ctx->need_resync) return NULL;
+ yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv);
+
+ if (!pbi->ext_tile_debug && cm->large_scale_tile) {
+ *index += 1; // Advance the iterator to point to the next image
+ img = &ctx->img;
+ img->img_data = pbi->tile_list_output;
+ img->sz = pbi->tile_list_size;
+ return img;
+ }
+
+ const int num_planes = av1_num_planes(cm);
+ if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+ pbi->dec_tile_row >= 0) {
+ const int tile_row = AOMMIN(pbi->dec_tile_row, cm->tile_rows - 1);
+ const int mi_row = tile_row * cm->tile_height;
+ const int ssy = ctx->img.y_chroma_shift;
+ int plane;
+ ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; // move to the first row of the selected tile row
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane];
+ }
+ }
+ ctx->img.d_h =
+ AOMMIN(cm->tile_height, cm->mi_rows - mi_row) * MI_SIZE;
+ }
+
+ if (pbi->ext_tile_debug && cm->single_tile_decoding &&
+ pbi->dec_tile_col >= 0) {
+ const int tile_col = AOMMIN(pbi->dec_tile_col, cm->tile_cols - 1);
+ const int mi_col = tile_col * cm->tile_width;
+ const int ssx = ctx->img.x_chroma_shift;
+ const int is_hbd =
+ (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0;
+ int plane;
+ ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); // offset is scaled by (1 + is_hbd) for high bit depth formats
+ if (num_planes > 1) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ ctx->img.planes[plane] +=
+ mi_col * (MI_SIZE >> ssx) * (1 + is_hbd);
+ }
+ }
+ ctx->img.d_w =
+ AOMMIN(cm->tile_width, cm->mi_cols - mi_col) * MI_SIZE;
+ }
+
+ ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
+ img = &ctx->img;
+ img->temporal_id = cm->temporal_layer_id;
+ img->spatial_id = cm->spatial_layer_id;
+ if (cm->skip_film_grain) grain_params->apply_grain = 0;
+ aom_image_t *res = add_grain_if_needed(
+ img, &ctx->image_with_grain[*index], grain_params); // NOTE(review): *index is not range-checked here -- confirm av1_get_raw_frame() keeps it below MAX_NUM_SPATIAL_LAYERS
+ if (!res) {
+ aom_internal_error(&pbi->common.error, AOM_CODEC_CORRUPT_FRAME,
+ "Grain systhesis failed\n");
+ }
+ *index += 1; // Advance the iterator to point to the next image
+ return res;
+ }
+ } else {
+ // Decoding failed. Release the worker thread.
+ frame_worker_data->received_frame = 0;
+ ++ctx->available_threads;
+ ctx->need_resync = 1;
+ if (ctx->flushed != 1) return NULL;
+ }
+ } while (ctx->next_output_worker_id != ctx->next_submit_worker_id);
+ }
+ return NULL;
+}
+
+// Registers the application's frame buffer callbacks and their private data.
+// Returns AOM_CODEC_INVALID_PARAM when either callback is NULL,
+// AOM_CODEC_ERROR when the frame workers already exist, and AOM_CODEC_OK
+// once the callbacks have been stored.
+static aom_codec_err_t decoder_set_fb_fn(
+    aom_codec_alg_priv_t *ctx, aom_get_frame_buffer_cb_fn_t cb_get,
+    aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) {
+  if (cb_get == NULL || cb_release == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  // If the decoder has already been initialized, do not accept changes to
+  // the frame buffer functions.
+  if (ctx->frame_workers != NULL) return AOM_CODEC_ERROR;
+
+  ctx->ext_priv = cb_priv;
+  ctx->get_ext_fb_cb = cb_get;
+  ctx->release_ext_fb_cb = cb_release;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: installs the image carried by the av1_ref_frame_t
+// argument as a reference frame of the decoder (slot frame->idx).
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case dereferenced the
+// NULL worker array), and otherwise the status of av1_set_reference_dec().
+static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG sd;
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return av1_set_reference_dec(&frame_worker_data->pbi->common, frame->idx,
+                               frame->use_external_ref, &sd);
+}
+
+// Control handler: copies the decoder's reference frame frame->idx into the
+// image carried by the av1_ref_frame_t argument.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case dereferenced the
+// NULL worker array), and otherwise the status of av1_copy_reference_dec().
+static aom_codec_err_t ctrl_copy_reference(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  const av1_ref_frame_t *const frame = va_arg(args, av1_ref_frame_t *);
+
+  if (frame == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG sd;
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  image2yuvconfig(&frame->img, &sd);
+  return av1_copy_reference_dec(frame_worker_data->pbi, frame->idx, &sd);
+}
+
+// Control handler: describes the decoder's reference frame data->idx in
+// data->img. The image refers to the decoder's own buffer; no pixel data is
+// copied by this function.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case dereferenced the
+// NULL worker array) or the reference does not exist, and AOM_CODEC_OK
+// otherwise.
+static aom_codec_err_t ctrl_get_reference(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  av1_ref_frame_t *data = va_arg(args, av1_ref_frame_t *);
+
+  if (data == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  YV12_BUFFER_CONFIG *const fb =
+      get_ref_frame(&frame_worker_data->pbi->common, data->idx);
+  if (fb == NULL) return AOM_CODEC_ERROR;
+  yuvconfig2image(&data->img, fb, NULL);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: describes the decoder's current frame-to-show in the
+// aom_image_t supplied by the caller.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case dereferenced the
+// NULL worker array) or no frame is available, and AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  aom_image_t *new_img = va_arg(args, aom_image_t *);
+
+  if (new_img == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG new_frame;
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+  if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) != 0)
+    return AOM_CODEC_ERROR;
+
+  yuvconfig2image(new_img, &new_frame, NULL);
+  return AOM_CODEC_OK;
+}
+
+// Control handler: copies the decoder's current frame-to-show into the
+// aom_image_t supplied by the caller.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case dereferenced the
+// NULL worker array) or no frame is available, and otherwise the status of
+// av1_copy_new_frame_dec().
+static aom_codec_err_t ctrl_copy_new_frame_image(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  aom_image_t *img = va_arg(args, aom_image_t *);
+
+  if (img == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG new_frame;
+  AVxWorker *const worker = ctx->frame_workers;
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+  if (av1_get_frame_to_show(frame_worker_data->pbi, &new_frame) != 0)
+    return AOM_CODEC_ERROR;
+
+  YV12_BUFFER_CONFIG sd;
+  image2yuvconfig(img, &sd);
+  return av1_copy_new_frame_dec(&frame_worker_data->pbi->common, &new_frame,
+                                &sd);
+}
+
+// Control handler stub for the post-processing configuration. Both arguments
+// are ignored. Always returns AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_postproc(aom_codec_alg_priv_t *ctx,
+                                         va_list args) {
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+}
+
+// Control handler stub for the debug options. Both arguments are ignored.
+// Always returns AOM_CODEC_INCAPABLE.
+static aom_codec_err_t ctrl_set_dbg_options(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  (void)args;
+  (void)ctx;
+  return AOM_CODEC_INCAPABLE;
+}
+
+// Control handler: writes the decoder's refresh_frame_flags to the int
+// supplied by the caller.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet, and AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_last_ref_updates(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  int *const update_info = va_arg(args, int *);
+
+  if (update_info == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (FrameWorkerData *)ctx->frame_workers->data1;
+  *update_info = fwd->pbi->refresh_frame_flags;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: writes the decoder's base_qindex to the int supplied by
+// the caller.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet (previously this case indexed the NULL
+// worker array), and AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_last_quantizer(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  int *const arg = va_arg(args, int *);
+  if (arg == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+  *arg =
+      ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: reports the corrupted flag of the last shown frame through
+// the int supplied by the caller. The int is left untouched when no frame has
+// been shown yet (last_show_frame < 0), although AOM_CODEC_OK is still
+// returned.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument and AOM_CODEC_ERROR
+// when the frame workers do not exist yet or a frame header was seen without
+// any output frame.
+static aom_codec_err_t ctrl_get_frame_corrupted(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  int *corrupted = va_arg(args, int *);
+
+  if (corrupted == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers->data1;
+  AV1Decoder *const pbi = fwd->pbi;
+  RefCntBuffer *const frame_bufs = pbi->common.buffer_pool->frame_bufs;
+
+  if (pbi->seen_frame_header && pbi->num_output_frames == 0)
+    return AOM_CODEC_ERROR;
+
+  if (ctx->last_show_frame >= 0)
+    *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: writes the decoder's frame width and height to
+// frame_size[0] and frame_size[1] of the int array supplied by the caller.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet, and AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_frame_size(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  int *const frame_size = va_arg(args, int *);
+
+  if (frame_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  const FrameWorkerData *const fwd =
+      (FrameWorkerData *)ctx->frame_workers->data1;
+  const AV1_COMMON *const cm = &fwd->pbi->common;
+  frame_size[0] = cm->width;
+  frame_size[1] = cm->height;
+  return AOM_CODEC_OK;
+}
+
+// Control handler: fills the aom_tile_data supplied by the caller with the
+// decoder's obu_size_hdr size and data pointer and with the frame header
+// size (in extra_size).
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_ERROR when
+// the frame workers do not exist yet, and AOM_CODEC_OK on success. The
+// success path previously fell through and reported
+// AOM_CODEC_INVALID_PARAM even though the output had been filled in.
+static aom_codec_err_t ctrl_get_frame_header_info(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  aom_tile_data *const frame_header_info = va_arg(args, aom_tile_data *);
+
+  if (frame_header_info) {
+    if (ctx->frame_workers) {
+      AVxWorker *const worker = ctx->frame_workers;
+      FrameWorkerData *const frame_worker_data =
+          (FrameWorkerData *)worker->data1;
+      const AV1Decoder *pbi = frame_worker_data->pbi;
+      frame_header_info->coded_tile_data_size = pbi->obu_size_hdr.size;
+      frame_header_info->coded_tile_data = pbi->obu_size_hdr.data;
+      frame_header_info->extra_size = pbi->frame_header_size;
+      return AOM_CODEC_OK;
+    } else {
+      return AOM_CODEC_ERROR;
+    }
+  }
+
+  return AOM_CODEC_INVALID_PARAM;
+}
+
+// Getter for AV1D_GET_TILE_DATA: exposes the coded data pointer and size of
+// the tile buffer selected by the decoder's dec_tile_row / dec_tile_col.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL output pointer, AOM_CODEC_ERROR
+// when there is no frame worker, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_tile_data(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  aom_tile_data *const tile_data = va_arg(args, aom_tile_data *);
+
+  if (tile_data == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_workers) return AOM_CODEC_ERROR;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers[0].data1;
+  const AV1Decoder *decoder = fwd->pbi;
+  tile_data->coded_tile_data_size =
+      decoder->tile_buffers[decoder->dec_tile_row][decoder->dec_tile_col].size;
+  tile_data->coded_tile_data =
+      decoder->tile_buffers[decoder->dec_tile_row][decoder->dec_tile_col].data;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_EXT_REF_PTR: converts each caller-provided image into
+// a YV12 buffer description stored in ctx->ext_refs.
+//
+// The caller's img pointer is advanced by one element per converted image.
+// NOTE(review): num is taken from the caller without a visible upper bound
+// against the capacity of ctx->ext_refs.refs -- confirm the limit elsewhere.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL argument, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_set_ext_ref_ptr(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  av1_ext_ref_frame_t *const ext_frames = va_arg(args, av1_ext_ref_frame_t *);
+
+  if (ext_frames == NULL) return AOM_CODEC_INVALID_PARAM;
+
+  ctx->ext_refs.num = ext_frames->num;
+  int idx = 0;
+  while (idx < ctx->ext_refs.num) {
+    image2yuvconfig(ext_frames->img, &ctx->ext_refs.refs[idx]);
+    ext_frames->img++;
+    idx++;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Getter for AV1D_GET_DISPLAY_SIZE: writes the decoder's render width and
+// render height into render_size[0] and render_size[1] respectively.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL output pointer, AOM_CODEC_ERROR
+// when there is no frame worker, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_render_size(aom_codec_alg_priv_t *ctx,
+                                            va_list args) {
+  int *const render_size = va_arg(args, int *);
+
+  if (render_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  if (!ctx->frame_workers) return AOM_CODEC_ERROR;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers[0].data1;
+  const AV1_COMMON *const common = &fwd->pbi->common;
+  render_size[0] = common->render_width;
+  render_size[1] = common->render_height;
+  return AOM_CODEC_OK;
+}
+
+// Getter for AV1D_GET_BIT_DEPTH: writes the sequence bit depth of the decoder
+// attached to the worker at ctx->next_output_worker_id.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL output pointer, AOM_CODEC_ERROR
+// when there is no frame worker array, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_bit_depth(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const bit_depth = va_arg(args, unsigned int *);
+
+  if (bit_depth == NULL) return AOM_CODEC_INVALID_PARAM;
+  // Test the array itself: the address of an element is only NULL when the
+  // base is NULL and the index is zero, so checking the element pointer does
+  // not protect against a missing worker array.
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+  *bit_depth = cm->seq_params.bit_depth;
+  return AOM_CODEC_OK;
+}
+
+// Maps chroma subsampling flags to an aom image format:
+//   (x=0, y=0) -> I444, (x=1, y=0) -> I422, (x=1, y=1) -> I420.
+// Any other combination leaves the base format at 0. The high-bitdepth flag
+// is OR-ed in when use_highbitdepth is non-zero.
+static aom_img_fmt_t get_img_format(int subsampling_x, int subsampling_y,
+                                    int use_highbitdepth) {
+  aom_img_fmt_t fmt = 0;
+
+  if (subsampling_y == 0) {
+    if (subsampling_x == 0) {
+      fmt = AOM_IMG_FMT_I444;
+    } else if (subsampling_x == 1) {
+      fmt = AOM_IMG_FMT_I422;
+    }
+  } else if (subsampling_y == 1 && subsampling_x == 1) {
+    fmt = AOM_IMG_FMT_I420;
+  }
+
+  if (use_highbitdepth) {
+    fmt |= AOM_IMG_FMT_HIGHBITDEPTH;
+  }
+  return fmt;
+}
+
+// Getter for AV1D_GET_IMG_FORMAT: derives the image format from the sequence
+// parameters (chroma subsampling and high-bitdepth flag) of the decoder
+// attached to the worker at ctx->next_output_worker_id.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL output pointer, AOM_CODEC_ERROR
+// when there is no frame worker array, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_img_format(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  aom_img_fmt_t *const img_fmt = va_arg(args, aom_img_fmt_t *);
+
+  if (img_fmt == NULL) return AOM_CODEC_INVALID_PARAM;
+  // Test the array itself: the address of an element is only NULL when the
+  // base is NULL and the index is zero, so checking the element pointer does
+  // not protect against a missing worker array.
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+
+  *img_fmt = get_img_format(cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y,
+                            cm->seq_params.use_highbitdepth);
+  return AOM_CODEC_OK;
+}
+
+// Getter for AV1D_GET_TILE_SIZE: packs the tile dimensions into one value,
+// (tile_width * MI_SIZE) in the upper 16 bits and (tile_height * MI_SIZE) in
+// the lower part, using the decoder attached to the worker at
+// ctx->next_output_worker_id.
+//
+// Returns AOM_CODEC_INVALID_PARAM for a NULL output pointer, AOM_CODEC_ERROR
+// when there is no frame worker array, AOM_CODEC_OK otherwise.
+static aom_codec_err_t ctrl_get_tile_size(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  unsigned int *const tile_size = va_arg(args, unsigned int *);
+
+  if (tile_size == NULL) return AOM_CODEC_INVALID_PARAM;
+  // Test the array itself: the address of an element is only NULL when the
+  // base is NULL and the index is zero, so checking the element pointer does
+  // not protect against a missing worker array.
+  if (ctx->frame_workers == NULL) return AOM_CODEC_ERROR;
+
+  AVxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id];
+  FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+  const AV1_COMMON *const cm = &frame_worker_data->pbi->common;
+  *tile_size = ((cm->tile_width * MI_SIZE) << 16) + cm->tile_height * MI_SIZE;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_INVERT_TILE_DECODE_ORDER: records the int argument in
+// ctx->invert_tile_order. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_invert_tile_order(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  const int invert = va_arg(args, int);
+  ctx->invert_tile_order = invert;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_SET_BYTE_ALIGNMENT. Accepts 0 (the legacy value) or a power
+// of two in [32, 1024]; anything else yields AOM_CODEC_INVALID_PARAM. The
+// accepted value is stored in the context and, when a frame worker exists,
+// mirrored into that worker's decoder common state.
+static aom_codec_err_t ctrl_set_byte_alignment(aom_codec_alg_priv_t *ctx,
+                                               va_list args) {
+  const int legacy_byte_alignment = 0;
+  const int min_byte_alignment = 32;
+  const int max_byte_alignment = 1024;
+  const int byte_alignment = va_arg(args, int);
+
+  if (byte_alignment != legacy_byte_alignment) {
+    // Range is checked before the power-of-two test so the subtraction below
+    // is only evaluated for in-range values.
+    if (byte_alignment < min_byte_alignment ||
+        byte_alignment > max_byte_alignment) {
+      return AOM_CODEC_INVALID_PARAM;
+    }
+    if ((byte_alignment & (byte_alignment - 1)) != 0) {
+      return AOM_CODEC_INVALID_PARAM;
+    }
+  }
+
+  ctx->byte_alignment = byte_alignment;
+  if (ctx->frame_workers) {
+    FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers[0].data1;
+    fwd->pbi->common.byte_alignment = byte_alignment;
+  }
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_SET_SKIP_LOOP_FILTER: stores the int argument in the context
+// and, when a frame worker exists, copies it into that worker's decoder
+// common state. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_skip_loop_filter(aom_codec_alg_priv_t *ctx,
+                                                 va_list args) {
+  ctx->skip_loop_filter = va_arg(args, int);
+
+  if (!ctx->frame_workers) return AOM_CODEC_OK;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers[0].data1;
+  fwd->pbi->common.skip_loop_filter = ctx->skip_loop_filter;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_SKIP_FILM_GRAIN: stores the int argument in the context
+// and, when a frame worker exists, copies it into that worker's decoder
+// common state. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_skip_film_grain(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->skip_film_grain = va_arg(args, int);
+
+  if (!ctx->frame_workers) return AOM_CODEC_OK;
+
+  FrameWorkerData *const fwd = (FrameWorkerData *)ctx->frame_workers[0].data1;
+  fwd->pbi->common.skip_film_grain = ctx->skip_film_grain;
+  return AOM_CODEC_OK;
+}
+
+// Getter for AV1_GET_ACCOUNTING: hands out a pointer to the decoder's
+// accounting structure.
+//
+// Without CONFIG_ACCOUNTING the control is unsupported and returns
+// AOM_CODEC_INCAPABLE. Otherwise returns AOM_CODEC_ERROR when there is no
+// frame worker, AOM_CODEC_INVALID_PARAM for a NULL output pointer, and
+// AOM_CODEC_OK on success.
+static aom_codec_err_t ctrl_get_accounting(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+#if !CONFIG_ACCOUNTING
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  if (ctx->frame_workers) {
+    AVxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+    AV1Decoder *pbi = frame_worker_data->pbi;
+    Accounting **acct = va_arg(args, Accounting **);
+    // Reject a NULL destination instead of writing through it.
+    if (acct == NULL) return AOM_CODEC_INVALID_PARAM;
+    *acct = &pbi->accounting;
+    return AOM_CODEC_OK;
+  }
+  return AOM_CODEC_ERROR;
+#endif
+}
+// Setter for AV1_SET_DECODE_TILE_ROW: records the int argument in
+// ctx->decode_tile_row. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_decode_tile_row(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int tile_row = va_arg(args, int);
+  ctx->decode_tile_row = tile_row;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_SET_DECODE_TILE_COL: records the int argument in
+// ctx->decode_tile_col. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_decode_tile_col(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int tile_col = va_arg(args, int);
+  ctx->decode_tile_col = tile_col;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_SET_TILE_MODE: records the unsigned int argument in
+// ctx->tile_mode. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_tile_mode(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const unsigned int mode = va_arg(args, unsigned int);
+  ctx->tile_mode = mode;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_IS_ANNEXB: records the unsigned int argument in
+// ctx->is_annexb. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_is_annexb(aom_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const unsigned int annexb = va_arg(args, unsigned int);
+  ctx->is_annexb = annexb;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_OPERATING_POINT: records the int argument in
+// ctx->operating_point. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_operating_point(aom_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  const int op_point = va_arg(args, int);
+  ctx->operating_point = op_point;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_OUTPUT_ALL_LAYERS: records the int argument in
+// ctx->output_all_layers. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_output_all_layers(aom_codec_alg_priv_t *ctx,
+                                                  va_list args) {
+  const int all_layers = va_arg(args, int);
+  ctx->output_all_layers = all_layers;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1_SET_INSPECTION_CALLBACK: stores the inspection callback and
+// its user context taken from the caller's aom_inspect_init.
+//
+// Without CONFIG_INSPECTION the control is unsupported and returns
+// AOM_CODEC_INCAPABLE. Otherwise returns AOM_CODEC_INVALID_PARAM for a NULL
+// argument and AOM_CODEC_OK on success.
+static aom_codec_err_t ctrl_set_inspection_callback(aom_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+#if !CONFIG_INSPECTION
+  (void)ctx;
+  (void)args;
+  return AOM_CODEC_INCAPABLE;
+#else
+  aom_inspect_init *init = va_arg(args, aom_inspect_init *);
+  // Reject a NULL descriptor instead of reading through it.
+  if (init == NULL) return AOM_CODEC_INVALID_PARAM;
+  ctx->inspect_cb = init->inspect_cb;
+  ctx->inspect_ctx = init->inspect_ctx;
+  return AOM_CODEC_OK;
+#endif
+}
+
+// Setter for AV1D_EXT_TILE_DEBUG: records the int argument in
+// ctx->ext_tile_debug. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_ext_tile_debug(aom_codec_alg_priv_t *ctx,
+                                           va_list args) {
+  const int debug_flag = va_arg(args, int);
+  ctx->ext_tile_debug = debug_flag;
+  return AOM_CODEC_OK;
+}
+
+// Setter for AV1D_SET_ROW_MT: records the unsigned int argument in
+// ctx->row_mt. Always returns AOM_CODEC_OK.
+static aom_codec_err_t ctrl_set_row_mt(aom_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  const unsigned int row_mt = va_arg(args, unsigned int);
+  ctx->row_mt = row_mt;
+  return AOM_CODEC_OK;
+}
+
+// Dispatch table pairing each decoder control ID with the handler that
+// services it. The four AOM_SET_DBG_* IDs all share ctrl_set_dbg_options.
+static aom_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
+ { AV1_COPY_REFERENCE, ctrl_copy_reference },
+
+ // Setters
+ { AV1_SET_REFERENCE, ctrl_set_reference },
+ { AOM_SET_POSTPROC, ctrl_set_postproc },
+ { AOM_SET_DBG_COLOR_REF_FRAME, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_MB_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_COLOR_B_MODES, ctrl_set_dbg_options },
+ { AOM_SET_DBG_DISPLAY_MV, ctrl_set_dbg_options },
+ { AV1_INVERT_TILE_DECODE_ORDER, ctrl_set_invert_tile_order },
+ { AV1_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment },
+ { AV1_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter },
+ { AV1_SET_DECODE_TILE_ROW, ctrl_set_decode_tile_row },
+ { AV1_SET_DECODE_TILE_COL, ctrl_set_decode_tile_col },
+ { AV1_SET_TILE_MODE, ctrl_set_tile_mode },
+ { AV1D_SET_IS_ANNEXB, ctrl_set_is_annexb },
+ { AV1D_SET_OPERATING_POINT, ctrl_set_operating_point },
+ { AV1D_SET_OUTPUT_ALL_LAYERS, ctrl_set_output_all_layers },
+ { AV1_SET_INSPECTION_CALLBACK, ctrl_set_inspection_callback },
+ { AV1D_EXT_TILE_DEBUG, ctrl_ext_tile_debug },
+ { AV1D_SET_ROW_MT, ctrl_set_row_mt },
+ { AV1D_SET_EXT_REF_PTR, ctrl_set_ext_ref_ptr },
+ { AV1D_SET_SKIP_FILM_GRAIN, ctrl_set_skip_film_grain },
+
+ // Getters
+ { AOMD_GET_FRAME_CORRUPTED, ctrl_get_frame_corrupted },
+ { AOMD_GET_LAST_QUANTIZER, ctrl_get_last_quantizer },
+ { AOMD_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates },
+ { AV1D_GET_BIT_DEPTH, ctrl_get_bit_depth },
+ { AV1D_GET_IMG_FORMAT, ctrl_get_img_format },
+ { AV1D_GET_TILE_SIZE, ctrl_get_tile_size },
+ // The display-size control is served by the render-size handler.
+ { AV1D_GET_DISPLAY_SIZE, ctrl_get_render_size },
+ { AV1D_GET_FRAME_SIZE, ctrl_get_frame_size },
+ { AV1_GET_ACCOUNTING, ctrl_get_accounting },
+ { AV1_GET_NEW_FRAME_IMAGE, ctrl_get_new_frame_image },
+ { AV1_COPY_NEW_FRAME_IMAGE, ctrl_copy_new_frame_image },
+ { AV1_GET_REFERENCE, ctrl_get_reference },
+ { AV1D_GET_FRAME_HEADER_INFO, ctrl_get_frame_header_info },
+ { AV1D_GET_TILE_DATA, ctrl_get_tile_data },
+
+ // Presumably the end-of-table sentinel (no handler) -- confirm against the
+ // code that walks this table.
+ { -1, NULL },
+};
+
+// Fall back to an empty version suffix when the build does not define one,
+// so the name string below still concatenates with it.
+#ifndef VERSION_STRING
+#define VERSION_STRING
+#endif
+// Interface descriptor for the AV1 decoder: display name, ABI version,
+// capability flags, init/destroy hooks, the control dispatch table, the
+// decoder entry points, and an encoder section left unpopulated (0 / NULL).
+CODEC_INTERFACE(aom_codec_av1_dx) = {
+ "AOMedia Project AV1 Decoder" VERSION_STRING,
+ AOM_CODEC_INTERNAL_ABI_VERSION,
+ AOM_CODEC_CAP_DECODER |
+ AOM_CODEC_CAP_EXTERNAL_FRAME_BUFFER, // aom_codec_caps_t
+ decoder_init, // aom_codec_init_fn_t
+ decoder_destroy, // aom_codec_destroy_fn_t
+ decoder_ctrl_maps, // aom_codec_ctrl_fn_map_t
+ {
+ // NOLINT
+ decoder_peek_si, // aom_codec_peek_si_fn_t
+ decoder_get_si, // aom_codec_get_si_fn_t
+ decoder_decode, // aom_codec_decode_fn_t
+ decoder_get_frame, // aom_codec_get_frame_fn_t
+ decoder_set_fb_fn, // aom_codec_set_fb_fn_t
+ },
+ {
+ // NOLINT
+ // Encoder-side members: none are provided by this decoder interface.
+ 0,
+ NULL, // aom_codec_enc_cfg_map_t
+ NULL, // aom_codec_encode_fn_t
+ NULL, // aom_codec_get_cx_data_fn_t
+ NULL, // aom_codec_enc_config_set_fn_t
+ NULL, // aom_codec_get_global_headers_fn_t
+ NULL, // aom_codec_get_preview_frame_fn_t
+ NULL // aom_codec_enc_mr_get_mem_loc_fn_t
+ }
+};
diff --git a/third_party/aom/av1/av1_iface_common.h b/third_party/aom/av1/av1_iface_common.h
new file mode 100644
index 0000000000..4a7af580b8
--- /dev/null
+++ b/third_party/aom/av1/av1_iface_common.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_AV1_IFACE_COMMON_H_
+#define AOM_AV1_AV1_IFACE_COMMON_H_
+
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+// Fills |img| so that it describes the frame held in |yv12| without copying
+// pixel data. |user_priv| is stored verbatim in the image. The image is
+// marked as not owning its data (img_data_owner and self_allocd are 0).
+static void yuvconfig2image(aom_image_t *img, const YV12_BUFFER_CONFIG *yv12,
+                            void *user_priv) {
+  /* Every field is written by hand because aom_img_wrap() cannot express
+   * independent Y/U/V strides or the other alignment adjustments that a
+   * YV12_BUFFER_CONFIG may carry.
+   */
+  const int is_hbd = (yv12->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  aom_img_fmt_t base_fmt;
+  int bps;
+
+  // Vertical subsampling alone selects I420; otherwise horizontal
+  // subsampling distinguishes I422 from I444.
+  if (yv12->subsampling_y) {
+    base_fmt = AOM_IMG_FMT_I420;
+    bps = 12;
+  } else if (yv12->subsampling_x) {
+    base_fmt = AOM_IMG_FMT_I422;
+    bps = 16;
+  } else {
+    base_fmt = AOM_IMG_FMT_I444;
+    bps = 24;
+  }
+
+  // Colour description.
+  img->cp = yv12->color_primaries;
+  img->tc = yv12->transfer_characteristics;
+  img->mc = yv12->matrix_coefficients;
+  img->monochrome = yv12->monochrome;
+  img->csp = yv12->chroma_sample_position;
+  img->range = yv12->color_range;
+
+  // Geometry.
+  img->w = yv12->y_width;
+  img->h = yv12->y_height;
+  img->d_w = yv12->y_crop_width;
+  img->d_h = yv12->y_crop_height;
+  img->r_w = yv12->render_width;
+  img->r_h = yv12->render_height;
+  img->x_chroma_shift = yv12->subsampling_x;
+  img->y_chroma_shift = yv12->subsampling_y;
+
+  // Planes and strides.
+  img->planes[AOM_PLANE_ALPHA] = NULL;
+  if (is_hbd) {
+    // aom_image_t uses byte strides and a pointer to the first byte
+    // of the image.
+    img->fmt = (aom_img_fmt_t)(base_fmt | AOM_IMG_FMT_HIGHBITDEPTH);
+    img->bit_depth = yv12->bit_depth;
+    img->planes[AOM_PLANE_Y] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->y_buffer);
+    img->planes[AOM_PLANE_U] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->u_buffer);
+    img->planes[AOM_PLANE_V] = (uint8_t *)CONVERT_TO_SHORTPTR(yv12->v_buffer);
+    img->stride[AOM_PLANE_Y] = 2 * yv12->y_stride;
+    img->stride[AOM_PLANE_U] = 2 * yv12->uv_stride;
+    img->stride[AOM_PLANE_V] = 2 * yv12->uv_stride;
+    img->stride[AOM_PLANE_ALPHA] = 2 * yv12->y_stride;
+  } else {
+    img->fmt = base_fmt;
+    img->bit_depth = 8;
+    img->planes[AOM_PLANE_Y] = yv12->y_buffer;
+    img->planes[AOM_PLANE_U] = yv12->u_buffer;
+    img->planes[AOM_PLANE_V] = yv12->v_buffer;
+    img->stride[AOM_PLANE_Y] = yv12->y_stride;
+    img->stride[AOM_PLANE_U] = yv12->uv_stride;
+    img->stride[AOM_PLANE_V] = yv12->uv_stride;
+    img->stride[AOM_PLANE_ALPHA] = yv12->y_stride;
+  }
+
+  img->bps = bps;
+  img->user_priv = user_priv;
+  img->img_data = yv12->buffer_alloc;
+  img->img_data_owner = 0;
+  img->self_allocd = 0;
+}
+
+// Fills |yv12| so that it describes the planes of |img| without copying
+// pixel data. Chroma dimensions are halved (rounding up) along each axis
+// whose chroma shift equals 1. Always returns AOM_CODEC_OK.
+static aom_codec_err_t image2yuvconfig(const aom_image_t *img,
+                                       YV12_BUFFER_CONFIG *yv12) {
+  // Luma geometry.
+  yv12->y_width = img->w;
+  yv12->y_height = img->h;
+  yv12->y_crop_width = img->d_w;
+  yv12->y_crop_height = img->d_h;
+  yv12->render_width = img->r_w;
+  yv12->render_height = img->r_h;
+
+  // Chroma geometry, derived from the luma size just stored.
+  if (img->x_chroma_shift == 1) {
+    yv12->uv_width = (1 + yv12->y_width) / 2;
+  } else {
+    yv12->uv_width = yv12->y_width;
+  }
+  if (img->y_chroma_shift == 1) {
+    yv12->uv_height = (1 + yv12->y_height) / 2;
+  } else {
+    yv12->uv_height = yv12->y_height;
+  }
+  yv12->uv_crop_width = yv12->uv_width;
+  yv12->uv_crop_height = yv12->uv_height;
+
+  // Colour description.
+  yv12->color_primaries = img->cp;
+  yv12->transfer_characteristics = img->tc;
+  yv12->matrix_coefficients = img->mc;
+  yv12->monochrome = img->monochrome;
+  yv12->chroma_sample_position = img->csp;
+  yv12->color_range = img->range;
+
+  yv12->y_stride = img->stride[AOM_PLANE_Y];
+  yv12->uv_stride = img->stride[AOM_PLANE_U];
+
+  if (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) {
+    // In aom_image_t
+    //     planes point to the uint8 address of the start of the data
+    //     stride counts uint8s to reach the next row
+    // In YV12_BUFFER_CONFIG
+    //     y_buffer, u_buffer, v_buffer point to the uint16 address of data
+    //     stride and border count in uint16s
+    // This means that all the address calculations in the main body of code
+    // should work correctly.
+    // However, before we do any pixel operations we need to cast the address
+    // to a uint16 pointer and double its value.
+    yv12->y_buffer = CONVERT_TO_BYTEPTR(img->planes[AOM_PLANE_Y]);
+    yv12->u_buffer = CONVERT_TO_BYTEPTR(img->planes[AOM_PLANE_U]);
+    yv12->v_buffer = CONVERT_TO_BYTEPTR(img->planes[AOM_PLANE_V]);
+    yv12->y_stride >>= 1;
+    yv12->uv_stride >>= 1;
+    yv12->flags = YV12_FLAG_HIGHBITDEPTH;
+  } else {
+    yv12->y_buffer = img->planes[AOM_PLANE_Y];
+    yv12->u_buffer = img->planes[AOM_PLANE_U];
+    yv12->v_buffer = img->planes[AOM_PLANE_V];
+    yv12->flags = 0;
+  }
+
+  // Computed after any high-bitdepth stride adjustment above.
+  yv12->border = (yv12->y_stride - img->w) / 2;
+  yv12->subsampling_x = img->x_chroma_shift;
+  yv12->subsampling_y = img->y_chroma_shift;
+  return AOM_CODEC_OK;
+}
+
+#endif // AOM_AV1_AV1_IFACE_COMMON_H_
diff --git a/third_party/aom/av1/common/alloccommon.c b/third_party/aom/av1/common/alloccommon.c
new file mode 100644
index 0000000000..1bf81c91d4
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.c
@@ -0,0 +1,300 @@
+/*
+ *
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/onyxc_int.h"
+
+// Returns the mb_rows * mb_cols product for a frame of the given size, using
+// the same rounding as av1_set_mb_mi(): dimensions are aligned up to a
+// multiple of 8, converted to mode-info units, then to (mi + 2) >> 2.
+int av1_get_MBs(int width, int height) {
+  const int mi_cols = (ALIGN_POWER_OF_TWO(width, 3)) >> MI_SIZE_LOG2;
+  const int mi_rows = (ALIGN_POWER_OF_TWO(height, 3)) >> MI_SIZE_LOG2;
+
+  const int cols_in_mb = (mi_cols + 2) >> 2;
+  const int rows_in_mb = (mi_rows + 2) >> 2;
+  return rows_in_mb * cols_in_mb;
+}
+
+#if LOOP_FILTER_BITMASK
+// (Re)allocates the loop filter bitmask array for the current mi_cols/mi_rows.
+// Any previous array is released first. Returns 0 on success and 1 when the
+// allocation fails, in which case lfm is NULL and lfm_num / lfm_stride are 0.
+static int alloc_loop_filter_mask(AV1_COMMON *cm) {
+  aom_free(cm->lf.lfm);
+  cm->lf.lfm = NULL;
+
+  // Each lfm holds bit masks for all the 4x4 blocks in a max
+  // 64x64 (128x128 for ext_partitions) region. The stride
+  // and rows are rounded up / truncated to a multiple of 16
+  // (32 for ext_partition).
+  cm->lf.lfm_stride = (cm->mi_cols + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2;
+  cm->lf.lfm_num = ((cm->mi_rows + (MI_SIZE_64X64 - 1)) >> MIN_MIB_SIZE_LOG2) *
+                   cm->lf.lfm_stride;
+  // aom_calloc() returns zero-filled memory, so no separate clearing pass
+  // over the entries is needed.
+  cm->lf.lfm =
+      (LoopFilterMask *)aom_calloc(cm->lf.lfm_num, sizeof(*cm->lf.lfm));
+  if (!cm->lf.lfm) {
+    // Keep the bookkeeping consistent with the missing array so nothing
+    // indexes lfm using a stale element count.
+    cm->lf.lfm_num = 0;
+    cm->lf.lfm_stride = 0;
+    return 1;
+  }
+
+  return 0;
+}
+
+// Releases the loop filter bitmask array, if any, and resets its count and
+// stride. Does nothing when no array is allocated.
+static void free_loop_filter_mask(AV1_COMMON *cm) {
+  if (cm->lf.lfm != NULL) {
+    aom_free(cm->lf.lfm);
+    cm->lf.lfm = NULL;
+    cm->lf.lfm_num = 0;
+    cm->lf.lfm_stride = 0;
+  }
+}
+#endif
+
+// Derives the mode-info and macroblock dimensions of |cm| from a frame size
+// and, when LOOP_FILTER_BITMASK is enabled, reallocates the loop filter mask
+// array to match (the allocation result is not checked here).
+void av1_set_mb_mi(AV1_COMMON *cm, int width, int height) {
+  // The decoded width and height are rounded up to multiples of 8 luma
+  // pixels (which may be only a multiple of 4 chroma pixels when subsampling
+  // is used). This simplifies the implementation of various experiments,
+  // eg. cdef, which operates on units of 8x8 luma pixels.
+  cm->mi_cols = (ALIGN_POWER_OF_TWO(width, 3)) >> MI_SIZE_LOG2;
+  cm->mi_rows = (ALIGN_POWER_OF_TWO(height, 3)) >> MI_SIZE_LOG2;
+  cm->mi_stride = calc_mi_size(cm->mi_cols);
+
+  cm->mb_cols = (cm->mi_cols + 2) >> 2;
+  cm->mb_rows = (cm->mi_rows + 2) >> 2;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+
+#if LOOP_FILTER_BITMASK
+  alloc_loop_filter_mask(cm);
+#endif
+}
+
+// Releases everything held by each of the pool's FRAME_BUFFERS entries:
+// a still-referenced raw frame buffer is handed back through the pool's
+// release callback (and its ref count cleared), then the motion vector
+// array, the segmentation map and the frame buffer itself are freed.
+void av1_free_ref_frame_buffers(BufferPool *pool) {
+  for (int idx = 0; idx < FRAME_BUFFERS; ++idx) {
+    RefCntBuffer *const entry = &pool->frame_bufs[idx];
+
+    if (entry->ref_count > 0 && entry->raw_frame_buffer.data != NULL) {
+      pool->release_fb_cb(pool->cb_priv, &entry->raw_frame_buffer);
+      entry->ref_count = 0;
+    }
+
+    aom_free(entry->mvs);
+    entry->mvs = NULL;
+
+    aom_free(entry->seg_map);
+    entry->seg_map = NULL;
+
+    aom_free_frame_buffer(&entry->buf);
+  }
+}
+
+// Allocates (or re-uses) the buffers loop restoration needs for the current
+// frame geometry: the per-plane restoration structs, the shared temporary
+// buffer, the line buffers, and the per-plane stripe boundary buffers.
+//
+// Assumes cm->rst_info[p].restoration_unit_size is already initialized.
+// Allocation failures are reported through CHECK_MEM_ERROR.
+void av1_alloc_restoration_buffers(AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  for (int p = 0; p < num_planes; ++p)
+    av1_alloc_restoration_struct(cm, &cm->rst_info[p], p > 0);
+
+  if (cm->rst_tmpbuf == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rst_tmpbuf,
+                    (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+  }
+
+  if (cm->rlbs == NULL) {
+    CHECK_MEM_ERROR(cm, cm->rlbs, aom_malloc(sizeof(RestorationLineBuffers)));
+  }
+
+  // For striped loop restoration, we divide each row of tiles into "stripes",
+  // of height 64 luma pixels but with an offset by RESTORATION_UNIT_OFFSET
+  // luma pixels to match the output from CDEF. We will need to store 2 *
+  // RESTORATION_CTX_VERT lines of data for each stripe, and also need to be
+  // able to quickly answer the question "Where is the <n>'th stripe for tile
+  // row <m>?" To make that efficient, we generate the rst_last_stripe array.
+  int num_stripes = 0;
+  for (int i = 0; i < cm->tile_rows; ++i) {
+    TileInfo tile_info;
+    av1_tile_set_row(&tile_info, cm, i);
+    const int mi_h = tile_info.mi_row_end - tile_info.mi_row_start;
+    const int ext_h = RESTORATION_UNIT_OFFSET + (mi_h << MI_SIZE_LOG2);
+    const int tile_stripes = (ext_h + 63) / 64;
+    num_stripes += tile_stripes;
+    // Running total: one past the last stripe index of tile row i.
+    cm->rst_end_stripe[i] = num_stripes;
+  }
+
+  // Now we need to allocate enough space to store the line buffers for the
+  // stripes
+  const int frame_w = cm->superres_upscaled_width;
+  const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const int is_uv = p > 0;
+    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    const int plane_w = ((frame_w + ss_x) >> ss_x) + 2 * RESTORATION_EXTRA_HORZ;
+    const int stride = ALIGN_POWER_OF_TWO(plane_w, 5);
+    // The whole product is shifted: the size doubles for high bitdepth.
+    const int buf_size = num_stripes * stride * RESTORATION_CTX_VERT
+                         << use_highbd;
+    RestorationStripeBoundaries *boundaries = &cm->rst_info[p].boundaries;
+
+    if (buf_size != boundaries->stripe_boundary_size ||
+        boundaries->stripe_boundary_above == NULL ||
+        boundaries->stripe_boundary_below == NULL) {
+      aom_free(boundaries->stripe_boundary_above);
+      aom_free(boundaries->stripe_boundary_below);
+      // Drop the stale pointers and size before reallocating. If
+      // CHECK_MEM_ERROR leaves this function on failure (NOTE(review): its
+      // definition is not visible here -- confirm), the struct must not keep
+      // pointers to freed memory that av1_free_restoration_buffers() would
+      // free a second time.
+      boundaries->stripe_boundary_above = NULL;
+      boundaries->stripe_boundary_below = NULL;
+      boundaries->stripe_boundary_size = 0;
+
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_above,
+                      (uint8_t *)aom_memalign(32, buf_size));
+      CHECK_MEM_ERROR(cm, boundaries->stripe_boundary_below,
+                      (uint8_t *)aom_memalign(32, buf_size));
+
+      boundaries->stripe_boundary_size = buf_size;
+    }
+    boundaries->stripe_boundary_stride = stride;
+  }
+}
+
+// Releases all loop restoration storage owned by |cm|: the per-plane
+// restoration structs, the temporary buffer, the line buffers, the stripe
+// boundary buffers of every plane, and the restoration frame buffer.
+void av1_free_restoration_buffers(AV1_COMMON *cm) {
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    av1_free_restoration_struct(&cm->rst_info[plane]);
+  }
+
+  aom_free(cm->rst_tmpbuf);
+  cm->rst_tmpbuf = NULL;
+
+  aom_free(cm->rlbs);
+  cm->rlbs = NULL;
+
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    RestorationStripeBoundaries *const edges = &cm->rst_info[plane].boundaries;
+    aom_free(edges->stripe_boundary_above);
+    aom_free(edges->stripe_boundary_below);
+    edges->stripe_boundary_above = NULL;
+    edges->stripe_boundary_below = NULL;
+  }
+
+  aom_free_frame_buffer(&cm->rst_frame);
+}
+
+// Frees the above-context storage of |cm|: for each of the first
+// |num_free_above_contexts| tile rows the per-plane entropy contexts, the
+// segmentation context and the transform context; then the per-row pointer
+// arrays themselves. The allocation bookkeeping is reset to zero.
+//
+// Safe to call after a partially failed av1_alloc_above_context_buffers():
+// pointer arrays that were never allocated are skipped rather than indexed.
+void av1_free_above_context_buffers(AV1_COMMON *cm,
+                                    int num_free_above_contexts) {
+  int i;
+  const int num_planes = cm->num_allocated_above_context_planes;
+
+  for (int tile_row = 0; tile_row < num_free_above_contexts; tile_row++) {
+    for (i = 0; i < num_planes; i++) {
+      // The outer array may be missing if its allocation failed.
+      if (cm->above_context[i] == NULL) continue;
+      aom_free(cm->above_context[i][tile_row]);
+      cm->above_context[i][tile_row] = NULL;
+    }
+    if (cm->above_seg_context != NULL) {
+      aom_free(cm->above_seg_context[tile_row]);
+      cm->above_seg_context[tile_row] = NULL;
+    }
+
+    if (cm->above_txfm_context != NULL) {
+      aom_free(cm->above_txfm_context[tile_row]);
+      cm->above_txfm_context[tile_row] = NULL;
+    }
+  }
+  for (i = 0; i < num_planes; i++) {
+    aom_free(cm->above_context[i]);
+    cm->above_context[i] = NULL;
+  }
+  aom_free(cm->above_seg_context);
+  cm->above_seg_context = NULL;
+
+  aom_free(cm->above_txfm_context);
+  cm->above_txfm_context = NULL;
+
+  cm->num_allocated_above_contexts = 0;
+  cm->num_allocated_above_context_mi_col = 0;
+  cm->num_allocated_above_context_planes = 0;
+}
+
+// Frees the context storage owned by |cm|: the mode-info arrays (through the
+// cm->free_mi callback), all allocated above-context buffers, and, when
+// LOOP_FILTER_BITMASK is enabled, the loop filter mask array.
+void av1_free_context_buffers(AV1_COMMON *cm) {
+ cm->free_mi(cm);
+
+ // Free as many tile rows as were recorded at allocation time.
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+
+#if LOOP_FILTER_BITMASK
+ free_loop_filter_mask(cm);
+#endif
+}
+
+// Allocates zero-initialized above-context storage for
+// |num_alloc_above_contexts| tile rows: per plane an array of entropy
+// context rows, plus segmentation and transform context rows. Each row holds
+// mi_cols rounded up to a multiple of the maximum superblock width in mi
+// units.
+//
+// Returns 0 on success and 1 as soon as any allocation fails. On failure the
+// buffers allocated so far are left in place; the bookkeeping fields are
+// already set, so av1_free_above_context_buffers() can release them.
+// NOTE(review): existing pointers are overwritten without being freed --
+// presumably callers free any previous allocation first; confirm.
+int av1_alloc_above_context_buffers(AV1_COMMON *cm,
+                                    int num_alloc_above_contexts) {
+  const int num_planes = av1_num_planes(cm);
+  int plane_idx;
+  const int aligned_mi_cols =
+      ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+
+  // Allocate above context buffers
+  cm->num_allocated_above_contexts = num_alloc_above_contexts;
+  cm->num_allocated_above_context_mi_col = aligned_mi_cols;
+  cm->num_allocated_above_context_planes = num_planes;
+  // Element sizes below are taken from the dereferenced destination so each
+  // array is sized by what it actually stores (one row pointer per tile row).
+  for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+    cm->above_context[plane_idx] = (ENTROPY_CONTEXT **)aom_calloc(
+        num_alloc_above_contexts, sizeof(*cm->above_context[plane_idx]));
+    if (!cm->above_context[plane_idx]) return 1;
+  }
+
+  cm->above_seg_context = (PARTITION_CONTEXT **)aom_calloc(
+      num_alloc_above_contexts, sizeof(*cm->above_seg_context));
+  if (!cm->above_seg_context) return 1;
+
+  cm->above_txfm_context = (TXFM_CONTEXT **)aom_calloc(
+      num_alloc_above_contexts, sizeof(*cm->above_txfm_context));
+  if (!cm->above_txfm_context) return 1;
+
+  for (int tile_row = 0; tile_row < num_alloc_above_contexts; tile_row++) {
+    for (plane_idx = 0; plane_idx < num_planes; plane_idx++) {
+      cm->above_context[plane_idx][tile_row] = (ENTROPY_CONTEXT *)aom_calloc(
+          aligned_mi_cols, sizeof(*cm->above_context[plane_idx][tile_row]));
+      if (!cm->above_context[plane_idx][tile_row]) return 1;
+    }
+
+    cm->above_seg_context[tile_row] = (PARTITION_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*cm->above_seg_context[tile_row]));
+    if (!cm->above_seg_context[tile_row]) return 1;
+
+    cm->above_txfm_context[tile_row] = (TXFM_CONTEXT *)aom_calloc(
+        aligned_mi_cols, sizeof(*cm->above_txfm_context[tile_row]));
+    if (!cm->above_txfm_context[tile_row]) return 1;
+  }
+
+  return 0;
+}
+
+// Sizes the mode-info dimensions of |cm| for a width x height frame and grows
+// the mode-info storage (through the cm->free_mi / cm->alloc_mi callbacks)
+// when the current allocation is too small.
+//
+// Returns 0 on success. On allocation failure the dimensions are reset to
+// zero, all context buffers are freed, and 1 is returned.
+int av1_alloc_context_buffers(AV1_COMMON *cm, int width, int height) {
+  av1_set_mb_mi(cm, width, height);
+
+  const int new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size >= new_mi_size) return 0;
+
+  cm->free_mi(cm);
+  if (!cm->alloc_mi(cm, new_mi_size)) return 0;
+
+  // clear the mi_* values to force a realloc on resync
+  av1_set_mb_mi(cm, 0, 0);
+  av1_free_context_buffers(cm);
+  return 1;
+}
+
+// Tears down the shared state in |cm|: frees the context buffers, then the
+// fc and frame_contexts allocations, clearing both pointers afterwards.
+void av1_remove_common(AV1_COMMON *cm) {
+ av1_free_context_buffers(cm);
+
+ aom_free(cm->fc);
+ cm->fc = NULL;
+ aom_free(cm->frame_contexts);
+ cm->frame_contexts = NULL;
+}
+
+// Initializes the mode-info storage by delegating to the cm->setup_mi
+// callback installed on |cm|.
+void av1_init_context_buffers(AV1_COMMON *cm) {
+  cm->setup_mi(cm);
+}
diff --git a/third_party/aom/av1/common/alloccommon.h b/third_party/aom/av1/common/alloccommon.h
new file mode 100644
index 0000000000..8e58969814
--- /dev/null
+++ b/third_party/aom/av1/common/alloccommon.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ALLOCCOMMON_H_
+#define AOM_AV1_COMMON_ALLOCCOMMON_H_
+
+#define INVALID_IDX -1 // Invalid buffer index.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declarations: the functions below only need pointers to these.
+struct AV1Common;
+struct BufferPool;
+
+// Frees the context buffers plus the fc and frame_contexts allocations.
+void av1_remove_common(struct AV1Common *cm);
+
+// Allocates above-context rows for the given number of tile rows.
+// Returns 0 on success, 1 on allocation failure.
+int av1_alloc_above_context_buffers(struct AV1Common *cm,
+ int num_alloc_above_contexts);
+// Frees the above-context rows for the given number of tile rows and resets
+// the allocation bookkeeping.
+void av1_free_above_context_buffers(struct AV1Common *cm,
+ int num_free_above_contexts);
+// Sets the mode-info dimensions for the frame size and grows the mode-info
+// storage if needed. Returns 0 on success, 1 on allocation failure.
+int av1_alloc_context_buffers(struct AV1Common *cm, int width, int height);
+// Runs the setup_mi callback installed on cm.
+void av1_init_context_buffers(struct AV1Common *cm);
+// Frees the mode-info storage and the above-context buffers.
+void av1_free_context_buffers(struct AV1Common *cm);
+
+// Releases every entry of the pool's frame buffer array.
+void av1_free_ref_frame_buffers(struct BufferPool *pool);
+// Allocates / frees the loop restoration buffers.
+void av1_alloc_restoration_buffers(struct AV1Common *cm);
+void av1_free_restoration_buffers(struct AV1Common *cm);
+
+// NOTE(review): no definition of these two functions appears in
+// alloccommon.c as added by this patch -- confirm they are defined elsewhere
+// or that the declarations are stale.
+int av1_alloc_state_buffers(struct AV1Common *cm, int width, int height);
+void av1_free_state_buffers(struct AV1Common *cm);
+
+// Computes mi_cols/mi_rows/mi_stride and mb_cols/mb_rows/MBs from a frame
+// size (dimensions are first rounded up to a multiple of 8).
+void av1_set_mb_mi(struct AV1Common *cm, int width, int height);
+// Returns mb_rows * mb_cols for a frame size, with the same rounding.
+int av1_get_MBs(int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ALLOCCOMMON_H_
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
new file mode 100644
index 0000000000..bad411743d
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.c
@@ -0,0 +1,3231 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+#include "av1/common/arm/av1_inv_txfm_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// 1D inverse-transform kernel selectors. The values are used as the column
+// index of lowbd_txfm_all_1d_arr in this file.
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ // Deliberately shares IADST_1D's value, so both names select the same
+ // kernel slot and the enumerators that follow keep counting from there.
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ // Number of distinct kernel slots (3, because of the alias above).
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+
+// Maps each of the TX_TYPES 2D transform types to a 1D kernel selector.
+// NOTE(review): the "v" prefix presumably means the vertical (column) pass,
+// and the entry order must match the TX_TYPE enum, which is declared
+// elsewhere -- confirm before reordering.
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+
+// Companion of vitx_1d_tab: maps each of the TX_TYPES 2D transform types to
+// a 1D kernel selector. NOTE(review): the "h" prefix presumably means the
+// horizontal (row) pass, and the entry order must match the TX_TYPE enum,
+// which is declared elsewhere -- confirm before reordering.
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+
+// 1D functions
+// Kernel lookup table: one row per transform size (TX_SIZES rows, here the
+// 4-, 8-, 16-, 32- and 64-point kernels) and one column per ITX_TYPE_1D
+// value (DCT, ADST, identity). The last two rows only provide a DCT kernel;
+// their ADST and identity slots are NULL, so a caller must not index those
+// combinations without checking.
+static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
+ { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
+ { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
+ { av1_idct32_new, NULL, NULL },
+ { av1_idct64_new, NULL, NULL },
+};
+
+// Adds an 8-wide block of 16-bit residuals to the 8-bit pixels at `output`
+// and stores the result back with unsigned saturation.
+//   in      one int16x8_t per row, `height` rows
+//   output  destination pixels, also read as the prediction
+//   stride  distance in bytes between destination rows
+//   flipud  non-zero: consume the residual rows bottom-up
+//   height  number of rows to process
+static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
+                                                  uint8_t *output, int stride,
+                                                  int flipud,
+                                                  const int height) {
+  const int row_delta = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  uint8_t *dst = output;
+
+  for (int row = 0; row < height; ++row) {
+    // Widen the 8 prediction pixels to 16 bits so the residual can be added.
+    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(dst)));
+    const int16x8_t recon = vaddq_s16(pred, in[src_row]);
+    // vqmovun clamps the sum to [0, 255] while narrowing.
+    vst1_u8(dst, vqmovun_s16(recon));
+    dst += stride;
+    src_row += row_delta;
+  }
+}
+
+// Reconstructs 16 pixels: widens `pred` to 16 bits, adds res0 to the low
+// eight pixels and res1 to the high eight, then narrows back to 8 bits with
+// unsigned saturation.
+static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
+                                                    int16x8_t res0,
+                                                    int16x8_t res1) {
+  const int16x8_t pred_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
+  const int16x8_t pred_hi =
+      vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
+  const int16x8_t sum_lo = vaddq_s16(pred_lo, res0);
+  const int16x8_t sum_hi = vaddq_s16(pred_hi, res1);
+  return vcombine_u8(vqmovun_s16(sum_lo), vqmovun_s16(sum_hi));
+}
+
+// 16-wide variant of the residual add. For row r the left eight residuals
+// are in[r] and the right eight are in[r + height], i.e. the two 8-column
+// halves are stored back to back in `in`.
+//   output  destination pixels, also read as the prediction
+//   stride  distance in bytes between destination rows
+//   flipud  non-zero: consume the residual rows bottom-up
+static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
+                                                   uint8_t *output, int stride,
+                                                   int flipud, int height) {
+  const int row_delta = flipud ? -1 : 1;
+  int src_row = flipud ? (height - 1) : 0;
+  uint8_t *dst = output;
+
+  for (int row = 0; row < height; ++row) {
+    const uint8x16_t pred = vld1q_u8(dst);
+    const uint8x16_t recon =
+        lowbd_get_recon_16x16_neon(pred, in[src_row], in[src_row + height]);
+    vst1q_u8(dst, recon);
+    dst += stride;
+    src_row += row_delta;
+  }
+}
+
+// Fills a[0 .. size-1] with vectors whose eight lanes all hold `value`
+// (truncated to int16_t). Does nothing when size <= 0.
+static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
+                                                int value) {
+  const int16x8_t fill = vdupq_n_s16((int16_t)value);
+  int idx = 0;
+  while (idx < size) {
+    a[idx] = fill;
+    ++idx;
+  }
+}
+
+// Butterfly rotation using lanes 0 and 1 of `c`:
+//   *t0 = round_shift(in0 * c[0] + in1 * c[1], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[1] - in1 * c[0], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits with a
+// rounding right shift (vrshrn, which does not saturate).
+static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 0), b_lo, c, 1);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 0), b_hi, c, 1);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 1), b_lo, c, 0);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 1), b_hi, c, 0);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Butterfly rotation using lanes 1 and 0 of `c` (lane roles swapped relative
+// to btf_16_lane_0_1_neon):
+//   *t0 = round_shift(in0 * c[1] + in1 * c[0], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[0] - in1 * c[1], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits with a
+// rounding right shift (vrshrn, which does not saturate).
+static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 1), b_lo, c, 0);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 1), b_hi, c, 0);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 0), b_lo, c, 1);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 0), b_hi, c, 1);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Butterfly rotation using lanes 2 and 3 of `c`:
+//   *t0 = round_shift(in0 * c[2] + in1 * c[3], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[3] - in1 * c[2], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits with a
+// rounding right shift (vrshrn, which does not saturate).
+static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 2), b_lo, c, 3);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 2), b_hi, c, 3);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 3), b_lo, c, 2);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 3), b_hi, c, 2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// Single-input butterfly: scales in0 by two scalar coefficients.
+//   *t0 = round_shift(in0 * coef1, INV_COS_BIT)
+//   *t1 = round_shift(in0 * coef2, INV_COS_BIT)
+// The multiplies widen to 32 bits; vrshrn narrows back without saturation.
+static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
+                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t src_lo = vget_low_s16(in0);
+  const int16x4_t src_hi = vget_high_s16(in0);
+
+  const int32x4_t p1_lo = vmull_n_s16(src_lo, coef1);
+  const int32x4_t p1_hi = vmull_n_s16(src_hi, coef1);
+  const int32x4_t p2_lo = vmull_n_s16(src_lo, coef2);
+  const int32x4_t p2_hi = vmull_n_s16(src_hi, coef2);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(p2_lo, INV_COS_BIT),
+                     vrshrn_n_s32(p2_hi, INV_COS_BIT));
+}
+
+// Butterfly rotation using lanes 3 and 2 of `c` (lane roles swapped relative
+// to btf_16_lane_2_3_neon):
+//   *t0 = round_shift(in0 * c[3] + in1 * c[2], INV_COS_BIT)
+//   *t1 = round_shift(in0 * c[2] - in1 * c[3], INV_COS_BIT)
+// Products are accumulated in 32 bits and narrowed back to 16 bits with a
+// rounding right shift (vrshrn, which does not saturate).
+static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
+                                        const int16x8_t in1, const int16x4_t c,
+                                        int16x8_t *t0, int16x8_t *t1) {
+  const int16x4_t a_lo = vget_low_s16(in0);
+  const int16x4_t a_hi = vget_high_s16(in0);
+  const int16x4_t b_lo = vget_low_s16(in1);
+  const int16x4_t b_hi = vget_high_s16(in1);
+
+  const int32x4_t sum_lo =
+      vmlal_lane_s16(vmull_lane_s16(a_lo, c, 3), b_lo, c, 2);
+  const int32x4_t sum_hi =
+      vmlal_lane_s16(vmull_lane_s16(a_hi, c, 3), b_hi, c, 2);
+  const int32x4_t diff_lo =
+      vmlsl_lane_s16(vmull_lane_s16(a_lo, c, 2), b_lo, c, 3);
+  const int32x4_t diff_hi =
+      vmlsl_lane_s16(vmull_lane_s16(a_hi, c, 2), b_hi, c, 3);
+
+  *t0 = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
+                     vrshrn_n_s32(sum_hi, INV_COS_BIT));
+  *t1 = vcombine_s16(vrshrn_n_s32(diff_lo, INV_COS_BIT),
+                     vrshrn_n_s32(diff_hi, INV_COS_BIT));
+}
+
+// In-place sum/difference butterfly scaled by lane 0 of `c`:
+//   x[0] = round_shift(x[0] * c[0] + x[1] * c[0], INV_COS_BIT)
+//   x[1] = round_shift(x[0] * c[0] - x[1] * c[0], INV_COS_BIT)
+// (both right-hand sides use the values x[0], x[1] had on entry).
+static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
+  // Multiply first and add/subtract in 32 bits: doing the add/sub on the
+  // 16-bit inputs would overflow in iadst8.
+  const int32x4_t a_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
+  const int32x4_t a_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
+  const int32x4_t b_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
+  const int32x4_t b_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);
+
+  const int16x4_t sum_lo = vrshrn_n_s32(vaddq_s32(a_lo, b_lo), INV_COS_BIT);
+  const int16x4_t sum_hi = vrshrn_n_s32(vaddq_s32(a_hi, b_hi), INV_COS_BIT);
+  const int16x4_t diff_lo = vrshrn_n_s32(vsubq_s32(a_lo, b_lo), INV_COS_BIT);
+  const int16x4_t diff_hi = vrshrn_n_s32(vsubq_s32(a_hi, b_hi), INV_COS_BIT);
+
+  x[0] = vcombine_s16(sum_lo, sum_hi);
+  x[1] = vcombine_s16(diff_lo, diff_hi);
+}
+
+// Builds a 4-lane vector { *c0, *c1, *c2, *c3 } by loading one int16_t from
+// each pointer into successive lanes.
+// NOTE(review): callers in this file pass int32_t table entries cast to
+// int16_t *, so each load only sees the first two bytes of the entry in
+// memory -- that relies on little-endian layout; confirm for other targets.
+static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
+                                          int16_t *const c2,
+                                          int16_t *const c3) {
+  const int16x4_t zero = vdup_n_s16((int16_t)0);
+  const int16x4_t lane0 = vld1_lane_s16(c0, zero, 0);
+  const int16x4_t lane01 = vld1_lane_s16(c1, lane0, 1);
+  const int16x4_t lane012 = vld1_lane_s16(c2, lane01, 2);
+  return vld1_lane_s16(c3, lane012, 3);
+}
+
+// 8-point inverse ADST kernel operating on eight int16x8_t vectors (each
+// vector lane is an independent 1D transform).
+//   in       eight input vectors; fully read before `out` is written
+//   out      eight output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k0[4] = { (int16_t)cospi[4], (int16_t)cospi[60],
+                          (int16_t)cospi[20], (int16_t)cospi[44] };
+  const int16_t k1[4] = { (int16_t)cospi[36], (int16_t)cospi[28],
+                          (int16_t)cospi[52], (int16_t)cospi[12] };
+  const int16_t k2[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                          (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c0 = vld1_s16(k0);
+  const int16x4_t c1 = vld1_s16(k1);
+  const int16x4_t c2 = vld1_s16(k2);
+
+  int16x8_t x[8];
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  // Stage 1: input permutation.
+  x[0] = in[7];
+  x[1] = in[0];
+  x[2] = in[5];
+  x[3] = in[2];
+  x[4] = in[3];
+  x[5] = in[4];
+  x[6] = in[1];
+  x[7] = in[6];
+
+  // Stage 2
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+
+  // Stage 3: saturating add/sub.
+  x[0] = vqaddq_s16(s0, s4);
+  x[1] = vqaddq_s16(s1, s5);
+  x[2] = vqaddq_s16(s2, s6);
+  x[3] = vqaddq_s16(s3, s7);
+  x[4] = vqsubq_s16(s0, s4);
+  x[5] = vqsubq_s16(s1, s5);
+  x[6] = vqsubq_s16(s2, s6);
+  x[7] = vqsubq_s16(s3, s7);
+
+  // Stage 4
+  s0 = x[0];
+  s1 = x[1];
+  s2 = x[2];
+  s3 = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);
+
+  // Stage 5
+  x[0] = vqaddq_s16(s0, s2);
+  x[1] = vqaddq_s16(s1, s3);
+  x[2] = vqsubq_s16(s0, s2);
+  x[3] = vqsubq_s16(s1, s3);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+
+  // Stage 6: in-place half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: output permutation with alternating sign.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+// Reduced 8-point inverse ADST that reads only in[0]; every other input is
+// treated as absent, so the add/sub stages of the full kernel collapse into
+// plain copies.
+//   in       only in[0] is read
+//   out      eight output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k2[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                          (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c2 = vld1_s16(k2);
+
+  int16x8_t x[8];
+  int16x8_t s0, s1, s4, s5;
+
+  // Stage 1
+  x[1] = in[0];
+
+  // Stage 2
+  btf_16_neon(x[1], (int16_t)cospi[60], (int16_t)(-cospi[4]), &s0, &s1);
+
+  // Stage 3
+  x[0] = s0;
+  x[1] = s1;
+  x[4] = s0;
+  x[5] = s1;
+
+  // Stage 4
+  s0 = x[0];
+  s1 = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
+
+  // Stage 5
+  x[0] = s0;
+  x[1] = s1;
+  x[2] = s0;
+  x[3] = s1;
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+
+  // Stage 6: in-place half butterflies on (x[2], x[3]) and (x[6], x[7]).
+  btf_16_half_neon(x + 2, c2);
+  btf_16_half_neon(x + 6, c2);
+
+  // Stage 7: output permutation with alternating sign.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[4]);
+  out[2] = x[6];
+  out[3] = vnegq_s16(x[2]);
+  out[4] = x[3];
+  out[5] = vnegq_s16(x[7]);
+  out[6] = x[5];
+  out[7] = vnegq_s16(x[1]);
+}
+
+// 8-point inverse DCT kernel operating on eight int16x8_t vectors (each
+// vector lane is an independent 1D transform).
+//   in       eight input vectors; all reads happen before `out` is written
+//   out      eight output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit,
+                                  int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[8], step2[8];
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k0[4] = { (int16_t)cospi[8], (int16_t)cospi[56],
+                          (int16_t)cospi[40], (int16_t)cospi[24] };
+  const int16_t k2[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                          (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c0 = vld1_s16(k0);
+  const int16x4_t c2 = vld1_s16(k2);
+
+  // stage 2: odd-indexed inputs.
+  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);
+
+  // stage 3: even-indexed inputs, plus saturating add/sub on the odd half.
+  btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+
+  // stage 4
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);
+
+  // stage 5: final mirrored add/sub.
+  out[0] = vqaddq_s16(step1[0], step2[7]);
+  out[1] = vqaddq_s16(step1[1], step1[6]);
+  out[2] = vqaddq_s16(step1[2], step1[5]);
+  out[3] = vqaddq_s16(step1[3], step2[4]);
+  out[4] = vqsubq_s16(step1[3], step2[4]);
+  out[5] = vqsubq_s16(step1[2], step1[5]);
+  out[6] = vqsubq_s16(step1[1], step1[6]);
+  out[7] = vqsubq_s16(step1[0], step2[7]);
+}
+
+// Reduced 8-point inverse DCT that reads only in[0]: the single input is
+// scaled by cospi[32] (rounding shift by INV_COS_BIT) and the result is
+// copied to all eight outputs.
+//   cos_bit  selects the cosine table via cospi_arr()
+//   bit      unused
+static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                       int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16_t cos32 = (int16_t)cospi[32];
+
+  // Stages 1-3 reduce to one multiply of the only available input.
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cos32);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cos32);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // Stages 4-5 reduce to broadcasting that value.
+  for (int i = 0; i < 8; ++i) {
+    out[i] = dc;
+  }
+}
+
+// Applies a rounding shift by `bit` to every lane of arr[0 .. size-1] in
+// place. The shift is issued as vrshlq_s16 with a count of -bit, so a
+// positive `bit` shifts right. Returns immediately when bit is 0.
+// `size` is asserted to be a multiple of 4.
+void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
+  assert((size % 4) == 0);
+  if (bit == 0) return;
+
+  const int16x8_t shift = vdupq_n_s16((int16_t)(-bit));
+  int idx = 0;
+  while (idx < size) {
+    arr[idx] = vrshlq_s16(arr[idx], shift);
+    ++idx;
+  }
+}
+
+// Reverses the order of input[0 .. size-1] in place.
+// Elements are swapped pairwise from both ends, so no scratch buffer is
+// needed and any `size` is safe (a fixed 8-entry temporary would be overrun
+// by size > 8). Does nothing when size <= 1.
+static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
+  for (int lo = 0, hi = size - 1; lo < hi; ++lo, --hi) {
+    const int16x8_t tmp = input[lo];
+    input[lo] = input[hi];
+    input[hi] = tmp;
+  }
+}
+
+// Loads eight rows of eight int32_t values and narrows each row to one
+// int16x8_t (vmovn keeps the low 16 bits of every element; it does not
+// saturate).
+//   input     first element of the first row
+//   a         receives eight vectors, a[0] .. a[7]
+//   out_size  distance between consecutive rows, in int32_t elements
+static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
+                                                   int16x8_t *const a,
+                                                   int out_size) {
+  const int32_t *src = input;
+  for (int row = 0; row < 8; ++row) {
+    const int16x4_t lo = vmovn_s32(vld1q_s32(src));
+    const int16x4_t hi = vmovn_s32(vld1q_s32(src + 4));
+    a[row] = vcombine_s16(lo, hi);
+    src += out_size;
+  }
+}
+
+// 8-point identity kernel: every lane of each of the eight input vectors is
+// multiplied by 2 (plain 16-bit multiply, no saturation).
+// cos_bit and bit are unused.
+static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
+                                      int8_t cos_bit, int bit) {
+  (void)bit;
+  (void)cos_bit;
+
+  for (int i = 0; i < 8; ++i) {
+    output[i] = vmulq_n_s16(input[i], (int16_t)2);
+  }
+}
+
+// Scales input[0 .. size-1] by NewInvSqrt2 and shifts right by NewSqrt2Bits
+// with rounding; the narrowing back to 16 bits saturates (vqrshrn).
+// NOTE(review): presumably the 1/sqrt(2) correction applied to rectangular
+// transform blocks -- confirm against the callers.
+static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
+                                        int size) {
+  const int16_t inv_sqrt2 = (int16_t)NewInvSqrt2;
+
+  for (int idx = 0; idx < size; ++idx) {
+    const int16x8_t src = input[idx];
+    const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(src), inv_sqrt2);
+    const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(src), inv_sqrt2);
+    output[idx] = vcombine_s16(vqrshrn_n_s32(wide_lo, (int32_t)NewSqrt2Bits),
+                               vqrshrn_n_s32(wide_hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// 16-point identity kernel: every lane of the sixteen input vectors is
+// multiplied by 2 * NewSqrt2 in 32 bits, then shifted right by NewSqrt2Bits
+// with rounding and narrowed to 16 bits with saturation.
+// cos_bit and bit are unused.
+static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)bit;
+  (void)cos_bit;
+
+  const int16_t scale = (int16_t)(2 * NewSqrt2);
+
+  for (int idx = 0; idx < 16; ++idx) {
+    const int16x8_t src = input[idx];
+    const int32x4_t wide_lo = vmull_n_s16(vget_low_s16(src), scale);
+    const int32x4_t wide_hi = vmull_n_s16(vget_high_s16(src), scale);
+    output[idx] = vcombine_s16(vqrshrn_n_s32(wide_lo, (int32_t)NewSqrt2Bits),
+                               vqrshrn_n_s32(wide_hi, (int32_t)NewSqrt2Bits));
+  }
+}
+
+// 32-point identity kernel: every lane of the thirty-two input vectors is
+// multiplied by 4 (plain 16-bit multiply, no saturation).
+// cos_bit and bit are unused.
+static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
+                                       int8_t cos_bit, int bit) {
+  (void)bit;
+  (void)cos_bit;
+
+  const int16_t scale = (int16_t)4;
+  int idx = 0;
+  while (idx < 32) {
+    output[idx] = vmulq_n_s16(input[idx], scale);
+    ++idx;
+  }
+}
+
+// Reduced 16-point inverse DCT that reads only in[0]: the single input is
+// scaled by cospi[32] (rounding shift by INV_COS_BIT) and the result is
+// copied to all sixteen outputs.
+//   cos_bit  selects the cosine table via cospi_arr()
+//   bit      unused
+static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const int16_t cos32 = (int16_t)cospi[32];
+
+  // Stage 4 is the only stage that does arithmetic on the lone input.
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cos32);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cos32);
+  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // Stages 6-7 reduce to broadcasting that value.
+  for (int i = 0; i < 16; ++i) {
+    out[i] = dc;
+  }
+}
+
+// 16-point inverse DCT kernel operating on sixteen int16x8_t vectors (each
+// vector lane is an independent 1D transform).
+//   in       sixteen input vectors; all reads happen before `out` is written
+//   out      sixteen output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
+                                   int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k0[4] = { (int16_t)cospi[4], (int16_t)cospi[60],
+                          (int16_t)cospi[36], (int16_t)cospi[28] };
+  const int16_t k1[4] = { (int16_t)cospi[20], (int16_t)cospi[44],
+                          (int16_t)cospi[52], (int16_t)cospi[12] };
+  const int16_t k2[4] = { (int16_t)cospi[8], (int16_t)cospi[56],
+                          (int16_t)cospi[40], (int16_t)cospi[24] };
+  const int16_t k3[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                          (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c0 = vld1_s16(k0);
+  const int16x4_t c1 = vld1_s16(k1);
+  const int16x4_t c2 = vld1_s16(k2);
+  const int16x4_t c3 = vld1_s16(k3);
+
+  // stage 2: rotate the odd-indexed inputs, pass the even ones through in
+  // permuted order.
+
+  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
+  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
+  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
+  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);
+
+  step2[0] = in[0];
+  step2[1] = in[8];
+  step2[2] = in[4];
+  step2[3] = in[12];
+  step2[4] = in[2];
+  step2[5] = in[10];
+  step2[6] = in[6];
+  step2[7] = in[14];
+
+  // stage 3
+
+  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
+  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);
+
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4
+
+  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
+  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+
+  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
+
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+
+  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: final mirrored add/sub.
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+// Reduced 16-point inverse DCT that reads only in[0] .. in[7]; the two-input
+// rotations of the full kernel whose second operand would come from
+// in[8] .. in[15] become single-input btf_16_neon scalings.
+//   in       only the first eight vectors are read
+//   out      sixteen output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+  int16x8_t step1[16], step2[16];
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k0[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                          (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c0 = vld1_s16(k0);
+
+  // stage 1
+  // stage 2
+
+  step2[0] = in[0];
+  step2[2] = in[4];
+  step2[4] = in[2];
+  step2[6] = in[6];
+
+  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
+  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
+  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
+  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);
+
+  // stage 3
+
+  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
+  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
+
+  step1[0] = step2[0];
+  step1[2] = step2[2];
+  step1[8] = vqaddq_s16(step2[8], step2[9]);
+  step1[9] = vqsubq_s16(step2[8], step2[9]);
+  step1[10] = vqsubq_s16(step2[11], step2[10]);
+  step1[11] = vqaddq_s16(step2[11], step2[10]);
+  step1[12] = vqaddq_s16(step2[12], step2[13]);
+  step1[13] = vqsubq_s16(step2[12], step2[13]);
+  step1[14] = vqsubq_s16(step2[15], step2[14]);
+  step1[15] = vqaddq_s16(step2[15], step2[14]);
+
+  // stage 4
+
+  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
+  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
+  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
+  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
+                       &step2[10], &step2[13]);
+
+  step2[4] = vqaddq_s16(step1[4], step1[5]);
+  step2[5] = vqsubq_s16(step1[4], step1[5]);
+  step2[6] = vqsubq_s16(step1[7], step1[6]);
+  step2[7] = vqaddq_s16(step1[7], step1[6]);
+  step2[8] = step1[8];
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+  step2[15] = step1[15];
+
+  // stage 5
+
+  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
+  step1[0] = vqaddq_s16(step2[0], step2[3]);
+  step1[1] = vqaddq_s16(step2[1], step2[2]);
+  step1[2] = vqsubq_s16(step2[1], step2[2]);
+  step1[3] = vqsubq_s16(step2[0], step2[3]);
+  step1[4] = step2[4];
+  step1[7] = step2[7];
+  step1[8] = vqaddq_s16(step2[8], step2[11]);
+  step1[9] = vqaddq_s16(step2[9], step2[10]);
+  step1[10] = vqsubq_s16(step2[9], step2[10]);
+  step1[11] = vqsubq_s16(step2[8], step2[11]);
+  step1[12] = vqsubq_s16(step2[15], step2[12]);
+  step1[13] = vqsubq_s16(step2[14], step2[13]);
+  step1[14] = vqaddq_s16(step2[14], step2[13]);
+  step1[15] = vqaddq_s16(step2[15], step2[12]);
+
+  // stage 6
+  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
+  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);
+
+  step2[0] = vqaddq_s16(step1[0], step1[7]);
+  step2[1] = vqaddq_s16(step1[1], step1[6]);
+  step2[2] = vqaddq_s16(step1[2], step1[5]);
+  step2[3] = vqaddq_s16(step1[3], step1[4]);
+  step2[4] = vqsubq_s16(step1[3], step1[4]);
+  step2[5] = vqsubq_s16(step1[2], step1[5]);
+  step2[6] = vqsubq_s16(step1[1], step1[6]);
+  step2[7] = vqsubq_s16(step1[0], step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7: final mirrored add/sub.
+
+  out[0] = vqaddq_s16(step2[0], step2[15]);
+  out[1] = vqaddq_s16(step2[1], step2[14]);
+  out[2] = vqaddq_s16(step2[2], step2[13]);
+  out[3] = vqaddq_s16(step2[3], step2[12]);
+  out[4] = vqaddq_s16(step2[4], step2[11]);
+  out[5] = vqaddq_s16(step2[5], step2[10]);
+  out[6] = vqaddq_s16(step2[6], step2[9]);
+  out[7] = vqaddq_s16(step2[7], step2[8]);
+  out[8] = vqsubq_s16(step2[7], step2[8]);
+  out[9] = vqsubq_s16(step2[6], step2[9]);
+  out[10] = vqsubq_s16(step2[5], step2[10]);
+  out[11] = vqsubq_s16(step2[4], step2[11]);
+  out[12] = vqsubq_s16(step2[3], step2[12]);
+  out[13] = vqsubq_s16(step2[2], step2[13]);
+  out[14] = vqsubq_s16(step2[1], step2[14]);
+  out[15] = vqsubq_s16(step2[0], step2[15]);
+}
+
+// 16-point inverse ADST kernel operating on sixteen int16x8_t vectors (each
+// vector lane is an independent 1D transform).
+//   in       sixteen input vectors; fully copied before `out` is written
+//   out      sixteen output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
+                                    int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k0[4] = { (int16_t)cospi[2], (int16_t)cospi[62],
+                          (int16_t)cospi[10], (int16_t)cospi[54] };
+  const int16_t k1[4] = { (int16_t)cospi[18], (int16_t)cospi[46],
+                          (int16_t)cospi[26], (int16_t)cospi[38] };
+  const int16_t k2[4] = { (int16_t)cospi[34], (int16_t)cospi[30],
+                          (int16_t)cospi[42], (int16_t)cospi[22] };
+  const int16_t k3[4] = { (int16_t)cospi[50], (int16_t)cospi[14],
+                          (int16_t)cospi[58], (int16_t)cospi[6] };
+  const int16_t k4[4] = { (int16_t)cospi[8], (int16_t)cospi[56],
+                          (int16_t)cospi[40], (int16_t)cospi[24] };
+  const int16_t k[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                         (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c0 = vld1_s16(k0);
+  const int16x4_t c1 = vld1_s16(k1);
+  const int16x4_t c2 = vld1_s16(k2);
+  const int16x4_t c3 = vld1_s16(k3);
+  const int16x4_t c4 = vld1_s16(k4);
+  const int16x4_t c = vld1_s16(k);
+
+  int16x8_t x[16];
+  int16x8_t t[14];
+  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+  // Stage 1: input permutation.
+  x[0] = in[15];
+  x[1] = in[0];
+  x[2] = in[13];
+  x[3] = in[2];
+  x[4] = in[11];
+  x[5] = in[4];
+  x[6] = in[9];
+  x[7] = in[6];
+  x[8] = in[7];
+  x[9] = in[8];
+  x[10] = in[5];
+  x[11] = in[10];
+  x[12] = in[3];
+  x[13] = in[12];
+  x[14] = in[1];
+  x[15] = in[14];
+
+  // Stage 2
+  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
+  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
+  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
+  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
+  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
+  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
+  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);
+
+  // Stage 3: saturating add/sub.
+  x[0] = vqaddq_s16(s0, s8);
+  x[1] = vqaddq_s16(s1, s9);
+  x[2] = vqaddq_s16(s2, s10);
+  x[3] = vqaddq_s16(s3, s11);
+  x[4] = vqaddq_s16(s4, s12);
+  x[5] = vqaddq_s16(s5, s13);
+  x[6] = vqaddq_s16(s6, s14);
+  x[7] = vqaddq_s16(s7, s15);
+  x[8] = vqsubq_s16(s0, s8);
+  x[9] = vqsubq_s16(s1, s9);
+  x[10] = vqsubq_s16(s2, s10);
+  x[11] = vqsubq_s16(s3, s11);
+  x[12] = vqsubq_s16(s4, s12);
+  x[13] = vqsubq_s16(s5, s13);
+  x[14] = vqsubq_s16(s6, s14);
+  x[15] = vqsubq_s16(s7, s15);
+
+  // Stage 4
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  t[4] = x[4];
+  t[5] = x[5];
+  t[6] = x[6];
+  t[7] = x[7];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
+  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);
+
+  // Stage 5
+  x[0] = vqaddq_s16(t[0], t[4]);
+  x[1] = vqaddq_s16(t[1], t[5]);
+  x[2] = vqaddq_s16(t[2], t[6]);
+  x[3] = vqaddq_s16(t[3], t[7]);
+  x[4] = vqsubq_s16(t[0], t[4]);
+  x[5] = vqsubq_s16(t[1], t[5]);
+  x[6] = vqsubq_s16(t[2], t[6]);
+  x[7] = vqsubq_s16(t[3], t[7]);
+  x[8] = vqaddq_s16(s8, s12);
+  x[9] = vqaddq_s16(s9, s13);
+  x[10] = vqaddq_s16(s10, s14);
+  x[11] = vqaddq_s16(s11, s15);
+  x[12] = vqsubq_s16(s8, s12);
+  x[13] = vqsubq_s16(s9, s13);
+  x[14] = vqsubq_s16(s10, s14);
+  x[15] = vqsubq_s16(s11, s15);
+
+  // stage 6
+  t[0] = x[0];
+  t[1] = x[1];
+  t[2] = x[2];
+  t[3] = x[3];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+  t[8] = x[8];
+  t[9] = x[9];
+  t[10] = x[10];
+  t[11] = x[11];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+  // Stage 7
+  x[0] = vqaddq_s16(t[0], t[2]);
+  x[1] = vqaddq_s16(t[1], t[3]);
+  x[2] = vqsubq_s16(t[0], t[2]);
+  x[3] = vqsubq_s16(t[1], t[3]);
+  x[4] = vqaddq_s16(s4, s6);
+  x[5] = vqaddq_s16(s5, s7);
+  x[6] = vqsubq_s16(s4, s6);
+  x[7] = vqsubq_s16(s5, s7);
+  x[8] = vqaddq_s16(t[8], t[10]);
+  x[9] = vqaddq_s16(t[9], t[11]);
+  x[10] = vqsubq_s16(t[8], t[10]);
+  x[11] = vqsubq_s16(t[9], t[11]);
+  x[12] = vqaddq_s16(s12, s14);
+  x[13] = vqaddq_s16(s13, s15);
+  x[14] = vqsubq_s16(s12, s14);
+  x[15] = vqsubq_s16(s13, s15);
+
+  // Stage 8: in-place half butterflies on the (2,3), (6,7), (10,11) and
+  // (14,15) pairs.
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: output permutation with alternating sign.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+// Reduced 16-point inverse ADST that reads only in[0]; every other input is
+// treated as absent, so the add/sub stages of the full kernel collapse into
+// plain copies.
+//   in       only in[0] is read
+//   out      sixteen output vectors
+//   cos_bit  selects the cosine table via cospi_arr(); the rounding shifts
+//            inside the butterflies use the INV_COS_BIT constant
+//   bit      unused
+static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
+                                         int8_t cos_bit, int bit) {
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // Narrow the 32-bit table entries by value. Loading them through an
+  // int16_t pointer cast would read the wrong half-word on big-endian
+  // targets and accesses int32_t objects through an incompatible type.
+  const int16_t k4[4] = { (int16_t)cospi[8], (int16_t)cospi[56],
+                          (int16_t)cospi[40], (int16_t)cospi[24] };
+  const int16_t k[4] = { (int16_t)cospi[32], (int16_t)cospi[32],
+                         (int16_t)cospi[16], (int16_t)cospi[48] };
+  const int16x4_t c4 = vld1_s16(k4);
+  const int16x4_t c = vld1_s16(k);
+
+  int16x8_t x[16];
+  int16x8_t t[10];
+  int16x8_t s0, s1, s4, s5;
+  int16x8_t s8, s9, s12, s13;
+
+  // Stage 1
+  x[1] = in[0];
+
+  // Stage 2
+  btf_16_neon(x[1], (int16_t)cospi[62], (int16_t)(-cospi[2]), &s0, &s1);
+
+  // Stage 3
+  x[0] = s0;
+  x[1] = s1;
+  x[8] = s0;
+  x[9] = s1;
+
+  // Stage 4
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+
+  // Stage 5
+  x[0] = t[0];
+  x[1] = t[1];
+  x[4] = t[0];
+  x[5] = t[1];
+  x[8] = s8;
+  x[9] = s9;
+  x[12] = s8;
+  x[13] = s9;
+
+  // stage 6
+  t[0] = x[0];
+  t[1] = x[1];
+  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+  t[8] = x[8];
+  t[9] = x[9];
+  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+
+  // Stage 7
+  x[0] = t[0];
+  x[1] = t[1];
+  x[2] = t[0];
+  x[3] = t[1];
+  x[4] = s4;
+  x[5] = s5;
+  x[6] = s4;
+  x[7] = s5;
+  x[8] = t[8];
+  x[9] = t[9];
+  x[10] = t[8];
+  x[11] = t[9];
+  x[12] = s12;
+  x[13] = s13;
+  x[14] = s12;
+  x[15] = s13;
+
+  // Stage 8: in-place half butterflies on the (2,3), (6,7), (10,11) and
+  // (14,15) pairs.
+  btf_16_half_neon(x + 2, c);
+  btf_16_half_neon(x + 6, c);
+  btf_16_half_neon(x + 10, c);
+  btf_16_half_neon(x + 14, c);
+
+  // Stage 9: output permutation with alternating sign.
+  out[0] = x[0];
+  out[1] = vnegq_s16(x[8]);
+  out[2] = x[12];
+  out[3] = vnegq_s16(x[4]);
+  out[4] = x[6];
+  out[5] = vnegq_s16(x[14]);
+  out[6] = x[10];
+  out[7] = vnegq_s16(x[2]);
+  out[8] = x[3];
+  out[9] = vnegq_s16(x[11]);
+  out[10] = x[15];
+  out[11] = vnegq_s16(x[7]);
+  out[12] = x[5];
+  out[13] = vnegq_s16(x[13]);
+  out[14] = x[9];
+  out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
+ int8_t cos_bit, int bit) { // NOTE(review): going by the name, presumably the 16-point inverse ADST for inputs whose last 8 coefficients are zero -- confirm with the caller.
+ (void)bit; // bit is deliberately unused.
+ const int32_t *cospi = cospi_arr(cos_bit); // constant table looked up for this cos_bit
+ // Only in[0..7] are read; every element of out[0..15] is written.
+ const int16x4_t c4 = // presumably lanes {cospi[8], cospi[56], cospi[40], cospi[24]}; create_s16x4_neon is defined elsewhere
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c = // presumably lanes {cospi[32], cospi[32], cospi[16], cospi[48]}
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ int16x8_t x[16]; // working values, rewritten by every odd-numbered stage
+ int16x8_t t[14]; // pass-through copies; only t[0..11] are used
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; // butterfly outputs
+ int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;
+
+ // Stage 1: scatter the inputs; even inputs go to odd slots 1..7, odd inputs go (reversed) to even slots 14..8. The other x[] slots are not read by stage 2.
+ x[1] = in[0];
+ x[3] = in[2];
+ x[5] = in[4];
+ x[7] = in[6];
+ x[8] = in[7];
+ x[10] = in[5];
+ x[12] = in[3];
+ x[14] = in[1];
+
+ // Stage 2: one single-input btf_16_neon call per loaded slot, each producing a pair of s values. The first four calls pass a negated second coefficient.
+ btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
+ btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
+ btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
+ btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);
+
+ btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
+ btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
+ btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
+ btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);
+
+ // Stage 3: x[i] = s[i] + s[i+8] and x[i+8] = s[i] - s[i+8] for i = 0..7, both saturating.
+ x[0] = vqaddq_s16(s0, s8);
+ x[1] = vqaddq_s16(s1, s9);
+ x[2] = vqaddq_s16(s2, s10);
+ x[3] = vqaddq_s16(s3, s11);
+ x[4] = vqaddq_s16(s4, s12);
+ x[5] = vqaddq_s16(s5, s13);
+ x[6] = vqaddq_s16(s6, s14);
+ x[7] = vqaddq_s16(s7, s15);
+ x[8] = vqsubq_s16(s0, s8);
+ x[9] = vqsubq_s16(s1, s9);
+ x[10] = vqsubq_s16(s2, s10);
+ x[11] = vqsubq_s16(s3, s11);
+ x[12] = vqsubq_s16(s4, s12);
+ x[13] = vqsubq_s16(s5, s13);
+ x[14] = vqsubq_s16(s6, s14);
+ x[15] = vqsubq_s16(s7, s15);
+
+ // Stage 4: x[0..7] are carried over unchanged in t[0..7]; x[8..15] go through two-input lane butterflies with c4.
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ t[4] = x[4];
+ t[5] = x[5];
+ t[6] = x[6];
+ t[7] = x[7];
+ btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
+ btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
+ btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); // note the swapped input and output order
+ btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); // note the swapped input and output order
+
+ // Stage 5: saturating add/sub between elements 4 apart, separately for t[0..7] and s8..s15.
+ x[0] = vqaddq_s16(t[0], t[4]);
+ x[1] = vqaddq_s16(t[1], t[5]);
+ x[2] = vqaddq_s16(t[2], t[6]);
+ x[3] = vqaddq_s16(t[3], t[7]);
+ x[4] = vqsubq_s16(t[0], t[4]);
+ x[5] = vqsubq_s16(t[1], t[5]);
+ x[6] = vqsubq_s16(t[2], t[6]);
+ x[7] = vqsubq_s16(t[3], t[7]);
+ x[8] = vqaddq_s16(s8, s12);
+ x[9] = vqaddq_s16(s9, s13);
+ x[10] = vqaddq_s16(s10, s14);
+ x[11] = vqaddq_s16(s11, s15);
+ x[12] = vqsubq_s16(s8, s12);
+ x[13] = vqsubq_s16(s9, s13);
+ x[14] = vqsubq_s16(s10, s14);
+ x[15] = vqsubq_s16(s11, s15);
+
+ // stage 6: x[0..3] and x[8..11] are carried over in t[]; x[4..7] and x[12..15] go through the _2_3/_3_2 lane butterflies with c.
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
+ btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
+ t[8] = x[8];
+ t[9] = x[9];
+ t[10] = x[10];
+ t[11] = x[11];
+ btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
+ btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);
+
+ // Stage 7: saturating add/sub between elements 2 apart, within each group of four.
+ x[0] = vqaddq_s16(t[0], t[2]);
+ x[1] = vqaddq_s16(t[1], t[3]);
+ x[2] = vqsubq_s16(t[0], t[2]);
+ x[3] = vqsubq_s16(t[1], t[3]);
+ x[4] = vqaddq_s16(s4, s6);
+ x[5] = vqaddq_s16(s5, s7);
+ x[6] = vqsubq_s16(s4, s6);
+ x[7] = vqsubq_s16(s5, s7);
+ x[8] = vqaddq_s16(t[8], t[10]);
+ x[9] = vqaddq_s16(t[9], t[11]);
+ x[10] = vqsubq_s16(t[8], t[10]);
+ x[11] = vqsubq_s16(t[9], t[11]);
+ x[12] = vqaddq_s16(s12, s14);
+ x[13] = vqaddq_s16(s13, s15);
+ x[14] = vqsubq_s16(s12, s14);
+ x[15] = vqsubq_s16(s13, s15);
+
+ // Stage 8: btf_16_half_neon gets a pointer to x[2], x[6], x[10] and x[14]; presumably it updates the pair starting there in place (helper defined elsewhere).
+ btf_16_half_neon(x + 2, c);
+ btf_16_half_neon(x + 6, c);
+ btf_16_half_neon(x + 10, c);
+ btf_16_half_neon(x + 14, c);
+
+ // Stage 9: outputs are a fixed permutation of x[]; every odd-indexed output is negated.
+ out[0] = x[0];
+ out[1] = vnegq_s16(x[8]);
+ out[2] = x[12];
+ out[3] = vnegq_s16(x[4]);
+ out[4] = x[6];
+ out[5] = vnegq_s16(x[14]);
+ out[6] = x[10];
+ out[7] = vnegq_s16(x[2]);
+ out[8] = x[3];
+ out[9] = vnegq_s16(x[11]);
+ out[10] = x[15];
+ out[11] = vnegq_s16(x[7]);
+ out[12] = x[5];
+ out[13] = vnegq_s16(x[13]);
+ out[14] = x[9];
+ out[15] = vnegq_s16(x[1]);
+}
+
+static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) { // NOTE(review): going by the name, presumably the full 32-point inverse DCT; reads in[0..31] and writes out[0..31].
+ (void)bit; // bit is deliberately unused.
+ const int32_t *cospi = cospi_arr(cos_bit); // constant table looked up for this cos_bit
+ int16x8_t step1[32], step2[32]; // successive stages alternate between these two buffers
+ // c0..c7 each bundle four table entries for the lane-selecting butterfly helpers (presumably narrowed to 16 bits by create_s16x4_neon, which is defined elsewhere).
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
+ (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
+ (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
+ const int16x4_t c2 =
+ create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
+ (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
+ const int16x4_t c3 =
+ create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
+ (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
+ const int16x4_t c4 =
+ create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
+ (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
+ const int16x4_t c5 =
+ create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
+ (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
+ const int16x4_t c6 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c7 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 2: odd-indexed inputs are paired through lane butterflies (c0..c3) into step2[16..31];
+ // even-indexed inputs are copied into step2[0..15] in a fixed permuted order.
+ btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
+ btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
+ btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
+ btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
+ btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
+ btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);
+
+ step2[0] = in[0];
+ step2[1] = in[16];
+ step2[2] = in[8];
+ step2[3] = in[24];
+ step2[4] = in[4];
+ step2[5] = in[20];
+ step2[6] = in[12];
+ step2[7] = in[28];
+ step2[8] = in[2];
+ step2[9] = in[18];
+ step2[10] = in[10];
+ step2[11] = in[26];
+ step2[12] = in[6];
+ step2[13] = in[22];
+ step2[14] = in[14];
+ step2[15] = in[30];
+
+ // stage 3: lane butterflies (c4, c5) on step2[8..15]; step2[0..7] are copied through;
+ // step1[16..31] are saturating sums/differences of neighbouring step2 pairs.
+ btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
+ btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
+ btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
+ btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);
+
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ step1[4] = step2[4];
+ step1[5] = step2[5];
+ step1[6] = step2[6];
+ step1[7] = step2[7];
+
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4: lane butterflies (c6) on step1[4..7] and on the pairs (30,17), (18,29), (26,21), (22,25),
+ // where (18,29) and (22,25) are negated first; step1[8..15] are combined pairwise by saturating add/sub; the rest are copied.
+ btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
+ btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5: lane butterflies (c7) on step2[0..3], on (14,9) and on (10,13), the last pair negated first;
+ // saturating add/sub inside step2[4..7] and inside each group of four in step2[16..31]; 8, 11, 12 and 15 are copied.
+ btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
+ btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6: lane butterflies (c7) on (6,5), (29,18), (28,19), (20,27) and (21,26), the last two pairs negated first;
+ // saturating add/sub inside step1[0..3] and inside each group of four in step1[8..15]; the rest are copied.
+ btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[1], step1[2]);
+ step2[2] = vqsubq_s16(step1[1], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7: lane butterflies (c7) on (13,10) and (12,11); saturating add/sub mirrored inside step2[0..7],
+ // inside step2[16..23] and inside step2[24..31]; 8, 9, 14 and 15 are copied.
+ btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8: lane butterflies (c7) on (27,20), (26,21), (25,22) and (24,23); step1[i] is combined with
+ // step1[15 - i] by saturating add/sub for i = 0..7; 16..19 and 28..31 are copied.
+ btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9: out[i] = step2[i] + step2[31 - i] and out[31 - i] = step2[i] - step2[31 - i]
+ // for i = 0..15, both saturating.
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
+                                        int8_t cos_bit, int bit) {
+  // Variant that consumes in[0] only: that vector is scaled by cospi[32]
+  // once, and the scaled vector is then stored to every one of out[0..31].
+  (void)bit;
+  const int32_t *cospi = cospi_arr(cos_bit);
+
+  // Stages 1-5: widening multiply of each half by cospi[32], then a rounding
+  // right shift by INV_COS_BIT that narrows back to 16 bits.
+  const int16x8_t src = in[0];
+  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(src), cospi[32]);
+  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(src), cospi[32]);
+  const int16x8_t scaled = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
+                                        vrshrn_n_s32(prod_hi, INV_COS_BIT));
+
+  // Stages 6-9: all 32 outputs receive the same vector.
+  for (int i = 0; i < 32; ++i) {
+    out[i] = scaled;
+  }
+}
+
+static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) { // NOTE(review): going by the name, presumably the 32-point inverse DCT for inputs whose last 24 coefficients are zero -- confirm with the caller. Only in[0..7] are read; all of out[0..31] are written.
+ (void)bit; // bit is deliberately unused.
+ const int32_t *cospi = cospi_arr(cos_bit); // constant table looked up for this cos_bit
+ int16x8_t step1[32], step2[32]; // successive stages alternate between these two buffers
+ int32x4_t t32[16]; // only t32[0] and t32[1] are used (stage 5)
+ const int16x4_t c0 = // presumably lanes {cospi[8], cospi[56], cospi[40], cospi[24]}; create_s16x4_neon is defined elsewhere
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 = // presumably lanes {cospi[32], cospi[32], cospi[16], cospi[48]}
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2: inputs 0, 4, 2 and 6 are copied into step2[0], [4], [8], [12]; inputs 1, 7, 5 and 3 each go
+ // through a single-input btf_16_neon that produces two step2 values in the 16..31 range.
+ step2[0] = in[0];
+ step2[4] = in[4];
+ step2[8] = in[2];
+ step2[12] = in[6];
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ // stage 3: step2[0] and step2[4] are copied; step2[8] and step2[12] go through single-input butterflies.
+ step1[0] = step2[0];
+ step1[4] = step2[4];
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+ // Each of the eight step2 values produced in stage 2 is duplicated into an adjacent pair of step1 slots.
+ step1[16] = step2[16];
+ step1[17] = step2[16];
+ step1[18] = step2[19];
+ step1[19] = step2[19];
+ step1[20] = step2[20];
+ step1[21] = step2[20];
+ step1[22] = step2[23];
+ step1[23] = step2[23];
+ step1[24] = step2[24];
+ step1[25] = step2[24];
+ step1[26] = step2[27];
+ step1[27] = step2[27];
+ step1[28] = step2[28];
+ step1[29] = step2[28];
+ step1[30] = step2[31];
+ step1[31] = step2[31];
+
+ // stage 4: single-input butterfly on step1[4]; lane butterflies (c0) on the pairs (30,17), (18,29), (26,21), (22,25),
+ // where (18,29) and (22,25) are negated first.
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+ // step1[8], [11], [12] and [15] are each duplicated into an adjacent pair of step2 slots; the rest are copied.
+ step2[0] = step1[0];
+ step2[8] = step1[8];
+ step2[9] = step1[8];
+ step2[10] = step1[11];
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[13] = step1[12];
+ step2[14] = step1[15];
+ step2[15] = step1[15];
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5: step1[0] = step2[0] * cospi[32], rounded and shifted right by INV_COS_BIT while narrowing to 16 bits.
+ // vmull_n_s16 takes an int16_t scalar, so the int32_t cospi[32] is implicitly narrowed at the call.
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+ // Lane butterflies (c1) on (14,9) and on (10,13), the latter pair negated first.
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+ // step2[4] and step2[7] are each duplicated; 8, 11, 12, 15 are copied; 16..31 are combined by saturating add/sub inside each group of four.
+ step1[4] = step2[4];
+ step1[5] = step2[4];
+ step1[6] = step2[7];
+ step1[7] = step2[7];
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6: lane butterflies (c1) on (6,5), (29,18), (28,19), (20,27) and (21,26), the last two pairs negated first.
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+ // step1[0] is fanned out to step2[0..3]; step1[8..15] are combined by saturating add/sub inside each group of four; the rest are copied.
+ step2[0] = step1[0];
+ step2[1] = step1[0];
+ step2[2] = step1[0];
+ step2[3] = step1[0];
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7: lane butterflies (c1) on (13,10) and (12,11); saturating add/sub mirrored inside step2[0..7],
+ // inside step2[16..23] and inside step2[24..31]; 8, 9, 14 and 15 are copied.
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8: lane butterflies (c1) on (27,20), (26,21), (25,22) and (24,23); step1[i] is combined with
+ // step1[15 - i] by saturating add/sub for i = 0..7; 16..19 and 28..31 are copied.
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9: out[i] = step2[i] + step2[31 - i] and out[31 - i] = step2[i] - step2[31 - i]
+ // for i = 0..15, both saturating.
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
+ int8_t cos_bit, int bit) {
+ (void)bit;
+ const int32_t *cospi = cospi_arr(cos_bit);
+ int16x8_t step1[32], step2[32];
+ int32x4_t t32[16];
+ const int16x4_t c0 =
+ create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
+ (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
+ const int16x4_t c1 =
+ create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
+ (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
+
+ // stage 1
+ // stage 2
+
+ btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
+ btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
+ btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
+ btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
+ btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
+ btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
+ btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
+ btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
+
+ step2[0] = in[0];
+ step2[2] = in[8];
+ step2[4] = in[4];
+ step2[6] = in[12];
+ step2[8] = in[2];
+ step2[10] = in[10];
+ step2[12] = in[6];
+ step2[14] = in[14];
+
+ // stage 3
+
+ btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
+ btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
+ btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
+ btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
+
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = step2[4];
+ step1[6] = step2[6];
+ step1[16] = vqaddq_s16(step2[16], step2[17]);
+ step1[17] = vqsubq_s16(step2[16], step2[17]);
+ step1[18] = vqsubq_s16(step2[19], step2[18]);
+ step1[19] = vqaddq_s16(step2[19], step2[18]);
+ step1[20] = vqaddq_s16(step2[20], step2[21]);
+ step1[21] = vqsubq_s16(step2[20], step2[21]);
+ step1[22] = vqsubq_s16(step2[23], step2[22]);
+ step1[23] = vqaddq_s16(step2[23], step2[22]);
+ step1[24] = vqaddq_s16(step2[24], step2[25]);
+ step1[25] = vqsubq_s16(step2[24], step2[25]);
+ step1[26] = vqsubq_s16(step2[27], step2[26]);
+ step1[27] = vqaddq_s16(step2[27], step2[26]);
+ step1[28] = vqaddq_s16(step2[28], step2[29]);
+ step1[29] = vqsubq_s16(step2[28], step2[29]);
+ step1[30] = vqsubq_s16(step2[31], step2[30]);
+ step1[31] = vqaddq_s16(step2[31], step2[30]);
+
+ // stage 4
+
+ btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
+ btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
+ btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
+ btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
+ &step2[18], &step2[29]);
+ btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
+ &step2[22], &step2[25]);
+
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[8] = vqaddq_s16(step1[8], step1[9]);
+ step2[9] = vqsubq_s16(step1[8], step1[9]);
+ step2[10] = vqsubq_s16(step1[11], step1[10]);
+ step2[11] = vqaddq_s16(step1[11], step1[10]);
+ step2[12] = vqaddq_s16(step1[12], step1[13]);
+ step2[13] = vqsubq_s16(step1[12], step1[13]);
+ step2[14] = vqsubq_s16(step1[15], step1[14]);
+ step2[15] = vqaddq_s16(step1[15], step1[14]);
+ step2[16] = step1[16];
+ step2[19] = step1[19];
+ step2[20] = step1[20];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[27] = step1[27];
+ step2[28] = step1[28];
+ step2[31] = step1[31];
+
+ // stage 5
+
+ t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
+ t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
+
+ step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
+ vrshrn_n_s32(t32[1], INV_COS_BIT));
+
+ btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
+ btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
+ btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
+ &step1[10], &step1[13]);
+
+ step1[4] = vqaddq_s16(step2[4], step2[5]);
+ step1[5] = vqsubq_s16(step2[4], step2[5]);
+ step1[6] = vqsubq_s16(step2[7], step2[6]);
+ step1[7] = vqaddq_s16(step2[7], step2[6]);
+ step1[8] = step2[8];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[19]);
+ step1[17] = vqaddq_s16(step2[17], step2[18]);
+ step1[18] = vqsubq_s16(step2[17], step2[18]);
+ step1[19] = vqsubq_s16(step2[16], step2[19]);
+ step1[20] = vqsubq_s16(step2[23], step2[20]);
+ step1[21] = vqsubq_s16(step2[22], step2[21]);
+ step1[22] = vqaddq_s16(step2[22], step2[21]);
+ step1[23] = vqaddq_s16(step2[23], step2[20]);
+ step1[24] = vqaddq_s16(step2[24], step2[27]);
+ step1[25] = vqaddq_s16(step2[25], step2[26]);
+ step1[26] = vqsubq_s16(step2[25], step2[26]);
+ step1[27] = vqsubq_s16(step2[24], step2[27]);
+ step1[28] = vqsubq_s16(step2[31], step2[28]);
+ step1[29] = vqsubq_s16(step2[30], step2[29]);
+ step1[30] = vqaddq_s16(step2[30], step2[29]);
+ step1[31] = vqaddq_s16(step2[31], step2[28]);
+
+ // stage 6
+
+ btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
+ btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
+ btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
+ &step2[20], &step2[27]);
+ btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
+ &step2[21], &step2[26]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[3]);
+ step2[1] = vqaddq_s16(step1[0], step1[2]);
+ step2[2] = vqsubq_s16(step1[0], step1[2]);
+ step2[3] = vqsubq_s16(step1[0], step1[3]);
+ step2[4] = step1[4];
+ step2[7] = step1[7];
+ step2[8] = vqaddq_s16(step1[8], step1[11]);
+ step2[9] = vqaddq_s16(step1[9], step1[10]);
+ step2[10] = vqsubq_s16(step1[9], step1[10]);
+ step2[11] = vqsubq_s16(step1[8], step1[11]);
+ step2[12] = vqsubq_s16(step1[15], step1[12]);
+ step2[13] = vqsubq_s16(step1[14], step1[13]);
+ step2[14] = vqaddq_s16(step1[14], step1[13]);
+ step2[15] = vqaddq_s16(step1[15], step1[12]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[22] = step1[22];
+ step2[23] = step1[23];
+ step2[24] = step1[24];
+ step2[25] = step1[25];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 7
+
+ btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
+ btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
+
+ step1[0] = vqaddq_s16(step2[0], step2[7]);
+ step1[1] = vqaddq_s16(step2[1], step2[6]);
+ step1[2] = vqaddq_s16(step2[2], step2[5]);
+ step1[3] = vqaddq_s16(step2[3], step2[4]);
+ step1[4] = vqsubq_s16(step2[3], step2[4]);
+ step1[5] = vqsubq_s16(step2[2], step2[5]);
+ step1[6] = vqsubq_s16(step2[1], step2[6]);
+ step1[7] = vqsubq_s16(step2[0], step2[7]);
+ step1[8] = step2[8];
+ step1[9] = step2[9];
+ step1[14] = step2[14];
+ step1[15] = step2[15];
+ step1[16] = vqaddq_s16(step2[16], step2[23]);
+ step1[17] = vqaddq_s16(step2[17], step2[22]);
+ step1[18] = vqaddq_s16(step2[18], step2[21]);
+ step1[19] = vqaddq_s16(step2[19], step2[20]);
+ step1[20] = vqsubq_s16(step2[19], step2[20]);
+ step1[21] = vqsubq_s16(step2[18], step2[21]);
+ step1[22] = vqsubq_s16(step2[17], step2[22]);
+ step1[23] = vqsubq_s16(step2[16], step2[23]);
+ step1[24] = vqsubq_s16(step2[31], step2[24]);
+ step1[25] = vqsubq_s16(step2[30], step2[25]);
+ step1[26] = vqsubq_s16(step2[29], step2[26]);
+ step1[27] = vqsubq_s16(step2[28], step2[27]);
+ step1[28] = vqaddq_s16(step2[27], step2[28]);
+ step1[29] = vqaddq_s16(step2[26], step2[29]);
+ step1[30] = vqaddq_s16(step2[25], step2[30]);
+ step1[31] = vqaddq_s16(step2[24], step2[31]);
+
+ // stage 8
+
+ btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
+ btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
+ btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
+ btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
+
+ step2[0] = vqaddq_s16(step1[0], step1[15]);
+ step2[1] = vqaddq_s16(step1[1], step1[14]);
+ step2[2] = vqaddq_s16(step1[2], step1[13]);
+ step2[3] = vqaddq_s16(step1[3], step1[12]);
+ step2[4] = vqaddq_s16(step1[4], step1[11]);
+ step2[5] = vqaddq_s16(step1[5], step1[10]);
+ step2[6] = vqaddq_s16(step1[6], step1[9]);
+ step2[7] = vqaddq_s16(step1[7], step1[8]);
+ step2[8] = vqsubq_s16(step1[7], step1[8]);
+ step2[9] = vqsubq_s16(step1[6], step1[9]);
+ step2[10] = vqsubq_s16(step1[5], step1[10]);
+ step2[11] = vqsubq_s16(step1[4], step1[11]);
+ step2[12] = vqsubq_s16(step1[3], step1[12]);
+ step2[13] = vqsubq_s16(step1[2], step1[13]);
+ step2[14] = vqsubq_s16(step1[1], step1[14]);
+ step2[15] = vqsubq_s16(step1[0], step1[15]);
+ step2[16] = step1[16];
+ step2[17] = step1[17];
+ step2[18] = step1[18];
+ step2[19] = step1[19];
+ step2[28] = step1[28];
+ step2[29] = step1[29];
+ step2[30] = step1[30];
+ step2[31] = step1[31];
+
+ // stage 9
+
+ out[0] = vqaddq_s16(step2[0], step2[31]);
+ out[1] = vqaddq_s16(step2[1], step2[30]);
+ out[2] = vqaddq_s16(step2[2], step2[29]);
+ out[3] = vqaddq_s16(step2[3], step2[28]);
+ out[4] = vqaddq_s16(step2[4], step2[27]);
+ out[5] = vqaddq_s16(step2[5], step2[26]);
+ out[6] = vqaddq_s16(step2[6], step2[25]);
+ out[7] = vqaddq_s16(step2[7], step2[24]);
+ out[8] = vqaddq_s16(step2[8], step2[23]);
+ out[9] = vqaddq_s16(step2[9], step2[22]);
+ out[10] = vqaddq_s16(step2[10], step2[21]);
+ out[11] = vqaddq_s16(step2[11], step2[20]);
+ out[12] = vqaddq_s16(step2[12], step2[19]);
+ out[13] = vqaddq_s16(step2[13], step2[18]);
+ out[14] = vqaddq_s16(step2[14], step2[17]);
+ out[15] = vqaddq_s16(step2[15], step2[16]);
+ out[16] = vqsubq_s16(step2[15], step2[16]);
+ out[17] = vqsubq_s16(step2[14], step2[17]);
+ out[18] = vqsubq_s16(step2[13], step2[18]);
+ out[19] = vqsubq_s16(step2[12], step2[19]);
+ out[20] = vqsubq_s16(step2[11], step2[20]);
+ out[21] = vqsubq_s16(step2[10], step2[21]);
+ out[22] = vqsubq_s16(step2[9], step2[22]);
+ out[23] = vqsubq_s16(step2[8], step2[23]);
+ out[24] = vqsubq_s16(step2[7], step2[24]);
+ out[25] = vqsubq_s16(step2[6], step2[25]);
+ out[26] = vqsubq_s16(step2[5], step2[26]);
+ out[27] = vqsubq_s16(step2[4], step2[27]);
+ out[28] = vqsubq_s16(step2[3], step2[28]);
+ out[29] = vqsubq_s16(step2[2], step2[29]);
+ out[30] = vqsubq_s16(step2[1], step2[30]);
+ out[31] = vqsubq_s16(step2[0], step2[31]);
+}
+
+// 1-D inverse transforms for blocks with eob at DC or within the top-left
+// 8x8, 16x16 or 32x32 corner, indexed [size index][1-D type][eob class].
+static const transform_1d_neon
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = { // NULL: no routine for that slot
+ { // 4-point entries
+ { av1_idct4_new, av1_idct4_new, NULL, NULL },
+ { av1_iadst4_new, av1_iadst4_new, NULL, NULL },
+ { av1_iidentity4_c, av1_iidentity4_c, NULL, NULL },
+ },
+ { { av1_idct8_new, av1_idct8_new, NULL, NULL }, // 8-point entries
+ { av1_iadst8_new, av1_iadst8_new, NULL, NULL },
+ { av1_iidentity8_c, av1_iidentity8_c, NULL, NULL } },
+ { // 16-point entries
+ { av1_idct16_new, av1_idct16_new, av1_idct16_new, NULL },
+ { av1_iadst16_new, av1_iadst16_new, av1_iadst16_new, NULL },
+ { av1_iidentity16_c, av1_iidentity16_c, av1_iidentity16_c, NULL },
+ },
+ { { av1_idct32_new, av1_idct32_new, av1_idct32_new, av1_idct32_new }, // 32-point entries
+ { NULL, NULL, NULL, NULL }, // no 32-point ADST entry
+ { av1_iidentity32_c, av1_iidentity32_c, av1_iidentity32_c,
+ av1_iidentity32_c } },
+ { { av1_idct64_new, av1_idct64_new, av1_idct64_new, av1_idct64_new }, // 64-point: DCT only
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+/*
+ * Lookup table of transform_neon routines, with the same
+ * [size index][1-D type][eob class] shape as lowbd_txfm_all_1d_zeros_w8_arr.
+ * The users visible in this file pass int16x8_t buffers to these routines and
+ * pick the last index through lowbd_txfm_all_1d_zeros_idx[eobx / eoby].
+ * The _low1 / _low8 / _low16 name suffixes presumably denote variants for
+ * inputs with only that many leading non-zero coefficients (TODO confirm
+ * against their definitions). NULL marks slots with no routine; the users
+ * assert that the selected pointer is non-NULL.
+ */
+static const transform_neon
+ lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ { // first size index: no entries at all
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL }, // 8-point entries
+ { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
+ { identity8_new_neon, identity8_new_neon, NULL, NULL } },
+ { // 16-point entries
+ { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
+ { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
+ NULL },
+ { identity16_new_neon, identity16_new_neon, identity16_new_neon,
+ NULL },
+ },
+ { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon, // 32-point entries
+ idct32_new_neon },
+ { NULL, NULL, NULL, NULL }, // no 32-point ADST entry
+ { identity32_new_neon, identity32_new_neon, identity32_new_neon,
+ identity32_new_neon } },
+ { { NULL, NULL, NULL, NULL }, // last size index: no entries at all
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`,
+ * working on int32 data with the 1-D routines looked up in
+ * lowbd_txfm_all_1d_zeros_w8_arr.
+ *
+ * eobx / eoby are derived with get_eobx_eoby_scan_default(). Only the first
+ * ((eoby + 8) >> 3) * 8 rows are run through the row transform; the remaining
+ * rows of the intermediate buffer are zero-filled before the column pass.
+ * No left/right or up/down flipping is applied in this function.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the row / column 1-D type via hitx_1d_tab / vitx_1d_tab.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_default().
+ */
+static INLINE void lowbd_inv_txfm2d_add_wxh_idtx_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // number of 8-row groups covering rows 0..eoby
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ // row tx: transform the first row_start rows of input into buf
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) { // pre-scale the row by NewInvSqrt2 when |log ratio| is 1
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ // Zero the rows of buf that the row transform above did not produce.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); // NOTE(review): assumes row_start <= txfm_size_row - confirm
+
+ // col tx: gather each column of buf, transform it, round, then add to output
+ for (int c = 0; c < txfm_size_col; c++) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = buf[r * txfm_size_col + c];
+
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ }
+}
+/*
+ * NEON path: adds the inverse 2-D transform of `input` to the 8-bit pixels in
+ * `output`, using the int16x8_t routines from lowbd_txfm_all_1d_zeros_w_arr.
+ *
+ * Data is handled in 8x8 tiles. For each group of 8 rows covering rows
+ * 0..eoby, the tiles covering columns 0..eobx are loaded and transposed, the
+ * row transform and rounding are applied, and every tile of the group is
+ * transposed into `b`. The column transform and rounding then run on each
+ * 8-column strip of `b`, and the result is added to `output` with the flip
+ * argument of the add helpers set to 0.
+ *
+ * input: coefficients, read with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the row / column 1-D type via hitx_1d_tab / vitx_1d_tab.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_default().
+ */
+static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[32 * 4]; // 128 vectors (1024 int16 values): row-pass workspace
+ int16x8_t b[32 * 4]; // same size: transposed data for the column pass
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), // count is (cols * rows) / 8
+ 0);
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // 8-row groups covering rows 0..eoby
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; // 8-column groups covering columns 0..eobx
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0; // offset inside each column strip of b, advanced by 8 per row group
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) { // one iteration per group of 8 rows
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) { // extra scaling step when |log ratio| is 1
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) { // move every tile of this row group into b
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) { // column pass, one 8-column strip at a time
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) { // add to output in 16-pixel-wide strips
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`,
+ * working on int32 data with the 1-D routines looked up in
+ * lowbd_txfm_all_1d_zeros_w8_arr.
+ *
+ * eobx / eoby are derived with get_eobx_eoby_scan_v_identity(). Only the
+ * first ((eoby + 8) >> 3) * 8 rows are run through the row transform; the
+ * remaining rows of the intermediate buffer are zero-filled. The flip flags
+ * from get_flip_cfg() are honoured: lr_flip mirrors the column that is read
+ * from the intermediate buffer, ud_flip reverses the column result when it is
+ * added to output.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_v_identity().
+ */
+static INLINE void lowbd_inv_txfm2d_add_v_wxh_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // number of 8-row groups covering rows 0..eoby
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx: transform the first row_start rows of input into buf
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) { // pre-scale the row by NewInvSqrt2 when |log ratio| is 1
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Zero the rows of buf that the row transform above did not produce.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); // NOTE(review): assumes row_start <= txfm_size_row - confirm
+
+ // col tx: gather each column of buf, transform it, round, then add to output
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * NEON path: adds the inverse 2-D transform of `input` to the 8-bit pixels in
+ * `output`, using the int16x8_t routines from lowbd_txfm_all_1d_zeros_w_arr
+ * and eobx / eoby from get_eobx_eoby_scan_v_identity().
+ *
+ * Data is handled in 8x8 tiles: load and transpose the tiles covering columns
+ * 0..eobx for each 8-row group covering rows 0..eoby, run the row transform
+ * and rounding, transpose every tile into `b`, then run the column transform
+ * and add to `output`. When lr_flip is 1 each tile is passed through
+ * flip_buf_ud_neon() and stored at the mirrored tile column of `b`. ud_flip
+ * is fetched but the add helpers are called with a literal 0.
+ *
+ * input: coefficients, read with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_v_identity().
+ */
+static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2]; // 32 vectors (256 int16 values); not zeroed here, unlike b
+ int16x8_t b[16 * 2]; // same size: transposed data for the column pass
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), // count is (cols * rows) / 8
+ 0);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // 8-row groups covering rows 0..eoby
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; // 8-column groups covering columns 0..eobx
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0; // offset inside each column strip of b, advanced by 8 per row group
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) { // one iteration per group of 8 rows
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) { // extra scaling step when |log ratio| is 1
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) { // reverse within each tile and mirror the tile column in b
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) { // column pass, one 8-column strip at a time
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) { // add to output in 16-pixel-wide strips
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(
+ &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`,
+ * working on int32 data with the 1-D routines looked up in
+ * lowbd_txfm_all_1d_zeros_w8_arr.
+ *
+ * eobx / eoby are derived with get_eobx_eoby_scan_h_identity(). Only the
+ * first ((eoby + 8) >> 3) * 8 rows are run through the row transform; the
+ * remaining rows of the intermediate buffer are zero-filled. The flip flags
+ * from get_flip_cfg() are honoured: lr_flip mirrors the column that is read
+ * from the intermediate buffer, ud_flip reverses the column result when it is
+ * added to output.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_h_identity().
+ */
+static INLINE void lowbd_inv_txfm2d_add_h_wxh_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ DECLARE_ALIGNED(32, int, txfm_buf[32 * 32 + 32 + 32]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ int eobx, eoby;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // number of 8-row groups covering rows 0..eoby
+
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // row tx: transform the first row_start rows of input into buf
+ int row_start = (buf_size_nonzero_h_div8 * 8);
+ for (int i = 0; i < row_start; i++) {
+ if (abs(rect_type) == 1) { // pre-scale the row by NewInvSqrt2 when |log ratio| is 1
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ } else {
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ }
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+ // Zero the rows of buf that the row transform above did not produce.
+ memset(buf_ptr, 0,
+ sizeof(int32_t) * txfm_size_col * (txfm_size_row - row_start)); // NOTE(review): assumes row_start <= txfm_size_row - confirm
+
+ // col tx: gather each column of buf, transform it, round, then add to output
+ for (int c = 0; c < txfm_size_col; c++) {
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * NEON path: adds the inverse 2-D transform of `input` to the 8-bit pixels in
+ * `output`, using the int16x8_t routines from lowbd_txfm_all_1d_zeros_w_arr
+ * and eobx / eoby from get_eobx_eoby_scan_h_identity().
+ *
+ * Data is handled in 8x8 tiles: load and transpose the tiles covering columns
+ * 0..eobx for each 8-row group covering rows 0..eoby, run the row transform
+ * and rounding, transpose every tile into `b`, then run the column transform
+ * and add to `output`. ud_flip is forwarded to the add helpers; lr_flip is
+ * fetched from get_flip_cfg() but not read afterwards in this function.
+ *
+ * input: coefficients, read with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: end-of-block value handed to get_eobx_eoby_scan_h_identity().
+ */
+static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[16 * 2]; // 32 vectors (256 int16 values): row-pass workspace, zeroed below
+ int16x8_t b[16 * 2]; // same size: transposed data for the column pass; not zeroed here
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), // count is (cols * rows) / 8
+ 0);
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; // 8-row groups covering rows 0..eoby
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3; // 8-column groups covering columns 0..eobx
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0; // offset inside each column strip of b, advanced by 8 per row group
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) { // one iteration per group of 8 rows
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);
+ if (abs(rect_type) == 1) { // extra scaling step when |log ratio| is 1
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ for (int j = 0; j < buf_size_w_div8; ++j) { // move every tile of this row group into b
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) { // column pass, one 8-column strip at a time
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+ if (txfm_size_col >= 16) { // add to output in 16-pixel-wide strips
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`
+ * using int32 1-D routines from lowbd_txfm_all_1d_arr (defined outside this
+ * excerpt). Every row is transformed; `eob` is ignored.
+ *
+ * Unlike the wider variants in this file, the row pass applies neither the
+ * NewInvSqrt2 pre-scaling nor a shift[0] rounding step; only shift[1] is
+ * applied, after the column transform. The flips from get_flip_cfg() are
+ * honoured in the column pass.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: unused.
+ */
+static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) { // row pass: input row i -> buf row i
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) { // column pass, then add to output
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`
+ * using int32 1-D routines from lowbd_txfm_all_1d_arr (defined outside this
+ * excerpt). Every row is transformed; `eob` is ignored.
+ *
+ * Each input row is always pre-scaled by NewInvSqrt2 before the row
+ * transform. No shift[0] rounding is applied after the row pass; only
+ * shift[1] is applied, after the column transform. The flips from
+ * get_flip_cfg() are honoured in the column pass.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: unused.
+ */
+void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) { // row pass: scaled input row i -> buf row i
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) { // column pass, then add to output
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`
+ * using int32 1-D routines from lowbd_txfm_all_1d_arr (defined outside this
+ * excerpt). Every row is transformed; `eob` is ignored.
+ *
+ * Each input row is always pre-scaled by NewInvSqrt2 before the row
+ * transform. No shift[0] rounding is applied after the row pass; only
+ * shift[1] is applied, after the column transform. The flips from
+ * get_flip_cfg() are honoured in the column pass.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: unused.
+ */
+void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) { // row pass: scaled input row i -> buf row i
+ for (int j = 0; j < txfm_size_col; j++)
+ temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
+
+ row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) { // column pass, then add to output
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`
+ * using int32 1-D routines from lowbd_txfm_all_1d_arr (defined outside this
+ * excerpt). Every row is transformed; `eob` is ignored.
+ *
+ * The row pass applies no NewInvSqrt2 pre-scaling; each row result is rounded
+ * with shift[0], and each column result with shift[1]. The flips from
+ * get_flip_cfg() are honoured in the column pass.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: unused.
+ */
+void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+ DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) { // row pass: input row i -> rounded buf row i
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) { // column pass, then add to output
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+/*
+ * Adds the inverse 2-D transform of `input` to the 8-bit pixels in `output`
+ * using int32 1-D routines from lowbd_txfm_all_1d_arr (defined outside this
+ * excerpt). Every row is transformed; `eob` is ignored.
+ *
+ * The row pass applies no NewInvSqrt2 pre-scaling; each row result is rounded
+ * with shift[0], and each column result with shift[1]. The flips from
+ * get_flip_cfg() are honoured in the column pass.
+ *
+ * input: coefficients, consumed row by row with a pitch of the block width.
+ * output: destination pixels, updated in place.
+ * stride: distance between consecutive rows of output.
+ * tx_type: picks the 1-D types (hitx_1d_tab / vitx_1d_tab) and the flips.
+ * tx_size: picks block dimensions, rounding shifts and cos bits.
+ * eob: unused.
+ */
+void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ (void)eob;
+
+ DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); // carved into temp_in | temp_out | buf below
+ int32_t *temp_in = txfm_buf;
+
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); // length reserved for each 1-D scratch row
+ int32_t *temp_out = temp_in + buf_offset;
+ int32_t *buf = temp_out + buf_offset;
+ int32_t *buf_ptr = buf;
+ const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 }; // element 0 is 16, the rest are zero-initialised
+ int r, bd = 8;
+ const transform_1d_neon row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_neon col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < txfm_size_row; i++) { // row pass: input row i -> rounded buf row i
+ row_txfm(input, buf_ptr, cos_bit_row, stage_range);
+ av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
+ input += txfm_size_col;
+ buf_ptr += txfm_size_col;
+ }
+
+ for (int c = 0; c < txfm_size_col; ++c) { // column pass, then add to output
+ if (lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + c];
+ } else {
+ // flip left right: output column c is fed from the mirrored buf column
+ for (r = 0; r < txfm_size_row; ++r)
+ temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
+ }
+ col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+
+ if (ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] =
+ highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
+ }
+ } else {
+ // flip upside down: output row r takes the result for the mirrored row
+ for (r = 0; r < txfm_size_row; ++r) {
+ output[r * stride + c] = highbd_clip_pixel_add(
+ output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
+ }
+ }
+ }
+}
+
+// Inverse 2D transform + add for transform types that use a real 1D kernel in
+// both directions. Only the leading rows selected from `eob` are run through
+// the row transform; the remaining rows of the intermediate buffer are zeroed
+// before the column pass. The result is added into `output` through
+// highbd_clip_pixel_add() using bd = 8.
+static INLINE void lowbd_inv_txfm2d_add_wxh_no_identity_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  // Scratch layout: [column input | column output | row-pass result].
+  DECLARE_ALIGNED(32, int, txfm_buf[64 * 64 + 64 + 64]);
+
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int scratch_len = AOMMAX(height, width);
+  int32_t *const col_in = txfm_buf;
+  int32_t *const col_out = col_in + scratch_len;
+  int32_t *const row_result = col_out + scratch_len;
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
+  const int bd = 8;
+
+  // Blocks whose log ratio is +/-1 get their input scaled by
+  // NewInvSqrt2 / 2^NewSqrt2Bits before the row transform.
+  const int scale_input =
+      abs(get_rect_tx_log_ratio(width, height)) == 1;
+  // Number of leading rows fed to the row transform, in whole groups of 8.
+  const int active_rows = ((eoby + 8) >> 3) << 3;
+
+  const transform_1d_neon row_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]]
+                                    [lowbd_txfm_all_1d_zeros_idx[eobx]];
+  const transform_1d_neon col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]]
+                                    [lowbd_txfm_all_1d_zeros_idx[eoby]];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass over the active rows only.
+  for (int row = 0; row < active_rows; ++row) {
+    const int32_t *const src_row = input + row * width;
+    int32_t *const dst_row = row_result + row * width;
+    if (scale_input) {
+      // col_in doubles as the scaled-input scratch during the row pass.
+      for (int j = 0; j < width; ++j) {
+        col_in[j] =
+            round_shift((int64_t)src_row[j] * NewInvSqrt2, NewSqrt2Bits);
+      }
+      row_txfm(col_in, dst_row, cos_bit_row, stage_range);
+    } else {
+      row_txfm(src_row, dst_row, cos_bit_row, stage_range);
+    }
+    av1_round_shift_array(dst_row, width, -shift[0]);
+  }
+
+  // Zero the rows that were skipped by the row pass.
+  memset(row_result + active_rows * width, 0,
+         sizeof(int32_t) * width * (height - active_rows));
+
+  // Column pass: gather (mirrored when lr_flip is set), transform, shift, add.
+  for (int col = 0; col < width; ++col) {
+    const int src_col = (lr_flip == 0) ? col : (width - col - 1);
+    for (int row = 0; row < height; ++row) {
+      col_in[row] = row_result[row * width + src_col];
+    }
+
+    col_txfm(col_in, col_out, cos_bit_col, stage_range);
+    av1_round_shift_array(col_out, height, -shift[1]);
+
+    for (int row = 0; row < height; ++row) {
+      // With ud_flip set the column result is consumed bottom-up.
+      const int src_row = (ud_flip == 0) ? row : (height - row - 1);
+      uint8_t *const px = &output[row * stride + col];
+      *px = highbd_clip_pixel_add(*px, col_out[src_row], bd);
+    }
+  }
+}
+/*
+ * Inverse 2D transform + add for transform types that use a real 1D kernel in
+ * both directions, operating on int16x8_t vectors.
+ *
+ * Flow, as visible in this function:
+ *  1. The input is walked in bands of 8 rows. Within a band, 8x8 tiles are
+ *     loaded into a[] and transposed, but only for the first
+ *     buf_size_nonzero_h_div8 bands and buf_size_nonzero_w_div8 tiles per band
+ *     (both derived from eob via get_eobx_eoby_scan_default()).
+ *  2. Each band goes through row_txfm and the shift[0] rounding shift, then is
+ *     transposed tile by tile into b[]; with lr_flip set the tiles are also
+ *     flipped and written to the mirrored tile position.
+ *  3. col_txfm and the shift[1] rounding shift run over b[] in groups of
+ *     txfm_size_row vectors.
+ *  4. b[] is added to `output` by lowbd_add_flip_buffer_16xn_neon() (widths of
+ *     16 and above, 16 columns at a time) or lowbd_add_flip_buffer_8xn_neon()
+ *     (width 8), which also receive ud_flip.
+ *
+ * NOTE(review): there is no output branch for widths below 8; presumably those
+ * sizes are routed elsewhere by the caller - confirm.
+ */
+static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ int16x8_t a[64 * 8];  // row-pass working buffer
+ int16x8_t b[64 * 8];  // transposed buffer consumed by the column pass
+ int eobx, eoby, ud_flip, lr_flip;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ const int buf_size_w_div8 = txfm_size_col >> 3;  // 8-column tiles per row band
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // 8-row bands to process
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // tiles to load per band
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const int32_t *input_1;
+ int temp_b = 0;  // vector offset of the current band inside each column group of b[]
+
+ const transform_neon row_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_neon col_txfm =
+ lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {  // one band of 8 input rows per iteration
+ input_1 = input;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);  // presumably narrows an 8x8 tile of int32 input to int16 - confirm in helper
+ transpose_s16_8x8q(&a[k], &a[k]);
+ input_1 += 8;
+ }
+ input += (txfm_size_col * 8);  // advance to the next band of 8 rows
+ if (abs(rect_type) == 1) {
+ int y = i * txfm_size_col;
+ round_shift_for_rect(&a[y], &a[y], txfm_size_col);  // extra scaling applied only when |rect_type| == 1
+ }
+ row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
+ av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
+ -shift[0]);
+ if (lr_flip == 1) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ flip_buf_ud_neon(&a[k], 8);
+ transpose_s16_8x8q(
+ &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);  // tile j lands in the mirrored column group
+ }
+ temp_b += 8;
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ int k = j * 8 + i * txfm_size_col;
+ transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
+ }
+ temp_b += 8;
+ }
+ }
+ for (int j = 0; j < buf_size_w_div8; ++j) {  // column pass, one group of txfm_size_row vectors at a time
+ col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
+ av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
+ -shift[1]);
+ }
+
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {  // 16 output columns per call
+ lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+// Routes a block to one of the four "wxh" inverse-transform implementations
+// according to tx_type: IDTX, the H_* types, the V_* types, or everything
+// else (no identity pass).
+static INLINE void lowbd_inv_txfm2d_add_wxh_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  const int is_h_type =
+      tx_type == H_DCT || tx_type == H_ADST || tx_type == H_FLIPADST;
+  const int is_v_type =
+      tx_type == V_DCT || tx_type == V_ADST || tx_type == V_FLIPADST;
+
+  if (tx_type == IDTX) {
+    lowbd_inv_txfm2d_add_wxh_idtx_neon(input, output, stride, tx_type,
+                                       tx_size, eob);
+  } else if (is_h_type) {
+    lowbd_inv_txfm2d_add_v_wxh_identity_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
+  } else if (is_v_type) {
+    lowbd_inv_txfm2d_add_h_wxh_identity_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
+  } else {
+    lowbd_inv_txfm2d_add_wxh_no_identity_neon(input, output, stride, tx_type,
+                                              tx_size, eob);
+  }
+}
+
+// Routes a block to one of the four vectorised inverse-transform
+// implementations according to tx_type: IDTX, the H_* types, the V_* types,
+// or everything else (no identity pass).
+static INLINE void lowbd_inv_txfm2d_add_universe_neon(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  const int is_h_type =
+      tx_type == H_DCT || tx_type == H_ADST || tx_type == H_FLIPADST;
+  const int is_v_type =
+      tx_type == V_DCT || tx_type == V_ADST || tx_type == V_FLIPADST;
+
+  if (tx_type == IDTX) {
+    lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
+                                   eob);
+  } else if (is_h_type) {
+    lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else if (is_v_type) {
+    lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+  } else {
+    lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
+                                          tx_size, eob);
+  }
+}
+
+// Copies `rows` rows of 32 coefficients from `src` into rows of 64 in `dst`,
+// zero-filling the upper 32 entries of every destination row.
+static INLINE void lowbd_widen_rows_32_to_64(int32_t *dst, const int32_t *src,
+                                             int rows) {
+  for (int row = 0; row < rows; ++row) {
+    int32_t *const dst_row = dst + row * 64;
+    memcpy(dst_row, src + row * 32, 32 * sizeof(*dst));
+    memset(dst_row + 32, 0, 32 * sizeof(*dst));
+  }
+}
+
+// Top-level low-bitdepth inverse transform + add: picks the implementation for
+// tx_size. The 4-wide/4-high sizes have dedicated functions, the sizes with a
+// 64 dimension go through the "wxh" path (64-wide ones after their 32-column
+// input rows are widened to 64 columns), and everything else uses the
+// vectorised "universe" path.
+void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
+                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                   int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, tx_size,
+                                    eob);
+      break;
+
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, tx_size,
+                                     eob);
+      break;
+
+    // 64-high, narrower blocks: the input is used as-is.
+    case TX_16X64:
+    case TX_32X64:
+      lowbd_inv_txfm2d_add_wxh_universe_neon(input, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+
+    case TX_64X16: {
+      int32_t widened[64 * 16];
+      lowbd_widen_rows_32_to_64(widened, input, 16);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(widened, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+    }
+
+    case TX_64X32: {
+      int32_t widened[64 * 32];
+      lowbd_widen_rows_32_to_64(widened, input, 32);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(widened, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+    }
+
+    case TX_64X64: {
+      // Only the first 32 rows of the 64x64 buffer are filled here.
+      int32_t widened[64 * 64];
+      lowbd_widen_rows_32_to_64(widened, input, 32);
+      lowbd_inv_txfm2d_add_wxh_universe_neon(widened, output, stride, tx_type,
+                                             tx_size, eob);
+      break;
+    }
+
+    default:
+      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
+                                         tx_size, eob);
+      break;
+  }
+}
+// Inverse transform + add entry point. Lossless blocks are handed to
+// av1_inv_txfm_add_c(); all others go to av1_lowbd_inv_txfm2d_add_neon() with
+// the type, size and eob taken from txfm_param.
+void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                           const TxfmParam *txfm_param) {
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+
+  av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, txfm_param->tx_type,
+                                txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
new file mode 100644
index 0000000000..9ec658291c
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_inv_txfm_neon.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+#define AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+/*
+ * Function-pointer types for 1D inverse-transform kernels.
+ *  - transform_1d_neon: reads an int32_t array and writes an int32_t array;
+ *    takes the cosine bit precision and a pointer to per-stage data.
+ *  - transform_neon: reads and writes arrays of int16x8_t vectors; takes the
+ *    cosine bit precision and an extra `bit` argument.
+ */
+typedef void (*transform_1d_neon)(const int32_t *input, int32_t *output,
+ const int8_t cos_bit,
+ const int8_t *stage_ptr);
+typedef void (*transform_neon)(int16x8_t *input, int16x8_t *output,
+ int8_t cos_bit, int bit);
+/*
+ * Lookup tables read by get_eobx_eoby_scan_default().
+ * Each av1_eob_to_eobxy_<size>_default table is indexed by
+ * (eob - 1) >> tx_size_wide_log2_eob[tx_size]. Every 16-bit entry packs two
+ * values: the low byte is returned as eobx and the high byte as eoby.
+ * All byte values in these tables are 0x07, 0x0f or 0x1f (7, 15 or 31).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+
+/*
+ * Per-transform-size table pointers, indexed by tx_size. NULL slots have no
+ * table; get_eobx_eoby_scan_default() dereferences the slot without a NULL
+ * check, so it must not be called for those sizes.
+ * NOTE(review): slot order presumably follows the TX_SIZE enum - confirm.
+ */
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+/*
+ * Maps an eobx/eoby value (0..31) to an index 0..3:
+ * 0 -> 0, 1..7 -> 1, 8..15 -> 2, 16..31 -> 3.
+ * The result is used as the last index into the lowbd_txfm_all_1d_zeros_*
+ * function tables.
+ */
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+
+// log2 of the transform block width used for eob lookups, indexed by tx_size.
+// Values are capped at 5, i.e. a width of 64 maps to 32.
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+// Rounds an index in 0..31 up to the last index of its 8/16/32 group:
+// 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31.
+// Read-only lookup table (used by the *_identity eob helpers below), so it is
+// declared const like the other tables in this header.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+// Derives eobx/eoby for the default scan from `eob`.
+// eob == 1 yields (0, 0). Otherwise the packed entry for row
+// (eob - 1) >> tx_size_wide_log2_eob[tx_size] of the table for tx_size is
+// split into its low byte (eobx) and high byte (eoby).
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  int packed = 0;
+  if (eob != 1) {
+    const int row = (eob - 1) >> tx_size_wide_log2_eob[tx_size];
+    packed = av1_eob_to_eobxy_default[tx_size][row];
+  }
+  *eobx = packed & 0xFF;
+  *eoby = packed >> 8;
+}
+
+// Derives eobx/eoby from `eob` for the v_identity transform paths.
+// With last = eob - 1 and eoby_max = min(32, block height) - 1:
+//   eobx = last / (eoby_max + 1)
+//   eoby = eoby_max when last >= eoby_max, else eob_fill[last]
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int eoby_max = AOMMIN(32, tx_size_high[tx_size]) - 1;
+
+  *eobx = last / (eoby_max + 1);
+  if (last >= eoby_max) {
+    *eoby = eoby_max;
+  } else {
+    *eoby = eob_fill[last];
+  }
+}
+
+// Derives eobx/eoby from `eob` for the h_identity transform paths.
+// With last = eob - 1 and eobx_max = min(32, block width) - 1:
+//   eobx = eobx_max when last >= eobx_max, else eob_fill[last]
+//   eoby = eob_fill[last / (eobx_max + 1)]
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int eobx_max = AOMMIN(32, tx_size_wide[tx_size]) - 1;
+  const int row_idx = last / (eobx_max + 1);
+
+  if (last >= eobx_max) {
+    *eobx = eobx_max;
+  } else {
+    *eobx = eob_fill[last];
+  }
+
+  // eob_fill has 32 entries.
+  assert(row_idx < 32);
+  *eoby = eob_fill[row_idx];
+}
+
+#endif // AOM_AV1_COMMON_ARM_AV1_INV_TXFM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/av1_txfm_neon.c b/third_party/aom/av1/common/arm/av1_txfm_neon.c
new file mode 100644
index 0000000000..de3c547248
--- /dev/null
+++ b/third_party/aom/av1/common/arm/av1_txfm_neon.c
@@ -0,0 +1,28 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+
+// Applies a rounding shift by `bit` to `size` int32 values in place, four at a
+// time. `size` must be a multiple of 4 (asserted). bit == 0 is a no-op.
+// The shift count handed to vrshlq_s32 is -bit, so a positive `bit` shifts
+// right with rounding and a negative `bit` shifts left.
+void av1_round_shift_array_neon(int32_t *arr, int size, int bit) {
+  assert(!(size % 4));
+  if (bit == 0) return;
+
+  const int32x4_t shift_count = vdupq_n_s32((int32_t)(-bit));
+  for (int i = 0; i < size; i += 4) {
+    int32_t *const chunk = arr + i;
+    vst1q_s32(chunk, vrshlq_s32(vld1q_s32(chunk), shift_count));
+  }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..7134f183e3
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_hmask_neon.c
@@ -0,0 +1,134 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+/*
+ * Blends two 8-bit sources into dst with a horizontal (per-column) mask.
+ * For every pixel the code computes mask * src0 + (64 - mask) * src1 in
+ * 16 bits and narrows it with a rounding right shift by
+ * AOM_BLEND_A64_ROUND_BITS. The same w mask bytes are applied to every row.
+ *
+ * w and h are asserted to be powers of two and at least 2. There are separate
+ * paths for w >= 16 (16 pixels per step), w == 8, w == 4 and w == 2; the last
+ * two process two rows per loop iteration.
+ */
+void aom_blend_a64_hmask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 2);
+ assert(w >= 2);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ uint8x8_t tmp0, tmp1;
+ uint8x16_t res_q;
+ uint16x8_t res, res_low, res_high;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
+ const uint8x8_t vdup_64 = vdup_n_u8((uint8_t)64);  // used to form 64 - mask
+
+ if (w >= 16) {
+ const uint8x16_t vdup_64_q = vdupq_n_u8((uint8_t)64);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {  // 16 pixels per iteration
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ const uint8x16_t tmp0_q = vld1q_u8(src0);
+ const uint8x16_t tmp1_q = vld1q_u8(src1);
+ const uint8x16_t m_q = vld1q_u8(mask);
+ const uint8x16_t max_minus_m_q = vsubq_u8(vdup_64_q, m_q);
+ res_low = vmull_u8(vget_low_u8(m_q), vget_low_u8(tmp0_q));
+ res_low =
+ vmlal_u8(res_low, vget_low_u8(max_minus_m_q), vget_low_u8(tmp1_q));
+ res_high = vmull_u8(vget_high_u8(m_q), vget_high_u8(tmp0_q));
+ res_high = vmlal_u8(res_high, vget_high_u8(max_minus_m_q),
+ vget_high_u8(tmp1_q));
+ res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+ vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+ vst1q_u8(dst, res_q);
+ src0 += 16;
+ src1 += 16;
+ dst += 16;
+ mask += 16;
+ }
+ src0 += src0_stride - w;  // pointers already advanced by w inside the row
+ src1 += src1_stride - w;
+ dst += dst_stride - w;
+ mask -= w;  // rewind: the same mask bytes are reused for the next row
+ }
+ } else if (w == 8) {
+ const uint8x8_t m = vld1_u8(mask);  // mask loaded once, reused for all rows
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; ++i) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ tmp0 = vld1_u8(src0);
+ tmp1 = vld1_u8(src1);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (w == 4) {
+ const uint8x8_t m = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)mask));  // 4 mask bytes duplicated into both halves
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; i += 2) {  // two rows per iteration
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+ tmp0 = vreinterpret_u8_u32(tmp0_32);
+ load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+ tmp1 = vreinterpret_u8_u32(tmp1_32);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);  // 32-bit lane 0 -> first row
+ vst1_lane_u32(
+ (uint32_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);  // 32-bit lane 1 -> second row
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ } else if (w == 2) {
+ const uint8x8_t m = vreinterpret_u8_u16(vld1_dup_u16((uint16_t *)mask));  // 2 mask bytes duplicated into every 16-bit lane
+ const uint8x8_t max_minus_m = vsub_u8(vdup_64, m);
+ for (int i = 0; i < h; i += 2) {  // two rows per iteration
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+ tmp0 = vreinterpret_u8_u16(tmp0_16);
+ load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+ tmp1 = vreinterpret_u8_u16(tmp1_16);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);  // 16-bit lane 0 -> first row
+ vst1_lane_u16(
+ (uint16_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);  // 16-bit lane 1 -> second row
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
new file mode 100644
index 0000000000..194e94c8c0
--- /dev/null
+++ b/third_party/aom/av1/common/arm/blend_a64_vmask_neon.c
@@ -0,0 +1,141 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "config/aom_dsp_rtcd.h"
+/*
+ * Blends two 8-bit sources into dst with a vertical (per-row) mask.
+ * For every pixel in row i the code computes
+ * mask[i] * src0 + (64 - mask[i]) * src1 in 16 bits and narrows it with a
+ * rounding right shift by AOM_BLEND_A64_ROUND_BITS.
+ *
+ * w and h are asserted to be powers of two and at least 2. There are separate
+ * paths for w >= 16 (16 pixels per step), w == 8, w == 4 and w == 2; the last
+ * two process two rows per loop iteration and build a mask vector that holds
+ * mask[i] for the first row's lanes and mask[i + 1] for the second row's.
+ */
+void aom_blend_a64_vmask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ uint8x8_t tmp0, tmp1;
+ uint8x16_t tmp0_q, tmp1_q, res_q;
+ uint16x8_t res, res_low, res_high;
+ uint32x2_t tmp0_32 = vdup_n_u32(0), tmp1_32 = vdup_n_u32(0);
+ uint16x4_t tmp0_16 = vdup_n_u16(0), tmp1_16 = vdup_n_u16(0);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 2);
+ assert(w >= 2);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (w >= 16) {
+ for (int i = 0; i < h; ++i) {
+ const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);  // one mask value for the whole row
+ const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+ for (int j = 0; j < w; j += 16) {  // 16 pixels per iteration
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ tmp0_q = vld1q_u8(src0);
+ tmp1_q = vld1q_u8(src1);
+ res_low = vmull_u8(m, vget_low_u8(tmp0_q));
+ res_low = vmlal_u8(res_low, max_minus_m, vget_low_u8(tmp1_q));
+ res_high = vmull_u8(m, vget_high_u8(tmp0_q));
+ res_high = vmlal_u8(res_high, max_minus_m, vget_high_u8(tmp1_q));
+ res_q = vcombine_u8(vrshrn_n_u16(res_low, AOM_BLEND_A64_ROUND_BITS),
+ vrshrn_n_u16(res_high, AOM_BLEND_A64_ROUND_BITS));
+ vst1q_u8(dst, res_q);
+ src0 += 16;
+ src1 += 16;
+ dst += 16;
+ }
+ src0 += src0_stride - w;  // pointers already advanced by w inside the row
+ src1 += src1_stride - w;
+ dst += dst_stride - w;
+ }
+ } else if (w == 8) {
+ for (int i = 0; i < h; ++i) {
+ __builtin_prefetch(src0);
+ __builtin_prefetch(src1);
+ const uint8x8_t m = vdup_n_u8((uint8_t)mask[i]);
+ const uint8x8_t max_minus_m = vdup_n_u8(64 - (uint8_t)mask[i]);
+ tmp0 = vld1_u8(src0);
+ tmp1 = vld1_u8(src1);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_u8(dst, vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (w == 4) {
+ for (int i = 0; i < h; i += 2) {  // two rows per iteration
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ const uint16x4_t m1 = vdup_n_u16((uint16_t)mask[i]);
+ const uint16x4_t m2 = vdup_n_u16((uint16_t)mask[i + 1]);
+ const uint8x8_t m = vmovn_u16(vcombine_u16(m1, m2));  // bytes 0-3 = mask[i], bytes 4-7 = mask[i + 1]
+ const uint16x4_t max_minus_m1 = vdup_n_u16(64 - (uint16_t)mask[i]);
+ const uint16x4_t max_minus_m2 = vdup_n_u16(64 - (uint16_t)mask[i + 1]);
+ const uint8x8_t max_minus_m =
+ vmovn_u16(vcombine_u16(max_minus_m1, max_minus_m2));
+ load_unaligned_u8_4x2(src0, src0_stride, &tmp0_32);
+ tmp0 = vreinterpret_u8_u32(tmp0_32);
+ load_unaligned_u8_4x2(src1, src1_stride, &tmp1_32);
+ tmp1 = vreinterpret_u8_u32(tmp1_32);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u32(
+ (uint32_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);  // 32-bit lane 0 -> first row
+ vst1_lane_u32(
+ (uint32_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u32_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);  // 32-bit lane 1 -> second row
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ } else if (w == 2) {
+ for (int i = 0; i < h; i += 2) {  // two rows per iteration
+ __builtin_prefetch(src0 + 0 * src0_stride);
+ __builtin_prefetch(src0 + 1 * src0_stride);
+ __builtin_prefetch(src1 + 0 * src1_stride);
+ __builtin_prefetch(src1 + 1 * src1_stride);
+ const uint8x8_t m1 = vdup_n_u8(mask[i]);
+ const uint8x8_t m2 = vdup_n_u8(mask[i + 1]);
+ const uint16x4x2_t m_trn =
+ vtrn_u16(vreinterpret_u16_u8(m1), vreinterpret_u16_u8(m2));
+ const uint8x8_t m = vreinterpret_u8_u16(m_trn.val[0]);  // 16-bit lanes alternate mask[i], mask[i + 1]
+ const uint8x8_t max_minus_m1 = vdup_n_u8(64 - mask[i]);
+ const uint8x8_t max_minus_m2 = vdup_n_u8(64 - mask[i + 1]);
+ const uint16x4x2_t max_minus_m_trn = vtrn_u16(
+ vreinterpret_u16_u8(max_minus_m1), vreinterpret_u16_u8(max_minus_m2));
+ const uint8x8_t max_minus_m = vreinterpret_u8_u16(max_minus_m_trn.val[0]);
+ load_unaligned_u8_2x2(src0, src0_stride, &tmp0_16);
+ tmp0 = vreinterpret_u8_u16(tmp0_16);
+ load_unaligned_u8_2x2(src1, src1_stride, &tmp1_16);
+ tmp1 = vreinterpret_u8_u16(tmp1_16);
+ res = vmull_u8(m, tmp0);
+ res = vmlal_u8(res, max_minus_m, tmp1);
+ vst1_lane_u16(
+ (uint16_t *)(dst + (0 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 0);  // 16-bit lane 0 -> first row
+ vst1_lane_u16(
+ (uint16_t *)(dst + (1 * dst_stride)),
+ vreinterpret_u16_u8(vrshrn_n_u16(res, AOM_BLEND_A64_ROUND_BITS)), 1);  // 16-bit lane 1 -> second row
+ src0 += (2 * src0_stride);
+ src1 += (2 * src1_stride);
+ dst += (2 * dst_stride);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/cfl_neon.c b/third_party/aom/av1/common/arm/cfl_neon.c
new file mode 100644
index 0000000000..39025b5e54
--- /dev/null
+++ b/third_party/aom/av1/common/arm/cfl_neon.c
@@ -0,0 +1,584 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+// Loads eight 16-bit values from src + offset, subtracts `sub` from each lane
+// and stores the eight results at dst + offset.
+static INLINE void vldsubstq_s16(int16_t *dst, const uint16_t *src, int offset,
+                                 int16x8_t sub) {
+  const int16x8_t loaded = vreinterpretq_s16_u16(vld1q_u16(src + offset));
+  vst1q_s16(dst + offset, vsubq_s16(loaded, sub));
+}
+
+// Returns the lane-wise sum of the eight values at buf and the eight values at
+// buf + offset.
+static INLINE uint16x8_t vldaddq_u16(const uint16_t *buf, size_t offset) {
+  const uint16x8_t first = vld1q_u16(buf);
+  const uint16x8_t second = vld1q_u16(buf + offset);
+  return vaddq_u16(first, second);
+}
+
+// Load four bytes (half of a 64-bit vector) and duplicate them into the other
+// half.
+static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+  const uint32x2_t word_dup = vld1_dup_u32((const uint32_t *)ptr);
+  return vreinterpret_u8_u32(word_dup);
+}
+
+// Store half of a vector: writes the low 32 bits of val (its first two 16-bit
+// lanes) to ptr. The lane-store intrinsic is used rather than subscripting the
+// vector, because vector subscripts are a compiler extension and not part of
+// the NEON intrinsics interface.
+static INLINE void vsth_u16(uint16_t *ptr, uint16x4_t val) {
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u16(val), 0);
+}
+
+// Store half of a vector: writes the low 32 bits of val (its first four bytes)
+// to ptr. The lane-store intrinsic is used rather than subscripting the vector,
+// because vector subscripts are a compiler extension and not part of the NEON
+// intrinsics interface.
+static INLINE void vsth_u8(uint8_t *ptr, uint8x8_t val) {
+  vst1_lane_u32((uint32_t *)ptr, vreinterpret_u32_u8(val), 0);
+}
+
+// Fills pred_buf_q3 with 4:2:0 subsampled 8-bit luma: every output value is the
+// sum of a 2x2 block of input pixels, shifted left by one. One output row
+// (CFL_BUF_LINE apart) is written for every two input rows; the loop body runs
+// at least once.
+static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const int two_rows = input_stride << 1;
+  const uint16_t *const last = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  do {
+    const uint8_t *const below = input + input_stride;
+    switch (width) {
+      case 4: {
+        const uint16x4_t row_sum = vpaddl_u8(vldh_dup_u8(input));
+        const uint16x4_t blk_sum = vpadal_u8(row_sum, vldh_dup_u8(below));
+        vsth_u16(pred_buf_q3, vshl_n_u16(blk_sum, 1));
+        break;
+      }
+      case 8: {
+        const uint16x4_t row_sum = vpaddl_u8(vld1_u8(input));
+        const uint16x4_t blk_sum = vpadal_u8(row_sum, vld1_u8(below));
+        vst1_u16(pred_buf_q3, vshl_n_u16(blk_sum, 1));
+        break;
+      }
+      case 16: {
+        const uint16x8_t row_sum = vpaddlq_u8(vld1q_u8(input));
+        const uint16x8_t blk_sum = vpadalq_u8(row_sum, vld1q_u8(below));
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(blk_sum, 1));
+        break;
+      }
+      default: {
+        // vld4_u8 de-interleaves, so adding val[0] to val[1] (and val[2] to
+        // val[3]) forms the horizontal pair sums; vst2q_u16 puts the two
+        // result vectors back into pixel order.
+        const uint8x8x4_t upper = vld4_u8(input);
+        const uint8x8x4_t lower = vld4_u8(below);
+        const uint16x8_t upper_even = vaddl_u8(upper.val[0], upper.val[1]);
+        const uint16x8_t lower_even = vaddl_u8(lower.val[0], lower.val[1]);
+        const uint16x8_t upper_odd = vaddl_u8(upper.val[2], upper.val[3]);
+        const uint16x8_t lower_odd = vaddl_u8(lower.val[2], lower.val[3]);
+        uint16x8x2_t out;
+        out.val[0] = vshlq_n_u16(vaddq_u16(upper_even, lower_even), 1);
+        out.val[1] = vshlq_n_u16(vaddq_u16(upper_odd, lower_odd), 1);
+        vst2q_u16(pred_buf_q3, out);
+        break;
+      }
+    }
+    input += two_rows;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+// Fills pred_buf_q3 with 4:2:2 subsampled 8-bit luma: every output value is
+// the sum of two horizontally adjacent input pixels, shifted left by two. One
+// output row (CFL_BUF_LINE apart) is written per input row.
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        const uint16x4_t pairs = vpaddl_u8(vldh_dup_u8(input));
+        vsth_u16(pred_buf_q3, vshl_n_u16(pairs, 2));
+        break;
+      }
+      case 8: {
+        const uint16x4_t pairs = vpaddl_u8(vld1_u8(input));
+        vst1_u16(pred_buf_q3, vshl_n_u16(pairs, 2));
+        break;
+      }
+      case 16: {
+        const uint16x8_t pairs = vpaddlq_u8(vld1q_u8(input));
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(pairs, 2));
+        break;
+      }
+      default: {
+        // vld4_u8 de-interleaves, so a widening add of val[0]/val[1] and of
+        // val[2]/val[3] yields the pair sums; vst2q_u16 restores pixel order.
+        const uint8x8x4_t px = vld4_u8(input);
+        uint16x8x2_t out;
+        out.val[0] = vshlq_n_u16(vaddl_u8(px.val[0], px.val[1]), 2);
+        out.val[1] = vshlq_n_u16(vaddl_u8(px.val[2], px.val[3]), 2);
+        vst2q_u16(pred_buf_q3, out);
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+// Fills pred_buf_q3 with 4:4:4 8-bit luma: every input pixel is widened to 16
+// bits and shifted left by three. One output row (CFL_BUF_LINE apart) is
+// written per input row.
+static void cfl_luma_subsampling_444_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        const uint16x8_t wide = vshll_n_u8(vldh_dup_u8(input), 3);
+        vst1_u16(pred_buf_q3, vget_low_u16(wide));
+        break;
+      }
+      case 8: {
+        vst1q_u16(pred_buf_q3, vshll_n_u8(vld1_u8(input), 3));
+        break;
+      }
+      default: {
+        // First 16 pixels are always handled; a further 16 only at width 32.
+        const uint8x16_t px = vld1q_u8(input);
+        vst1q_u16(pred_buf_q3, vshll_n_u8(vget_low_u8(px), 3));
+        vst1q_u16(pred_buf_q3 + 8, vshll_n_u8(vget_high_u8(px), 3));
+        if (width == 32) {
+          const uint8x16_t px_next = vld1q_u8(input + 16);
+          vst1q_u16(pred_buf_q3 + 16, vshll_n_u8(vget_low_u8(px_next), 3));
+          vst1q_u16(pred_buf_q3 + 24, vshll_n_u8(vget_high_u8(px_next), 3));
+        }
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+#ifndef __aarch64__
+// Fallback for targets where the 128-bit pairwise add is not provided: the low
+// half of the result holds the pairwise sums of a, the high half those of b.
+// Kept file-local (static) so this helper does not export a global symbol.
+static INLINE uint16x8_t vpaddq_u16(uint16x8_t a, uint16x8_t b) {
+  return vcombine_u16(vpadd_u16(vget_low_u16(a), vget_high_u16(a)),
+                      vpadd_u16(vget_low_u16(b), vget_high_u16(b)));
+}
+#endif
+
+// Fills pred_buf_q3 with 4:2:0 subsampled 16-bit luma: every output value is
+// the sum of a 2x2 block of input samples, shifted left by one. One output row
+// (CFL_BUF_LINE apart) is written for every two input rows.
+static void cfl_luma_subsampling_420_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const int two_rows = input_stride << 1;
+  const uint16_t *const last = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  do {
+    const uint16_t *const below = input + input_stride;
+    if (width == 4) {
+      const uint16x4_t vsum = vadd_u16(vld1_u16(input), vld1_u16(below));
+      const uint16x4_t pairs = vpadd_u16(vsum, vsum);
+      vsth_u16(pred_buf_q3, vshl_n_u16(pairs, 1));
+    } else if (width >= 32) {
+      // vld4q_u16 de-interleaves, so adding val[0] to val[1] (and val[2] to
+      // val[3]) forms the horizontal pair sums; vst2q_u16 puts the two result
+      // vectors back into sample order.
+      const uint16x8x4_t upper = vld4q_u16(input);
+      const uint16x8x4_t lower = vld4q_u16(below);
+      const uint16x8_t upper_even = vaddq_u16(upper.val[0], upper.val[1]);
+      const uint16x8_t lower_even = vaddq_u16(lower.val[0], lower.val[1]);
+      const uint16x8_t upper_odd = vaddq_u16(upper.val[2], upper.val[3]);
+      const uint16x8_t lower_odd = vaddq_u16(lower.val[2], lower.val[3]);
+      uint16x8x2_t out;
+      out.val[0] = vshlq_n_u16(vaddq_u16(upper_even, lower_even), 1);
+      out.val[1] = vshlq_n_u16(vaddq_u16(upper_odd, lower_odd), 1);
+      vst2q_u16(pred_buf_q3, out);
+    } else {
+      // Widths 8 and 16 share the vertical sum of the first eight samples.
+      const uint16x8_t vsum = vaddq_u16(vld1q_u16(input), vld1q_u16(below));
+      if (width == 8) {
+        const uint16x4_t pairs = vget_low_u16(vpaddq_u16(vsum, vsum));
+        vst1_u16(pred_buf_q3, vshl_n_u16(pairs, 1));
+      } else {
+        const uint16x8_t vsum_next =
+            vaddq_u16(vld1q_u16(input + 8), vld1q_u16(below + 8));
+        const uint16x8_t pairs = vpaddq_u16(vsum, vsum_next);
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(pairs, 1));
+      }
+    }
+    input += two_rows;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+// Fills pred_buf_q3 with 4:2:2 subsampled 16-bit luma: every output value is
+// the sum of two horizontally adjacent input samples, shifted left by two. One
+// output row (CFL_BUF_LINE apart) is written per input row.
+static void cfl_luma_subsampling_422_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        const uint16x4_t px = vld1_u16(input);
+        vsth_u16(pred_buf_q3, vshl_n_u16(vpadd_u16(px, px), 2));
+        break;
+      }
+      case 8: {
+        // vld2_u16 de-interleaves, so a plain add gives the pair sums.
+        const uint16x4x2_t px = vld2_u16(input);
+        const uint16x4_t pairs = vadd_u16(px.val[0], px.val[1]);
+        vst1_u16(pred_buf_q3, vshl_n_u16(pairs, 2));
+        break;
+      }
+      case 16: {
+        // vld2q_u16 de-interleaves, so a plain add gives the pair sums.
+        const uint16x8x2_t px = vld2q_u16(input);
+        const uint16x8_t pairs = vaddq_u16(px.val[0], px.val[1]);
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(pairs, 2));
+        break;
+      }
+      default: {
+        // vld4q_u16 de-interleaves four ways; the two pair-sum vectors are put
+        // back into sample order by vst2q_u16.
+        const uint16x8x4_t px = vld4q_u16(input);
+        const uint16x8_t pairs_even = vaddq_u16(px.val[0], px.val[1]);
+        const uint16x8_t pairs_odd = vaddq_u16(px.val[2], px.val[3]);
+        uint16x8x2_t out = { { vshlq_n_u16(pairs_even, 2),
+                               vshlq_n_u16(pairs_odd, 2) } };
+        vst2q_u16(pred_buf_q3, out);
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+// Fills pred_buf_q3 with 4:4:4 16-bit luma: every input sample is shifted left
+// by three. One output row (CFL_BUF_LINE apart) is written per input row.
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    switch (width) {
+      case 4: {
+        vst1_u16(pred_buf_q3, vshl_n_u16(vld1_u16(input), 3));
+        break;
+      }
+      case 8: {
+        vst1q_u16(pred_buf_q3, vshlq_n_u16(vld1q_u16(input), 3));
+        break;
+      }
+      case 16: {
+        // The de-interleaving load is undone by the matching interleaving
+        // store, so sample order is unchanged.
+        uint16x8x2_t px = vld2q_u16(input);
+        px.val[0] = vshlq_n_u16(px.val[0], 3);
+        px.val[1] = vshlq_n_u16(px.val[1], 3);
+        vst2q_u16(pred_buf_q3, px);
+        break;
+      }
+      default: {
+        uint16x8x4_t px = vld4q_u16(input);
+        for (int k = 0; k < 4; ++k) {
+          px.val[k] = vshlq_n_u16(px.val[k], 3);
+        }
+        vst4q_u16(pred_buf_q3, px);
+        break;
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(neon)
+
+// Subtracts the rounded block average from every value of a width x height
+// block. src and dst both use a row pitch of CFL_BUF_LINE.
+//
+// round_offset is ignored because the narrowing shift below already rounds.
+// num_pel_log2 is the right shift that turns the block sum into the average;
+// only the values 4 through 10 are handled (anything else asserts).
+static INLINE void subtract_average_neon(const uint16_t *src, int16_t *dst,
+                                         int width, int height,
+                                         int round_offset,
+                                         const int num_pel_log2) {
+  const uint16_t *const end = src + height * CFL_BUF_LINE;
+
+  // Round offset is not needed, because NEON will handle the rounding.
+  (void)round_offset;
+
+  // To optimize the use of the CPU pipeline, we process 4 rows per iteration
+  const int step = 4 * CFL_BUF_LINE;
+
+  // At this stage, the prediction buffer contains scaled reconstructed luma
+  // pixels, which are positive integer and only require 15 bits. By using
+  // unsigned integer for the sum, we can do one addition operation inside 16
+  // bits (8 lanes) before having to convert to 32 bits (4 lanes).
+  const uint16_t *sum_buf = src;
+  // Zeroed with an intrinsic rather than a brace initializer, which is a
+  // compiler extension for NEON vector types.
+  uint32x4_t sum_32x4 = vdupq_n_u32(0);
+  do {
+    // For all widths, we load, add and combine the data so it fits in 4 lanes.
+    if (width == 4) {
+      const uint16x4_t a0 =
+          vadd_u16(vld1_u16(sum_buf), vld1_u16(sum_buf + CFL_BUF_LINE));
+      const uint16x4_t a1 = vadd_u16(vld1_u16(sum_buf + 2 * CFL_BUF_LINE),
+                                     vld1_u16(sum_buf + 3 * CFL_BUF_LINE));
+      sum_32x4 = vaddq_u32(sum_32x4, vaddl_u16(a0, a1));
+    } else if (width == 8) {
+      const uint16x8_t a0 = vldaddq_u16(sum_buf, CFL_BUF_LINE);
+      const uint16x8_t a1 =
+          vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, CFL_BUF_LINE);
+      sum_32x4 = vpadalq_u16(sum_32x4, a0);
+      sum_32x4 = vpadalq_u16(sum_32x4, a1);
+    } else {
+      const uint16x8_t row0 = vldaddq_u16(sum_buf, 8);
+      const uint16x8_t row1 = vldaddq_u16(sum_buf + CFL_BUF_LINE, 8);
+      const uint16x8_t row2 = vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE, 8);
+      const uint16x8_t row3 = vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE, 8);
+      sum_32x4 = vpadalq_u16(sum_32x4, row0);
+      sum_32x4 = vpadalq_u16(sum_32x4, row1);
+      sum_32x4 = vpadalq_u16(sum_32x4, row2);
+      sum_32x4 = vpadalq_u16(sum_32x4, row3);
+
+      if (width == 32) {
+        const uint16x8_t row0_1 = vldaddq_u16(sum_buf + 16, 8);
+        const uint16x8_t row1_1 = vldaddq_u16(sum_buf + CFL_BUF_LINE + 16, 8);
+        const uint16x8_t row2_1 =
+            vldaddq_u16(sum_buf + 2 * CFL_BUF_LINE + 16, 8);
+        const uint16x8_t row3_1 =
+            vldaddq_u16(sum_buf + 3 * CFL_BUF_LINE + 16, 8);
+
+        sum_32x4 = vpadalq_u16(sum_32x4, row0_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row1_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row2_1);
+        sum_32x4 = vpadalq_u16(sum_32x4, row3_1);
+      }
+    }
+    sum_buf += step;
+  } while (sum_buf < end);
+
+  // Permute and add in such a way that each lane contains the block sum.
+  // [A+C+B+D, B+D+A+C, C+A+D+B, D+B+C+A]
+#ifdef __aarch64__
+  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+  sum_32x4 = vpaddq_u32(sum_32x4, sum_32x4);
+#else
+  uint32x4_t flip =
+      vcombine_u32(vget_high_u32(sum_32x4), vget_low_u32(sum_32x4));
+  sum_32x4 = vaddq_u32(sum_32x4, flip);
+  sum_32x4 = vaddq_u32(sum_32x4, vrev64q_u32(sum_32x4));
+#endif
+
+  // Computing the average could be done using scalars, but getting off the NEON
+  // engine introduces latency, so we use vqrshrn.
+  int16x4_t avg_16x4;
+  // The shift amount of vqrshrn_n_u32 must be a compile-time constant, hence
+  // one case per supported value.
+  switch (num_pel_log2) {
+    case 4: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 4)); break;
+    case 5: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 5)); break;
+    case 6: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 6)); break;
+    case 7: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 7)); break;
+    case 8: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 8)); break;
+    case 9: avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 9)); break;
+    case 10:
+      avg_16x4 = vreinterpret_s16_u16(vqrshrn_n_u32(sum_32x4, 10));
+      break;
+    default:
+      assert(0);
+      // Keep the value defined when assertions are compiled out, so the
+      // subtraction below never reads an uninitialized vector.
+      avg_16x4 = vdup_n_s16(0);
+      break;
+  }
+
+  if (width == 4) {
+    do {
+      vst1_s16(dst, vsub_s16(vreinterpret_s16_u16(vld1_u16(src)), avg_16x4));
+      src += CFL_BUF_LINE;
+      dst += CFL_BUF_LINE;
+    } while (src < end);
+  } else {
+    const int16x8_t avg_16x8 = vcombine_s16(avg_16x4, avg_16x4);
+    do {
+      vldsubstq_s16(dst, src, 0, avg_16x8);
+      vldsubstq_s16(dst, src, CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 2 * CFL_BUF_LINE, avg_16x8);
+      vldsubstq_s16(dst, src, 3 * CFL_BUF_LINE, avg_16x8);
+
+      if (width > 8) {
+        vldsubstq_s16(dst, src, 8, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 8 + 3 * CFL_BUF_LINE, avg_16x8);
+      }
+      if (width == 32) {
+        vldsubstq_s16(dst, src, 16, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 16 + 3 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 2 * CFL_BUF_LINE, avg_16x8);
+        vldsubstq_s16(dst, src, 24 + 3 * CFL_BUF_LINE, avg_16x8);
+      }
+      src += step;
+      dst += step;
+    } while (src < end);
+  }
+}
+
+CFL_SUB_AVG_FN(neon)
+
+// Negates each 16-bit lane of a whose corresponding lane in b is negative
+// (plain two's-complement negation, built from an add and an exclusive-or with
+// the sign mask of b). Other lanes of a are returned unchanged.
+// Notes:
+//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+//     practice, as scaled_luma is the multiplication of two absolute values.
+//   * In the Intel equivalent, elements in a are zeroed out when the
+//     corresponding elements in b are zero. Because vsign is used twice in a
+//     row, with b in the first call becoming a in the second call, there's no
+//     impact from not zeroing out.
+static int16x4_t vsign_s16(int16x4_t a, int16x4_t b) {
+  // Arithmetic shift by 15 leaves all ones where b is negative, zero otherwise.
+  const int16x4_t sign_mask = vshr_n_s16(b, 15);
+  const int16x4_t biased = vadd_s16(a, sign_mask);
+  return veor_s16(biased, sign_mask);
+}
+
+// Negates each 16-bit lane of a whose corresponding lane in b is negative
+// (plain two's-complement negation, built from an add and an exclusive-or with
+// the sign mask of b). Other lanes of a are returned unchanged.
+// Notes:
+//   * Negating INT16_MIN results in INT16_MIN. However, this cannot occur in
+//     practice, as scaled_luma is the multiplication of two absolute values.
+//   * In the Intel equivalent, elements in a are zeroed out when the
+//     corresponding elements in b are zero. Because vsignq is used twice in a
+//     row, with b in the first call becoming a in the second call, there's no
+//     impact from not zeroing out.
+static int16x8_t vsignq_s16(int16x8_t a, int16x8_t b) {
+  // Arithmetic shift by 15 leaves all ones where b is negative, zero otherwise.
+  const int16x8_t sign_mask = vshrq_n_s16(b, 15);
+  const int16x8_t biased = vaddq_s16(a, sign_mask);
+  return veorq_s16(biased, sign_mask);
+}
+
+// Predicts four values: loads four AC values from pred_buf_q3, scales their
+// magnitudes by abs_alpha_q12 with a saturating rounding doubling
+// multiply-high, negates the lanes where alpha_sign and the AC value differ in
+// sign, and adds dc.
+static INLINE int16x4_t predict_w4(const int16_t *pred_buf_q3,
+                                   int16x4_t alpha_sign, int abs_alpha_q12,
+                                   int16x4_t dc) {
+  const int16x4_t ac = vld1_s16(pred_buf_q3);
+  // The sign bit of the XOR is set exactly when the two signs differ.
+  const int16x4_t product_sign = veor_s16(alpha_sign, ac);
+  const int16x4_t magnitude = vqrdmulh_n_s16(vabs_s16(ac), abs_alpha_q12);
+  const int16x4_t signed_luma = vsign_s16(magnitude, product_sign);
+  return vadd_s16(signed_luma, dc);
+}
+
+// Predicts eight values: loads eight AC values from pred_buf_q3, scales their
+// magnitudes by abs_alpha_q12 with a saturating rounding doubling
+// multiply-high, negates the lanes where alpha_sign and the AC value differ in
+// sign, and adds dc.
+static INLINE int16x8_t predict_w8(const int16_t *pred_buf_q3,
+                                   int16x8_t alpha_sign, int abs_alpha_q12,
+                                   int16x8_t dc) {
+  const int16x8_t ac = vld1q_s16(pred_buf_q3);
+  // The sign bit of the XOR is set exactly when the two signs differ.
+  const int16x8_t product_sign = veorq_s16(alpha_sign, ac);
+  const int16x8_t magnitude = vqrdmulhq_n_s16(vabsq_s16(ac), abs_alpha_q12);
+  const int16x8_t signed_luma = vsignq_s16(magnitude, product_sign);
+  return vaddq_s16(signed_luma, dc);
+}
+
+// Predicts sixteen values, returned as two de-interleaved vectors (val[0] holds
+// the even-indexed inputs, val[1] the odd-indexed ones). Each AC magnitude is
+// scaled by abs_alpha_q12, given the sign of alpha_sign XOR the AC value, and
+// offset by dc.
+static INLINE int16x8x2_t predict_w16(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld2q_s16 de-interleaves its input, which prediction has no use for. A
+  // non-interleaving two-register access (the vld1q_s16_x2 / vst1q_s16_x2
+  // family) would avoid that, but is not currently available in the compiler
+  // used by the AOM build system.
+  const int16x8x2_t ac = vld2q_s16(pred_buf_q3);
+  int16x8x2_t result;
+  for (int k = 0; k < 2; ++k) {
+    const int16x8_t product_sign = veorq_s16(alpha_sign, ac.val[k]);
+    const int16x8_t magnitude =
+        vqrdmulhq_n_s16(vabsq_s16(ac.val[k]), abs_alpha_q12);
+    result.val[k] = vaddq_s16(vsignq_s16(magnitude, product_sign), dc);
+  }
+  return result;
+}
+
+// Predicts thirty-two values, returned as four de-interleaved vectors (val[k]
+// holds the inputs whose index is k modulo 4). Each AC magnitude is scaled by
+// abs_alpha_q12, given the sign of alpha_sign XOR the AC value, and offset by
+// dc.
+static INLINE int16x8x4_t predict_w32(const int16_t *pred_buf_q3,
+                                      int16x8_t alpha_sign, int abs_alpha_q12,
+                                      int16x8_t dc) {
+  // vld4q_s16 de-interleaves its input, which prediction has no use for. A
+  // non-interleaving four-register access (the vld1q_s16_x4 / vst1q_s16_x4
+  // family) would avoid that, but is not currently available in the compiler
+  // used by the AOM build system.
+  const int16x8x4_t ac = vld4q_s16(pred_buf_q3);
+  int16x8x4_t result;
+  for (int k = 0; k < 4; ++k) {
+    const int16x8_t product_sign = veorq_s16(alpha_sign, ac.val[k]);
+    const int16x8_t magnitude =
+        vqrdmulhq_n_s16(vabsq_s16(ac.val[k]), abs_alpha_q12);
+    result.val[k] = vaddq_s16(vsignq_s16(magnitude, product_sign), dc);
+  }
+  return result;
+}
+
+// Writes a width x height block of 8-bit predictions to dst. The value already
+// stored at dst[0] is read first and used as the DC term for the whole block;
+// each output is that DC plus the scaled AC value from pred_buf_q3 (row pitch
+// CFL_BUF_LINE), narrowed to 8 bits with unsigned saturation.
+static INLINE void cfl_predict_lbd_neon(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+
+  if (width == 4) {
+    const int16x4_t sign4 = vdup_n_s16(alpha_q3);
+    const int16x4_t dc4 = vdup_n_s16(*dst);
+    do {
+      const int16x4_t row = predict_w4(pred_buf_q3, sign4, abs_alpha_q12, dc4);
+      // Only the low four bytes of the narrowed vector are stored.
+      vsth_u8(dst, vqmovun_s16(vcombine_s16(row, row)));
+      dst += dst_stride;
+      pred_buf_q3 += CFL_BUF_LINE;
+    } while (pred_buf_q3 < last);
+    return;
+  }
+
+  const int16x8_t sign8 = vdupq_n_s16(alpha_q3);
+  const int16x8_t dc8 = vdupq_n_s16(*dst);
+  do {
+    switch (width) {
+      case 8: {
+        const int16x8_t row =
+            predict_w8(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        vst1_u8(dst, vqmovun_s16(row));
+        break;
+      }
+      case 16: {
+        // The interleaving store undoes the de-interleaving load done inside
+        // predict_w16.
+        const int16x8x2_t row =
+            predict_w16(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        const uint8x8x2_t narrowed = { { vqmovun_s16(row.val[0]),
+                                         vqmovun_s16(row.val[1]) } };
+        vst2_u8(dst, narrowed);
+        break;
+      }
+      default: {
+        const int16x8x4_t row =
+            predict_w32(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        const uint8x8x4_t narrowed = {
+          { vqmovun_s16(row.val[0]), vqmovun_s16(row.val[1]),
+            vqmovun_s16(row.val[2]), vqmovun_s16(row.val[3]) }
+        };
+        vst4_u8(dst, narrowed);
+        break;
+      }
+    }
+    dst += dst_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+CFL_PREDICT_FN(neon, lbd)
+
+// Limits each lane of a to at most max, then to at least zero, and returns the
+// result reinterpreted as unsigned.
+static INLINE uint16x4_t clamp_s16(int16x4_t a, int16x4_t max) {
+  const int16x4_t zero = vdup_n_s16(0);
+  const int16x4_t capped = vmin_s16(a, max);
+  return vreinterpret_u16_s16(vmax_s16(capped, zero));
+}
+
+// Limits each lane of a to at most max, then to at least zero, and returns the
+// result reinterpreted as unsigned.
+static INLINE uint16x8_t clampq_s16(int16x8_t a, int16x8_t max) {
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t capped = vminq_s16(a, max);
+  return vreinterpretq_u16_s16(vmaxq_s16(capped, zero));
+}
+
+// Applies the same clamp to both vectors of a: each lane is limited to at most
+// max, then to at least zero, and reinterpreted as unsigned.
+static INLINE uint16x8x2_t clamp2q_s16(int16x8x2_t a, int16x8_t max) {
+  const int16x8_t zero = vdupq_n_s16(0);
+  uint16x8x2_t result;
+  for (int k = 0; k < 2; ++k) {
+    const int16x8_t capped = vminq_s16(a.val[k], max);
+    result.val[k] = vreinterpretq_u16_s16(vmaxq_s16(capped, zero));
+  }
+  return result;
+}
+
+// Applies the same clamp to all four vectors of a: each lane is limited to at
+// most max, then to at least zero, and reinterpreted as unsigned.
+static INLINE uint16x8x4_t clamp4q_s16(int16x8x4_t a, int16x8_t max) {
+  const int16x8_t zero = vdupq_n_s16(0);
+  uint16x8x4_t result;
+  for (int k = 0; k < 4; ++k) {
+    const int16x8_t capped = vminq_s16(a.val[k], max);
+    result.val[k] = vreinterpretq_u16_s16(vmaxq_s16(capped, zero));
+  }
+  return result;
+}
+
+// Writes a width x height block of 16-bit predictions to dst. The value already
+// stored at dst[0] is read first and used as the DC term for the whole block;
+// each output is that DC plus the scaled AC value from pred_buf_q3 (row pitch
+// CFL_BUF_LINE), clamped to at most (1 << bd) - 1 and at least zero.
+static INLINE void cfl_predict_hbd_neon(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  const int max = (1 << bd) - 1;
+  const int16_t abs_alpha_q12 = abs(alpha_q3) << 9;
+  const int16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+
+  if (width == 4) {
+    const int16x4_t sign4 = vdup_n_s16(alpha_q3);
+    const int16x4_t dc4 = vdup_n_s16(*dst);
+    const int16x4_t limit4 = vdup_n_s16(max);
+    do {
+      const int16x4_t row = predict_w4(pred_buf_q3, sign4, abs_alpha_q12, dc4);
+      vst1_u16(dst, clamp_s16(row, limit4));
+      dst += dst_stride;
+      pred_buf_q3 += CFL_BUF_LINE;
+    } while (pred_buf_q3 < last);
+    return;
+  }
+
+  const int16x8_t sign8 = vdupq_n_s16(alpha_q3);
+  const int16x8_t dc8 = vdupq_n_s16(*dst);
+  const int16x8_t limit8 = vdupq_n_s16(max);
+  do {
+    switch (width) {
+      case 8: {
+        const int16x8_t row =
+            predict_w8(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        vst1q_u16(dst, clampq_s16(row, limit8));
+        break;
+      }
+      case 16: {
+        // The interleaving store undoes the de-interleaving load done inside
+        // predict_w16.
+        const int16x8x2_t row =
+            predict_w16(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        vst2q_u16(dst, clamp2q_s16(row, limit8));
+        break;
+      }
+      default: {
+        const int16x8x4_t row =
+            predict_w32(pred_buf_q3, sign8, abs_alpha_q12, dc8);
+        vst4q_u16(dst, clamp4q_s16(row, limit8));
+        break;
+      }
+    }
+    dst += dst_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+CFL_PREDICT_FN(neon, hbd)
diff --git a/third_party/aom/av1/common/arm/convolve_neon.c b/third_party/aom/av1/common/arm/convolve_neon.c
new file mode 100644
index 0000000000..d0c4f8ff67
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.c
@@ -0,0 +1,1455 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// 8-tap filter over four lanes: returns the sum of s0..s7, each multiplied by
+// the matching entry of filter, as 16-bit values with no rounding or shift
+// applied.
+static INLINE int16x4_t convolve8_4x4(const int16x4_t s0, const int16x4_t s1,
+                                      const int16x4_t s2, const int16x4_t s3,
+                                      const int16x4_t s4, const int16x4_t s5,
+                                      const int16x4_t s6, const int16x4_t s7,
+                                      const int16_t *filter) {
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x4_t acc = vmul_n_s16(s0, filter[0]);
+  acc = vmla_n_s16(acc, s1, filter[1]);
+  acc = vmla_n_s16(acc, s2, filter[2]);
+  acc = vmla_n_s16(acc, s5, filter[5]);
+  acc = vmla_n_s16(acc, s6, filter[6]);
+  acc = vmla_n_s16(acc, s7, filter[7]);
+
+  /* filter[3] can take a max value of 128, so 128 * 255 plus the partial sum
+   * can exceed 16 bits; the two centre taps are added with saturation.
+   */
+  const int16x4_t centre_lo = vmul_n_s16(s3, filter[3]);
+  const int16x4_t centre_hi = vmul_n_s16(s4, filter[4]);
+  acc = vqadd_s16(acc, centre_lo);
+  return vqadd_s16(acc, centre_hi);
+}
+
+// 8-tap filter over eight lanes, followed by two saturating rounding shifts
+// (first by shift_round_0, then by shift_by_bits) and a saturating narrow to
+// unsigned 8 bits.
+static INLINE uint8x8_t convolve8_horiz_8x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+    const int16x8_t shift_round_0, const int16x8_t shift_by_bits) {
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x8_t acc = vmulq_n_s16(s0, filter[0]);
+  acc = vmlaq_n_s16(acc, s1, filter[1]);
+  acc = vmlaq_n_s16(acc, s2, filter[2]);
+  acc = vmlaq_n_s16(acc, s5, filter[5]);
+  acc = vmlaq_n_s16(acc, s6, filter[6]);
+  acc = vmlaq_n_s16(acc, s7, filter[7]);
+
+  /* filter[3] can take a max value of 128, so 128 * 255 plus the partial sum
+   * can exceed 16 bits; the two centre taps are added with saturation.
+   */
+  const int16x8_t centre_lo = vmulq_n_s16(s3, filter[3]);
+  const int16x8_t centre_hi = vmulq_n_s16(s4, filter[4]);
+  acc = vqaddq_s16(acc, centre_lo);
+  acc = vqaddq_s16(acc, centre_hi);
+
+  const int16x8_t rounded = vqrshlq_s16(acc, shift_round_0);
+  return vqmovun_s16(vqrshlq_s16(rounded, shift_by_bits));
+}
+
+#if !defined(__aarch64__)
+// 8-tap filter over four lanes, followed by two saturating rounding shifts
+// (first by shift_round_0, then by shift_by_bits) and a saturating narrow to
+// unsigned 8 bits. The four results appear in both halves of the returned
+// vector.
+static INLINE uint8x8_t convolve8_horiz_4x1(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+    const int16x4_t shift_round_0, const int16x4_t shift_by_bits) {
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x4_t acc = vmul_n_s16(s0, filter[0]);
+  acc = vmla_n_s16(acc, s1, filter[1]);
+  acc = vmla_n_s16(acc, s2, filter[2]);
+  acc = vmla_n_s16(acc, s5, filter[5]);
+  acc = vmla_n_s16(acc, s6, filter[6]);
+  acc = vmla_n_s16(acc, s7, filter[7]);
+
+  /* filter[3] can take a max value of 128, so 128 * 255 plus the partial sum
+   * can exceed 16 bits; the two centre taps are added with saturation.
+   */
+  const int16x4_t centre_lo = vmul_n_s16(s3, filter[3]);
+  const int16x4_t centre_hi = vmul_n_s16(s4, filter[4]);
+  acc = vqadd_s16(acc, centre_lo);
+  acc = vqadd_s16(acc, centre_hi);
+
+  const int16x4_t rounded = vqrshl_s16(acc, shift_round_0);
+  const int16x4_t shifted = vqrshl_s16(rounded, shift_by_bits);
+  return vqmovun_s16(vcombine_s16(shifted, shifted));
+}
+#endif  // !defined(__aarch64__)
+
+// 8-tap filter over eight lanes, then a saturating rounding right shift by
+// FILTER_BITS that narrows the result to unsigned 8 bits.
+static INLINE uint8x8_t convolve8_vert_8x4(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *filter) {
+  // The six outer taps are accumulated with plain multiply-accumulate.
+  int16x8_t acc = vmulq_n_s16(s0, filter[0]);
+  acc = vmlaq_n_s16(acc, s1, filter[1]);
+  acc = vmlaq_n_s16(acc, s2, filter[2]);
+  acc = vmlaq_n_s16(acc, s5, filter[5]);
+  acc = vmlaq_n_s16(acc, s6, filter[6]);
+  acc = vmlaq_n_s16(acc, s7, filter[7]);
+
+  /* filter[3] can take a max value of 128, so 128 * 255 plus the partial sum
+   * can exceed 16 bits; the two centre taps are added with saturation.
+   */
+  const int16x8_t centre_lo = vmulq_n_s16(s3, filter[3]);
+  const int16x8_t centre_hi = vmulq_n_s16(s4, filter[4]);
+  acc = vqaddq_s16(acc, centre_lo);
+  acc = vqaddq_s16(acc, centre_hi);
+
+  return vqrshrun_n_s16(acc, FILTER_BITS);
+}
+
+// 8-tap filter over four lanes with 32-bit accumulation. After the taps are
+// summed, offset_const is added, the sum goes through a saturating rounding
+// shift by round_shift_vec, sub_const_vec is subtracted, negative lanes are
+// raised to zero, and the low 16 bits of each lane are returned (the final
+// narrow does not saturate).
+static INLINE uint16x4_t convolve8_vert_4x4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec) {
+  // Widening multiply-accumulate of all eight taps.
+  int32x4_t acc = vmull_n_s16(s0, y_filter[0]);
+  acc = vmlal_n_s16(acc, s1, y_filter[1]);
+  acc = vmlal_n_s16(acc, s2, y_filter[2]);
+  acc = vmlal_n_s16(acc, s3, y_filter[3]);
+  acc = vmlal_n_s16(acc, s4, y_filter[4]);
+  acc = vmlal_n_s16(acc, s5, y_filter[5]);
+  acc = vmlal_n_s16(acc, s6, y_filter[6]);
+  acc = vmlal_n_s16(acc, s7, y_filter[7]);
+
+  acc = vaddq_s32(acc, offset_const);
+  acc = vqrshlq_s32(acc, round_shift_vec);
+  acc = vsubq_s32(acc, sub_const_vec);
+  acc = vmaxq_s32(acc, vdupq_n_s32(0));
+
+  return vmovn_u32(vreinterpretq_u32_s32(acc));
+}
+
+// 8-tap filter over eight lanes with 32-bit accumulation, done as two groups of
+// four lanes. Each group has offset_const added, goes through a saturating
+// rounding shift by round_shift_vec, has sub_const_vec subtracted and is
+// raised to at least zero. The groups are then narrowed to 16 bits with
+// saturation, shifted again by vec_round_bits (saturating, rounding) and
+// narrowed to 8 bits with saturation.
+static INLINE uint8x8_t convolve8_vert_8x4_s32(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, const int16x8_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const,
+    const int32x4_t sub_const_vec, const int16x8_t vec_round_bits) {
+  const int32x4_t zero = vdupq_n_s32(0);
+
+  // Lanes 0-3.
+  int32x4_t acc_lo = vmull_n_s16(vget_low_s16(s0), y_filter[0]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s1), y_filter[1]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s2), y_filter[2]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s3), y_filter[3]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s4), y_filter[4]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s5), y_filter[5]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s6), y_filter[6]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s7), y_filter[7]);
+
+  // Lanes 4-7.
+  int32x4_t acc_hi = vmull_n_s16(vget_high_s16(s0), y_filter[0]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s1), y_filter[1]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s2), y_filter[2]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s3), y_filter[3]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s4), y_filter[4]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s5), y_filter[5]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s6), y_filter[6]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s7), y_filter[7]);
+
+  acc_lo = vaddq_s32(acc_lo, offset_const);
+  acc_lo = vqrshlq_s32(acc_lo, round_shift_vec);
+  acc_lo = vsubq_s32(acc_lo, sub_const_vec);
+  acc_lo = vmaxq_s32(acc_lo, zero);
+
+  acc_hi = vaddq_s32(acc_hi, offset_const);
+  acc_hi = vqrshlq_s32(acc_hi, round_shift_vec);
+  acc_hi = vsubq_s32(acc_hi, sub_const_vec);
+  acc_hi = vmaxq_s32(acc_hi, zero);
+
+  const uint16x8_t narrowed =
+      vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(acc_lo)),
+                   vqmovn_u32(vreinterpretq_u32_s32(acc_hi)));
+
+  return vqmovn_u16(vqrshlq_u16(narrowed, vec_round_bits));
+}
+
+void av1_convolve_x_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) { /*
+ * Horizontal-only sub-pixel convolution of an 8-bit block (NEON).
+ *
+ * src, src_stride: source pixels; src is stepped back by
+ *                  filter_params_x->taps / 2 - 1 columns before filtering.
+ * dst, dst_stride: destination for the filtered 8-bit pixels.
+ * w, h:            block size. Stores exist only for w == 2, w == 4 and for
+ *                  8-column groups (presumably w is 2, 4 or a multiple of 8
+ *                  -- TODO confirm against callers).
+ * filter_params_x, subpel_x_q4: select the kernel through
+ *                  av1_get_interp_filter_subpel_kernel().
+ * filter_params_y, subpel_y_q4: unused.
+ * conv_params:     round_0 is read; round_1 is read only inside an assert.
+ *
+ * Two shift vectors are prepared: -round_0 and -(FILTER_BITS - round_0).
+ * In the h == 4 path they are applied directly with vqrshlq_s16 (a negative
+ * count is a rounding right shift) before vqmovun_s16 narrows to uint8. In
+ * the other paths they are handed to the convolve8_horiz_* helpers, which
+ * presumably apply them the same way -- confirm in the helper definitions.
+ *
+ * Every helper call takes eight sample vectors per output, so the kernel is
+ * consumed as eight taps.
+ */
+ const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
+ const int8_t bits = FILTER_BITS - conv_params->round_0; // second-stage shift
+
+ (void)subpel_y_q4;
+ (void)conv_params; // NOTE(review): redundant, conv_params is read in this function
+ (void)filter_params_y;
+
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+#endif
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0); // negative count = right shift
+ const int16x8_t shift_by_bits = vdupq_n_s16(-bits);
+
+ src -= horiz_offset; // move to the first tap of the filter window
+#if defined(__aarch64__)
+ if (h == 4) { // AArch64 only: four rows, four output columns per iteration
+ uint8x8_t d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+ int16x8_t d01_temp, d23_temp;
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ src += 7; // next load starts at column 7; s0..s6 presumably hold columns 0..6
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, x_filter);
+
+ d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, x_filter);
+
+ d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, x_filter);
+
+ d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, x_filter);
+
+ d01_temp = vqrshlq_s16(vcombine_s16(d0, d1), shift_round_0); // first rounding shift
+ d23_temp = vqrshlq_s16(vcombine_s16(d2, d3), shift_round_0);
+
+ d01_temp = vqrshlq_s16(d01_temp, shift_by_bits); // second rounding shift
+ d23_temp = vqrshlq_s16(d23_temp, shift_by_bits);
+
+ d01 = vqmovun_s16(d01_temp); // saturating narrow to unsigned 8-bit
+ d23 = vqmovun_s16(d23_temp);
+
+ transpose_u8_4x4(&d01, &d23);
+
+ if (w != 2) { // w is the remaining width here; it is decremented below
+ vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), // 00 01 02 03
+ vreinterpret_u32_u8(d01), 0);
+ vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), // 10 11 12 13
+ vreinterpret_u32_u8(d23), 0);
+ vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), // 20 21 22 23
+ vreinterpret_u32_u8(d01), 1);
+ vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), // 30 31 32 33
+ vreinterpret_u32_u8(d23), 1);
+ } else {
+ vst1_lane_u16((uint16_t *)(dst + 0 * dst_stride), // 00 01
+ vreinterpret_u16_u8(d01), 0);
+ vst1_lane_u16((uint16_t *)(dst + 1 * dst_stride), // 10 11
+ vreinterpret_u16_u8(d23), 0);
+ vst1_lane_u16((uint16_t *)(dst + 2 * dst_stride), // 20 21
+ vreinterpret_u16_u8(d01), 2);
+ vst1_lane_u16((uint16_t *)(dst + 3 * dst_stride), // 30 31
+ vreinterpret_u16_u8(d23), 2);
+ }
+
+ s0 = s4; // slide the seven-vector history forward by four columns
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w > 0);
+ } else {
+#endif
+ int width; /* From here on: every height on 32-bit ARM, and every
+ * height other than 4 on AArch64. */
+ const uint8_t *s;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t t4, t5, t6, t7;
+#endif
+
+ if (w <= 4) { // narrow blocks: only 2 or 4 bytes are stored per row
+#if defined(__aarch64__)
+ do { // eight rows per iteration
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); // NOTE(review): presumably reads eight rows even when h == 2 -- confirm source padding
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ src += 8 * src_stride;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ shift_round_0, shift_by_bits);
+ t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ shift_round_0, shift_by_bits);
+ t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ shift_round_0, shift_by_bits);
+
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ if ((w == 4) && (h > 4)) { // h == 4 never reaches this loop on AArch64 (taken by the branch above)
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 0); // 10 11 12 13
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+ 0); // 20 21 22 23
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+ 0); // 30 31 32 33
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 1); // 40 41 42 43
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 1); // 50 51 52 53
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2),
+ 1); // 60 61 62 63
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3),
+ 1); // 70 71 72 73
+ dst += dst_stride;
+ } else if ((w == 4) && (h == 2)) { // only the first two of the computed rows are written
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1),
+ 0); // 10 11 12 13
+ dst += dst_stride;
+ } else if ((w == 2) && (h > 4)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 0); // 20 21
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 0); // 30 31
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 2); // 40 41
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 2); // 50 51
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t2), 2); // 60 61
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t3), 2); // 70 71
+ dst += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t1), 0); // 10 11
+ dst += dst_stride;
+ }
+ h -= 8; // goes negative for h == 2, which simply ends the loop
+ } while (h > 0);
+#else
+ int16x8_t tt0; // 32-bit ARM: one output row per iteration, no transposes
+ int16x4_t x0, x1, x2, x3, x4, x5, x6, x7;
+ const int16x4_t shift_round_0_low = vget_low_s16(shift_round_0);
+ const int16x4_t shift_by_bits_low = vget_low_s16(shift_by_bits);
+ do {
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ x4 = vget_high_s16(tt0); // a4 a5 a6 a7
+
+ t0 = vld1_u8(src + 8); // a8 a9 a10 a11 a12 a13 a14 a15
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ x7 = vget_low_s16(tt0); // a8 a9 a10 a11
+
+ x1 = vext_s16(x0, x4, 1); // a1 a2 a3 a4
+ x2 = vext_s16(x0, x4, 2); // a2 a3 a4 a5
+ x3 = vext_s16(x0, x4, 3); // a3 a4 a5 a6
+ x5 = vext_s16(x4, x7, 1); // a5 a6 a7 a8
+ x6 = vext_s16(x4, x7, 2); // a6 a7 a8 a9
+ x7 = vext_s16(x4, x7, 3); // a7 a8 a9 a10
+
+ src += src_stride;
+
+ t0 = convolve8_horiz_4x1(x0, x1, x2, x3, x4, x5, x6, x7, x_filter,
+ shift_round_0_low, shift_by_bits_low);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(t0), 0); // 00 01
+ dst += dst_stride;
+ }
+ h -= 1;
+ } while (h > 0);
+#endif
+ } else { // w > 4: output is produced in groups of eight columns
+ uint8_t *d;
+ int16x8_t s11;
+#if defined(__aarch64__)
+ int16x8_t s12, s13, s14;
+ do { // outer loop: one band of eight rows
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(dst + 4 * dst_stride);
+ __builtin_prefetch(dst + 5 * dst_stride);
+ __builtin_prefetch(dst + 6 * dst_stride);
+ __builtin_prefetch(dst + 7 * dst_stride);
+
+ do { // inner loop: one 8x8 output tile
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ t0 = convolve8_horiz_8x8(s0, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t1 = convolve8_horiz_8x8(s1, s2, s3, s4, s5, s6, s7, s8, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t2 = convolve8_horiz_8x8(s2, s3, s4, s5, s6, s7, s8, s9, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t3 = convolve8_horiz_8x8(s3, s4, s5, s6, s7, s8, s9, s10, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t4 = convolve8_horiz_8x8(s4, s5, s6, s7, s8, s9, s10, s11, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t5 = convolve8_horiz_8x8(s5, s6, s7, s8, s9, s10, s11, s12, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t6 = convolve8_horiz_8x8(s6, s7, s8, s9, s10, s11, s12, s13, x_filter,
+ shift_round_0, shift_by_bits);
+
+ t7 = convolve8_horiz_8x8(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter, shift_round_0, shift_by_bits);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ if (h != 2) { // h is the remaining height; it is decremented by 8 per band
+ store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
+ } else { // two-row block: only t0 and t1 are passed to the store helper
+ store_row2_u8_8x8(d, dst_stride, t0, t1);
+ }
+ s0 = s8; // keep the last seven columns as history for the next tile
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+#else
+ do { // 32-bit ARM: one row per outer iteration
+ t0 = vld1_u8(src); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+
+ width = w;
+ s = src + 8;
+ d = dst;
+ __builtin_prefetch(dst);
+
+ do { // eight output pixels per inner iteration
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s11 = s0; // s11 = previous eight pixels
+ s0 = s7; // carry the new eight pixels into the next iteration
+
+ s1 = vextq_s16(s11, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(s11, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(s11, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(s11, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(s11, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(s11, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(s11, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ t0 = convolve8_horiz_8x8(s11, s1, s2, s3, s4, s5, s6, s7, x_filter,
+ shift_round_0, shift_by_bits);
+ vst1_u8(d, t0);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 1;
+ } while (h > 0);
+#endif
+ }
+#if defined(__aarch64__)
+ } // closes the else of the h == 4 test, which exists only on AArch64
+#endif
+}
+
+void av1_convolve_y_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) { /*
+ * Vertical-only sub-pixel convolution of an 8-bit block (NEON).
+ *
+ * src, src_stride: source pixels; src is stepped back by
+ *                  filter_params_y->taps / 2 - 1 rows before filtering.
+ * dst, dst_stride: destination for the filtered 8-bit pixels.
+ * w, h:            block size. The narrow path stores only for w == 2 or
+ *                  w == 4; the wide path works in 8-column strips
+ *                  (presumably w is 2, 4 or a multiple of 8 -- TODO confirm
+ *                  against callers).
+ * filter_params_y, subpel_y_q4: select the kernel through
+ *                  av1_get_interp_filter_subpel_kernel().
+ * filter_params_x, subpel_x_q4: unused.
+ * conv_params:     read only inside the assert() calls.
+ *
+ * Seven rows are loaded up front as filter history. AArch64 then produces
+ * four output rows per loop iteration; 32-bit ARM produces one.
+ */
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+
+ src -= vert_offset * src_stride; // start at the first tap row of the window
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params; // only referenced by the asserts below
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ if (w <= 4) { // narrow blocks: four 16-bit lanes per row
+ uint8x8_t d01;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ uint8x8_t d23;
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
+ s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); // loads 8 bytes, keeps the low 4 widened lanes
+ src += src_stride;
+ s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ do {
+ s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+#if defined(__aarch64__)
+ s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); // three further rows are loaded regardless of h
+ src += src_stride;
+ s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+ s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src))));
+ src += src_stride;
+
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(dst + 1 * dst_stride);
+ __builtin_prefetch(dst + 2 * dst_stride);
+ __builtin_prefetch(dst + 3 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ d1 = convolve8_4x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ d2 = convolve8_4x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ d3 = convolve8_4x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); // rounding shift by FILTER_BITS, saturate to u8
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+ if ((w == 4) && (h != 2)) { // h is the remaining height; it is decremented by 4 below
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
+ dst += dst_stride;
+ } else if ((w == 4) && (h == 2)) { // rows 2 and 3 (d23) are computed but not written
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ dst += dst_stride;
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ dst += dst_stride;
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 0); // 20 21
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d23), 2); // 30 31
+ dst += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0); // 00 01
+ dst += dst_stride;
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 2); // 10 11
+ dst += dst_stride;
+ }
+ s0 = s4; // slide the seven-row history down by four rows
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ h -= 4;
+#else
+ __builtin_prefetch(dst + 0 * dst_stride);
+ __builtin_prefetch(src + 0 * src_stride);
+
+ d0 = convolve8_4x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d0), FILTER_BITS); // d0 duplicated to fill 8 lanes; only lane 0 is stored
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
+ dst += dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)dst, vreinterpret_u16_u8(d01), 0);
+ dst += dst_stride;
+ }
+ s0 = s1; // slide the seven-row history down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ h -= 1;
+#endif
+ } while (h > 0);
+ } else { // w > 4: process the block as vertical strips of eight columns
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+ uint8x8_t t0;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3;
+ int16x8_t s8, s9, s10;
+#endif
+ do { // outer loop: one 8-column strip
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ s = src;
+ s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ d = dst;
+ height = h; // per-strip row counter; h itself is left unchanged in this path
+
+ do { // inner loop: walk down the strip
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+#if defined(__aarch64__)
+ s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+ s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+ s += src_stride;
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ t1 = convolve8_vert_8x4(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ t2 = convolve8_vert_8x4(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ t3 = convolve8_vert_8x4(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+ if (h != 2) { // tests the block height h, not the loop counter height
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ vst1_u8(d, t2);
+ d += dst_stride;
+ vst1_u8(d, t3);
+ d += dst_stride;
+ } else { // two-row block: t2 and t3 are computed but not written
+ vst1_u8(d, t0);
+ d += dst_stride;
+ vst1_u8(d, t1);
+ d += dst_stride;
+ }
+ s0 = s4; // slide the seven-row history down by four rows
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ __builtin_prefetch(d);
+ __builtin_prefetch(s);
+
+ t0 = convolve8_vert_8x4(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ vst1_u8(d, t0);
+ d += dst_stride;
+
+ s0 = s1; // slide the seven-row history down by one row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ src += 8; // advance to the next 8-column strip
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void av1_convolve_2d_sr_neon(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int im_dst_stride;
+ int width, height;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+ const int bd = 8;
+ const int im_h = h + filter_params_y->taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+ const uint8_t *s;
+ int16_t *dst_ptr;
+
+ dst_ptr = im_block;
+ im_dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // filter coeffs are even, so downshifting by 1 to reduce intermediate
+ // precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+ assert(conv_params->round_0 > 0);
+
+ if (w <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+#endif
+
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
+
+ do {
+ s = src_ptr;
+
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ s += 7;
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+ if (w == 4) {
+ vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ }
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
+#else
+ int16x8_t tt0;
+
+ __builtin_prefetch(s);
+
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s0 = vget_low_s16(tt0);
+ s4 = vget_high_s16(tt0);
+
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ if (w == 4) {
+ vst1_s16(dst_ptr, d0);
+ dst_ptr += im_dst_stride;
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_s16(d0), 0);
+ dst_ptr += im_dst_stride;
+ }
+
+ src_ptr += src_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
+ int16x8_t s11, s12, s13, s14;
+#endif
+
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+
+#if defined(__aarch64__)
+ do {
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, horiz_const, shift_round_0);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
+ res6, res7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * im_dst_stride;
+ height -= 8;
+ } while (height > 0);
+#else
+ do {
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d_tmp = dst_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t sum = s0;
+ s0 = s7;
+
+ s1 = vextq_s16(sum, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(sum, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(sum, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(sum, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(sum, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(sum, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(sum, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(sum, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += src_stride;
+ dst_ptr += im_dst_stride;
+ height -= 1;
+ } while (height > 0);
+#endif
+ }
+
+ // vertical
+ {
+ uint8_t *dst_u8_ptr, *d_u8;
+ int16_t *v_src_ptr, *v_s;
+
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+
+ src_stride = im_stride;
+ v_src_ptr = im_block;
+ dst_u8_ptr = dst;
+
+ height = h;
+ width = w;
+
+ if (width <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
+#endif
+
+ d_u8 = dst_u8_ptr;
+ v_s = v_src_ptr;
+
+ __builtin_prefetch(v_s + 0 * im_stride);
+ __builtin_prefetch(v_s + 1 * im_stride);
+ __builtin_prefetch(v_s + 2 * im_stride);
+ __builtin_prefetch(v_s + 3 * im_stride);
+ __builtin_prefetch(v_s + 4 * im_stride);
+ __builtin_prefetch(v_s + 5 * im_stride);
+ __builtin_prefetch(v_s + 6 * im_stride);
+ __builtin_prefetch(v_s + 7 * im_stride);
+
+ load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
+
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 1 * dst_stride);
+ __builtin_prefetch(d_u8 + 2 * dst_stride);
+ __builtin_prefetch(d_u8 + 3 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+
+ d01 = vqmovn_u16(dd0);
+ d23 = vqmovn_u16(dd1);
+
+ if ((w == 4) && (h != 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 0); // 20 21
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 2); // 30 31
+ d_u8 += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ // if width is a multiple of 8 & height is a multiple of 4
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
+
+ do {
+ __builtin_prefetch(v_src_ptr + 0 * im_stride);
+ __builtin_prefetch(v_src_ptr + 1 * im_stride);
+ __builtin_prefetch(v_src_ptr + 2 * im_stride);
+ __builtin_prefetch(v_src_ptr + 3 * im_stride);
+ __builtin_prefetch(v_src_ptr + 4 * im_stride);
+ __builtin_prefetch(v_src_ptr + 5 * im_stride);
+ __builtin_prefetch(v_src_ptr + 6 * im_stride);
+ __builtin_prefetch(v_src_ptr + 7 * im_stride);
+
+ v_s = v_src_ptr;
+ load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
+
+ d_u8 = dst_u8_ptr;
+ height = h;
+
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
+
+ __builtin_prefetch(d_u8 + 4 * dst_stride);
+ __builtin_prefetch(d_u8 + 5 * dst_stride);
+ __builtin_prefetch(d_u8 + 6 * dst_stride);
+ __builtin_prefetch(d_u8 + 7 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+ res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ if (h != 2) {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res2);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res3);
+ d_u8 += dst_stride;
+ } else {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
+ y_filter, round_shift_vec, offset_const,
+ sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ v_src_ptr += 8;
+ dst_u8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+ }
+}
+/*
+ * Copy variant of the 2D single-reference convolve: no filtering is applied,
+ * the w x h block of 8-bit pixels at src is copied unchanged to dst.
+ *
+ * src / src_stride and dst / dst_stride describe the source and destination
+ * rows. filter_params_x, filter_params_y, subpel_x_q4, subpel_y_q4 and
+ * conv_params are not used by this function.
+ *
+ * Each row is copied as a run of 16-byte vectors, then at most one 8-byte
+ * vector, then single bytes. Exactly w bytes per row are read and written, so
+ * widths that are not a power of two (e.g. 12 or 24) are copied in full and
+ * narrow rows (w < 8) are never loaded past their end.
+ */
+void av1_convolve_2d_copy_sr_neon(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  for (int y = 0; y < h; ++y) {
+    const uint8_t *src_row = src;
+    uint8_t *dst_row = dst;
+    int remaining = w;
+
+    // Bulk of the row: 16 pixels per vector load/store.
+    for (; remaining >= 16; remaining -= 16) {
+      vst1q_u8(dst_row, vld1q_u8(src_row));
+      src_row += 16;
+      dst_row += 16;
+    }
+
+    // After the 16-wide loop at most one 8-pixel vector is left.
+    if (remaining >= 8) {
+      vst1_u8(dst_row, vld1_u8(src_row));
+      src_row += 8;
+      dst_row += 8;
+      remaining -= 8;
+    }
+
+    // Fewer than 8 pixels remain. Copy them one byte at a time so that no
+    // load reaches past the end of the row and no store goes through a
+    // wider-typed (possibly misaligned) pointer.
+    for (; remaining > 0; --remaining) {
+      *dst_row++ = *src_row++;
+    }
+
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
diff --git a/third_party/aom/av1/common/arm/convolve_neon.h b/third_party/aom/av1/common/arm/convolve_neon.h
new file mode 100644
index 0000000000..f382984f27
--- /dev/null
+++ b/third_party/aom/av1/common/arm/convolve_neon.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+#define AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
+
+#include <arm_neon.h>
+
+#define HORIZ_EXTRA_ROWS ((SUBPEL_TAPS + 7) & ~0x07)
+
+/*
+ * Vertical Wiener filter step for eight pixels.
+ *
+ * s0..s6 are seven input rows of eight 16-bit values. Rows that share a
+ * coefficient in this code (s0/s6, s1/s5, s2/s4) are added first, then the
+ * three sums and the centre row s3 are multiplied by filter_y[0..3] and
+ * accumulated in 32 bits. 1 << (bd + round1_bits - 1) is subtracted, the
+ * result is shifted right by round1_bits with rounding, clamped below at
+ * zero and narrowed with saturation to eight 8-bit values.
+ */
+static INLINE uint8x8_t wiener_convolve8_vert_4x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+    const int16x8_t s6, int16_t *filter_y, const int bd,
+    const int round1_bits) {
+  const int32x4_t offset = vdupq_n_s32(1 << (bd + round1_bits - 1));
+  const int32x4_t shift = vdupq_n_s32(-round1_bits);
+  const int32x4_t lower_bound = vdupq_n_s32(0);
+
+  /* Fold the rows that use the same coefficient. */
+  const int16x8_t pair0 = vaddq_s16(s0, s6);
+  const int16x8_t pair1 = vaddq_s16(s1, s5);
+  const int16x8_t pair2 = vaddq_s16(s2, s4);
+
+  /* Low four lanes. */
+  int32x4_t acc_lo = vmull_n_s16(vget_low_s16(pair0), filter_y[0]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(pair1), filter_y[1]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(pair2), filter_y[2]);
+  acc_lo = vmlal_n_s16(acc_lo, vget_low_s16(s3), filter_y[3]);
+
+  /* High four lanes. */
+  int32x4_t acc_hi = vmull_n_s16(vget_high_s16(pair0), filter_y[0]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(pair1), filter_y[1]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(pair2), filter_y[2]);
+  acc_hi = vmlal_n_s16(acc_hi, vget_high_s16(s3), filter_y[3]);
+
+  /* Remove the offset, then rounding right shift and clamp at zero. */
+  acc_lo = vmaxq_s32(vrshlq_s32(vsubq_s32(acc_lo, offset), shift),
+                     lower_bound);
+  acc_hi = vmaxq_s32(vrshlq_s32(vsubq_s32(acc_hi, offset), shift),
+                     lower_bound);
+
+  /* Saturating narrow: 32 -> 16 -> 8 bits. */
+  const uint16x4_t narrow_lo = vqmovn_u32(vreinterpretq_u32_s32(acc_lo));
+  const uint16x4_t narrow_hi = vqmovn_u32(vreinterpretq_u32_s32(acc_hi));
+  return vqmovn_u16(vcombine_u16(narrow_lo, narrow_hi));
+}
+
+/*
+ * Horizontal Wiener filter step for eight pixels.
+ *
+ * s0, s1 and s2 are multiplied by filter_x[0..2] and accumulated in 16-bit
+ * lanes; s3 is multiplied by filter_x[3] in 32-bit lanes. (Presumably the
+ * caller passes already-added symmetric tap pairs in s0..s2 and the centre
+ * samples in s3 -- TODO confirm against the call site.)
+ *
+ * The sum is offset by 1 << (bd + FILTER_BITS - 1), shifted right by
+ * round0_bits with rounding, and clamped to
+ * [0, (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1] before being returned
+ * as eight unsigned 16-bit values.
+ */
+static INLINE uint16x8_t wiener_convolve8_horiz_8x8(
+    const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+    const int16x8_t s3, int16_t *filter_x, const int bd,
+    const int round0_bits) {
+  int16x8_t sum;
+  uint16x8_t res;
+  int32x4_t sum_0, sum_1;
+  int32x4_t s3_0, s3_1;
+  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+  /* Inclusive upper bound of the intermediate range: one less than the power
+   * of two. Clamping to the power of two itself would let through a value
+   * one above the permitted maximum. */
+  const int32_t round_const_1 =
+      (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
+
+  /* for the purpose of right shift by { conv_params->round_0 } */
+  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+
+  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+  sum = vmulq_n_s16(s0, filter_x[0]);
+  sum = vmlaq_n_s16(sum, s1, filter_x[1]);
+  sum = vmlaq_n_s16(sum, s2, filter_x[2]);
+
+  /* sum from 16x8 to 2 32x4 registers */
+  sum_0 = vmovl_s16(vget_low_s16(sum));
+  sum_1 = vmovl_s16(vget_high_s16(sum));
+
+  /* s[3]*128 -- and the filter coefficient can be as large as 128, so the
+   * largest possible product is 128*128*255, which exceeds 16 bits. The
+   * multiply is therefore done in 32 bits.
+   */
+  s3_0 = vmull_n_s16(vget_low_s16(s3), filter_x[3]);
+  s3_1 = vmull_n_s16(vget_high_s16(s3), filter_x[3]);
+  sum_0 = vaddq_s32(sum_0, s3_0);
+  sum_1 = vaddq_s32(sum_1, s3_1);
+
+  /* Add the constant value */
+  sum_0 = vaddq_s32(sum_0, round_vec_0);
+  sum_1 = vaddq_s32(sum_1, round_vec_0);
+
+  /* right shift & rounding & saturating */
+  sum_0 = vqrshlq_s32(sum_0, round_bits);
+  sum_1 = vqrshlq_s32(sum_1, round_bits);
+
+  /* Clipping to max value */
+  sum_0 = vminq_s32(sum_0, round_vec_1);
+  sum_1 = vminq_s32(sum_1, round_vec_1);
+
+  /* vqmovun saturates negative lanes to zero, giving the lower clamp. */
+  res = vcombine_u16(vqmovun_s32(sum_0), vqmovun_s32(sum_1));
+  return res;
+}
+
+/*
+ * Horizontal Wiener filter step for four pixels.
+ *
+ * s0..s6 are seven neighbouring input vectors. Vectors that share a
+ * coefficient in this code (s0/s6, s1/s5, s2/s4) are added first; the three
+ * sums are multiplied by filter_x[0..2] in 16-bit lanes and the centre
+ * vector s3 by filter_x[3] in 32-bit lanes.
+ *
+ * The sum is offset by 1 << (bd + FILTER_BITS - 1), shifted right by
+ * round0_bits with rounding, and clamped to
+ * [0, (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1] before being returned
+ * as four unsigned 16-bit values.
+ */
+static INLINE uint16x4_t wiener_convolve8_horiz_4x8(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, int16_t *filter_x, const int bd,
+    const int round0_bits) {
+  uint16x4_t res;
+  int32x4_t sum_0, s3_0;
+  int16x4_t sum, temp0, temp1, temp2;
+
+  const int32_t round_const_0 = (1 << (bd + FILTER_BITS - 1));
+  /* Inclusive upper bound of the intermediate range: one less than the power
+   * of two. Clamping to the power of two itself would let through a value
+   * one above the permitted maximum. */
+  const int32_t round_const_1 =
+      (1 << (bd + 1 + FILTER_BITS - round0_bits)) - 1;
+  const int32x4_t round_bits = vdupq_n_s32(-round0_bits);
+  const int32x4_t zero = vdupq_n_s32(0);
+  const int32x4_t round_vec_0 = vdupq_n_s32(round_const_0);
+  const int32x4_t round_vec_1 = vdupq_n_s32(round_const_1);
+
+  temp0 = vadd_s16(s0, s6);
+  temp1 = vadd_s16(s1, s5);
+  temp2 = vadd_s16(s2, s4);
+
+  sum = vmul_n_s16(temp0, filter_x[0]);
+  sum = vmla_n_s16(sum, temp1, filter_x[1]);
+  sum = vmla_n_s16(sum, temp2, filter_x[2]);
+  sum_0 = vmovl_s16(sum);
+
+  /* s[3]*128 -- and the filter coefficient can be as large as 128, so the
+   * largest possible product is 128*128*255. Therefore, 32 bits are required
+   * to hold the result.
+   */
+  s3_0 = vmull_n_s16(s3, filter_x[3]);
+  sum_0 = vaddq_s32(sum_0, s3_0);
+
+  sum_0 = vaddq_s32(sum_0, round_vec_0);
+  sum_0 = vrshlq_s32(sum_0, round_bits);
+
+  /* Clamp to [0, round_const_1]. */
+  sum_0 = vmaxq_s32(sum_0, zero);
+  sum_0 = vminq_s32(sum_0, round_vec_1);
+  res = vqmovun_s32(sum_0);
+  return res;
+}
+
+/*
+ * 8-tap multiply-accumulate over eight 16-bit lanes.
+ *
+ * Starts from horiz_const, adds s0..s7 each scaled by filter[0..7] (all in
+ * 16-bit lanes), then applies the saturating rounding shift given by
+ * shift_round_0 (negative lane values shift right).
+ */
+static INLINE int16x8_t
+convolve8_8x8_s16(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+                  const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+                  const int16x8_t s6, const int16x8_t s7, const int16_t *filter,
+                  const int16x8_t horiz_const, const int16x8_t shift_round_0) {
+  int16x8_t acc = vmlaq_n_s16(horiz_const, s0, filter[0]);
+
+  acc = vmlaq_n_s16(acc, s1, filter[1]);
+  acc = vmlaq_n_s16(acc, s2, filter[2]);
+  acc = vmlaq_n_s16(acc, s3, filter[3]);
+  acc = vmlaq_n_s16(acc, s4, filter[4]);
+  acc = vmlaq_n_s16(acc, s5, filter[5]);
+  acc = vmlaq_n_s16(acc, s6, filter[6]);
+  acc = vmlaq_n_s16(acc, s7, filter[7]);
+
+  return vqrshlq_s16(acc, shift_round_0);
+}
+
+/*
+ * 8-tap multiply-accumulate over four 16-bit lanes.
+ *
+ * Starts from horiz_const, adds s0..s7 each scaled by filter[0..7] (all in
+ * 16-bit lanes), then applies the saturating rounding shift given by
+ * shift_round_0 (negative lane values shift right).
+ */
+static INLINE int16x4_t
+convolve8_4x4_s16(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+                  const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+                  const int16x4_t s6, const int16x4_t s7, const int16_t *filter,
+                  const int16x4_t horiz_const, const int16x4_t shift_round_0) {
+  int16x4_t acc = vmla_n_s16(horiz_const, s0, filter[0]);
+
+  acc = vmla_n_s16(acc, s1, filter[1]);
+  acc = vmla_n_s16(acc, s2, filter[2]);
+  acc = vmla_n_s16(acc, s3, filter[3]);
+  acc = vmla_n_s16(acc, s4, filter[4]);
+  acc = vmla_n_s16(acc, s5, filter[5]);
+  acc = vmla_n_s16(acc, s6, filter[6]);
+  acc = vmla_n_s16(acc, s7, filter[7]);
+
+  return vqrshl_s16(acc, shift_round_0);
+}
+
+/*
+ * 8-tap multiply-accumulate over four lanes with a 32-bit accumulator.
+ *
+ * s0..s7 are scaled by y_filter[0..7] and summed in 32 bits, offset_const is
+ * added, the saturating rounding shift in round_shift_vec is applied
+ * (negative lane values shift right), negative results are clamped to zero
+ * and the low 16 bits of each lane are returned.
+ */
+static INLINE uint16x4_t convolve8_4x4_s32(
+    const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+    const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+    const int16x4_t s6, const int16x4_t s7, const int16_t *y_filter,
+    const int32x4_t round_shift_vec, const int32x4_t offset_const) {
+  int32x4_t acc = vmull_n_s16(s0, y_filter[0]);
+
+  acc = vmlal_n_s16(acc, s1, y_filter[1]);
+  acc = vmlal_n_s16(acc, s2, y_filter[2]);
+  acc = vmlal_n_s16(acc, s3, y_filter[3]);
+  acc = vmlal_n_s16(acc, s4, y_filter[4]);
+  acc = vmlal_n_s16(acc, s5, y_filter[5]);
+  acc = vmlal_n_s16(acc, s6, y_filter[6]);
+  acc = vmlal_n_s16(acc, s7, y_filter[7]);
+
+  acc = vqrshlq_s32(vaddq_s32(acc, offset_const), round_shift_vec);
+  acc = vmaxq_s32(acc, vdupq_n_s32(0));
+
+  /* Plain (non-saturating) narrow, as in the lane-wise 32 -> 16 truncation. */
+  return vmovn_u32(vreinterpretq_u32_s32(acc));
+}
+
+#endif // AOM_AV1_COMMON_ARM_CONVOLVE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/jnt_convolve_neon.c b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
new file mode 100644
index 0000000000..e5674ef7c2
--- /dev/null
+++ b/third_party/aom/av1/common/arm/jnt_convolve_neon.c
@@ -0,0 +1,1740 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+#if !defined(__aarch64__)
+/*
+ * Blends one row of four 16-bit values (res0 with d0) and narrows it to
+ * 8 bits.
+ *
+ * With use_jnt_comp_avg set the blend is
+ * (res0 * fwd_offset + d0 * bck_offset) >> DIST_PRECISION_BITS in 32 bits;
+ * otherwise it is the halving add of res0 and d0. sub_const_vec is then
+ * subtracted, a saturating rounding right shift by round_bits is applied and
+ * the result is narrowed to unsigned 8 bits with saturation. The four result
+ * bytes appear in both halves of *t0.
+ */
+static INLINE void compute_avg_4x1(uint16x4_t res0, uint16x4_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const_vec,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x4_t blended;
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t shift = vdupq_n_s32((int32_t)(-round_bits));
+
+    /* Weighted sum in 32 bits. */
+    uint32x4_t weighted = vmull_n_u16(res0, fwd_offset);
+    weighted = vmlal_n_u16(weighted, d0, bck_offset);
+    weighted = vshrq_n_u32(weighted, DIST_PRECISION_BITS);
+
+    const int32x4_t centred =
+        vsubq_s32(vreinterpretq_s32_u32(weighted), vmovl_s16(sub_const_vec));
+
+    blended = vqmovn_s32(vqrshlq_s32(centred, shift));
+  } else {
+    const int16x4_t shift = vdup_n_s16(-round_bits);
+    const uint16x4_t halved = vhadd_u16(res0, d0);
+
+    blended = vsub_s16(vreinterpret_s16_u16(halved), sub_const_vec);
+    blended = vqrshl_s16(blended, shift);
+  }
+
+  /* Duplicate into both halves so the 8-lane narrowing can be used. */
+  *t0 = vqmovun_s16(vcombine_s16(blended, blended));
+}
+
+/*
+ * Blends one row of eight 16-bit values (res0 with d0) and narrows it to
+ * eight 8-bit values in *t0.
+ *
+ * With use_jnt_comp_avg set the blend is
+ * (res0 * fwd_offset + d0 * bck_offset) >> DIST_PRECISION_BITS in 32 bits;
+ * otherwise it is the halving add of res0 and d0. sub_const is then
+ * subtracted, a saturating rounding right shift by round_bits is applied and
+ * the result is narrowed to unsigned 8 bits with saturation.
+ */
+static INLINE void compute_avg_8x1(uint16x8_t res0, uint16x8_t d0,
+                                   const uint16_t fwd_offset,
+                                   const uint16_t bck_offset,
+                                   const int16x4_t sub_const,
+                                   const int16_t round_bits,
+                                   const int use_jnt_comp_avg, uint8x8_t *t0) {
+  int16x8_t blended;
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t offset = vmovl_s16(sub_const);
+    const int32x4_t shift = vdupq_n_s32(-(int32_t)round_bits);
+
+    /* Weighted sum of the low four lanes. */
+    uint32x4_t lo = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+    lo = vmlal_n_u16(lo, vget_low_u16(d0), bck_offset);
+    lo = vshrq_n_u32(lo, DIST_PRECISION_BITS);
+
+    /* Weighted sum of the high four lanes. */
+    uint32x4_t hi = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+    hi = vmlal_n_u16(hi, vget_high_u16(d0), bck_offset);
+    hi = vshrq_n_u32(hi, DIST_PRECISION_BITS);
+
+    const int32x4_t lo_rounded =
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(lo), offset), shift);
+    const int32x4_t hi_rounded =
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(hi), offset), shift);
+
+    blended = vcombine_s16(vqmovn_s32(lo_rounded), vqmovn_s32(hi_rounded));
+  } else {
+    const int16x8_t offset = vcombine_s16(sub_const, sub_const);
+    const int16x8_t shift = vdupq_n_s16(-round_bits);
+    const uint16x8_t halved = vhaddq_u16(res0, d0);
+
+    blended = vsubq_s16(vreinterpretq_s16_u16(halved), offset);
+    blended = vqrshlq_s16(blended, shift);
+  }
+
+  *t0 = vqmovun_s16(blended);
+}
+#endif // !defined(__aarch64__)
+
+/*
+ * Blends four rows of four 16-bit values (res0..res3 with d0..d3) and
+ * narrows them to 8 bits: rows 0 and 1 are packed into *t0, rows 2 and 3
+ * into *t1.
+ *
+ * With use_jnt_comp_avg set the blend is
+ * (res * fwd_offset + d * bck_offset) >> DIST_PRECISION_BITS in 32 bits;
+ * otherwise it is the halving add of res and d. sub_const_vec is then
+ * subtracted, a saturating rounding right shift by round_bits is applied,
+ * negative values are clamped to zero and the result is narrowed to
+ * unsigned 8 bits with saturation.
+ */
+static INLINE void compute_avg_4x4(
+    uint16x4_t res0, uint16x4_t res1, uint16x4_t res2, uint16x4_t res3,
+    uint16x4_t d0, uint16x4_t d1, uint16x4_t d2, uint16x4_t d3,
+    const uint16_t fwd_offset, const uint16_t bck_offset,
+    const int16x4_t sub_const_vec, const int16_t round_bits,
+    const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1) {
+  int16x4_t row0, row1, row2, row3;
+
+  if (use_jnt_comp_avg) {
+    const int32x4_t shift = vdupq_n_s32((int32_t)(-round_bits));
+    const int32x4_t offset = vmovl_s16(sub_const_vec);
+
+    /* Weighted sums in 32 bits, one accumulator per row. */
+    uint32x4_t acc0 = vmull_n_u16(res0, fwd_offset);
+    uint32x4_t acc1 = vmull_n_u16(res1, fwd_offset);
+    uint32x4_t acc2 = vmull_n_u16(res2, fwd_offset);
+    uint32x4_t acc3 = vmull_n_u16(res3, fwd_offset);
+    acc0 = vmlal_n_u16(acc0, d0, bck_offset);
+    acc1 = vmlal_n_u16(acc1, d1, bck_offset);
+    acc2 = vmlal_n_u16(acc2, d2, bck_offset);
+    acc3 = vmlal_n_u16(acc3, d3, bck_offset);
+
+    acc0 = vshrq_n_u32(acc0, DIST_PRECISION_BITS);
+    acc1 = vshrq_n_u32(acc1, DIST_PRECISION_BITS);
+    acc2 = vshrq_n_u32(acc2, DIST_PRECISION_BITS);
+    acc3 = vshrq_n_u32(acc3, DIST_PRECISION_BITS);
+
+    /* Remove the offset, round-shift and narrow to 16 bits. */
+    row0 = vqmovn_s32(
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(acc0), offset), shift));
+    row1 = vqmovn_s32(
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(acc1), offset), shift));
+    row2 = vqmovn_s32(
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(acc2), offset), shift));
+    row3 = vqmovn_s32(
+        vqrshlq_s32(vsubq_s32(vreinterpretq_s32_u32(acc3), offset), shift));
+  } else {
+    const int16x4_t shift = vdup_n_s16(-round_bits);
+
+    row0 = vsub_s16(vreinterpret_s16_u16(vhadd_u16(res0, d0)), sub_const_vec);
+    row1 = vsub_s16(vreinterpret_s16_u16(vhadd_u16(res1, d1)), sub_const_vec);
+    row2 = vsub_s16(vreinterpret_s16_u16(vhadd_u16(res2, d2)), sub_const_vec);
+    row3 = vsub_s16(vreinterpret_s16_u16(vhadd_u16(res3, d3)), sub_const_vec);
+
+    row0 = vqrshl_s16(row0, shift);
+    row1 = vqrshl_s16(row1, shift);
+    row2 = vqrshl_s16(row2, shift);
+    row3 = vqrshl_s16(row3, shift);
+  }
+
+  /* Common tail: pair the rows, clamp at zero, narrow to 8 bits. */
+  const int16x8_t zero = vdupq_n_s16(0);
+  const int16x8_t rows01 = vmaxq_s16(vcombine_s16(row0, row1), zero);
+  const int16x8_t rows23 = vmaxq_s16(vcombine_s16(row2, row3), zero);
+
+  *t0 = vqmovn_u16(vreinterpretq_u16_s16(rows01));
+  *t1 = vqmovn_u16(vreinterpretq_u16_s16(rows23));
+}
+/*
+ * Blends four rows of eight 16-bit values (res0..res3 with d0..d3) and
+ * narrows them to four rows of eight 8-bit values written to *t0..*t3.
+ *
+ * With use_jnt_comp_avg set the blend is
+ * (res * fwd_offset + d * bck_offset) >> DIST_PRECISION_BITS in 32 bits;
+ * otherwise it is the halving add of res and d. sub_const is then
+ * subtracted, a saturating rounding right shift by round_bits is applied,
+ * negative values are clamped to zero and the result is narrowed to
+ * unsigned 8 bits with saturation.
+ */
+static INLINE void compute_avg_8x4(
+ uint16x8_t res0, uint16x8_t res1, uint16x8_t res2, uint16x8_t res3,
+ uint16x8_t d0, uint16x8_t d1, uint16x8_t d2, uint16x8_t d3,
+ const uint16_t fwd_offset, const uint16_t bck_offset,
+ const int16x4_t sub_const, const int16_t round_bits,
+ const int use_jnt_comp_avg, uint8x8_t *t0, uint8x8_t *t1, uint8x8_t *t2,
+ uint8x8_t *t3) {
+ int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int16x8_t f0, f1, f2, f3;
+ uint32x4_t sum0, sum1, sum2, sum3;
+ uint32x4_t sum4, sum5, sum6, sum7;
+ int32x4_t dst0, dst1, dst2, dst3;
+ int32x4_t dst4, dst5, dst6, dst7;
+ uint16x8_t tmp_u0, tmp_u1, tmp_u2, tmp_u3;
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ if (use_jnt_comp_avg) {
+ const int32x4_t sub_const_vec = vmovl_s16(sub_const);
+ const int32x4_t round_bits_vec = vdupq_n_s32(-(int32_t)round_bits);
+ // Weighted sums in 32 bits. sum0/sum1: low halves of rows 0 and 1.
+ sum0 = vmull_n_u16(vget_low_u16(res0), fwd_offset);
+ sum0 = vmlal_n_u16(sum0, vget_low_u16(d0), bck_offset);
+ sum1 = vmull_n_u16(vget_low_u16(res1), fwd_offset);
+ sum1 = vmlal_n_u16(sum1, vget_low_u16(d1), bck_offset);
+ sum0 = vshrq_n_u32(sum0, DIST_PRECISION_BITS);
+ sum1 = vshrq_n_u32(sum1, DIST_PRECISION_BITS);
+ // sum2/sum3: high halves of rows 0 and 1.
+ sum2 = vmull_n_u16(vget_high_u16(res0), fwd_offset);
+ sum2 = vmlal_n_u16(sum2, vget_high_u16(d0), bck_offset);
+ sum3 = vmull_n_u16(vget_high_u16(res1), fwd_offset);
+ sum3 = vmlal_n_u16(sum3, vget_high_u16(d1), bck_offset);
+ sum2 = vshrq_n_u32(sum2, DIST_PRECISION_BITS);
+ sum3 = vshrq_n_u32(sum3, DIST_PRECISION_BITS);
+ // sum4/sum5: low halves of rows 2 and 3.
+ sum4 = vmull_n_u16(vget_low_u16(res2), fwd_offset);
+ sum4 = vmlal_n_u16(sum4, vget_low_u16(d2), bck_offset);
+ sum5 = vmull_n_u16(vget_low_u16(res3), fwd_offset);
+ sum5 = vmlal_n_u16(sum5, vget_low_u16(d3), bck_offset);
+ sum4 = vshrq_n_u32(sum4, DIST_PRECISION_BITS);
+ sum5 = vshrq_n_u32(sum5, DIST_PRECISION_BITS);
+ // sum6/sum7: high halves of rows 2 and 3.
+ sum6 = vmull_n_u16(vget_high_u16(res2), fwd_offset);
+ sum6 = vmlal_n_u16(sum6, vget_high_u16(d2), bck_offset);
+ sum7 = vmull_n_u16(vget_high_u16(res3), fwd_offset);
+ sum7 = vmlal_n_u16(sum7, vget_high_u16(d3), bck_offset);
+ sum6 = vshrq_n_u32(sum6, DIST_PRECISION_BITS);
+ sum7 = vshrq_n_u32(sum7, DIST_PRECISION_BITS);
+ // Subtract sub_const (widened to 32 bits) from every partial row.
+ dst0 = vsubq_s32(vreinterpretq_s32_u32(sum0), sub_const_vec);
+ dst1 = vsubq_s32(vreinterpretq_s32_u32(sum1), sub_const_vec);
+ dst2 = vsubq_s32(vreinterpretq_s32_u32(sum2), sub_const_vec);
+ dst3 = vsubq_s32(vreinterpretq_s32_u32(sum3), sub_const_vec);
+ dst4 = vsubq_s32(vreinterpretq_s32_u32(sum4), sub_const_vec);
+ dst5 = vsubq_s32(vreinterpretq_s32_u32(sum5), sub_const_vec);
+ dst6 = vsubq_s32(vreinterpretq_s32_u32(sum6), sub_const_vec);
+ dst7 = vsubq_s32(vreinterpretq_s32_u32(sum7), sub_const_vec);
+ // Saturating rounding shift; the negative count shifts right by round_bits.
+ dst0 = vqrshlq_s32(dst0, round_bits_vec);
+ dst1 = vqrshlq_s32(dst1, round_bits_vec);
+ dst2 = vqrshlq_s32(dst2, round_bits_vec);
+ dst3 = vqrshlq_s32(dst3, round_bits_vec);
+ dst4 = vqrshlq_s32(dst4, round_bits_vec);
+ dst5 = vqrshlq_s32(dst5, round_bits_vec);
+ dst6 = vqrshlq_s32(dst6, round_bits_vec);
+ dst7 = vqrshlq_s32(dst7, round_bits_vec);
+ // Saturating narrow from 32 to 16 bits.
+ tmp0 = vqmovn_s32(dst0);
+ tmp1 = vqmovn_s32(dst1);
+ tmp2 = vqmovn_s32(dst2);
+ tmp3 = vqmovn_s32(dst3);
+ tmp4 = vqmovn_s32(dst4);
+ tmp5 = vqmovn_s32(dst5);
+ tmp6 = vqmovn_s32(dst6);
+ tmp7 = vqmovn_s32(dst7);
+ // Rejoin low and high halves: f0..f3 are full rows 0..3.
+ f0 = vcombine_s16(tmp0, tmp2);
+ f1 = vcombine_s16(tmp1, tmp3);
+ f2 = vcombine_s16(tmp4, tmp6);
+ f3 = vcombine_s16(tmp5, tmp7);
+ // Clamp negatives to zero so the unsigned reinterpretation below is valid.
+ f0 = vmaxq_s16(f0, zero);
+ f1 = vmaxq_s16(f1, zero);
+ f2 = vmaxq_s16(f2, zero);
+ f3 = vmaxq_s16(f3, zero);
+ // Saturating narrow from 16 to 8 bits.
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+ *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+ *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+
+ } else {
+ const int16x8_t sub_const_vec = vcombine_s16(sub_const, sub_const);
+ const int16x8_t round_bits_vec = vdupq_n_s16(-round_bits);
+ // Unweighted path: halving add, i.e. (res + d) >> 1 without overflow.
+ tmp_u0 = vhaddq_u16(res0, d0);
+ tmp_u1 = vhaddq_u16(res1, d1);
+ tmp_u2 = vhaddq_u16(res2, d2);
+ tmp_u3 = vhaddq_u16(res3, d3);
+ // Subtract sub_const in 16-bit lanes.
+ f0 = vsubq_s16(vreinterpretq_s16_u16(tmp_u0), sub_const_vec);
+ f1 = vsubq_s16(vreinterpretq_s16_u16(tmp_u1), sub_const_vec);
+ f2 = vsubq_s16(vreinterpretq_s16_u16(tmp_u2), sub_const_vec);
+ f3 = vsubq_s16(vreinterpretq_s16_u16(tmp_u3), sub_const_vec);
+ // Saturating rounding shift; the negative count shifts right by round_bits.
+ f0 = vqrshlq_s16(f0, round_bits_vec);
+ f1 = vqrshlq_s16(f1, round_bits_vec);
+ f2 = vqrshlq_s16(f2, round_bits_vec);
+ f3 = vqrshlq_s16(f3, round_bits_vec);
+ // Clamp negatives to zero so the unsigned reinterpretation below is valid.
+ f0 = vmaxq_s16(f0, zero);
+ f1 = vmaxq_s16(f1, zero);
+ f2 = vmaxq_s16(f2, zero);
+ f3 = vmaxq_s16(f3, zero);
+ // Saturating narrow from 16 to 8 bits.
+ *t0 = vqmovn_u16(vreinterpretq_u16_s16(f0));
+ *t1 = vqmovn_u16(vreinterpretq_u16_s16(f1));
+ *t2 = vqmovn_u16(vreinterpretq_u16_s16(f2));
+ *t3 = vqmovn_u16(vreinterpretq_u16_s16(f3));
+ }
+}
+/*
+ * Horizontal pass of the two-pass convolve: filters 8-bit rows from src with
+ * the eight taps in x_filter_tmp and writes 16-bit results to im_block.
+ *
+ * src / src_stride     8-bit input rows.
+ * im_block / im_stride 16-bit output rows.
+ * x_filter_tmp         eight filter taps (indexed 0..7 by the convolve8_*
+ *                      helpers).
+ * im_h                 number of output rows requested.
+ * w                    output width; w == 4 takes the narrow path, any other
+ *                      value is processed eight columns at a time.
+ * round_0              right shift (with rounding) applied to each sum.
+ *
+ * Every sum starts from 1 << (bd + FILTER_BITS - 2) with bd fixed at 8.
+ * On __aarch64__ rows are produced in groups of 4 (w == 4) or 8 (otherwise);
+ * on other targets one row is produced per iteration.
+ * NOTE(review): the grouped paths always write a whole group, so im_block is
+ * presumably sized for im_h rounded up to the group size -- confirm at the
+ * caller.
+ */
+static INLINE void jnt_convolve_2d_horiz_neon(
+ const uint8_t *src, int src_stride, int16_t *im_block, const int im_stride,
+ int16_t *x_filter_tmp, const int im_h, int w, const int round_0) {
+ const int bd = 8;
+ const uint8_t *s;
+ int16_t *dst_ptr;
+ int dst_stride;
+ int width, height;
+
+ dst_ptr = im_block;
+ dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ if (w == 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint8x8_t t0;
+ // Accumulator start value and rounding shift shared by every call below.
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(round_0));
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint8x8_t t1, t2, t3;
+#endif
+ do {
+ s = src;
+ __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+ // Four rows at a time: load 8 bytes of each row, transpose, widen to 16 bit.
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ s += 7;
+ // Same load/transpose/widen 7 bytes further along, giving s7..s10.
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s7 = vget_low_s16(tt0);
+ s8 = vget_low_s16(tt1);
+ s9 = vget_low_s16(tt2);
+ s10 = vget_low_s16(tt3);
+ // Four filter evaluations, each sliding the 8-vector window by one.
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+ // Transpose the results before they are stored one per output row.
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+
+ vst1_s16((dst_ptr + 0 * dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * dst_stride), d3);
+
+ src += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(dst_ptr);
+ s += 8;
+ t0 = vld1_u8(s); // a8 a9 a10 a11
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ // One row at a time: build the shifted windows with vext.
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+
+ vst1_s16(dst_ptr, d0);
+
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ } else {
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint8x8_t t0;
+ // Accumulator start value and rounding shift shared by every call below.
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(round_0));
+ do {
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ __builtin_prefetch(src + 0 * src_stride);
+ __builtin_prefetch(src + 1 * src_stride);
+ __builtin_prefetch(src + 2 * src_stride);
+ __builtin_prefetch(src + 3 * src_stride);
+ __builtin_prefetch(src + 4 * src_stride);
+ __builtin_prefetch(src + 5 * src_stride);
+ __builtin_prefetch(src + 6 * src_stride);
+ __builtin_prefetch(src + 7 * src_stride);
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ // s0..s6 prime the window; the inner loop supplies s7..s14 per 8 columns.
+ width = w;
+ s = src + 7;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+ // Eight filter evaluations, each sliding the 8-vector window by one.
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, horiz_const, shift_round_0);
+ // Transpose the results before they are stored one per output row.
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ store_s16_8x8(d_tmp, dst_stride, res0, res1, res2, res3, res4, res5,
+ res6, res7);
+ s0 = s8; // carry the last seven vectors over to the next 8 columns
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 8 * src_stride;
+ dst_ptr += 8 * dst_stride;
+ height -= 8;
+#else
+ int16x8_t temp_0;
+ t0 = vld1_u8(src);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src + 8;
+ d_tmp = dst_ptr;
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0; // current 8 samples; s0 keeps the next 8 for the next pass
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, horiz_const, shift_round_0);
+ vst1q_s16(d_tmp, res0);
+
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src += src_stride;
+ dst_ptr += dst_stride;
+ height -= 1;
+#endif
+ } while (height > 0);
+ }
+}
+/*
+ * Vertical pass of the 2-D compound convolution: filters the int16
+ * intermediate block `im_block` (row stride `im_stride`) with the 8 taps in
+ * `y_filter`, producing h rows by w columns.
+ *
+ * The block is walked in strips of 4 columns; inside a strip the rows are
+ * produced top to bottom, 4 at a time on aarch64 and 1 at a time otherwise,
+ * while a sliding window of input rows is kept in s0..s7 (s0..s10 on
+ * aarch64).
+ *
+ * Output depends on conv_params->do_average:
+ *  - zero: the 16-bit results are stored to conv_params->dst
+ *    (stride conv_params->dst_stride);
+ *  - non-zero: the values already in conv_params->dst are loaded and passed,
+ *    together with the new results, to compute_avg_4x4()/compute_avg_4x1();
+ *    the bytes those helpers return are stored to dst8 (stride dst8_stride)
+ *    and conv_params->dst is left unmodified.
+ *
+ * The loops step w by 4 (and, on aarch64, h by 4) without handling a
+ * remainder; the caller visible in this file asserts both are multiples of 4.
+ */
+static INLINE void jnt_convolve_2d_vert_neon(
+ int16_t *im_block, const int im_stride, uint8_t *dst8, int dst8_stride,
+ ConvolveParams *conv_params, const int16_t *y_filter, int h, int w) {
+ uint8_t *dst_u8_ptr, *d_u8;
+ CONV_BUF_TYPE *dst_ptr, *dst;
+ int16_t *src_ptr, *s;
+ uint16_t *d;
+
+ const int bd = 8; // hard-coded; presumably the bit depth -- TODO confirm
+ int height;
+ int dst_stride = conv_params->dst_stride;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)); // only consumed by compute_avg_* (via sub_const_vec)
+
+ const int16_t round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = bd + 2 * FILTER_BITS - conv_params->round_0; // same expression as offset_bits above
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1)); // negated count; presumably a right shift inside convolve8_4x4_s32 -- TODO confirm
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset);
+ const int16x4_t sub_const_vec = vdup_n_s16(sub_const);
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t res4, d0;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10;
+ uint16x4_t res5, res6, res7, d1, d2, d3;
+ uint8x8_t t1;
+#endif
+
+ dst = conv_params->dst;
+ src_ptr = im_block;
+ dst_u8_ptr = dst8;
+ dst_ptr = dst;
+ height = h; // re-assigned at the top of every strip below
+
+ do { // one iteration per strip of 4 columns
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ s = src_ptr;
+ height = h;
+
+ __builtin_prefetch(s + 0 * im_stride);
+ __builtin_prefetch(s + 1 * im_stride);
+ __builtin_prefetch(s + 2 * im_stride);
+ __builtin_prefetch(s + 3 * im_stride);
+ __builtin_prefetch(s + 4 * im_stride);
+ __builtin_prefetch(s + 5 * im_stride);
+ __builtin_prefetch(s + 6 * im_stride);
+ __builtin_prefetch(s + 7 * im_stride);
+
+ load_s16_4x8(s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); // s7 is loaded again inside the loop before it is used
+ s += (7 * im_stride); // continue from the 8th input row
+
+ do {
+#if defined(__aarch64__)
+ load_s16_4x4(s, im_stride, &s7, &s8, &s9, &s10); // four new input rows
+ s += (im_stride << 2);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 4 * dst8_stride); // note: rows 4..7 are prefetched while rows 0..3 are the ones stored below
+ __builtin_prefetch(d_u8 + 5 * dst8_stride);
+ __builtin_prefetch(d_u8 + 6 * dst8_stride);
+ __builtin_prefetch(d_u8 + 7 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+ d1 = convolve8_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const);
+ d2 = convolve8_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const);
+ d3 = convolve8_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); // values already present in conv_params->dst
+ d += (dst_stride << 2);
+
+ compute_avg_4x4(res4, res5, res6, res7, d0, d1, d2, d3, fwd_offset,
+ bck_offset, sub_const_vec, round_bits, use_jnt_comp_avg,
+ &t0, &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0); // 4 bytes per row: t0 lanes 0/1, then t1 lanes 0/1
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
+ d_u8 += dst8_stride;
+
+ } else {
+ store_u16_4x4(d, dst_stride, d0, d1, d2, d3);
+ d += (dst_stride << 2);
+ }
+ s0 = s4; // slide the 7-row history window down by 4 rows
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1_s16(s); // one new input row
+ s += (im_stride);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+
+ d0 = convolve8_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const);
+
+ if (do_average) {
+ res4 = vld1_u16(d); // values already present in conv_params->dst
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, d0, fwd_offset, bck_offset, sub_const_vec,
+ round_bits, use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+
+ } else {
+ vst1_u16(d, d0);
+ d += (dst_stride);
+ }
+ s0 = s1; // slide the 7-row history window down by 1 row
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 4; // move every base pointer to the next 4-column strip
+ dst_ptr += 4;
+ dst_u8_ptr += 4;
+ w -= 4;
+ } while (w > 0);
+}
+
+// Two-pass compound convolution: a horizontal pass fills an on-stack int16
+// scratch block, then a vertical pass consumes it and produces the final
+// output (see jnt_convolve_2d_vert_neon for where that output is written).
+//
+// w and h must be multiples of 4 (asserted). The kernels are selected from
+// filter_params_x / filter_params_y by the low SUBPEL_MASK bits of
+// subpel_x_q4 / subpel_y_q4.
+void av1_jnt_convolve_2d_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+                              int dst8_stride, int w, int h,
+                              const InterpFilterParams *filter_params_x,
+                              const InterpFilterParams *filter_params_y,
+                              const int subpel_x_q4, const int subpel_y_q4,
+                              ConvolveParams *conv_params) {
+  assert(!(w % 4));
+  assert(!(h % 4));
+
+  // Scratch for the horizontal pass; rows are MAX_SB_SIZE int16 apart.
+  DECLARE_ALIGNED(16, int16_t,
+                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+  const int im_stride = MAX_SB_SIZE;
+
+  // The vertical filter consumes taps - 1 rows in addition to the h outputs.
+  const int rows_to_filter = h + filter_params_y->taps - 1;
+
+  // Step back so that the first tap lines up with the first output sample.
+  const int rows_above = filter_params_y->taps / 2 - 1;
+  const int cols_left = filter_params_x->taps / 2 - 1;
+  const uint8_t *const first_tap = src - rows_above * src_stride - cols_left;
+
+  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
+      filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
+      filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+  // filter coeffs are even, so the x taps are downshifted by 1 to reduce
+  // intermediate precision requirements; the horizontal rounding amount handed
+  // to the first pass is reduced by one as well.
+  int16_t halved_kernel_x[8];
+  vst1q_s16(&halved_kernel_x[0], vshrq_n_s16(vld1q_s16(kernel_x), 1));
+  const int horiz_round = conv_params->round_0 - 1;
+
+  jnt_convolve_2d_horiz_neon(first_tap, src_stride, im_block, im_stride,
+                             halved_kernel_x, rows_to_filter, w, horiz_round);
+
+  jnt_convolve_2d_vert_neon(im_block, im_stride, dst8, dst8_stride, conv_params,
+                            kernel_y, h, w);
+}
+
+// "Copy" variant of the compound convolution: no filter is applied and the
+// filter / subpel arguments are ignored.
+//
+// Every source pixel is widened to 16 bits, shifted with vshl by
+// 2 * FILTER_BITS - round_1 - round_0 and has round_offset added. When
+// conv_params->do_average is zero those values are stored to
+// conv_params->dst; otherwise the existing conv_params->dst values and the new
+// ones go through compute_avg_8x4()/compute_avg_4x4() and the resulting bytes
+// are stored to dst8.
+//
+// Rows are handled four at a time (h >> 2 iterations). Widths that are a
+// multiple of 8 are processed in 8-column tiles; any other multiple of 4 has
+// only its first 4 columns processed; other widths are not processed at all.
+void av1_jnt_convolve_2d_copy_neon(const uint8_t *src, int src_stride,
+                                   uint8_t *dst8, int dst8_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  const int bd = 8;
+  const int dst_stride = conv_params->dst_stride;
+  const int16_t shift =
+      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+                           (1 << (offset_bits - conv_params->round_1 - 1));
+  const int16x4_t offset_s16 = vdup_n_s16((int16_t)round_offset);
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int row, col;
+
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  if ((w & 0x07) == 0) {
+    const uint16x8_t offset_u16 = vdupq_n_u16((uint16_t)round_offset);
+    const int16x8_t shift_vec = vdupq_n_s16(shift);
+
+    for (row = 0; row < (h >> 2); ++row) {
+      for (col = 0; col < (w >> 3); ++col) {
+        // Top-left corner of the current 8x4 tile in each buffer.
+        const uint8_t *const s = src + 8 * col;
+        CONV_BUF_TYPE *const d = dst + 8 * col;
+        uint8x8_t p0, p1, p2, p3;
+        uint16x8_t v0, v1, v2, v3;
+
+        load_u8_8x4(s, src_stride, &p0, &p1, &p2, &p3);
+
+        v0 = vaddq_u16(vshlq_u16(vmovl_u8(p0), shift_vec), offset_u16);
+        v1 = vaddq_u16(vshlq_u16(vmovl_u8(p1), shift_vec), offset_u16);
+        v2 = vaddq_u16(vshlq_u16(vmovl_u8(p2), shift_vec), offset_u16);
+        v3 = vaddq_u16(vshlq_u16(vmovl_u8(p3), shift_vec), offset_u16);
+
+        if (conv_params->do_average) {
+          uint8_t *const d8 = dst8 + 8 * col;
+          uint16x8_t a0, a1, a2, a3;
+          uint8x8_t o0, o1, o2, o3;
+
+          load_u16_8x4(d, dst_stride, &a0, &a1, &a2, &a3);
+
+          compute_avg_8x4(a0, a1, a2, a3, v0, v1, v2, v3,
+                          conv_params->fwd_offset, conv_params->bck_offset,
+                          offset_s16, shift, conv_params->use_jnt_comp_avg,
+                          &o0, &o1, &o2, &o3);
+
+          vst1_u8(d8 + (0 * dst8_stride), o0);
+          vst1_u8(d8 + (1 * dst8_stride), o1);
+          vst1_u8(d8 + (2 * dst8_stride), o2);
+          vst1_u8(d8 + (3 * dst8_stride), o3);
+        } else {
+          vst1q_u16(d + (0 * dst_stride), v0);
+          vst1q_u16(d + (1 * dst_stride), v1);
+          vst1q_u16(d + (2 * dst_stride), v2);
+          vst1q_u16(d + (3 * dst_stride), v3);
+        }
+      }
+      src += src_stride * 4;
+      dst8 += dst8_stride * 4;
+      dst += dst_stride * 4;
+    }
+  } else if ((w & 0x03) == 0) {
+    const int16x4_t shift_vec = vdup_n_s16(shift);
+    const uint16x4_t offset_u16 = vreinterpret_u16_s16(offset_s16);
+
+    for (row = 0; row < (h >> 2); ++row) {
+      uint8x8_t p0, p1, p2, p3;
+      uint16x4_t v0, v1, v2, v3;
+
+      // Only the low four lanes of each loaded row are used below.
+      load_u8_8x4(src, src_stride, &p0, &p1, &p2, &p3);
+
+      v0 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(p0)), shift_vec),
+                    offset_u16);
+      v1 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(p1)), shift_vec),
+                    offset_u16);
+      v2 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(p2)), shift_vec),
+                    offset_u16);
+      v3 = vadd_u16(vshl_u16(vget_low_u16(vmovl_u8(p3)), shift_vec),
+                    offset_u16);
+
+      if (conv_params->do_average) {
+        uint16x4_t a0, a1, a2, a3;
+        uint8x8_t o01, o23;
+
+        load_u16_4x4(dst, dst_stride, &a0, &a1, &a2, &a3);
+
+        compute_avg_4x4(a0, a1, a2, a3, v0, v1, v2, v3,
+                        conv_params->fwd_offset, conv_params->bck_offset,
+                        offset_s16, shift, conv_params->use_jnt_comp_avg, &o01,
+                        &o23);
+
+        // Each 8-byte result carries two 4-byte rows, one per 32-bit lane.
+        vst1_lane_u32((uint32_t *)(dst8 + 0 * dst8_stride),
+                      vreinterpret_u32_u8(o01), 0);
+        vst1_lane_u32((uint32_t *)(dst8 + 1 * dst8_stride),
+                      vreinterpret_u32_u8(o01), 1);
+        vst1_lane_u32((uint32_t *)(dst8 + 2 * dst8_stride),
+                      vreinterpret_u32_u8(o23), 0);
+        vst1_lane_u32((uint32_t *)(dst8 + 3 * dst8_stride),
+                      vreinterpret_u32_u8(o23), 1);
+      } else {
+        vst1_u16(dst + 0 * dst_stride, v0);
+        vst1_u16(dst + 1 * dst_stride, v1);
+        vst1_u16(dst + 2 * dst_stride, v2);
+        vst1_u16(dst + 3 * dst_stride, v3);
+      }
+      src += src_stride * 4;
+      dst += dst_stride * 4;
+      dst8 += dst8_stride * 4;
+    }
+  }
+}
+/*
+ * Horizontal-only compound convolution; filter_params_y and subpel_y_q4 are
+ * unused.
+ *
+ * The 8 taps selected by subpel_x_q4 are halved (>> 1) into x_filter_tmp and
+ * applied along each row, starting taps / 2 - 1 pixels to the left of `src`.
+ * Every convolve8 result is then shifted with vrshl by
+ * bits = FILTER_BITS - round_1 and has round_offset added.
+ *
+ * Output depends on conv_params->do_average:
+ *  - zero: the 16-bit results are stored to conv_params->dst
+ *    (stride conv_params->dst_stride);
+ *  - non-zero: the values already in conv_params->dst are loaded and passed,
+ *    together with the new results, to the compute_avg_ helpers; the bytes
+ *    they return are stored to dst8 (stride dst8_stride).
+ *
+ * Two paths are selected on the block size: 4-column steps when w == 4 or
+ * h == 4, 8-column steps otherwise. On aarch64 the rows are handled 4 (resp.
+ * 8) at a time around transposes; elsewhere one row at a time, building the
+ * shifted tap inputs with vext.
+ *
+ * w and h must be multiples of 4 (asserted).
+ */
+void av1_jnt_convolve_x_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int horiz_offset = filter_params_x->taps / 2 - 1; // src_ptr below starts this many pixels left of src
+ const int bits = FILTER_BITS - conv_params->round_1; // vrshl count applied after each convolve (see horiz_const)
+ const int bd = 8; // hard-coded; presumably the bit depth -- TODO confirm
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ const uint8_t *src_ptr = src - horiz_offset;
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // filter coeffs are even, so downshifting by 1 to reduce intermediate
+ // precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+ const uint8_t *s;
+ uint8_t *d_u8;
+ uint8_t *dst_u8_ptr;
+ CONV_BUF_TYPE *d, *dst_ptr;
+ int width, height;
+ uint8x8_t t0;
+#if defined(__aarch64__)
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+#endif
+ s = src_ptr;
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ width = w;
+ height = h;
+
+ if ((w == 4) || (h == 4)) { // narrow path: columns are consumed 4 at a time
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ int16x8_t tt0;
+ uint16x4_t res4;
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ int16x8_t tt1, tt2, tt3;
+ uint16x4_t res5, res6, res7;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0);
+ int16x8_t u0, u1;
+#else
+ int16x4_t temp_0;
+#endif
+ const int16x4_t zero = vdup_n_s16(0);
+ const int16x4_t round_offset_vec = vdup_n_s16(round_offset);
+ const int16x4_t shift_round_0 = vdup_n_s16(-conv_params->round_0 + 1); // -(round_0 - 1); the taps above were already halved
+ const int16x4_t horiz_const = vdup_n_s16(bits); // despite its name, used as the vrshl shift count below
+ do {
+ s = src_ptr;
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ width = w;
+ __builtin_prefetch(s + 0 * src_stride);
+#if defined(__aarch64__)
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3); // presumably leaves columns k and k+4 of the 4 rows in t_k -- helper not visible here
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s0 = vget_low_s16(tt0);
+ s1 = vget_low_s16(tt1);
+ s2 = vget_low_s16(tt2);
+ s3 = vget_low_s16(tt3);
+ s4 = vget_high_s16(tt0);
+ s5 = vget_high_s16(tt1);
+ s6 = vget_high_s16(tt2);
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+ s += 7; // s0..s6 already hold the first 7 inputs; continue from the 8th
+ do {
+ load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+ t0 = vreinterpret_u8_u32(tu0);
+ t1 = vreinterpret_u8_u32(tu1);
+
+ transpose_u8_4x4(&t0, &t1);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+
+ s7 = vget_low_s16(u0);
+ s8 = vget_low_s16(u1);
+ s9 = vget_high_s16(u0);
+ s10 = vget_high_s16(u1);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ zero, shift_round_0);
+ d1 = vrshl_s16(d1, horiz_const);
+ d1 = vadd_s16(d1, round_offset_vec);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ zero, shift_round_0);
+ d2 = vrshl_s16(d2, horiz_const);
+ d2 = vadd_s16(d2, round_offset_vec);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ zero, shift_round_0);
+ d3 = vrshl_s16(d3, horiz_const);
+ d3 = vadd_s16(d3, round_offset_vec);
+
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3); // presumably turns the results back into one vector per output row -- helper not visible here
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7); // values already present in conv_params->dst
+
+ compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset_vec, round_bits, use_jnt_comp_avg, &t0,
+ &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ vst1_lane_u32((uint32_t *)(d_u8 + dst8_stride),
+ vreinterpret_u32_u8(t0),
+ 1); // 10 11 12 13
+ vst1_lane_u32((uint32_t *)(d_u8 + 2 * dst8_stride),
+ vreinterpret_u32_u8(t1),
+ 0); // 20 21 22 23
+ vst1_lane_u32((uint32_t *)(d_u8 + 3 * dst8_stride),
+ vreinterpret_u32_u8(t1),
+ 1); // 30 31 32 33
+ } else {
+ store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3));
+ }
+
+ s0 = s4; // slide the 7-entry history window forward by 4 columns
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride << 2); // next group of 4 rows
+ dst_ptr += (dst_stride << 2);
+ dst_u8_ptr += (dst8_stride << 2);
+ height -= 4;
+#else
+ t0 = vld1_u8(s); // a0 a1 a2 a3 a4 a5 a6 a7
+ tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+ s0 = vget_low_s16(tt0); // a0 a1 a2 a3
+ s4 = vget_high_s16(tt0); // a4 a5 a6 a7
+ __builtin_prefetch(d);
+
+ s += 8;
+ do {
+ t0 = vld1_u8(s); // a8 a9 a10 a11
+
+ // a8 a9 a10 a11
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ temp_0 = s7; // keep a8..a11 intact; s7 itself is overwritten just below
+ s1 = vext_s16(s0, s4, 1); // a1 a2 a3 a4
+ s2 = vext_s16(s0, s4, 2); // a2 a3 a4 a5
+ s3 = vext_s16(s0, s4, 3); // a3 a4 a5 a6
+ s5 = vext_s16(s4, s7, 1); // a5 a6 a7 a8
+ s6 = vext_s16(s4, s7, 2); // a6 a7 a8 a9
+ s7 = vext_s16(s4, s7, 3); // a7 a8 a9 a10
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+ d0 = vrshl_s16(d0, horiz_const);
+ d0 = vadd_s16(d0, round_offset_vec);
+ s0 = s4; // slide: the next iteration starts 4 pixels further right
+ s4 = temp_0;
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+ __builtin_prefetch(d_u8);
+
+ res4 = vld1_u16(d); // values already present in conv_params->dst
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset_vec, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0),
+ 0); // 00 01 02 03
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ }
+
+ s += 4;
+ width -= 4;
+ d += 4;
+ d_u8 += 4;
+ } while (width > 0);
+ src_ptr += (src_stride); // next row
+ dst_ptr += (dst_stride);
+ dst_u8_ptr += (dst8_stride);
+ height--;
+#endif
+ } while (height > 0);
+ } else { // wide path: neither w nor h equals 4; columns are consumed 8 at a time
+ CONV_BUF_TYPE *d_tmp;
+ uint8_t *d_u8_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t shift_round_0 = vdupq_n_s16(-conv_params->round_0 + 1); // -(round_0 - 1); the taps above were already halved
+ const int16x8_t horiz_const = vdupq_n_s16(bits); // despite its name, used as the vrshlq shift count below
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ d = dst_ptr = dst; // dst_ptr and dst_u8_ptr already held these values before the branch
+ d_u8 = dst_u8_ptr = dst8;
+ do {
+#if defined(__aarch64__)
+ int16x8_t s11, s12, s13, s14;
+ int16x8_t s8, s9, s10;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res9, res10, res11;
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src_ptr + 7; // s0..s6 already hold the first 7 inputs; continue from the 8th
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * dst_stride);
+
+ do {
+ d_u8 = d_u8_tmp; // d_u8 / d_tmp walk down the rows of the current 8-column tile
+ d_tmp = d;
+
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ zero, shift_round_0);
+ res1 = vrshlq_s16(res1, horiz_const);
+ res1 = vaddq_s16(res1, round_offset128);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ zero, shift_round_0);
+ res2 = vrshlq_s16(res2, horiz_const);
+ res2 = vaddq_s16(res2, round_offset128);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ zero, shift_round_0);
+ res3 = vrshlq_s16(res3, horiz_const);
+ res3 = vaddq_s16(res3, round_offset128);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
+ zero, shift_round_0);
+ res4 = vrshlq_s16(res4, horiz_const);
+ res4 = vaddq_s16(res4, round_offset128);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, zero, shift_round_0);
+ res5 = vrshlq_s16(res5, horiz_const);
+ res5 = vaddq_s16(res5, round_offset128);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, zero, shift_round_0);
+ res6 = vrshlq_s16(res6, horiz_const);
+ res6 = vaddq_s16(res6, round_offset128);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, zero, shift_round_0);
+ res7 = vrshlq_s16(res7, horiz_const);
+ res7 = vaddq_s16(res7, round_offset128);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ if (conv_params->do_average) {
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); // first 4 rows of the tile (paired with res0..res3)
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11); // last 4 rows of the tile (paired with res4..res7)
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+ } else {
+ store_u16_8x8(
+ d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7));
+ d_tmp += (dst_stride << 3);
+ }
+
+ s0 = s8; // slide the 7-entry history window forward by 8 columns
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride; // next band of 8 rows
+ dst_ptr += 8 * dst_stride;
+ dst_u8_ptr += 8 * dst8_stride;
+ height -= 8;
+#else
+ int16x8_t temp_0;
+ __builtin_prefetch(src_ptr);
+ t0 = vld1_u8(src_ptr);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); // a0 a1 a2 a3 a4 a5 a6 a7
+
+ width = w;
+ s = src_ptr + 8;
+ d = dst_ptr;
+ d_u8_tmp = dst_u8_ptr;
+
+ __builtin_prefetch(dst_ptr);
+
+ do {
+ d_u8 = d_u8_tmp; // reloaded every iteration, so the row-stride increments below do not carry over
+ d_tmp = d;
+
+ t0 = vld1_u8(s); // a8 a9 a10 a11 a12 a13 a14 a15
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ temp_0 = s0; // temp_0: current 8 pixels; s0 now keeps the next 8 for the following iteration
+ s0 = s7;
+
+ s1 = vextq_s16(temp_0, s7, 1); // a1 a2 a3 a4 a5 a6 a7 a8
+ s2 = vextq_s16(temp_0, s7, 2); // a2 a3 a4 a5 a6 a7 a8 a9
+ s3 = vextq_s16(temp_0, s7, 3); // a3 a4 a5 a6 a7 a8 a9 a10
+ s4 = vextq_s16(temp_0, s7, 4); // a4 a5 a6 a7 a8 a9 a10 a11
+ s5 = vextq_s16(temp_0, s7, 5); // a5 a6 a7 a8 a9 a10 a11 a12
+ s6 = vextq_s16(temp_0, s7, 6); // a6 a7 a8 a9 a10 a11 a12 a13
+ s7 = vextq_s16(temp_0, s7, 7); // a7 a8 a9 a10 a11 a12 a13 a14
+
+ res0 = convolve8_8x8_s16(temp_0, s1, s2, s3, s4, s5, s6, s7,
+ x_filter_tmp, zero, shift_round_0);
+
+ res0 = vrshlq_s16(res0, horiz_const);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ if (conv_params->do_average) {
+ res8 = vld1q_u16(d_tmp); // values already present in conv_params->dst
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += (dst_stride);
+ }
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ d_u8_tmp += 8;
+ } while (width > 0);
+ src_ptr += src_stride; // next row
+ dst_ptr += dst_stride;
+ dst_u8_ptr += dst8_stride;
+ height--;
+#endif
+ } while (height > 0);
+ }
+}
+
+void av1_jnt_convolve_y_neon(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ assert(!(w % 4));
+ assert(!(h % 4));
+
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const uint16_t fwd_offset = conv_params->fwd_offset;
+ const uint16_t bck_offset = conv_params->bck_offset;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int shift_value = (conv_params->round_1 - 1 - bits);
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+
+ const uint8_t *src_ptr = src - (vert_offset * src_stride);
+
+ int16_t y_filter_tmp[8];
+ int16x8_t filter_y_coef = vld1q_s16(y_filter);
+
+ // filter coeffs are even, so downshifting by 1 to reduce intermediate
+ // precision requirements.
+ filter_y_coef = vshrq_n_s16(filter_y_coef, 1);
+ vst1q_s16(&y_filter_tmp[0], filter_y_coef);
+
+ const uint8_t *s;
+ uint8_t *d_u8;
+ uint8_t *dst_u8_ptr;
+ CONV_BUF_TYPE *d, *dst_ptr;
+ int width, height;
+
+ s = src_ptr;
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ width = w;
+ height = h;
+
+ // used to get rid of multiplication = (vertical filter output sum) *
+ // (1<<bits).
+ assert((conv_params->round_1 - 2) >= bits);
+
+ if ((w == 4) || (h == 4)) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
+ uint16x4_t res4;
+ uint32x2_t tu0 = vdup_n_u32(0), tu1 = vdup_n_u32(0), tu2 = vdup_n_u32(0),
+ tu3 = vdup_n_u32(0);
+ int16x8_t u0, u1, u2, u3;
+ uint8x8_t t0;
+
+#if defined(__aarch64__)
+ int16x4_t s8, s9, s10, d1, d2, d3;
+ uint16x4_t res5, res6, res7;
+ uint8x8_t t1;
+#endif
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x4_t shift_vec = vdup_n_s16(-shift_value);
+ const int16x4_t zero = vdup_n_s16(0);
+
+ do {
+ s = src_ptr;
+ d = dst_ptr;
+ d_u8 = dst_u8_ptr;
+ height = h;
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
+
+ load_unaligned_u8_4x8(s, src_stride, &tu0, &tu1, &tu2, &tu3);
+
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+ u2 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu2)));
+ u3 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu3)));
+
+ s0 = vget_low_s16(u0);
+ s1 = vget_high_s16(u0);
+ s2 = vget_low_s16(u1);
+ s3 = vget_high_s16(u1);
+ s4 = vget_low_s16(u2);
+ s5 = vget_high_s16(u2);
+ s6 = vget_low_s16(u3);
+
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ s += (7 * src_stride);
+ do {
+#if defined(__aarch64__)
+ load_unaligned_u8_4x4(s, src_stride, &tu0, &tu1);
+
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ u1 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu1)));
+
+ s7 = vget_low_s16(u0);
+ s8 = vget_high_s16(u0);
+ s9 = vget_low_s16(u1);
+ s10 = vget_high_s16(u1);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ d0 = vadd_s16(d0, round_offset64);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+ zero, shift_vec);
+ d1 = vadd_s16(d1, round_offset64);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+ zero, shift_vec);
+ d2 = vadd_s16(d2, round_offset64);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ zero, shift_vec);
+ d3 = vadd_s16(d3, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d + 0 * dst_stride);
+ __builtin_prefetch(d + 1 * dst_stride);
+ __builtin_prefetch(d + 2 * dst_stride);
+ __builtin_prefetch(d + 3 * dst_stride);
+
+ __builtin_prefetch(d_u8 + 0 * dst8_stride);
+ __builtin_prefetch(d_u8 + 1 * dst8_stride);
+ __builtin_prefetch(d_u8 + 2 * dst8_stride);
+ __builtin_prefetch(d_u8 + 3 * dst8_stride);
+
+ load_u16_4x4(d, dst_stride, &res4, &res5, &res6, &res7);
+ d += (dst_stride << 2);
+
+ compute_avg_4x4(res4, res5, res6, res7, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0,
+ &t1);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 1);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 0);
+ d_u8 += dst8_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t1), 1);
+ d_u8 += dst8_stride;
+ } else {
+ store_u16_4x4(d, dst_stride, vreinterpret_u16_s16(d0),
+ vreinterpret_u16_s16(d1), vreinterpret_u16_s16(d2),
+ vreinterpret_u16_s16(d3));
+ d += (dst_stride << 2);
+ }
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+
+ s += (src_stride << 2);
+ height -= 4;
+#else
+ load_unaligned_u8_4x1(s, src_stride, &tu0);
+ u0 = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(tu0)));
+ s7 = vget_low_s16(u0);
+
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+
+ d0 = vadd_s16(d0, round_offset64);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d);
+
+ res4 = vld1_u16(d);
+ d += (dst_stride);
+
+ compute_avg_4x1(res4, vreinterpret_u16_s16(d0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(t0), 0);
+ d_u8 += dst8_stride;
+ } else {
+ vst1_u16(d, vreinterpret_u16_s16(d0));
+ d += (dst_stride);
+ }
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ s += (src_stride);
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 4;
+ dst_ptr += 4;
+ dst_u8_ptr += 4;
+ width -= 4;
+ } while (width > 0);
+ } else {
+ CONV_BUF_TYPE *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ int16x8_t res0;
+ uint16x8_t res8;
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ const int16x8_t round_offset128 = vdupq_n_s16(round_offset);
+ const int16x8_t shift_vec = vdupq_n_s16(-shift_value);
+ const int16x4_t round_offset64 = vdup_n_s16(round_offset);
+ const int16x8_t zero = vdupq_n_s16(0);
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res1, res2, res3, res4, res5, res6, res7;
+ uint16x8_t res10, res11, res9;
+#endif
+ dst_ptr = dst;
+ dst_u8_ptr = dst8;
+ do {
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ height = h;
+ s = src_ptr + (7 * src_stride);
+ d_tmp = dst_ptr;
+ d_u8 = dst_u8_ptr;
+
+ do {
+#if defined(__aarch64__)
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter_tmp,
+ zero, shift_vec);
+ res1 = vaddq_s16(res1, round_offset128);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter_tmp,
+ zero, shift_vec);
+ res2 = vaddq_s16(res2, round_offset128);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter_tmp,
+ zero, shift_vec);
+ res3 = vaddq_s16(res3, round_offset128);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, y_filter_tmp,
+ zero, shift_vec);
+ res4 = vaddq_s16(res4, round_offset128);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ y_filter_tmp, zero, shift_vec);
+ res5 = vaddq_s16(res5, round_offset128);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ y_filter_tmp, zero, shift_vec);
+ res6 = vaddq_s16(res6, round_offset128);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ y_filter_tmp, zero, shift_vec);
+ res7 = vaddq_s16(res7, round_offset128);
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp + 0 * dst8_stride);
+ __builtin_prefetch(d_tmp + 1 * dst8_stride);
+ __builtin_prefetch(d_tmp + 2 * dst8_stride);
+ __builtin_prefetch(d_tmp + 3 * dst8_stride);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+
+ load_u16_8x4(d_tmp, dst_stride, &res8, &res9, &res10, &res11);
+ d_tmp += (dst_stride << 2);
+
+ compute_avg_8x4(
+ res8, res9, res10, res11, vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7), fwd_offset, bck_offset,
+ round_offset64, round_bits, use_jnt_comp_avg, &t0, &t1, &t2, &t3);
+
+ store_u8_8x4(d_u8, dst8_stride, t0, t1, t2, t3);
+ d_u8 += (dst8_stride << 2);
+ } else {
+ store_u16_8x8(
+ d_tmp, dst_stride, vreinterpretq_u16_s16(res0),
+ vreinterpretq_u16_s16(res1), vreinterpretq_u16_s16(res2),
+ vreinterpretq_u16_s16(res3), vreinterpretq_u16_s16(res4),
+ vreinterpretq_u16_s16(res5), vreinterpretq_u16_s16(res6),
+ vreinterpretq_u16_s16(res7));
+ d_tmp += (dst_stride << 3);
+ }
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += (8 * src_stride);
+ height -= 8;
+#else
+ s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s)));
+
+ __builtin_prefetch(dst_ptr);
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter_tmp,
+ zero, shift_vec);
+ res0 = vaddq_s16(res0, round_offset128);
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+
+ if (conv_params->do_average) {
+ __builtin_prefetch(d_tmp);
+
+ res8 = vld1q_u16(d_tmp);
+ d_tmp += (dst_stride);
+
+ compute_avg_8x1(res8, vreinterpretq_u16_s16(res0), fwd_offset,
+ bck_offset, round_offset64, round_bits,
+ use_jnt_comp_avg, &t0);
+
+ vst1_u8(d_u8, t0);
+ d_u8 += (dst8_stride);
+ } else {
+ vst1q_u16(d_tmp, vreinterpretq_u16_s16(res0));
+ d_tmp += dst_stride;
+ }
+
+ s += (src_stride);
+ height--;
+#endif
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ dst_u8_ptr += 8;
+ width -= 8;
+ } while (width > 0);
+ }
+}
diff --git a/third_party/aom/av1/common/arm/mem_neon.h b/third_party/aom/av1/common/arm/mem_neon.h
new file mode 100644
index 0000000000..c4ae2e7849
--- /dev/null
+++ b/third_party/aom/av1/common/arm/mem_neon.h
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_MEM_NEON_H_
+#define AOM_AV1_COMMON_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+
+// Stores two rows of 8 bytes: s0 at s and s1 at s + p (p is the row pitch in
+// bytes).
+static INLINE void store_row2_u8_8x8(uint8_t *s, int p, const uint8x8_t s0,
+                                     const uint8x8_t s1) {
+  uint8_t *const second_row = s + p;
+
+  vst1_u8(s, s0);
+  vst1_u8(second_row, s1);
+}
+
+/* Loads 4 bytes from `s` into 32-bit lane `lane` of the uint8x8_t pointed to
+   by `s0`; the other lane of *(s0) is left as it was.
+   This is a macro rather than a function because vld1_lane_u32 requires
+   `lane` to be an immediate (compile-time constant) value.
+   The source is only read, so it is accessed through a const-qualified
+   pointer; this avoids discarding `const` when `s` is a `const uint8_t *`.
+   NOTE(review): the bytes are read through a uint32_t pointer, so `s` is
+   presumably expected to be suitably aligned for that access -- confirm
+   against callers. The load_unaligned_u8_* helpers in this header go through
+   memcpy instead. */
+#define load_u8_4x1(s, s0, lane)                                     \
+  do {                                                               \
+    *(s0) = vreinterpret_u8_u32(vld1_lane_u32(                       \
+        (const uint32_t *)(s), vreinterpret_u32_u8(*(s0)), (lane))); \
+  } while (0)
+
+// Loads eight consecutive rows of 8 bytes each. Row k is read from
+// s + k * p (p is the row pitch in bytes) into *s0 (k = 0) .. *s7 (k = 7).
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3,
+                               uint8x8_t *const s4, uint8x8_t *const s5,
+                               uint8x8_t *const s6, uint8x8_t *const s7) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+  *s4 = vld1_u8(s + 4 * p);
+  *s5 = vld1_u8(s + 5 * p);
+  *s6 = vld1_u8(s + 6 * p);
+  *s7 = vld1_u8(s + 7 * p);
+}
+
+// Loads four consecutive rows of 16 bytes each (vld1q_u8), row k from
+// s + k * p, into *s0 .. *s3.
+// Note: despite the "8x16" in the name, the body reads 16 bytes per row for
+// 4 rows, exactly like load_u8_16x4 in this header.
+static INLINE void load_u8_8x16(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s + 0 * p);
+  *s1 = vld1q_u8(s + 1 * p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+}
+
+// Loads four consecutive rows of 8 bytes each, row k from s + k * p (p is the
+// row pitch in bytes), into *s0 .. *s3.
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+                               uint8x8_t *const s0, uint8x8_t *const s1,
+                               uint8x8_t *const s2, uint8x8_t *const s3) {
+  *s0 = vld1_u8(s + 0 * p);
+  *s1 = vld1_u8(s + 1 * p);
+  *s2 = vld1_u8(s + 2 * p);
+  *s3 = vld1_u8(s + 3 * p);
+}
+
+// Loads four rows of four uint16_t values each, row k from s + k * p (p is
+// the row pitch in uint16_t elements), into *s0 .. *s3.
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x4_t *const s0, uint16x4_t *const s1,
+                                uint16x4_t *const s2, uint16x4_t *const s3) {
+  *s0 = vld1_u16(s + 0 * p);
+  *s1 = vld1_u16(s + 1 * p);
+  *s2 = vld1_u16(s + 2 * p);
+  *s3 = vld1_u16(s + 3 * p);
+}
+
+// Loads four rows of eight uint16_t values each, row k from s + k * p (p is
+// the row pitch in uint16_t elements), into *s0 .. *s3.
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+                                uint16x8_t *const s0, uint16x8_t *const s1,
+                                uint16x8_t *const s2, uint16x8_t *const s3) {
+  *s0 = vld1q_u16(s + 0 * p);
+  *s1 = vld1q_u16(s + 1 * p);
+  *s2 = vld1q_u16(s + 2 * p);
+  *s3 = vld1q_u16(s + 3 * p);
+}
+
+// Loads eight rows of four int16_t values each, row k from s + k * p (p is
+// the row pitch in int16_t elements), into *s0 (k = 0) .. *s7 (k = 7).
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3,
+                                int16x4_t *const s4, int16x4_t *const s5,
+                                int16x4_t *const s6, int16x4_t *const s7) {
+  *s0 = vld1_s16(s + 0 * p);
+  *s1 = vld1_s16(s + 1 * p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+  *s4 = vld1_s16(s + 4 * p);
+  *s5 = vld1_s16(s + 5 * p);
+  *s6 = vld1_s16(s + 6 * p);
+  *s7 = vld1_s16(s + 7 * p);
+}
+
+// Loads four rows of four int16_t values each, row k from s + k * p (p is
+// the row pitch in int16_t elements), into *s0 .. *s3.
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+                                int16x4_t *const s0, int16x4_t *const s1,
+                                int16x4_t *const s2, int16x4_t *const s3) {
+  *s0 = vld1_s16(s + 0 * p);
+  *s1 = vld1_s16(s + 1 * p);
+  *s2 = vld1_s16(s + 2 * p);
+  *s3 = vld1_s16(s + 3 * p);
+}
+
+/* Stores 32-bit lane `lane` of the uint8x8_t value `s0` as 4 bytes at `s`.
+ These intrinsics require immediate values, so we must use #defines
+ to enforce that.
+ NOTE(review): the bytes are written through a uint32_t pointer, so `s` is
+ presumably expected to be suitably aligned for that access -- confirm
+ against callers. */
+#define store_u8_4x1(s, s0, lane) \
+ do { \
+ vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \
+ } while (0)
+
+// Stores eight rows of 8 bytes each: s0 goes to s, s1 to s + p, ... and s7 to
+// s + 7 * p (p is the row pitch in bytes).
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3, const uint8x8_t s4,
+                                const uint8x8_t s5, const uint8x8_t s6,
+                                const uint8x8_t s7) {
+  vst1_u8(s + 0 * p, s0);
+  vst1_u8(s + 1 * p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+  vst1_u8(s + 4 * p, s4);
+  vst1_u8(s + 5 * p, s5);
+  vst1_u8(s + 6 * p, s6);
+  vst1_u8(s + 7 * p, s7);
+}
+
+// Stores four rows of 8 bytes each: s0 goes to s, s1 to s + p, s2 to
+// s + 2 * p and s3 to s + 3 * p (p is the row pitch in bytes).
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+                                const uint8x8_t s1, const uint8x8_t s2,
+                                const uint8x8_t s3) {
+  vst1_u8(s + 0 * p, s0);
+  vst1_u8(s + 1 * p, s1);
+  vst1_u8(s + 2 * p, s2);
+  vst1_u8(s + 3 * p, s3);
+}
+
+// Stores four rows of 16 bytes each (vst1q_u8): s0 goes to s, s1 to s + p,
+// s2 to s + 2 * p and s3 to s + 3 * p (p is the row pitch in bytes).
+// Note: despite the "8x16" in the name, each row written is 16 bytes wide and
+// there are 4 rows.
+static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+                                 const uint8x16_t s1, const uint8x16_t s2,
+                                 const uint8x16_t s3) {
+  vst1q_u8(s + 0 * p, s0);
+  vst1q_u8(s + 1 * p, s1);
+  vst1q_u8(s + 2 * p, s2);
+  vst1q_u8(s + 3 * p, s3);
+}
+
+// Stores eight rows of eight uint16_t values each: row k (s0 .. s7) is
+// written at s + k * dst_stride (dst_stride is in uint16_t elements).
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3,
+                                 const uint16x8_t s4, const uint16x8_t s5,
+                                 const uint16x8_t s6, const uint16x8_t s7) {
+  vst1q_u16(s + 0 * dst_stride, s0);
+  vst1q_u16(s + 1 * dst_stride, s1);
+  vst1q_u16(s + 2 * dst_stride, s2);
+  vst1q_u16(s + 3 * dst_stride, s3);
+  vst1q_u16(s + 4 * dst_stride, s4);
+  vst1q_u16(s + 5 * dst_stride, s5);
+  vst1q_u16(s + 6 * dst_stride, s6);
+  vst1q_u16(s + 7 * dst_stride, s7);
+}
+
+// Stores four rows of four uint16_t values each: row k (s0 .. s3) is written
+// at s + k * dst_stride (dst_stride is in uint16_t elements).
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x4_t s0, const uint16x4_t s1,
+                                 const uint16x4_t s2, const uint16x4_t s3) {
+  vst1_u16(s + 0 * dst_stride, s0);
+  vst1_u16(s + 1 * dst_stride, s1);
+  vst1_u16(s + 2 * dst_stride, s2);
+  vst1_u16(s + 3 * dst_stride, s3);
+}
+
+// Stores four rows of eight uint16_t values each: row k (s0 .. s3) is written
+// at s + k * dst_stride (dst_stride is in uint16_t elements).
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+                                 const uint16x8_t s0, const uint16x8_t s1,
+                                 const uint16x8_t s2, const uint16x8_t s3) {
+  vst1q_u16(s + 0 * dst_stride, s0);
+  vst1q_u16(s + 1 * dst_stride, s1);
+  vst1q_u16(s + 2 * dst_stride, s2);
+  vst1q_u16(s + 3 * dst_stride, s3);
+}
+
+// Stores eight rows of eight int16_t values each: row k (s0 .. s7) is written
+// at s + k * dst_stride (dst_stride is in int16_t elements).
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3,
+                                 const int16x8_t s4, const int16x8_t s5,
+                                 const int16x8_t s6, const int16x8_t s7) {
+  vst1q_s16(s + 0 * dst_stride, s0);
+  vst1q_s16(s + 1 * dst_stride, s1);
+  vst1q_s16(s + 2 * dst_stride, s2);
+  vst1q_s16(s + 3 * dst_stride, s3);
+  vst1q_s16(s + 4 * dst_stride, s4);
+  vst1q_s16(s + 5 * dst_stride, s5);
+  vst1q_s16(s + 6 * dst_stride, s6);
+  vst1q_s16(s + 7 * dst_stride, s7);
+}
+
+// Stores four rows of four int16_t values each: row k (s0 .. s3) is written
+// at s + k * dst_stride (dst_stride is in int16_t elements).
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x4_t s0, const int16x4_t s1,
+                                 const int16x4_t s2, const int16x4_t s3) {
+  vst1_s16(s + 0 * dst_stride, s0);
+  vst1_s16(s + 1 * dst_stride, s1);
+  vst1_s16(s + 2 * dst_stride, s2);
+  vst1_s16(s + 3 * dst_stride, s3);
+}
+
+// Stores four rows of eight int16_t values each: row k (s0 .. s3) is written
+// at s + k * dst_stride (dst_stride is in int16_t elements).
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x8_t s0, const int16x8_t s1,
+                                 const int16x8_t s2, const int16x8_t s3) {
+  vst1q_s16(s + 0 * dst_stride, s0);
+  vst1q_s16(s + 1 * dst_stride, s1);
+  vst1q_s16(s + 2 * dst_stride, s2);
+  vst1q_s16(s + 3 * dst_stride, s3);
+}
+
+// Loads eight rows of eight int16_t values each, row k from s + k * p (p is
+// the row pitch in int16_t elements), into *s0 (k = 0) .. *s7 (k = 7).
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3,
+                                int16x8_t *const s4, int16x8_t *const s5,
+                                int16x8_t *const s6, int16x8_t *const s7) {
+  *s0 = vld1q_s16(s + 0 * p);
+  *s1 = vld1q_s16(s + 1 * p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+  *s4 = vld1q_s16(s + 4 * p);
+  *s5 = vld1q_s16(s + 5 * p);
+  *s6 = vld1q_s16(s + 6 * p);
+  *s7 = vld1q_s16(s + 7 * p);
+}
+
+// Loads four rows of eight int16_t values each, row k from s + k * p (p is
+// the row pitch in int16_t elements), into *s0 .. *s3.
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+                                int16x8_t *const s0, int16x8_t *const s1,
+                                int16x8_t *const s2, int16x8_t *const s3) {
+  *s0 = vld1q_s16(s + 0 * p);
+  *s1 = vld1q_s16(s + 1 * p);
+  *s2 = vld1q_s16(s + 2 * p);
+  *s3 = vld1q_s16(s + 3 * p);
+}
+
+// Reads eight rows of 4 bytes each from buf (rows are `stride` bytes apart)
+// using memcpy, and packs them two rows per vector: rows 0/1 become lanes 0/1
+// of *tu0, rows 2/3 of *tu1, rows 4/5 of *tu2 and rows 6/7 of *tu3.
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0, uint32x2_t *tu1,
+                                         uint32x2_t *tu2, uint32x2_t *tu3) {
+  const ptrdiff_t pitch = stride;
+  uint32_t word;
+
+  memcpy(&word, buf + 0 * pitch, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 0);
+  memcpy(&word, buf + 1 * pitch, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 1);
+
+  memcpy(&word, buf + 2 * pitch, sizeof(word));
+  *tu1 = vset_lane_u32(word, *tu1, 0);
+  memcpy(&word, buf + 3 * pitch, sizeof(word));
+  *tu1 = vset_lane_u32(word, *tu1, 1);
+
+  memcpy(&word, buf + 4 * pitch, sizeof(word));
+  *tu2 = vset_lane_u32(word, *tu2, 0);
+  memcpy(&word, buf + 5 * pitch, sizeof(word));
+  *tu2 = vset_lane_u32(word, *tu2, 1);
+
+  memcpy(&word, buf + 6 * pitch, sizeof(word));
+  *tu3 = vset_lane_u32(word, *tu3, 0);
+  memcpy(&word, buf + 7 * pitch, sizeof(word));
+  *tu3 = vset_lane_u32(word, *tu3, 1);
+}
+
+// Reads four rows of 4 bytes each from buf (rows are `stride` bytes apart)
+// using memcpy, and packs them two rows per vector: rows 0/1 become lanes 0/1
+// of *tu0 and rows 2/3 become lanes 0/1 of *tu1.
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0, uint32x2_t *tu1) {
+  const ptrdiff_t pitch = stride;
+  uint32_t word;
+
+  memcpy(&word, buf + 0 * pitch, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 0);
+  memcpy(&word, buf + 1 * pitch, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 1);
+
+  memcpy(&word, buf + 2 * pitch, sizeof(word));
+  *tu1 = vset_lane_u32(word, *tu1, 0);
+  memcpy(&word, buf + 3 * pitch, sizeof(word));
+  *tu1 = vset_lane_u32(word, *tu1, 1);
+}
+
+// Reads a single row of 4 bytes from buf using memcpy and places it in lane 0
+// of *tu0; lane 1 of *tu0 is left as it was.
+static INLINE void load_unaligned_u8_4x1(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0) {
+  uint32_t word;
+
+  // Only one row is read, so the stride never affects the result; it stays in
+  // the signature so existing callers are unaffected.
+  (void)stride;
+
+  memcpy(&word, buf, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 0);
+}
+
+// Reads two rows of 4 bytes each from buf (rows are `stride` bytes apart)
+// using memcpy: row 0 becomes lane 0 of *tu0 and row 1 becomes lane 1.
+static INLINE void load_unaligned_u8_4x2(const uint8_t *buf, int stride,
+                                         uint32x2_t *tu0) {
+  uint32_t word;
+
+  memcpy(&word, buf, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 0);
+  memcpy(&word, buf + stride, sizeof(word));
+  *tu0 = vset_lane_u32(word, *tu0, 1);
+}
+
+// Reads two rows of 2 bytes each from buf (rows are `stride` bytes apart)
+// using memcpy: row 0 becomes 16-bit lane 0 of *tu0 and row 1 becomes lane 1.
+// Lanes 2 and 3 of *tu0 are left as they were.
+static INLINE void load_unaligned_u8_2x2(const uint8_t *buf, int stride,
+                                         uint16x4_t *tu0) {
+  uint16_t pair;
+
+  memcpy(&pair, buf, sizeof(pair));
+  *tu0 = vset_lane_u16(pair, *tu0, 0);
+  memcpy(&pair, buf + stride, sizeof(pair));
+  *tu0 = vset_lane_u16(pair, *tu0, 1);
+}
+
+// Loads eight consecutive rows of 16 bytes each, row k from s + k * p (p is
+// the row pitch in bytes), into *s0 (k = 0) .. *s7 (k = 7).
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3,
+                                uint8x16_t *const s4, uint8x16_t *const s5,
+                                uint8x16_t *const s6, uint8x16_t *const s7) {
+  *s0 = vld1q_u8(s + 0 * p);
+  *s1 = vld1q_u8(s + 1 * p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+  *s4 = vld1q_u8(s + 4 * p);
+  *s5 = vld1q_u8(s + 5 * p);
+  *s6 = vld1q_u8(s + 6 * p);
+  *s7 = vld1q_u8(s + 7 * p);
+}
+
+// Loads four consecutive rows of 16 bytes each, row k from s + k * p (p is
+// the row pitch in bytes), into *s0 .. *s3.
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+                                uint8x16_t *const s0, uint8x16_t *const s1,
+                                uint8x16_t *const s2, uint8x16_t *const s3) {
+  *s0 = vld1q_u8(s + 0 * p);
+  *s1 = vld1q_u8(s + 1 * p);
+  *s2 = vld1q_u8(s + 2 * p);
+  *s3 = vld1q_u8(s + 3 * p);
+}
+
+// Reads four rows of four uint16_t values (8 bytes per row) from buf using
+// memcpy; rows are `stride` uint16_t elements apart. Rows 0/1 become 64-bit
+// lanes 0/1 of *tu0 and rows 2/3 become lanes 0/1 of *tu1.
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+                                          uint64x2_t *tu0, uint64x2_t *tu1) {
+  // Walk the rows by chaining pointers so the address arithmetic matches a
+  // row-by-row advance exactly.
+  const uint16_t *const row0 = buf;
+  const uint16_t *const row1 = row0 + stride;
+  const uint16_t *const row2 = row1 + stride;
+  const uint16_t *const row3 = row2 + stride;
+  uint64_t quad;
+
+  memcpy(&quad, row0, sizeof(quad));
+  *tu0 = vsetq_lane_u64(quad, *tu0, 0);
+  memcpy(&quad, row1, sizeof(quad));
+  *tu0 = vsetq_lane_u64(quad, *tu0, 1);
+
+  memcpy(&quad, row2, sizeof(quad));
+  *tu1 = vsetq_lane_u64(quad, *tu1, 0);
+  memcpy(&quad, row3, sizeof(quad));
+  *tu1 = vsetq_lane_u64(quad, *tu1, 1);
+}
+
+// Loads four rows of four int32_t values each, row k from s + k * p (p is the
+// row pitch in int32_t elements), into *s1 (first row) .. *s4 (last row).
+// The source is only read, so it is taken as a pointer to const; callers that
+// pass a plain `int32_t *` are unaffected.
+static INLINE void load_s32_4x4(const int32_t *s, int32_t p, int32x4_t *s1,
+                                int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+  *s1 = vld1q_s32(s);
+  s += p;
+  *s2 = vld1q_s32(s);
+  s += p;
+  *s3 = vld1q_s32(s);
+  s += p;
+  *s4 = vld1q_s32(s);
+}
+
+// Stores four rows of four int32_t values each: s1 goes to s, s2 to s + p,
+// s3 to s + 2 * p and s4 to s + 3 * p (p is the row pitch in int32_t
+// elements).
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+                                 int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+  // Chain the row pointers so the address arithmetic matches a row-by-row
+  // advance exactly.
+  int32_t *const row0 = s;
+  int32_t *const row1 = row0 + p;
+  int32_t *const row2 = row1 + p;
+  int32_t *const row3 = row2 + p;
+
+  vst1q_s32(row0, s1);
+  vst1q_s32(row1, s2);
+  vst1q_s32(row2, s3);
+  vst1q_s32(row3, s4);
+}
+
+// Loads four rows of four uint32_t values each, row k from s + k * p (p is
+// the row pitch in uint32_t elements), into *s1 (first row) .. *s4 (last row).
+// The source is only read, so it is taken as a pointer to const; callers that
+// pass a plain `uint32_t *` are unaffected.
+static INLINE void load_u32_4x4(const uint32_t *s, int32_t p, uint32x4_t *s1,
+                                uint32x4_t *s2, uint32x4_t *s3,
+                                uint32x4_t *s4) {
+  *s1 = vld1q_u32(s);
+  s += p;
+  *s2 = vld1q_u32(s);
+  s += p;
+  *s3 = vld1q_u32(s);
+  s += p;
+  *s4 = vld1q_u32(s);
+}
+
+// Stores four rows of four uint32_t values each: s1 goes to s, s2 to s + p,
+// s3 to s + 2 * p and s4 to s + 3 * p (p is the row pitch in uint32_t
+// elements).
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+                                 uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+  // Chain the row pointers so the address arithmetic matches a row-by-row
+  // advance exactly.
+  uint32_t *const row0 = s;
+  uint32_t *const row1 = row0 + p;
+  uint32_t *const row2 = row1 + p;
+  uint32_t *const row3 = row2 + p;
+
+  vst1q_u32(row0, s1);
+  vst1q_u32(row1, s2);
+  vst1q_u32(row2, s3);
+  vst1q_u32(row3, s4);
+}
+
+#endif // AOM_AV1_COMMON_ARM_MEM_NEON_H_
diff --git a/third_party/aom/av1/common/arm/reconinter_neon.c b/third_party/aom/av1/common/arm/reconinter_neon.c
new file mode 100644
index 0000000000..44e064195e
--- /dev/null
+++ b/third_party/aom/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,86 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+// Builds a per-pixel blend mask from the absolute difference of two 16-bit
+// prediction buffers.
+//
+// For every pixel:
+//   d = round_shift(|src0 - src1|, round) >> DIFF_FACTOR_LOG2
+//   m = min(38 + d, AOM_BLEND_A64_MAX_ALPHA)
+// and the value written is m for DIFFWTD_38, or AOM_BLEND_A64_MAX_ALPHA - m
+// for the other (inverse) mask type.
+//
+// The mask is written contiguously (w bytes per row, no separate stride).
+// Widths >= 8 are processed 8 pixels at a time; width 4 is processed two rows
+// at a time; any other width writes nothing.
+void av1_build_compound_diffwtd_mask_d16_neon(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  assert(h >= 4);
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
+
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  // vrshlq_u16 with a negative count performs a rounding right shift.
+  const int16x8_t neg_round = vdupq_n_s16((int16_t)(-round));
+  const uint8x8_t base_38 = vdup_n_u8(38);
+  const uint8x8_t max_alpha = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
+  // All-ones selects the direct mask, all-zeros selects the inverted one.
+  const uint8x8_t use_direct = vdup_n_u8((mask_type == DIFFWTD_38) ? 255 : 0);
+
+  if (w >= 8) {
+    for (int row = 0; row < h; ++row) {
+      for (int col = 0; col < w; col += 8) {
+        const CONV_BUF_TYPE *const a = src0 + col;
+        const CONV_BUF_TYPE *const b = src1 + col;
+        __builtin_prefetch(a);
+        __builtin_prefetch(b);
+
+        const uint16x8_t abs_diff =
+            vrshlq_u16(vabdq_u16(vld1q_u16(a), vld1q_u16(b)), neg_round);
+        const uint8x8_t scaled = vshrn_n_u16(abs_diff, DIFF_FACTOR_LOG2);
+        const uint8x8_t direct = vmin_u8(vadd_u8(scaled, base_38), max_alpha);
+        const uint8x8_t inverse = vsub_u8(max_alpha, direct);
+
+        vst1_u8(mask, vbsl_u8(use_direct, direct, inverse));
+        mask += 8;
+      }
+      src0 += src0_stride;
+      src1 += src1_stride;
+    }
+    return;
+  }
+
+  if (w != 4) {
+    return;
+  }
+
+  // Width 4: combine two 4-pixel rows into one 8-lane vector per iteration.
+  for (int row = 0; row < h; row += 2) {
+    const CONV_BUF_TYPE *const a0 = src0;
+    const CONV_BUF_TYPE *const a1 = src0 + src0_stride;
+    const CONV_BUF_TYPE *const b0 = src1;
+    const CONV_BUF_TYPE *const b1 = src1 + src1_stride;
+    __builtin_prefetch(a0);
+    __builtin_prefetch(a1);
+    __builtin_prefetch(b0);
+    __builtin_prefetch(b1);
+
+    const uint16x8_t rows_a = vcombine_u16(vld1_u16(a0), vld1_u16(a1));
+    const uint16x8_t rows_b = vcombine_u16(vld1_u16(b0), vld1_u16(b1));
+    const uint16x8_t abs_diff =
+        vrshlq_u16(vabdq_u16(rows_a, rows_b), neg_round);
+    const uint8x8_t scaled = vshrn_n_u16(abs_diff, DIFF_FACTOR_LOG2);
+    const uint8x8_t direct = vmin_u8(vadd_u8(scaled, base_38), max_alpha);
+    const uint8x8_t inverse = vsub_u8(max_alpha, direct);
+
+    vst1_u8(mask, vbsl_u8(use_direct, direct, inverse));
+
+    src0 += src0_stride * 2;
+    src1 += src1_stride * 2;
+    mask += w * 2;
+  }
+}
diff --git a/third_party/aom/av1/common/arm/selfguided_neon.c b/third_party/aom/av1/common/arm/selfguided_neon.c
new file mode 100644
index 0000000000..b3a37c4cb8
--- /dev/null
+++ b/third_party/aom/av1/common/arm/selfguided_neon.c
@@ -0,0 +1,1508 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+// Constants used for right shift in final_filter calculation.
+#define NB_EVEN 5
+#define NB_ODD 4
+
+// Processes a tile of 4 rows x 4 columns (one uint32x4_t per row) in four
+// steps:
+//   1. p = max(s[0..3] * const_n_val - s[4..7]^2, 0), lane-wise, pairing s0
+//      with s4, s1 with s5, and so on.
+//   2. p = min(round_shift(p * s_vec, SGRPROJ_MTABLE_BITS), const_val).
+//   3. p is written to src1 (4 rows, buf_stride apart); each value is then
+//      used as an index into x_by_xplus1[] and the looked-up value is written
+//      to the same position in dst_A16.
+//   4. src2 receives
+//        round_shift(sr[4..7] * one_by_n_minus_1_vec *
+//                    (sgrproj_sgr - dst_A16 value), SGRPROJ_RECIP_BITS).
+// All products are formed with 32-bit lane-wise multiplies (vmulq_u32).
+// src1, dst_A16 and src2 are all addressed with a row pitch of buf_stride
+// elements. x_by_xplus1 is a table defined outside this function.
+static INLINE void calc_ab_fast_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
+ int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
+ uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
+ uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
+ const int buf_stride) {
+ uint32x4_t q0, q1, q2, q3;
+ uint32x4_t p0, p1, p2, p3;
+ uint16x4_t d0, d1, d2, d3;
+
+ // Step 1a: scale the first four rows by const_n_val.
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+
+ // Step 1b: square the second four rows.
+ q0 = vmulq_u32(s4, s4);
+ q1 = vmulq_u32(s5, s5);
+ q2 = vmulq_u32(s6, s6);
+ q3 = vmulq_u32(s7, s7);
+
+ // Step 1c: lane mask that is all-ones where the subtraction below does not
+ // go negative (square <= scaled value).
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+
+ // Keep the difference where the mask is set and force it to 0 elsewhere.
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+
+ // Step 2: multiply by s_vec, rounding shift right, then cap at const_val.
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+
+ {
+ // Step 3: spill the vectors to src1 so each lane can index the table from
+ // scalar code, then reload the looked-up 16-bit values as vectors.
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+
+ // Here x is the row index and y the column index within the 4x4 tile.
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 4; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
+ }
+ // Step 4: widen (sgrproj_sgr - looked-up value) to 32 bits.
+ p0 = vsubl_u16(sgrproj_sgr, d0);
+ p1 = vsubl_u16(sgrproj_sgr, d1);
+ p2 = vsubl_u16(sgrproj_sgr, d2);
+ p3 = vsubl_u16(sgrproj_sgr, d3);
+
+ // sr4..sr7 are reinterpreted as unsigned before multiplying.
+ s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
+ s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
+ s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
+ s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
+
+ s4 = vmulq_u32(s4, p0);
+ s5 = vmulq_u32(s5, p1);
+ s6 = vmulq_u32(s6, p2);
+ s7 = vmulq_u32(s7, p3);
+
+ p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ // Write the four result rows to src2.
+ store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+}
+// Processes a tile of 4 rows x 8 columns in four steps. s0..s3 pair with
+// columns 0-3 (the low halves of s16_4..s16_7) and s4..s7 with columns 4-7
+// (the high halves).
+//   1. p = max(s[0..7] * const_n_val - (matching half of s16_[4..7])^2, 0).
+//   2. p = min(round_shift(p * s_vec, SGRPROJ_MTABLE_BITS), const_val).
+//   3. p is written to src1 (p0..p3 at columns 0-3, p4..p7 at columns 4-7);
+//      each value is then used as an index into x_by_xplus1[] and the
+//      looked-up value is written to the same position in dst_A16.
+//   4. dst2 receives
+//        round_shift(s16_[0..3] * one_by_n_minus_1_vec *
+//                    (sgrproj_sgr - dst_A16 value), SGRPROJ_RECIP_BITS),
+//      low halves at columns 0-3 and high halves at columns 4-7.
+// src1, dst_A16 and dst2 are all addressed with a row pitch of buf_stride
+// elements. x_by_xplus1 is a table defined outside this function.
+static INLINE void calc_ab_internal_common(
+ uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
+ uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
+ uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
+ uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
+ uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
+ uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
+ uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
+ uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
+ uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
+
+ // Step 1a: scale all eight 32-bit inputs by const_n_val.
+ s0 = vmulq_u32(s0, const_n_val);
+ s1 = vmulq_u32(s1, const_n_val);
+ s2 = vmulq_u32(s2, const_n_val);
+ s3 = vmulq_u32(s3, const_n_val);
+ s4 = vmulq_u32(s4, const_n_val);
+ s5 = vmulq_u32(s5, const_n_val);
+ s6 = vmulq_u32(s6, const_n_val);
+ s7 = vmulq_u32(s7, const_n_val);
+
+ // Split each 8-lane row of s16_4..s16_7 into its low (d0..d3) and high
+ // (d4..d7) halves.
+ d0 = vget_low_u16(s16_4);
+ d1 = vget_low_u16(s16_5);
+ d2 = vget_low_u16(s16_6);
+ d3 = vget_low_u16(s16_7);
+ d4 = vget_high_u16(s16_4);
+ d5 = vget_high_u16(s16_5);
+ d6 = vget_high_u16(s16_6);
+ d7 = vget_high_u16(s16_7);
+
+ // Step 1b: widening squares of the 16-bit values.
+ q0 = vmull_u16(d0, d0);
+ q1 = vmull_u16(d1, d1);
+ q2 = vmull_u16(d2, d2);
+ q3 = vmull_u16(d3, d3);
+ q4 = vmull_u16(d4, d4);
+ q5 = vmull_u16(d5, d5);
+ q6 = vmull_u16(d6, d6);
+ q7 = vmull_u16(d7, d7);
+
+ // Step 1c: lane mask that is all-ones where the subtraction below does not
+ // go negative (square <= scaled value).
+ p0 = vcleq_u32(q0, s0);
+ p1 = vcleq_u32(q1, s1);
+ p2 = vcleq_u32(q2, s2);
+ p3 = vcleq_u32(q3, s3);
+ p4 = vcleq_u32(q4, s4);
+ p5 = vcleq_u32(q5, s5);
+ p6 = vcleq_u32(q6, s6);
+ p7 = vcleq_u32(q7, s7);
+
+ q0 = vsubq_u32(s0, q0);
+ q1 = vsubq_u32(s1, q1);
+ q2 = vsubq_u32(s2, q2);
+ q3 = vsubq_u32(s3, q3);
+ q4 = vsubq_u32(s4, q4);
+ q5 = vsubq_u32(s5, q5);
+ q6 = vsubq_u32(s6, q6);
+ q7 = vsubq_u32(s7, q7);
+
+ // Keep the difference where the mask is set and force it to 0 elsewhere.
+ p0 = vandq_u32(p0, q0);
+ p1 = vandq_u32(p1, q1);
+ p2 = vandq_u32(p2, q2);
+ p3 = vandq_u32(p3, q3);
+ p4 = vandq_u32(p4, q4);
+ p5 = vandq_u32(p5, q5);
+ p6 = vandq_u32(p6, q6);
+ p7 = vandq_u32(p7, q7);
+
+ // Step 2: multiply by s_vec, rounding shift right, then cap at const_val.
+ p0 = vmulq_u32(p0, s_vec);
+ p1 = vmulq_u32(p1, s_vec);
+ p2 = vmulq_u32(p2, s_vec);
+ p3 = vmulq_u32(p3, s_vec);
+ p4 = vmulq_u32(p4, s_vec);
+ p5 = vmulq_u32(p5, s_vec);
+ p6 = vmulq_u32(p6, s_vec);
+ p7 = vmulq_u32(p7, s_vec);
+
+ p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
+ p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
+ p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
+ p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
+ p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
+ p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
+ p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
+ p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
+
+ p0 = vminq_u32(p0, const_val);
+ p1 = vminq_u32(p1, const_val);
+ p2 = vminq_u32(p2, const_val);
+ p3 = vminq_u32(p3, const_val);
+ p4 = vminq_u32(p4, const_val);
+ p5 = vminq_u32(p5, const_val);
+ p6 = vminq_u32(p6, const_val);
+ p7 = vminq_u32(p7, const_val);
+
+ {
+ // Step 3: spill the vectors to src1 (columns 0-3, then columns 4-7) so
+ // each lane can index the table from scalar code, then reload the
+ // looked-up 16-bit values as full 8-lane rows.
+ store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
+ store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
+
+ // Here x is the row index and y the column index within the 4x8 tile.
+ for (int x = 0; x < 4; x++) {
+ for (int y = 0; y < 8; y++) {
+ dst_A16[x * buf_stride + y] = x_by_xplus1[src1[x * buf_stride + y]];
+ }
+ }
+ load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
+ }
+
+ // Step 4: s16_4..s16_7 now hold (sgrproj_sgr - looked-up value), 16-bit.
+ s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
+ s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
+ s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
+ s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
+
+ // Widening multiply of s16_0..s16_3 by one_by_n_minus_1_vec: low halves
+ // into s0..s3, high halves into s4..s7.
+ s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
+ s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
+ s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
+ s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
+ s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
+ s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
+ s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
+ s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
+
+ // Multiply by the matching half of (sgrproj_sgr - looked-up value).
+ s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
+ s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
+ s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
+ s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
+ s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
+ s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
+ s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
+ s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
+
+ p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
+ p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
+ p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
+ p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
+ p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
+ p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
+ p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
+ p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
+
+ // Write the results to dst2: columns 0-3 first, then columns 4-7.
+ store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
+ vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
+ vreinterpretq_s32_u32(p3));
+ store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
+ vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
+ vreinterpretq_s32_u32(p7));
+}
+
+// Squares the eleven input rows t1..t11 (widening each lane to 32 bits) and
+// forms four 5-row sums of those squares, each window starting two rows after
+// the previous one:
+//   *r0 = t1^2 + t2^2 + t3^2 + t4^2  + t5^2
+//   *r1 = t3^2 + t4^2 + t5^2 + t6^2  + t7^2
+//   *r2 = t5^2 + t6^2 + t7^2 + t8^2  + t9^2
+//   *r3 = t7^2 + t8^2 + t9^2 + t10^2 + t11^2
+static INLINE void boxsum2_square_sum_calc(
+    int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
+    int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
+    int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
+  const int32x4_t sq1 = vmull_s16(t1, t1);
+  const int32x4_t sq2 = vmull_s16(t2, t2);
+  const int32x4_t sq3 = vmull_s16(t3, t3);
+  const int32x4_t sq4 = vmull_s16(t4, t4);
+  const int32x4_t sq5 = vmull_s16(t5, t5);
+  const int32x4_t sq6 = vmull_s16(t6, t6);
+  const int32x4_t sq7 = vmull_s16(t7, t7);
+  const int32x4_t sq8 = vmull_s16(t8, t8);
+  const int32x4_t sq9 = vmull_s16(t9, t9);
+  const int32x4_t sq10 = vmull_s16(t10, t10);
+  const int32x4_t sq11 = vmull_s16(t11, t11);
+
+  // Partial sums shared between neighbouring windows.
+  const int32x4_t sum345 = vaddq_s32(vaddq_s32(sq3, sq4), sq5);
+  const int32x4_t sum789 = vaddq_s32(vaddq_s32(sq7, sq8), sq9);
+
+  *r0 = vaddq_s32(vaddq_s32(sq1, sq2), sum345);
+  *r1 = vaddq_s32(sum345, vaddq_s32(sq6, sq7));
+  *r2 = vaddq_s32(vaddq_s32(sq5, sq6), sum789);
+  *r3 = vaddq_s32(sum789, vaddq_s32(sq10, sq11));
+}
+
+static INLINE void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
+ int32_t *dst32, int32_t *dst2, const int dst_stride,
+ const int width, const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *dst1_16_ptr, *src_ptr;
+ int32_t *dst2_ptr;
+ int h, w, count = 0;
+ const int dst_stride_2 = (dst_stride << 1);
+ const int dst_stride_8 = (dst_stride << 3);
+
+ dst1_16_ptr = dst16;
+ dst2_ptr = dst2;
+ src_ptr = src;
+ w = width;
+ {
+ int16x8_t t1, t2, t3, t4, t5, t6, t7;
+ int16x8_t t8, t9, t10, t11, t12;
+
+ int16x8_t q12345, q56789, q34567, q7891011;
+ int16x8_t q12, q34, q67, q89, q1011;
+ int16x8_t q345, q6789, q789;
+
+ int32x4_t r12345, r56789, r34567, r7891011;
+
+ do {
+ h = height;
+ dst1_16_ptr = dst16 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+
+ dst1_16_ptr += dst_stride_2;
+ dst2_ptr += dst_stride_2;
+ do {
+ load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
+ src_ptr += 4 * src_stride;
+ load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);
+
+ q12 = vaddq_s16(t1, t2);
+ q34 = vaddq_s16(t3, t4);
+ q67 = vaddq_s16(t6, t7);
+ q89 = vaddq_s16(t8, t9);
+ q1011 = vaddq_s16(t10, t11);
+ q345 = vaddq_s16(q34, t5);
+ q6789 = vaddq_s16(q67, q89);
+ q789 = vaddq_s16(q89, t7);
+ q12345 = vaddq_s16(q12, q345);
+ q34567 = vaddq_s16(q67, q345);
+ q56789 = vaddq_s16(t5, q6789);
+ q7891011 = vaddq_s16(q789, q1011);
+
+ store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
+ q7891011);
+ dst1_16_ptr += dst_stride_8;
+
+ boxsum2_square_sum_calc(
+ vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
+ vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
+ vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
+ vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
+
+ boxsum2_square_sum_calc(
+ vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
+ vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
+ vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
+ vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
+ &r7891011);
+
+ store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
+ r7891011);
+ dst2_ptr += (dst_stride_8);
+ h -= 8;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ {
+ int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int32x4_t q12345, q34567, q23456, q45678;
+ int32x4_t q23, q45, q67;
+ int32x4_t q2345, q4567;
+
+ int32x4_t r12345, r34567, r23456, r45678;
+ int32x4_t r23, r45, r67;
+ int32x4_t r2345, r4567;
+
+ int32_t *src2_ptr, *dst1_32_ptr;
+ int16_t *src1_ptr;
+ count = 0;
+ h = height;
+ do {
+ dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
+ dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
+ src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
+ w = width;
+
+ dst1_32_ptr += 2;
+ dst2_ptr += 2;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
+ transpose_s16_4x4d(&s1, &s2, &s3, &s4);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
+ transpose_s32_4x4(&d1, &d2, &d3, &d4);
+ do {
+ src1_ptr += 4;
+ src2_ptr += 4;
+ load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
+ transpose_s16_4x4d(&s5, &s6, &s7, &s8);
+ load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
+ transpose_s32_4x4(&d5, &d6, &d7, &d8);
+ q23 = vaddl_s16(s2, s3);
+ q45 = vaddl_s16(s4, s5);
+ q67 = vaddl_s16(s6, s7);
+ q2345 = vaddq_s32(q23, q45);
+ q4567 = vaddq_s32(q45, q67);
+ q12345 = vaddq_s32(vmovl_s16(s1), q2345);
+ q23456 = vaddq_s32(q2345, vmovl_s16(s6));
+ q34567 = vaddq_s32(q4567, vmovl_s16(s3));
+ q45678 = vaddq_s32(q4567, vmovl_s16(s8));
+
+ transpose_s32_4x4(&q12345, &q23456, &q34567, &q45678);
+ store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
+ q45678);
+ dst1_32_ptr += 4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+
+ r23 = vaddq_s32(d2, d3);
+ r45 = vaddq_s32(d4, d5);
+ r67 = vaddq_s32(d6, d7);
+ r2345 = vaddq_s32(r23, r45);
+ r4567 = vaddq_s32(r45, r67);
+ r12345 = vaddq_s32(d1, r2345);
+ r23456 = vaddq_s32(r2345, d6);
+ r34567 = vaddq_s32(r4567, d3);
+ r45678 = vaddq_s32(r4567, d8);
+
+ transpose_s32_4x4(&r12345, &r23456, &r34567, &r45678);
+ store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
+ dst2_ptr += 4;
+ d1 = d5;
+ d2 = d6;
+ d3 = d7;
+ d4 = d8;
+ w -= 4;
+ } while (w > 0);
+ h -= 8;
+ count++;
+ } while (h > 0);
+ }
+}
+
+ // Driver for calc_ab_internal_common() that applies no bit-depth shift.
+ // Walks A (32-bit) and B16 (16-bit) in tiles of 8 columns x 4 rows, rows
+ // being buf_stride elements apart, and hands every tile to the shared helper
+ // together with broadcast constants derived from r and s and the A / A16 / B
+ // tile pointers. Both loops are do/while, so one tile is always processed.
+ static INLINE void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
+                                         uint16_t *B16, int32_t *B,
+                                         const int buf_stride, const int width,
+                                         const int height, const int r,
+                                         const int s, const int ht_inc) {
+   // Element count of a (2r + 1) x (2r + 1) window; n - 1 indexes one_by_x[].
+   const uint32_t n = (2 * r + 1) * (2 * r + 1);
+   const uint32x4_t n_vec = vdupq_n_u32(n);
+   const uint16x8_t sgr_vec = vdupq_n_u16(SGRPROJ_SGR);
+   const uint16x4_t one_by_n_vec = vdup_n_u16(one_by_x[n - 1]);
+   const uint32x4_t vec_255 = vdupq_n_u32(255);
+   const uint32x4_t strength_vec = vdupq_n_u32(s);
+   int32_t band = 0;
+   int rows_left = height;
+
+   do {
+     // Every band starts four buffer rows below the previous one.
+     const int band_offset = (band << 2) * buf_stride;
+     int32_t *a_ptr = A + band_offset;
+     uint16_t *b16_ptr = B16 + band_offset;
+     uint16_t *a16_out = A16 + band_offset;
+     int32_t *b_out = B + band_offset;
+     int cols_left = width;
+
+     do {
+       uint32x4_t a0, a1, a2, a3, a4, a5, a6, a7;
+       uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+       // Columns 0-3 and 4-7 of four consecutive rows of A.
+       load_u32_4x4((uint32_t *)a_ptr, buf_stride, &a0, &a1, &a2, &a3);
+       load_u32_4x4((uint32_t *)a_ptr + 4, buf_stride, &a4, &a5, &a6, &a7);
+       load_u16_8x4(b16_ptr, buf_stride, &b0, &b1, &b2, &b3);
+
+       // The second group of 16-bit arguments is a plain copy of the first;
+       // no shift is applied in this variant.
+       b4 = b0;
+       b5 = b1;
+       b6 = b2;
+       b7 = b3;
+
+       calc_ab_internal_common(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3,
+                               b4, b5, b6, b7, n_vec, strength_vec, vec_255,
+                               one_by_n_vec, sgr_vec, a_ptr, a16_out, b_out,
+                               buf_stride);
+
+       cols_left -= 8;
+       a_ptr += 8;
+       b16_ptr += 8;
+       a16_out += 8;
+       b_out += 8;
+     } while (cols_left > 0);
+
+     band++;
+     // The height counter drops by ht_inc * 4 while the pointers advance by
+     // four buffer rows.
+     rows_left -= (ht_inc * 4);
+   } while (rows_left > 0);
+ }
+
+ // Driver for calc_ab_internal_common() that rescales its inputs by
+ // bit_depth. Walks A (32-bit) and B16 (16-bit) in tiles of 8 columns x 4
+ // rows. Before calling the shared helper, the A values are shifted by
+ // -2 * (bit_depth - 8) and a second copy of the B16 values by
+ // -(bit_depth - 8), using the rounding shift intrinsics vrshlq_u32 /
+ // vrshlq_u16; the unshifted B16 rows are passed as well. Both loops are
+ // do/while, so one tile is always processed.
+ static INLINE void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
+                                         uint16_t *B16, int32_t *B,
+                                         const int buf_stride, const int width,
+                                         const int height, const int bit_depth,
+                                         const int r, const int s,
+                                         const int ht_inc) {
+   // Element count of a (2r + 1) x (2r + 1) window; n - 1 indexes one_by_x[].
+   const uint32_t n = (2 * r + 1) * (2 * r + 1);
+   // Negative shift counts: vrshlq_* then shifts right with rounding.
+   const int16x8_t shift16_vec = vdupq_n_s16(-(bit_depth - 8));
+   const int32x4_t shift32_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+   const uint32x4_t n_vec = vdupq_n_u32(n);
+   const uint16x8_t sgr_vec = vdupq_n_u16(SGRPROJ_SGR);
+   const uint16x4_t one_by_n_vec = vdup_n_u16(one_by_x[n - 1]);
+   const uint32x4_t vec_255 = vdupq_n_u32(255);
+   const uint32x4_t strength_vec = vdupq_n_u32(s);
+   int32_t band = 0;
+   int rows_left = height;
+
+   do {
+     // Every band starts four buffer rows below the previous one.
+     const int band_offset = (band << 2) * buf_stride;
+     int32_t *a_ptr = A + band_offset;
+     uint16_t *b16_ptr = B16 + band_offset;
+     int32_t *b_out = B + band_offset;
+     uint16_t *a16_out = A16 + band_offset;
+     int cols_left = width;
+
+     do {
+       int32x4_t raw0, raw1, raw2, raw3, raw4, raw5, raw6, raw7;
+       uint32x4_t a0, a1, a2, a3, a4, a5, a6, a7;
+       uint16x8_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+       // Columns 0-3 and 4-7 of four consecutive rows of A.
+       load_s32_4x4(a_ptr, buf_stride, &raw0, &raw1, &raw2, &raw3);
+       load_s32_4x4(a_ptr + 4, buf_stride, &raw4, &raw5, &raw6, &raw7);
+       load_u16_8x4(b16_ptr, buf_stride, &b0, &b1, &b2, &b3);
+
+       a0 = vrshlq_u32(vreinterpretq_u32_s32(raw0), shift32_vec);
+       a1 = vrshlq_u32(vreinterpretq_u32_s32(raw1), shift32_vec);
+       a2 = vrshlq_u32(vreinterpretq_u32_s32(raw2), shift32_vec);
+       a3 = vrshlq_u32(vreinterpretq_u32_s32(raw3), shift32_vec);
+       a4 = vrshlq_u32(vreinterpretq_u32_s32(raw4), shift32_vec);
+       a5 = vrshlq_u32(vreinterpretq_u32_s32(raw5), shift32_vec);
+       a6 = vrshlq_u32(vreinterpretq_u32_s32(raw6), shift32_vec);
+       a7 = vrshlq_u32(vreinterpretq_u32_s32(raw7), shift32_vec);
+
+       // b4..b7 are the shifted versions of b0..b3.
+       b4 = vrshlq_u16(b0, shift16_vec);
+       b5 = vrshlq_u16(b1, shift16_vec);
+       b6 = vrshlq_u16(b2, shift16_vec);
+       b7 = vrshlq_u16(b3, shift16_vec);
+
+       calc_ab_internal_common(a0, a1, a2, a3, a4, a5, a6, a7, b0, b1, b2, b3,
+                               b4, b5, b6, b7, n_vec, strength_vec, vec_255,
+                               one_by_n_vec, sgr_vec, a_ptr, a16_out, b_out,
+                               buf_stride);
+
+       cols_left -= 8;
+       a_ptr += 8;
+       b16_ptr += 8;
+       a16_out += 8;
+       b_out += 8;
+     } while (cols_left > 0);
+
+     band++;
+     // The height counter drops by ht_inc * 4 while the pointers advance by
+     // four buffer rows.
+     rows_left -= (ht_inc * 4);
+   } while (rows_left > 0);
+ }
+
+ // Driver for calc_ab_fast_internal_common() that applies no bit-depth
+ // shift. Walks the 32-bit buffers A and B in tiles of 4 columns x 4 rows,
+ // rows being buf_stride elements apart, and forwards each tile (as unsigned
+ // reinterpretations plus the signed B rows) to the shared helper along with
+ // broadcast constants and the A / A16 / B tile pointers. Both loops are
+ // do/while, so one tile is always processed.
+ static INLINE void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
+                                              int32_t *B, const int buf_stride,
+                                              const int width, const int height,
+                                              const int r, const int s,
+                                              const int ht_inc) {
+   // Element count of a (2r + 1) x (2r + 1) window; n - 1 indexes one_by_x[].
+   const uint32_t n = (2 * r + 1) * (2 * r + 1);
+   const uint32x4_t n_vec = vdupq_n_u32(n);
+   const uint16x4_t sgr_vec = vdup_n_u16(SGRPROJ_SGR);
+   const uint32x4_t one_by_n_vec = vdupq_n_u32(one_by_x[n - 1]);
+   const uint32x4_t vec_255 = vdupq_n_u32(255);
+   const uint32x4_t strength_vec = vdupq_n_u32(s);
+   int32_t band = 0;
+   int rows_left = height;
+
+   do {
+     // Every band starts four buffer rows below the previous one.
+     const int band_offset = (band << 2) * buf_stride;
+     int32_t *a_ptr = A + band_offset;
+     int32_t *b_ptr = B + band_offset;
+     uint16_t *a16_out = A16 + band_offset;
+     int cols_left = width;
+
+     do {
+       int32x4_t a_s0, a_s1, a_s2, a_s3, b_s0, b_s1, b_s2, b_s3;
+       uint32x4_t a_u0, a_u1, a_u2, a_u3, b_u0, b_u1, b_u2, b_u3;
+
+       load_s32_4x4(a_ptr, buf_stride, &a_s0, &a_s1, &a_s2, &a_s3);
+       load_s32_4x4(b_ptr, buf_stride, &b_s0, &b_s1, &b_s2, &b_s3);
+
+       // Bit-for-bit reinterpretation only; values are not modified.
+       a_u0 = vreinterpretq_u32_s32(a_s0);
+       a_u1 = vreinterpretq_u32_s32(a_s1);
+       a_u2 = vreinterpretq_u32_s32(a_s2);
+       a_u3 = vreinterpretq_u32_s32(a_s3);
+       b_u0 = vreinterpretq_u32_s32(b_s0);
+       b_u1 = vreinterpretq_u32_s32(b_s1);
+       b_u2 = vreinterpretq_u32_s32(b_s2);
+       b_u3 = vreinterpretq_u32_s32(b_s3);
+
+       calc_ab_fast_internal_common(a_u0, a_u1, a_u2, a_u3, b_u0, b_u1, b_u2,
+                                    b_u3, b_s0, b_s1, b_s2, b_s3, n_vec,
+                                    strength_vec, vec_255, one_by_n_vec,
+                                    sgr_vec, a_ptr, a16_out, b_ptr, buf_stride);
+
+       cols_left -= 4;
+       a_ptr += 4;
+       b_ptr += 4;
+       a16_out += 4;
+     } while (cols_left > 0);
+
+     band++;
+     // The height counter drops by ht_inc * 4 while the pointers advance by
+     // four buffer rows.
+     rows_left -= (ht_inc * 4);
+   } while (rows_left > 0);
+ }
+
+ // Driver for calc_ab_fast_internal_common() that rescales its inputs by
+ // bit_depth. Walks the 32-bit buffers A and B in tiles of 4 columns x 4
+ // rows. The A values are shifted by -2 * (bit_depth - 8) and the B values
+ // by -(bit_depth - 8) with the rounding shift intrinsic vrshlq_u32; the
+ // unshifted signed B rows are passed to the helper as well. Both loops are
+ // do/while, so one tile is always processed.
+ static INLINE void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
+                                              int32_t *B, const int buf_stride,
+                                              const int width, const int height,
+                                              const int bit_depth, const int r,
+                                              const int s, const int ht_inc) {
+   // Element count of a (2r + 1) x (2r + 1) window; n - 1 indexes one_by_x[].
+   const uint32_t n = (2 * r + 1) * (2 * r + 1);
+   // Negative shift counts: vrshlq_u32 then shifts right with rounding.
+   const int32x4_t b_shift_vec = vdupq_n_s32(-(bit_depth - 8));
+   const int32x4_t a_shift_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
+   const uint32x4_t n_vec = vdupq_n_u32(n);
+   const uint16x4_t sgr_vec = vdup_n_u16(SGRPROJ_SGR);
+   const uint32x4_t one_by_n_vec = vdupq_n_u32(one_by_x[n - 1]);
+   const uint32x4_t vec_255 = vdupq_n_u32(255);
+   const uint32x4_t strength_vec = vdupq_n_u32(s);
+   int32_t band = 0;
+   int rows_left = height;
+
+   do {
+     // Every band starts four buffer rows below the previous one.
+     const int band_offset = (band << 2) * buf_stride;
+     int32_t *a_ptr = A + band_offset;
+     int32_t *b_ptr = B + band_offset;
+     uint16_t *a16_out = A16 + band_offset;
+     int cols_left = width;
+
+     do {
+       int32x4_t a_s0, a_s1, a_s2, a_s3, b_s0, b_s1, b_s2, b_s3;
+       uint32x4_t a_u0, a_u1, a_u2, a_u3, b_u0, b_u1, b_u2, b_u3;
+
+       load_s32_4x4(a_ptr, buf_stride, &a_s0, &a_s1, &a_s2, &a_s3);
+       load_s32_4x4(b_ptr, buf_stride, &b_s0, &b_s1, &b_s2, &b_s3);
+
+       a_u0 = vrshlq_u32(vreinterpretq_u32_s32(a_s0), a_shift_vec);
+       a_u1 = vrshlq_u32(vreinterpretq_u32_s32(a_s1), a_shift_vec);
+       a_u2 = vrshlq_u32(vreinterpretq_u32_s32(a_s2), a_shift_vec);
+       a_u3 = vrshlq_u32(vreinterpretq_u32_s32(a_s3), a_shift_vec);
+       b_u0 = vrshlq_u32(vreinterpretq_u32_s32(b_s0), b_shift_vec);
+       b_u1 = vrshlq_u32(vreinterpretq_u32_s32(b_s1), b_shift_vec);
+       b_u2 = vrshlq_u32(vreinterpretq_u32_s32(b_s2), b_shift_vec);
+       b_u3 = vrshlq_u32(vreinterpretq_u32_s32(b_s3), b_shift_vec);
+
+       calc_ab_fast_internal_common(a_u0, a_u1, a_u2, a_u3, b_u0, b_u1, b_u2,
+                                    b_u3, b_s0, b_s1, b_s2, b_s3, n_vec,
+                                    strength_vec, vec_255, one_by_n_vec,
+                                    sgr_vec, a_ptr, a16_out, b_ptr, buf_stride);
+
+       cols_left -= 4;
+       a_ptr += 4;
+       b_ptr += 4;
+       a16_out += 4;
+     } while (cols_left > 0);
+
+     band++;
+     // The height counter drops by ht_inc * 4 while the pointers advance by
+     // four buffer rows.
+     rows_left -= (ht_inc * 4);
+   } while (rows_left > 0);
+ }
+
+ // Two-pass 3-tap box sum of src and of src squared.
+ //   src / src_stride : 16-bit input and its row stride.
+ //   dst1             : 16-bit sums (written by pass 1, rewritten by pass 2).
+ //   dst2             : 32-bit sums of squares (same two-pass treatment).
+ //   dst_stride       : row stride shared by dst1 and dst2.
+ //   width / height   : extent to process; asserted to exceed twice the
+ //                      SGRPROJ horizontal / vertical border.
+ // Pass 1 adds three vertically adjacent rows, 8 columns at a time, and
+ // stores the result starting two rows into the destination. Pass 2 then adds
+ // three horizontally adjacent columns of those intermediate values in place,
+ // working on transposed 4x4 tiles, and stores starting two columns in.
+ // All loops are do/while, so widths and heights are effectively rounded up
+ // to the tile size (8 columns / 4 rows in pass 1, 4 x 4 in pass 2).
+ static INLINE void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
+ int32_t *dst2, const int dst_stride, const int width,
+ const int height) {
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ int16_t *src_ptr;
+ int32_t *dst2_ptr;
+ uint16_t *dst1_ptr;
+ int h, w, count = 0;
+
+ w = width;
+ // Pass 1: vertical 3-row sums, one strip of 8 columns per outer iteration.
+ {
+ int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
+ int16x8_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r345, r456, r567, r78, r678;
+ int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
+ int32x4_t r2, r3, r5, r6, r7, r8;
+ int16x8_t q678, q78;
+
+ do {
+ // count selects the current 8-column strip.
+ dst1_ptr = dst1 + (count << 3);
+ dst2_ptr = dst2 + (count << 3);
+ src_ptr = src + (count << 3);
+ h = height;
+
+ // Prime the rolling sums with the first four source rows.
+ load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+ src_ptr += 4 * src_stride;
+
+ q23 = vaddq_s16(s2, s3);
+ q234 = vaddq_s16(q23, s4);
+ q34 = vaddq_s16(s3, s4);
+ // Output starts two rows down in dst1.
+ dst1_ptr += (dst_stride << 1);
+
+ // Squares are widened to 32 bits; lanes 0-3 first, then lanes 4-7.
+ r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
+ r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
+ r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_low = vaddq_s32(r23, r4_low);
+ r34_low = vaddq_s32(r3, r4_low);
+
+ r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
+ r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
+ r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
+ r23 = vaddq_s32(r2, r3);
+ r234_high = vaddq_s32(r23, r4_high);
+ r34_high = vaddq_s32(r3, r4_high);
+
+ // Output starts two rows down in dst2 as well.
+ dst2_ptr += (dst_stride << 1);
+
+ do {
+ // Four new source rows per iteration produce four output rows.
+ load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
+ src_ptr += 4 * src_stride;
+
+ q345 = vaddq_s16(s5, q34);
+ q56 = vaddq_s16(s5, s6);
+ q456 = vaddq_s16(s4, q56);
+ q567 = vaddq_s16(s7, q56);
+ q78 = vaddq_s16(s7, s8);
+ q678 = vaddq_s16(s6, q78);
+
+ store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += (dst_stride << 2);
+
+ // Carry the last row and the partial sums into the next iteration.
+ s4 = s8;
+ q34 = q78;
+ q234 = q678;
+
+ // Same rolling scheme for the squared values, lanes 0-3.
+ r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
+ r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
+ r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
+ r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_low);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_low, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
+
+ r4_low = r8;
+ r34_low = r78;
+ r234_low = r678;
+
+ // Squared values, lanes 4-7.
+ r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
+ r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
+ r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
+ r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
+
+ r345 = vaddq_s32(r5, r34_high);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4_high, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
+ dst2_ptr += (dst_stride << 2);
+
+ r4_high = r8;
+ r34_high = r78;
+ r234_high = r678;
+
+ h -= 4;
+ } while (h > 0);
+ w -= 8;
+ count++;
+ } while (w > 0);
+ }
+
+ // Pass 2: horizontal 3-column sums over the pass-1 output, done in place.
+ // Each 4x4 tile is transposed so that a vector holds one column, the same
+ // rolling sum as in pass 1 is applied, and the result is transposed back.
+ {
+ int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
+ int16x4_t q23, q34, q56, q234, q345, q456, q567;
+ int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
+ int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
+ int16x4_t q678, q78;
+
+ int32_t *src2_ptr;
+ uint16_t *src1_ptr;
+ count = 0;
+ h = height;
+ w = width;
+ do {
+ // count selects the current band of four rows; the source and
+ // destination pointers address the same buffers.
+ dst1_ptr = dst1 + (count << 2) * dst_stride;
+ dst2_ptr = dst2 + (count << 2) * dst_stride;
+ src1_ptr = dst1 + (count << 2) * dst_stride;
+ src2_ptr = dst2 + (count << 2) * dst_stride;
+ w = width;
+
+ // Prime the rolling sums with the first four columns.
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
+ transpose_s16_4x4d(&d1, &d2, &d3, &d4);
+ load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
+ transpose_s32_4x4(&r1, &r2, &r3, &r4);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q23 = vadd_s16(d2, d3);
+ q234 = vadd_s16(q23, d4);
+ q34 = vadd_s16(d3, d4);
+ // Output starts two columns in, so stores trail the loads.
+ dst1_ptr += 2;
+ r23 = vaddq_s32(r2, r3);
+ r234 = vaddq_s32(r23, r4);
+ r34 = vaddq_s32(r3, r4);
+ dst2_ptr += 2;
+
+ do {
+ // The next four columns are loaded before the overlapping store below.
+ load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
+ transpose_s16_4x4d(&d5, &d6, &d7, &d8);
+ load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
+ transpose_s32_4x4(&r5, &r6, &r7, &r8);
+ src1_ptr += 4;
+ src2_ptr += 4;
+
+ q345 = vadd_s16(d5, q34);
+ q56 = vadd_s16(d5, d6);
+ q456 = vadd_s16(d4, q56);
+ q567 = vadd_s16(d7, q56);
+ q78 = vadd_s16(d7, d8);
+ q678 = vadd_s16(d6, q78);
+ // Back to row-major order before storing.
+ transpose_s16_4x4d(&q234, &q345, &q456, &q567);
+ store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
+ dst1_ptr += 4;
+
+ // Carry the last column and the partial sums into the next iteration.
+ d4 = d8;
+ q34 = q78;
+ q234 = q678;
+
+ r345 = vaddq_s32(r5, r34);
+ r56 = vaddq_s32(r5, r6);
+ r456 = vaddq_s32(r4, r56);
+ r567 = vaddq_s32(r7, r56);
+ r78 = vaddq_s32(r7, r8);
+ r678 = vaddq_s32(r6, r78);
+ transpose_s32_4x4(&r234, &r345, &r456, &r567);
+ store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
+ dst2_ptr += 4;
+
+ r4 = r8;
+ r34 = r78;
+ r234 = r678;
+ w -= 4;
+ } while (w > 0);
+ h -= 4;
+ count++;
+ } while (h > 0);
+ }
+ }
+
+ // Weighted 3x3 neighbourhood sum for four adjacent 32-bit positions
+ // starting at buf. The centre and its four edge neighbours get weight 4,
+ // the four diagonal neighbours weight 3. Rows are buf_stride elements
+ // apart; elements buf[-1] .. buf[4] of each of the three rows are read.
+ static INLINE int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
+   const int32_t *const above = buf - buf_stride;
+   const int32_t *const below = buf + buf_stride;
+
+   // Diagonal neighbours (weight 3).
+   const int32x4_t top_corners =
+       vaddq_s32(vld1q_s32(above - 1), vld1q_s32(above + 1));
+   const int32x4_t bottom_corners =
+       vaddq_s32(vld1q_s32(below - 1), vld1q_s32(below + 1));
+   const int32x4_t corners = vaddq_s32(top_corners, bottom_corners);
+
+   // Centre plus left / right / top / bottom neighbours (weight 4).
+   const int32x4_t horizontal = vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));
+   const int32x4_t vertical = vaddq_s32(vld1q_s32(above), vld1q_s32(below));
+   const int32x4_t plus =
+       vaddq_s32(vld1q_s32(buf), vaddq_s32(horizontal, vertical));
+
+   // 4 * (plus + corners) - corners == 4 * plus + 3 * corners.
+   return vsubq_s32(vshlq_n_s32(vaddq_s32(plus, corners), 2), corners);
+ }
+
+ // Weighted 3x3 neighbourhood sum for eight adjacent 16-bit positions
+ // starting at buf. The centre and its four edge neighbours get weight 4,
+ // the four diagonal neighbours weight 3. The two weighted partial sums are
+ // formed in 16-bit lanes and only then widened and added, so *a0 receives
+ // positions 0-3 and *a1 positions 4-7 as 32-bit values.
+ static INLINE void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
+                                      int32x4_t *a0, int32x4_t *a1) {
+   const uint16_t *const above = buf - buf_stride;
+   const uint16_t *const below = buf + buf_stride;
+
+   // Centre plus left / right / top / bottom neighbours, times 4.
+   const uint16x8_t vertical =
+       vaddq_u16(vld1q_u16(below), vld1q_u16(buf));
+   const uint16x8_t top_right = vaddq_u16(vld1q_u16(above), vld1q_u16(buf + 1));
+   const uint16x8_t plus =
+       vaddq_u16(vaddq_u16(vld1q_u16(buf - 1), vertical), top_right);
+   const uint16x8_t plus_x4 = vshlq_n_u16(plus, 2);
+
+   // Diagonal neighbours, times 3 (computed as 4x - x).
+   const uint16x8_t bottom_corners =
+       vaddq_u16(vld1q_u16(below - 1), vld1q_u16(below + 1));
+   const uint16x8_t top_corners =
+       vaddq_u16(vld1q_u16(above - 1), vld1q_u16(above + 1));
+   const uint16x8_t corners = vaddq_u16(top_corners, bottom_corners);
+   const uint16x8_t corners_x3 = vsubq_u16(vshlq_n_u16(corners, 2), corners);
+
+   const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(plus_x4)),
+                                       vmovl_u16(vget_low_u16(corners_x3)));
+   const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(plus_x4)),
+                                       vmovl_u16(vget_high_u16(corners_x3)));
+
+   *a0 = vreinterpretq_s32_u32(sum_lo);
+   *a1 = vreinterpretq_s32_u32(sum_hi);
+ }
+
+ // Weighted sum over the rows directly above and below buf for four adjacent
+ // 32-bit positions: the elements straight above / below get weight 6, the
+ // four diagonal neighbours weight 5. The row of buf itself is not read.
+ static INLINE int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
+   const int32_t *const above = buf - buf_stride;
+   const int32_t *const below = buf + buf_stride;
+
+   const int32x4_t top_corners =
+       vaddq_s32(vld1q_s32(above - 1), vld1q_s32(above + 1));
+   const int32x4_t bottom_corners =
+       vaddq_s32(vld1q_s32(below + 1), vld1q_s32(below - 1));
+   const int32x4_t fives = vaddq_s32(top_corners, bottom_corners);
+   const int32x4_t sixes = vaddq_s32(vld1q_s32(above), vld1q_s32(below));
+   const int32x4_t all = vaddq_s32(fives, sixes);
+
+   // 4 * all + all + sixes == 5 * fives + 6 * sixes.
+   const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
+   return vaddq_s32(all_x5, sixes);
+ }
+
+ // 16-bit-input counterpart of the even-row weighted sum, for eight adjacent
+ // positions: elements straight above / below buf get weight 6, the four
+ // diagonal neighbours weight 5. The weighted partial sums are formed in
+ // 16-bit lanes, then widened and added; *a0 receives positions 0-3 and *a1
+ // positions 4-7 as 32-bit values.
+ static INLINE void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
+                                                  int32x4_t *a0, int32x4_t *a1) {
+   const uint16_t *const above = buf - buf_stride;
+   const uint16_t *const below = buf + buf_stride;
+
+   // Diagonal neighbours, times 5 (4x + x).
+   const uint16x8_t bottom_corners =
+       vaddq_u16(vld1q_u16(below + 1), vld1q_u16(below - 1));
+   const uint16x8_t top_corners =
+       vaddq_u16(vld1q_u16(above + 1), vld1q_u16(above - 1));
+   const uint16x8_t corners = vaddq_u16(bottom_corners, top_corners);
+   const uint16x8_t corners_x5 = vaddq_u16(vshlq_n_u16(corners, 2), corners);
+
+   // Vertical neighbours, times 6 (4x + 2x).
+   const uint16x8_t vertical = vaddq_u16(vld1q_u16(below), vld1q_u16(above));
+   const uint16x8_t vertical_x6 =
+       vaddq_u16(vshlq_n_u16(vertical, 2), vshlq_n_u16(vertical, 1));
+
+   const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(corners_x5)),
+                                       vmovl_u16(vget_low_u16(vertical_x6)));
+   const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(corners_x5)),
+                                       vmovl_u16(vget_high_u16(vertical_x6)));
+
+   *a0 = vreinterpretq_s32_u32(sum_lo);
+   *a1 = vreinterpretq_s32_u32(sum_hi);
+ }
+
+ // Weighted sum along a single row for four adjacent 32-bit positions: the
+ // element at each position gets weight 6, its left and right neighbours
+ // weight 5. Elements buf[-1] .. buf[4] are read.
+ static INLINE int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
+   const int32x4_t centre = vld1q_s32(buf);
+   const int32x4_t sides = vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));
+   const int32x4_t all = vaddq_s32(sides, centre);
+
+   // 4 * all + all + centre == 5 * sides + 6 * centre.
+   const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all, 2), all);
+   return vaddq_s32(all_x5, centre);
+ }
+
+ // 16-bit-input counterpart of the odd-row weighted sum, for eight adjacent
+ // positions: the element at each position gets weight 6, its left and
+ // right neighbours weight 5. The weighted partial sums are formed in 16-bit
+ // lanes, then widened and added; *a0 receives positions 0-3 and *a1
+ // positions 4-7 as 32-bit values.
+ static INLINE void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
+                                                 int32x4_t *a1) {
+   const uint16x8_t centre = vld1q_u16(buf);
+
+   // Left + right neighbours, times 5 (x + 4x).
+   const uint16x8_t sides = vaddq_u16(vld1q_u16(buf - 1), vld1q_u16(buf + 1));
+   const uint16x8_t sides_x5 = vaddq_u16(sides, vshlq_n_u16(sides, 2));
+
+   // Centre, times 6 (4x + 2x).
+   const uint16x8_t centre_x6 =
+       vaddq_u16(vshlq_n_u16(centre, 2), vshlq_n_u16(centre, 1));
+
+   const uint32x4_t sum_lo = vaddq_u32(vmovl_u16(vget_low_u16(sides_x5)),
+                                       vmovl_u16(vget_low_u16(centre_x6)));
+   const uint32x4_t sum_hi = vaddq_u32(vmovl_u16(vget_high_u16(sides_x5)),
+                                       vmovl_u16(vget_high_u16(centre_x6)));
+
+   *a0 = vreinterpretq_s32_u32(sum_lo);
+   *a1 = vreinterpretq_s32_u32(sum_hi);
+ }
+
+ // Final combination step that alternates kernels by row parity.
+ // For every group of 8 pixels it computes
+ //   dst = round_shift(cross_sum(A) * src + cross_sum(B), shift)
+ // where rows with an even index use the cross_sum_fast_even_row* kernels
+ // and a shift of SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS, and rows
+ // with an odd index use the cross_sum_fast_odd_row* kernels and
+ // SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS.
+ //   A, B, buf_stride : 16-bit and 32-bit coefficient buffers, shared stride.
+ //   src, src_stride  : 16-bit source pixels.
+ //   dst, dst_stride  : 32-bit output.
+ // Both loops are do/while and columns advance by 8, so a full 8-wide group
+ // is always written, including for the last, possibly partial, group.
+ static void final_filter_fast_internal(uint16_t *A, int32_t *B,
+                                        const int buf_stride, int16_t *src,
+                                        const int src_stride, int32_t *dst,
+                                        const int dst_stride, const int width,
+                                        const int height) {
+   int row = 0;
+   int rows_left = height;
+   assert(SGRPROJ_SGR_BITS == 8);
+   assert(SGRPROJ_RST_BITS == 4);
+
+   do {
+     uint16_t *a_row = A + row * buf_stride;
+     int32_t *b_row = B + row * buf_stride;
+     int16_t *src_row = src + row * src_stride;
+     int32_t *dst_row = dst + row * dst_stride;
+     int cols_left = width;
+
+     if (row & 1) {
+       // Odd row: only the current row of A and B contributes.
+       do {
+         const int16x8_t pix = vld1q_s16(src_row);
+         int32x4_t lo, hi;
+
+         cross_sum_fast_odd_row_inp16(a_row, &lo, &hi);
+         lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), lo);
+         hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), hi);
+
+         lo = vaddq_s32(lo, cross_sum_fast_odd_row(b_row));
+         hi = vaddq_s32(hi, cross_sum_fast_odd_row(b_row + 4));
+
+         lo = vrshrq_n_s32(lo, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+         hi = vrshrq_n_s32(hi, SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS);
+
+         vst1q_s32(dst_row, lo);
+         vst1q_s32(dst_row + 4, hi);
+
+         a_row += 8;
+         b_row += 8;
+         src_row += 8;
+         dst_row += 8;
+         cols_left -= 8;
+       } while (cols_left > 0);
+     } else {
+       // Even row: the rows above and below in A and B contribute.
+       do {
+         const int16x8_t pix = vld1q_s16(src_row);
+         int32x4_t lo, hi;
+
+         cross_sum_fast_even_row_inp16(a_row, buf_stride, &lo, &hi);
+         lo = vmulq_s32(vmovl_s16(vget_low_s16(pix)), lo);
+         hi = vmulq_s32(vmovl_s16(vget_high_s16(pix)), hi);
+
+         lo = vaddq_s32(lo, cross_sum_fast_even_row(b_row, buf_stride));
+         hi = vaddq_s32(hi, cross_sum_fast_even_row(b_row + 4, buf_stride));
+
+         lo = vrshrq_n_s32(lo, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+         hi = vrshrq_n_s32(hi, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+
+         vst1q_s32(dst_row, lo);
+         vst1q_s32(dst_row + 4, hi);
+
+         a_row += 8;
+         b_row += 8;
+         src_row += 8;
+         dst_row += 8;
+         cols_left -= 8;
+       } while (cols_left > 0);
+     }
+
+     row++;
+     rows_left -= 1;
+   } while (rows_left > 0);
+ }
+
+ // Final combination step using the full 3x3 kernels on every row.
+ // For every group of 8 pixels it computes
+ //   dst = round_shift(cross_sum_inp_u16(A) * src + cross_sum_inp_s32(B),
+ //                     SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS)
+ //   A, B, buf_stride : 16-bit and 32-bit coefficient buffers, shared stride.
+ //   src, src_stride  : 16-bit source pixels.
+ //   dst, dst_stride  : 32-bit output.
+ // Both loops are do/while and columns advance by 8, so a full 8-wide group
+ // is always written, including for the last, possibly partial, group.
+ //
+ // Declared static, like final_filter_fast_internal(): this is a file-local
+ // helper and should not export an unprefixed global symbol.
+ static void final_filter_internal(uint16_t *A, int32_t *B,
+                                   const int buf_stride, int16_t *src,
+                                   const int src_stride, int32_t *dst,
+                                   const int dst_stride, const int width,
+                                   const int height) {
+   int16x8_t s0;
+   int32_t *B_tmp, *dst_ptr;
+   uint16_t *A_tmp;
+   int16_t *src_ptr;
+   int32x4_t a_res0, a_res1, b_res0, b_res1;
+   int w, h, count = 0;
+
+   assert(SGRPROJ_SGR_BITS == 8);
+   assert(SGRPROJ_RST_BITS == 4);
+   h = height;
+
+   do {
+     // count is the index of the row being produced.
+     A_tmp = (A + count * buf_stride);
+     B_tmp = (B + count * buf_stride);
+     src_ptr = (src + count * src_stride);
+     dst_ptr = (dst + count * dst_stride);
+     w = width;
+     do {
+       s0 = vld1q_s16(src_ptr);
+       // Weighted neighbourhood of A, multiplied by the source pixel.
+       cross_sum_inp_u16(A_tmp, buf_stride, &a_res0, &a_res1);
+       a_res0 = vmulq_s32(vmovl_s16(vget_low_s16(s0)), a_res0);
+       a_res1 = vmulq_s32(vmovl_s16(vget_high_s16(s0)), a_res1);
+
+       // Weighted neighbourhood of B, added on top.
+       b_res0 = cross_sum_inp_s32(B_tmp, buf_stride);
+       b_res1 = cross_sum_inp_s32(B_tmp + 4, buf_stride);
+       a_res0 = vaddq_s32(a_res0, b_res0);
+       a_res1 = vaddq_s32(a_res1, b_res1);
+
+       a_res0 =
+           vrshrq_n_s32(a_res0, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+       a_res1 =
+           vrshrq_n_s32(a_res1, SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS);
+       vst1q_s32(dst_ptr, a_res0);
+       vst1q_s32(dst_ptr + 4, a_res1);
+
+       A_tmp += 8;
+       B_tmp += 8;
+       src_ptr += 8;
+       dst_ptr += 8;
+       w -= 8;
+     } while (w > 0);
+     count++;
+     h -= 1;
+   } while (h > 0);
+ }
+
+ // Runs the radius-index-0 filter pass (asserted r == 2) on a 16-bit source
+ // block and writes the 32-bit result to dst.
+ //   dgd16 / dgd_stride : first non-border source pixel and its row stride;
+ //                        SGRPROJ_BORDER_* pixels before it are read too.
+ //   dst / dst_stride   : 32-bit output.
+ //   sgr_params_idx     : index into sgr_params[]; radius_idx must be 0.
+ // Steps: boxsum2() fills the scratch buffers, calc_ab_fast_internal_*()
+ // processes them on a doubled row stride (buf_stride * 2, ht_inc 2) and
+ // final_filter_fast_internal() produces the output.
+ static INLINE void restoration_fast_internal(uint16_t *dgd16, int width,
+                                              int height, int dgd_stride,
+                                              int32_t *dst, int dst_stride,
+                                              int bit_depth, int sgr_params_idx,
+                                              int radius_idx) {
+   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+   const int r = params->r[radius_idx];
+   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+   const int buf_stride = ((width_ext + 3) & ~3) + 16;
+   // Offset of the first non-border element within each scratch buffer.
+   const int border_offset =
+       SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+   int32_t A_[RESTORATION_PROC_UNIT_PELS];
+   uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+   int32_t B_[RESTORATION_PROC_UNIT_PELS];
+
+   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+          "Need SGRPROJ_BORDER_* >= r+1");
+
+   assert(radius_idx == 0);
+   assert(r == 2);
+
+   // Input (dgd16) is 16 bit. A16_ receives the 16-bit first-stage sums, B_
+   // the 32-bit sums and A_ the 32-bit sums of squares.
+   boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                       SGRPROJ_BORDER_HORZ),
+           dgd_stride, (int16_t *)A16_, B_, A_, buf_stride, width_ext,
+           height_ext);
+
+   int32_t *const square_sums = A_ + border_offset;
+   int32_t *const sums = B_ + border_offset;
+   uint16_t *const a16 = A16_ + border_offset;
+
+   // Calculation of a, b, starting one row and one column before the block.
+   // a is stored as 16 bit in A16_; b stays in the 32-bit buffer.
+   if (8 == bit_depth) {
+     calc_ab_fast_internal_lbd((square_sums - buf_stride - 1),
+                               (a16 - buf_stride - 1), (sums - buf_stride - 1),
+                               buf_stride * 2, width + 2, height + 2, r,
+                               params->s[radius_idx], 2);
+   } else {
+     calc_ab_fast_internal_hbd((square_sums - buf_stride - 1),
+                               (a16 - buf_stride - 1), (sums - buf_stride - 1),
+                               buf_stride * 2, width + 2, height + 2, bit_depth,
+                               r, params->s[radius_idx], 2);
+   }
+
+   final_filter_fast_internal(a16, sums, buf_stride, (int16_t *)dgd16,
+                              dgd_stride, dst, dst_stride, width, height);
+ }
+
+ // Runs the radius-index-1 filter pass (asserted r == 1) on a 16-bit source
+ // block and writes the 32-bit result to dst.
+ //   dgd16 / dgd_stride : first non-border source pixel and its row stride;
+ //                        SGRPROJ_BORDER_* pixels before it are read too.
+ //   dst / dst_stride   : 32-bit output.
+ //   sgr_params_idx     : index into sgr_params[]; radius_idx must be 1.
+ // Steps: boxsum1() fills the sum / sum-of-squares scratch buffers,
+ // calc_ab_internal_*() derives the A16_ / B_ buffers from them and
+ // final_filter_internal() produces the output.
+ static INLINE void restoration_internal(uint16_t *dgd16, int width, int height,
+                                         int dgd_stride, int32_t *dst,
+                                         int dst_stride, int bit_depth,
+                                         int sgr_params_idx, int radius_idx) {
+   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+   const int r = params->r[radius_idx];
+   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+   int buf_stride = ((width_ext + 3) & ~3) + 16;
+   // Offset of the first non-border element within each scratch buffer.
+   const int border_offset =
+       SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+   int32_t A_[RESTORATION_PROC_UNIT_PELS];
+   uint16_t A16_[RESTORATION_PROC_UNIT_PELS];
+   uint16_t B16_[RESTORATION_PROC_UNIT_PELS];
+   int32_t B_[RESTORATION_PROC_UNIT_PELS];
+
+   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+          "Need SGRPROJ_BORDER_* >= r+1");
+
+   assert(radius_idx == 1);
+   assert(r == 1);
+
+   // Input (dgd16) is 16 bit. B16_ receives the 16-bit sums of pixels and A_
+   // the 32-bit sums of squares.
+   boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
+                       SGRPROJ_BORDER_HORZ),
+           dgd_stride, B16_, A_, buf_stride, width_ext, height_ext);
+
+   int32_t *const square_sums = A_ + border_offset;
+   int32_t *const b32 = B_ + border_offset;
+   uint16_t *const a16 = A16_ + border_offset;
+   uint16_t *const sums16 = B16_ + border_offset;
+
+   // Calculation of a, b, starting one row and one column before the block.
+   // a is stored as 16 bit in A16_; b is kept in the 32-bit buffer B_.
+   if (8 == bit_depth) {
+     calc_ab_internal_lbd((square_sums - buf_stride - 1),
+                          (a16 - buf_stride - 1), (sums16 - buf_stride - 1),
+                          (b32 - buf_stride - 1), buf_stride, width + 2,
+                          height + 2, r, params->s[radius_idx], 1);
+   } else {
+     calc_ab_internal_hbd((square_sums - buf_stride - 1),
+                          (a16 - buf_stride - 1), (sums16 - buf_stride - 1),
+                          (b32 - buf_stride - 1), buf_stride, width + 2,
+                          height + 2, bit_depth, r, params->s[radius_idx], 1);
+   }
+
+   final_filter_internal(a16, b32, buf_stride, (int16_t *)dgd16, dgd_stride,
+                         dst, dst_stride, width, height);
+ }
+
+ // Widens a width x height block of 8-bit pixels to 16 bits.
+ //   src / src_stride : 8-bit input and its row stride.
+ //   dst / dst_stride : 16-bit output and its row stride.
+ // Rows are handled four at a time; within such a band, columns are handled
+ // eight at a time with NEON and the remaining (< 8) columns with scalar
+ // code. Rows left over after the last full band are copied element-wise.
+ //
+ // The vector path is only taken while at least 8 columns remain and the
+ // band loop only while at least 4 rows remain, so nothing outside the
+ // width x height rectangle is read or written. (Previously a remainder of
+ // exactly 7 columns went through the 8-wide path and the first band was
+ // processed unconditionally.)
+ static INLINE void src_convert_u8_to_u16(const uint8_t *src,
+                                          const int src_stride, uint16_t *dst,
+                                          const int dst_stride, const int width,
+                                          const int height) {
+   const uint8_t *src_ptr;
+   uint16_t *dst_ptr;
+   int h = height, w, count = 0;
+
+   uint8x8_t t1, t2, t3, t4;
+   uint16x8_t s1, s2, s3, s4;
+
+   while (h >= 4) {
+     // count is the index of the current band of four rows.
+     src_ptr = src + (count << 2) * src_stride;
+     dst_ptr = dst + (count << 2) * dst_stride;
+     w = width;
+
+     while (w >= 8) {
+       load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
+       s1 = vmovl_u8(t1);
+       s2 = vmovl_u8(t2);
+       s3 = vmovl_u8(t3);
+       s4 = vmovl_u8(t4);
+       store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+
+       src_ptr += 8;
+       dst_ptr += 8;
+       w -= 8;
+     }
+
+     // Scalar tail for the last w (< 8) columns of the band.
+     for (int y = 0; y < w; y++) {
+       dst_ptr[y] = src_ptr[y];
+       dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+       dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+       dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+     }
+     count++;
+     h -= 4;
+   }
+
+   // Remaining h (< 4) rows below the last full band.
+   src_ptr = src + (count << 2) * src_stride;
+   dst_ptr = dst + (count << 2) * dst_stride;
+   for (int x = 0; x < h; x++) {
+     for (int y = 0; y < width; y++) {
+       dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
+     }
+   }
+ }
+
+ // Copies a width x height block of 16-bit pixels between two strided
+ // buffers.
+ //   src / src_stride : 16-bit input and its row stride.
+ //   dst / dst_stride : 16-bit output and its row stride.
+ // Rows are handled four at a time; within such a band, columns are handled
+ // eight at a time with NEON and the remaining (< 8) columns with scalar
+ // code. Rows left over after the last full band are copied with memcpy().
+ //
+ // The vector path is only taken while at least 8 columns remain and the
+ // band loop only while at least 4 rows remain, so nothing outside the
+ // width x height rectangle is read or written. (Previously both loops were
+ // do/while and always handled one full 8 x 4 tile.)
+ static INLINE void src_convert_hbd_copy(const uint16_t *src, int src_stride,
+                                         uint16_t *dst, const int dst_stride,
+                                         int width, int height) {
+   const uint16_t *src_ptr;
+   uint16_t *dst_ptr;
+   int h = height, w, count = 0;
+   uint16x8_t s1, s2, s3, s4;
+
+   while (h >= 4) {
+     // count is the index of the current band of four rows.
+     src_ptr = src + (count << 2) * src_stride;
+     dst_ptr = dst + (count << 2) * dst_stride;
+     w = width;
+
+     while (w >= 8) {
+       load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
+       store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
+       src_ptr += 8;
+       dst_ptr += 8;
+       w -= 8;
+     }
+
+     // Scalar tail for the last w (< 8) columns of the band.
+     for (int y = 0; y < w; y++) {
+       dst_ptr[y] = src_ptr[y];
+       dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
+       dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
+       dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
+     }
+     count++;
+     h -= 4;
+   }
+
+   // Remaining h (< 4) rows below the last full band.
+   src_ptr = src + (count << 2) * src_stride;
+   dst_ptr = dst + (count << 2) * dst_stride;
+
+   for (int x = 0; x < h; x++) {
+     memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
+            sizeof(uint16_t) * width);
+   }
+ }
+
+ // Produces the two intermediate filter outputs for one block.
+ //   dat8 / stride      : source pixels; interpreted through
+ //                        CONVERT_TO_SHORTPTR() when highbd is non-zero.
+ //   width / height     : block size, excluding the SGRPROJ border.
+ //   flt0 / flt1        : 32-bit outputs of the r[0] and r[1] passes, both
+ //                        using flt_stride; a pass is skipped when its
+ //                        radius is 0 (at least one must be non-zero).
+ //   sgr_params_idx     : index into sgr_params[].
+ // The source, including its border, is first copied into a local 16-bit
+ // buffer whose row stride equals the bordered width. Always returns 0.
+ int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
+                                     int stride, int32_t *flt0, int32_t *flt1,
+                                     int flt_stride, int sgr_params_idx,
+                                     int bit_depth, int highbd) {
+   const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+   assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+   const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+   // Distance from the top-left border pixel to the first block pixel in the
+   // caller's buffer.
+   const int src_border = SGRPROJ_BORDER_VERT * stride + SGRPROJ_BORDER_HORZ;
+
+   uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+   // First block pixel inside the local copy; dgd16_ itself is the top-left
+   // border pixel.
+   uint16_t *const dgd16 =
+       dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+
+   if (highbd) {
+     const uint16_t *const src16 = CONVERT_TO_SHORTPTR(dat8);
+     src_convert_hbd_copy(src16 - src_border, stride, dgd16_, dgd16_stride,
+                          width_ext, height_ext);
+   } else {
+     src_convert_u8_to_u16(dat8 - src_border, stride, dgd16_, dgd16_stride,
+                           width_ext, height_ext);
+   }
+
+   if (params->r[0] > 0) {
+     restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
+                               flt_stride, bit_depth, sgr_params_idx, 0);
+   }
+   if (params->r[1] > 0) {
+     restoration_internal(dgd16, width, height, dgd16_stride, flt1, flt_stride,
+                          bit_depth, sgr_params_idx, 1);
+   }
+   return 0;
+ }
+
+ // Filters one block and writes the projected result to dst8.
+ //   dat8 / stride     : source pixels; interpreted through
+ //                       CONVERT_TO_SHORTPTR() when highbd is non-zero.
+ //   width / height    : block size; width * height is asserted not to exceed
+ //                       RESTORATION_UNITPELS_MAX.
+ //   eps               : index into sgr_params[].
+ //   xqd               : coefficients, converted to xq[2] by decode_xq().
+ //   dst8 / dst_stride : output; written as 16-bit through
+ //                       CONVERT_TO_SHORTPTR() when highbd is non-zero,
+ //                       otherwise as 8-bit.
+ //   tmpbuf            : scratch for the two intermediate outputs; the second
+ //                       one starts RESTORATION_UNITPELS_MAX elements in.
+ // The source, including its border, is first copied into a local 16-bit
+ // buffer. The r[0] / r[1] passes run only when their radius is non-zero
+ // (at least one must be) and write with a row stride of width.
+ void apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ // dgd16 points at the first non-border pixel of the local copy.
+ uint16_t *dgd16 =
+ dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ const int dgd_stride = stride;
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ // Copy the block plus its border into dgd16_ as 16-bit samples.
+ if (highbd) {
+ const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
+ src_convert_hbd_copy(
+ dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ } else {
+ src_convert_u8_to_u16(
+ dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
+ dgd_stride,
+ dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
+ dgd16_stride, width_ext, height_ext);
+ }
+
+ if (params->r[0] > 0)
+ restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0, width,
+ bit_depth, eps, 0);
+ if (params->r[1] > 0)
+ restoration_internal(dgd16, width, height, dgd16_stride, flt1, width,
+ bit_depth, eps, 1);
+
+ decode_xq(xqd, xq, params);
+
+ // Projection: combine the source with the enabled filter outputs, 8 pixels
+ // per inner iteration.
+ {
+ int16_t *src_ptr;
+ uint8_t *dst_ptr;
+ uint16_t *dst16_ptr;
+ int16x4_t d0, d4;
+ int16x8_t r0, s0;
+ uint16x8_t r4;
+ int32x4_t u0, u4, v0, v4, f00, f10;
+ uint8x8_t t0;
+ int count = 0, w = width, h = height, rc = 0;
+
+ const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
+ const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
+ const int16x8_t zero = vdupq_n_s16(0);
+ // Largest sample value for the given bit depth (used for highbd only).
+ const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
+ dst_ptr = dst8;
+ src_ptr = (int16_t *)dgd16;
+ do {
+ // rc is the current row; count is the column offset within that row.
+ w = width;
+ count = 0;
+ dst_ptr = dst8 + rc * dst_stride;
+ dst16_ptr = dst16 + rc * dst_stride;
+ do {
+ s0 = vld1q_s16(src_ptr + count);
+
+ // u = src << SGRPROJ_RST_BITS, widened to 32 bits.
+ u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
+ u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
+
+ // v starts as u << SGRPROJ_PRJ_BITS.
+ v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
+ v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
+
+ // v += xq[0] * (flt0 - u)
+ if (params->r[0] > 0) {
+ f00 = vld1q_s32(flt0 + count);
+ f10 = vld1q_s32(flt0 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq0_vec, f00);
+ v4 = vmlaq_s32(v4, xq0_vec, f10);
+ }
+
+ // v += xq[1] * (flt1 - u)
+ if (params->r[1] > 0) {
+ f00 = vld1q_s32(flt1 + count);
+ f10 = vld1q_s32(flt1 + count + 4);
+
+ f00 = vsubq_s32(f00, u0);
+ f10 = vsubq_s32(f10, u4);
+
+ v0 = vmlaq_s32(v0, xq1_vec, f00);
+ v4 = vmlaq_s32(v4, xq1_vec, f10);
+ }
+
+ // Rounding right shift with saturating narrow to 16 bits.
+ d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ r0 = vcombine_s16(d0, d4);
+
+ // Clamp negative results to zero.
+ r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
+
+ if (highbd) {
+ // Clamp to the bit-depth maximum and store 16-bit samples.
+ r4 = vminq_u16(r4, max);
+ vst1q_u16(dst16_ptr, r4);
+ } else {
+ // Saturating narrow to 8 bits.
+ t0 = vqmovn_u16(r4);
+ vst1_u8(dst_ptr, t0);
+ }
+ // NOTE(review): a full group of 8 is loaded and stored even when fewer
+ // than 8 columns remain -- confirm callers pad width or the buffers.
+ w -= 8;
+ count += 8;
+ dst_ptr += 8;
+ dst16_ptr += 8;
+ } while (w > 0);
+
+ // Advance to the next row; the filter outputs use a stride of width.
+ src_ptr += dgd16_stride;
+ flt1 += width;
+ flt0 += width;
+ rc++;
+ h--;
+ } while (h > 0);
+ }
+ }
diff --git a/third_party/aom/av1/common/arm/transpose_neon.h b/third_party/aom/av1/common/arm/transpose_neon.h
new file mode 100644
index 0000000000..8a3d9f07ff
--- /dev/null
+++ b/third_party/aom/av1/common/arm/transpose_neon.h
@@ -0,0 +1,537 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+#define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+// Transposes an 8x8 block of 8-bit elements in place.
+// On entry *a0..*a7 hold rows 0..7 (one row per 64-bit vector); on return
+// *a0..*a7 hold columns 0..7 of that block. Entries in the diagrams below are
+// labelled <row><column>.
+static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6, uint8x8_t *a7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ // Rows N and N + 4 are packed into one 128-bit register so that every
+ // permutation step below handles two rows at once.
+ const uint8x16x2_t b0 =
+ vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
+ const uint8x16x2_t b1 =
+ vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ // Each 64-bit half of d0/d1 is one finished column; write them back so that
+ // *aN receives column N.
+ *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+// Transposes a block of 4 rows by 8 columns of 8-bit elements in place.
+// On entry *a0..*a3 hold rows 0..3. On return every vector carries two
+// columns: *aN holds column N in its low four bytes and column N + 4 in its
+// high four bytes. Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ // Interleave the c0/c1 results so the outputs come out in column order
+ // (0|4, 1|5, 2|6, 3|7).
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+// Transposes a 4x4 block of 8-bit elements held in two 64-bit vectors, in
+// place. On entry *a0 packs rows 0 and 1 (four bytes each) and *a1 packs rows
+// 2 and 3. On return *a0 packs columns 0 and 2, and *a1 packs columns 1 and 3.
+// Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+// Transposes eight rows of four 8-bit elements into four rows of eight.
+// Rows 0..3 are passed through *a0..*a3 and rows 4..7 by value in a4..a7; only
+// the low four bytes of each input vector are used (the bytes marked XX below
+// never reach the result). On return *a0..*a3 hold columns 0..3, eight bytes
+// each. Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
+ uint8x8_t *a3, const uint8x8_t a4,
+ const uint8x8_t a5, const uint8x8_t a6,
+ const uint8x8_t a7) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+ // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ // Only val[0] of each result is consumed below; val[1] would pair up the
+ // unused upper halves.
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+ *a2 = d1.val[0];
+ *a3 = d1.val[1];
+}
+
+// Transposes eight rows of four 16-bit elements into four rows of eight.
+// *a0..*a7 supply rows 0..7 and are only read; the result is written to
+// *o0..*o3, where *oN receives column N. Entries in the diagrams below are
+// labelled <row><column>.
+static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
+ uint16x4_t *a2, uint16x4_t *a3,
+ uint16x4_t *a4, uint16x4_t *a5,
+ uint16x4_t *a6, uint16x4_t *a7,
+ uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+ // b2.val[0]: 40 50 42 52
+ // b2.val[1]: 41 51 43 53
+ // b3.val[0]: 60 70 62 72
+ // b3.val[1]: 61 71 63 73
+
+ uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
+ uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
+ uint16x4x2_t b2 = vtrn_u16(*a4, *a5);
+ uint16x4x2_t b3 = vtrn_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+ // c2.val[0]: 40 50 60 70
+ // c2.val[1]: 42 52 62 72
+ // c3.val[0]: 41 51 61 71
+ // c3.val[1]: 43 53 63 73
+
+ uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+ uint32x2x2_t c2 = vtrn_u32(vreinterpret_u32_u16(b2.val[0]),
+ vreinterpret_u32_u16(b3.val[0]));
+ uint32x2x2_t c3 = vtrn_u32(vreinterpret_u32_u16(b2.val[1]),
+ vreinterpret_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // o0: 00 10 20 30 40 50 60 70
+ // o1: 01 11 21 31 41 51 61 71
+ // o2: 02 12 22 32 42 52 62 72
+ // o3: 03 13 23 33 43 53 63 73
+
+ // The top half of each column (rows 0..3) comes from c0/c1 and the bottom
+ // half (rows 4..7) from c2/c3.
+ *o0 = vcombine_u16(vreinterpret_u16_u32(c0.val[0]),
+ vreinterpret_u16_u32(c2.val[0]));
+ *o1 = vcombine_u16(vreinterpret_u16_u32(c1.val[0]),
+ vreinterpret_u16_u32(c3.val[0]));
+ *o2 = vcombine_u16(vreinterpret_u16_u32(c0.val[1]),
+ vreinterpret_u16_u32(c2.val[1]));
+ *o3 = vcombine_u16(vreinterpret_u16_u32(c1.val[1]),
+ vreinterpret_u16_u32(c3.val[1]));
+}
+
+// Transposes an 8x8 block of unsigned 16-bit elements in place.
+// On entry *a0..*a7 hold rows 0..7; on return *a0..*a7 hold columns 0..7.
+// Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
+ uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5,
+ uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Final 64-bit exchange: pairing the low halves of cK and cK+2 yields
+ // columns 0..3, pairing the high halves yields columns 4..7.
+ *a0 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[0])),
+ vget_low_u16(vreinterpretq_u16_u32(c2.val[0])));
+ *a4 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[0])),
+ vget_high_u16(vreinterpretq_u16_u32(c2.val[0])));
+
+ *a2 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c0.val[1])),
+ vget_low_u16(vreinterpretq_u16_u32(c2.val[1])));
+ *a6 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c0.val[1])),
+ vget_high_u16(vreinterpretq_u16_u32(c2.val[1])));
+
+ *a1 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[0])),
+ vget_low_u16(vreinterpretq_u16_u32(c3.val[0])));
+ *a5 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[0])),
+ vget_high_u16(vreinterpretq_u16_u32(c3.val[0])));
+
+ *a3 = vcombine_u16(vget_low_u16(vreinterpretq_u16_u32(c1.val[1])),
+ vget_low_u16(vreinterpretq_u16_u32(c3.val[1])));
+ *a7 = vcombine_u16(vget_high_u16(vreinterpretq_u16_u32(c1.val[1])),
+ vget_high_u16(vreinterpretq_u16_u32(c3.val[1])));
+}
+
+// Transposes an 8x8 block of signed 16-bit elements in place.
+// On entry *a0..*a7 hold rows 0..7; on return *a0..*a7 hold columns 0..7.
+// Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Final 64-bit exchange: pairing the low halves of cK and cK+2 yields
+ // columns 0..3, pairing the high halves yields columns 4..7.
+ *a0 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[0])),
+ vget_low_s16(vreinterpretq_s16_s32(c2.val[0])));
+ *a4 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[0])),
+ vget_high_s16(vreinterpretq_s16_s32(c2.val[0])));
+
+ *a2 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c0.val[1])),
+ vget_low_s16(vreinterpretq_s16_s32(c2.val[1])));
+ *a6 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c0.val[1])),
+ vget_high_s16(vreinterpretq_s16_s32(c2.val[1])));
+
+ *a1 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[0])),
+ vget_low_s16(vreinterpretq_s16_s32(c3.val[0])));
+ *a5 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[0])),
+ vget_high_s16(vreinterpretq_s16_s32(c3.val[0])));
+
+ *a3 = vcombine_s16(vget_low_s16(vreinterpretq_s16_s32(c1.val[1])),
+ vget_low_s16(vreinterpretq_s16_s32(c3.val[1])));
+ *a7 = vcombine_s16(vget_high_s16(vreinterpretq_s16_s32(c1.val[1])),
+ vget_high_s16(vreinterpretq_s16_s32(c3.val[1])));
+}
+
+// Exchanges the 64-bit halves of two 128-bit vectors and returns the result
+// viewed as 16-bit lanes: val[0] is {low half of a0, low half of a1} and
+// val[1] is {high half of a0, high half of a1}.
+static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+ return b0;
+}
+
+// Transposes an 8x8 block of signed 16-bit elements stored as an array.
+// a0 points to eight consecutive vectors holding rows 0..7; out points to
+// eight consecutive vectors that receive columns 0..7. All eight inputs are
+// read before any element of out is stored. Entries in the diagrams below are
+// labelled <row><column>.
+static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
+ // Swap 16 bit elements. Goes from (aN below denotes a0[N]):
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *(a0 + 1));
+ const int16x8x2_t b1 = vtrnq_s16(*(a0 + 2), *(a0 + 3));
+ const int16x8x2_t b2 = vtrnq_s16(*(a0 + 4), *(a0 + 5));
+ const int16x8x2_t b3 = vtrnq_s16(*(a0 + 6), *(a0 + 7));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+ const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ // dK.val[0] is column K and dK.val[1] is column K + 4; store them so that
+ // out[N] receives column N.
+ *out = d0.val[0];
+ *(out + 1) = d1.val[0];
+ *(out + 2) = d2.val[0];
+ *(out + 3) = d3.val[0];
+ *(out + 4) = d0.val[1];
+ *(out + 5) = d1.val[1];
+ *(out + 6) = d2.val[1];
+ *(out + 7) = d3.val[1];
+}
+
+// Transposes a 4x4 block of signed 16-bit elements in place, one row per
+// 64-bit vector. On entry *a0..*a3 hold rows 0..3; on return they hold
+// columns 0..3. Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ // c0 carries the even columns and c1 the odd ones; interleave them on
+ // write-back so that *aN receives column N.
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+// Exchanges the 64-bit halves of two 128-bit vectors of 32-bit lanes:
+// val[0] is {low half of a0, low half of a1} and val[1] is
+// {high half of a0, high half of a1}.
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+ return b0;
+}
+
+// Transposes a 4x4 block of signed 32-bit elements in place.
+// On entry *a0..*a3 hold rows 0..3; on return they hold columns 0..3.
+// Entries in the diagrams below are labelled <row><column>.
+static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2, int32x4_t *a3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(*a0, *a1);
+ const int32x4x2_t b1 = vtrnq_s32(*a2, *a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ // c0 carries the even columns and c1 the odd ones; interleave them on
+ // write-back so that *aN receives column N.
+ *a0 = c0.val[0];
+ *a1 = c1.val[0];
+ *a2 = c0.val[1];
+ *a3 = c1.val[1];
+}
+
+#endif // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/av1/common/arm/warp_plane_neon.c b/third_party/aom/av1/common/arm/warp_plane_neon.c
new file mode 100644
index 0000000000..7f02d42a73
--- /dev/null
+++ b/third_party/aom/av1/common/arm/warp_plane_neon.c
@@ -0,0 +1,714 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+#include <memory.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "config/av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * - Each coefficient is stored in 8 bits instead of 16 bits.
+ * - The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7.
+ *
+ * The reordering exists to avoid overflow: since the tap with the largest
+ * coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ * order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ * convolve functions.
+ *
+ * Instead, we use the summation order
+ * ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ * The rearrangement of coefficients in this table is so that we can get the
+ * coefficients into the correct order more quickly.
+ */
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit_neon[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Multiply-accumulates two 8-pixel vectors against two coefficient vectors
+// and reduces the eight 16-bit products to four partial sums.
+// x0, x1: each carries two 64-bit vectors that are reinterpreted as four
+// signed 16-bit lanes; x0.val[k] and x1.val[k] form the low and high
+// halves of coefficient vector k (presumably filter taps prepared by
+// the caller -- confirm against the call site).
+// src_0, src_1: 8-bit pixels, widened to 16 bits before multiplying.
+// res: receives the sums of adjacent lane pairs of
+// coeff_0 * src_0 + coeff_1 * src_1.
+// All arithmetic is carried out in 16-bit lanes.
+static INLINE void convolve(int32x2x2_t x0, int32x2x2_t x1, uint8x8_t src_0,
+ uint8x8_t src_1, int16x4_t *res) {
+ int16x8_t coeff_0, coeff_1;
+ int16x8_t pix_0, pix_1;
+
+ coeff_0 = vcombine_s16(vreinterpret_s16_s32(x0.val[0]),
+ vreinterpret_s16_s32(x1.val[0]));
+ coeff_1 = vcombine_s16(vreinterpret_s16_s32(x0.val[1]),
+ vreinterpret_s16_s32(x1.val[1]));
+
+ pix_0 = vreinterpretq_s16_u16(vmovl_u8(src_0));
+ pix_0 = vmulq_s16(coeff_0, pix_0);
+
+ pix_1 = vreinterpretq_s16_u16(vmovl_u8(src_1));
+ pix_0 = vmlaq_s16(pix_0, coeff_1, pix_1);
+
+ // Pairwise add: lanes (0,1), (2,3), (4,5), (6,7) collapse into four sums.
+ *res = vpadd_s16(vget_low_s16(pix_0), vget_high_s16(pix_0));
+}
+
+// Horizontal pass of the NEON affine warp for a single source row.
+//
+// src_1      - 16 source pixels.
+// src_2      - second 16-pixel vector; masked and paired up the same way as
+//              src_1.
+// src_3,
+// src_4      - kept for interface compatibility; they do not contribute to
+//              the result.
+// tmp_dst    - output array; the eight 16-bit results are stored in
+//              tmp_dst[k + 7].
+// sx, alpha  - output n (n = 0..7) uses the filter row
+//              filter_8bit_neon[(sx + n * alpha) >> WARPEDDIFF_PREC_BITS].
+// k          - row index relative to the block centre (see tmp_dst).
+// offset_bits_horiz - (1 << offset_bits_horiz) is added to every sum before it
+//                     is narrowed to unsigned 16 bits.
+// reduce_bits_horiz - amount of the final rounding right shift.
+//
+// Lanes 0-3 of the stored vector hold the results computed with filter rows
+// f0, f2, f4, f6 and lanes 4-7 those computed with f1, f3, f5, f7.
+static INLINE void horizontal_filter_neon(uint8x16_t src_1, uint8x16_t src_2,
+                                          uint8x16_t src_3, uint8x16_t src_4,
+                                          int16x8_t *tmp_dst, int sx, int alpha,
+                                          int k, const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  const uint8x16_t mask = { 255, 0, 255, 0, 255, 0, 255, 0,
+                            255, 0, 255, 0, 255, 0, 255, 0 };
+  const int32x4_t add_const = vdupq_n_s32((int32_t)(1 << offset_bits_horiz));
+  // Negative count: vqrshlq_u16 below performs a rounding right shift.
+  const int16x8_t shift = vdupq_n_s16(-(int16_t)reduce_bits_horiz);
+
+  int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+  int32x2x2_t b0, b1;
+  uint8x8_t src_1_low, src_2_low, src_3_low, src_4_low, src_5_low, src_6_low;
+  int32x4_t tmp_res_low, tmp_res_high;
+  uint16x8_t res;
+  int16x4_t res_0246_even, res_0246_odd, res_1357_even, res_1357_odd;
+
+  // The masked copies of src_3 / src_4 that used to be computed here were
+  // overwritten before being read, so the parameters are simply unused.
+  (void)src_3;
+  (void)src_4;
+
+  // Keep the even-indexed bytes, then place a copy of each kept byte's right
+  // neighbour (two positions on) in the odd slot next to it.
+  uint8x16_t tmp_0 = vandq_u8(src_1, mask);
+  uint8x16_t tmp_1 = vandq_u8(src_2, mask);
+  uint8x16_t tmp_2 = vextq_u8(tmp_0, tmp_0, 1);
+  uint8x16_t tmp_3 = vextq_u8(tmp_1, tmp_1, 1);
+
+  src_1 = vaddq_u8(tmp_0, tmp_2);
+  src_2 = vaddq_u8(tmp_1, tmp_3);
+
+  src_1_low = vget_low_u8(src_1);
+  src_2_low = vget_low_u8(src_2);
+  src_3_low = vget_low_u8(vextq_u8(src_1, src_1, 4));
+  src_4_low = vget_low_u8(vextq_u8(src_2, src_2, 4));
+  src_5_low = vget_low_u8(vextq_u8(src_1, src_1, 2));
+  src_6_low = vget_low_u8(vextq_u8(src_1, src_1, 6));
+
+  // Loading the 8 filter taps
+  f0 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f1 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f2 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f3 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f4 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f5 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f6 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]));
+  f7 = vmovl_s8(
+      vld1_s8(filter_8bit_neon[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]));
+
+  // Low halves of f0, f2, f4, f6.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f0)),
+                vreinterpret_s32_s16(vget_low_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f4)),
+                vreinterpret_s32_s16(vget_low_s16(f6)));
+  convolve(b0, b1, src_1_low, src_3_low, &res_0246_even);
+
+  // Low halves of f1, f3, f5, f7.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f1)),
+                vreinterpret_s32_s16(vget_low_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(f5)),
+                vreinterpret_s32_s16(vget_low_s16(f7)));
+  convolve(b0, b1, src_2_low, src_4_low, &res_0246_odd);
+
+  // High halves of f0, f2, f4, f6.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f0)),
+                vreinterpret_s32_s16(vget_high_s16(f2)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f4)),
+                vreinterpret_s32_s16(vget_high_s16(f6)));
+  convolve(b0, b1, src_2_low, src_4_low, &res_1357_even);
+
+  // High halves of f1, f3, f5, f7.
+  b0 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f1)),
+                vreinterpret_s32_s16(vget_high_s16(f3)));
+  b1 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(f5)),
+                vreinterpret_s32_s16(vget_high_s16(f7)));
+  convolve(b0, b1, src_5_low, src_6_low, &res_1357_odd);
+
+  // Widen to 32 bits while adding the two partial sums of each output.
+  tmp_res_low = vaddl_s16(res_0246_even, res_1357_even);
+  tmp_res_high = vaddl_s16(res_0246_odd, res_1357_odd);
+
+  tmp_res_low = vaddq_s32(tmp_res_low, add_const);
+  tmp_res_high = vaddq_s32(tmp_res_high, add_const);
+
+  // Saturating narrow to unsigned 16 bits, then rounding shift.
+  res = vcombine_u16(vqmovun_s32(tmp_res_low), vqmovun_s32(tmp_res_high));
+  res = vqrshlq_u16(res, shift);
+
+  tmp_dst[k + 7] = vreinterpretq_s16_u16(res);
+}
+
+/* Vertical pass of the NEON affine warp for one output row.
+ *
+ * src       - eight consecutive rows (src[0]..src[7]) of horizontally
+ *             filtered data, eight 16-bit lanes each. The in-line comments
+ *             below label the lane groups as even columns (0,2,4,6) and odd
+ *             columns (1,3,5,7).
+ * res_low   - receives the 32-bit sums labelled 0 1 2 3 by the final reorder.
+ * res_high  - receives the 32-bit sums labelled 4 5 6 7 by the final reorder.
+ * sy, gamma - filter n (n = 0..7) is the 8-tap row at
+ *             warped_filter + ((sy + n * gamma) >> WARPEDDIFF_PREC_BITS).
+ *
+ * Only multiplies and adds are done here; no offset, rounding or shift is
+ * applied to the sums.
+ */ static INLINE void vertical_filter_neon(const int16x8_t *src,
+ int32x4_t *res_low, int32x4_t *res_high,
+ int sy, int gamma) {
+ int16x4_t src_0, src_1, fltr_0, fltr_1;
+ int32x4_t res_0, res_1;
+ int32x2_t res_0_im, res_1_im;
+ int32x4_t res_even, res_odd, im_res_0, im_res_1;
+
+ int16x8_t f0, f1, f2, f3, f4, f5, f6, f7;
+ int16x8x2_t b0, b1, b2, b3;
+ int32x4x2_t c0, c1, c2, c3;
+ int32x4x2_t d0, d1, d2, d3;
+
+ b0 = vtrnq_s16(src[0], src[1]); // interleave 16-bit lanes of each pair of adjacent rows
+ b1 = vtrnq_s16(src[2], src[3]);
+ b2 = vtrnq_s16(src[4], src[5]);
+ b3 = vtrnq_s16(src[6], src[7]);
+
+ c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), // second transpose step, on 32-bit (two-row) units
+ vreinterpretq_s32_s16(b0.val[1]));
+ c1 = vtrnq_s32(vreinterpretq_s32_s16(b1.val[0]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b2.val[1]));
+ c3 = vtrnq_s32(vreinterpretq_s32_s16(b3.val[0]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ f0 = vld1q_s16( // f0..f7: one 8-tap filter row per step of gamma
+ (int16_t *)(warped_filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f1 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f2 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f3 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f4 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f5 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f6 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ f7 = vld1q_s16(
+ (int16_t *)(warped_filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ d0 = vtrnq_s32(vreinterpretq_s32_s16(f0), vreinterpretq_s32_s16(f2)); // pair up taps of f0/f2, f4/f6 (used for the even columns)
+ d1 = vtrnq_s32(vreinterpretq_s32_s16(f4), vreinterpretq_s32_s16(f6));
+ d2 = vtrnq_s32(vreinterpretq_s32_s16(f1), vreinterpretq_s32_s16(f3)); // pair up taps of f1/f3, f5/f7 (used for the odd columns)
+ d3 = vtrnq_s32(vreinterpretq_s32_s16(f5), vreinterpretq_s32_s16(f7));
+
+ // row:0,1 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0)); // add adjacent 32-bit lanes: one partial sum per column
+
+ // row:0,1 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 even_col:0,2,4,6
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 even_col:0,2
+ src_0 = vget_low_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d0.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 even_col:4,6
+ src_1 = vget_low_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d1.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 even_col:0,2,4,6
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 even_col:0,2,4,6
+ res_even = vaddq_s32(im_res_0, im_res_1);
+
+ // row:0,1 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c0.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:0,1,2,3 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c1.val[0]));
+ fltr_0 = vget_low_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:0,1 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c0.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:0,1,2,3 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c1.val[1]));
+ fltr_1 = vget_low_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:0,1,2,3 odd_col:1,3,5,7
+ im_res_0 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:4,5 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c2.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[0]));
+ res_0 = vmull_s16(src_0, fltr_0);
+
+ // row:4,5,6,7 odd_col:1,3
+ src_0 = vget_high_s16(vreinterpretq_s16_s32(c3.val[0]));
+ fltr_0 = vget_high_s16(vreinterpretq_s16_s32(d2.val[1]));
+ res_0 = vmlal_s16(res_0, src_0, fltr_0);
+ res_0_im = vpadd_s32(vget_low_s32(res_0), vget_high_s32(res_0));
+
+ // row:4,5 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c2.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[0]));
+ res_1 = vmull_s16(src_1, fltr_1);
+
+ // row:4,5,6,7 odd_col:5,7
+ src_1 = vget_high_s16(vreinterpretq_s16_s32(c3.val[1]));
+ fltr_1 = vget_high_s16(vreinterpretq_s16_s32(d3.val[1]));
+ res_1 = vmlal_s16(res_1, src_1, fltr_1);
+ res_1_im = vpadd_s32(vget_low_s32(res_1), vget_high_s32(res_1));
+
+ // row:4,5,6,7 odd_col:1,3,5,7
+ im_res_1 = vcombine_s32(res_0_im, res_1_im);
+
+ // row:0-7 odd_col:1,3,5,7
+ res_odd = vaddq_s32(im_res_0, im_res_1);
+
+ // reordering as 0 1 2 3 | 4 5 6 7
+ c0 = vtrnq_s32(res_even, res_odd); // interleave even/odd sums so columns come out in natural order
+
+ // Final store
+ *res_low = vcombine_s32(vget_low_s32(c0.val[0]), vget_low_s32(c0.val[1]));
+ *res_high = vcombine_s32(vget_high_s32(c0.val[0]), vget_high_s32(c0.val[1]));
+}
+
+/* NEON version of the 8-bit affine warp predictor (bd is fixed to 8 below).
+ *
+ * The p_width x p_height prediction area is walked in 8x8 blocks. For each
+ * block the sample at offset (+4, +4) from the block origin is mapped through
+ * mat[0..5] to a source position; its integer part (ix4, iy4) selects the
+ * source window and its fractional part seeds the filter phases sx4 / sy4.
+ *
+ * Pass 1 (horizontal) writes up to 15 filtered rows (k = -7..7) to
+ * tmp[k + 7]; source row indices are clamped to [0, height - 1].
+ * Pass 2 (vertical) combines eight consecutive tmp rows per output row.
+ *
+ * Where the result goes:
+ *   - not compound: 8-bit result is stored to pred.
+ *   - compound, do_average clear: 16-bit result is stored to
+ *     conv_params->dst.
+ *   - compound, do_average set: the value read from conv_params->dst is
+ *     combined with the new result (weighted by fwd_offset / bck_offset when
+ *     use_jnt_comp_avg is set, plain average otherwise) and the 8-bit result
+ *     is stored to pred.
+ *
+ * alpha / beta step the horizontal filter phase per column / per row;
+ * gamma / delta do the same for the vertical filter.
+ */ void av1_warp_affine_neon(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ int16x8_t tmp[15]; // horizontal pass output; row k lives at tmp[k + 7]
+ const int bd = 8;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const int32x4_t fwd = vdupq_n_s32((int32_t)w0);
+ const int32x4_t bwd = vdupq_n_s32((int32_t)w1);
+ const int16x8_t sub_constant = vdupq_n_s16((1 << (bd - 1)) + (1 << bd)); // subtracted from the final non-compound result
+
+ int limit = 0;
+ uint8x16_t vec_dup, mask_val;
+ int32x4_t res_lo, res_hi;
+ int16x8_t result_final;
+ uint8x16_t src_1, src_2, src_3, src_4;
+ uint8x16_t indx_vec = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+ };
+ uint8x16_t cmp_vec;
+
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int32x4_t shift_vert = vdupq_n_s32(-(int32_t)reduce_bits_vert); // negative count: vrshlq_s32 shifts right with rounding
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ int32x4_t add_const_vert = vdupq_n_s32((int32_t)(1 << offset_bits_vert));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int16x4_t round_bits_vec = vdup_n_s16(-(int16_t)round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16x4_t res_sub_const = // negated offset, added to the averaged compound value before the final shift
+ vdup_n_s16(-((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1))));
+ int k;
+
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = 0; i < p_height; i += 8) {
+ for (int j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x; // block centre, scaled up by the subsampling shift
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS; // integer part of the mapped position
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); // fractional part
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); // clear the low WARP_PARAM_REDUCE_BITS bits of each phase
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ // horizontal
+ if (ix4 <= -7) { // only column 0 of each row is read; the whole tmp row is that one value
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val =
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz));
+
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (ix4 >= width + 6) { // only column width - 1 of each row is read
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int16_t dup_val = (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz));
+ tmp[k + 7] = vdupq_n_s16(dup_val);
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { // part of the 16-byte window lies outside columns [0, width)
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src); // NOTE(review): 16 bytes are loaded before the lanes are patched below, so bytes outside [0, width) are read; presumably relies on readable padding around the frame - confirm
+
+ if (out_of_boundary_left >= 0) { // lanes 0..out_of_boundary_left take the sample at lane out_of_boundary_left + 1
+ limit = out_of_boundary_left + 1;
+ cmp_vec = vdupq_n_u8(out_of_boundary_left);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcleq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ if (out_of_boundary_right >= 0) { // lanes >= 15 - out_of_boundary_right take the sample at lane 14 - out_of_boundary_right
+ limit = 15 - (out_of_boundary_right + 1);
+ cmp_vec = vdupq_n_u8(15 - out_of_boundary_right);
+ vec_dup = vdupq_n_u8(*(src + limit));
+ mask_val = vcgeq_u8(indx_vec, cmp_vec);
+ src_1 = vbslq_u8(mask_val, vec_dup, src_1);
+ }
+ src_2 = vextq_u8(src_1, src_1, 1); // src_2..src_4: src_1 rotated by 1, 2 and 3 bytes
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else { // window lies fully inside the row: no lane patching needed
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ const uint8_t *src = ref + iy * stride + ix4 - 7;
+ src_1 = vld1q_u8(src);
+ src_2 = vextq_u8(src_1, src_1, 1);
+ src_3 = vextq_u8(src_2, src_2, 1);
+ src_4 = vextq_u8(src_3, src_3, 1);
+
+ horizontal_filter_neon(src_1, src_2, src_3, src_4, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ }
+
+ // vertical
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ const int16x8_t *v_src = tmp + (k + 4); // the eight rows tmp[k + 4] .. tmp[k + 11]
+
+ vertical_filter_neon(v_src, &res_lo, &res_hi, sy, gamma);
+
+ res_lo = vaddq_s32(res_lo, add_const_vert);
+ res_hi = vaddq_s32(res_hi, add_const_vert);
+
+ if (conv_params->is_compound) {
+ uint16_t *const p =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8 = &pred[(i + k + 4) * p_stride + j];
+ uint16x4_t tmp16_lo = vld1_u16(p); // value previously stored in conv_params->dst
+ int32x4_t tmp32_lo = vreinterpretq_s32_u32(vmovl_u16(tmp16_lo));
+ int16x4_t tmp16_low;
+ if (conv_params->use_jnt_comp_avg) {
+ res_lo = vmulq_s32(res_lo, bwd);
+ tmp32_lo = vmulq_s32(tmp32_lo, fwd);
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, DIST_PRECISION_BITS);
+ } else {
+ tmp32_lo = vaddq_s32(tmp32_lo, res_lo);
+ tmp16_low = vshrn_n_s32(tmp32_lo, 1);
+ }
+ int16x4_t res_low = vadd_s16(tmp16_low, res_sub_const);
+ res_low = vqrshl_s16(res_low, round_bits_vec);
+ int16x8_t final_res_low = vcombine_s16(res_low, res_low);
+ uint8x8_t res_8_low = vqmovun_s16(final_res_low);
+
+ vst1_lane_u32((uint32_t *)dst8, vreinterpret_u32_u8(res_8_low), 0); // store four 8-bit results
+ } else {
+ uint16x4_t res_u16_low = vqmovun_s32(res_lo);
+ vst1_u16(p, res_u16_low);
+ }
+ if (p_width > 4) { // columns 4-7 are only produced when the prediction is wider than 4
+ uint16_t *const p4 =
+ (uint16_t *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+ if (conv_params->do_average) {
+ uint8_t *const dst8_4 = &pred[(i + k + 4) * p_stride + j + 4];
+
+ uint16x4_t tmp16_hi = vld1_u16(p4);
+ int32x4_t tmp32_hi = vreinterpretq_s32_u32(vmovl_u16(tmp16_hi));
+ int16x4_t tmp16_high;
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = vmulq_s32(res_hi, bwd);
+ tmp32_hi = vmulq_s32(tmp32_hi, fwd);
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, DIST_PRECISION_BITS);
+ } else {
+ tmp32_hi = vaddq_s32(tmp32_hi, res_hi);
+ tmp16_high = vshrn_n_s32(tmp32_hi, 1);
+ }
+ int16x4_t res_high = vadd_s16(tmp16_high, res_sub_const);
+ res_high = vqrshl_s16(res_high, round_bits_vec);
+ int16x8_t final_res_high = vcombine_s16(res_high, res_high);
+ uint8x8_t res_8_high = vqmovun_s16(final_res_high);
+
+ vst1_lane_u32((uint32_t *)dst8_4, vreinterpret_u32_u8(res_8_high),
+ 0);
+ } else {
+ uint16x4_t res_u16_high = vqmovun_s32(res_hi);
+ vst1_u16(p4, res_u16_high);
+ }
+ }
+ } else {
+ res_lo = vrshlq_s32(res_lo, shift_vert);
+ res_hi = vrshlq_s32(res_hi, shift_vert);
+
+ result_final = vcombine_s16(vmovn_s32(res_lo), vmovn_s32(res_hi));
+ result_final = vsubq_s16(result_final, sub_constant);
+
+ uint8_t *const p = (uint8_t *)&pred[(i + k + 4) * p_stride + j];
+ uint8x8_t val = vqmovun_s16(result_final); // saturate to 0..255
+
+ if (p_width == 4) { // store only four bytes for a 4-wide prediction
+ vst1_lane_u32((uint32_t *)p, vreinterpret_u32_u8(val), 0);
+ } else {
+ vst1_u8(p, val);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/arm/wiener_convolve_neon.c b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
new file mode 100644
index 0000000000..a9bb5bcf00
--- /dev/null
+++ b/third_party/aom/av1/common/arm/wiener_convolve_neon.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_ports/mem.h"
+#include "av1/common/common.h"
+#include "av1/common/arm/convolve_neon.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/arm/transpose_neon.h"
+
+/* Wiener filter 2D (8-bit, NEON).
+ *
+ * Applies the horizontal filter to src and stores the result in a temporary
+ * 16-bit buffer, then applies the vertical filter to that buffer and writes
+ * the 8-bit result to dst.
+ *
+ * filter_x, filter_y   - kernels of which the first 7 entries are used; entry
+ *                        7 must be 0. A local copy of each kernel gets
+ *                        (1 << FILTER_BITS) added to its centre tap (index 3).
+ * x_step_q4, y_step_q4 - must both be 16; otherwise unused.
+ * w, h                 - block size. w must be a multiple of 8 and both must
+ *                        be at most MAX_SB_SIZE.
+ * conv_params          - round_0 (must be >= 1) is passed to the horizontal
+ *                        kernels, round_1 to the vertical kernels.
+ */
+void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  uint16_t *d_tmp;
+  uint8_t *d;
+  const uint8_t *src_ptr, *s_tmp;
+  uint16_t *dst_ptr;
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  int width, height;
+  const int bd = 8;
+  const int intermediate_height = h + SUBPEL_TAPS - 1;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  int16_t filter_x_tmp[7], filter_y_tmp[7];
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w % 8));
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(filter_x[7] == 0);
+  assert(filter_y[7] == 0);
+
+  /* assumption of horizontal filtering output will not exceed 15 bit.
+     ((bd) + 1 + FILTER_BITS - conv_params->round_0) <= 15
+     16 - conv_params->round_0 <= 15 -- (conv_params->round_0) >= 1
+   */
+  assert((conv_params->round_0) >= 1);
+
+  /* Copy exactly as many taps as the local arrays hold. The size is taken
+     from the destination so it cannot drift from the array declaration. */
+  memcpy(filter_x_tmp, filter_x, sizeof(filter_x_tmp));
+  memcpy(filter_y_tmp, filter_y, sizeof(filter_y_tmp));
+
+  /* "add_src": fold the source pixel into the centre tap. */
+  filter_x_tmp[3] += (1 << FILTER_BITS);
+  filter_y_tmp[3] += (1 << FILTER_BITS);
+
+  s_tmp = src - center_tap * src_stride - center_tap;
+  dst_ptr = temp;
+  src_ptr = s_tmp;
+  height = intermediate_height;
+
+  /* Horizontal pass: src -> temp (row pitch MAX_SB_SIZE). */
+
+  /* if height is a multiple of 8 */
+  if (!(h & 7)) {
+    int16x8_t res0, res1, res2, res3;
+    uint16x8_t res4;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+#if defined(__aarch64__)
+    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
+    uint8x8_t t8, t9, t10, t11, t12, t13, t14;
+
+    /* Eight rows at a time, working on transposed 8x8 tiles. */
+    do {
+      const uint8_t *s;
+
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+      __builtin_prefetch(src_ptr + 4 * src_stride);
+      __builtin_prefetch(src_ptr + 5 * src_stride);
+      __builtin_prefetch(src_ptr + 6 * src_stride);
+      __builtin_prefetch(src_ptr + 7 * src_stride);
+
+      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+      width = w;
+
+      /* dst_ptr walks the intermediate buffer, so its rows are
+         MAX_SB_SIZE apart (not dst_stride). */
+      __builtin_prefetch(dst_ptr + 0 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 1 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 2 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 3 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 4 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 5 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 6 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 7 * MAX_SB_SIZE);
+
+      do {
+        load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+        transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
+
+        /* Pixels that share a tap are pre-added; the centre pixel is
+           passed on its own. */
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
+        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
+        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                           bd, conv_params->round_0);
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
+        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                           bd, conv_params->round_0);
+
+        transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
+                          &res11);
+        store_u16_8x8(d_tmp, MAX_SB_SIZE, res4, res5, res6, res7, res8, res9,
+                      res10, res11);
+
+        /* Carry the last seven columns over to the next tile. */
+        t0 = t8;
+        t1 = t9;
+        t2 = t10;
+        t3 = t11;
+        t4 = t12;
+        t5 = t13;
+        t6 = t14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += 8 * src_stride;
+      dst_ptr += 8 * MAX_SB_SIZE;
+      height -= 8;
+    } while (height > 0);
+#else
+    uint8x8_t temp_0;
+
+    /* One row at a time, eight output pixels per inner iteration. */
+    do {
+      const uint8_t *s;
+
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      __builtin_prefetch(dst_ptr);
+
+      do {
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
+                                          bd, conv_params->round_0);
+
+        vst1q_u16(d_tmp, res4);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;
+      height--;
+    } while (height > 0);
+#endif
+  } else {
+    /* height is not a multiple of 8: four rows at a time on aarch64, one row
+       at a time otherwise */
+    const uint8_t *s;
+    int16x8_t tt0, tt1, tt2, tt3;
+    uint16x8_t d0;
+    uint8x8_t t0, t1, t2, t3;
+
+#if defined(__aarch64__)
+    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
+    uint16x8_t d1, d2, d3;
+    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+    int16x4_t s11, s12, s13, s14;
+    do {
+      __builtin_prefetch(src_ptr + 0 * src_stride);
+      __builtin_prefetch(src_ptr + 1 * src_stride);
+      __builtin_prefetch(src_ptr + 2 * src_stride);
+      __builtin_prefetch(src_ptr + 3 * src_stride);
+
+      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+      transpose_u8_8x4(&t0, &t1, &t2,
+                       &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
+
+      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+      s0 = vget_low_s16(tt0);  /*pa0 pb0 pc0 pd0 -- pixel_a0*/
+      s1 = vget_low_s16(tt1);  /*pa1 pb1 pc1 pd1 */
+      s2 = vget_low_s16(tt2);  /*pa2 pb2 pc2 pd2 */
+      s3 = vget_low_s16(tt3);  /*pa3 pb3 pc3 pd3 */
+      s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
+      s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
+      s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
+
+      /* dst_ptr walks the intermediate buffer, so its rows are
+         MAX_SB_SIZE apart (not dst_stride). */
+      __builtin_prefetch(dst_ptr + 0 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 1 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 2 * MAX_SB_SIZE);
+      __builtin_prefetch(dst_ptr + 3 * MAX_SB_SIZE);
+
+      s = src_ptr + 7;
+      d_tmp = dst_ptr;
+      width = w;
+
+      do {
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+        s7 = vget_low_s16(tt0); /*pa7  pb7  pc7  pd7  */ /*4x8*/
+        s8 = vget_low_s16(tt1);   /*pa8  pb8  pc8  pd8  */
+        s9 = vget_low_s16(tt2);   /*pa9  pb9  pc9  pd9  */
+        s10 = vget_low_s16(tt3);  /*pa10 pb10 pc10 pd10 */
+        s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
+        s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
+        s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
+        s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
+
+        res0 = wiener_convolve8_horiz_4x8(
+            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
+        res1 = wiener_convolve8_horiz_4x8(
+            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
+        res2 = wiener_convolve8_horiz_4x8(
+            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
+        res3 = wiener_convolve8_horiz_4x8(
+            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
+        res4 =
+            wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res5 =
+            wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res6 =
+            wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
+                                       filter_x_tmp, bd, conv_params->round_0);
+        res7 =
+            wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
+                                       filter_x_tmp, bd, conv_params->round_0);
+
+        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                          &res7, &d0, &d1, &d2, &d3);
+
+        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
+
+        /* Carry the last seven columns over to the next tile. */
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * MAX_SB_SIZE;
+      height -= 4;
+    } while (height > 0);
+#else
+    uint8x8_t temp_0, t4, t5, t6, t7;
+
+    do {
+      __builtin_prefetch(src_ptr);
+
+      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+
+      __builtin_prefetch(dst_ptr);
+
+      s = src_ptr + 8;
+      d_tmp = dst_ptr;
+      width = w;
+
+      do {
+        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
+        temp_0 = t0;
+        t0 = t7;
+
+        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
+
+        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
+        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
+        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
+        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
+                                        conv_params->round_0);
+
+        vst1q_u16(d_tmp, d0);
+
+        s += 8;
+        d_tmp += 8;
+        width -= 8;
+      } while (width > 0);
+
+      src_ptr += src_stride;
+      dst_ptr += MAX_SB_SIZE;
+      height -= 1;
+    } while (height > 0);
+#endif
+  }
+
+  /* Vertical pass: temp -> dst, one 8-pixel-wide column strip per outer
+     iteration. */
+  {
+    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+    uint8x8_t t0;
+#if defined(__aarch64__)
+    int16x8_t s8, s9, s10;
+    uint8x8_t t1, t2, t3;
+#endif
+    int16_t *src_tmp_ptr, *s;
+    uint8_t *dst_tmp_ptr;
+    src_tmp_ptr = (int16_t *)temp;
+    dst_tmp_ptr = dst;
+    /* From here on the source is the intermediate buffer. */
+    src_stride = MAX_SB_SIZE;
+
+    do {
+      /* Prime the window with the first six rows plus one. */
+      s = src_tmp_ptr;
+      s0 = vld1q_s16(s);
+      s += src_stride;
+      s1 = vld1q_s16(s);
+      s += src_stride;
+      s2 = vld1q_s16(s);
+      s += src_stride;
+      s3 = vld1q_s16(s);
+      s += src_stride;
+      s4 = vld1q_s16(s);
+      s += src_stride;
+      s5 = vld1q_s16(s);
+      s += src_stride;
+      s6 = vld1q_s16(s);
+      s += src_stride;
+      d = dst_tmp_ptr;
+      height = h;
+
+#if defined(__aarch64__)
+      /* Four output rows per iteration. */
+      do {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 3 * dst_stride);
+
+        s7 = vld1q_s16(s);
+        s += src_stride;
+        s8 = vld1q_s16(s);
+        s += src_stride;
+        s9 = vld1q_s16(s);
+        s += src_stride;
+        s10 = vld1q_s16(s);
+        s += src_stride;
+
+        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t1 = wiener_convolve8_vert_4x8(s1, s2, s3, s4, s5, s6, s7, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t2 = wiener_convolve8_vert_4x8(s2, s3, s4, s5, s6, s7, s8, filter_y_tmp,
+                                       bd, conv_params->round_1);
+        t3 = wiener_convolve8_vert_4x8(s3, s4, s5, s6, s7, s8, s9, filter_y_tmp,
+                                       bd, conv_params->round_1);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+        vst1_u8(d, t1);
+        d += dst_stride;
+        vst1_u8(d, t2);
+        d += dst_stride;
+        vst1_u8(d, t3);
+        d += dst_stride;
+
+        s0 = s4;
+        s1 = s5;
+        s2 = s6;
+        s3 = s7;
+        s4 = s8;
+        s5 = s9;
+        s6 = s10;
+        height -= 4;
+      } while (height > 3);
+
+      /* Remaining rows (fewer than four), one at a time. */
+      if (height != 0) {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
+
+        do {
+          s7 = vld1q_s16(s);
+          s += src_stride;
+
+          t0 =
+              wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
+                                        filter_y_tmp, bd, conv_params->round_1);
+          vst1_u8(d, t0);
+          d += dst_stride;
+
+          s0 = s1;
+          s1 = s2;
+          s2 = s3;
+          s3 = s4;
+          s4 = s5;
+          s5 = s6;
+          s6 = s7;
+          height -= 1;
+        } while (height > 0);
+      }
+
+      src_tmp_ptr += 8;
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);
+#else
+      /* One output row per iteration. */
+      do {
+        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
+
+        s7 = vld1q_s16(s);
+        s += src_stride;
+
+        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
+                                       bd, conv_params->round_1);
+
+        vst1_u8(d, t0);
+        d += dst_stride;
+
+        s0 = s1;
+        s1 = s2;
+        s2 = s3;
+        s3 = s4;
+        s4 = s5;
+        s5 = s6;
+        s6 = s7;
+        height -= 1;
+      } while (height > 0);
+
+      src_tmp_ptr += 8;
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);
+#endif
+  }
+}
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.c b/third_party/aom/av1/common/av1_inv_txfm1d.c
new file mode 100644
index 0000000000..7ef2d6d7fa
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.c
@@ -0,0 +1,1846 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+// TODO(angiebird): Make 1-d txfm functions static
+//
+/*
+ * av1_idct4_new: 4-point 1-D inverse transform (the inverse DCT, going by the
+ * function name).
+ *
+ * input: four coefficients, read only. Must not alias output (asserted):
+ * stage 1 writes output while still reading input in permuted order.
+ * output: receives the four results; also serves as scratch between stages.
+ * cos_bit: selects the table returned by cospi_arr() and is forwarded to
+ * every half_btf() call.
+ * stage_range: entries 1..3 are read and handed to av1_range_check_buf()
+ * and clamp_value().
+ *
+ * Each stage reads from bf0 and writes to bf1; the two pointers alternate
+ * between output and the local step[] buffer.
+ */
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 4;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0: no work; the input is consumed directly by stage 1.
+
+ // stage 1: copy input into output in the order 0, 2, 1, 3.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[2];
+ bf1[2] = input[1];
+ bf1[3] = input[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: half_btf() on the pairs (0, 1) and (2, 3), written to step[].
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: final sums/differences of i and 3 - i through clamp_value().
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+}
+/*
+ * av1_idct8_new: 8-point 1-D inverse transform (the inverse DCT, going by the
+ * function name).
+ *
+ * input: eight coefficients, read only. Must not alias output (asserted):
+ * stage 1 writes output while still reading input in permuted order.
+ * output: receives the eight results; also serves as scratch between stages.
+ * cos_bit: selects the table returned by cospi_arr() and is forwarded to
+ * every half_btf() call.
+ * stage_range: entries 1..5 are read and handed to av1_range_check_buf()
+ * and clamp_value().
+ *
+ * Each stage reads from bf0 and writes to bf1; the two pointers alternate
+ * between output and the local step[] buffer.
+ */
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0: no work; the input is consumed directly by stage 1.
+
+ // stage 1: copy input into output in the order 0, 4, 2, 6, 1, 5, 3, 7.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[4];
+ bf1[2] = input[2];
+ bf1[3] = input[6];
+ bf1[4] = input[1];
+ bf1[5] = input[5];
+ bf1[6] = input[3];
+ bf1[7] = input[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: 0..3 are copied; 4..7 go through half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: 0..3 go through half_btf(); 4..7 are sums/differences.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: 0..3 are sums/differences; 5 and 6 use half_btf(); 4, 7 copied.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: final sums/differences of i and 7 - i through clamp_value().
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+}
+/*
+ * av1_idct16_new: 16-point 1-D inverse transform (the inverse DCT, going by
+ * the function name).
+ *
+ * input: sixteen coefficients, read only. Must not alias output (asserted):
+ * stage 1 writes output while still reading input in permuted order.
+ * output: receives the sixteen results; also serves as scratch between
+ * stages.
+ * cos_bit: selects the table returned by cospi_arr() and is forwarded to
+ * every half_btf() call.
+ * stage_range: entries 1..7 are read and handed to av1_range_check_buf()
+ * and clamp_value().
+ *
+ * Each stage reads from bf0 and writes to bf1; the two pointers alternate
+ * between output and the local step[] buffer.
+ */
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 16;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0: no work; the input is consumed directly by stage 1.
+
+ // stage 1: copy input into output in the permuted order listed below.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[8];
+ bf1[2] = input[4];
+ bf1[3] = input[12];
+ bf1[4] = input[2];
+ bf1[5] = input[10];
+ bf1[6] = input[6];
+ bf1[7] = input[14];
+ bf1[8] = input[1];
+ bf1[9] = input[9];
+ bf1[10] = input[5];
+ bf1[11] = input[13];
+ bf1[12] = input[3];
+ bf1[13] = input[11];
+ bf1[14] = input[7];
+ bf1[15] = input[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: 0..7 are copied; 8..15 go through half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: 0..3 copied; 4..7 use half_btf(); 8..15 are sums/differences.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: half_btf() on 0..3 and on 9, 10, 13, 14; sums on 4..7.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sums/differences on 0..3 and 8..15; half_btf() on 5 and 6.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: sums/differences of i and 7 - i on 0..7; half_btf() on 10..13.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: final sums/differences of i and 15 - i through clamp_value().
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+}
+/*
+ * av1_idct32_new: 32-point 1-D inverse transform (the inverse DCT, going by
+ * the function name).
+ *
+ * input: thirty-two coefficients, read only. Must not alias output
+ * (asserted): stage 1 writes output while still reading input in permuted
+ * order.
+ * output: receives the thirty-two results; also serves as scratch between
+ * stages.
+ * cos_bit: selects the table returned by cospi_arr() and is forwarded to
+ * every half_btf() call.
+ * stage_range: entries 1..9 are read and handed to av1_range_check_buf()
+ * and clamp_value().
+ *
+ * Each stage reads from bf0 and writes to bf1; the two pointers alternate
+ * between output and the local step[] buffer.
+ */
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 32;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0: no work; the input is consumed directly by stage 1.
+
+ // stage 1: copy input into output in the permuted order listed below.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[16];
+ bf1[2] = input[8];
+ bf1[3] = input[24];
+ bf1[4] = input[4];
+ bf1[5] = input[20];
+ bf1[6] = input[12];
+ bf1[7] = input[28];
+ bf1[8] = input[2];
+ bf1[9] = input[18];
+ bf1[10] = input[10];
+ bf1[11] = input[26];
+ bf1[12] = input[6];
+ bf1[13] = input[22];
+ bf1[14] = input[14];
+ bf1[15] = input[30];
+ bf1[16] = input[1];
+ bf1[17] = input[17];
+ bf1[18] = input[9];
+ bf1[19] = input[25];
+ bf1[20] = input[5];
+ bf1[21] = input[21];
+ bf1[22] = input[13];
+ bf1[23] = input[29];
+ bf1[24] = input[3];
+ bf1[25] = input[19];
+ bf1[26] = input[11];
+ bf1[27] = input[27];
+ bf1[28] = input[7];
+ bf1[29] = input[23];
+ bf1[30] = input[15];
+ bf1[31] = input[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: 0..15 are copied; 16..31 go through half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: 0..7 copied; 8..15 use half_btf(); 16..31 are sums/differences.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: half_btf() on 4..7 and part of 16..31; sums on 8..15.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: half_btf() on 0..3 and 9, 10, 13, 14; sums on 4..7 and 16..31.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: sums on 0..3 and 8..15; half_btf() on 5, 6, 18..21 and 26..29.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: sums on 0..7 and 16..31; half_btf() on 10..13.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8: sums of i and 15 - i on 0..15; half_btf() on 20..27.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9: final sums/differences of i and 31 - i through clamp_value().
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+}
+/*
+ * av1_iadst4_new: 4-point 1-D inverse transform built on the sinpi_arr()
+ * table (the inverse ADST, going by the function name).
+ *
+ * input: four coefficients; all four are loaded into locals before any
+ * write to output.
+ * output: receives the four results. If every input is zero, the outputs
+ * are set to zero and the function returns early.
+ * cos_bit: selects the sinpi table and is the shift applied by the final
+ * round_shift() calls.
+ * stage_range: entries 1..6 are read and passed to range_check_value(),
+ * mostly with bit added because the sinpi products are not shifted down
+ * until the end.
+ */
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ int bit = cos_bit;
+ const int32_t *sinpi = sinpi_arr(bit);
+ int32_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+ int32_t x0 = input[0];
+ int32_t x1 = input[1];
+ int32_t x2 = input[2];
+ int32_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {
+ output[0] = output[1] = output[2] = output[3] = 0;
+ return;
+ }
+
+ assert(sinpi[1] + sinpi[2] == sinpi[4]);
+
+ // stage 1: multiply the inputs by sinpi[] constants (no shift yet).
+ s0 = range_check_value(sinpi[1] * x0, stage_range[1] + bit);
+ s1 = range_check_value(sinpi[2] * x0, stage_range[1] + bit);
+ s2 = range_check_value(sinpi[3] * x1, stage_range[1] + bit);
+ s3 = range_check_value(sinpi[4] * x2, stage_range[1] + bit);
+ s4 = range_check_value(sinpi[1] * x2, stage_range[1] + bit);
+ s5 = range_check_value(sinpi[2] * x3, stage_range[1] + bit);
+ s6 = range_check_value(sinpi[4] * x3, stage_range[1] + bit);
+
+ // stage 2: combine the raw inputs that feed the sinpi[3] term of stage 3.
+ // NOTICE: (x0 - x2) here may use one extra bit compared to the
+ // opt_range_row/col specified in av1_gen_inv_stage_range()
+ s7 = range_check_value((x0 - x2) + x3, stage_range[2]);
+
+ // stage 3: s3 takes over the old s2; s2 is then recomputed from s7.
+ s0 = range_check_value(s0 + s3, stage_range[3] + bit);
+ s1 = range_check_value(s1 - s4, stage_range[3] + bit);
+ s3 = range_check_value(s2, stage_range[3] + bit);
+ s2 = range_check_value(sinpi[3] * s7, stage_range[3] + bit);
+
+ // stage 4: fold the x3 products (s5, s6) into s0 and s1.
+ s0 = range_check_value(s0 + s5, stage_range[4] + bit);
+ s1 = range_check_value(s1 - s6, stage_range[4] + bit);
+
+ // stage 5: form x0..x2 and a partial x3 (finished in stage 6).
+ x0 = range_check_value(s0 + s3, stage_range[5] + bit);
+ x1 = range_check_value(s1 + s3, stage_range[5] + bit);
+ x2 = range_check_value(s2, stage_range[5] + bit);
+ x3 = range_check_value(s0 + s1, stage_range[5] + bit);
+
+ // stage 6: finish x3 by subtracting s3.
+ x3 = range_check_value(x3 - s3, stage_range[6] + bit);
+
+ output[0] = round_shift(x0, bit);
+ output[1] = round_shift(x1, bit);
+ output[2] = round_shift(x2, bit);
+ output[3] = round_shift(x3, bit);
+}
+/*
+ * av1_iadst8_new: 8-point 1-D inverse transform (the inverse ADST, going by
+ * the function name).
+ *
+ * input: eight coefficients, read only. Must not alias output (asserted):
+ * stage 1 writes output while still reading input in permuted order.
+ * output: receives the eight results; also serves as scratch between stages.
+ * cos_bit: selects the table returned by cospi_arr() and is forwarded to
+ * every half_btf() call.
+ * stage_range: entries 1..6 are read and handed to av1_range_check_buf()
+ * and clamp_value(); the last stage uses neither.
+ *
+ * Each stage reads from bf0 and writes to bf1; the two pointers alternate
+ * between output and the local step[] buffer.
+ */
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 8;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0: no work; the input is consumed directly by stage 1.
+
+ // stage 1: copy input into output in the order 7, 0, 5, 2, 3, 4, 1, 6.
+ stage++;
+ bf1 = output;
+ bf1[0] = input[7];
+ bf1[1] = input[0];
+ bf1[2] = input[5];
+ bf1[3] = input[2];
+ bf1[4] = input[3];
+ bf1[5] = input[4];
+ bf1[6] = input[1];
+ bf1[7] = input[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: half_btf() on each adjacent pair (0,1), (2,3), (4,5), (6,7).
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: sums/differences of i and i + 4 through clamp_value().
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: 0..3 are copied; (4,5) and (6,7) go through half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sums/differences of i and i + 2 within each half.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: 0, 1, 4, 5 are copied; (2,3) and (6,7) use cospi[32] half_btf().
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: reorder into output, negating the odd-indexed results.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[4];
+ bf1[2] = bf0[6];
+ bf1[3] = -bf0[2];
+ bf1[4] = bf0[3];
+ bf1[5] = -bf0[7];
+ bf1[6] = bf0[5];
+ bf1[7] = -bf0[1];
+}
+
+// 16-point inverse ADST (going by the function name), computed as a fixed
+// sequence of 9 stages that ping-pong between `output` and a local 16-entry
+// scratch buffer `step`; the final stage leaves the result in `output`.
+//
+// input: 16 values are read (indices 0..15). Must not be the same pointer
+// as `output` (asserted), because `output` doubles as working storage
+// from stage 1 onward.
+// output: receives the 16 results; it is also written by every odd-numbered
+// stage (1, 3, 5, 7, 9).
+// cos_bit: selects the cosine table through cospi_arr() and is forwarded to
+// every half_btf() call.
+// stage_range: entries [1]..[8] are read, one per stage; each is passed to
+// av1_range_check_buf() and, in the add/subtract stages, to
+// clamp_value(). Entry [0] is not read here and stage 9 uses no
+// range at all.
+//
+// NOTE(review): half_btf(w0, a, w1, b, bit) presumably returns the rounded
+// value of (w0 * a + w1 * b) >> bit, and av1_range_check_buf() is presumably
+// a debug-time range validation -- confirm against their definitions.
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 16;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0;
+
+ // stage 1;
+ // Reorder the input into `output`: even slots take input[15], input[13],
+ // ..., input[1]; odd slots take input[0], input[2], ..., input[14].
+ stage++;
+ bf1 = output;
+ bf1[0] = input[15];
+ bf1[1] = input[0];
+ bf1[2] = input[13];
+ bf1[3] = input[2];
+ bf1[4] = input[11];
+ bf1[5] = input[4];
+ bf1[6] = input[9];
+ bf1[7] = input[6];
+ bf1[8] = input[7];
+ bf1[9] = input[8];
+ bf1[10] = input[5];
+ bf1[11] = input[10];
+ bf1[12] = input[3];
+ bf1[13] = input[12];
+ bf1[14] = input[1];
+ bf1[15] = input[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ // Each adjacent pair (2k, 2k+1) is combined by two half_btf() calls that
+ // use cospi[8k + 2] and cospi[62 - 8k]; the second call of each pair swaps
+ // the weights and negates one of them.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ // Clamped sum/difference of elements 8 apart: slots 0..7 hold
+ // bf0[i] + bf0[i + 8], slots 8..15 hold bf0[i] - bf0[i + 8].
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[8], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[9], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[10], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[11], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[12], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[13], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[14], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[15], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[0] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[1] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[2] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[3] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[4] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[5] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[6] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[7] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ // Slots 0..7 pass through unchanged; slots 8..15 are combined pairwise by
+ // half_btf() with the cospi[8]/cospi[56] and cospi[40]/cospi[24] weights.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ // Clamped sum/difference of elements 4 apart, done separately inside
+ // slots 0..7 and slots 8..15.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[4], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[5], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[6], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[7], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[0] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[1] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[2] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[3] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[12], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[13], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[14], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[15], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[8] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[9] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[10] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[11] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ // Slots 4..7 and 12..15 are combined pairwise by half_btf() with the
+ // cospi[16]/cospi[48] weights; slots 0..3 and 8..11 pass through.
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ // Clamped sum/difference of elements 2 apart within each group of four
+ // consecutive slots.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[2], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[3], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[0] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[1] - bf0[3], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[6], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[7], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[4] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[5] - bf0[7], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[10], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[11], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[8] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[9] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[14], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[15], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[12] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[13] - bf0[15], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ // Within each group of four slots the lower pair passes through and the
+ // upper pair is combined by half_btf() with weights (+cospi[32], +cospi[32])
+ // and (+cospi[32], -cospi[32]).
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ // Final reordering from `step` into `output`; every odd-indexed output is
+ // negated. This stage applies no clamp and no range check.
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = -bf0[8];
+ bf1[2] = bf0[12];
+ bf1[3] = -bf0[4];
+ bf1[4] = bf0[6];
+ bf1[5] = -bf0[14];
+ bf1[6] = bf0[10];
+ bf1[7] = -bf0[2];
+ bf1[8] = bf0[3];
+ bf1[9] = -bf0[11];
+ bf1[10] = bf0[15];
+ bf1[11] = -bf0[7];
+ bf1[12] = bf0[5];
+ bf1[13] = -bf0[13];
+ bf1[14] = bf0[9];
+ bf1[15] = -bf0[1];
+}
+
+// 4-point identity transform (going by the name): for each of the 4 inputs,
+// forms the 64-bit product NewSqrt2 * input[i] and stores
+// round_shift(product, NewSqrt2Bits) in output[i].
+//
+// cos_bit is unused. stage_range is read only by the closing assert, which
+// checks stage_range[0] + NewSqrt2Bits <= 32; the (void) cast keeps the
+// parameter "used" when assert() is compiled out.
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  const int32_t *src = input;
+  int32_t *dst = output;
+  int remaining = 4;
+  while (remaining > 0) {
+    // Widen before multiplying so the product is formed in 64 bits.
+    const int64_t scaled = (int64_t)NewSqrt2 * *src;
+    *dst = round_shift(scaled, NewSqrt2Bits);
+    ++src;
+    ++dst;
+    --remaining;
+  }
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+// 8-point identity transform (going by the name): output[i] = 2 * input[i]
+// for i in 0..7. The doubling is done in 64 bits and then narrowed back to
+// int32_t. cos_bit and stage_range are unused.
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  const int32_t *const end = input + 8;
+  int32_t *dst = output;
+  for (const int32_t *src = input; src != end; ++src, ++dst) {
+    const int64_t doubled = (int64_t)*src * 2;
+    *dst = (int32_t)doubled;
+  }
+}
+
+// 16-point identity transform (going by the name): for each of the 16 inputs,
+// forms the 64-bit product NewSqrt2 * 2 * input[i] and stores
+// round_shift(product, NewSqrt2Bits) in output[i].
+//
+// cos_bit is unused. stage_range is read only by the closing assert, which
+// checks stage_range[0] + NewSqrt2Bits <= 32.
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  enum { kNumCoeffs = 16 };
+  int idx = 0;
+  while (idx < kNumCoeffs) {
+    // Same left-to-right evaluation as a single expression, in 64 bits.
+    const int64_t scaled = (int64_t)NewSqrt2 * 2 * input[idx];
+    output[idx] = round_shift(scaled, NewSqrt2Bits);
+    ++idx;
+  }
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+}
+
+// 32-point identity transform (going by the name): output[i] = 4 * input[i]
+// for i in 0..31. The multiplication is done in 64 bits and then narrowed
+// back to int32_t. cos_bit and stage_range are unused.
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  (void)cos_bit;
+  (void)stage_range;
+  for (int done = 0; done != 32; ++done) {
+    const int64_t wide = input[done];
+    output[done] = (int32_t)(wide * 4);
+  }
+}
+
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ assert(output != input);
+ const int32_t size = 64;
+ const int32_t *cospi = cospi_arr(cos_bit);
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = input[32];
+ bf1[2] = input[16];
+ bf1[3] = input[48];
+ bf1[4] = input[8];
+ bf1[5] = input[40];
+ bf1[6] = input[24];
+ bf1[7] = input[56];
+ bf1[8] = input[4];
+ bf1[9] = input[36];
+ bf1[10] = input[20];
+ bf1[11] = input[52];
+ bf1[12] = input[12];
+ bf1[13] = input[44];
+ bf1[14] = input[28];
+ bf1[15] = input[60];
+ bf1[16] = input[2];
+ bf1[17] = input[34];
+ bf1[18] = input[18];
+ bf1[19] = input[50];
+ bf1[20] = input[10];
+ bf1[21] = input[42];
+ bf1[22] = input[26];
+ bf1[23] = input[58];
+ bf1[24] = input[6];
+ bf1[25] = input[38];
+ bf1[26] = input[22];
+ bf1[27] = input[54];
+ bf1[28] = input[14];
+ bf1[29] = input[46];
+ bf1[30] = input[30];
+ bf1[31] = input[62];
+ bf1[32] = input[1];
+ bf1[33] = input[33];
+ bf1[34] = input[17];
+ bf1[35] = input[49];
+ bf1[36] = input[9];
+ bf1[37] = input[41];
+ bf1[38] = input[25];
+ bf1[39] = input[57];
+ bf1[40] = input[5];
+ bf1[41] = input[37];
+ bf1[42] = input[21];
+ bf1[43] = input[53];
+ bf1[44] = input[13];
+ bf1[45] = input[45];
+ bf1[46] = input[29];
+ bf1[47] = input[61];
+ bf1[48] = input[3];
+ bf1[49] = input[35];
+ bf1[50] = input[19];
+ bf1[51] = input[51];
+ bf1[52] = input[11];
+ bf1[53] = input[43];
+ bf1[54] = input[27];
+ bf1[55] = input[59];
+ bf1[56] = input[7];
+ bf1[57] = input[39];
+ bf1[58] = input[23];
+ bf1[59] = input[55];
+ bf1[60] = input[15];
+ bf1[61] = input[47];
+ bf1[62] = input[31];
+ bf1[63] = input[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit);
+ bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit);
+ bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit);
+ bf1[32] = clamp_value(bf0[32] + bf0[33], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[32] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(-bf0[34] + bf0[35], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[34] + bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[37], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[36] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(-bf0[38] + bf0[39], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[38] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[41], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[40] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[43], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[42] + bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[44] + bf0[45], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[44] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(-bf0[46] + bf0[47], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[46] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[49], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[48] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[51], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[50] + bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[52] + bf0[53], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[52] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[55], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[54] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[57], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[56] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[59], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[58] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[60] + bf0[61], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[60] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(-bf0[62] + bf0[63], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[62] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit);
+ bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit);
+ bf1[16] = clamp_value(bf0[16] + bf0[17], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[16] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(-bf0[18] + bf0[19], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[18] + bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[21], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[20] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(-bf0[22] + bf0[23], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[22] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[25], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[24] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[27], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[26] + bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[29], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[28] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(-bf0[30] + bf0[31], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[30] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit);
+ bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit);
+ bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit);
+ bf1[8] = clamp_value(bf0[8] + bf0[9], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[8] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(-bf0[10] + bf0[11], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[10] + bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[13], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[12] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(-bf0[14] + bf0[15], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[14] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit);
+ bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[35], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[34], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[33] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[32] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(-bf0[36] + bf0[39], stage_range[stage]);
+ bf1[37] = clamp_value(-bf0[37] + bf0[38], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[37] + bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[36] + bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[40] + bf0[43], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[41] + bf0[42], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[41] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[40] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(-bf0[44] + bf0[47], stage_range[stage]);
+ bf1[45] = clamp_value(-bf0[45] + bf0[46], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[45] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[44] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[51], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[50], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[49] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[48] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[55], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[54], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[53] + bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[52] + bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[56] + bf0[59], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[57] + bf0[58], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[57] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[56] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(-bf0[60] + bf0[63], stage_range[stage]);
+ bf1[61] = clamp_value(-bf0[61] + bf0[62], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[61] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[60] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit);
+ bf1[4] = clamp_value(bf0[4] + bf0[5], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[4] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(-bf0[6] + bf0[7], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[6] + bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[19], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[18], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[17] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[16] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(-bf0[20] + bf0[23], stage_range[stage]);
+ bf1[21] = clamp_value(-bf0[21] + bf0[22], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[21] + bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[20] + bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[27], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[26], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[25] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[24] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(-bf0[28] + bf0[31], stage_range[stage]);
+ bf1[29] = clamp_value(-bf0[29] + bf0[30], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[29] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[28] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit);
+ bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit);
+ bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit);
+ bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit);
+ bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[3], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[2], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[1] - bf0[2], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[0] - bf0[3], stage_range[stage]);
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = clamp_value(bf0[8] + bf0[11], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[10], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[9] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[8] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(-bf0[12] + bf0[15], stage_range[stage]);
+ bf1[13] = clamp_value(-bf0[13] + bf0[14], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[13] + bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[12] + bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit);
+ bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit);
+ bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit);
+ bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[39], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[38], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[37], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[36], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[35] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[34] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[33] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[32] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(-bf0[40] + bf0[47], stage_range[stage]);
+ bf1[41] = clamp_value(-bf0[41] + bf0[46], stage_range[stage]);
+ bf1[42] = clamp_value(-bf0[42] + bf0[45], stage_range[stage]);
+ bf1[43] = clamp_value(-bf0[43] + bf0[44], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[43] + bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[42] + bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[41] + bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[40] + bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[48] + bf0[55], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[49] + bf0[54], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[50] + bf0[53], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[51] + bf0[52], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[51] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[50] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[49] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[48] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(-bf0[56] + bf0[63], stage_range[stage]);
+ bf1[57] = clamp_value(-bf0[57] + bf0[62], stage_range[stage]);
+ bf1[58] = clamp_value(-bf0[58] + bf0[61], stage_range[stage]);
+ bf1[59] = clamp_value(-bf0[59] + bf0[60], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[59] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[58] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[57] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[56] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[7], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[6], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[5], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[4], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[3] - bf0[4], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[2] - bf0[5], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[1] - bf0[6], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[0] - bf0[7], stage_range[stage]);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = clamp_value(bf0[16] + bf0[23], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[22], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[21], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[20], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[19] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[18] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[17] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[16] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(-bf0[24] + bf0[31], stage_range[stage]);
+ bf1[25] = clamp_value(-bf0[25] + bf0[30], stage_range[stage]);
+ bf1[26] = clamp_value(-bf0[26] + bf0[29], stage_range[stage]);
+ bf1[27] = clamp_value(-bf0[27] + bf0[28], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[27] + bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[26] + bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[25] + bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[24] + bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit);
+ bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit);
+ bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit);
+ bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit);
+ bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit);
+ bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit);
+ bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit);
+ bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[15], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[14], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[13], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[12], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[11], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[10], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[9], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[8], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[7] - bf0[8], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[6] - bf0[9], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[5] - bf0[10], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[4] - bf0[11], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[3] - bf0[12], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[2] - bf0[13], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[1] - bf0[14], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[0] - bf0[15], stage_range[stage]);
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = clamp_value(bf0[32] + bf0[47], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[33] + bf0[46], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[34] + bf0[45], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[35] + bf0[44], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[36] + bf0[43], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[37] + bf0[42], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[38] + bf0[41], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[39] + bf0[40], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[39] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[38] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[37] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[36] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[35] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[34] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[33] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[32] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(-bf0[48] + bf0[63], stage_range[stage]);
+ bf1[49] = clamp_value(-bf0[49] + bf0[62], stage_range[stage]);
+ bf1[50] = clamp_value(-bf0[50] + bf0[61], stage_range[stage]);
+ bf1[51] = clamp_value(-bf0[51] + bf0[60], stage_range[stage]);
+ bf1[52] = clamp_value(-bf0[52] + bf0[59], stage_range[stage]);
+ bf1[53] = clamp_value(-bf0[53] + bf0[58], stage_range[stage]);
+ bf1[54] = clamp_value(-bf0[54] + bf0[57], stage_range[stage]);
+ bf1[55] = clamp_value(-bf0[55] + bf0[56], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[55] + bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[54] + bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[53] + bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[52] + bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[51] + bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[50] + bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[49] + bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[48] + bf0[63], stage_range[stage]);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = clamp_value(bf0[0] + bf0[31], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[30], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[29], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[28], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[27], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[26], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[25], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[24], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[23], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[22], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[21], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[20], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[19], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[18], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[17], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[16], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[15] - bf0[16], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[14] - bf0[17], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[13] - bf0[18], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[12] - bf0[19], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[11] - bf0[20], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[10] - bf0[21], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[9] - bf0[22], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[8] - bf0[23], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[7] - bf0[24], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[6] - bf0[25], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[5] - bf0[26], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[4] - bf0[27], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[3] - bf0[28], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[2] - bf0[29], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[1] - bf0[30], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[0] - bf0[31], stage_range[stage]);
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = clamp_value(bf0[0] + bf0[63], stage_range[stage]);
+ bf1[1] = clamp_value(bf0[1] + bf0[62], stage_range[stage]);
+ bf1[2] = clamp_value(bf0[2] + bf0[61], stage_range[stage]);
+ bf1[3] = clamp_value(bf0[3] + bf0[60], stage_range[stage]);
+ bf1[4] = clamp_value(bf0[4] + bf0[59], stage_range[stage]);
+ bf1[5] = clamp_value(bf0[5] + bf0[58], stage_range[stage]);
+ bf1[6] = clamp_value(bf0[6] + bf0[57], stage_range[stage]);
+ bf1[7] = clamp_value(bf0[7] + bf0[56], stage_range[stage]);
+ bf1[8] = clamp_value(bf0[8] + bf0[55], stage_range[stage]);
+ bf1[9] = clamp_value(bf0[9] + bf0[54], stage_range[stage]);
+ bf1[10] = clamp_value(bf0[10] + bf0[53], stage_range[stage]);
+ bf1[11] = clamp_value(bf0[11] + bf0[52], stage_range[stage]);
+ bf1[12] = clamp_value(bf0[12] + bf0[51], stage_range[stage]);
+ bf1[13] = clamp_value(bf0[13] + bf0[50], stage_range[stage]);
+ bf1[14] = clamp_value(bf0[14] + bf0[49], stage_range[stage]);
+ bf1[15] = clamp_value(bf0[15] + bf0[48], stage_range[stage]);
+ bf1[16] = clamp_value(bf0[16] + bf0[47], stage_range[stage]);
+ bf1[17] = clamp_value(bf0[17] + bf0[46], stage_range[stage]);
+ bf1[18] = clamp_value(bf0[18] + bf0[45], stage_range[stage]);
+ bf1[19] = clamp_value(bf0[19] + bf0[44], stage_range[stage]);
+ bf1[20] = clamp_value(bf0[20] + bf0[43], stage_range[stage]);
+ bf1[21] = clamp_value(bf0[21] + bf0[42], stage_range[stage]);
+ bf1[22] = clamp_value(bf0[22] + bf0[41], stage_range[stage]);
+ bf1[23] = clamp_value(bf0[23] + bf0[40], stage_range[stage]);
+ bf1[24] = clamp_value(bf0[24] + bf0[39], stage_range[stage]);
+ bf1[25] = clamp_value(bf0[25] + bf0[38], stage_range[stage]);
+ bf1[26] = clamp_value(bf0[26] + bf0[37], stage_range[stage]);
+ bf1[27] = clamp_value(bf0[27] + bf0[36], stage_range[stage]);
+ bf1[28] = clamp_value(bf0[28] + bf0[35], stage_range[stage]);
+ bf1[29] = clamp_value(bf0[29] + bf0[34], stage_range[stage]);
+ bf1[30] = clamp_value(bf0[30] + bf0[33], stage_range[stage]);
+ bf1[31] = clamp_value(bf0[31] + bf0[32], stage_range[stage]);
+ bf1[32] = clamp_value(bf0[31] - bf0[32], stage_range[stage]);
+ bf1[33] = clamp_value(bf0[30] - bf0[33], stage_range[stage]);
+ bf1[34] = clamp_value(bf0[29] - bf0[34], stage_range[stage]);
+ bf1[35] = clamp_value(bf0[28] - bf0[35], stage_range[stage]);
+ bf1[36] = clamp_value(bf0[27] - bf0[36], stage_range[stage]);
+ bf1[37] = clamp_value(bf0[26] - bf0[37], stage_range[stage]);
+ bf1[38] = clamp_value(bf0[25] - bf0[38], stage_range[stage]);
+ bf1[39] = clamp_value(bf0[24] - bf0[39], stage_range[stage]);
+ bf1[40] = clamp_value(bf0[23] - bf0[40], stage_range[stage]);
+ bf1[41] = clamp_value(bf0[22] - bf0[41], stage_range[stage]);
+ bf1[42] = clamp_value(bf0[21] - bf0[42], stage_range[stage]);
+ bf1[43] = clamp_value(bf0[20] - bf0[43], stage_range[stage]);
+ bf1[44] = clamp_value(bf0[19] - bf0[44], stage_range[stage]);
+ bf1[45] = clamp_value(bf0[18] - bf0[45], stage_range[stage]);
+ bf1[46] = clamp_value(bf0[17] - bf0[46], stage_range[stage]);
+ bf1[47] = clamp_value(bf0[16] - bf0[47], stage_range[stage]);
+ bf1[48] = clamp_value(bf0[15] - bf0[48], stage_range[stage]);
+ bf1[49] = clamp_value(bf0[14] - bf0[49], stage_range[stage]);
+ bf1[50] = clamp_value(bf0[13] - bf0[50], stage_range[stage]);
+ bf1[51] = clamp_value(bf0[12] - bf0[51], stage_range[stage]);
+ bf1[52] = clamp_value(bf0[11] - bf0[52], stage_range[stage]);
+ bf1[53] = clamp_value(bf0[10] - bf0[53], stage_range[stage]);
+ bf1[54] = clamp_value(bf0[9] - bf0[54], stage_range[stage]);
+ bf1[55] = clamp_value(bf0[8] - bf0[55], stage_range[stage]);
+ bf1[56] = clamp_value(bf0[7] - bf0[56], stage_range[stage]);
+ bf1[57] = clamp_value(bf0[6] - bf0[57], stage_range[stage]);
+ bf1[58] = clamp_value(bf0[5] - bf0[58], stage_range[stage]);
+ bf1[59] = clamp_value(bf0[4] - bf0[59], stage_range[stage]);
+ bf1[60] = clamp_value(bf0[3] - bf0[60], stage_range[stage]);
+ bf1[61] = clamp_value(bf0[2] - bf0[61], stage_range[stage]);
+ bf1[62] = clamp_value(bf0[1] - bf0[62], stage_range[stage]);
+ bf1[63] = clamp_value(bf0[0] - bf0[63], stage_range[stage]);
+}
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d.h b/third_party/aom/av1/common/av1_inv_txfm1d.h
new file mode 100644
index 0000000000..c31c019aa9
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Limits `value` to the signed `bit`-bit interval
+// [-(2^(bit-1)), 2^(bit-1) - 1] by handing those bounds to clamp64().
+// A non-positive `bit` is treated as "no clamp" and `value` is returned as is.
+static INLINE int32_t clamp_value(int32_t value, int8_t bit) {
+  if (bit <= 0) return value;
+  const int64_t half_range = 1LL << (bit - 1);
+  return (int32_t)clamp64(value, -half_range, half_range - 1);
+}
+
+// Runs clamp_value() with the same `bit` width over buf[0..size-1], in place.
+// Nothing is touched when `size` is zero or negative.
+static INLINE void clamp_buf(int32_t *buf, int32_t size, int8_t bit) {
+  int idx = 0;
+  while (idx < size) {
+    buf[idx] = clamp_value(buf[idx], bit);
+    ++idx;
+  }
+}
+
+void av1_idct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_idct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+void av1_iidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
new file mode 100644
index 0000000000..7d80a00996
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm1d_cfg.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#define AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
+#include "av1/common/av1_inv_txfm1d.h"
+
+// sum of fwd_shift_##
+static const int8_t inv_start_range[TX_SIZES_ALL] = {
+ 5, // 4x4 transform
+ 6, // 8x8 transform
+ 7, // 16x16 transform
+ 7, // 32x32 transform
+ 7, // 64x64 transform
+ 5, // 4x8 transform
+ 5, // 8x4 transform
+ 6, // 8x16 transform
+ 6, // 16x8 transform
+ 6, // 16x32 transform
+ 6, // 32x16 transform
+ 6, // 32x64 transform
+ 6, // 64x32 transform
+ 6, // 4x16 transform
+ 6, // 16x4 transform
+ 7, // 8x32 transform
+ 7, // 32x8 transform
+ 7, // 16x64 transform
+ 7, // 64x16 transform
+};
+
+extern const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL];
+
+// Values in both inv_cos_bit_col and inv_cos_bit_row are always 12
+// for each valid row and col combination
+#define INV_COS_BIT 12
+extern const int8_t inv_cos_bit_col[5 /*row*/][5 /*col*/];
+extern const int8_t inv_cos_bit_row[5 /*row*/][5 /*col*/];
+
+#endif // AOM_AV1_COMMON_AV1_INV_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/common/av1_inv_txfm2d.c b/third_party/aom/av1/common/av1_inv_txfm2d.c
new file mode 100644
index 0000000000..4e69443145
--- /dev/null
+++ b/third_party/aom/av1/common/av1_inv_txfm2d.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/av1_inv_txfm1d.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+
+// Inverse 4x4 Walsh-Hadamard transform of `input` (16 coefficients, row
+// major), added into the high bit-depth destination behind `dest8`.
+//
+// Pass 1 transforms each row of `input` (after shifting every coefficient
+// right by UNIT_QUANT_SHIFT) into a local 4x4 buffer. Pass 2 transforms each
+// column of that buffer and accumulates the result into the destination via
+// highbd_clip_pixel_add(). The order of the butterfly updates matters because
+// a1/b1/c1/d1 are rewritten in place.
+void av1_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t output[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Pass 1: rows.
+  for (int row = 0; row < 4; ++row) {
+    const tran_low_t *src = input + 4 * row;
+    tran_low_t *dst = output + 4 * row;
+    tran_low_t a1 = src[0] >> UNIT_QUANT_SHIFT;
+    tran_low_t c1 = src[1] >> UNIT_QUANT_SHIFT;
+    tran_low_t d1 = src[2] >> UNIT_QUANT_SHIFT;
+    tran_low_t b1 = src[3] >> UNIT_QUANT_SHIFT;
+
+    a1 += c1;
+    d1 -= b1;
+    const tran_low_t e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+
+    dst[0] = a1;
+    dst[1] = b1;
+    dst[2] = c1;
+    dst[3] = d1;
+  }
+
+  // Pass 2: columns, accumulated into the destination.
+  for (int col = 0; col < 4; ++col) {
+    const tran_low_t *src = output + col;
+    uint16_t *dst = dest + col;
+    tran_low_t a1 = src[4 * 0];
+    tran_low_t c1 = src[4 * 1];
+    tran_low_t d1 = src[4 * 2];
+    tran_low_t b1 = src[4 * 3];
+
+    a1 += c1;
+    d1 -= b1;
+    const tran_low_t e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+
+    range_check_value(a1, bd + 1);
+    range_check_value(b1, bd + 1);
+    range_check_value(c1, bd + 1);
+    range_check_value(d1, bd + 1);
+
+    dst[stride * 0] = highbd_clip_pixel_add(dst[stride * 0], a1, bd);
+    dst[stride * 1] = highbd_clip_pixel_add(dst[stride * 1], b1, bd);
+    dst[stride * 2] = highbd_clip_pixel_add(dst[stride * 2], c1, bd);
+    dst[stride * 3] = highbd_clip_pixel_add(dst[stride * 3], d1, bd);
+  }
+}
+
+// DC-only variant of the inverse 4x4 Walsh-Hadamard transform: only in[0] is
+// read. The row pass reduces to splitting the (shifted) DC value into two
+// halves; the column pass splits each of the four row outputs again and adds
+// the pieces into the high bit-depth destination behind `dest8`.
+void av1_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                                int dest_stride, int bd) {
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Row pass on the single non-zero coefficient.
+  const tran_low_t dc = in[0] >> UNIT_QUANT_SHIFT;
+  const tran_low_t dc_half = dc >> 1;
+  const tran_low_t tmp[4] = { dc - dc_half, dc_half, dc_half, dc_half };
+
+  // Column pass: the first output row gets the remainder, rows 1..3 the half.
+  for (int col = 0; col < 4; ++col) {
+    const tran_low_t low = tmp[col] >> 1;
+    const tran_low_t high = tmp[col] - low;
+    uint16_t *px = dest + col;
+    px[dest_stride * 0] = highbd_clip_pixel_add(px[dest_stride * 0], high, bd);
+    px[dest_stride * 1] = highbd_clip_pixel_add(px[dest_stride * 1], low, bd);
+    px[dest_stride * 2] = highbd_clip_pixel_add(px[dest_stride * 2], low, bd);
+    px[dest_stride * 3] = highbd_clip_pixel_add(px[dest_stride * 3], low, bd);
+  }
+}
+
+// Maps a 1D transform type to the C function implementing its inverse.
+// Unknown types trip assert(0) and yield NULL when asserts are compiled out.
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  TxfmFunc func = NULL;
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4: func = av1_idct4_new; break;
+    case TXFM_TYPE_DCT8: func = av1_idct8_new; break;
+    case TXFM_TYPE_DCT16: func = av1_idct16_new; break;
+    case TXFM_TYPE_DCT32: func = av1_idct32_new; break;
+    case TXFM_TYPE_DCT64: func = av1_idct64_new; break;
+    case TXFM_TYPE_ADST4: func = av1_iadst4_new; break;
+    case TXFM_TYPE_ADST8: func = av1_iadst8_new; break;
+    case TXFM_TYPE_ADST16: func = av1_iadst16_new; break;
+    case TXFM_TYPE_IDENTITY4: func = av1_iidentity4_c; break;
+    case TXFM_TYPE_IDENTITY8: func = av1_iidentity8_c; break;
+    case TXFM_TYPE_IDENTITY16: func = av1_iidentity16_c; break;
+    case TXFM_TYPE_IDENTITY32: func = av1_iidentity32_c; break;
+    default: assert(0); break;
+  }
+  return func;
+}
+
+static const int8_t inv_shift_4x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x16[2] = { -2, -4 };
+static const int8_t inv_shift_32x32[2] = { -2, -4 };
+static const int8_t inv_shift_64x64[2] = { -2, -4 };
+static const int8_t inv_shift_4x8[2] = { 0, -4 };
+static const int8_t inv_shift_8x4[2] = { 0, -4 };
+static const int8_t inv_shift_8x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x32[2] = { -1, -4 };
+static const int8_t inv_shift_32x16[2] = { -1, -4 };
+static const int8_t inv_shift_32x64[2] = { -1, -4 };
+static const int8_t inv_shift_64x32[2] = { -1, -4 };
+static const int8_t inv_shift_4x16[2] = { -1, -4 };
+static const int8_t inv_shift_16x4[2] = { -1, -4 };
+static const int8_t inv_shift_8x32[2] = { -2, -4 };
+static const int8_t inv_shift_32x8[2] = { -2, -4 };
+static const int8_t inv_shift_16x64[2] = { -2, -4 };
+static const int8_t inv_shift_64x16[2] = { -2, -4 };
+
+const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
+ inv_shift_4x4, inv_shift_8x8, inv_shift_16x16, inv_shift_32x32,
+ inv_shift_64x64, inv_shift_4x8, inv_shift_8x4, inv_shift_8x16,
+ inv_shift_16x8, inv_shift_16x32, inv_shift_32x16, inv_shift_32x64,
+ inv_shift_64x32, inv_shift_4x16, inv_shift_16x4, inv_shift_8x32,
+ inv_shift_32x8, inv_shift_16x64, inv_shift_64x16,
+};
+
+/* clang-format off */
+const int8_t inv_cos_bit_col[MAX_TXWH_IDX] // txw_idx
+ [MAX_TXWH_IDX] = { // txh_idx
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+ };
+
+const int8_t inv_cos_bit_row[MAX_TXWH_IDX] // txw_idx
+ [MAX_TXWH_IDX] = { // txh_idx
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, 0 },
+ { INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT },
+ { 0, 0, INV_COS_BIT, INV_COS_BIT, INV_COS_BIT }
+ };
+/* clang-format on */
+
+const int8_t iadst4_range[7] = { 0, 1, 0, 0, 0, 0, 0 };
+
+// Populates `cfg` with everything the inverse 2D transform needs for the
+// given transform type and size:
+//   - tx_size and the flip flags (via set_flip_cfg()),
+//   - the per-size shift pair from inv_txfm_shift_ls,
+//   - the row/column cosine bit precision,
+//   - the row/column 1D transform types and their stage counts,
+//   - stage_range_row/col, zeroed except for ADST4, which copies
+//     iadst4_range.
+// `cfg` must not be NULL.
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+  cfg->tx_size = tx_size;
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+  // One call is enough: nothing between here and the end of the function
+  // writes the flip fields again.
+  set_flip_cfg(tx_type, cfg);
+  const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+  const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+  cfg->shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  cfg->cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  cfg->cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  // The column transform runs along the height, the row transform along the
+  // width, hence the swapped size indices below.
+  cfg->txfm_type_col = av1_txfm_type_ls[txh_idx][tx_type_1d_col];
+  if (cfg->txfm_type_col == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_col, iadst4_range, sizeof(iadst4_range));
+  }
+  cfg->txfm_type_row = av1_txfm_type_ls[txw_idx][tx_type_1d_row];
+  if (cfg->txfm_type_row == TXFM_TYPE_ADST4) {
+    memcpy(cfg->stage_range_row, iadst4_range, sizeof(iadst4_range));
+  }
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+}
+
+// Fills stage_range_row[] / stage_range_col[] (one entry per transform stage)
+// with the clamp width used by the 1D inverse transforms. Every stage gets the
+// same bit-depth dependent width:
+//   bd == 8  -> row 16, col 16
+//   bd == 10 -> row 18, col 16
+//   otherwise (asserted to be 12) -> row 20, col 18
+// In debug builds each stage is also checked against the range derived from
+// cfg->stage_range_*, the forward shift sum and `bd`; stage 1 of ADST4 is
+// exempt because it may need one extra bit.
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+                             const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+                             int bd) {
+  const int fwd_shift = inv_start_range[tx_size];
+  const int8_t *shift = cfg->shift;
+  int8_t opt_range_row;
+  int8_t opt_range_col;
+  switch (bd) {
+    case 8:
+      opt_range_row = 16;
+      opt_range_col = 16;
+      break;
+    case 10:
+      opt_range_row = 18;
+      opt_range_col = 16;
+      break;
+    default:
+      assert(bd == 12);
+      opt_range_row = 20;
+      opt_range_col = 18;
+      break;
+  }
+
+  // The extra `i < MAX_TXFM_STAGE_NUM` bound silences array-bounds warnings.
+  for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+    const int real_range_row = cfg->stage_range_row[i] + fwd_shift + bd + 1;
+    // ADST4 may use 1 extra bit on top of opt_range_row at stage 1, so
+    // opt_range_row >= real_range_row is not required to hold there.
+    const int adst4_stage1 = cfg->txfm_type_row == TXFM_TYPE_ADST4 && i == 1;
+    (void)real_range_row;
+    (void)adst4_stage1;
+    assert(adst4_stage1 || opt_range_row >= real_range_row);
+    stage_range_row[i] = opt_range_row;
+  }
+
+  // The extra `i < MAX_TXFM_STAGE_NUM` bound silences array-bounds warnings.
+  for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+    const int real_range_col =
+        cfg->stage_range_col[i] + fwd_shift + shift[0] + bd + 1;
+    // ADST4 may use 1 extra bit on top of opt_range_col at stage 1, so
+    // opt_range_col >= real_range_col is not required to hold there.
+    const int adst4_stage1 = cfg->txfm_type_col == TXFM_TYPE_ADST4 && i == 1;
+    (void)real_range_col;
+    (void)adst4_stage1;
+    assert(adst4_stage1 || opt_range_col >= real_range_col);
+    stage_range_col[i] = opt_range_col;
+  }
+}
+
+// Generic C inverse 2D transform: runs the 1D row transform on every row of
+// `input`, then the 1D column transform on every column of the intermediate
+// result, and adds the outcome to the high bit-depth picture at `output`.
+//
+//   input    - coefficients, txfm_size_col values per row, row after row
+//   output   - destination samples, `stride` entries per row
+//   cfg      - transform configuration (size, shifts, cos bits, 1D types,
+//              flip flags)
+//   txfm_buf - scratch of at least rows * cols + 2 * max(rows, cols) entries
+//   tx_size  - forwarded to av1_gen_inv_stage_range()
+//   bd       - bit depth
+static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
+                                    int stride, TXFM_2D_FLIP_CFG *cfg,
+                                    int32_t *txfm_buf, TX_SIZE tx_size,
+                                    int bd) {
+  // Width and height both come from the size stored in cfg.
+  const int txfm_size_col = tx_size_wide[cfg->tx_size];
+  const int txfm_size_row = tx_size_high[cfg->tx_size];
+  const int8_t *shift = cfg->shift;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+  int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+  assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+  assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+  av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
+
+  const int8_t cos_bit_col = cfg->cos_bit_col;
+  const int8_t cos_bit_row = cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->txfm_type_row);
+
+  // Scratch layout: [temp_in | temp_out | buf], where the two line buffers
+  // are max(rows, cols) long and buf holds the rows * cols intermediate block.
+  const int line_len = AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *const temp_in = txfm_buf;
+  int32_t *const temp_out = temp_in + line_len;
+  int32_t *const buf = temp_out + line_len;
+
+  // When the rectangular log ratio is +/-1 the row input is scaled by
+  // NewInvSqrt2 before the transform; other shapes are copied unchanged.
+  const int scale_rows = abs(rect_type) == 1;
+
+  // Rows
+  int32_t *row_out = buf;
+  for (int r = 0; r < txfm_size_row; ++r) {
+    if (scale_rows) {
+      for (int c = 0; c < txfm_size_col; ++c) {
+        temp_in[c] = round_shift((int64_t)input[c] * NewInvSqrt2, NewSqrt2Bits);
+      }
+    } else {
+      for (int c = 0; c < txfm_size_col; ++c) {
+        temp_in[c] = input[c];
+      }
+    }
+    clamp_buf(temp_in, txfm_size_col, bd + 8);
+    txfm_func_row(temp_in, row_out, cos_bit_row, stage_range_row);
+    av1_round_shift_array(row_out, txfm_size_col, -shift[0]);
+    input += txfm_size_col;
+    row_out += txfm_size_col;
+  }
+
+  // Columns
+  for (int c = 0; c < txfm_size_col; ++c) {
+    // A left/right flip reads the mirrored column of the intermediate block.
+    const int src_c = (cfg->lr_flip == 0) ? c : (txfm_size_col - c - 1);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      temp_in[r] = buf[r * txfm_size_col + src_c];
+    }
+    clamp_buf(temp_in, txfm_size_row, AOMMAX(bd + 6, 16));
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+    for (int r = 0; r < txfm_size_row; ++r) {
+      // An up/down flip reads the transformed column bottom to top.
+      const int src_r = (cfg->ud_flip == 0) ? r : (txfm_size_row - r - 1);
+      output[r * stride + c] =
+          highbd_clip_pixel_add(output[r * stride + c], temp_out[src_r], bd);
+    }
+  }
+}
+
+// Looks up the inverse-transform configuration for (tx_type, tx_size) and
+// hands it to inv_txfm2d_add_c(), which adds the transformed values into
+// |output|. |txfm_buf| is caller-supplied scratch space.
+static INLINE void inv_txfm2d_add_facade(const int32_t *input, uint16_t *output,
+                                         int stride, int32_t *txfm_buf,
+                                         TX_TYPE tx_type, TX_SIZE tx_size,
+                                         int bd) {
+  TXFM_2D_FLIP_CFG flip_cfg;
+  av1_get_inv_txfm_cfg(tx_type, tx_size, &flip_cfg);
+  // Forward shift sum uses larger square size, to be consistent with what
+  // av1_gen_inv_stage_range() does for inverse shifts.
+  inv_txfm2d_add_c(input, output, stride, &flip_cfg, txfm_buf, tx_size, bd);
+}
+
+// Inverse 2D transform + add for TX_4X8 blocks.
+void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
+                              int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 4 * 8 intermediate values plus two temp lines of 8 entries each.
+  DECLARE_ALIGNED(32, int, scratch[(4 * 8) + 2 * 8]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_4X8, bd);
+}
+
+// Inverse 2D transform + add for TX_8X4 blocks.
+void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
+                              int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 8 * 4 intermediate values plus two temp lines of 8 entries each.
+  DECLARE_ALIGNED(32, int, scratch[(8 * 4) + 2 * 8]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_8X4, bd);
+}
+
+// Inverse 2D transform + add for TX_8X16 blocks.
+void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 8 * 16 intermediate values plus two temp lines of 16 entries.
+  DECLARE_ALIGNED(32, int, scratch[(8 * 16) + 2 * 16]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_8X16, bd);
+}
+
+// Inverse 2D transform + add for TX_16X8 blocks.
+void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 16 * 8 intermediate values plus two temp lines of 16 entries.
+  DECLARE_ALIGNED(32, int, scratch[(16 * 8) + 2 * 16]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_16X8, bd);
+}
+
+// Inverse 2D transform + add for TX_16X32 blocks.
+void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 16 * 32 intermediate values plus two temp lines of 32 entries.
+  DECLARE_ALIGNED(32, int, scratch[(16 * 32) + 2 * 32]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_16X32, bd);
+}
+
+// Inverse 2D transform + add for TX_32X16 blocks.
+void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 32 * 16 intermediate values plus two temp lines of 32 entries.
+  DECLARE_ALIGNED(32, int, scratch[(32 * 16) + 2 * 32]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_32X16, bd);
+}
+
+// Inverse 2D transform + add for TX_4X4 blocks.
+void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+                              int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 4 * 4 intermediate values plus two temp lines of 4 entries each.
+  DECLARE_ALIGNED(32, int, scratch[(4 * 4) + 2 * 4]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_4X4, bd);
+}
+
+// Inverse 2D transform + add for TX_8X8 blocks.
+void av1_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+                              int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 8 * 8 intermediate values plus two temp lines of 8 entries each.
+  DECLARE_ALIGNED(32, int, scratch[(8 * 8) + 2 * 8]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_8X8, bd);
+}
+
+// Inverse 2D transform + add for TX_16X16 blocks.
+void av1_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 16 * 16 intermediate values plus two temp lines of 16 entries.
+  DECLARE_ALIGNED(32, int, scratch[(16 * 16) + 2 * 16]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_16X16, bd);
+}
+
+// Inverse 2D transform + add for TX_32X32 blocks.
+void av1_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 32 * 32 intermediate values plus two temp lines of 32 entries.
+  DECLARE_ALIGNED(32, int, scratch[(32 * 32) + 2 * 32]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_32X32, bd);
+}
+
+// Inverse 2D transform + add for TX_64X64 blocks.
+// |input| carries only the top-left 32x32 values (row stride 32); they are
+// expanded into a zero-padded 64x64 array before the transform runs.
+void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  // TODO(urvang): Can the same array be reused, instead of using a new array?
+  int32_t mod_input[64 * 64];
+  int32_t *dst_row = mod_input;
+  const int32_t *src_row = input;
+  for (int row = 0; row < 32; ++row, dst_row += 64, src_row += 32) {
+    // Left half of the row comes from |input|, right half is zero.
+    memcpy(dst_row, src_row, 32 * sizeof(*mod_input));
+    memset(dst_row + 32, 0, 32 * sizeof(*mod_input));
+  }
+  // |dst_row| now points at row 32; the bottom 32 rows are all zero.
+  memset(dst_row, 0, 32 * 64 * sizeof(*mod_input));
+  DECLARE_ALIGNED(32, int, scratch[(64 * 64) + 2 * 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, scratch, tx_type, TX_64X64,
+                        bd);
+}
+
+// Inverse 2D transform + add for TX_64X32 blocks.
+// |input| carries only the left 32 columns of each of the 32 rows (row
+// stride 32); the right half of every 64-wide row is zero-filled.
+void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  int32_t mod_input[64 * 32];
+  int32_t *dst_row = mod_input;
+  const int32_t *src_row = input;
+  for (int row = 0; row < 32; ++row, dst_row += 64, src_row += 32) {
+    memcpy(dst_row, src_row, 32 * sizeof(*mod_input));
+    memset(dst_row + 32, 0, 32 * sizeof(*mod_input));
+  }
+  DECLARE_ALIGNED(32, int, scratch[(64 * 32) + 2 * 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, scratch, tx_type, TX_64X32,
+                        bd);
+}
+
+// Inverse 2D transform + add for TX_32X64 blocks.
+// |input| carries only the top 32 rows of 32 values; the bottom 32 rows of
+// the 32x64 working array are zero-filled.
+void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  int32_t mod_input[32 * 64];
+  const size_t coded_bytes = 32 * 32 * sizeof(*mod_input);
+  memcpy(mod_input, input, coded_bytes);
+  // Everything after the copied 32x32 region is zero.
+  memset(mod_input + 32 * 32, 0, sizeof(mod_input) - coded_bytes);
+  DECLARE_ALIGNED(32, int, scratch[(64 * 32) + 2 * 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, scratch, tx_type, TX_32X64,
+                        bd);
+}
+
+// Inverse 2D transform + add for TX_16X64 blocks.
+// |input| carries only the top 32 rows of 16 values; the bottom 32 rows of
+// the 16x64 working array are zero-filled.
+void av1_inv_txfm2d_add_16x64_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  int32_t mod_input[16 * 64];
+  const size_t coded_bytes = 16 * 32 * sizeof(*mod_input);
+  memcpy(mod_input, input, coded_bytes);
+  // Everything after the copied 16x32 region is zero.
+  memset(mod_input + 16 * 32, 0, sizeof(mod_input) - coded_bytes);
+  DECLARE_ALIGNED(32, int, scratch[(16 * 64) + 2 * 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, scratch, tx_type, TX_16X64,
+                        bd);
+}
+
+// Inverse 2D transform + add for TX_64X16 blocks.
+// |input| carries only the left 32 columns of each of the 16 rows (row
+// stride 32); the right half of every 64-wide row is zero-filled.
+void av1_inv_txfm2d_add_64x16_c(const int32_t *input, uint16_t *output,
+                                int stride, TX_TYPE tx_type, int bd) {
+  int32_t mod_input[64 * 16];
+  int32_t *dst_row = mod_input;
+  const int32_t *src_row = input;
+  for (int row = 0; row < 16; ++row, dst_row += 64, src_row += 32) {
+    memcpy(dst_row, src_row, 32 * sizeof(*mod_input));
+    memset(dst_row + 32, 0, 32 * sizeof(*mod_input));
+  }
+  DECLARE_ALIGNED(32, int, scratch[(16 * 64) + 2 * 64]);
+  inv_txfm2d_add_facade(mod_input, output, stride, scratch, tx_type, TX_64X16,
+                        bd);
+}
+
+// Inverse 2D transform + add for TX_4X16 blocks.
+void av1_inv_txfm2d_add_4x16_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 4 * 16 intermediate values plus two temp lines of 16 entries.
+  DECLARE_ALIGNED(32, int, scratch[(4 * 16) + 2 * 16]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_4X16, bd);
+}
+
+// Inverse 2D transform + add for TX_16X4 blocks.
+void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 4 * 16 intermediate values plus two temp lines of 16 entries.
+  DECLARE_ALIGNED(32, int, scratch[(4 * 16) + 2 * 16]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_16X4, bd);
+}
+
+// Inverse 2D transform + add for TX_8X32 blocks.
+void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 8 * 32 intermediate values plus two temp lines of 32 entries.
+  DECLARE_ALIGNED(32, int, scratch[(8 * 32) + 2 * 32]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_8X32, bd);
+}
+
+// Inverse 2D transform + add for TX_32X8 blocks.
+void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
+                               int stride, TX_TYPE tx_type, int bd) {
+  // Scratch: 8 * 32 intermediate values plus two temp lines of 32 entries.
+  DECLARE_ALIGNED(32, int, scratch[(8 * 32) + 2 * 32]);
+  inv_txfm2d_add_facade(input, output, stride, scratch, tx_type, TX_32X8, bd);
+}
diff --git a/third_party/aom/av1/common/av1_loopfilter.c b/third_party/aom/av1/common/av1_loopfilter.c
new file mode 100644
index 0000000000..537d8dfe92
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.c
@@ -0,0 +1,2377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+
+// Segment feature id that carries the loop-filter level adjustment, indexed
+// by [plane][dir_idx]. Read by get_filter_level().
+static const SEG_LVL_FEATURES seg_lvl_lf_lut[MAX_MB_PLANE][2] = {
+ { SEG_LVL_ALT_LF_Y_V, SEG_LVL_ALT_LF_Y_H },
+ { SEG_LVL_ALT_LF_U, SEG_LVL_ALT_LF_U },
+ { SEG_LVL_ALT_LF_V, SEG_LVL_ALT_LF_V }
+};
+
+// Index into MB_MODE_INFO::delta_lf for each [plane][dir_idx]; used by
+// get_filter_level() when cm->delta_lf_multi is set.
+static const int delta_lf_id_lut[MAX_MB_PLANE][2] = {
+ { 0, 1 }, { 2, 2 }, { 3, 3 }
+};
+
+// Edge direction identifiers.
+// NOTE(review): presumably these match the dir_idx used to index the tables
+// above (0 = vertical, 1 = horizontal) -- confirm against callers.
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
+
+// Maps a prediction mode (mbmi->mode) to the index used for
+// cm->lf.mode_deltas and for the last dimension of lfi_n->lvl.
+// av1_loop_filter_init() asserts that this table has MB_MODE_COUNT entries.
+static const int mode_lf_lut[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // INTRA_MODES
+ 1, 1, 0, 1, // INTER_MODES (GLOBALMV == 0)
+ 1, 1, 1, 1, 1, 1, 0, 1 // INTER_COMPOUND_MODES (GLOBAL_GLOBALMV == 0)
+};
+
+#if LOOP_FILTER_BITMASK
+// 256 bit masks (64x64 / 4x4) for left transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the left border of a 4x4 block boundary.
+//
+// In the case of TX_8x8 (low order byte first) we end up with
+// a mask that looks like this (-- and | are used for better view)
+//
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// -----------------
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+// 10101010|10101010
+//
+// A loopfilter should be applied to every other 4x4 horizontally.
+
+// 256 bit masks (64x64 / 4x4) for above transform size for Y plane.
+// We use 4 uint64_t to represent the 256 bit.
+// Each 1 represents a position where we should apply a loop filter
+// across the top border of a 4x4 block boundary.
+//
+// In the case of TX_8x8 (low order byte first) we end up with
+// a mask that looks like this
+//
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// -----------------
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+// 11111111|11111111
+// 00000000|00000000
+//
+// A loopfilter should be applied to every other 4x4 vertically.
+
+// Per-transform-size lookup tables indexed by block size. Each table holds
+// BLOCK_SIZES_ALL entries; a value of -1 marks a block size that has no id
+// for that transform size.
+// NOTE(review): the non-negative ids presumably select an entry within the
+// matching transform-size group of left_mask_univariant_reordered /
+// above_mask_univariant_reordered -- confirm against the code that reads
+// these tables.
+
+// Ids for TX_4X4.
+const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1, -1, -1, 13, 14, 15, 16, 17, 18
+};
+
+// Ids for TX_8X8.
+const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, 10, 11, 12, 13
+};
+
+// Ids for TX_16X16.
+const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL] = {
+ -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, -1, 7, 8
+};
+
+// Ids for TX_32X32.
+const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL] = { -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, 0, 1, 2,
+ 3, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1 };
+
+// Left-border filter masks: one FilterMask (four 64-bit words) for each
+// (block size, transform size) pair named in the per-entry comments.
+// As described in the layout notes earlier in this file, each set bit marks
+// a 4x4 position whose left border should be loop filtered.
+// Entries are grouped by transform size: TX_4X4, TX_8X8, TX_16X16, TX_32X32,
+// TX_64X64, then the 2:1/1:2 sizes and the 4:1/1:4 sizes (67 in total).
+const FilterMask left_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x0000000000050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x0055005500550055ULL, 0x0055005500550055ULL, 0x0055005500550055ULL,
+ 0x0055005500550055ULL } }, // block size 32X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x5555555555555555ULL, 0x5555555555555555ULL, 0x5555555555555555ULL,
+ 0x5555555555555555ULL } }, // block size 64X64, TX_8X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x0000000000550055ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0005000500050005ULL, 0x0005000500050005ULL, 0x0005000500050005ULL,
+ 0x0005000500050005ULL } }, // block size 16X64, TX_8X8
+ { { 0x5555555555555555ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x0011001100110011ULL, 0x0011001100110011ULL, 0x0011001100110011ULL,
+ 0x0011001100110011ULL } }, // block size 32X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x1111111111111111ULL, 0x1111111111111111ULL, 0x1111111111111111ULL,
+ 0x1111111111111111ULL } }, // block size 64X64, TX_16X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X16
+ { { 0x1111111111111111ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ // NOTE(review): the next entry sets bit 8 of every 16-bit group
+ // (0x0101...), whereas the 32X32/TX_32X32 and 32X64/TX_32X64 entries set
+ // only bit 0 (0x0001...). Confirm that this pattern is intended.
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 32X64, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x0101010101010101ULL, 0x0101010101010101ULL, 0x0101010101010101ULL,
+ 0x0101010101010101ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x0000000000000005ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x0000000000110011ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X32
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x0101010101010101ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 32X64, TX_32X64
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL,
+ 0x0001000100010001ULL } }, // block size 16X64, TX_16X64
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+// Top-border filter masks: one FilterMask (four 64-bit words) for each
+// (block size, transform size) pair named in the per-entry comments.
+// As described in the layout notes earlier in this file, each set bit marks
+// a 4x4 position whose top border should be loop filtered.
+// Entries follow the same order as left_mask_univariant_reordered: TX_4X4,
+// TX_8X8, TX_16X16, TX_32X32, TX_64X64, then the 2:1/1:2 sizes and the
+// 4:1/1:4 sizes (67 in total).
+const FilterMask above_mask_univariant_reordered[67] = {
+ // TX_4X4
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X4, TX_4X4
+ { { 0x0000000000010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_4X4
+ { { 0x0000000000030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_4X4
+ { { 0x0003000300030003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_4X4
+ { { 0x00000000000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_4X4
+ { { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL,
+ 0x00ff00ff00ff00ffULL } }, // block size 32X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_4X4
+ { { 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL,
+ 0xffffffffffffffffULL } }, // block size 64X64, TX_4X4
+ { { 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_4X4
+ { { 0x0003000300030003ULL, 0x0003000300030003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_4X4
+ { { 0x0000000000ff00ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_4X4
+ { { 0x000f000f000f000fULL, 0x000f000f000f000fULL, 0x000f000f000f000fULL,
+ 0x000f000f000f000fULL } }, // block size 16X64, TX_4X4
+ { { 0xffffffffffffffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_4X4
+ // TX_8X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X8, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_8X8
+ { { 0x000000ff000000ffULL, 0x000000ff000000ffULL, 0x000000ff000000ffULL,
+ 0x000000ff000000ffULL } }, // block size 32X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL, 0x0000ffff0000ffffULL,
+ 0x0000ffff0000ffffULL } }, // block size 64X64, TX_8X8
+ { { 0x0000000300000003ULL, 0x0000000300000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_8X8
+ { { 0x0000000f0000000fULL, 0x0000000f0000000fULL, 0x0000000f0000000fULL,
+ 0x0000000f0000000fULL } }, // block size 16X64, TX_8X8
+ { { 0x0000ffff0000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_8X8
+ // TX_16X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X16, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_16X16
+ { { 0x00000000000000ffULL, 0x00000000000000ffULL, 0x00000000000000ffULL,
+ 0x00000000000000ffULL } }, // block size 32X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_16X16
+ { { 0x000000000000ffffULL, 0x000000000000ffffULL, 0x000000000000ffffULL,
+ 0x000000000000ffffULL } }, // block size 64X64, TX_16X16
+ { { 0x000000000000000fULL, 0x000000000000000fULL, 0x000000000000000fULL,
+ 0x000000000000000fULL } }, // block size 16X64, TX_16X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_16X16
+ // TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X32, TX_32X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x00000000000000ffULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_32X32
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x000000000000ffffULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_32X32
+ // TX_64X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X64, TX_64X64
+ // 2:1, 1:2 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X8, TX_4X8
+ { { 0x0000000100000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X8
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X4, TX_8X4
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_8X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X16, TX_8X16
+ { { 0x0000000000000003ULL, 0x0000000000000003ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X8, TX_16X8
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_16X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X32, TX_16X32
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x000000000000000fULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X16, TX_32X16
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_32X16
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X64, TX_32X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X32, TX_64X32
+ // 4:1, 1:4 transform sizes.
+ { { 0x0000000000000001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 4X16, TX_4X16
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X4, TX_16X4
+ { { 0x0000000000000003ULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 8X32, TX_8X32
+ { { 0x00000000000000ffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 32X8, TX_32X8
+ { { 0x000000000000000fULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 16X64, TX_16X64
+ { { 0x000000000000ffffULL, 0x0000000000000000ULL, 0x0000000000000000ULL,
+ 0x0000000000000000ULL } }, // block size 64X16, TX_64X16
+};
+
+// Returns the LoopFilterMask entry in cm->lf.lfm that covers the mode-info
+// position (mi_row, mi_col). Both coordinates are reduced by
+// MIN_MIB_SIZE_LOG2 and combined using cm->lf.lfm_stride.
+// cm->lf.lfm must already be allocated (asserted).
+LoopFilterMask *get_loop_filter_mask(const AV1_COMMON *const cm, int mi_row,
+                                     int mi_col) {
+  assert(cm->lf.lfm != NULL);
+  const int lfm_row = mi_row >> MIN_MIB_SIZE_LOG2;  // 64x64
+  const int lfm_col = mi_col >> MIN_MIB_SIZE_LOG2;
+  return cm->lf.lfm + (lfm_row * cm->lf.lfm_stride + lfm_col);
+}
+
+// Function-pointer types for loop-filter kernels.
+// NOTE(review): |s| is presumably the sample pointer at the edge and |p| its
+// row pitch -- confirm against the aom_dsp loop-filter declarations.
+
+// Single-edge kernel operating on 8-bit samples.
+typedef void (*LpfFunc)(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh);
+
+// 8-bit kernel taking two independent sets of blimit/limit/thresh values.
+typedef void (*LpfDualFunc)(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1);
+
+// Single-edge kernel operating on 16-bit samples; |bd| is the bit depth.
+typedef void (*HbdLpfFunc)(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh, int bd);
+
+// 16-bit kernel taking two independent sets of blimit/limit/thresh values.
+typedef void (*HbdLpfDualFunc)(uint16_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd);
+#endif // LOOP_FILTER_BITMASK
+
+// Fills lfi->lfthr[lvl].lim and .mblim for every filter level in
+// [0, MAX_LOOP_FILTER], based on |sharpness_lvl|. Each threshold value is
+// replicated across SIMD_WIDTH bytes.
+static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
+  // The level is shifted right by 0, 1 or 2 depending on the sharpness.
+  const int shift = (sharpness_lvl > 0) + (sharpness_lvl > 4);
+
+  for (int lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++) {
+    int inside_limit = lvl >> shift;
+
+    // A non-zero sharpness also caps the limit at (9 - sharpness_lvl).
+    if (sharpness_lvl > 0 && inside_limit > (9 - sharpness_lvl)) {
+      inside_limit = 9 - sharpness_lvl;
+    }
+    // The limit is never allowed to drop below 1.
+    if (inside_limit < 1) {
+      inside_limit = 1;
+    }
+
+    memset(lfi->lfthr[lvl].lim, inside_limit, SIMD_WIDTH);
+    memset(lfi->lfthr[lvl].mblim, (2 * (lvl + 2) + inside_limit), SIMD_WIDTH);
+  }
+}
+
+// Returns the loop-filter level for block |mbmi| in |plane| for direction
+// index |dir_idx|.
+//
+// When cm->delta_lf_present_flag is clear, the level is read straight from
+// the table in |lfi_n|. Otherwise it is rebuilt here: frame base level plus
+// the block delta, then the optional segment adjustment, then the optional
+// reference/mode deltas, clamping to [0, MAX_LOOP_FILTER] after each stage.
+uint8_t get_filter_level(const AV1_COMMON *cm, const loop_filter_info_n *lfi_n,
+                         const int dir_idx, int plane,
+                         const MB_MODE_INFO *mbmi) {
+  const int segment_id = mbmi->segment_id;
+
+  if (!cm->delta_lf_present_flag) {
+    return lfi_n->lvl[plane][segment_id][dir_idx][mbmi->ref_frame[0]]
+                     [mode_lf_lut[mbmi->mode]];
+  }
+
+  // Block delta: looked up per plane/direction when delta_lf_multi is set,
+  // otherwise the single delta_lf_from_base value.
+  const int delta_lf = cm->delta_lf_multi
+                           ? mbmi->delta_lf[delta_lf_id_lut[plane][dir_idx]]
+                           : mbmi->delta_lf_from_base;
+
+  int base_level;
+  switch (plane) {
+    case 0: base_level = cm->lf.filter_level[dir_idx]; break;
+    case 1: base_level = cm->lf.filter_level_u; break;
+    default: base_level = cm->lf.filter_level_v; break;
+  }
+
+  int level = clamp(delta_lf + base_level, 0, MAX_LOOP_FILTER);
+  assert(plane >= 0 && plane <= 2);
+
+  const int seg_feature = seg_lvl_lf_lut[plane][dir_idx];
+  if (segfeature_active(&cm->seg, segment_id, seg_feature)) {
+    const int data = get_segdata(&cm->seg, segment_id, seg_feature);
+    level = clamp(level + data, 0, MAX_LOOP_FILTER);
+  }
+
+  if (cm->lf.mode_ref_delta_enabled) {
+    // The deltas are scaled by 1 for levels below 32, and the scale doubles
+    // for every further 32 levels.
+    const int scale = 1 << (level >> 5);
+    level += cm->lf.ref_deltas[mbmi->ref_frame[0]] * scale;
+    if (mbmi->ref_frame[0] > INTRA_FRAME) {
+      level += cm->lf.mode_deltas[mode_lf_lut[mbmi->mode]] * scale;
+    }
+    level = clamp(level, 0, MAX_LOOP_FILTER);
+  }
+  return level;
+}
+
+// Loop filter initialisation: sets combine_vert_horz_lf, fills the
+// sharpness-dependent limits and the per-level hev thresholds in cm->lf_info.
+void av1_loop_filter_init(AV1_COMMON *cm) {
+  assert(MB_MODE_COUNT == NELEMENTS(mode_lf_lut));
+  struct loopfilter *const filt = &cm->lf;
+  loop_filter_info_n *const info = &cm->lf_info;
+
+  filt->combine_vert_horz_lf = 1;
+
+  // Limits for the current sharpness level.
+  update_sharpness(info, filt->sharpness_level);
+
+  // The hev threshold vector of each level holds level / 16.
+  for (int level = 0; level <= MAX_LOOP_FILTER; ++level) {
+    memset(info->lfthr[level].hev_thr, level >> 4, SIMD_WIDTH);
+  }
+}
+ /* Fills cm->lf_info.lvl[plane][seg_id][dir][ref][mode] for every plane in
+  * [plane_start, plane_end), starting from the frame-level filter levels and
+  * applying segment feature data and, when enabled, the ref/mode deltas.
+  * The sharpness-dependent limits are refreshed first. */
+// Update the loop filter for the current frame.
+// This should be called before loop_filter_rows(),
+// av1_loop_filter_frame() calls this function directly.
+void av1_loop_filter_frame_init(AV1_COMMON *cm, int plane_start,
+ int plane_end) {
+ int filt_lvl[MAX_MB_PLANE], filt_lvl_r[MAX_MB_PLANE]; // base level per plane for dir 0 / dir 1
+ int plane;
+ int seg_id;
+ // scale (computed per segment and direction below) is the multiplier for lf_deltas
+ // the multiplier is 1 for when filter_lvl is between 0 and 31;
+ // 2 when filter_lvl is between 32 and 63
+ loop_filter_info_n *const lfi = &cm->lf_info;
+ struct loopfilter *const lf = &cm->lf;
+ const struct segmentation *const seg = &cm->seg;
+
+ // update sharpness limits
+ update_sharpness(lfi, lf->sharpness_level);
+
+ filt_lvl[0] = cm->lf.filter_level[0];
+ filt_lvl[1] = cm->lf.filter_level_u;
+ filt_lvl[2] = cm->lf.filter_level_v;
+
+ filt_lvl_r[0] = cm->lf.filter_level[1]; // only luma has a distinct level for the second direction
+ filt_lvl_r[1] = cm->lf.filter_level_u;
+ filt_lvl_r[2] = cm->lf.filter_level_v;
+
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !filt_lvl[0] && !filt_lvl_r[0])
+ break; // both luma levels are zero: leave the loop, later planes are not filled either
+ else if (plane == 1 && !filt_lvl[1])
+ continue; // zero level: skip only this plane
+ else if (plane == 2 && !filt_lvl[2])
+ continue;
+
+ for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+ for (int dir = 0; dir < 2; ++dir) {
+ int lvl_seg = (dir == 0) ? filt_lvl[plane] : filt_lvl_r[plane];
+ assert(plane >= 0 && plane <= 2);
+ const int seg_lf_feature_id = seg_lvl_lf_lut[plane][dir];
+ if (segfeature_active(seg, seg_id, seg_lf_feature_id)) {
+ const int data = get_segdata(&cm->seg, seg_id, seg_lf_feature_id);
+ lvl_seg = clamp(lvl_seg + data, 0, MAX_LOOP_FILTER);
+ }
+
+ if (!lf->mode_ref_delta_enabled) {
+ // we could get rid of this if we assume that deltas are set to
+ // zero when not in use; encoder always uses deltas
+ memset(lfi->lvl[plane][seg_id][dir], lvl_seg,
+ sizeof(lfi->lvl[plane][seg_id][dir])); // every [ref][mode] entry gets the same level
+ } else {
+ int ref, mode;
+ const int scale = 1 << (lvl_seg >> 5);
+ const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+ lfi->lvl[plane][seg_id][dir][INTRA_FRAME][0] =
+ clamp(intra_lvl, 0, MAX_LOOP_FILTER); // only mode index 0 is written for INTRA_FRAME
+
+ for (ref = LAST_FRAME; ref < REF_FRAMES; ++ref) {
+ for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+ const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale +
+ lf->mode_deltas[mode] * scale;
+ lfi->lvl[plane][seg_id][dir][ref][mode] =
+ clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#if LOOP_FILTER_BITMASK
+// Locates the bit that represents one 4x4 unit inside the bitmask of a 64x64
+// block.
+//
+// A 64x64 block contains 256 4x4 units, stored as four uint64_t words
+// (bitmask[4]); each word covers four rows of 16 units.
+//
+// mi_col and mi_row are given in mi (4-pixel) units relative to the 64x64
+// block. On return:
+//   *index        = mi_row / 4, i.e. which of the four words to use;
+//   return value  = ((mi_row % 4) << 4) | mi_col, i.e. the bit position
+//                   inside that word (16 units per row).
+int get_index_shift(int mi_col, int mi_row, int *index) {
+  const int rows_per_word_log2 = 2;
+  const int stride_log2 = 4;
+  const int row_in_word = mi_row & ((1 << rows_per_word_log2) - 1);
+
+  *index = mi_row >> rows_per_word_log2;
+  return (row_in_word << stride_log2) | mi_col;
+}
+
+// Debug-only sanity check: within each of the four mask words, no bit may be
+// set for more than one of the square transform sizes TX_4X4 .. TX_64X64.
+// lfm is indexed by transform size. Compiles to a no-op when NDEBUG is
+// defined.
+static void check_mask(const FilterMask *lfm) {
+#ifndef NDEBUG
+  static const TX_SIZE sizes[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32,
+                                   TX_64X64 };
+  const int num_sizes = (int)(sizeof(sizes) / sizeof(sizes[0]));
+  for (int word = 0; word < 4; ++word) {
+    // Every unordered pair of sizes must be disjoint.
+    for (int a = 0; a < num_sizes; ++a) {
+      for (int b = a + 1; b < num_sizes; ++b) {
+        assert(!(lfm[sizes[a]].bits[word] & lfm[sizes[b]].bits[word]));
+      }
+    }
+  }
+#else
+  (void)lfm;
+#endif
+}
+
+// Runs check_mask() on the left and above masks of the given plane, i.e.
+// asserts that no position is assigned two different loop filters.
+// Plane 0 selects the Y masks, plane 1 the U masks, anything else the V masks.
+static void check_loop_filter_masks(const LoopFilterMask *lfm, int plane) {
+  const FilterMask *left;
+  const FilterMask *above;
+
+  switch (plane) {
+    case 0:
+      left = lfm->left_y;
+      above = lfm->above_y;
+      break;
+    case 1:
+      left = lfm->left_u;
+      above = lfm->above_u;
+      break;
+    default:
+      left = lfm->left_v;
+      above = lfm->above_v;
+      break;
+  }
+
+  check_mask(left);
+  check_mask(above);
+}
+
+// ORs the four words of `mask` into the bitmask that lfm keeps for the given
+// edge direction, plane and square transform size: the left_* masks for
+// VERT_EDGE, the above_* masks otherwise. Planes above 2 trip an assert;
+// any plane outside 0..2 leaves lfm untouched.
+static void update_masks(EDGE_DIR dir, int plane, uint64_t *mask,
+                         TX_SIZE sqr_tx_size, LoopFilterMask *lfm) {
+  const int is_vert = (dir == VERT_EDGE);
+  FilterMask *target = NULL;
+
+  switch (plane) {
+    case 0: target = is_vert ? lfm->left_y : lfm->above_y; break;
+    case 1: target = is_vert ? lfm->left_u : lfm->above_u; break;
+    case 2: target = is_vert ? lfm->left_v : lfm->above_v; break;
+    default: assert(plane <= 2); return;
+  }
+
+  uint64_t *const bits = target[sqr_tx_size].bits;
+  for (int word = 0; word < 4; ++word) {
+    bits[word] |= mask[word];
+  }
+}
+
+// Returns 1 when the edge at (mi_row, mi_col) must not be filtered because it
+// lies outside the frame or on the frame's first column (VERT_EDGE) / first
+// row (HORZ_EDGE); returns 0 otherwise.
+//
+// For subsampled chroma the outside-the-frame test is relaxed (">" instead of
+// ">=") in the subsampled dimension, and the position is rounded down to an
+// even mi index before the first-row/column test, because a chroma sub8x8
+// block uses the bottom/right mi of the co-located 8x8 luma block.
+static int is_frame_boundary(AV1_COMMON *const cm, int plane, int mi_row,
+                             int mi_col, int ssx, int ssy, EDGE_DIR dir) {
+  const int row_px = mi_row << MI_SIZE_LOG2;
+  const int col_px = mi_col << MI_SIZE_LOG2;
+  const int subsampled_chroma = plane && (ssx || ssy);
+
+  if (!subsampled_chroma) {
+    if (row_px >= cm->height || col_px >= cm->width) return 1;
+  } else if (ssx) {
+    // ssx && ssy is format 420, ssx alone is format 422. A plane that is
+    // subsampled only vertically gets no outside-the-frame test here.
+    const int past_bottom = ssy ? (row_px > cm->height) : (row_px >= cm->height);
+    if (past_bottom || col_px > cm->width) return 1;
+  }
+
+  const int is_vert = (dir == VERT_EDGE);
+  int pos = is_vert ? mi_col : mi_row;
+  const int ss = is_vert ? ssx : ssy;
+  // With subsampling, mi index 1 still belongs to the first chroma unit, so
+  // it counts as the frame boundary as well.
+  if (plane != 0 && ss) pos &= 0x0FFFFFFE;
+
+  return pos == 0;
+}
+ /* Records filter levels and edge bits for one transform block of size
+  * tx_size located at (mi_row, mi_col), for both edge directions.
+  * For each direction it
+  *   1. stores the block's filter level in the lfl_* array of the
+  *      LoopFilterMask returned for (mi_row, mi_col), and
+  *   2. for every 4x4 position along the block's left (VERT_EDGE) or top
+  *      (HORZ_EDGE) edge that is not a frame boundary, sets the bit for that
+  *      position in the mask of the smaller of the current and neighbouring
+  *      transform sizes, capped at TX_16X16. */
+static void setup_masks(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+ int ssx, int ssy, TX_SIZE tx_size) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const int x = (mi_col << (MI_SIZE_LOG2 - ssx)); // position in plane samples
+ const int y = (mi_row << (MI_SIZE_LOG2 - ssy));
+ // decide whether current vertical/horizontal edge needs loop filtering
+ for (EDGE_DIR dir = VERT_EDGE; dir <= HORZ_EDGE; ++dir) {
+ // chroma sub8x8 block uses bottom/right mi of co-located 8x8 luma block.
+ mi_row |= ssy;
+ mi_col |= ssx;
+
+ MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ const MB_MODE_INFO *const mbmi = mi[0];
+ const int curr_skip = mbmi->skip && is_inter_block(mbmi);
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+ const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+ const uint8_t level = get_filter_level(cm, &cm->lf_info, dir, plane, mbmi);
+ const int prediction_masks = dir == VERT_EDGE
+ ? block_size_wide[plane_bsize] - 1
+ : block_size_high[plane_bsize] - 1;
+ const int is_coding_block_border =
+ dir == VERT_EDGE ? !(x & prediction_masks) : !(y & prediction_masks); // true when x / y is a multiple of the block width / height
+
+ // TODO(chengchen): step can be optimized.
+ const int row_step = mi_size_high[TX_4X4] << ssy;
+ const int col_step = mi_size_wide[TX_4X4] << ssx;
+ const int mi_height =
+ dir == VERT_EDGE ? tx_size_high_unit[tx_size] << ssy : row_step; // VERT_EDGE walks down the left edge
+ const int mi_width =
+ dir == VERT_EDGE ? col_step : tx_size_wide_unit[tx_size] << ssx; // HORZ_EDGE walks along the top edge
+
+ // assign filter levels
+ for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+ for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+ // do not filter frame boundary
+ // Note: when chroma planes' size are half of luma plane,
+ // chroma plane mi corresponds to even position.
+ // If frame size is not even, we still need to filter this chroma
+ // position. Therefore the boundary condition check needs to be
+ // separated to two cases.
+ if (plane && (ssx || ssy)) {
+ if (ssx && ssy) { // format 420
+ if ((r << MI_SIZE_LOG2) > cm->height ||
+ (c << MI_SIZE_LOG2) > cm->width)
+ continue;
+ } else if (ssx) { // format 422
+ if ((r << MI_SIZE_LOG2) >= cm->height ||
+ (c << MI_SIZE_LOG2) > cm->width)
+ continue;
+ }
+ } else {
+ if ((r << MI_SIZE_LOG2) >= cm->height ||
+ (c << MI_SIZE_LOG2) >= cm->width)
+ continue;
+ }
+
+ const int row = r % MI_SIZE_64X64; // position inside the 64x64 unit
+ const int col = c % MI_SIZE_64X64;
+ if (plane == 0) {
+ if (dir == VERT_EDGE)
+ lfm->lfl_y_ver[row][col] = level;
+ else
+ lfm->lfl_y_hor[row][col] = level;
+ } else if (plane == 1) {
+ lfm->lfl_u[row][col] = level; // chroma keeps one level array for both directions
+ } else {
+ lfm->lfl_v[row][col] = level;
+ }
+ }
+ }
+
+ for (int r = mi_row; r < mi_row + mi_height; r += row_step) {
+ for (int c = mi_col; c < mi_col + mi_width; c += col_step) {
+ // do not filter frame boundary
+ if (is_frame_boundary(cm, plane, r, c, ssx, ssy, dir)) continue;
+
+ uint64_t mask[4] = { 0 };
+ const int prev_row = dir == VERT_EDGE ? r : r - (1 << ssy); // neighbour above for HORZ_EDGE
+ const int prev_col = dir == VERT_EDGE ? c - (1 << ssx) : c; // neighbour to the left for VERT_EDGE
+ MB_MODE_INFO **mi_prev =
+ cm->mi_grid_visible + prev_row * cm->mi_stride + prev_col;
+ const MB_MODE_INFO *const mbmi_prev = mi_prev[0];
+ const int prev_skip = mbmi_prev->skip && is_inter_block(mbmi_prev);
+ const uint8_t level_prev =
+ get_filter_level(cm, &cm->lf_info, dir, plane, mbmi_prev);
+ const int is_edge =
+ (level || level_prev) &&
+ (!curr_skip || !prev_skip || is_coding_block_border); // filter unless both sides are skipped inter blocks inside one coding block
+
+ if (is_edge) {
+ const TX_SIZE prev_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi_prev->sb_type, ssx, ssy)
+ : mbmi_prev->tx_size;
+ TX_SIZE min_tx_size = (dir == VERT_EDGE)
+ ? AOMMIN(txsize_horz_map[tx_size],
+ txsize_horz_map[prev_tx_size])
+ : AOMMIN(txsize_vert_map[tx_size],
+ txsize_vert_map[prev_tx_size]);
+ min_tx_size = AOMMIN(min_tx_size, TX_16X16); // cap at TX_16X16
+ assert(min_tx_size < TX_SIZES);
+ const int row = r % MI_SIZE_64X64;
+ const int col = c % MI_SIZE_64X64;
+ int index = 0;
+ const int shift = get_index_shift(col, row, &index);
+ assert(index < 4 && index >= 0);
+ mask[index] |= ((uint64_t)1 << shift);
+ // set mask on corresponding bit
+ update_masks(dir, plane, mask, min_tx_size, lfm);
+ }
+ }
+ }
+ }
+}
+ /* Sets up the masks for the transform block(s) found at offset
+  * (blk_row, blk_col) inside the coded block at (mi_row, mi_col).
+  * Chroma planes call setup_masks() directly with tx_size. For luma the
+  * actual transform size is looked up; if it differs from tx_size the
+  * function recurses over the sub_tx_size_map[tx_size] sub-blocks.
+  * blk_row / blk_col arrive in plane units and are scaled to mi units by the
+  * subsampling shifts. Positions outside the frame are ignored. */
+static void setup_tx_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int plane, int ssx, int ssy) {
+ blk_row <<= ssy;
+ blk_col <<= ssx;
+ if (((mi_row + blk_row) << MI_SIZE_LOG2) >= cm->height ||
+ ((mi_col + blk_col) << MI_SIZE_LOG2) >= cm->width)
+ return;
+
+ // U/V plane, tx_size is always the largest size
+ if (plane) {
+ assert(tx_size_wide[tx_size] <= 32 && tx_size_high[tx_size] <= 32);
+ setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+ tx_size);
+ return;
+ }
+
+ MB_MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+ const MB_MODE_INFO *const mbmi = mi[0];
+ // For Y plane:
+ // If intra block, tx size is univariant.
+ // If inter block, tx size follows inter_tx_size.
+ TX_SIZE plane_tx_size = tx_size;
+ const int is_inter = is_inter_block(mbmi);
+
+ if (plane == 0) { // always true here: chroma returned above
+ if (is_inter) {
+ if (mbmi->skip) {
+ // TODO(chengchen): change av1_get_transform_size() to be consistent.
+ // plane_tx_size = get_max_rect_tx_size(plane_bsize);
+ plane_tx_size = mbmi->tx_size;
+ } else {
+ plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+ }
+ } else {
+ MB_MODE_INFO **mi_this = cm->mi_grid_visible +
+ (mi_row + blk_row) * cm->mi_stride + mi_col +
+ blk_col;
+ const MB_MODE_INFO *const mbmi_this = mi_this[0];
+ plane_tx_size = mbmi_this->tx_size;
+ }
+ }
+
+ assert(txsize_to_bsize[plane_tx_size] <= plane_bsize);
+
+ if (plane || plane_tx_size == tx_size) {
+ setup_masks(cm, mi_row + blk_row, mi_col + blk_col, plane, ssx, ssy,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+ setup_tx_block_mask(cm, mi_row, mi_col, offsetr, offsetc, plane_bsize,
+ sub_txs, plane, ssx, ssy); // recurse with the next smaller transform size
+ }
+ }
+ }
+}
+ /* Sets up the masks for one coded block at (mi_row, mi_col) in the given
+  * plane. It derives the plane block size and the largest transform size
+  * from the block's mode info, then visits the block in steps of that
+  * transform size, processing at most one 64x64-luma-sized unit at a time,
+  * and calls setup_tx_block_mask() for each step. */
+static void setup_fix_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+ int plane, int ssx, int ssy) {
+ MB_MODE_INFO **mi =
+ cm->mi_grid_visible + (mi_row | ssy) * cm->mi_stride + (mi_col | ssx); // subsampled planes read the odd (bottom/right) mi
+ const MB_MODE_INFO *const mbmi = mi[0];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ssx, ssy);
+ const BLOCK_SIZE plane_bsize = ss_size_lookup[bsizec][ssx][ssy];
+
+ const int block_width = mi_size_wide[plane_bsize];
+ const int block_height = mi_size_high[plane_bsize];
+
+ TX_SIZE max_txsize = max_txsize_rect_lookup[plane_bsize];
+ // The decoder is designed so that it can process 64x64 luma pixels at a
+ // time. If this is a chroma plane with subsampling and bsize corresponds to
+ // a subsampled BLOCK_128X128 then the lookup above will give TX_64X64. That
+ // mustn't be used for the subsampled plane (because it would be bigger than
+ // a 64x64 luma block) so we round down to TX_32X32.
+ if (plane && txsize_sqr_up_map[max_txsize] == TX_64X64) {
+ if (max_txsize == TX_16X64)
+ max_txsize = TX_16X32;
+ else if (max_txsize == TX_64X16)
+ max_txsize = TX_32X16;
+ else
+ max_txsize = TX_32X32;
+ }
+
+ const BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize];
+ const int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+ const int bh = block_size_high[txb_size] >> tx_size_wide_log2[0]; // NOTE(review): uses tx_size_wide_log2 where mu_blocks_high below uses tx_size_high_log2; presumably equal at index 0 - confirm
+ const BLOCK_SIZE max_unit_bsize = ss_size_lookup[BLOCK_64X64][ssx][ssy];
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(block_width, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(block_height, mu_blocks_high);
+
+ // Y: Largest tx_size is 64x64, while superblock size can be 128x128.
+ // Here we ensure that setup_tx_block_mask process at most a 64x64 block.
+ // U/V: largest tx size is 32x32.
+ for (int idy = 0; idy < block_height; idy += mu_blocks_high) {
+ for (int idx = 0; idx < block_width; idx += mu_blocks_wide) {
+ const int unit_height = AOMMIN(mu_blocks_high + idy, block_height); // end row of this unit, clipped to the block
+ const int unit_width = AOMMIN(mu_blocks_wide + idx, block_width);
+ for (int blk_row = idy; blk_row < unit_height; blk_row += bh) {
+ for (int blk_col = idx; blk_col < unit_width; blk_col += bw) {
+ setup_tx_block_mask(cm, mi_row, mi_col, blk_row, blk_col, plane_bsize,
+ max_txsize, plane, ssx, ssy);
+ }
+ }
+ }
+ }
+}
+
+// Walks the partition tree rooted at the bsize block at (mi_row, mi_col) and
+// calls setup_fix_block_mask() once for every coded block that lies inside
+// the frame. PARTITION_SPLIT recurses with the partition sub-size; all other
+// partition types enumerate their blocks directly.
+//
+//   cm              frame state (dimensions and partition information)
+//   mi_row, mi_col  top-left corner of the block in mi units
+//   bsize           size of the block being partitioned
+//   plane           plane index (0 for luma)
+//   ssx, ssy        subsampling shifts of the plane
+static void setup_block_mask(AV1_COMMON *const cm, int mi_row, int mi_col,
+                             BLOCK_SIZE bsize, int plane, int ssx, int ssy) {
+  if ((mi_row << MI_SIZE_LOG2) >= cm->height ||
+      (mi_col << MI_SIZE_LOG2) >= cm->width)
+    return;
+
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+  const int hbs = mi_size_wide[bsize] / 2;
+  const int quarter_step = mi_size_wide[bsize] / 4;
+  // With a subsampled plane, blocks of BLOCK_8X8 or smaller are handled by
+  // the first call only; their second halves are not visited separately.
+  const int allow_sub8x8 = (ssx || ssy) ? bsize > BLOCK_8X8 : 1;
+  // The second half exists only if it starts inside the frame.
+  const int has_next_row =
+      (((mi_row + hbs) << MI_SIZE_LOG2) < cm->height) & allow_sub8x8;
+  const int has_next_col =
+      (((mi_col + hbs) << MI_SIZE_LOG2) < cm->width) & allow_sub8x8;
+  int i;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_SPLIT:
+      setup_block_mask(cm, mi_row, mi_col, subsize, plane, ssx, ssy);
+      if (has_next_col)
+        setup_block_mask(cm, mi_row, mi_col + hbs, subsize, plane, ssx, ssy);
+      if (has_next_row)
+        setup_block_mask(cm, mi_row + hbs, mi_col, subsize, plane, ssx, ssy);
+      if (has_next_col & has_next_row)
+        setup_block_mask(cm, mi_row + hbs, mi_col + hbs, subsize, plane, ssx,
+                         ssy);
+      break;
+    case PARTITION_HORZ_A:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ_B:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      if (has_next_col & has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT_A:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_VERT_B:
+      setup_fix_block_mask(cm, mi_row, mi_col, plane, ssx, ssy);
+      if (has_next_col)
+        setup_fix_block_mask(cm, mi_row, mi_col + hbs, plane, ssx, ssy);
+      // The bottom-right block is offset in both directions, so it needs
+      // both halves to exist (same guard as PARTITION_HORZ_B and
+      // PARTITION_SPLIT use for this position).
+      if (has_next_col & has_next_row)
+        setup_fix_block_mask(cm, mi_row + hbs, mi_col + hbs, plane, ssx, ssy);
+      break;
+    case PARTITION_HORZ_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_row = mi_row + i * quarter_step;
+        if (i > 0 && (this_mi_row << MI_SIZE_LOG2) >= cm->height) break;
+        // chroma plane filter the odd location
+        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+        setup_fix_block_mask(cm, this_mi_row, mi_col, plane, ssx, ssy);
+      }
+      break;
+    case PARTITION_VERT_4:
+      for (i = 0; i < 4; ++i) {
+        int this_mi_col = mi_col + i * quarter_step;
+        if (i > 0 && this_mi_col >= cm->mi_cols) break;
+        // chroma plane filter the odd location
+        if (plane && bsize == BLOCK_16X16 && (i & 0x01)) continue;
+
+        setup_fix_block_mask(cm, mi_row, this_mi_col, plane, ssx, ssy);
+      }
+      break;
+    default: assert(0);
+  }
+}
+ /* Builds the loop filter bitmasks of the superblock at (mi_row, mi_col)
+  * for one plane, in three passes over its 64x64 units:
+  *   1. zero the plane's masks and level arrays of every unit that lies
+  *      before (row_end, col_end) and inside the frame;
+  *   2. fill them by walking the partition tree with setup_block_mask();
+  *   3. fold the TX_32X32 bits (and, for luma, the TX_64X64 bits) into the
+  *      TX_16X16 masks and clear the larger ones. */
+// TODO(chengchen): if lossless, do not need to setup mask. But when
+// segments enabled, each segment has different lossless settings.
+void av1_setup_bitmask(AV1_COMMON *const cm, int mi_row, int mi_col, int plane,
+ int subsampling_x, int subsampling_y, int row_end,
+ int col_end) {
+ const int num_64x64 = cm->seq_params.mib_size >> MIN_MIB_SIZE_LOG2; // 64x64 units per superblock side
+ for (int y = 0; y < num_64x64; ++y) {
+ for (int x = 0; x < num_64x64; ++x) {
+ const int row = mi_row + y * MI_SIZE_64X64;
+ const int col = mi_col + x * MI_SIZE_64X64;
+ if (row >= row_end || col >= col_end) continue;
+ if ((row << MI_SIZE_LOG2) >= cm->height ||
+ (col << MI_SIZE_LOG2) >= cm->width)
+ continue;
+
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+ if (lfm == NULL) return;
+
+ // init mask to zero
+ if (plane == 0) {
+ av1_zero(lfm->left_y);
+ av1_zero(lfm->above_y);
+ av1_zero(lfm->lfl_y_ver);
+ av1_zero(lfm->lfl_y_hor);
+ } else if (plane == 1) {
+ av1_zero(lfm->left_u);
+ av1_zero(lfm->above_u);
+ av1_zero(lfm->lfl_u);
+ } else {
+ av1_zero(lfm->left_v);
+ av1_zero(lfm->above_v);
+ av1_zero(lfm->lfl_v);
+ }
+ }
+ }
+
+ // set up bitmask for each superblock
+ setup_block_mask(cm, mi_row, mi_col, cm->seq_params.sb_size, plane,
+ subsampling_x, subsampling_y);
+
+ for (int y = 0; y < num_64x64; ++y) {
+ for (int x = 0; x < num_64x64; ++x) {
+ const int row = mi_row + y * MI_SIZE_64X64;
+ const int col = mi_col + x * MI_SIZE_64X64;
+ if (row >= row_end || col >= col_end) continue;
+ if ((row << MI_SIZE_LOG2) >= cm->height ||
+ (col << MI_SIZE_LOG2) >= cm->width)
+ continue;
+
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, row, col);
+ if (lfm == NULL) return;
+
+ // check if the mask is valid
+ check_loop_filter_masks(lfm, plane);
+
+ {
+ // Let 16x16 hold 32x32 (Y/U/V) and 64x64(Y only).
+ // Even tx size is greater, we only apply max length filter, which
+ // is 16.
+ if (plane == 0) {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_32X32].bits[j];
+ lfm->left_y[TX_16X16].bits[j] |= lfm->left_y[TX_64X64].bits[j];
+ lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_32X32].bits[j];
+ lfm->above_y[TX_16X16].bits[j] |= lfm->above_y[TX_64X64].bits[j];
+
+ // set 32x32 and 64x64 to 0
+ lfm->left_y[TX_32X32].bits[j] = 0;
+ lfm->left_y[TX_64X64].bits[j] = 0;
+ lfm->above_y[TX_32X32].bits[j] = 0;
+ lfm->above_y[TX_64X64].bits[j] = 0;
+ }
+ } else if (plane == 1) {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_u[TX_16X16].bits[j] |= lfm->left_u[TX_32X32].bits[j];
+ lfm->above_u[TX_16X16].bits[j] |= lfm->above_u[TX_32X32].bits[j];
+
+ // set 32x32 to 0
+ lfm->left_u[TX_32X32].bits[j] = 0;
+ lfm->above_u[TX_32X32].bits[j] = 0;
+ }
+ } else {
+ for (int j = 0; j < 4; ++j) {
+ lfm->left_v[TX_16X16].bits[j] |= lfm->left_v[TX_32X32].bits[j];
+ lfm->above_v[TX_16X16].bits[j] |= lfm->above_v[TX_32X32].bits[j];
+
+ // set 32x32 to 0
+ lfm->left_v[TX_32X32].bits[j] = 0;
+ lfm->above_v[TX_32X32].bits[j] = 0;
+ }
+ }
+ }
+
+ // check if the mask is valid
+ check_loop_filter_masks(lfm, plane);
+ }
+ }
+}
+ /* Filters vertical edges for two rows of 4x4 units at once (8-bit samples).
+  * mask_*_0 and lfl describe the first row, mask_*_1 and lfl2 the second
+  * row, whose samples start 4 * pitch after s. Bit 0 of every mask refers
+  * to the position at s; each iteration advances s by 4 samples and shifts
+  * all masks by step = 1 << subsampling_factor. When both rows need the same
+  * filter size the dual variant is used, otherwise the single-row filter is
+  * applied to whichever row has its bit set. Chroma planes (plane != 0) use
+  * the 6-tap filters in place of the 14- and 8-tap ones. */
+static void filter_selectively_vert_row2(
+ int subsampling_factor, uint8_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; // thresholds for the first row's level
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; // thresholds for the second row's level
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // chroma plane filters less pixels introduced in deblock_13tap
+ // experiment
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_16x16_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ // chroma plane filters less pixels introduced in deblock_13tap
+ // experiment
+ LpfFunc lpf_vertical = plane ? aom_lpf_vertical_6 : aom_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else {
+ aom_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ } else if (mask_8x8_0 & 1) {
+ lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ } else if (mask_4x4_0 & 1) {
+ aom_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr);
+ } else {
+ aom_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr);
+ }
+ }
+ }
+
+ s += 4; // next 4-sample column
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
+ /* 16-bit-sample counterpart of filter_selectively_vert_row2(): filters
+  * vertical edges for two rows of 4x4 units at once. mask_*_0 and lfl
+  * describe the first row, mask_*_1 and lfl2 the second row (4 * pitch
+  * samples after s). Each iteration advances s by 4 samples and shifts all
+  * masks by step = 1 << subsampling_factor. bd is forwarded unchanged to
+  * every aom_highbd_lpf_* call. */
+static void highbd_filter_selectively_vert_row2(
+ int subsampling_factor, uint16_t *s, int pitch, int plane,
+ uint64_t mask_16x16_0, uint64_t mask_8x8_0, uint64_t mask_4x4_0,
+ uint64_t mask_16x16_1, uint64_t mask_8x8_1, uint64_t mask_4x4_1,
+ const loop_filter_info_n *lfi_n, uint8_t *lfl, uint8_t *lfl2, int bd) {
+ uint64_t mask;
+ const int step = 1 << subsampling_factor;
+
+ for (mask = mask_16x16_0 | mask_8x8_0 | mask_4x4_0 | mask_16x16_1 |
+ mask_8x8_1 | mask_4x4_1;
+ mask; mask >>= step) {
+ const loop_filter_thresh *lfi0 = lfi_n->lfthr + *lfl; // thresholds for the first row's level
+ const loop_filter_thresh *lfi1 = lfi_n->lfthr + *lfl2; // thresholds for the second row's level
+
+ if (mask & 1) {
+ if ((mask_16x16_0 | mask_16x16_1) & 1) {
+ // chroma plane filters less pixels introduced in deblock_13tap
+ // experiment
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_14;
+
+ if ((mask_16x16_0 & mask_16x16_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_14_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_16x16_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_8x8_0 | mask_8x8_1) & 1) {
+ HbdLpfFunc highbd_lpf_vertical =
+ plane ? aom_highbd_lpf_vertical_6 : aom_highbd_lpf_vertical_8;
+
+ if ((mask_8x8_0 & mask_8x8_1) & 1) {
+ if (plane) {
+ aom_highbd_lpf_vertical_6_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_8_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ } else if (mask_8x8_0 & 1) {
+ highbd_lpf_vertical(s, pitch, lfi0->mblim, lfi0->lim, lfi0->hev_thr,
+ bd);
+ } else {
+ highbd_lpf_vertical(s + 4 * pitch, pitch, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ }
+ }
+
+ if ((mask_4x4_0 | mask_4x4_1) & 1) {
+ if ((mask_4x4_0 & mask_4x4_1) & 1) {
+ aom_highbd_lpf_vertical_4_dual(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, lfi1->mblim, lfi1->lim,
+ lfi1->hev_thr, bd);
+ } else if (mask_4x4_0 & 1) {
+ aom_highbd_lpf_vertical_4(s, pitch, lfi0->mblim, lfi0->lim,
+ lfi0->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_vertical_4(s + 4 * pitch, pitch, lfi1->mblim,
+ lfi1->lim, lfi1->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4; // next 4-sample column
+ lfl += step;
+ lfl2 += step;
+ mask_16x16_0 >>= step;
+ mask_8x8_0 >>= step;
+ mask_4x4_0 >>= step;
+ mask_16x16_1 >>= step;
+ mask_8x8_1 >>= step;
+ mask_4x4_1 >>= step;
+ }
+}
+ /* Filters horizontal edges along one row of 4x4 units (8-bit samples).
+  * Bit 0 of each mask refers to the position at s; consecutive positions
+  * are step = 1 << subsampling bits apart. Only one filter size is applied
+  * per position, tried in the order 16x16, 8x8, 4x4. When the current and
+  * the next position both have the same size set (two_block_mask), the dual
+  * filter handles both and the loop advances by two positions (count = 2).
+  * Chroma planes (plane != 0) use the 6-tap filters in place of the 14- and
+  * 8-tap ones. */
+static void filter_selectively_horiz(uint8_t *s, int pitch, int plane,
+ int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4,
+ const loop_filter_info_n *lfi_n,
+ const uint8_t *lfl) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3; // bits 0 and 2 when subsampled, bits 0 and 1 otherwise
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); // NOTE(review): read on every iteration, even when no next position is filtered - confirm lfl has room for it
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ // chroma plane filters less pixels introduced in deblock_13tap
+ // experiment
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_8x8 & 1) {
+ // chroma plane filters less pixels introduced in deblock_13tap
+ // experiment
+ LpfFunc lpf_horizontal =
+ plane ? aom_lpf_horizontal_6 : aom_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ } else {
+ aom_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ }
+ count = 2;
+ } else {
+ lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr);
+ count = 2;
+ } else {
+ aom_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr);
+ }
+ }
+ }
+
+ s += 4 * count; // skip the positions just handled
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ }
+}
+ /* 16-bit-sample counterpart of filter_selectively_horiz(): filters
+  * horizontal edges along one row of 4x4 units. Consecutive positions are
+  * step = 1 << subsampling mask bits apart; one filter size is applied per
+  * position (16x16, then 8x8, then 4x4), and a dual filter covers two
+  * positions at once when both carry the same size. bd is forwarded
+  * unchanged to every aom_highbd_lpf_* call. */
+static void highbd_filter_selectively_horiz(
+ uint16_t *s, int pitch, int plane, int subsampling, uint64_t mask_16x16,
+ uint64_t mask_8x8, uint64_t mask_4x4, const loop_filter_info_n *lfi_n,
+ uint8_t *lfl, int bd) {
+ uint64_t mask;
+ int count;
+ const int step = 1 << subsampling;
+ const unsigned int two_block_mask = subsampling ? 5 : 3; // bits 0 and 2 when subsampled, bits 0 and 1 otherwise
+
+ for (mask = mask_16x16 | mask_8x8 | mask_4x4; mask; mask >>= step * count) {
+ const loop_filter_thresh *lfi = lfi_n->lfthr + *lfl;
+ // Next block's thresholds.
+ const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + step); // NOTE(review): read on every iteration, even when no next position is filtered - confirm lfl has room for it
+
+ count = 1;
+ if (mask & 1) {
+ if (mask_16x16 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_14;
+
+ if ((mask_16x16 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_14_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_8x8 & 1) {
+ HbdLpfFunc highbd_lpf_horizontal =
+ plane ? aom_highbd_lpf_horizontal_6 : aom_highbd_lpf_horizontal_8;
+
+ if ((mask_8x8 & two_block_mask) == two_block_mask) {
+ if (plane) {
+ aom_highbd_lpf_horizontal_6_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ } else {
+ aom_highbd_lpf_horizontal_8_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim,
+ lfin->lim, lfin->hev_thr, bd);
+ }
+ count = 2;
+ } else {
+ highbd_lpf_horizontal(s, pitch, lfi->mblim, lfi->lim, lfi->hev_thr,
+ bd);
+ }
+ } else if (mask_4x4 & 1) {
+ if ((mask_4x4 & two_block_mask) == two_block_mask) {
+ aom_highbd_lpf_horizontal_4_dual(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, lfin->mblim, lfin->lim,
+ lfin->hev_thr, bd);
+ count = 2;
+ } else {
+ aom_highbd_lpf_horizontal_4(s, pitch, lfi->mblim, lfi->lim,
+ lfi->hev_thr, bd);
+ }
+ }
+ }
+
+ s += 4 * count; // skip the positions just handled
+ lfl += step * count;
+ mask_16x16 >>= step * count;
+ mask_8x8 >>= step * count;
+ mask_4x4 >>= step * count;
+ }
+}
+
+ // Builds the left-edge (vertical edge) filter bitmasks of one plane.
+ // Walks the plane one row of units at a time and, along each row, steps from
+ // transform block to transform block. For every block it reads the skip flag,
+ // the coding-block-border flag, the filter level and the transform size from
+ // the LoopFilterMask covering that position, and sets a bit in
+ // lfm->left_{y,u,v}[min_tx_size] when the edge on the block's left side has to
+ // be filtered.
+ //   cm         common state; used to look up the LoopFilterMask
+ //   plane_ptr  supplies the subsampling and the plane dimensions (dst)
+ //   plane      0, 1 or 2; any other value asserts and returns
+ void av1_build_bitmask_vert_info(
+     AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+     int plane) {
+   const int subsampling_x = plane_ptr->subsampling_x;
+   const int subsampling_y = plane_ptr->subsampling_y;
+   const int row_step = (MI_SIZE >> MI_SIZE_LOG2);
+   const int is_uv = plane > 0;
+   // State of the previously visited block; it is carried across iterations
+   // and is never reset at the start of a row.
+   TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+   uint8_t level, prev_level = 1;
+   int skip, prev_skip = 0;
+   int is_coding_block_border;
+
+   for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height; r += row_step) {
+     const int mi_row = r << subsampling_y;
+     const int row = mi_row % MI_SIZE_64X64;
+     int index = 0;
+     const int shift = get_index_shift(0, row, &index);
+
+     for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width;
+          c += (tx_size_wide_unit[TX_64X64] >> subsampling_x)) {
+       const int mi_col = c << subsampling_x;
+       LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+       for (int col_in_unit = 0;
+            col_in_unit < (tx_size_wide_unit[TX_64X64] >> subsampling_x);) {
+         const int x = (c + col_in_unit) << MI_SIZE_LOG2;
+         if (x >= plane_ptr->dst.width) break;
+         const int col = col_in_unit << subsampling_x;
+         const uint64_t mask = ((uint64_t)1 << (shift | col));
+         // Reduce the 64-bit tests to 0/1 before storing them in an int:
+         // converting the raw masked value is implementation-defined and
+         // typically discards a flag that lives in bits 32..63.
+         skip = (lfm->skip.bits[index] & mask) != 0;
+         is_coding_block_border = (lfm->is_vert_border.bits[index] & mask) != 0;
+         switch (plane) {
+           case 0: level = lfm->lfl_y_ver[row][col]; break;
+           case 1: level = lfm->lfl_u[row][col]; break;
+           case 2: level = lfm->lfl_v[row][col]; break;
+           default: assert(plane >= 0 && plane <= 2); return;
+         }
+         // Smallest transform size flagged at this position; tx_size keeps its
+         // previous value when no size is flagged.
+         for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+           if (is_uv && ts == TX_64X64) continue;
+           if (lfm->tx_size_ver[is_uv][ts].bits[index] & mask) {
+             tx_size = ts;
+             break;
+           }
+         }
+         // Filter unless this is the first column of the plane, both levels
+         // are zero, or both blocks are skipped inside one coding block.
+         if ((c + col_in_unit > 0) && (level || prev_level) &&
+             (!prev_skip || !skip || is_coding_block_border)) {
+           const TX_SIZE min_tx_size =
+               AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+           const int tmp_row = (mi_row | subsampling_y) % MI_SIZE_64X64;
+           const int tmp_col = (col | subsampling_x) % MI_SIZE_64X64;
+           const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+           const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+           switch (plane) {
+             case 0: lfm->left_y[min_tx_size].bits[index] |= mask_1; break;
+             case 1: lfm->left_u[min_tx_size].bits[index] |= mask_1; break;
+             case 2: lfm->left_v[min_tx_size].bits[index] |= mask_1; break;
+             default: assert(plane >= 0 && plane <= 2); return;
+           }
+         }
+
+         // update prev info
+         prev_level = level;
+         prev_skip = skip;
+         prev_tx_size = tx_size;
+         // advance by the width of the transform block just visited
+         col_in_unit += tx_size_wide_unit[tx_size];
+       }
+     }
+   }
+ }
+
+ // Builds the above-edge (horizontal edge) filter bitmasks of one plane.
+ // Walks the plane one column of units at a time and, down each column, steps
+ // from transform block to transform block. For every block it reads the skip
+ // flag, the coding-block-border flag, the filter level and the transform size
+ // from the LoopFilterMask covering that position, and sets a bit in
+ // lfm->above_{y,u,v}[min_tx_size] when the edge above the block has to be
+ // filtered.
+ //   cm         common state; used to look up the LoopFilterMask
+ //   plane_ptr  supplies the subsampling and the plane dimensions (dst)
+ //   plane      0, 1 or 2; any other value asserts and returns
+ void av1_build_bitmask_horz_info(
+     AV1_COMMON *const cm, const struct macroblockd_plane *const plane_ptr,
+     int plane) {
+   const int subsampling_x = plane_ptr->subsampling_x;
+   const int subsampling_y = plane_ptr->subsampling_y;
+   const int col_step = (MI_SIZE >> MI_SIZE_LOG2);
+   const int is_uv = plane > 0;
+   // State of the previously visited block; it is carried across iterations
+   // and is never reset at the start of a column.
+   TX_SIZE tx_size = TX_16X16, prev_tx_size = TX_16X16;
+   uint8_t level, prev_level = 1;
+   int skip, prev_skip = 0;
+   int is_coding_block_border;
+
+   for (int c = 0; (c << MI_SIZE_LOG2) < plane_ptr->dst.width; c += col_step) {
+     const int mi_col = c << subsampling_x;
+     const int col = mi_col % MI_SIZE_64X64;
+
+     for (int r = 0; (r << MI_SIZE_LOG2) < plane_ptr->dst.height;
+          r += (tx_size_high_unit[TX_64X64] >> subsampling_y)) {
+       const int mi_row = r << subsampling_y;
+       LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+
+       for (int r_in_unit = 0;
+            r_in_unit < (tx_size_high_unit[TX_64X64] >> subsampling_y);) {
+         const int y = (r + r_in_unit) << MI_SIZE_LOG2;
+         if (y >= plane_ptr->dst.height) break;
+         const int row = r_in_unit << subsampling_y;
+         int index = 0;
+         const int shift = get_index_shift(col, row, &index);
+         const uint64_t mask = ((uint64_t)1 << shift);
+         // Reduce the 64-bit tests to 0/1 before storing them in an int:
+         // converting the raw masked value is implementation-defined and
+         // typically discards a flag that lives in bits 32..63.
+         skip = (lfm->skip.bits[index] & mask) != 0;
+         is_coding_block_border = (lfm->is_horz_border.bits[index] & mask) != 0;
+         switch (plane) {
+           case 0: level = lfm->lfl_y_hor[row][col]; break;
+           case 1: level = lfm->lfl_u[row][col]; break;
+           case 2: level = lfm->lfl_v[row][col]; break;
+           default: assert(plane >= 0 && plane <= 2); return;
+         }
+         // Smallest transform size flagged at this position; tx_size keeps its
+         // previous value when no size is flagged.
+         for (TX_SIZE ts = TX_4X4; ts <= TX_64X64; ++ts) {
+           if (is_uv && ts == TX_64X64) continue;
+           if (lfm->tx_size_hor[is_uv][ts].bits[index] & mask) {
+             tx_size = ts;
+             break;
+           }
+         }
+         // Filter unless this is the first row of the plane, both levels are
+         // zero, or both blocks are skipped inside one coding block.
+         if ((r + r_in_unit > 0) && (level || prev_level) &&
+             (!prev_skip || !skip || is_coding_block_border)) {
+           const TX_SIZE min_tx_size =
+               AOMMIN(TX_16X16, AOMMIN(tx_size, prev_tx_size));
+           const int tmp_row = (row | subsampling_y) % MI_SIZE_64X64;
+           const int tmp_col = (mi_col | subsampling_x) % MI_SIZE_64X64;
+           const int shift_1 = get_index_shift(tmp_col, tmp_row, &index);
+           const uint64_t mask_1 = ((uint64_t)1 << shift_1);
+
+           switch (plane) {
+             case 0: lfm->above_y[min_tx_size].bits[index] |= mask_1; break;
+             case 1: lfm->above_u[min_tx_size].bits[index] |= mask_1; break;
+             case 2: lfm->above_v[min_tx_size].bits[index] |= mask_1; break;
+             default: assert(plane >= 0 && plane <= 2); return;
+           }
+         }
+
+         // update prev info
+         prev_level = level;
+         prev_skip = skip;
+         prev_tx_size = tx_size;
+         // advance by the height of the transform block just visited
+         r_in_unit += tx_size_high_unit[tx_size];
+       }
+     }
+   }
+ }
+
+ // Applies the vertical-edge filters of one LoopFilterMask region, taking the
+ // edge positions from the left_{y,u,v} bitmasks and handling two unit rows per
+ // iteration. dst->buf is advanced while iterating and restored before a
+ // normal return.
+ //   cm              common state (mask lookup, frame height, filter info)
+ //   plane_ptr       plane whose dst buffer is filtered
+ //   pl              0, 1 or 2; any other value asserts and returns
+ //   mi_row, mi_col  position used to look up the LoopFilterMask
+ void av1_filter_block_plane_bitmask_vert(
+     AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+     int mi_row, int mi_col) {
+   struct buf_2d *const dst = &plane_ptr->dst;
+   uint8_t *const saved_buf = dst->buf;
+   const int ssx = plane_ptr->subsampling_x;
+   const int ssy = plane_ptr->subsampling_y;
+   // Each row contributes 16 mask bits.
+   const uint64_t low16 = 0xffff;
+   const int pair_stride = (dst->stride << MI_SIZE_LOG2) << 1;
+   LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+   assert(lfm);
+
+   int r = 0;
+   while (((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64) {
+     const int row = r | ssy;
+     const int row_next = row + (1 << ssy);
+     const int col = ssx;
+     int index = 0;
+     int index_next = 0;
+     const int shift = get_index_shift(col, row, &index);
+     const int shift_next = get_index_shift(col, row_next, &index_next);
+     const FilterMask *edges;
+     uint8_t *lfl;
+     uint8_t *lfl2;
+
+     if (pl == 0) {
+       edges = lfm->left_y;
+       lfl = &lfm->lfl_y_ver[row][col];
+       lfl2 = &lfm->lfl_y_ver[row_next][col];
+     } else if (pl == 1) {
+       edges = lfm->left_u;
+       lfl = &lfm->lfl_u[row][col];
+       lfl2 = &lfm->lfl_u[row_next][col];
+     } else if (pl == 2) {
+       edges = lfm->left_v;
+       lfl = &lfm->lfl_v[row][col];
+       lfl2 = &lfm->lfl_v[row_next][col];
+     } else {
+       assert(pl >= 0 && pl <= 2);
+       return;
+     }
+
+     // Both rows read the same 64-bit word; only the shift differs.
+     const uint64_t word_16x16 = edges[TX_16X16].bits[index];
+     const uint64_t word_8x8 = edges[TX_8X8].bits[index];
+     const uint64_t word_4x4 = edges[TX_4X4].bits[index];
+     const uint64_t m16_a = (word_16x16 >> shift) & low16;
+     const uint64_t m8_a = (word_8x8 >> shift) & low16;
+     const uint64_t m4_a = (word_4x4 >> shift) & low16;
+     const uint64_t m16_b = (word_16x16 >> shift_next) & low16;
+     const uint64_t m8_b = (word_8x8 >> shift_next) & low16;
+     const uint64_t m4_b = (word_4x4 >> shift_next) & low16;
+
+     if (cm->seq_params.use_highbitdepth) {
+       highbd_filter_selectively_vert_row2(
+           ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, m16_a, m8_a,
+           m4_a, m16_b, m8_b, m4_b, &cm->lf_info, lfl, lfl2,
+           (int)cm->seq_params.bit_depth);
+     } else {
+       filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, m16_a, m8_a,
+                                    m4_a, m16_b, m8_b, m4_b, &cm->lf_info, lfl,
+                                    lfl2);
+     }
+
+     dst->buf += pair_stride;
+     r += 2 << ssy;
+   }
+   // put the buffer pointer back for the horizontal pass
+   dst->buf = saved_buf;
+ }
+
+ // Applies the horizontal-edge filters of one LoopFilterMask region, taking the
+ // edge positions from the above_{y,u,v} bitmasks, one unit row per iteration.
+ // The row with mi_row + r == 0 is stepped over without filtering. dst->buf is
+ // advanced while iterating and restored before a normal return.
+ //   cm              common state (mask lookup, frame height, filter info)
+ //   plane_ptr       plane whose dst buffer is filtered
+ //   pl              0, 1 or 2; any other value asserts and returns
+ //   mi_row, mi_col  position used to look up the LoopFilterMask
+ void av1_filter_block_plane_bitmask_horz(
+     AV1_COMMON *const cm, struct macroblockd_plane *const plane_ptr, int pl,
+     int mi_row, int mi_col) {
+   struct buf_2d *const dst = &plane_ptr->dst;
+   uint8_t *const saved_buf = dst->buf;
+   const int ssx = plane_ptr->subsampling_x;
+   const int ssy = plane_ptr->subsampling_y;
+   // Each row contributes 16 mask bits.
+   const uint64_t low16 = 0xffff;
+   const int unit_row_stride = dst->stride << MI_SIZE_LOG2;
+   LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+   assert(lfm);
+
+   for (int r = 0;
+        ((mi_row + r) << MI_SIZE_LOG2) < cm->height && r < MI_SIZE_64X64;
+        r += 1 << ssy) {
+     if (mi_row + r != 0) {
+       const int row = r | ssy;
+       const int col = ssx;
+       int index = 0;
+       const int shift = get_index_shift(col, row, &index);
+       const FilterMask *edges;
+       uint8_t *lfl;
+
+       if (pl == 0) {
+         edges = lfm->above_y;
+         lfl = &lfm->lfl_y_hor[row][col];
+       } else if (pl == 1) {
+         edges = lfm->above_u;
+         lfl = &lfm->lfl_u[row][col];
+       } else if (pl == 2) {
+         edges = lfm->above_v;
+         lfl = &lfm->lfl_v[row][col];
+       } else {
+         assert(pl >= 0 && pl <= 2);
+         return;
+       }
+
+       const uint64_t m16 = (edges[TX_16X16].bits[index] >> shift) & low16;
+       const uint64_t m8 = (edges[TX_8X8].bits[index] >> shift) & low16;
+       const uint64_t m4 = (edges[TX_4X4].bits[index] >> shift) & low16;
+
+       if (cm->seq_params.use_highbitdepth) {
+         highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                         dst->stride, pl, ssx, m16, m8, m4,
+                                         &cm->lf_info, lfl,
+                                         (int)cm->seq_params.bit_depth);
+       } else {
+         filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, m16, m8, m4,
+                                  &cm->lf_info, lfl);
+       }
+     }
+     dst->buf += unit_row_stride;
+   }
+   // put the buffer pointer back for the next block
+   dst->buf = saved_buf;
+ }
+
+ // Filters the vertical edges of one superblock from the left_{y,u,v} bitmasks.
+ // Rows are handled two at a time; within a row pair the superblock is covered
+ // in MI_SIZE_64X64-wide steps, each with its own LoopFilterMask. dst->buf is
+ // moved down as rows are consumed and is not restored on return.
+ //   cm              common state (mask lookup, frame size, filter info)
+ //   plane_ptr       plane whose dst buffer is filtered
+ //   pl              0, 1 or 2; any other value asserts and returns
+ //   mi_row, mi_col  position of the superblock
+ void av1_filter_block_plane_ver(AV1_COMMON *const cm,
+                                 struct macroblockd_plane *const plane_ptr,
+                                 int pl, int mi_row, int mi_col) {
+   struct buf_2d *const dst = &plane_ptr->dst;
+   const int ssx = plane_ptr->subsampling_x;
+   const int ssy = plane_ptr->subsampling_y;
+   // Each row contributes 16 mask bits.
+   const uint64_t low16 = 0xffff;
+   const int next_row_delta = 1 << ssy;
+   const int row_pair_step = 2 << ssy;
+   int r, c;
+
+   for (r = 0; r < cm->seq_params.mib_size &&
+               (((mi_row + r) << MI_SIZE_LOG2) < cm->height);
+        r += row_pair_step) {
+     for (c = 0; c < cm->seq_params.mib_size &&
+                 (((mi_col + c) << MI_SIZE_LOG2) < cm->width);
+          c += MI_SIZE_64X64) {
+       // Horizontal offset of this 64-wide step inside the plane buffer.
+       const int x_off = (c << MI_SIZE_LOG2) >> ssx;
+       dst->buf += x_off;
+
+       LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+       assert(lfm);
+       const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+       const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+       // The current and the next row are expected to share one mask word,
+       // so only the shift is taken from the second lookup.
+       const int row_next = row + next_row_delta;
+       int index = 0;
+       int index_next = 0;
+       const int shift = get_index_shift(col, row, &index);
+       const int shift_next = get_index_shift(col, row_next, &index_next);
+       const FilterMask *edges;
+       uint8_t *lfl;
+       uint8_t *lfl2;
+
+       if (pl == 0) {
+         edges = lfm->left_y;
+         lfl = &lfm->lfl_y_ver[row][col];
+         lfl2 = &lfm->lfl_y_ver[row_next][col];
+       } else if (pl == 1) {
+         edges = lfm->left_u;
+         lfl = &lfm->lfl_u[row][col];
+         lfl2 = &lfm->lfl_u[row_next][col];
+       } else if (pl == 2) {
+         edges = lfm->left_v;
+         lfl = &lfm->lfl_v[row][col];
+         lfl2 = &lfm->lfl_v[row_next][col];
+       } else {
+         assert(pl >= 0 && pl <= 2);
+         return;
+       }
+
+       const uint64_t word_16x16 = edges[TX_16X16].bits[index];
+       const uint64_t word_8x8 = edges[TX_8X8].bits[index];
+       const uint64_t word_4x4 = edges[TX_4X4].bits[index];
+       const uint64_t m16_a = (word_16x16 >> shift) & low16;
+       const uint64_t m8_a = (word_8x8 >> shift) & low16;
+       const uint64_t m4_a = (word_4x4 >> shift) & low16;
+       const uint64_t m16_b = (word_16x16 >> shift_next) & low16;
+       const uint64_t m8_b = (word_8x8 >> shift_next) & low16;
+       const uint64_t m4_b = (word_4x4 >> shift_next) & low16;
+
+       if (cm->seq_params.use_highbitdepth) {
+         highbd_filter_selectively_vert_row2(
+             ssx, CONVERT_TO_SHORTPTR(dst->buf), dst->stride, pl, m16_a, m8_a,
+             m4_a, m16_b, m8_b, m4_b, &cm->lf_info, lfl, lfl2,
+             (int)cm->seq_params.bit_depth);
+       } else {
+         filter_selectively_vert_row2(ssx, dst->buf, dst->stride, pl, m16_a,
+                                      m8_a, m4_a, m16_b, m8_b, m4_b,
+                                      &cm->lf_info, lfl, lfl2);
+       }
+
+       dst->buf -= x_off;
+     }
+     dst->buf += 2 * MI_SIZE * dst->stride;
+   }
+ }
+
+ // Filters the horizontal edges of one superblock from the above_{y,u,v}
+ // bitmasks, one unit row at a time; each row is covered in MI_SIZE_64X64-wide
+ // steps, each with its own LoopFilterMask. The row with mi_row + r == 0 is not
+ // filtered. dst->buf is moved down as rows are consumed and is not restored
+ // on return.
+ //   cm              common state (mask lookup, frame size, filter info)
+ //   plane_ptr       plane whose dst buffer is filtered
+ //   pl              0, 1 or 2; any other value asserts and returns
+ //   mi_row, mi_col  position of the superblock
+ void av1_filter_block_plane_hor(AV1_COMMON *const cm,
+                                 struct macroblockd_plane *const plane_ptr,
+                                 int pl, int mi_row, int mi_col) {
+   struct buf_2d *const dst = &plane_ptr->dst;
+   const int ssx = plane_ptr->subsampling_x;
+   const int ssy = plane_ptr->subsampling_y;
+   // Each row contributes 16 mask bits.
+   const uint64_t low16 = 0xffff;
+   const int row_step = 1 << ssy;
+   int r, c;
+
+   for (r = 0; r < cm->seq_params.mib_size &&
+               (((mi_row + r) << MI_SIZE_LOG2) < cm->height);
+        r += row_step) {
+     // Nothing is filtered on this row; it only moves the buffer pointer.
+     const int skip_row = (mi_row + r == 0);
+
+     for (c = 0; !skip_row && c < cm->seq_params.mib_size &&
+                 (((mi_col + c) << MI_SIZE_LOG2) < cm->width);
+          c += MI_SIZE_64X64) {
+       // Horizontal offset of this 64-wide step inside the plane buffer.
+       const int x_off = (c << MI_SIZE_LOG2) >> ssx;
+       dst->buf += x_off;
+
+       LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row + r, mi_col + c);
+       assert(lfm);
+       const int row = ((mi_row + r) | ssy) % MI_SIZE_64X64;
+       const int col = ((mi_col + c) | ssx) % MI_SIZE_64X64;
+       int index = 0;
+       const int shift = get_index_shift(col, row, &index);
+       const FilterMask *edges;
+       uint8_t *lfl;
+
+       if (pl == 0) {
+         edges = lfm->above_y;
+         lfl = &lfm->lfl_y_hor[row][col];
+       } else if (pl == 1) {
+         edges = lfm->above_u;
+         lfl = &lfm->lfl_u[row][col];
+       } else if (pl == 2) {
+         edges = lfm->above_v;
+         lfl = &lfm->lfl_v[row][col];
+       } else {
+         assert(pl >= 0 && pl <= 2);
+         return;
+       }
+
+       const uint64_t m16 = (edges[TX_16X16].bits[index] >> shift) & low16;
+       const uint64_t m8 = (edges[TX_8X8].bits[index] >> shift) & low16;
+       const uint64_t m4 = (edges[TX_4X4].bits[index] >> shift) & low16;
+
+       if (cm->seq_params.use_highbitdepth) {
+         highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
+                                         dst->stride, pl, ssx, m16, m8, m4,
+                                         &cm->lf_info, lfl,
+                                         (int)cm->seq_params.bit_depth);
+       } else {
+         filter_selectively_horiz(dst->buf, dst->stride, pl, ssx, m16, m8, m4,
+                                  &cm->lf_info, lfl);
+       }
+
+       dst->buf -= x_off;
+     }
+     dst->buf += MI_SIZE * dst->stride;
+   }
+ }
+#endif // LOOP_FILTER_BITMASK
+
+ // Returns the transform size that applies at (mi_row, mi_col) of `plane`,
+ // already converted to the size along the filtering direction of `edge_dir`.
+ //   xd         may be NULL; when set and the block's segment is lossless the
+ //              result is TX_4X4
+ //   mbmi       mode info of the block; must not be NULL
+ //   plane_ptr  supplies the subsampling used for chroma planes
+ static TX_SIZE get_transform_size(const MACROBLOCKD *const xd,
+                                   const MB_MODE_INFO *const mbmi,
+                                   const EDGE_DIR edge_dir, const int mi_row,
+                                   const int mi_col, const int plane,
+                                   const struct macroblockd_plane *plane_ptr) {
+   assert(mbmi != NULL);
+   if (xd && xd->lossless[mbmi->segment_id]) return TX_4X4;
+
+   const int is_luma = (plane == AOM_PLANE_Y);
+   TX_SIZE tx_size;
+   if (is_luma) {
+     tx_size = mbmi->tx_size;
+   } else {
+     tx_size = av1_get_max_uv_txsize(mbmi->sb_type, plane_ptr->subsampling_x,
+                                     plane_ptr->subsampling_y);
+   }
+   assert(tx_size < TX_SIZES_ALL);
+
+   // Non-skipped inter luma blocks store one transform size per transform
+   // block; look up the entry covering this position inside the block.
+   if (is_luma && is_inter_block(mbmi) && !mbmi->skip) {
+     const BLOCK_SIZE bsize = mbmi->sb_type;
+     const int row_in_blk = mi_row & (mi_size_high[bsize] - 1);
+     const int col_in_blk = mi_col & (mi_size_wide[bsize] - 1);
+     tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(bsize, row_in_blk,
+                                                          col_in_blk)];
+     assert(tx_size < TX_SIZES_ALL);
+   }
+
+   // For chroma or a non-square transform the size has to be converted to the
+   // size in one direction: a vertical edge is filtered horizontally, a
+   // horizontal edge vertically.
+   if (VERT_EDGE == edge_dir) return txsize_horz_map[tx_size];
+   return txsize_vert_map[tx_size];
+ }
+
+ typedef struct AV1_DEBLOCKING_PARAMETERS { // per-edge result of set_lpf_parameters()
+ // length of the filter applied to the outer edge (4, 6, 8 or 14); 0 means
+ // the edge is not filtered
+ uint32_t filter_length;
+ // deblocking limits, taken from the loop_filter_thresh entry of the chosen
+ // filter level; only set when filter_length is non-zero
+ const uint8_t *lim;
+ const uint8_t *mblim;
+ const uint8_t *hev_thr;
+ } AV1_DEBLOCKING_PARAMETERS;
+
+ // Decides whether and how the edge at plane position (x, y) is filtered and
+ // records the decision in *params (filter_length 0 means "no filtering").
+ // Returns the TX_SIZE from get_transform_size(), so it is plane and direction
+ // aware; returns TX_4X4 when (x, y) lies outside the plane and TX_INVALID
+ // when the current or the previous mode info is missing.
+ //   mode_step  distance, in mi_grid_visible entries, back to the block on the
+ //              other side of the edge
+ static TX_SIZE set_lpf_parameters(
+     AV1_DEBLOCKING_PARAMETERS *const params, const ptrdiff_t mode_step,
+     const AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+     const EDGE_DIR edge_dir, const uint32_t x, const uint32_t y,
+     const int plane, const struct macroblockd_plane *const plane_ptr) {
+   const int is_vert = (VERT_EDGE == edge_dir);
+
+   // start from "no filtering"
+   params->filter_length = 0;
+
+   // positions outside the plane need no deblocking; report the smallest
+   // transform unit so the caller keeps advancing
+   if ((uint32_t)plane_ptr->dst.width <= x ||
+       (uint32_t)plane_ptr->dst.height <= y) {
+     return TX_4X4;
+   }
+
+   const uint32_t ss_x = plane_ptr->subsampling_x;
+   const uint32_t ss_y = plane_ptr->subsampling_y;
+   // For a sub8x8 block the chroma prediction mode comes from the bottom/right
+   // mi structure of the co-located 8x8 luma block, so for chroma planes both
+   // mi_row and mi_col are forced to be odd.
+   const int mi_row = ss_y | ((y << ss_y) >> MI_SIZE_LOG2);
+   const int mi_col = ss_x | ((x << ss_x) >> MI_SIZE_LOG2);
+   MB_MODE_INFO **const mi =
+       cm->mi_grid_visible + mi_row * cm->mi_stride + mi_col;
+   const MB_MODE_INFO *const mbmi = mi[0];
+   // A missing mbmi (for example, a tile that was not coded) yields an invalid
+   // value so the caller stops filtering here.
+   if (mbmi == NULL) return TX_INVALID;
+
+   const TX_SIZE ts =
+       get_transform_size(xd, mbmi, edge_dir, mi_row, mi_col, plane, plane_ptr);
+
+   // only edges of a transform unit are candidates for deblocking
+   const uint32_t coord = is_vert ? x : y;
+   const uint32_t tu_mask =
+       is_vert ? tx_size_wide[ts] - 1 : tx_size_high[ts] - 1;
+   if (coord & tu_mask) return ts;
+
+   const uint32_t curr_level =
+       get_filter_level(cm, &cm->lf_info, edge_dir, plane, mbmi);
+   const int curr_skipped = mbmi->skip && is_inter_block(mbmi);
+
+   // coordinate 0 has no block on the other side of the edge
+   if (!coord) return ts;
+
+   const MB_MODE_INFO *const mi_prev = *(mi - mode_step);
+   if (mi_prev == NULL) return TX_INVALID;
+
+   const int pv_row = is_vert ? mi_row : mi_row - (1 << ss_y);
+   const int pv_col = is_vert ? mi_col - (1 << ss_x) : mi_col;
+   const TX_SIZE pv_ts = get_transform_size(xd, mi_prev, edge_dir, pv_row,
+                                            pv_col, plane, plane_ptr);
+   const uint32_t pv_lvl =
+       get_filter_level(cm, &cm->lf_info, edge_dir, plane, mi_prev);
+   const int pv_skip = mi_prev->skip && is_inter_block(mi_prev);
+
+   const BLOCK_SIZE bsize = get_plane_block_size(
+       mbmi->sb_type, plane_ptr->subsampling_x, plane_ptr->subsampling_y);
+   const int pu_mask =
+       is_vert ? block_size_wide[bsize] - 1 : block_size_high[bsize] - 1;
+   const int pu_edge = !(coord & pu_mask);
+
+   // No filtering when both levels are zero, or when both blocks are skipped
+   // and the edge is not an edge of the prediction unit.
+   if (!(curr_level || pv_lvl)) return ts;
+   if (pv_skip && curr_skipped && !pu_edge) return ts;
+
+   const TX_SIZE min_ts = AOMMIN(ts, pv_ts);
+   uint32_t length;
+   if (min_ts <= TX_4X4) {
+     length = 4;
+   } else if (plane != 0) {
+     // no 8-tap or wide filtering for chroma planes
+     length = 6;
+   } else if (min_ts == TX_8X8) {
+     length = 8;
+   } else {
+     length = 14;
+   }
+   params->filter_length = length;
+
+   // fall back to the previous block's level when the current level is zero
+   const uint32_t level = curr_level ? curr_level : pv_lvl;
+   const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+   params->lim = limits->lim;
+   params->mblim = limits->mblim;
+   params->hev_thr = limits->hev_thr;
+
+   return ts;
+ }
+
+ // Filters the vertical edges of one MAX_MIB_SIZE x MAX_MIB_SIZE area of
+ // `plane` whose top-left corner is at (mi_row, mi_col). For each unit row it
+ // steps from transform block to transform block, asks set_lpf_parameters()
+ // for the filter length and limits, and calls the matching
+ // aom_lpf_vertical_* (or aom_highbd_lpf_vertical_*) kernel.
+ void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                  const MACROBLOCKD *const xd, const int plane,
+                                  const MACROBLOCKD_PLANE *const plane_ptr,
+                                  const uint32_t mi_row, const uint32_t mi_col) {
+   const uint32_t ss_x = plane_ptr->subsampling_x;
+   const uint32_t ss_y = plane_ptr->subsampling_y;
+   uint8_t *const base = plane_ptr->dst.buf;
+   const int stride = plane_ptr->dst.stride;
+   const int unit_rows = (MAX_MIB_SIZE >> ss_y);
+   const int unit_cols = (MAX_MIB_SIZE >> ss_x);
+   const int highbd = cm->seq_params.use_highbitdepth;
+   const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+   // plane coordinates of the area's top-left corner
+   const uint32_t x0 = (mi_col * MI_SIZE) >> ss_x;
+   const uint32_t y0 = (mi_row * MI_SIZE) >> ss_y;
+
+   for (int y = 0; y < unit_rows; y += MI_SIZE >> MI_SIZE_LOG2) {
+     uint8_t *p = base + y * MI_SIZE * stride;
+     const uint32_t curr_y = y0 + y * MI_SIZE;
+     int x = 0;
+
+     // Each step handles the vertical edge at the left of one transform
+     // block and then jumps to the next block along the row.
+     while (x < unit_cols) {
+       const uint32_t curr_x = x0 + x * MI_SIZE;
+       AV1_DEBLOCKING_PARAMETERS params;
+       memset(&params, 0, sizeof(params));
+
+       TX_SIZE tx_size =
+           set_lpf_parameters(&params, ((ptrdiff_t)1 << ss_x), cm, xd,
+                              VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+       if (tx_size == TX_INVALID) {
+         // missing mode info: filter nothing and advance by the smallest unit
+         params.filter_length = 0;
+         tx_size = TX_4X4;
+       }
+
+       if (highbd) {
+         switch (params.filter_length) {
+           case 4:
+             aom_highbd_lpf_vertical_4(CONVERT_TO_SHORTPTR(p), stride,
+                                       params.mblim, params.lim, params.hev_thr,
+                                       bit_depth);
+             break;
+           case 6:  // chroma planes only
+             assert(plane != 0);
+             aom_highbd_lpf_vertical_6(CONVERT_TO_SHORTPTR(p), stride,
+                                       params.mblim, params.lim, params.hev_thr,
+                                       bit_depth);
+             break;
+           case 8:
+             aom_highbd_lpf_vertical_8(CONVERT_TO_SHORTPTR(p), stride,
+                                       params.mblim, params.lim, params.hev_thr,
+                                       bit_depth);
+             break;
+           case 14:
+             aom_highbd_lpf_vertical_14(CONVERT_TO_SHORTPTR(p), stride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, bit_depth);
+             break;
+           default: break;  // no filtering
+         }
+       } else {
+         switch (params.filter_length) {
+           case 4:
+             aom_lpf_vertical_4(p, stride, params.mblim, params.lim,
+                                params.hev_thr);
+             break;
+           case 6:  // chroma planes only
+             assert(plane != 0);
+             aom_lpf_vertical_6(p, stride, params.mblim, params.lim,
+                                params.hev_thr);
+             break;
+           case 8:
+             aom_lpf_vertical_8(p, stride, params.mblim, params.lim,
+                                params.hev_thr);
+             break;
+           case 14:
+             aom_lpf_vertical_14(p, stride, params.mblim, params.lim,
+                                 params.hev_thr);
+             break;
+           default: break;  // no filtering
+         }
+       }
+
+       // move right by the width of the transform block just handled
+       const uint32_t advance_units = tx_size_wide_unit[tx_size];
+       x += advance_units;
+       p += advance_units * MI_SIZE;
+     }
+   }
+ }
+
+ // Filters the horizontal edges of one MAX_MIB_SIZE x MAX_MIB_SIZE area of
+ // `plane` whose top-left corner is at (mi_row, mi_col). For each unit column
+ // it steps down from transform block to transform block, asks
+ // set_lpf_parameters() for the filter length and limits, and calls the
+ // matching aom_lpf_horizontal_* (or aom_highbd_lpf_horizontal_*) kernel.
+ void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                  const MACROBLOCKD *const xd, const int plane,
+                                  const MACROBLOCKD_PLANE *const plane_ptr,
+                                  const uint32_t mi_row, const uint32_t mi_col) {
+   const uint32_t ss_x = plane_ptr->subsampling_x;
+   const uint32_t ss_y = plane_ptr->subsampling_y;
+   uint8_t *const base = plane_ptr->dst.buf;
+   const int stride = plane_ptr->dst.stride;
+   const int unit_rows = (MAX_MIB_SIZE >> ss_y);
+   const int unit_cols = (MAX_MIB_SIZE >> ss_x);
+   const int highbd = cm->seq_params.use_highbitdepth;
+   const aom_bit_depth_t bit_depth = cm->seq_params.bit_depth;
+   // plane coordinates of the area's top-left corner
+   const uint32_t x0 = (mi_col * MI_SIZE) >> ss_x;
+   const uint32_t y0 = (mi_row * MI_SIZE) >> ss_y;
+
+   for (int x = 0; x < unit_cols; x += MI_SIZE >> MI_SIZE_LOG2) {
+     uint8_t *p = base + x * MI_SIZE;
+     const uint32_t curr_x = x0 + x * MI_SIZE;
+     int y = 0;
+
+     // Each step handles the horizontal edge at the top of one transform
+     // block and then jumps to the next block down the column.
+     while (y < unit_rows) {
+       const uint32_t curr_y = y0 + y * MI_SIZE;
+       AV1_DEBLOCKING_PARAMETERS params;
+       memset(&params, 0, sizeof(params));
+
+       TX_SIZE tx_size =
+           set_lpf_parameters(&params, (cm->mi_stride << ss_y), cm, xd,
+                              HORZ_EDGE, curr_x, curr_y, plane, plane_ptr);
+       if (tx_size == TX_INVALID) {
+         // missing mode info: filter nothing and advance by the smallest unit
+         params.filter_length = 0;
+         tx_size = TX_4X4;
+       }
+
+       if (highbd) {
+         switch (params.filter_length) {
+           case 4:
+             aom_highbd_lpf_horizontal_4(CONVERT_TO_SHORTPTR(p), stride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, bit_depth);
+             break;
+           case 6:  // chroma planes only
+             assert(plane != 0);
+             aom_highbd_lpf_horizontal_6(CONVERT_TO_SHORTPTR(p), stride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, bit_depth);
+             break;
+           case 8:
+             aom_highbd_lpf_horizontal_8(CONVERT_TO_SHORTPTR(p), stride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, bit_depth);
+             break;
+           case 14:
+             aom_highbd_lpf_horizontal_14(CONVERT_TO_SHORTPTR(p), stride,
+                                          params.mblim, params.lim,
+                                          params.hev_thr, bit_depth);
+             break;
+           default: break;  // no filtering
+         }
+       } else {
+         switch (params.filter_length) {
+           case 4:
+             aom_lpf_horizontal_4(p, stride, params.mblim, params.lim,
+                                  params.hev_thr);
+             break;
+           case 6:  // chroma planes only
+             assert(plane != 0);
+             aom_lpf_horizontal_6(p, stride, params.mblim, params.lim,
+                                  params.hev_thr);
+             break;
+           case 8:
+             aom_lpf_horizontal_8(p, stride, params.mblim, params.lim,
+                                  params.hev_thr);
+             break;
+           case 14:
+             aom_lpf_horizontal_14(p, stride, params.mblim, params.lim,
+                                   params.hev_thr);
+             break;
+           default: break;  // no filtering
+         }
+       }
+
+       // move down by the height of the transform block just handled
+       const uint32_t advance_units = tx_size_high_unit[tx_size];
+       y += advance_units;
+       p += advance_units * stride * MI_SIZE;
+     }
+   }
+ }
+
+ // Runs the deblocking filter over mi rows [start, stop) of frame_buffer for
+ // planes [plane_start, plane_end).
+ // Plane handling: if both luma filter levels are zero when plane 0 is
+ // reached, the whole loop stops; a chroma plane whose level is zero is
+ // skipped.
+ // With LOOP_FILTER_BITMASK and is_decoding set, the bitmasks are built for
+ // the plane and the frame is filtered in 64x64 steps, the horizontal pass
+ // trailing the vertical pass by one step. With LOOP_FILTER_BITMASK otherwise,
+ // each superblock gets av1_setup_bitmask() plus the bitmask-driven vertical
+ // pass, followed by a separate horizontal pass. Without LOOP_FILTER_BITMASK
+ // the per-edge functions are used, either interleaved
+ // (cm->lf.combine_vert_horz_lf) or as two full passes.
+ static void loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
+                              MACROBLOCKD *xd, int start, int stop,
+ #if LOOP_FILTER_BITMASK
+                              int is_decoding,
+ #endif
+                              int plane_start, int plane_end) {
+   struct macroblockd_plane *const planes = xd->plane;
+   const int first_col = 0;
+   const int end_col = cm->mi_cols;
+   int mi_row, mi_col;
+   int plane;
+
+ #if LOOP_FILTER_BITMASK
+   if (is_decoding) {
+     for (plane = plane_start; plane < plane_end; plane++) {
+       if (plane == 0 && !(cm->lf.filter_level[0]) &&
+           !(cm->lf.filter_level[1])) {
+         break;
+       }
+       if (plane == 1 && !(cm->lf.filter_level_u)) continue;
+       if (plane == 2 && !(cm->lf.filter_level_v)) continue;
+
+       av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer, 0, 0,
+                            plane, plane + 1);
+       av1_build_bitmask_vert_info(cm, &planes[plane], plane);
+       av1_build_bitmask_horz_info(cm, &planes[plane], plane);
+
+       // single trip through the buffer: vertical edges of the current
+       // 64x64 step, then horizontal edges of the step to its left
+       for (mi_row = start; mi_row < stop; mi_row += MI_SIZE_64X64) {
+         for (mi_col = first_col; mi_col < end_col; mi_col += MI_SIZE_64X64) {
+           av1_setup_dst_planes(planes, MI_SIZE_64X64, frame_buffer, mi_row,
+                                mi_col, plane, plane + 1);
+           av1_filter_block_plane_bitmask_vert(cm, &planes[plane], plane,
+                                               mi_row, mi_col);
+           if (mi_col - MI_SIZE_64X64 >= 0) {
+             av1_setup_dst_planes(planes, MI_SIZE_64X64, frame_buffer, mi_row,
+                                  mi_col - MI_SIZE_64X64, plane, plane + 1);
+             av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane,
+                                                 mi_row, mi_col - MI_SIZE_64X64);
+           }
+         }
+         // horizontal edges of the last step in the row
+         av1_setup_dst_planes(planes, MI_SIZE_64X64, frame_buffer, mi_row,
+                              mi_col - MI_SIZE_64X64, plane, plane + 1);
+         av1_filter_block_plane_bitmask_horz(cm, &planes[plane], plane, mi_row,
+                                             mi_col - MI_SIZE_64X64);
+       }
+     }
+     return;
+   }
+ #endif
+
+   for (plane = plane_start; plane < plane_end; plane++) {
+     if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1])) {
+       break;
+     }
+     if (plane == 1 && !(cm->lf.filter_level_u)) continue;
+     if (plane == 2 && !(cm->lf.filter_level_v)) continue;
+
+ #if LOOP_FILTER_BITMASK
+     // vertical edges, one superblock (128x128 or 64x64) at a time
+     for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+       for (mi_col = first_col; mi_col < end_col;
+            mi_col += cm->seq_params.mib_size) {
+         av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                              mi_row, mi_col, plane, plane + 1);
+
+         av1_setup_bitmask(cm, mi_row, mi_col, plane,
+                           planes[plane].subsampling_x,
+                           planes[plane].subsampling_y, stop, end_col);
+         av1_filter_block_plane_ver(cm, &planes[plane], plane, mi_row, mi_col);
+       }
+     }
+
+     // horizontal edges, one superblock at a time
+     for (mi_row = start; mi_row < stop; mi_row += cm->seq_params.mib_size) {
+       for (mi_col = first_col; mi_col < end_col;
+            mi_col += cm->seq_params.mib_size) {
+         av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                              mi_row, mi_col, plane, plane + 1);
+
+         av1_filter_block_plane_hor(cm, &planes[plane], plane, mi_row, mi_col);
+       }
+     }
+ #else
+     if (cm->lf.combine_vert_horz_lf) {
+       // vertical and horizontal edges interleaved per 128x128 super block:
+       // the horizontal pass runs one block behind the vertical pass
+       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+         for (mi_col = first_col; mi_col < end_col; mi_col += MAX_MIB_SIZE) {
+           // vertical edges of the current block
+           av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                                mi_row, mi_col, plane, plane + 1);
+           av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                       mi_col);
+           // horizontal edges of the block to the left
+           if (mi_col - MAX_MIB_SIZE >= 0) {
+             av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                                  mi_row, mi_col - MAX_MIB_SIZE, plane,
+                                  plane + 1);
+             av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                         mi_col - MAX_MIB_SIZE);
+           }
+         }
+         // horizontal edges of the last block in the row
+         av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                              mi_row, mi_col - MAX_MIB_SIZE, plane, plane + 1);
+         av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                     mi_col - MAX_MIB_SIZE);
+       }
+     } else {
+       // first pass: vertical edges in every 128x128 super block
+       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+         for (mi_col = first_col; mi_col < end_col; mi_col += MAX_MIB_SIZE) {
+           av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                                mi_row, mi_col, plane, plane + 1);
+           av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                       mi_col);
+         }
+       }
+
+       // second pass: horizontal edges in every 128x128 super block
+       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+         for (mi_col = first_col; mi_col < end_col; mi_col += MAX_MIB_SIZE) {
+           av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+                                mi_row, mi_col, plane, plane + 1);
+           av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                       mi_col);
+         }
+       }
+     }
+ #endif  // LOOP_FILTER_BITMASK
+   }
+ }
+
+ // Deblocks `frame` for planes [plane_start, plane_end).
+ // By default every mi row is filtered. When partial_frame is set and the
+ // frame has more than 8 mi rows, only a band is filtered: it starts at half
+ // the frame height rounded down to a multiple of 8 and covers
+ // max(mi_rows / 8, 8) rows.
+ void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                            MACROBLOCKD *xd,
+ #if LOOP_FILTER_BITMASK
+                            int is_decoding,
+ #endif
+                            int plane_start, int plane_end, int partial_frame) {
+   int first_row = 0;
+   int row_count = cm->mi_rows;
+
+   if (partial_frame && cm->mi_rows > 8) {
+     first_row = (cm->mi_rows >> 1) & 0xfffffff8;
+     row_count = AOMMAX(cm->mi_rows / 8, 8);
+   }
+
+   av1_loop_filter_frame_init(cm, plane_start, plane_end);
+   loop_filter_rows(frame, cm, xd, first_row, first_row + row_count,
+ #if LOOP_FILTER_BITMASK
+                    is_decoding,
+ #endif
+                    plane_start, plane_end);
+ }
diff --git a/third_party/aom/av1/common/av1_loopfilter.h b/third_party/aom/av1/common/av1_loopfilter.h
new file mode 100644
index 0000000000..80ac611781
--- /dev/null
+++ b/third_party/aom/av1/common/av1_loopfilter.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+#define AOM_AV1_COMMON_AV1_LOOPFILTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LOOP_FILTER 63
+#define MAX_SHARPNESS 7
+
+#define SIMD_WIDTH 16
+
+enum lf_path {
+ LF_PATH_420,
+ LF_PATH_444,
+ LF_PATH_SLOW,
+};
+
+#if LOOP_FILTER_BITMASK
+typedef struct {
+ uint64_t bits[4];  // 4 x 64 = 256 mask bits in total
+} FilterMask;
+
+// This structure holds bit masks for all 4x4 blocks in a 64x64 region.
+// Each 1 bit represents a position in which we want to apply the loop filter.
+// For Y plane, 4x4 in 64x64 requires 16x16 = 256 bit, therefore we use 4
+// uint64_t; For U, V plane, for 420 format, plane size is 32x32, thus we use
+// a uint64_t to represent bitmask.
+// Left_ entries refer to whether we apply a filter on the border to the
+// left of the block. Above_ entries refer to whether or not to apply a
+// filter on the above border.
+// Since each transform is accompanied by a potentially different type of
+// loop filter there is a different entry in the array for each transform size.
+typedef struct {
+ FilterMask left_y[TX_SIZES];
+ FilterMask above_y[TX_SIZES];
+ FilterMask left_u[TX_SIZES];
+ FilterMask above_u[TX_SIZES];
+ FilterMask left_v[TX_SIZES];
+ FilterMask above_v[TX_SIZES];
+
+ // Y plane vertical edge and horizontal edge filter level
+ uint8_t lfl_y_hor[MI_SIZE_64X64][MI_SIZE_64X64];
+ uint8_t lfl_y_ver[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // U plane filter level
+ uint8_t lfl_u[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // V plane filter level
+ uint8_t lfl_v[MI_SIZE_64X64][MI_SIZE_64X64];
+
+ // other info
+ FilterMask skip;
+ FilterMask is_vert_border;
+ FilterMask is_horz_border;
+ // Y or UV planes, 5 tx sizes: 4x4, 8x8, 16x16, 32x32, 64x64
+ FilterMask tx_size_ver[2][5];
+ FilterMask tx_size_hor[2][5];
+} LoopFilterMask;
+#endif // LOOP_FILTER_BITMASK
+
+struct loopfilter {
+ int filter_level[2];
+ int filter_level_u;
+ int filter_level_v;
+
+ int sharpness_level;
+
+ uint8_t mode_ref_delta_enabled;
+ uint8_t mode_ref_delta_update;
+
+ // 0 = Intra, Last, Last2+Last3,
+ // GF, BRF, ARF2, ARF
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+
+ int combine_vert_horz_lf;
+
+#if LOOP_FILTER_BITMASK
+ LoopFilterMask *lfm;
+ size_t lfm_num;
+ int lfm_stride;
+#endif // LOOP_FILTER_BITMASK
+};
+
+// Need to align this structure so when it is declared and
+// passed it can be loaded into vector registers.
+typedef struct {
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, mblim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, lim[SIMD_WIDTH]);
+ DECLARE_ALIGNED(SIMD_WIDTH, uint8_t, hev_thr[SIMD_WIDTH]);
+} loop_filter_thresh;
+
+typedef struct {
+ loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
+ uint8_t lvl[MAX_MB_PLANE][MAX_SEGMENTS][2][REF_FRAMES][MAX_MODE_LF_DELTAS];
+} loop_filter_info_n;
+
+/* assorted loopfilter functions which get used elsewhere */
+struct AV1Common;
+struct macroblockd;
+struct AV1LfSyncData;
+
+void av1_loop_filter_init(struct AV1Common *cm);
+
+void av1_loop_filter_frame_init(struct AV1Common *cm, int plane_start,
+ int plane_end);
+
+#if LOOP_FILTER_BITMASK
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int is_decoding,
+ int plane_start, int plane_end, int partial_frame);
+#else
+void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int plane_start,
+ int plane_end, int partial_frame);
+#endif
+
+void av1_filter_block_plane_vert(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+void av1_filter_block_plane_horz(const struct AV1Common *const cm,
+ const MACROBLOCKD *const xd, const int plane,
+ const MACROBLOCKD_PLANE *const plane_ptr,
+ const uint32_t mi_row, const uint32_t mi_col);
+
+typedef struct LoopFilterWorkerData {
+ YV12_BUFFER_CONFIG *frame_buffer;
+ struct AV1Common *cm;
+ struct macroblockd_plane planes[MAX_MB_PLANE];
+ // TODO(Ranjit): When the filter functions are modified to use xd->lossless
+ // add lossless as a member here.
+ MACROBLOCKD *xd;
+} LFWorkerData;
+
+uint8_t get_filter_level(const struct AV1Common *cm,
+ const loop_filter_info_n *lfi_n, const int dir_idx,
+ int plane, const MB_MODE_INFO *mbmi);
+#if LOOP_FILTER_BITMASK
+void av1_setup_bitmask(struct AV1Common *const cm, int mi_row, int mi_col,
+ int plane, int subsampling_x, int subsampling_y,
+ int row_end, int col_end);
+
+void av1_filter_block_plane_ver(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane_ptr,
+ int pl, int mi_row, int mi_col);
+
+void av1_filter_block_plane_hor(struct AV1Common *const cm,
+ struct macroblockd_plane *const plane, int pl,
+ int mi_row, int mi_col);
+LoopFilterMask *get_loop_filter_mask(const struct AV1Common *const cm,
+ int mi_row, int mi_col);
+int get_index_shift(int mi_col, int mi_row, int *index);
+
+static const FilterMask left_txform_mask[TX_SIZES] = {  // per tx size: bit 0 of 1, 2, 4, 8, 16 consecutive 16-bit groups
+ { { 0x0000000000000001ULL, // TX_4X4,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0000000000010001ULL, // TX_8X8,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_16X16,
+ 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_32X32,
+ 0x0001000100010001ULL, 0x0000000000000000ULL, 0x0000000000000000ULL } },
+
+ { { 0x0001000100010001ULL, // TX_64X64,
+ 0x0001000100010001ULL, 0x0001000100010001ULL, 0x0001000100010001ULL } },
+};
+
+static const uint64_t above_txform_mask[2][TX_SIZES] = {  // NOTE(review): meaning of the first index is not visible here - confirm against users
+ {  // 1, 2, 4, 8, 16 contiguous low bits set
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000003ULL, // TX_8X8
+ 0x000000000000000fULL, // TX_16X16
+ 0x00000000000000ffULL, // TX_32X32
+ 0x000000000000ffffULL, // TX_64X64
+ },
+ {  // same bit counts, but set at every other bit position
+ 0x0000000000000001ULL, // TX_4X4
+ 0x0000000000000005ULL, // TX_8X8
+ 0x0000000000000055ULL, // TX_16X16
+ 0x0000000000005555ULL, // TX_32X32
+ 0x0000000055555555ULL, // TX_64X64
+ },
+};
+
+extern const int mask_id_table_tx_4x4[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_8x8[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_16x16[BLOCK_SIZES_ALL];
+
+extern const int mask_id_table_tx_32x32[BLOCK_SIZES_ALL];
+
+extern const FilterMask left_mask_univariant_reordered[67];
+
+extern const FilterMask above_mask_univariant_reordered[67];
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_AV1_LOOPFILTER_H_
diff --git a/third_party/aom/av1/common/av1_rtcd.c b/third_party/aom/av1/common/av1_rtcd.c
new file mode 100644
index 0000000000..a77a4d2541
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd.c
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void av1_rtcd(void) {  // Takes no arguments; runs setup_rtcd_internal through aom_once().
+ // TODO(JBB): Remove this aom_once, by ensuring that both the encoder and
+ // decoder setup functions are protected by aom_once();
+ aom_once(setup_rtcd_internal);
+}
diff --git a/third_party/aom/av1/common/av1_rtcd_defs.pl b/third_party/aom/av1/common/av1_rtcd_defs.pl
new file mode 100755
index 0000000000..dee1f1c79b
--- /dev/null
+++ b/third_party/aom/av1/common/av1_rtcd_defs.pl
@@ -0,0 +1,398 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub av1_common_forward_decls() {
+print <<EOF
+/*
+ * AV1
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/filter.h"
+#include "av1/common/convolve.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/odintrin.h"
+#include "av1/common/restoration.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct txfm_param;
+struct aom_variance_vtable;
+struct search_site_config;
+struct yv12_buffer_config;
+
+/* Function pointers return by CfL functions */
+typedef void (*cfl_subsample_lbd_fn)(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subsample_hbd_fn)(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+typedef void (*cfl_subtract_average_fn)(const uint16_t *src, int16_t *dst);
+
+typedef void (*cfl_predict_lbd_fn)(const int16_t *src, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+typedef void (*cfl_predict_hbd_fn)(const int16_t *src, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+EOF
+}
+forward_decls qw/av1_common_forward_decls/;
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn";
+specialize qw/av1_convolve_horiz_rs sse4_1/;
+
+add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int x0_qn, int x_step_qn, int bd";
+specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
+
+add_proto qw/void av1_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params";
+
+add_proto qw/void av1_highbd_wiener_convolve_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, const ConvolveParams *conv_params, int bps";
+
+specialize qw/av1_wiener_convolve_add_src sse2 avx2 neon/;
+specialize qw/av1_highbd_wiener_convolve_add_src ssse3/;
+specialize qw/av1_highbd_wiener_convolve_add_src avx2/;
+
+
+# directional intra predictor functions
+add_proto qw/void av1_dr_prediction_z1/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z2/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, int dx, int dy";
+add_proto qw/void av1_dr_prediction_z3/, "uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy";
+
+# FILTER_INTRA predictor functions
+add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+specialize qw/av1_filter_intra_predictor sse4_1/;
+
+# High bitdepth functions
+
+#
+# Sub Pixel Filters
+#
+add_proto qw/void av1_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+add_proto qw/void av1_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+add_proto qw/void av1_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8/, "$sse2_x86_64";
+
+add_proto qw/void av1_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_horiz/, "$sse2_x86_64";
+
+add_proto qw/void av1_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+specialize qw/av1_highbd_convolve8_vert/, "$sse2_x86_64";
+
+#inv txfm
+add_proto qw/void av1_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_inv_txfm_add ssse3 avx2 neon/;
+
+add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x8/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x8 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_8x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_8x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_16x16/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_16x16 sse4_1/;
+add_proto qw/void av1_highbd_inv_txfm_add_32x32/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
+specialize qw/av1_highbd_inv_txfm_add_32x32 sse4_1 avx2/;
+
+add_proto qw/void av1_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+add_proto qw/void av1_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_4x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_4x4 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+specialize qw/av1_inv_txfm2d_add_8x8 sse4_1/;
+add_proto qw/void av1_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x64/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_64x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+add_proto qw/void av1_inv_txfm2d_add_4x16/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_16x4/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_8x32/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+add_proto qw/void av1_inv_txfm2d_add_32x8/, "const int32_t *input, uint16_t *output, int stride, TX_TYPE tx_type, int bd";
+
+# directional intra predictor functions
+add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd";
+add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd";
+
+# build compound seg mask functions
+add_proto qw/void av1_build_compound_diffwtd_mask/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w";
+specialize qw/av1_build_compound_diffwtd_mask sse4_1 avx2/;
+
+add_proto qw/void av1_build_compound_diffwtd_mask_highbd/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int src0_stride, const uint8_t *src1, int src1_stride, int h, int w, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
+
+add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 avx2 neon/;
+
+#
+# Encoder functions below this point.
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ # ENCODEMB INVOKE
+
+ # the transform coefficients are held in 32-bit
+ # values, so the assembler code for av1_block_error can no longer be used.
+ add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+ specialize qw/av1_block_error avx2/;
+
+ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp sse2 avx2/;
+
+ add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_32x32 avx2/;
+
+ add_proto qw/void av1_quantize_fp_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/av1_quantize_fp_64x64 avx2/;
+
+ # fdct functions
+
+ add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+ #fwd txfm
+ add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
+ specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2/;
+
+ add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_32x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_4x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_16x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_8x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_32x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_4x4 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_8x8 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_16x16 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_32x32 sse4_1/;
+
+ add_proto qw/void av1_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ specialize qw/av1_fwd_txfm2d_64x64 sse4_1/;
+ add_proto qw/void av1_fwd_txfm2d_32x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_64x32/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_16x64/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+ add_proto qw/void av1_fwd_txfm2d_64x16/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd";
+
+ #
+ # Motion search
+ #
+ add_proto qw/int av1_diamond_search_sad/, "struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
+
+ add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
+
+ add_proto qw/void av1_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+ specialize qw/av1_temporal_filter_apply sse2 msa/;
+
+ add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
+
+ # ENCODEMB INVOKE
+
+ add_proto qw/int64_t av1_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+ specialize qw/av1_highbd_block_error sse2/;
+
+ add_proto qw/void av1_highbd_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+
+ add_proto qw/void av1_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+ specialize qw/av1_highbd_quantize_fp sse4_1 avx2/;
+
+ add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+
+ # End av1_high encoder functions
+
+ # txb
+ add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts";
+ specialize qw/av1_get_nz_map_contexts sse2/;
+ add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels";
+ specialize qw/av1_txb_init_levels sse4_1 avx2/;
+
+ add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+ specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
+ add_proto qw/int av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+ specialize qw/av1_wedge_sign_from_residuals sse2 avx2/;
+ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+ specialize qw/av1_wedge_compute_delta_squares sse2 avx2/;
+
+ # hash
+ add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, int length";
+ specialize qw/av1_get_crc32c_value sse4_2/;
+
+ add_proto qw/void av1_compute_stats/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, double *M, double *H";
+ specialize qw/av1_compute_stats sse4_1 avx2/;
+
+ add_proto qw/int64_t av1_lowbd_pixel_proj_error/, " const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params";
+ specialize qw/av1_lowbd_pixel_proj_error sse4_1 avx2/;
+}
+# end encoder functions
+
+# Deringing Functions
+
+add_proto qw/int cdef_find_dir/, "const uint16_t *img, int stride, int32_t *var, int coeff_shift";
+add_proto qw/void cdef_filter_block/, "uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int bsize, int max, int coeff_shift";
+
+add_proto qw/void copy_rect8_8bit_to_16bit/, "uint16_t *dst, int dstride, const uint8_t *src, int sstride, int v, int h";
+add_proto qw/void copy_rect8_16bit_to_16bit/, "uint16_t *dst, int dstride, const uint16_t *src, int sstride, int v, int h";
+
+# VS compiling for 32 bit targets does not support vector types in
+# structs as arguments, which makes the v256 type of the intrinsics
+# hard to support, so optimizations for this target are disabled.
+if ($opts{config} !~ /libs-x86-win32-vs.*/) {
+ specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/cdef_filter_block sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+ specialize qw/copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/;
+}
+
+# WARPED_MOTION / GLOBAL_MOTION functions
+
+add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_warp_affine sse4_1 neon/;
+
+add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta";
+specialize qw/av1_highbd_warp_affine sse4_1/;
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/double compute_cross_correlation/, "unsigned char *im1, int stride1, int x1, int y1, unsigned char *im2, int stride2, int x2, int y2";
+ specialize qw/compute_cross_correlation sse4_1/;
+}
+
+# LOOP_RESTORATION functions
+
+add_proto qw/void apply_selfguided_restoration/, "const uint8_t *dat, int width, int height, int stride, int eps, const int *xqd, uint8_t *dst, int dst_stride, int32_t *tmpbuf, int bit_depth, int highbd";
+specialize qw/apply_selfguided_restoration sse4_1 avx2 neon/;
+
+add_proto qw/int av1_selfguided_restoration/, "const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth, int highbd";
+specialize qw/av1_selfguided_restoration sse4_1 avx2 neon/;
+
+# CONVOLVE_ROUND/COMPOUND_ROUND functions
+#
+# Every prototype in this group shares one parameter list: src/dst pointers
+# with their strides, w and h, separate x and y InterpFilterParams, the two
+# sub-pixel offsets and a ConvolveParams pointer. The av1_highbd_* variants
+# use uint16_t pixels and append a trailing `int bd`.
+
+add_proto qw/void av1_convolve_2d_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_2d_copy_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_x_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_convolve_y_sr/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_2d_copy/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_x/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_jnt_convolve_y/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params";
+add_proto qw/void av1_highbd_convolve_2d_copy_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_2d_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+add_proto qw/void av1_highbd_jnt_convolve_2d_copy/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+
+ # Scaled variants: besides the sub-pixel offsets they take a per-axis step
+ # (x_step_qn, y_step_qn).
+ # NOTE(review): the sub-pixel parameter suffixes are inconsistent here
+ # (subpel_x_qn next to subpel_y_q4 in the 8-bit prototype, _q4 for both in
+ # the high-bitdepth one). Parameter names in a prototype do not affect
+ # callers, but confirm which precision is intended.
+ add_proto qw/void av1_convolve_2d_scale/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_qn, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params";
+ add_proto qw/void av1_highbd_convolve_2d_scale/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams *filter_params_x, const InterpFilterParams *filter_params_y, const int subpel_x_q4, const int x_step_qn, const int subpel_y_q4, const int y_step_qn, ConvolveParams *conv_params, int bd";
+
+ # SIMD specializations for the convolve prototypes. Only the 8-bit
+ # non-scaled kernels list neon; the *_scale and av1_highbd_* kernels list
+ # x86 targets only.
+ specialize qw/av1_convolve_2d_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_2d_copy_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_x_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_y_sr sse2 avx2 neon/;
+ specialize qw/av1_convolve_2d_scale sse4_1/;
+ specialize qw/av1_jnt_convolve_2d ssse3 avx2 neon/;
+ specialize qw/av1_jnt_convolve_2d_copy sse2 avx2 neon/;
+ specialize qw/av1_jnt_convolve_x sse2 avx2 neon/;
+ specialize qw/av1_jnt_convolve_y sse2 avx2 neon/;
+ specialize qw/av1_highbd_convolve_2d_copy_sr sse2 avx2/;
+ specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_x_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_y_sr ssse3 avx2/;
+ specialize qw/av1_highbd_convolve_2d_scale sse4_1/;
+ specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_x sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_y sse4_1 avx2/;
+ specialize qw/av1_highbd_jnt_convolve_2d_copy sse4_1 avx2/;
+
+# INTRA_EDGE functions
+# 8-bit (uint8_t) and *_high (uint16_t) variants of the edge filter and edge
+# upsampler. All four are specialized for sse4_1 only; of the four, only the
+# high-bitdepth upsampler takes a `bd` argument.
+add_proto qw/void av1_filter_intra_edge/, "uint8_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge sse4_1/;
+add_proto qw/void av1_upsample_intra_edge/, "uint8_t *p, int sz";
+specialize qw/av1_upsample_intra_edge sse4_1/;
+
+add_proto qw/void av1_filter_intra_edge_high/, "uint16_t *p, int sz, int strength";
+specialize qw/av1_filter_intra_edge_high sse4_1/;
+add_proto qw/void av1_upsample_intra_edge_high/, "uint16_t *p, int sz, int bd";
+specialize qw/av1_upsample_intra_edge_high sse4_1/;
+
+# CFL
+# Each prototype below takes only a TX_SIZE and returns a cfl_*_fn value
+# (presumably a function pointer selected for that transform size -- confirm
+# against the cfl_*_fn typedefs). get_subtract_average_fn is the only one that
+# lists sse2 and vsx; the others list ssse3, avx2 and neon.
+add_proto qw/cfl_subtract_average_fn get_subtract_average_fn/, "TX_SIZE tx_size";
+specialize qw/get_subtract_average_fn sse2 avx2 neon vsx/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_lbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_420_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
+
+add_proto qw/cfl_predict_hbd_fn get_predict_hbd_fn/, "TX_SIZE tx_size";
+specialize qw/get_predict_hbd_fn ssse3 avx2 neon/;
+
+# The file ends by evaluating to a true value (presumably because it is
+# loaded with require/do by the RTCD generator -- confirm).
+1;
diff --git a/third_party/aom/av1/common/av1_txfm.c b/third_party/aom/av1/common/av1_txfm.c
new file mode 100644
index 0000000000..bb70eab703
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+
+// av1_cospi_arr_data[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+const int32_t av1_cospi_arr_data[7][64] = {
+ { 1024, 1024, 1023, 1021, 1019, 1016, 1013, 1009, 1004, 999, 993, 987, 980,
+ 972, 964, 955, 946, 936, 926, 915, 903, 891, 878, 865, 851, 837,
+ 822, 807, 792, 775, 759, 742, 724, 706, 688, 669, 650, 630, 610,
+ 590, 569, 548, 526, 505, 483, 460, 438, 415, 392, 369, 345, 321,
+ 297, 273, 249, 224, 200, 175, 150, 125, 100, 75, 50, 25 },
+ { 2048, 2047, 2046, 2042, 2038, 2033, 2026, 2018, 2009, 1998, 1987,
+ 1974, 1960, 1945, 1928, 1911, 1892, 1872, 1851, 1829, 1806, 1782,
+ 1757, 1730, 1703, 1674, 1645, 1615, 1583, 1551, 1517, 1483, 1448,
+ 1412, 1375, 1338, 1299, 1260, 1220, 1179, 1138, 1096, 1053, 1009,
+ 965, 921, 876, 830, 784, 737, 690, 642, 595, 546, 498,
+ 449, 400, 350, 301, 251, 201, 151, 100, 50 },
+ { 4096, 4095, 4091, 4085, 4076, 4065, 4052, 4036, 4017, 3996, 3973,
+ 3948, 3920, 3889, 3857, 3822, 3784, 3745, 3703, 3659, 3612, 3564,
+ 3513, 3461, 3406, 3349, 3290, 3229, 3166, 3102, 3035, 2967, 2896,
+ 2824, 2751, 2675, 2598, 2520, 2440, 2359, 2276, 2191, 2106, 2019,
+ 1931, 1842, 1751, 1660, 1567, 1474, 1380, 1285, 1189, 1092, 995,
+ 897, 799, 700, 601, 501, 401, 301, 201, 101 },
+ { 8192, 8190, 8182, 8170, 8153, 8130, 8103, 8071, 8035, 7993, 7946,
+ 7895, 7839, 7779, 7713, 7643, 7568, 7489, 7405, 7317, 7225, 7128,
+ 7027, 6921, 6811, 6698, 6580, 6458, 6333, 6203, 6070, 5933, 5793,
+ 5649, 5501, 5351, 5197, 5040, 4880, 4717, 4551, 4383, 4212, 4038,
+ 3862, 3683, 3503, 3320, 3135, 2948, 2760, 2570, 2378, 2185, 1990,
+ 1795, 1598, 1401, 1202, 1003, 803, 603, 402, 201 },
+ { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143, 16069, 15986, 15893,
+ 15791, 15679, 15557, 15426, 15286, 15137, 14978, 14811, 14635, 14449, 14256,
+ 14053, 13842, 13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866, 11585,
+ 11297, 11003, 10702, 10394, 10080, 9760, 9434, 9102, 8765, 8423, 8076,
+ 7723, 7366, 7005, 6639, 6270, 5897, 5520, 5139, 4756, 4370, 3981,
+ 3590, 3196, 2801, 2404, 2006, 1606, 1205, 804, 402 },
+ { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286, 32138, 31972, 31786,
+ 31581, 31357, 31114, 30853, 30572, 30274, 29957, 29622, 29269, 28899, 28511,
+ 28106, 27684, 27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732, 23170,
+ 22595, 22006, 21403, 20788, 20160, 19520, 18868, 18205, 17531, 16846, 16151,
+ 15447, 14733, 14010, 13279, 12540, 11793, 11039, 10279, 9512, 8740, 7962,
+ 7180, 6393, 5602, 4808, 4011, 3212, 2411, 1608, 804 },
+ { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571, 64277, 63944, 63572,
+ 63162, 62714, 62228, 61705, 61145, 60547, 59914, 59244, 58538, 57798, 57022,
+ 56212, 55368, 54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464, 46341,
+ 45190, 44011, 42806, 41576, 40320, 39040, 37736, 36410, 35062, 33692, 32303,
+ 30893, 29466, 28020, 26558, 25080, 23586, 22078, 20557, 19024, 17479, 15924,
+ 14359, 12785, 11204, 9616, 8022, 6424, 4821, 3216, 1608 }
+};
+
+// av1_sinpi_arr_data[i][j] = (int)round((sqrt(2) * sin(j*Pi/9) * 2 / 3) * (1
+// << (cos_bit_min + i))) modified so that elements j=1,2 sum to element j=4.
+// Row i is the row returned by sinpi_arr(cos_bit_min + i); the 7 rows cover
+// cos_bit_min (10) through cos_bit_max (16).
+const int32_t av1_sinpi_arr_data[7][5] = {
+ { 0, 330, 621, 836, 951 }, { 0, 660, 1241, 1672, 1901 },
+ { 0, 1321, 2482, 3344, 3803 }, { 0, 2642, 4964, 6689, 7606 },
+ { 0, 5283, 9929, 13377, 15212 }, { 0, 10566, 19858, 26755, 30424 },
+ { 0, 21133, 39716, 53510, 60849 }
+};
+
+// Scales every element of `arr` (length `size`) in place by a power of two.
+//   bit > 0 : rounding right shift by `bit`, via round_shift().
+//   bit < 0 : multiplication by 2^(-bit), clamped to [INT32_MIN, INT32_MAX].
+//   bit == 0: the array is left untouched.
+void av1_round_shift_array_c(int32_t *arr, int size, int bit) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int idx = 0; idx < size; ++idx) {
+      arr[idx] = round_shift(arr[idx], bit);
+    }
+    return;
+  }
+
+  // Negative `bit`: scale up in 64 bits, then saturate back to 32 bits.
+  for (int idx = 0; idx < size; ++idx) {
+    const int64_t scaled = ((int64_t)1 << (-bit)) * arr[idx];
+    arr[idx] = (int32_t)clamp64(scaled, INT32_MIN, INT32_MAX);
+  }
+}
+
+// 1-D transform kernel for each (size index, 1-D transform type) pair.
+// Rows hold the 4-, 8-, 16-, 32- and 64-point kernels in that order; columns
+// are indexed by a 1-D transform type in [0, TX_TYPES_1D).
+// TXFM_TYPE_INVALID marks combinations without a kernel: columns 1 and 2 of
+// the 32- and 64-point rows, and column 3 of the 64-point row.
+// NOTE(review): columns 1 and 2 are identical in every row, which presumably
+// means ADST and FLIPADST share a kernel -- confirm against the ordering of
+// the 1-D transform type enum.
+const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D] = {
+ { TXFM_TYPE_DCT4, TXFM_TYPE_ADST4, TXFM_TYPE_ADST4, TXFM_TYPE_IDENTITY4 },
+ { TXFM_TYPE_DCT8, TXFM_TYPE_ADST8, TXFM_TYPE_ADST8, TXFM_TYPE_IDENTITY8 },
+ { TXFM_TYPE_DCT16, TXFM_TYPE_ADST16, TXFM_TYPE_ADST16, TXFM_TYPE_IDENTITY16 },
+ { TXFM_TYPE_DCT32, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID,
+ TXFM_TYPE_IDENTITY32 },
+ { TXFM_TYPE_DCT64, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID, TXFM_TYPE_INVALID }
+};
+
+// Stage count of each 1-D transform kernel, indexed by TXFM_TYPE (one entry
+// per enumerator below TXFM_TYPES). The largest entry, 12 for
+// TXFM_TYPE_DCT64, equals MAX_TXFM_STAGE_NUM.
+const int8_t av1_txfm_stage_num_list[TXFM_TYPES] = {
+ 4, // TXFM_TYPE_DCT4
+ 6, // TXFM_TYPE_DCT8
+ 8, // TXFM_TYPE_DCT16
+ 10, // TXFM_TYPE_DCT32
+ 12, // TXFM_TYPE_DCT64
+ 7, // TXFM_TYPE_ADST4
+ 8, // TXFM_TYPE_ADST8
+ 10, // TXFM_TYPE_ADST16
+ 1, // TXFM_TYPE_IDENTITY4
+ 1, // TXFM_TYPE_IDENTITY8
+ 1, // TXFM_TYPE_IDENTITY16
+ 1, // TXFM_TYPE_IDENTITY32
+};
+
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+// Writes the `size` entries of `values` to stderr as "[v0, v1, ...]" with no
+// trailing newline.
+static void print_range_check_array(const int32_t *values, int32_t size) {
+  fprintf(stderr, "[");
+  for (int32_t j = 0; j < size; j++) {
+    if (j > 0) fprintf(stderr, ", ");
+    fprintf(stderr, "%" PRId32, values[j]);
+  }
+  fprintf(stderr, "]");
+}
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+
+// Debug aid: verifies that every one of the `size` entries of `buf` fits in a
+// signed `bit`-bit integer, i.e. lies in [-2^(bit-1), 2^(bit-1) - 1].
+//
+// When CONFIG_COEFFICIENT_RANGE_CHECKING is enabled and an entry is out of
+// range, the stage number, the allowed range and the full contents of both
+// `input` and `buf` are dumped to stderr, after which assert() fires.
+// When the option is disabled the function does nothing.
+//
+//   stage  stage number, only reported in the diagnostic output
+//   input  array printed as "coeffs" in the diagnostic output (`size` entries)
+//   buf    array that is actually range-checked (`size` entries)
+//   size   number of entries in `input` and `buf`
+//   bit    permitted signed bit width
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+                         const int32_t *buf, int32_t size, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t max_value = (1LL << (bit - 1)) - 1;
+  const int64_t min_value = -(1LL << (bit - 1));
+
+  int in_range = 1;
+
+  for (int32_t i = 0; i < size; ++i) {
+    if (buf[i] < min_value || buf[i] > max_value) {
+      in_range = 0;
+      break;  // One offender is enough; both arrays are dumped in full below.
+    }
+  }
+
+  if (!in_range) {
+    // int32_t values are printed with PRId32 so the format matches the
+    // argument type on every platform, as is already done with PRId64 below.
+    fprintf(stderr, "Error: coeffs contain out-of-range values\n");
+    fprintf(stderr, "size: %" PRId32 "\n", size);
+    fprintf(stderr, "stage: %" PRId32 "\n", stage);
+    fprintf(stderr, "allowed range: [%" PRId64 ";%" PRId64 "]\n", min_value,
+            max_value);
+
+    fprintf(stderr, "coeffs: ");
+    print_range_check_array(input, size);
+    fprintf(stderr, "\n");
+
+    fprintf(stderr, " buf: ");
+    print_range_check_array(buf, size);
+    fprintf(stderr, "\n\n");
+  }
+
+  assert(in_range);
+#else
+  (void)stage;
+  (void)input;
+  (void)buf;
+  (void)size;
+  (void)bit;
+#endif
+}
diff --git a/third_party/aom/av1/common/av1_txfm.h b/third_party/aom/av1/common/av1_txfm.h
new file mode 100644
index 0000000000..59d64ca4af
--- /dev/null
+++ b/third_party/aom/av1/common/av1_txfm.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_AV1_TXFM_H_
+#define AOM_AV1_COMMON_AV1_TXFM_H_
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/blockd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// DO_RANGE_CHECK_CLAMP defaults to 0 unless it was defined earlier; when it
+// is non-zero, range_check_value() clamps its argument instead of returning
+// it unchanged.
+#if !defined(DO_RANGE_CHECK_CLAMP)
+#define DO_RANGE_CHECK_CLAMP 0
+#endif
+
+// Trigonometric tables; accessed one row at a time through cospi_arr() and
+// sinpi_arr(). The 7 rows match the range cos_bit_min..cos_bit_max.
+extern const int32_t av1_cospi_arr_data[7][64];
+extern const int32_t av1_sinpi_arr_data[7][5];
+
+// Capacity of the per-stage range arrays in TXFM_2D_FLIP_CFG.
+#define MAX_TXFM_STAGE_NUM 12
+
+// Smallest and largest precision for which the tables above have a row.
+static const int cos_bit_min = 10;
+static const int cos_bit_max = 16;
+
+// Number of fractional bits in NewSqrt2 and NewInvSqrt2.
+#define NewSqrt2Bits ((int32_t)12)
+// 2^12 * sqrt(2)
+static const int32_t NewSqrt2 = 5793;
+// 2^12 / sqrt(2)
+static const int32_t NewInvSqrt2 = 2896;
+
+// Returns the row of av1_cospi_arr_data for precision `n`; row 0 corresponds
+// to n == cos_bit_min. `n` is not range-checked.
+static INLINE const int32_t *cospi_arr(int n) {
+  const int row = n - cos_bit_min;
+  return av1_cospi_arr_data[row];
+}
+
+// Returns the row of av1_sinpi_arr_data for precision `n`; row 0 corresponds
+// to n == cos_bit_min. `n` is not range-checked.
+static INLINE const int32_t *sinpi_arr(int n) {
+  const int row = n - cos_bit_min;
+  return av1_sinpi_arr_data[row];
+}
+
+// Checks and/or clamps `value` against the signed `bit`-bit range.
+//  - With CONFIG_COEFFICIENT_RANGE_CHECKING, a value outside
+//    [-2^(bit-1), 2^(bit-1) - 1] is reported on stderr and trips assert(0).
+//  - With DO_RANGE_CHECK_CLAMP, the value is clamped to that range (using at
+//    most 31 bits) and the clamped value is returned.
+//  - Otherwise `value` is returned unchanged.
+static INLINE int32_t range_check_value(int32_t value, int8_t bit) {
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  const int64_t half_range = 1LL << (bit - 1);
+  if (value < -half_range || value > half_range - 1) {
+    fprintf(stderr, "coeff out of bit range, value: %d bit %d\n", value, bit);
+    assert(0);
+  }
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+#if DO_RANGE_CHECK_CLAMP
+  const int clamp_bit = AOMMIN(bit, 31);
+  const int limit = 1 << (clamp_bit - 1);
+  return clamp(value, -limit, limit - 1);
+#else
+  (void)bit;
+  return value;
+#endif  // DO_RANGE_CHECK_CLAMP
+}
+
+// Returns (value + 2^(bit-1)) >> bit, truncated to int32_t, i.e. `value`
+// divided by 2^bit with half added before the shift. Requires bit >= 1
+// (asserted).
+static INLINE int32_t round_shift(int64_t value, int bit) {
+  assert(bit >= 1);
+  const int64_t rounding = 1ll << (bit - 1);
+  const int64_t shifted = (value + rounding) >> bit;
+  return (int32_t)shifted;
+}
+
+// Half butterfly: returns (w0 * in0 + w1 * in1) with 2^(bit-1) added for
+// rounding and then shifted right by `bit`.
+// Note that each product is formed in int32_t arithmetic and only its result
+// is widened to int64_t; the (int64_t) casts do not widen the
+// multiplications themselves. The NOTE below explains why the final result
+// is still expected to be correct.
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+ int bit) {
+ int64_t result_64 = (int64_t)(w0 * in0) + (int64_t)(w1 * in1);
+ int64_t intermediate = result_64 + (1LL << (bit - 1));
+ // NOTE(david.barker): The value 'result_64' may not necessarily fit
+ // into 32 bits. However, the result of this function is nominally
+ // ROUND_POWER_OF_TWO_64(result_64, bit)
+ // and that is required to fit into stage_range[stage] many bits
+ // (checked by range_check_buf()).
+ //
+ // Here we've unpacked that rounding operation, and it can be shown
+ // that the value of 'intermediate' here *does* fit into 32 bits
+ // for any conformant bitstream.
+ // The upshot is that, if you do all this calculation using
+ // wrapping 32-bit arithmetic instead of (non-wrapping) 64-bit arithmetic,
+ // then you'll still get the correct result.
+ // To provide a check on this logic, we assert that 'intermediate'
+ // would fit into an int32 if range checking is enabled.
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(intermediate >= INT32_MIN && intermediate <= INT32_MAX);
+#endif
+ return (int32_t)(intermediate >> bit);
+}
+
+// Adds `trans` (converted to int) to the pixel `dest` and passes the sum
+// through clip_pixel_highbd() for bit depth `bd`.
+static INLINE uint16_t highbd_clip_pixel_add(uint16_t dest, tran_high_t trans,
+                                             int bd) {
+  const int sum = dest + (int)trans;
+  return clip_pixel_highbd(sum, bd);
+}
+
+// Signature of a transform kernel that reads `input` and writes `output`,
+// given a cosine precision `cos_bit` and a `stage_range` array.
+// NOTE(review): presumably a 1-D kernel with one stage_range entry per stage
+// -- confirm against the kernel implementations.
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);
+
+// Signature of a 2-D forward transform: int16_t `input` with a `stride`,
+// int32_t `output`, the 2-D transform type and the bit depth `bd`.
+typedef void (*FwdTxfm2dFunc)(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd);
+
+// Identifies a 1-D transform kernel by family (DCT, ADST, IDENTITY) and
+// length. TXFM_TYPES is the number of valid kernels and sizes
+// av1_txfm_stage_num_list; TXFM_TYPE_INVALID comes after it and therefore is
+// not a valid index into arrays of TXFM_TYPES entries.
+typedef enum TXFM_TYPE {
+ TXFM_TYPE_DCT4,
+ TXFM_TYPE_DCT8,
+ TXFM_TYPE_DCT16,
+ TXFM_TYPE_DCT32,
+ TXFM_TYPE_DCT64,
+ TXFM_TYPE_ADST4,
+ TXFM_TYPE_ADST8,
+ TXFM_TYPE_ADST16,
+ TXFM_TYPE_IDENTITY4,
+ TXFM_TYPE_IDENTITY8,
+ TXFM_TYPE_IDENTITY16,
+ TXFM_TYPE_IDENTITY32,
+ TXFM_TYPES,
+ TXFM_TYPE_INVALID,
+} TXFM_TYPE;
+
+// Configuration of one 2-D transform: the column and row 1-D kernels together
+// with the flips, shifts, cosine precisions and per-stage ranges that go with
+// them.
+// NOTE(review): apart from the flip flags (filled by set_flip_cfg()), field
+// meanings are inferred from the names; confirm against
+// av1_get_fwd_txfm_cfg() / av1_get_inv_txfm_cfg().
+typedef struct TXFM_2D_FLIP_CFG {
+ TX_SIZE tx_size;
+ int ud_flip; // flip upside down
+ int lr_flip; // flip left to right
+ const int8_t *shift;
+ int8_t cos_bit_col;
+ int8_t cos_bit_row;
+ // One slot per stage, up to MAX_TXFM_STAGE_NUM stages per direction.
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ TXFM_TYPE txfm_type_col;
+ TXFM_TYPE txfm_type_row;
+ int stage_num_col;
+ int stage_num_row;
+} TXFM_2D_FLIP_CFG;
+
+// Maps a 2-D transform type to its flip flags.
+//   *ud_flip is set to 1 for FLIPADST_DCT, FLIPADST_ADST, V_FLIPADST and
+//   FLIPADST_FLIPADST, otherwise 0.
+//   *lr_flip is set to 1 for DCT_FLIPADST, ADST_FLIPADST, H_FLIPADST and
+//   FLIPADST_FLIPADST, otherwise 0.
+// An unrecognised tx_type trips assert(0); both flags are 0 in that case.
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
+  int flip_ud = 0;
+  int flip_lr = 0;
+
+  switch (tx_type) {
+    // Types without any flip.
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST: break;
+    // Vertical flip only.
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST: flip_ud = 1; break;
+    // Horizontal flip only.
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST: flip_lr = 1; break;
+    // Both flips.
+    case FLIPADST_FLIPADST:
+      flip_ud = 1;
+      flip_lr = 1;
+      break;
+    default: assert(0); break;
+  }
+
+  *ud_flip = flip_ud;
+  *lr_flip = flip_lr;
+}
+
+// Fills cfg->ud_flip and cfg->lr_flip from tx_type via get_flip_cfg().
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+  int *const ud_flip = &cfg->ud_flip;
+  int *const lr_flip = &cfg->lr_flip;
+  get_flip_cfg(tx_type, ud_flip, lr_flip);
+}
+
+// Returns log2(col / row) for the supported aspect ratios: 0 for square,
+// +1 / +2 when col is 2x / 4x row, and -1 / -2 when row is 2x / 4x col.
+// Any other combination trips an assert and yields 0.
+static INLINE int get_rect_tx_log_ratio(int col, int row) {
+  if (col == row) return 0;
+
+  const int col_is_longer = col > row;
+  const int longer = col_is_longer ? col : row;
+  const int shorter = col_is_longer ? row : col;
+
+  if (longer == shorter * 2) return col_is_longer ? 1 : -1;
+  if (longer == shorter * 4) return col_is_longer ? 2 : -2;
+
+  assert(0 && "Unsupported transform size");
+  return 0;  // Invalid
+}
+
+// Stage-range generators for the forward and inverse 2-D transforms. Both
+// write through stage_range_col / stage_range_row and read `cfg`.
+// NOTE(review): defined outside this header; the exact contents written are
+// not visible here.
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd);
+
+void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
+ int bd);
+
+// Fill `cfg` for the given transform type and size (forward / inverse).
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+void av1_get_inv_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+ TXFM_2D_FLIP_CFG *cfg);
+// Lookup tables defined in av1_txfm.c.
+extern const TXFM_TYPE av1_txfm_type_ls[5][TX_TYPES_1D];
+extern const int8_t av1_txfm_stage_num_list[TXFM_TYPES];
+// Returns the log2 width of tx_size relative to the log2 width of transform
+// size 0, i.e. a zero-based width index.
+static INLINE int get_txw_idx(TX_SIZE tx_size) {
+  const int log2_width = tx_size_wide_log2[tx_size];
+  const int log2_base = tx_size_wide_log2[0];
+  return log2_width - log2_base;
+}
+// Returns the log2 height of tx_size relative to the log2 height of transform
+// size 0, i.e. a zero-based height index.
+static INLINE int get_txh_idx(TX_SIZE tx_size) {
+  const int log2_height = tx_size_high_log2[tx_size];
+  const int log2_base = tx_size_high_log2[0];
+  return log2_height - log2_base;
+}
+
+// Range check of `buf` against a signed `bit`-bit range; see the definition
+// in av1_txfm.c. It has no effect unless CONFIG_COEFFICIENT_RANGE_CHECKING is
+// enabled.
+void av1_range_check_buf(int32_t stage, const int32_t *input,
+ const int32_t *buf, int32_t size, int8_t bit);
+// Number of distinct width/height indices; matches the first dimension of
+// av1_txfm_type_ls.
+#define MAX_TXWH_IDX 5
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // AOM_AV1_COMMON_AV1_TXFM_H_
diff --git a/third_party/aom/av1/common/blockd.c b/third_party/aom/av1/common/blockd.c
new file mode 100644
index 0000000000..2e796b6560
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/system_state.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+// Returns the prediction mode of the left neighbour, or DC_PRED when there is
+// no left neighbour (left_mi is NULL). Asserts that an existing neighbour is
+// either not an inter block or is an intra-block-copy block.
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi) {
+  if (left_mi) {
+    assert(!is_inter_block(left_mi) || is_intrabc_block(left_mi));
+    return left_mi->mode;
+  }
+  return DC_PRED;
+}
+
+// Returns the prediction mode of the above neighbour, or DC_PRED when there
+// is no above neighbour (above_mi is NULL). Asserts that an existing
+// neighbour is either not an inter block or is an intra-block-copy block.
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi) {
+  if (above_mi) {
+    assert(!is_inter_block(above_mi) || is_intrabc_block(above_mi));
+    return above_mi->mode;
+  }
+  return DC_PRED;
+}
+
+// Stores `has_eob` into the above and left entropy-context arrays of `pd` for
+// one transform block, starting at offsets `aoff` and `loff` and covering
+// tx_size_wide_unit[tx_size] / tx_size_high_unit[tx_size] entries.
+//
+// When has_eob is non-zero and xd->mb_to_right_edge (resp.
+// xd->mb_to_bottom_edge) is negative, only the entries up to
+// max_block_wide() (resp. max_block_high()) receive has_eob; the remaining
+// entries of the transform block are zeroed.
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+                      int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      int has_eob, int aoff, int loff) {
+  ENTROPY_CONTEXT *const above_ctx = pd->above_context + aoff;
+  ENTROPY_CONTEXT *const left_ctx = pd->left_context + loff;
+  const int tx_units_wide = tx_size_wide_unit[tx_size];
+  const int tx_units_high = tx_size_high_unit[tx_size];
+
+  // Above context.
+  if (!has_eob || xd->mb_to_right_edge >= 0) {
+    memset(above_ctx, has_eob, sizeof(*above_ctx) * tx_units_wide);
+  } else {
+    const int max_wide = max_block_wide(xd, plane_bsize, plane);
+    const int kept = AOMMIN(tx_units_wide, max_wide - aoff);
+    memset(above_ctx, has_eob, sizeof(*above_ctx) * kept);
+    memset(above_ctx + kept, 0, sizeof(*above_ctx) * (tx_units_wide - kept));
+  }
+
+  // Left context.
+  if (!has_eob || xd->mb_to_bottom_edge >= 0) {
+    memset(left_ctx, has_eob, sizeof(*left_ctx) * tx_units_high);
+  } else {
+    const int max_high = max_block_high(xd, plane_bsize, plane);
+    const int kept = AOMMIN(tx_units_high, max_high - loff);
+    memset(left_ctx, has_eob, sizeof(*left_ctx) * kept);
+    memset(left_ctx + kept, 0, sizeof(*left_ctx) * (tx_units_high - kept));
+  }
+}
+// Zeroes the above and left entropy contexts of the block `bsize` at
+// (mi_row, mi_col). Plane 0 is always reset; the remaining planes up to
+// num_planes are reset according to is_chroma_reference() evaluated with the
+// subsampling of plane 1.
+void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, const int num_planes) {
+  const int chroma_ref =
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y);
+  const int planes_to_reset = 1 + (num_planes - 1) * chroma_ref;
+
+  for (int plane = 0; plane < planes_to_reset; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    // Block dimensions expressed in units of the smallest transform size.
+    const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    memset(pd->above_context, 0, sizeof(ENTROPY_CONTEXT) * units_wide);
+    memset(pd->left_context, 0, sizeof(ENTROPY_CONTEXT) * units_high);
+  }
+}
+
+// Clears xd->delta_lf_from_base and the leading xd->delta_lf[] entries:
+// FRAME_LF_COUNT of them when num_planes > 1, otherwise FRAME_LF_COUNT - 2.
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes) {
+  xd->delta_lf_from_base = 0;
+
+  int lf_count = FRAME_LF_COUNT;
+  if (num_planes <= 1) lf_count -= 2;
+
+  for (int idx = 0; idx < lf_count; ++idx) {
+    xd->delta_lf[idx] = 0;
+  }
+}
+
+// Resets the Wiener and self-guided-projection info of the first num_planes
+// planes of `xd` to their defaults.
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    set_default_wiener(&xd->wiener_info[plane]);
+    set_default_sgrproj(&xd->sgrproj_info[plane]);
+  }
+}
+
+// Initialises the per-plane type and subsampling in `xd`.
+// Planes below num_planes get their plane type from get_plane_type(); plane 0
+// is never subsampled, the other planes use (ss_x, ss_y).
+// Planes from num_planes up to MAX_MB_PLANE get subsampling (1, 1) and their
+// plane_type is left untouched.
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+                            const int num_planes) {
+  int plane = 0;
+
+  for (; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->plane_type = get_plane_type(plane);
+    pd->subsampling_x = plane ? ss_x : 0;
+    pd->subsampling_y = plane ? ss_y : 0;
+  }
+
+  for (plane = num_planes; plane < MAX_MB_PLANE; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->subsampling_x = 1;
+    pd->subsampling_y = 1;
+  }
+}
+
+// Lookup table with 90 entries. The non-zero entries sit exactly at the index
+// given in the trailing "Approx angle" comment of each row (3, 6, 9, 14, ...,
+// 87); every other index holds 0.
+const int16_t dr_intra_derivative[90] = {
+ // More evenly spread out angles and limited to 10-bit
+ // Values that are 0 will never be used
+ // Approx angle
+ 0, 0, 0, //
+ 1023, 0, 0, // 3, ...
+ 547, 0, 0, // 6, ...
+ 372, 0, 0, 0, 0, // 9, ...
+ 273, 0, 0, // 14, ...
+ 215, 0, 0, // 17, ...
+ 178, 0, 0, // 20, ...
+ 151, 0, 0, // 23, ... (113 & 203 are base angles)
+ 132, 0, 0, // 26, ...
+ 116, 0, 0, // 29, ...
+ 102, 0, 0, 0, // 32, ...
+ 90, 0, 0, // 36, ...
+ 80, 0, 0, // 39, ...
+ 71, 0, 0, // 42, ...
+ 64, 0, 0, // 45, ... (45 & 135 are base angles)
+ 57, 0, 0, // 48, ...
+ 51, 0, 0, // 51, ...
+ 45, 0, 0, 0, // 54, ...
+ 40, 0, 0, // 58, ...
+ 35, 0, 0, // 61, ...
+ 31, 0, 0, // 64, ...
+ 27, 0, 0, // 67, ... (67 & 157 are base angles)
+ 23, 0, 0, // 70, ...
+ 19, 0, 0, // 73, ...
+ 15, 0, 0, 0, 0, // 76, ...
+ 11, 0, 0, // 81, ...
+ 7, 0, 0, // 84, ...
+ 3, 0, 0, // 87, ...
+};
diff --git a/third_party/aom/av1/common/blockd.h b/third_party/aom/av1/common/blockd.h
new file mode 100644
index 0000000000..a2311c1b00
--- /dev/null
+++ b/third_party/aom/av1/common/blockd.h
@@ -0,0 +1,1176 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_BLOCKD_H_
+#define AOM_AV1_COMMON_BLOCKD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#include "av1/common/common_data.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mv.h"
+#include "av1/common/scale.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// NOTE(review): not referenced in the visible part of this header; purpose to
+// be confirmed at its point of use.
+#define USE_B_QUANT_NO_TRELLIS 1
+
+// Upper bound on the number of planes; sizes per-plane arrays such as those
+// in BUFFER_SET.
+#define MAX_MB_PLANE 3
+
+#define MAX_DIFFWTD_MASK_BITS 1
+
+// DIFFWTD_MASK_TYPES should not surpass 1 << MAX_DIFFWTD_MASK_BITS
+// (currently 2 mask types and 1 << 1 == 2, so the limit is met exactly).
+typedef enum ATTRIBUTE_PACKED {
+ DIFFWTD_38 = 0,
+ DIFFWTD_38_INV,
+ DIFFWTD_MASK_TYPES,
+} DIFFWTD_MASK_TYPE;
+
+// Frame types with explicit values 0..3; FRAME_TYPES (4) is the count of
+// valid types rather than a type itself.
+typedef enum ATTRIBUTE_PACKED {
+ KEY_FRAME = 0,
+ INTER_FRAME = 1,
+ INTRA_ONLY_FRAME = 2, // replaces intra-only
+ S_FRAME = 3,
+ FRAME_TYPES,
+} FRAME_TYPE;
+
+// Returns 1 when both dimensions of `bsize` are at least 8 (the smaller of
+// the two decides), 0 otherwise.
+static INLINE int is_comp_ref_allowed(BLOCK_SIZE bsize) {
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  return width >= 8 && height >= 8;
+}
+
+// Returns 1 when `mode` lies in [INTER_MODE_START, INTER_MODE_END), else 0.
+static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+  if (mode < INTER_MODE_START) return 0;
+  return mode < INTER_MODE_END;
+}
+
+// One buffer pointer and one stride per plane, for up to MAX_MB_PLANE planes.
+typedef struct {
+ uint8_t *plane[MAX_MB_PLANE];
+ int stride[MAX_MB_PLANE];
+} BUFFER_SET;
+
+// Returns 1 when `mode` lies in
+// [SINGLE_INTER_MODE_START, SINGLE_INTER_MODE_END), else 0.
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+  if (mode < SINGLE_INTER_MODE_START) return 0;
+  return mode < SINGLE_INTER_MODE_END;
+}
+// Returns 1 when `mode` lies in
+// [COMP_INTER_MODE_START, COMP_INTER_MODE_END), else 0.
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+  if (mode < COMP_INTER_MODE_START) return 0;
+  return mode < COMP_INTER_MODE_END;
+}
+
+// Returns the single-reference mode used for the first reference of the
+// compound inter mode `mode` (e.g. NEAREST_NEWMV -> NEARESTMV).
+// `mode` must be a compound inter mode (asserted). The table is indexed by
+// PREDICTION_MODE; entries for non-compound modes hold MB_MODE_COUNT.
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+  // Read-only lookup table: const-qualified so it cannot be modified and does
+  // not need writable storage in every translation unit that includes this
+  // header.
+  static const PREDICTION_MODE lut[] = {
+    MB_MODE_COUNT,  // DC_PRED
+    MB_MODE_COUNT,  // V_PRED
+    MB_MODE_COUNT,  // H_PRED
+    MB_MODE_COUNT,  // D45_PRED
+    MB_MODE_COUNT,  // D135_PRED
+    MB_MODE_COUNT,  // D113_PRED
+    MB_MODE_COUNT,  // D157_PRED
+    MB_MODE_COUNT,  // D203_PRED
+    MB_MODE_COUNT,  // D67_PRED
+    MB_MODE_COUNT,  // SMOOTH_PRED
+    MB_MODE_COUNT,  // SMOOTH_V_PRED
+    MB_MODE_COUNT,  // SMOOTH_H_PRED
+    MB_MODE_COUNT,  // PAETH_PRED
+    MB_MODE_COUNT,  // NEARESTMV
+    MB_MODE_COUNT,  // NEARMV
+    MB_MODE_COUNT,  // GLOBALMV
+    MB_MODE_COUNT,  // NEWMV
+    NEARESTMV,      // NEAREST_NEARESTMV
+    NEARMV,         // NEAR_NEARMV
+    NEARESTMV,      // NEAREST_NEWMV
+    NEWMV,          // NEW_NEARESTMV
+    NEARMV,         // NEAR_NEWMV
+    NEWMV,          // NEW_NEARMV
+    GLOBALMV,       // GLOBAL_GLOBALMV
+    NEWMV,          // NEW_NEWMV
+  };
+  // The table must have exactly one entry per prediction mode.
+  assert(NELEMENTS(lut) == MB_MODE_COUNT);
+  assert(is_inter_compound_mode(mode));
+  return lut[mode];
+}
+
+// Returns the single-reference mode used for the second reference of the
+// compound inter mode `mode` (e.g. NEAREST_NEWMV -> NEWMV).
+// `mode` must be a compound inter mode (asserted). The table is indexed by
+// PREDICTION_MODE; entries for non-compound modes hold MB_MODE_COUNT.
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+  // Read-only lookup table: const-qualified so it cannot be modified and does
+  // not need writable storage in every translation unit that includes this
+  // header.
+  static const PREDICTION_MODE lut[] = {
+    MB_MODE_COUNT,  // DC_PRED
+    MB_MODE_COUNT,  // V_PRED
+    MB_MODE_COUNT,  // H_PRED
+    MB_MODE_COUNT,  // D45_PRED
+    MB_MODE_COUNT,  // D135_PRED
+    MB_MODE_COUNT,  // D113_PRED
+    MB_MODE_COUNT,  // D157_PRED
+    MB_MODE_COUNT,  // D203_PRED
+    MB_MODE_COUNT,  // D67_PRED
+    MB_MODE_COUNT,  // SMOOTH_PRED
+    MB_MODE_COUNT,  // SMOOTH_V_PRED
+    MB_MODE_COUNT,  // SMOOTH_H_PRED
+    MB_MODE_COUNT,  // PAETH_PRED
+    MB_MODE_COUNT,  // NEARESTMV
+    MB_MODE_COUNT,  // NEARMV
+    MB_MODE_COUNT,  // GLOBALMV
+    MB_MODE_COUNT,  // NEWMV
+    NEARESTMV,      // NEAREST_NEARESTMV
+    NEARMV,         // NEAR_NEARMV
+    NEWMV,          // NEAREST_NEWMV
+    NEARESTMV,      // NEW_NEARESTMV
+    NEWMV,          // NEAR_NEWMV
+    NEARMV,         // NEW_NEARMV
+    GLOBALMV,       // GLOBAL_GLOBALMV
+    NEWMV,          // NEW_NEWMV
+  };
+  // The table must have exactly one entry per prediction mode.
+  assert(NELEMENTS(lut) == MB_MODE_COUNT);
+  assert(is_inter_compound_mode(mode));
+  return lut[mode];
+}
+
+// Returns 1 when `mode` is NEARMV or one of the compound modes with a NEAR
+// component (NEAR_NEARMV, NEAR_NEWMV, NEW_NEARMV), else 0.
+static INLINE int have_nearmv_in_inter_mode(PREDICTION_MODE mode) {
+  const int single_near = (mode == NEARMV);
+  const int compound_near =
+      (mode == NEAR_NEARMV) || (mode == NEAR_NEWMV) || (mode == NEW_NEARMV);
+  return single_near || compound_near;
+}
+
+// Returns 1 when `mode` is NEWMV or one of the compound modes with a NEW
+// component (NEW_NEWMV, NEAREST_NEWMV, NEW_NEARESTMV, NEAR_NEWMV,
+// NEW_NEARMV), else 0.
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+  const int single_new = (mode == NEWMV);
+  const int compound_new = (mode == NEW_NEWMV) || (mode == NEAREST_NEWMV) ||
+                           (mode == NEW_NEARESTMV) || (mode == NEAR_NEWMV) ||
+                           (mode == NEW_NEARMV);
+  return single_new || compound_new;
+}
+
+// Returns 1 for the compound types COMPOUND_WEDGE and COMPOUND_DIFFWTD,
+// 0 for every other type.
+static INLINE int is_masked_compound_type(COMPOUND_TYPE type) {
+  if (type == COMPOUND_WEDGE) return 1;
+  return type == COMPOUND_DIFFWTD;
+}
+
+/* For keyframes, intra block modes are predicted by the (already decoded)
+ modes for the Y blocks to the left and above us; for interframes, there
+ is a single probability table. */
+
+typedef int8_t MV_REFERENCE_FRAME;
+
+// Palette description of a block: the number of base colors per plane group
+// and the color values themselves.
+typedef struct {
+ // Number of base colors for Y (0) and UV (1)
+ uint8_t palette_size[2];
+ // Value of base colors for Y, U, and V
+ // (room for PALETTE_MAX_SIZE colors for each of the three planes)
+ uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+} PALETTE_MODE_INFO;
+
+// Filter-intra selection for a block: an on/off flag and the chosen mode.
+// NOTE(review): filter_intra_mode is presumably only meaningful when
+// use_filter_intra is non-zero -- confirm at the points of use.
+typedef struct {
+ uint8_t use_filter_intra;
+ FILTER_INTRA_MODE filter_intra_mode;
+} FILTER_INTRA_MODE_INFO;
+
+// Maps each filter-intra mode (index in [0, FILTER_INTRA_MODES)) to an intra
+// prediction mode. The first and the last filter-intra mode both map to
+// DC_PRED.
+static const PREDICTION_MODE fimode_to_intradir[FILTER_INTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, D157_PRED, DC_PRED
+};
+
+// Side length of the per-plane coefficient-cost map kept in RD_STATS; only
+// defined in CONFIG_RD_DEBUG builds.
+#if CONFIG_RD_DEBUG
+#define TXB_COEFF_COST_MAP_SIZE (MAX_MIB_SIZE)
+#endif
+
+// Rate/distortion statistics record. The txb_coeff_cost* members exist only
+// in CONFIG_RD_DEBUG builds.
+typedef struct RD_STATS {
+ int rate;
+ int64_t dist;
+ // Please be careful when using rdcost; it is not guaranteed to be set all
+ // the time.
+ // TODO(angiebird): Create a set of functions to manipulate the RD_STATS. In
+ // these functions, make sure rdcost is always up-to-date according to
+ // rate/dist.
+ int64_t rdcost;
+ int64_t sse;
+ int skip; // sse should be equal to dist when skip == 1
+ int64_t ref_rdcost;
+ int zero_rate;
+ uint8_t invalid_rate;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost[MAX_MB_PLANE];
+ int txb_coeff_cost_map[MAX_MB_PLANE][TXB_COEFF_COST_MAP_SIZE]
+ [TXB_COEFF_COST_MAP_SIZE];
+#endif // CONFIG_RD_DEBUG
+} RD_STATS;
+
+// This struct is used to group function args that are commonly
+// sent together in functions related to interinter compound modes
+// NOTE(review): wedge_index/wedge_sign presumably apply to COMPOUND_WEDGE and
+// mask_type/seg_mask to COMPOUND_DIFFWTD (the two types accepted by
+// is_masked_compound_type()) -- confirm at the points of use.
+typedef struct {
+ int wedge_index;
+ int wedge_sign;
+ DIFFWTD_MASK_TYPE mask_type;
+ uint8_t *seg_mask;
+ COMPOUND_TYPE type;
+} INTERINTER_COMPOUND_DATA;
+
+// Capacities of the inter_tx_size[] and txk_type[] arrays in MB_MODE_INFO.
+#define INTER_TX_SIZE_BUF_LEN 16
+#define TXK_TYPE_BUF_LEN 64
+// Per-block mode information: block size, prediction mode, transform sizes
+// and types, reference frames and motion vectors, plus tool-specific state
+// (palette, filter-intra, inter-intra, compound, CfL, loop-filter deltas).
+// This structure now relates to 4x4 block regions.
+typedef struct MB_MODE_INFO {
+ // Common for both INTER and INTRA blocks
+ BLOCK_SIZE sb_type;
+ PREDICTION_MODE mode;
+ TX_SIZE tx_size;
+ uint8_t inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ int8_t skip;
+ int8_t skip_mode;
+ int8_t segment_id;
+ int8_t seg_id_predicted; // valid only when temporal_update is enabled
+
+ // Only for INTRA blocks
+ UV_PREDICTION_MODE uv_mode;
+
+ PALETTE_MODE_INFO palette_mode_info;
+ // Read by is_intrabc_block().
+ uint8_t use_intrabc;
+
+ // Only for INTER blocks
+ InterpFilters interp_filters;
+ MV_REFERENCE_FRAME ref_frame[2];
+
+ TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+
+ FILTER_INTRA_MODE_INFO filter_intra_mode_info;
+
+ // The actual prediction angle is the base angle + (angle_delta * step).
+ int8_t angle_delta[PLANE_TYPES];
+
+ // interintra members
+ INTERINTRA_MODE interintra_mode;
+ // TODO(debargha): Consolidate these flags
+ int use_wedge_interintra;
+ int interintra_wedge_index;
+ int interintra_wedge_sign;
+ // interinter members
+ INTERINTER_COMPOUND_DATA interinter_comp;
+ MOTION_MODE motion_mode;
+ int overlappable_neighbors[2];
+ // NOTE(review): presumably one motion vector per entry of ref_frame[] --
+ // confirm.
+ int_mv mv[2];
+ uint8_t ref_mv_idx;
+ PARTITION_TYPE partition;
+ /* deringing gain *per-superblock* */
+ int8_t cdef_strength;
+ int current_qindex;
+ int delta_lf_from_base;
+ int delta_lf[FRAME_LF_COUNT];
+ // Debug-only bookkeeping, present in CONFIG_RD_DEBUG builds.
+#if CONFIG_RD_DEBUG
+ RD_STATS rd_stats;
+ int mi_row;
+ int mi_col;
+#endif
+ int num_proj_ref;
+ WarpedMotionParams wm_params;
+
+ // Index of the alpha Cb and alpha Cr combination
+ int cfl_alpha_idx;
+ // Joint sign of alpha Cb and alpha Cr
+ int cfl_alpha_signs;
+
+ int compound_idx;
+ int comp_group_idx;
+} MB_MODE_INFO;
+
+static INLINE int is_intrabc_block(const MB_MODE_INFO *mbmi) {
+ return mbmi->use_intrabc;
+}
+
+static INLINE PREDICTION_MODE get_uv_mode(UV_PREDICTION_MODE mode) {
+ assert(mode < UV_INTRA_MODES);
+ static const PREDICTION_MODE uv2y[] = {
+ DC_PRED, // UV_DC_PRED
+ V_PRED, // UV_V_PRED
+ H_PRED, // UV_H_PRED
+ D45_PRED, // UV_D45_PRED
+ D135_PRED, // UV_D135_PRED
+ D113_PRED, // UV_D113_PRED
+ D157_PRED, // UV_D157_PRED
+ D203_PRED, // UV_D203_PRED
+ D67_PRED, // UV_D67_PRED
+ SMOOTH_PRED, // UV_SMOOTH_PRED
+ SMOOTH_V_PRED, // UV_SMOOTH_V_PRED
+ SMOOTH_H_PRED, // UV_SMOOTH_H_PRED
+ PAETH_PRED, // UV_PAETH_PRED
+ DC_PRED, // UV_CFL_PRED
+ INTRA_INVALID, // UV_INTRA_MODES
+ INTRA_INVALID, // UV_MODE_INVALID
+ };
+ return uv2y[mode];
+}
+
+static INLINE int is_inter_block(const MB_MODE_INFO *mbmi) {
+ return is_intrabc_block(mbmi) || mbmi->ref_frame[0] > INTRA_FRAME;
+}
+
+static INLINE int has_second_ref(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[1] > INTRA_FRAME;
+}
+
+static INLINE int has_uni_comp_refs(const MB_MODE_INFO *mbmi) {
+ return has_second_ref(mbmi) && (!((mbmi->ref_frame[0] >= BWDREF_FRAME) ^
+ (mbmi->ref_frame[1] >= BWDREF_FRAME)));
+}
+
+static INLINE MV_REFERENCE_FRAME comp_ref0(int ref_idx) {
+ static const MV_REFERENCE_FRAME lut[] = {
+ LAST_FRAME, // LAST_LAST2_FRAMES,
+ LAST_FRAME, // LAST_LAST3_FRAMES,
+ LAST_FRAME, // LAST_GOLDEN_FRAMES,
+ BWDREF_FRAME, // BWDREF_ALTREF_FRAMES,
+ LAST2_FRAME, // LAST2_LAST3_FRAMES
+ LAST2_FRAME, // LAST2_GOLDEN_FRAMES,
+ LAST3_FRAME, // LAST3_GOLDEN_FRAMES,
+ BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES,
+ ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES,
+ };
+ assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+ return lut[ref_idx];
+}
+
+static INLINE MV_REFERENCE_FRAME comp_ref1(int ref_idx) {
+ static const MV_REFERENCE_FRAME lut[] = {
+ LAST2_FRAME, // LAST_LAST2_FRAMES,
+ LAST3_FRAME, // LAST_LAST3_FRAMES,
+ GOLDEN_FRAME, // LAST_GOLDEN_FRAMES,
+ ALTREF_FRAME, // BWDREF_ALTREF_FRAMES,
+ LAST3_FRAME, // LAST2_LAST3_FRAMES
+ GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES,
+ GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES,
+ ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES,
+ ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES,
+ };
+ assert(NELEMENTS(lut) == TOTAL_UNIDIR_COMP_REFS);
+ return lut[ref_idx];
+}
+
+PREDICTION_MODE av1_left_block_mode(const MB_MODE_INFO *left_mi);
+
+PREDICTION_MODE av1_above_block_mode(const MB_MODE_INFO *above_mi);
+
+static INLINE int is_global_mv_block(const MB_MODE_INFO *const mbmi,
+ TransformationType type) {
+ const PREDICTION_MODE mode = mbmi->mode;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int block_size_allowed =
+ AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+ return (mode == GLOBALMV || mode == GLOBAL_GLOBALMV) && type > TRANSLATION &&
+ block_size_allowed;
+}
+
+#if CONFIG_MISMATCH_DEBUG
+static INLINE void mi_to_pixel_loc(int *pixel_c, int *pixel_r, int mi_col,
+ int mi_row, int tx_blk_col, int tx_blk_row,
+ int subsampling_x, int subsampling_y) {
+ *pixel_c = ((mi_col >> subsampling_x) << MI_SIZE_LOG2) +
+ (tx_blk_col << tx_size_wide_log2[0]);
+ *pixel_r = ((mi_row >> subsampling_y) << MI_SIZE_LOG2) +
+ (tx_blk_row << tx_size_high_log2[0]);
+}
+#endif
+
+enum ATTRIBUTE_PACKED mv_precision { MV_PRECISION_Q3, MV_PRECISION_Q4 };
+
+struct buf_2d {
+ uint8_t *buf;
+ uint8_t *buf0;
+ int width;
+ int height;
+ int stride;
+};
+
+typedef struct eob_info {
+ uint16_t eob;
+ uint16_t max_scan_line;
+} eob_info;
+
+typedef struct {
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_MB_PLANE][MAX_SB_SQUARE]);
+ eob_info eob_data[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
+} CB_BUFFER;
+
+typedef struct macroblockd_plane {
+ tran_low_t *dqcoeff;
+ tran_low_t *dqcoeff_block;
+ eob_info *eob_data;
+ PLANE_TYPE plane_type;
+ int subsampling_x;
+ int subsampling_y;
+ struct buf_2d dst;
+ struct buf_2d pre[2];
+ ENTROPY_CONTEXT *above_context;
+ ENTROPY_CONTEXT *left_context;
+
+ // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t seg_dequant_QTX[MAX_SEGMENTS][2];
+ uint8_t *color_index_map;
+
+ // block size in pixels
+ uint8_t width, height;
+
+ qm_val_t *seg_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ qm_val_t *seg_qmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+ // the 'dequantizers' below are not literal dequantizer values.
+ // They're used by encoder RDO to generate ad-hoc lambda values.
+ // They use a hardwired Q3 coeff shift and do not necessarily match
+ // the TX scale in use.
+ const int16_t *dequant_Q3;
+} MACROBLOCKD_PLANE;
+
+#define BLOCK_OFFSET(x, i) \
+ ((x) + (i) * (1 << (tx_size_wide_log2[0] + tx_size_high_log2[0])))
+
+typedef struct RefBuffer {
+ int idx; // frame buf idx
+ int map_idx; // frame map idx
+ YV12_BUFFER_CONFIG *buf;
+ struct scale_factors sf;
+} RefBuffer;
+
+typedef struct {
+ DECLARE_ALIGNED(16, InterpKernel, vfilter);
+ DECLARE_ALIGNED(16, InterpKernel, hfilter);
+} WienerInfo;
+
+typedef struct {
+ int ep;
+ int xqd[2];
+} SgrprojInfo;
+
+#if CONFIG_DEBUG
+#define CFL_SUB8X8_VAL_MI_SIZE (4)
+#define CFL_SUB8X8_VAL_MI_SQUARE \
+ (CFL_SUB8X8_VAL_MI_SIZE * CFL_SUB8X8_VAL_MI_SIZE)
+#endif // CONFIG_DEBUG
+#define CFL_MAX_BLOCK_SIZE (BLOCK_32X32)
+#define CFL_BUF_LINE (32)
+#define CFL_BUF_LINE_I128 (CFL_BUF_LINE >> 3)
+#define CFL_BUF_LINE_I256 (CFL_BUF_LINE >> 4)
+#define CFL_BUF_SQUARE (CFL_BUF_LINE * CFL_BUF_LINE)
+typedef struct cfl_ctx {
+ // Q3 reconstructed luma pixels (only Q2 is required, but Q3 is used to avoid
+ // shifts)
+ uint16_t recon_buf_q3[CFL_BUF_SQUARE];
+ // Q3 AC contributions (reconstructed luma pixels - tx block avg)
+ int16_t ac_buf_q3[CFL_BUF_SQUARE];
+
+ // Cache the DC_PRED when performing RDO, so it does not have to be recomputed
+ // for every scaling parameter
+ int dc_pred_is_cached[CFL_PRED_PLANES];
+ // The DC_PRED cache is disabled when decoding
+ int use_dc_pred_cache;
+ // Only cache the first row of the DC_PRED
+ int16_t dc_pred_cache[CFL_PRED_PLANES][CFL_BUF_LINE];
+
+ // Height and width currently used in the CfL prediction buffer.
+ int buf_height, buf_width;
+
+ int are_parameters_computed;
+
+ // Chroma subsampling
+ int subsampling_x, subsampling_y;
+
+ int mi_row, mi_col;
+
+ // Whether the reconstructed luma pixels need to be stored
+ int store_y;
+
+#if CONFIG_DEBUG
+ int rate;
+#endif // CONFIG_DEBUG
+
+ int is_chroma_reference;
+} CFL_CTX;
+
+typedef struct jnt_comp_params {
+ int use_jnt_comp_avg;
+ int fwd_offset;
+ int bck_offset;
+} JNT_COMP_PARAMS;
+
+// Most/all of the pointers are mere pointers to actual arrays that are
+// allocated elsewhere. This is mostly for coding convenience.
+typedef struct macroblockd {
+ struct macroblockd_plane plane[MAX_MB_PLANE];
+
+ TileInfo tile;
+
+ int mi_stride;
+
+ MB_MODE_INFO **mi;
+ MB_MODE_INFO *left_mbmi;
+ MB_MODE_INFO *above_mbmi;
+ MB_MODE_INFO *chroma_left_mbmi;
+ MB_MODE_INFO *chroma_above_mbmi;
+
+ int up_available;
+ int left_available;
+ int chroma_up_available;
+ int chroma_left_available;
+
+ /* Distance of MB away from frame edges in subpixels (1/8th pixel) */
+ int mb_to_left_edge;
+ int mb_to_right_edge;
+ int mb_to_top_edge;
+ int mb_to_bottom_edge;
+
+ /* pointers to reference frames */
+ const RefBuffer *block_refs[2];
+
+ /* pointer to current frame */
+ const YV12_BUFFER_CONFIG *cur_buf;
+
+ ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+ ENTROPY_CONTEXT left_context[MAX_MB_PLANE][MAX_MIB_SIZE];
+
+ PARTITION_CONTEXT *above_seg_context;
+ PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+
+ TXFM_CONTEXT *above_txfm_context;
+ TXFM_CONTEXT *left_txfm_context;
+ TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+ WienerInfo wiener_info[MAX_MB_PLANE];
+ SgrprojInfo sgrproj_info[MAX_MB_PLANE];
+
+ // block dimension in the unit of mode_info.
+ uint8_t n4_w, n4_h;
+
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ uint8_t is_sec_rect;
+
+ // Counts of each reference frame in the above and left neighboring blocks.
+ // NOTE: Take into account both single and comp references.
+ uint8_t neighbors_ref_counts[REF_FRAMES];
+
+ FRAME_CONTEXT *tile_ctx;
+ /* Bit depth: 8, 10, 12 */
+ int bd;
+
+ int qindex[MAX_SEGMENTS];
+ int lossless[MAX_SEGMENTS];
+ int corrupted;
+ int cur_frame_force_integer_mv;
+ // same as that in AV1_COMMON
+ struct aom_internal_error_info *error_info;
+ const WarpedMotionParams *global_motion;
+ int delta_qindex;
+ int current_qindex;
+ // Since actual frame level loop filtering level value is not available
+ // at the beginning of the tile (only available during actual filtering)
+ // at encoder side. We record the delta_lf (against the frame level loop
+ // filtering level) and code the delta between previous superblock's delta
+ // lf and current delta lf. It is equivalent to the delta between previous
+ // superblock's actual lf and current lf.
+ int delta_lf_from_base;
+ // For this experiment, we have four frame filter levels for different plane
+ // and direction. So, to support the per superblock update, we need to add
+ // a few more params as below.
+ // 0: delta loop filter level for y plane vertical
+ // 1: delta loop filter level for y plane horizontal
+ // 2: delta loop filter level for u plane
+ // 3: delta loop filter level for v plane
+ // To make it consistent with the reference to each filter level in segment,
+ // we need to -1, since
+ // SEG_LVL_ALT_LF_Y_V = 1;
+ // SEG_LVL_ALT_LF_Y_H = 2;
+ // SEG_LVL_ALT_LF_U = 3;
+ // SEG_LVL_ALT_LF_V = 4;
+ int delta_lf[FRAME_LF_COUNT];
+ int cdef_preset[4];
+
+ DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+ uint8_t *mc_buf[2];
+ CFL_CTX cfl;
+
+ JNT_COMP_PARAMS jcp_param;
+
+ uint16_t cb_offset[MAX_MB_PLANE];
+ uint16_t txb_offset[MAX_MB_PLANE];
+ uint16_t color_index_map_offset[2];
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+} MACROBLOCKD;
+
+static INLINE int get_bitdepth_data_path_index(const MACROBLOCKD *xd) {
+ return xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ? 1 : 0;
+}
+
+static INLINE uint8_t *get_buf_by_bd(const MACROBLOCKD *xd, uint8_t *buf16) {
+ return (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ ? CONVERT_TO_BYTEPTR(buf16)
+ : buf16;
+}
+
+static INLINE int get_sqr_bsize_idx(BLOCK_SIZE bsize) {
+ switch (bsize) {
+ case BLOCK_4X4: return 0;
+ case BLOCK_8X8: return 1;
+ case BLOCK_16X16: return 2;
+ case BLOCK_32X32: return 3;
+ case BLOCK_64X64: return 4;
+ case BLOCK_128X128: return 5;
+ default: return SQR_BLOCK_SIZES;
+ }
+}
+
+// For a square block size 'bsize', returns the size of the sub-blocks used by
+// the given partition type. If the partition produces sub-blocks of different
+// sizes, then the function returns the largest sub-block size.
+// Implements the Partition_Subsize lookup table in the spec (Section 9.3.
+// Conversion tables).
+// Note: the input block size should be square.
+// Otherwise it's considered invalid.
+static INLINE BLOCK_SIZE get_partition_subsize(BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (partition == PARTITION_INVALID) {
+ return BLOCK_INVALID;
+ } else {
+ const int sqr_bsize_idx = get_sqr_bsize_idx(bsize);
+ return sqr_bsize_idx >= SQR_BLOCK_SIZES
+ ? BLOCK_INVALID
+ : subsize_lookup[partition][sqr_bsize_idx];
+ }
+}
+
+static TX_TYPE intra_mode_to_tx_type(const MB_MODE_INFO *mbmi,
+ PLANE_TYPE plane_type) {
+ static const TX_TYPE _intra_mode_to_tx_type[INTRA_MODES] = {
+ DCT_DCT, // DC_PRED
+ ADST_DCT, // V_PRED
+ DCT_ADST, // H_PRED
+ DCT_DCT, // D45_PRED
+ ADST_ADST, // D135_PRED
+ ADST_DCT, // D113_PRED
+ DCT_ADST, // D157_PRED
+ DCT_ADST, // D203_PRED
+ ADST_DCT, // D67_PRED
+ ADST_ADST, // SMOOTH_PRED
+ ADST_DCT, // SMOOTH_V_PRED
+ DCT_ADST, // SMOOTH_H_PRED
+ ADST_ADST, // PAETH_PRED
+ };
+ const PREDICTION_MODE mode =
+ (plane_type == PLANE_TYPE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+ assert(mode < INTRA_MODES);
+ return _intra_mode_to_tx_type[mode];
+}
+
+static INLINE int is_rect_tx(TX_SIZE tx_size) { return tx_size >= TX_SIZES; }
+
+static INLINE int block_signals_txsize(BLOCK_SIZE bsize) {
+ return bsize > BLOCK_4X4;
+}
+
+// Number of transform types in each set type
+static const int av1_num_ext_tx_set[EXT_TX_SET_TYPES] = {
+ 1, 2, 5, 7, 12, 16,
+};
+
+static const int av1_ext_tx_used[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
+
+static const uint16_t av1_ext_tx_used_flag[EXT_TX_SET_TYPES] = {
+ 0x0001, // 0000 0000 0000 0001
+ 0x0201, // 0000 0010 0000 0001
+ 0x020F, // 0000 0010 0000 1111
+ 0x0E0F, // 0000 1110 0000 1111
+ 0x0FFF, // 0000 1111 1111 1111
+ 0xFFFF, // 1111 1111 1111 1111
+};
+
+static INLINE TxSetType av1_get_ext_tx_set_type(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
+ if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
+ if (tx_size_sqr_up == TX_32X32)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DCTONLY;
+ if (use_reduced_set)
+ return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
+ const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
+ if (is_inter) {
+ return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT9_IDTX_1DDCT
+ : EXT_TX_SET_ALL16);
+ } else {
+ return (tx_size_sqr == TX_16X16 ? EXT_TX_SET_DTT4_IDTX
+ : EXT_TX_SET_DTT4_IDTX_1DDCT);
+ }
+}
+
+// Maps tx set types to the indices.
+static const int ext_tx_set_index[2][EXT_TX_SET_TYPES] = {
+ { // Intra
+ 0, -1, 2, 1, -1, -1 },
+ { // Inter
+ 0, 3, -1, -1, 2, 1 },
+};
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const TxSetType set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+ return ext_tx_set_index[is_inter][set_type];
+}
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, int is_inter,
+ int use_reduced_set) {
+ const int set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, use_reduced_set);
+ return av1_num_ext_tx_set[set_type];
+}
+
+#define TXSIZEMAX(t1, t2) (tx_size_2d[(t1)] >= tx_size_2d[(t2)] ? (t1) : (t2))
+#define TXSIZEMIN(t1, t2) (tx_size_2d[(t1)] <= tx_size_2d[(t2)] ? (t1) : (t2))
+
+static INLINE TX_SIZE tx_size_from_tx_mode(BLOCK_SIZE bsize, TX_MODE tx_mode) {
+ const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+ const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bsize];
+ if (bsize == BLOCK_4X4)
+ return AOMMIN(max_txsize_lookup[bsize], largest_tx_size);
+ if (txsize_sqr_map[max_rect_tx_size] <= largest_tx_size)
+ return max_rect_tx_size;
+ else
+ return largest_tx_size;
+}
+
+extern const int16_t dr_intra_derivative[90];
+static const uint8_t mode_to_angle_map[] = {
+ 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0,
+};
+
+// Converts block_index for given transform size to index of the block in raster
+// order.
+static INLINE int av1_block_index_to_raster_order(TX_SIZE tx_size,
+ int block_idx) {
+ // For transform size 4x8, the possible block_idx values are 0 & 2, because
+ // block_idx values are incremented in steps of size 'tx_width_unit x
+ // tx_height_unit'. But, for this transform size, block_idx = 2 corresponds to
+ // block number 1 in raster order, inside an 8x8 MI block.
+ // For any other transform size, the two indices are equivalent.
+ return (tx_size == TX_4X8 && block_idx == 2) ? 1 : block_idx;
+}
+
+// Inverse of above function.
+// Note: only implemented for transform sizes 4x4, 4x8 and 8x4 right now.
+static INLINE int av1_raster_order_to_block_index(TX_SIZE tx_size,
+ int raster_order) {
+ assert(tx_size == TX_4X4 || tx_size == TX_4X8 || tx_size == TX_8X4);
+ // We ensure that block indices are 0 & 2 if tx size is 4x8 or 8x4.
+ return (tx_size == TX_4X4) ? raster_order : (raster_order > 0) ? 2 : 0;
+}
+
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd,
+ TX_SIZE tx_size) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+ xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+ return DCT_DCT;
+
+ return intra_mode_to_tx_type(mbmi, plane_type);
+}
+
+// Implements the get_plane_residual_size() function in the spec (Section
+// 5.11.38. Get plane residual size function).
+static INLINE BLOCK_SIZE get_plane_block_size(BLOCK_SIZE bsize,
+ int subsampling_x,
+ int subsampling_y) {
+ if (bsize == BLOCK_INVALID) return BLOCK_INVALID;
+ return ss_size_lookup[bsize][subsampling_x][subsampling_y];
+}
+
+static INLINE int av1_get_txb_size_index(BLOCK_SIZE bsize, int blk_row,
+ int blk_col) {
+ TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ for (int level = 0; level < MAX_VARTX_DEPTH - 1; ++level)
+ txs = sub_tx_size_map[txs];
+ const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ const int bw_log2 = mi_size_wide_log2[bsize];
+ const int stride_log2 = bw_log2 - tx_w_log2;
+ const int index =
+ ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ assert(index < INTER_TX_SIZE_BUF_LEN);
+ return index;
+}
+
+static INLINE int av1_get_txk_type_index(BLOCK_SIZE bsize, int blk_row,
+ int blk_col) {
+ TX_SIZE txs = max_txsize_rect_lookup[bsize];
+ for (int level = 0; level < MAX_VARTX_DEPTH; ++level)
+ txs = sub_tx_size_map[txs];
+ const int tx_w_log2 = tx_size_wide_log2[txs] - MI_SIZE_LOG2;
+ const int tx_h_log2 = tx_size_high_log2[txs] - MI_SIZE_LOG2;
+ const int bw_uint_log2 = mi_size_wide_log2[bsize];
+ const int stride_log2 = bw_uint_log2 - tx_w_log2;
+ const int index =
+ ((blk_row >> tx_h_log2) << stride_log2) + (blk_col >> tx_w_log2);
+ assert(index < TXK_TYPE_BUF_LEN);
+ return index;
+}
+
+static INLINE void update_txk_array(TX_TYPE *txk_type, BLOCK_SIZE bsize,
+ int blk_row, int blk_col, TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ const int txk_type_idx = av1_get_txk_type_index(bsize, blk_row, blk_col);
+ txk_type[txk_type_idx] = tx_type;
+
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ // The 16x16 unit is due to the constraint from tx_64x64 which sets the
+ // maximum tx size for chroma as 32x32. Coupled with 4x1 transform block
+ // size, the constraint takes effect in 32x16 / 16x32 size too. To solve
+ // the intricacy, cover all the 16x16 units inside a 64 level transform.
+ if (txw == tx_size_wide_unit[TX_64X64] ||
+ txh == tx_size_high_unit[TX_64X64]) {
+ const int tx_unit = tx_size_wide_unit[TX_16X16];
+ for (int idy = 0; idy < txh; idy += tx_unit) {
+ for (int idx = 0; idx < txw; idx += tx_unit) {
+ const int this_index =
+ av1_get_txk_type_index(bsize, blk_row + idy, blk_col + idx);
+ txk_type[this_index] = tx_type;
+ }
+ }
+ }
+}
+
+static INLINE TX_TYPE av1_get_tx_type(PLANE_TYPE plane_type,
+ const MACROBLOCKD *xd, int blk_row,
+ int blk_col, TX_SIZE tx_size,
+ int reduced_tx_set) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblockd_plane *const pd = &xd->plane[plane_type];
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter_block(mbmi), reduced_tx_set);
+
+ TX_TYPE tx_type;
+ if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32) {
+ tx_type = DCT_DCT;
+ } else {
+ if (plane_type == PLANE_TYPE_Y) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ tx_type = mbmi->txk_type[txk_type_idx];
+ } else if (is_inter_block(mbmi)) {
+ // scale back to y plane's coordinate
+ blk_row <<= pd->subsampling_y;
+ blk_col <<= pd->subsampling_x;
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ tx_type = mbmi->txk_type[txk_type_idx];
+ } else {
+ // In intra mode, uv planes don't share the same prediction mode as y
+ // plane, so the tx_type should not be shared
+ tx_type = intra_mode_to_tx_type(mbmi, PLANE_TYPE_UV);
+ }
+ }
+ assert(tx_type < TX_TYPES);
+ if (!av1_ext_tx_used[tx_set_type][tx_type]) return DCT_DCT;
+ return tx_type;
+}
+
+void av1_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y,
+ const int num_planes);
+
+static INLINE int bsize_to_max_depth(BLOCK_SIZE bsize) {
+ TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ int depth = 0;
+ while (depth < MAX_TX_DEPTH && tx_size != TX_4X4) {
+ depth++;
+ tx_size = sub_tx_size_map[tx_size];
+ }
+ return depth;
+}
+
+static INLINE int bsize_to_tx_size_cat(BLOCK_SIZE bsize) {
+ TX_SIZE tx_size = max_txsize_rect_lookup[bsize];
+ assert(tx_size != TX_4X4);
+ int depth = 0;
+ while (tx_size != TX_4X4) {
+ depth++;
+ tx_size = sub_tx_size_map[tx_size];
+ assert(depth < 10);
+ }
+ assert(depth <= MAX_TX_CATS);
+ return depth - 1;
+}
+
+static INLINE TX_SIZE depth_to_tx_size(int depth, BLOCK_SIZE bsize) {
+ TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ TX_SIZE tx_size = max_tx_size;
+ for (int d = 0; d < depth; ++d) tx_size = sub_tx_size_map[tx_size];
+ return tx_size;
+}
+
+static INLINE TX_SIZE av1_get_adjusted_tx_size(TX_SIZE tx_size) {
+ switch (tx_size) {
+ case TX_64X64:
+ case TX_64X32:
+ case TX_32X64: return TX_32X32;
+ case TX_64X16: return TX_32X16;
+ case TX_16X64: return TX_16X32;
+ default: return tx_size;
+ }
+}
+
+static INLINE TX_SIZE av1_get_max_uv_txsize(BLOCK_SIZE bsize, int subsampling_x,
+ int subsampling_y) {
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, subsampling_x, subsampling_y);
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ const TX_SIZE uv_tx = max_txsize_rect_lookup[plane_bsize];
+ return av1_get_adjusted_tx_size(uv_tx);
+}
+
+static INLINE TX_SIZE av1_get_tx_size(int plane, const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ if (xd->lossless[mbmi->segment_id]) return TX_4X4;
+ if (plane == 0) return mbmi->tx_size;
+ const MACROBLOCKD_PLANE *pd = &xd->plane[plane];
+ return av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y);
+}
+
+void av1_reset_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, const int num_planes);
+
+void av1_reset_loop_filter_delta(MACROBLOCKD *xd, int num_planes);
+
+void av1_reset_loop_restoration(MACROBLOCKD *xd, const int num_planes);
+
+typedef void (*foreach_transformed_block_visitor)(int plane, int block,
+ int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg);
+
+void av1_set_contexts(const MACROBLOCKD *xd, struct macroblockd_plane *pd,
+ int plane, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ int has_eob, int aoff, int loff);
+
+#define MAX_INTERINTRA_SB_SQUARE (32 * 32) // parenthesized: safe in any expr
+static INLINE int is_interintra_mode(const MB_MODE_INFO *mbmi) {
+ return (mbmi->ref_frame[0] > INTRA_FRAME &&
+ mbmi->ref_frame[1] == INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+ return (bsize >= BLOCK_8X8) && (bsize <= BLOCK_32X32);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+ return (mode >= SINGLE_INTER_MODE_START) && (mode < SINGLE_INTER_MODE_END);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+ return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+ return is_interintra_allowed_bsize(mbmi->sb_type) &&
+ is_interintra_allowed_mode(mbmi->mode) &&
+ is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(int group) {
+ int i;
+ for (i = 0; i < BLOCK_SIZES_ALL; i++) {
+ if (size_group_lookup[i] == group &&
+ is_interintra_allowed_bsize((BLOCK_SIZE)i)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+ return mbmi->ref_frame[0] > INTRA_FRAME &&
+ mbmi->ref_frame[1] == INTRA_FRAME && is_interintra_allowed(mbmi);
+}
+
+static INLINE int get_vartx_max_txsize(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane) {
+ if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+ const TX_SIZE max_txsize = max_txsize_rect_lookup[bsize];
+ if (plane == 0) return max_txsize; // luma
+ return av1_get_adjusted_tx_size(max_txsize); // chroma
+}
+
+static INLINE int is_motion_variation_allowed_bsize(BLOCK_SIZE bsize) {
+ return AOMMIN(block_size_wide[bsize], block_size_high[bsize]) >= 8;
+}
+
+static INLINE int is_motion_variation_allowed_compound(
+ const MB_MODE_INFO *mbmi) {
+ if (!has_second_ref(mbmi))
+ return 1;
+ else
+ return 0;
+}
+
+// input: log2 of length, 0(4), 1(8), ...
+static const int max_neighbor_obmc[6] = { 0, 1, 2, 3, 4, 4 };
+
+static INLINE int check_num_overlappable_neighbors(const MB_MODE_INFO *mbmi) {
+ return !(mbmi->overlappable_neighbors[0] == 0 &&
+ mbmi->overlappable_neighbors[1] == 0);
+}
+
+static INLINE MOTION_MODE
+motion_mode_allowed(const WarpedMotionParams *gm_params, const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi, int allow_warped_motion) {
+ if (xd->cur_frame_force_integer_mv == 0) {
+ const TransformationType gm_type = gm_params[mbmi->ref_frame[0]].wmtype;
+ if (is_global_mv_block(mbmi, gm_type)) return SIMPLE_TRANSLATION;
+ }
+ if (is_motion_variation_allowed_bsize(mbmi->sb_type) &&
+ is_inter_mode(mbmi->mode) && mbmi->ref_frame[1] != INTRA_FRAME &&
+ is_motion_variation_allowed_compound(mbmi)) {
+ if (!check_num_overlappable_neighbors(mbmi)) return SIMPLE_TRANSLATION;
+ assert(!has_second_ref(mbmi));
+ if (mbmi->num_proj_ref >= 1 &&
+ (allow_warped_motion && !av1_is_scaled(&(xd->block_refs[0]->sf)))) {
+ if (xd->cur_frame_force_integer_mv) {
+ return OBMC_CAUSAL;
+ }
+ return WARPED_CAUSAL;
+ }
+ return OBMC_CAUSAL;
+ } else {
+ return SIMPLE_TRANSLATION;
+ }
+}
+
+static INLINE void assert_motion_mode_valid(MOTION_MODE mode,
+ const WarpedMotionParams *gm_params,
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi,
+ int allow_warped_motion) {
+ const MOTION_MODE last_motion_mode_allowed =
+ motion_mode_allowed(gm_params, xd, mbmi, allow_warped_motion);
+
+ // Check that the input mode is not illegal
+ if (last_motion_mode_allowed < mode)
+ assert(0 && "Illegal motion mode selected");
+}
+
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+ return (is_inter_block(mbmi));
+}
+
+static INLINE int av1_allow_palette(int allow_screen_content_tools,
+ BLOCK_SIZE sb_type) {
+ return allow_screen_content_tools && block_size_wide[sb_type] <= 64 &&
+ block_size_high[sb_type] <= 64 && sb_type >= BLOCK_8X8;
+}
+
+// Returns sub-sampled dimensions of the given block.
+// The output values for 'rows_within_bounds' and 'cols_within_bounds' will
+// differ from 'height' and 'width' when part of the block is outside the
+// right
+// and/or bottom image boundary.
+static INLINE void av1_get_block_dimensions(BLOCK_SIZE bsize, int plane,
+ const MACROBLOCKD *xd, int *width,
+ int *height,
+ int *rows_within_bounds,
+ int *cols_within_bounds) {
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ const int block_rows = (xd->mb_to_bottom_edge >= 0)
+ ? block_height
+ : (xd->mb_to_bottom_edge >> 3) + block_height;
+ const int block_cols = (xd->mb_to_right_edge >= 0)
+ ? block_width
+ : (xd->mb_to_right_edge >> 3) + block_width;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_x == 0));
+ assert(IMPLIES(plane == PLANE_TYPE_Y, pd->subsampling_y == 0));
+ assert(block_width >= block_cols);
+ assert(block_height >= block_rows);
+ const int plane_block_width = block_width >> pd->subsampling_x;
+ const int plane_block_height = block_height >> pd->subsampling_y;
+ // Special handling for chroma sub8x8.
+ const int is_chroma_sub8_x = plane > 0 && plane_block_width < 4;
+ const int is_chroma_sub8_y = plane > 0 && plane_block_height < 4;
+ if (width) *width = plane_block_width + 2 * is_chroma_sub8_x;
+ if (height) *height = plane_block_height + 2 * is_chroma_sub8_y;
+ if (rows_within_bounds) {
+ *rows_within_bounds =
+ (block_rows >> pd->subsampling_y) + 2 * is_chroma_sub8_y;
+ }
+ if (cols_within_bounds) {
+ *cols_within_bounds =
+ (block_cols >> pd->subsampling_x) + 2 * is_chroma_sub8_x;
+ }
+}
+
+/* clang-format off */
+typedef aom_cdf_prob (*MapCdf)[PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+typedef const int (*ColorCost)[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+/* clang-format on */
+
+typedef struct {
+ int rows;
+ int cols;
+ int n_colors;
+ int plane_width;
+ int plane_height;
+ uint8_t *color_map;
+ MapCdf map_cdf;
+ ColorCost color_cost;
+} Av1ColorMapParam;
+
+static INLINE int is_nontrans_global_motion(const MACROBLOCKD *xd,
+ const MB_MODE_INFO *mbmi) {
+ int ref;
+
+ // First check if all modes are GLOBALMV
+ if (mbmi->mode != GLOBALMV && mbmi->mode != GLOBAL_GLOBALMV) return 0;
+
+ if (AOMMIN(mi_size_wide[mbmi->sb_type], mi_size_high[mbmi->sb_type]) < 2)
+ return 0;
+
+ // Now check if all global motion is non translational
+ for (ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+ if (xd->global_motion[mbmi->ref_frame[ref]].wmtype == TRANSLATION) return 0;
+ }
+ return 1;
+}
+
+static INLINE PLANE_TYPE get_plane_type(int plane) {
+ return (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+}
+
+static INLINE int av1_get_max_eob(TX_SIZE tx_size) {
+ if (tx_size == TX_64X64 || tx_size == TX_64X32 || tx_size == TX_32X64) {
+ return 1024;
+ }
+ if (tx_size == TX_16X64 || tx_size == TX_64X16) {
+ return 512;
+ }
+ return tx_size_2d[tx_size];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_BLOCKD_H_
diff --git a/third_party/aom/av1/common/cdef.c b/third_party/aom/av1/common/cdef.c
new file mode 100644
index 0000000000..e9e2b0e42c
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+/* Returns 1 if every mode-info entry in the 64x64 area starting at
+ * (mi_row, mi_col) -- clipped to the frame's mi_rows / mi_cols -- has its
+ * skip flag set, and 0 as soon as one entry does not. */
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col) {
+  const int rows = AOMMIN(cm->mi_rows - mi_row, MI_SIZE_64X64);
+  const int cols = AOMMIN(cm->mi_cols - mi_col, MI_SIZE_64X64);
+
+  for (int r = 0; r < rows; r++) {
+    const int row_base = (mi_row + r) * cm->mi_stride + mi_col;
+    for (int c = 0; c < cols; c++) {
+      if (!cm->mi_grid_visible[row_base + c]->skip) return 0;
+    }
+  }
+  return 1;
+}
+
+/* ANDs together the skip flags of all mode-info entries covered by the 8x8
+ * block whose top-left entry is at (mi_row, mi_col) in |grid|. */
+static int is_8x8_block_skip(MB_MODE_INFO **grid, int mi_row, int mi_col,
+                             int mi_stride) {
+  const int rows = mi_size_high[BLOCK_8X8];
+  const int cols = mi_size_wide[BLOCK_8X8];
+  int all_skip = 1;
+
+  for (int dr = 0; dr < rows; ++dr) {
+    const int row_base = (mi_row + dr) * mi_stride;
+    for (int dc = 0; dc < cols; ++dc) {
+      all_skip &= grid[row_base + (mi_col + dc)]->skip;
+    }
+  }
+  return all_skip;
+}
+
+/* Fills |dlist| with the coordinates of every 8x8 block inside the area at
+ * (mi_row, mi_col) that is not entirely skipped, and returns how many
+ * entries were written. The area is 128 mi-units wide and/or tall only for
+ * the matching 128-sized |bs| values, otherwise 64; it is always clipped to
+ * the frame. Each entry's by/bx are in 8x8-block units relative to the area
+ * origin and its skip field is set to 0. */
+int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+                         cdef_list *dlist, BLOCK_SIZE bs) {
+  MB_MODE_INFO **grid = cm->mi_grid_visible;
+  const int is_wide128 = (bs == BLOCK_128X128 || bs == BLOCK_128X64);
+  const int is_tall128 = (bs == BLOCK_128X128 || bs == BLOCK_64X128);
+  const int maxc = AOMMIN(cm->mi_cols - mi_col,
+                          (is_wide128 ? MI_SIZE_128X128 : MI_SIZE_64X64));
+  const int maxr = AOMMIN(cm->mi_rows - mi_row,
+                          (is_tall128 ? MI_SIZE_128X128 : MI_SIZE_64X64));
+
+  // Number of mi units per 8x8 block in each direction (1 or 2), and the
+  // shift that converts an mi offset into an 8x8-block offset.
+  const int r_step = mi_size_high[BLOCK_8X8];
+  const int c_step = mi_size_wide[BLOCK_8X8];
+  assert(r_step == 1 || r_step == 2);
+  assert(c_step == 1 || c_step == 2);
+  const int r_shift = (r_step == 2);
+  const int c_shift = (c_step == 2);
+
+  int count = 0;
+  for (int r = 0; r < maxr; r += r_step) {
+    for (int c = 0; c < maxc; c += c_step) {
+      if (is_8x8_block_skip(grid, mi_row + r, mi_col + c, cm->mi_stride))
+        continue;
+      cdef_list *const entry = &dlist[count++];
+      entry->by = r >> r_shift;
+      entry->bx = c >> c_shift;
+      entry->skip = 0;
+    }
+  }
+  return count;
+}
+
+/* Copies a v-row by h-column rectangle of 8-bit samples into a 16-bit
+ * destination, widening each sample. Strides are in elements. */
+void copy_rect8_8bit_to_16bit_c(uint16_t *dst, int dstride, const uint8_t *src,
+                                int sstride, int v, int h) {
+  for (int row = 0; row < v; ++row) {
+    const int d0 = row * dstride;
+    const int s0 = row * sstride;
+    for (int col = 0; col < h; ++col) dst[d0 + col] = src[s0 + col];
+  }
+}
+
+/* Copies a v-row by h-column rectangle of 16-bit samples from |src| to
+ * |dst|. Strides are in elements. */
+void copy_rect8_16bit_to_16bit_c(uint16_t *dst, int dstride,
+                                 const uint16_t *src, int sstride, int v,
+                                 int h) {
+  for (int row = 0; row < v; ++row) {
+    const int d0 = row * dstride;
+    const int s0 = row * sstride;
+    for (int col = 0; col < h; ++col) dst[d0 + col] = src[s0 + col];
+  }
+}
+
+/* Copies a vsize x hsize rectangle starting at (src_voffset, src_hoffset) of
+ * the frame buffer |src| into the 16-bit buffer |dst|.
+ *
+ * When cm->seq_params.use_highbitdepth is set, |src| is converted with
+ * CONVERT_TO_SHORTPTR and copied as 16-bit samples; otherwise it is read as
+ * 8-bit samples and widened. |sstride| and |dstride| are in elements.
+ *
+ * |cm| is read unconditionally, so it is not marked AOM_UNUSED. */
+static void copy_sb8_16(AV1_COMMON *cm, uint16_t *dst, int dstride,
+                        const uint8_t *src, int src_voffset, int src_hoffset,
+                        int sstride, int vsize, int hsize) {
+  const int offset = src_voffset * sstride + src_hoffset;
+  if (cm->seq_params.use_highbitdepth) {
+    const uint16_t *base = &CONVERT_TO_SHORTPTR(src)[offset];
+    copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+  } else {
+    const uint8_t *base = &src[offset];
+    copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, vsize, hsize);
+  }
+}
+
+/* Sets every sample of a v-row by h-column rectangle in |dst| to |x|.
+ * |dstride| is in elements. */
+static INLINE void fill_rect(uint16_t *dst, int dstride, int v, int h,
+                             uint16_t x) {
+  for (int row = 0; row < v; ++row) {
+    const int d0 = row * dstride;
+    for (int col = 0; col < h; ++col) dst[d0 + col] = x;
+  }
+}
+
+/* Copies a v-row by h-column rectangle of 16-bit samples from |src| to
+ * |dst|. Both strides are in elements. */
+static INLINE void copy_rect(uint16_t *dst, int dstride, const uint16_t *src,
+                             int sstride, int v, int h) {
+  for (int row = 0; row < v; ++row) {
+    const int d0 = row * dstride;
+    const int s0 = row * sstride;
+    for (int col = 0; col < h; ++col) dst[d0 + col] = src[s0 + col];
+  }
+}
+
+/* Runs the CDEF filter over |frame|, one 64x64 filter block at a time, and
+ * writes the filtered samples back into the plane buffers that
+ * av1_setup_dst_planes() attaches to |xd| from |frame|.
+ *
+ * For each filter block the function builds a padded 16-bit copy of the
+ * block in src[] (CDEF_VBORDER rows above/below, CDEF_HBORDER columns
+ * left/right), then calls cdef_filter_fb() per plane. Because filtering is
+ * done in place, samples a later block needs from an already filtered
+ * neighbour are taken from saved copies instead of the frame:
+ *   - linebuf[pli]: the last CDEF_VBORDER rows of the row of blocks above,
+ *     saved before they were filtered;
+ *   - colbuf[pli]: the last CDEF_HBORDER columns of the block to the left.
+ * Border regions that lie outside the frame are filled with CDEF_VERY_LARGE.
+ *
+ * row_cdef holds two rows of per-column flags (with one guard entry on each
+ * side, initialised to 1) recording whether the block in that column was
+ * filtered in the previous / current row of filter blocks.
+ *
+ * NOTE(review): the aom_malloc() results are used without a NULL check. */
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+                    MACROBLOCKD *xd) {
+  const int num_planes = av1_num_planes(cm);
+  DECLARE_ALIGNED(16, uint16_t, src[CDEF_INBUF_SIZE]);
+  uint16_t *linebuf[3];
+  uint16_t *colbuf[3];
+  cdef_list dlist[MI_SIZE_64X64 * MI_SIZE_64X64];
+  unsigned char *row_cdef, *prev_row_cdef, *curr_row_cdef;
+  int cdef_count;
+  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int mi_wide_l2[3];
+  int mi_high_l2[3];
+  int xdec[3];
+  int ydec[3];
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  const int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  const int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  row_cdef = aom_malloc(sizeof(*row_cdef) * (nhfb + 2) * 2);
+  memset(row_cdef, 1, sizeof(*row_cdef) * (nhfb + 2) * 2);
+  /* Offset by one so that index -1 and index nhfb are valid guard entries. */
+  prev_row_cdef = row_cdef + 1;
+  curr_row_cdef = prev_row_cdef + nhfb + 2;
+  for (int pli = 0; pli < num_planes; pli++) {
+    xdec[pli] = xd->plane[pli].subsampling_x;
+    ydec[pli] = xd->plane[pli].subsampling_y;
+    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+  }
+  const int stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * CDEF_HBORDER;
+  for (int pli = 0; pli < num_planes; pli++) {
+    /* Size the buffers by their uint16_t element type; sizeof(*linebuf) and
+       sizeof(*colbuf) would be the size of a pointer. */
+    linebuf[pli] = aom_malloc(sizeof(*linebuf[pli]) * CDEF_VBORDER * stride);
+    colbuf[pli] =
+        aom_malloc(sizeof(*colbuf[pli]) *
+                   ((CDEF_BLOCKSIZE << mi_high_l2[pli]) + 2 * CDEF_VBORDER) *
+                   CDEF_HBORDER);
+  }
+  for (int fbr = 0; fbr < nvfb; fbr++) {
+    for (int pli = 0; pli < num_planes; pli++) {
+      const int block_height =
+          (MI_SIZE_64X64 << mi_high_l2[pli]) + 2 * CDEF_VBORDER;
+      fill_rect(colbuf[pli], CDEF_HBORDER, block_height, CDEF_HBORDER,
+                CDEF_VERY_LARGE);
+    }
+    int cdef_left = 1;
+    for (int fbc = 0; fbc < nhfb; fbc++) {
+      int level, sec_strength;
+      int uv_level, uv_sec_strength;
+      int nhb, nvb;
+      int cstart = 0;
+      curr_row_cdef[fbc] = 0;
+      if (cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                              MI_SIZE_64X64 * fbc] == NULL ||
+          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                              MI_SIZE_64X64 * fbc]
+                  ->cdef_strength == -1) {
+        cdef_left = 0;
+        continue;
+      }
+      if (!cdef_left) cstart = -CDEF_HBORDER;
+      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+      int frame_top, frame_left, frame_bottom, frame_right;
+
+      int mi_row = MI_SIZE_64X64 * fbr;
+      int mi_col = MI_SIZE_64X64 * fbc;
+      // for the current filter block, its top left corner mi structure (mi_tl)
+      // is first accessed to check whether the top and left boundaries are
+      // frame boundaries. Then bottom-left and top-right mi structures are
+      // accessed to check whether the bottom and right boundaries
+      // (respectively) are frame boundaries.
+      //
+      // Note that we can't just check the bottom-right mi structure - eg. if
+      // we're at the right-hand edge of the frame but not the bottom, then
+      // the bottom-right mi is NULL but the bottom-left is not.
+      frame_top = (mi_row == 0) ? 1 : 0;
+      frame_left = (mi_col == 0) ? 1 : 0;
+
+      if (fbr != nvfb - 1)
+        frame_bottom = (mi_row + MI_SIZE_64X64 == cm->mi_rows) ? 1 : 0;
+      else
+        frame_bottom = 1;
+
+      if (fbc != nhfb - 1)
+        frame_right = (mi_col + MI_SIZE_64X64 == cm->mi_cols) ? 1 : 0;
+      else
+        frame_right = 1;
+
+      const int mbmi_cdef_strength =
+          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                              MI_SIZE_64X64 * fbc]
+              ->cdef_strength;
+      /* Each strength entry packs the primary level and the secondary
+         strength; a secondary value of 3 is mapped to 4. */
+      level = cm->cdef_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+      sec_strength =
+          cm->cdef_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+      sec_strength += sec_strength == 3;
+      uv_level = cm->cdef_uv_strengths[mbmi_cdef_strength] / CDEF_SEC_STRENGTHS;
+      uv_sec_strength =
+          cm->cdef_uv_strengths[mbmi_cdef_strength] % CDEF_SEC_STRENGTHS;
+      uv_sec_strength += uv_sec_strength == 3;
+      /* Nothing to do if all strengths are zero or every 8x8 block in this
+         filter block is skipped. cdef_count is only assigned (and only used)
+         when the strengths are not all zero. */
+      if ((level == 0 && sec_strength == 0 && uv_level == 0 &&
+           uv_sec_strength == 0) ||
+          (cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+                                             fbc * MI_SIZE_64X64, dlist,
+                                             BLOCK_64X64)) == 0) {
+        cdef_left = 0;
+        continue;
+      }
+
+      curr_row_cdef[fbc] = 1;
+      for (int pli = 0; pli < num_planes; pli++) {
+        int coffset;
+        int rend, cend;
+        int pri_damping = cm->cdef_pri_damping;
+        int sec_damping = cm->cdef_sec_damping;
+        int hsize = nhb << mi_wide_l2[pli];
+        int vsize = nvb << mi_high_l2[pli];
+
+        if (pli) {
+          level = uv_level;
+          sec_strength = uv_sec_strength;
+        }
+
+        if (fbc == nhfb - 1)
+          cend = hsize;
+        else
+          cend = hsize + CDEF_HBORDER;
+
+        if (fbr == nvfb - 1)
+          rend = vsize;
+        else
+          rend = vsize + CDEF_VBORDER;
+
+        coffset = fbc * MI_SIZE_64X64 << mi_wide_l2[pli];
+        if (fbc == nhfb - 1) {
+          /* On the last superblock column, fill in the right border with
+             CDEF_VERY_LARGE to avoid filtering with the outside. */
+          fill_rect(&src[cend + CDEF_HBORDER], CDEF_BSTRIDE,
+                    rend + CDEF_VBORDER, hsize + CDEF_HBORDER - cend,
+                    CDEF_VERY_LARGE);
+        }
+        if (fbr == nvfb - 1) {
+          /* On the last superblock row, fill in the bottom border with
+             CDEF_VERY_LARGE to avoid filtering with the outside. */
+          fill_rect(&src[(rend + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
+                    CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
+        }
+        /* Copy in the pixels we need from the current superblock for
+           deringing.*/
+        copy_sb8_16(cm,
+                    &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER + cstart],
+                    CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+                    (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr, coffset + cstart,
+                    xd->plane[pli].dst.stride, rend, cend - cstart);
+        /* Top border: straight from the frame if the block above was not
+           filtered, from linebuf if it was, or CDEF_VERY_LARGE on the first
+           row of filter blocks. */
+        if (!prev_row_cdef[fbc]) {
+          copy_sb8_16(cm, &src[CDEF_HBORDER], CDEF_BSTRIDE,
+                      xd->plane[pli].dst.buf,
+                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+                      coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
+        } else if (fbr > 0) {
+          copy_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, &linebuf[pli][coffset],
+                    stride, CDEF_VBORDER, hsize);
+        } else {
+          fill_rect(&src[CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER, hsize,
+                    CDEF_VERY_LARGE);
+        }
+        /* Top-left corner, same three cases. */
+        if (!prev_row_cdef[fbc - 1]) {
+          copy_sb8_16(cm, src, CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+                      coffset - CDEF_HBORDER, xd->plane[pli].dst.stride,
+                      CDEF_VBORDER, CDEF_HBORDER);
+        } else if (fbr > 0 && fbc > 0) {
+          copy_rect(src, CDEF_BSTRIDE, &linebuf[pli][coffset - CDEF_HBORDER],
+                    stride, CDEF_VBORDER, CDEF_HBORDER);
+        } else {
+          fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, CDEF_HBORDER,
+                    CDEF_VERY_LARGE);
+        }
+        /* Top-right corner, same three cases. */
+        if (!prev_row_cdef[fbc + 1]) {
+          copy_sb8_16(cm, &src[CDEF_HBORDER + (nhb << mi_wide_l2[pli])],
+                      CDEF_BSTRIDE, xd->plane[pli].dst.buf,
+                      (MI_SIZE_64X64 << mi_high_l2[pli]) * fbr - CDEF_VBORDER,
+                      coffset + hsize, xd->plane[pli].dst.stride, CDEF_VBORDER,
+                      CDEF_HBORDER);
+        } else if (fbr > 0 && fbc < nhfb - 1) {
+          copy_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+                    &linebuf[pli][coffset + hsize], stride, CDEF_VBORDER,
+                    CDEF_HBORDER);
+        } else {
+          fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE, CDEF_VBORDER,
+                    CDEF_HBORDER, CDEF_VERY_LARGE);
+        }
+        if (cdef_left) {
+          /* If we deringed the superblock on the left then we need to copy in
+             saved pixels. */
+          copy_rect(src, CDEF_BSTRIDE, colbuf[pli], CDEF_HBORDER,
+                    rend + CDEF_VBORDER, CDEF_HBORDER);
+        }
+        /* Saving pixels in case we need to dering the superblock on the
+           right. */
+        copy_rect(colbuf[pli], CDEF_HBORDER, src + hsize, CDEF_BSTRIDE,
+                  rend + CDEF_VBORDER, CDEF_HBORDER);
+        /* Save this block's bottom rows into linebuf before they are
+           filtered. */
+        copy_sb8_16(
+            cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
+            (MI_SIZE_64X64 << mi_high_l2[pli]) * (fbr + 1) - CDEF_VBORDER,
+            coffset, xd->plane[pli].dst.stride, CDEF_VBORDER, hsize);
+
+        /* Borders that coincide with a frame edge are overwritten with
+           CDEF_VERY_LARGE. */
+        if (frame_top) {
+          fill_rect(src, CDEF_BSTRIDE, CDEF_VBORDER, hsize + 2 * CDEF_HBORDER,
+                    CDEF_VERY_LARGE);
+        }
+        if (frame_left) {
+          fill_rect(src, CDEF_BSTRIDE, vsize + 2 * CDEF_VBORDER, CDEF_HBORDER,
+                    CDEF_VERY_LARGE);
+        }
+        if (frame_bottom) {
+          fill_rect(&src[(vsize + CDEF_VBORDER) * CDEF_BSTRIDE], CDEF_BSTRIDE,
+                    CDEF_VBORDER, hsize + 2 * CDEF_HBORDER, CDEF_VERY_LARGE);
+        }
+        if (frame_right) {
+          fill_rect(&src[hsize + CDEF_HBORDER], CDEF_BSTRIDE,
+                    vsize + 2 * CDEF_VBORDER, CDEF_HBORDER, CDEF_VERY_LARGE);
+        }
+
+        if (cm->seq_params.use_highbitdepth) {
+          cdef_filter_fb(
+              NULL,
+              &CONVERT_TO_SHORTPTR(
+                  xd->plane[pli]
+                      .dst.buf)[xd->plane[pli].dst.stride *
+                                    (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+                                (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+              xd->plane[pli].dst.stride,
+              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
+              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+              sec_strength, pri_damping, sec_damping, coeff_shift);
+        } else {
+          cdef_filter_fb(
+              &xd->plane[pli]
+                   .dst.buf[xd->plane[pli].dst.stride *
+                                (MI_SIZE_64X64 * fbr << mi_high_l2[pli]) +
+                            (fbc * MI_SIZE_64X64 << mi_wide_l2[pli])],
+              NULL, xd->plane[pli].dst.stride,
+              &src[CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER], xdec[pli],
+              ydec[pli], dir, NULL, var, pli, dlist, cdef_count, level,
+              sec_strength, pri_damping, sec_damping, coeff_shift);
+        }
+      }
+      cdef_left = 1;
+    }
+    {
+      /* The current row's flags become the previous row's for the next
+         iteration. */
+      unsigned char *tmp = prev_row_cdef;
+      prev_row_cdef = curr_row_cdef;
+      curr_row_cdef = tmp;
+    }
+  }
+  aom_free(row_cdef);
+  for (int pli = 0; pli < num_planes; pli++) {
+    aom_free(linebuf[pli]);
+    aom_free(colbuf[pli]);
+  }
+}
diff --git a/third_party/aom/av1/common/cdef.h b/third_party/aom/av1/common/cdef.h
new file mode 100644
index 0000000000..3b2eac8a5c
--- /dev/null
+++ b/third_party/aom/av1/common/cdef.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_CDEF_H_
+#define AOM_AV1_COMMON_CDEF_H_
+
+#define CDEF_STRENGTH_BITS 6
+
+#define CDEF_PRI_STRENGTHS 16
+#define CDEF_SEC_STRENGTHS 4
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "av1/common/cdef_block.h"
+#include "av1/common/onyxc_int.h"
+
+/* Returns -1 for negative |i| and 1 otherwise (including zero). */
+static INLINE int sign(int i) {
+  if (i < 0) return -1;
+  return 1;
+}
+
+/* Limits |diff| according to |threshold| and |damping|. Returns 0 when
+ * |threshold| is 0. Otherwise the magnitude of the result is the smaller of
+ * |diff|'s magnitude and max(0, threshold - (|diff| >> shift)), where
+ * shift = max(0, damping - get_msb(threshold)); the sign of |diff| is kept. */
+static INLINE int constrain(int diff, int threshold, int damping) {
+  if (!threshold) return 0;
+
+  const int shift = AOMMAX(0, damping - get_msb(threshold));
+  const int magnitude = abs(diff);
+  const int limit = AOMMAX(0, threshold - (magnitude >> shift));
+  return sign(diff) * AOMMIN(magnitude, limit);
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int sb_all_skip(const AV1_COMMON *const cm, int mi_row, int mi_col);
+int sb_compute_cdef_list(const AV1_COMMON *const cm, int mi_row, int mi_col,
+ cdef_list *dlist, BLOCK_SIZE bsize);
+void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+ AV1_COMMON *cm, MACROBLOCKD *xd, int fast);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_CDEF_H_
diff --git a/third_party/aom/av1/common/cdef_block.c b/third_party/aom/av1/common/cdef_block.c
new file mode 100644
index 0000000000..df1de89be3
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef.h"
+
+/* Generated from gen_filter_tables.c. */
+/* One row per direction index (0..7). Each entry is a sample offset written
+ * as row * CDEF_BSTRIDE + column, i.e. a displacement inside a buffer whose
+ * row pitch is CDEF_BSTRIDE. The two entries of a row are the first and
+ * second tap positions for that direction; callers add the offset for one
+ * side and subtract it for the opposite side. */
+DECLARE_ALIGNED(16, const int, cdef_directions[8][2]) = {
+ { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 },
+ { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 },
+ { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }
+};
+
+/* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on.
+ The search minimizes the weighted variance along all the lines in a
+ particular direction, i.e. the squared error between the input and a
+ "predicted" block where each pixel is replaced by the average along a line
+ in a particular direction. Since each direction has the same sum(x^2) term,
+ that term is never computed. See Section 2, step 2, of:
+ http://jmvalin.ca/notes/intra_paint.pdf */
+/* Finds the dominant direction (0..7) of the 8x8 block at |img| and returns
+ * it. |stride| is the row pitch of |img| in elements and samples are shifted
+ * right by |coeff_shift| before use. *var receives the difference between
+ * the best direction's cost and the cost of the direction four steps away,
+ * shifted right by 10. Ties keep the lowest direction index. */
+int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var,
+                    int coeff_shift) {
+  /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n.
+     The output is then 840 times larger, but we don't care for finding
+     the max. */
+  static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 };
+  int32_t cost[8] = { 0 };
+  int partial[8][15] = { { 0 } };
+
+  /* Accumulate, for every direction, the sum of the samples lying on each
+     line of that direction. */
+  for (int row = 0; row < 8; row++) {
+    for (int col = 0; col < 8; col++) {
+      /* We subtract 128 here to reduce the maximum range of the squared
+         partial sums. */
+      const int x = (img[row * stride + col] >> coeff_shift) - 128;
+      partial[0][row + col] += x;
+      partial[1][row + col / 2] += x;
+      partial[2][row] += x;
+      partial[3][3 + row - col / 2] += x;
+      partial[4][7 + row - col] += x;
+      partial[5][3 - row / 2 + col] += x;
+      partial[6][col] += x;
+      partial[7][row / 2 + col] += x;
+    }
+  }
+
+  /* Directions 2 and 6: eight lines of eight samples each. */
+  for (int k = 0; k < 8; k++) {
+    cost[2] += partial[2][k] * partial[2][k];
+    cost[6] += partial[6][k] * partial[6][k];
+  }
+  cost[2] *= div_table[8];
+  cost[6] *= div_table[8];
+
+  /* Directions 0 and 4: line k and line 14 - k both hold k + 1 samples; the
+     middle line (7) holds eight. */
+  for (int k = 0; k < 7; k++) {
+    const int weight = div_table[k + 1];
+    cost[0] += (partial[0][k] * partial[0][k] +
+                partial[0][14 - k] * partial[0][14 - k]) *
+               weight;
+    cost[4] += (partial[4][k] * partial[4][k] +
+                partial[4][14 - k] * partial[4][14 - k]) *
+               weight;
+  }
+  cost[0] += partial[0][7] * partial[0][7] * div_table[8];
+  cost[4] += partial[4][7] * partial[4][7] * div_table[8];
+
+  /* Odd directions: lines 3..7 hold eight samples; lines k and 10 - k
+     (k = 0..2) hold 2k + 2 samples. */
+  for (int d = 1; d < 8; d += 2) {
+    for (int k = 3; k <= 7; k++) {
+      cost[d] += partial[d][k] * partial[d][k];
+    }
+    cost[d] *= div_table[8];
+    for (int k = 0; k < 3; k++) {
+      cost[d] += (partial[d][k] * partial[d][k] +
+                  partial[d][10 - k] * partial[d][10 - k]) *
+                 div_table[2 * k + 2];
+    }
+  }
+
+  int32_t best_cost = 0;
+  int best_dir = 0;
+  for (int d = 0; d < 8; d++) {
+    if (cost[d] > best_cost) {
+      best_cost = cost[d];
+      best_dir = d;
+    }
+  }
+
+  /* Difference between the optimal variance and the variance along the
+     orthogonal direction. Again, the sum(x^2) terms cancel out.
+     We'd normally divide by 840, but dividing by 1024 is close enough
+     for what we're going to do with this. */
+  *var = (best_cost - cost[(best_dir + 4) & 7]) >> 10;
+  return best_dir;
+}
+
+const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } };
+const int cdef_sec_taps[2][2] = { { 2, 1 }, { 2, 1 } };
+
+/* Smooth in the direction detected. */
+/* Filters one block read from |in| (row pitch CDEF_BSTRIDE) and writes the
+ * result to |dst8| when it is non-NULL, otherwise to |dst16|; |dstride| is
+ * the destination row pitch. The block is 8 rows tall for BLOCK_8X8 and
+ * BLOCK_4X8 and 8 columns wide for BLOCK_8X8 and BLOCK_8X4; otherwise the
+ * dimension is 4.
+ *
+ * For each sample, primary taps are applied along |dir| and secondary taps
+ * along (dir + 2) & 7 and (dir + 6) & 7, each difference first passed
+ * through constrain(). The filtered value is clamped to the min/max of the
+ * sample and the neighbours that were read; neighbours equal to
+ * CDEF_VERY_LARGE are excluded from the max.
+ *
+ * |max_unused| is not referenced by this implementation. */
+void cdef_filter_block_c(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength, int sec_strength,
+ int dir, int pri_damping, int sec_damping, int bsize,
+ AOM_UNUSED int max_unused, int coeff_shift) {
+ int i, j, k;
+ const int s = CDEF_BSTRIDE;
+ /* Both tap sets are selected by the low bit of the unscaled primary
+ strength. */
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+ for (i = 0; i < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_4X8); i++) {
+ for (j = 0; j < 4 << (bsize == BLOCK_8X8 || bsize == BLOCK_8X4); j++) {
+ int16_t sum = 0;
+ int16_t y;
+ int16_t x = in[i * s + j];
+ int max = x;
+ int min = x;
+ for (k = 0; k < 2; k++) {
+ /* Primary taps: the two samples at distance k + 1 on either side
+ along the detected direction. */
+ int16_t p0 = in[i * s + j + cdef_directions[dir][k]];
+ int16_t p1 = in[i * s + j - cdef_directions[dir][k]];
+ sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping);
+ sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping);
+ if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max);
+ if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max);
+ min = AOMMIN(p0, min);
+ min = AOMMIN(p1, min);
+ /* Secondary taps: four samples along the two directions offset by
+ +2 and +6 (mod 8) from |dir|. */
+ int16_t s0 = in[i * s + j + cdef_directions[(dir + 2) & 7][k]];
+ int16_t s1 = in[i * s + j - cdef_directions[(dir + 2) & 7][k]];
+ int16_t s2 = in[i * s + j + cdef_directions[(dir + 6) & 7][k]];
+ int16_t s3 = in[i * s + j - cdef_directions[(dir + 6) & 7][k]];
+ if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max);
+ if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max);
+ if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max);
+ if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max);
+ min = AOMMIN(s0, min);
+ min = AOMMIN(s1, min);
+ min = AOMMIN(s2, min);
+ min = AOMMIN(s3, min);
+ sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping);
+ sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping);
+ }
+ /* Add the tap sum divided by 16 (with rounding) to the sample, then
+ clamp to the neighbourhood range. */
+ y = clamp((int16_t)x + ((8 + sum - (sum < 0)) >> 4), min, max);
+ if (dst8)
+ dst8[i * dstride + j] = (uint8_t)y;
+ else
+ dst16[i * dstride + j] = (uint16_t)y;
+ }
+ }
+}
+
+/* Compute the primary filter strength for an 8x8 block based on the
+ directional variance difference. A high variance difference means
+ that we have a highly directional pattern (e.g. a high contrast
+ edge), so we can apply more deringing. A low variance means that we
+ either have a low contrast edge, or a non-directional texture, so
+ we want to be careful not to blur. */
+/* Scales |strength| by (4 + i) / 16 with rounding, where i is
+ * min(get_msb(var >> 6), 12) when var >> 6 is non-zero and 0 otherwise.
+ * Returns 0 when |var| is 0. */
+static INLINE int adjust_strength(int strength, int32_t var) {
+  /* We use the variance of 8x8 blocks to adjust the strength. */
+  if (!var) return 0;
+
+  const int scaled_var = var >> 6;
+  int i = 0;
+  if (scaled_var) i = AOMMIN(get_msb(scaled_var), 12);
+  return (strength * (4 + i) + 8) >> 4;
+}
+
+/* Filters every block listed in |dlist| (|cdef_count| entries) of one plane
+ * of a filter block. |in| points at the top-left sample of the padded input
+ * (row pitch CDEF_BSTRIDE). Output goes to |dst8| when non-NULL, otherwise
+ * to |dst16|.
+ *
+ * |xdec| / |ydec| are the plane's subsampling shifts and select the block
+ * size. |level| and |sec_strength| are scaled up by |coeff_shift|; both
+ * dampings get |coeff_shift| added and are reduced by one for non-luma
+ * planes.
+ *
+ * |dir| and |var| are filled in when pli == 0 and read for the other planes.
+ * When |dirinit| is non-NULL, output blocks are written contiguously into
+ * |dst16| (indexed by list position) instead of at their position in a
+ * |dstride|-pitched image, and *dirinit records that directions have already
+ * been computed. */
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift) {
+ int bi;
+ int bx;
+ int by;
+ int bsize, bsizex, bsizey;
+
+ int pri_strength = level << coeff_shift;
+ sec_strength <<= coeff_shift;
+ sec_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ pri_damping += coeff_shift - (pli != AOM_PLANE_Y);
+ bsize =
+ ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
+ /* log2 of the block width / height in samples for this plane. */
+ bsizex = 3 - xdec;
+ bsizey = 3 - ydec;
+ if (dirinit && pri_strength == 0 && sec_strength == 0) {
+ // If we're here, both primary and secondary strengths are 0, and
+ // we still haven't written anything to y[] yet, so we just copy
+ // the input to y[]. This is necessary only for av1_cdef_search()
+ // and only av1_cdef_search() sets dirinit.
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ int iy, ix;
+ // TODO(stemidts/jmvalin): SIMD optimisations
+ for (iy = 0; iy < 1 << bsizey; iy++)
+ for (ix = 0; ix < 1 << bsizex; ix++)
+ dst16[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
+ in[((by << bsizey) + iy) * CDEF_BSTRIDE + (bx << bsizex) + ix];
+ }
+ return;
+ }
+
+ if (pli == 0) {
+ /* Compute direction and variance for each listed 8x8 luma block, unless
+ the caller signals through *dirinit that this was already done. */
+ if (!dirinit || !*dirinit) {
+ for (bi = 0; bi < cdef_count; bi++) {
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx],
+ CDEF_BSTRIDE, &var[by][bx], coeff_shift);
+ }
+ if (dirinit) *dirinit = 1;
+ }
+ }
+ if (pli == 1 && xdec != ydec) {
+ /* Horizontal and vertical subsampling differ: remap each stored
+ direction through conv422 (xdec set) or conv440 (ydec set). The
+ remapped value is written back into dir[][]. */
+ for (bi = 0; bi < cdef_count; bi++) {
+ static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 };
+ static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 };
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]];
+ }
+ }
+
+ for (bi = 0; bi < cdef_count; bi++) {
+ /* Entries flagged as skip are filtered with zero strengths. */
+ int t = dlist[bi].skip ? 0 : pri_strength;
+ int s = dlist[bi].skip ? 0 : sec_strength;
+ by = dlist[bi].by;
+ bx = dlist[bi].bx;
+ /* For luma (pli == 0) the primary strength is adjusted by the block's
+ variance; direction 0 is passed when the primary strength is 0. */
+ if (dst8)
+ cdef_filter_block(&dst8[(by << bsizey) * dstride + (bx << bsizex)], NULL,
+ dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s,
+ t ? dir[by][bx] : 0, pri_damping, sec_damping, bsize,
+ (256 << coeff_shift) - 1, coeff_shift);
+ else
+ cdef_filter_block(
+ NULL,
+ &dst16[dirinit ? bi << (bsizex + bsizey)
+ : (by << bsizey) * dstride + (bx << bsizex)],
+ dirinit ? 1 << bsizex : dstride,
+ &in[(by * CDEF_BSTRIDE << bsizey) + (bx << bsizex)],
+ (pli ? t : adjust_strength(t, var[by][bx])), s, t ? dir[by][bx] : 0,
+ pri_damping, sec_damping, bsize, (256 << coeff_shift) - 1,
+ coeff_shift);
+ }
+}
diff --git a/third_party/aom/av1/common/cdef_block.h b/third_party/aom/av1/common/cdef_block.h
new file mode 100644
index 0000000000..6b4452cd64
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_H_
+
+#include "av1/common/odintrin.h"
+
+#define CDEF_BLOCKSIZE 64
+#define CDEF_BLOCKSIZE_LOG2 6
+#define CDEF_NBLOCKS ((1 << MAX_SB_SIZE_LOG2) / 8)
+#define CDEF_SB_SHIFT (MAX_SB_SIZE_LOG2 - CDEF_BLOCKSIZE_LOG2)
+
+/* We need to buffer three vertical lines. */
+#define CDEF_VBORDER (3)
+/* We only need to buffer three horizontal pixels too, but let's align to
+ 16 bytes (8 x 16 bits) to make vectorization easier. */
+#define CDEF_HBORDER (8)
+#define CDEF_BSTRIDE \
+ ALIGN_POWER_OF_TWO((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_HBORDER, 3)
+
+#define CDEF_VERY_LARGE (30000)
+#define CDEF_INBUF_SIZE \
+ (CDEF_BSTRIDE * ((1 << MAX_SB_SIZE_LOG2) + 2 * CDEF_VBORDER))
+
+extern const int cdef_pri_taps[2][2];
+extern const int cdef_sec_taps[2][2];
+DECLARE_ALIGNED(16, extern const int, cdef_directions[8][2]);
+
+/* One entry in the list of blocks to be filtered within a filter block. */
+typedef struct {
+ uint8_t by; /* block row, in units of 8x8 blocks */
+ uint8_t bx; /* block column, in units of 8x8 blocks */
+ uint8_t skip; /* when non-zero, cdef_filter_fb() uses zero strengths */
+} cdef_list;
+
+typedef void (*cdef_filter_block_func)(uint8_t *dst8, uint16_t *dst16,
+ int dstride, const uint16_t *in,
+ int pri_strength, int sec_strength,
+ int dir, int pri_damping,
+ int sec_damping, int bsize, int max,
+ int coeff_shift);
+void copy_cdef_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
+ cdef_list *dlist, int cdef_count, int bsize);
+
+void cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, uint16_t *in,
+ int xdec, int ydec, int dir[CDEF_NBLOCKS][CDEF_NBLOCKS],
+ int *dirinit, int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli,
+ cdef_list *dlist, int cdef_count, int level,
+ int sec_strength, int pri_damping, int sec_damping,
+ int coeff_shift);
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_H_
diff --git a/third_party/aom/av1/common/cdef_block_avx2.c b/third_party/aom/av1/common/cdef_block_avx2.c
new file mode 100644
index 0000000000..e2b85b3e28
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_avx2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_avx2
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_neon.c b/third_party/aom/av1/common/cdef_block_neon.c
new file mode 100644
index 0000000000..2d6bc65e31
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_neon.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_neon
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_simd.h b/third_party/aom/av1/common/cdef_block_simd.h
new file mode 100644
index 0000000000..14587a023a
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_simd.h
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+#define AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cdef_block.h"
+
+/* partiala is a vector of eight 16-bit values of the form
+   [x8 x7 x6 x5 x4 x3 x2 x1] and partialb has the form
+   [0 y1 y2 y3 y4 y5 y6 y7].
+   The terms of
+     (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ... + (x7^2+y7^2)*C7 + (x8^2+0^2)*C8
+   are produced here, with C1..C8 supplied as 32-bit lanes in const1 and
+   const2. The return value still holds four 32-bit partial sums (lane-wise
+   sum of the two weighted halves); the caller in this file reduces them
+   with hsum4(). */
+static INLINE v128 fold_mul_and_sum(v128 partiala, v128 partialb, v128 const1,
+                                    v128 const2) {
+  /* Reorder the y values of partial B so that they line up with the x values
+     of partial A. */
+  const v128 y_reordered = v128_shuffle_8(
+      partialb, v128_from_32(0x0f0e0100, 0x03020504, 0x07060908, 0x0b0a0d0c));
+  /* Interleave x and y values of identical indices (x8 is paired with 0). */
+  const v128 xy_lo = v128_ziplo_16(y_reordered, partiala);
+  const v128 xy_hi = v128_ziphi_16(y_reordered, partiala);
+  /* madd of a vector with itself: x^2 + y^2 for each interleaved pair. */
+  const v128 sq_lo = v128_madd_s16(xy_lo, xy_lo);
+  const v128 sq_hi = v128_madd_s16(xy_hi, xy_hi);
+  /* Apply the per-lane weights and combine the two halves lane by lane. */
+  const v128 weighted_lo = v128_mullo_s32(sq_lo, const1);
+  const v128 weighted_hi = v128_mullo_s32(sq_hi, const2);
+  return v128_add_32(weighted_lo, weighted_hi);
+}
+
+/* Reduces four vectors of four 32-bit lanes in one go: the zip network below
+   rearranges x0..x3 like a 4x4 transpose of their 32-bit lanes, and the
+   rearranged vectors are then added, so that each 32-bit lane of the result
+   is the sum over the lanes of one of the inputs. */
+static INLINE v128 hsum4(v128 x0, v128 x1, v128 x2, v128 x3) {
+  /* Stage 1: interleave 32-bit lanes of the (x1, x0) and (x3, x2) pairs. */
+  const v128 lo_10 = v128_ziplo_32(x1, x0);
+  const v128 lo_32 = v128_ziplo_32(x3, x2);
+  const v128 hi_10 = v128_ziphi_32(x1, x0);
+  const v128 hi_32 = v128_ziphi_32(x3, x2);
+  /* Stage 2: interleave 64-bit halves to finish the rearrangement. */
+  const v128 c0 = v128_ziplo_64(lo_32, lo_10);
+  const v128 c1 = v128_ziphi_64(lo_32, lo_10);
+  const v128 c2 = v128_ziplo_64(hi_32, hi_10);
+  const v128 c3 = v128_ziphi_64(hi_32, hi_10);
+  /* Lane-wise sum of the four rearranged vectors. */
+  return v128_add_32(v128_add_32(c0, c1), v128_add_32(c2, c3));
+}
+
+/* Computes cost for directions 4, 5, 6 and 7 (the partial4*, partial5*,
+ partial6 and partial7* sums below). We can call this function again, on
+ rotated input, to compute the remaining directions.
+ lines: eight input rows, one vector of 16-bit values per row.
+ tmp_cost1: receives the four 32-bit costs (written with an unaligned store).
+ Returns the same four costs as a vector. */
+static INLINE v128 compute_directions(v128 lines[8], int32_t tmp_cost1[4]) {
+ v128 partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
+ v128 partial6;
+ v128 tmp;
+ /* In every shl/shr pair below the two byte counts add up to 16, so the "a"
+ vector keeps the part of the shifted row that stays in the register and
+ the "b" vector keeps the part that was shifted out. */
+ /* Partial sums for lines 0 and 1. */
+ partial4a = v128_shl_n_byte(lines[0], 14);
+ partial4b = v128_shr_n_byte(lines[0], 2);
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[1], 12));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[1], 4));
+ /* Directions 5, 6 and 7 accumulate rows two at a time. */
+ tmp = v128_add_16(lines[0], lines[1]);
+ partial5a = v128_shl_n_byte(tmp, 10);
+ partial5b = v128_shr_n_byte(tmp, 6);
+ partial7a = v128_shl_n_byte(tmp, 4);
+ partial7b = v128_shr_n_byte(tmp, 12);
+ /* partial6 is the unshifted lane-wise sum of all rows. */
+ partial6 = tmp;
+
+ /* Partial sums for lines 2 and 3. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[2], 10));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[2], 6));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[3], 8));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[3], 8));
+ tmp = v128_add_16(lines[2], lines[3]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 8));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 8));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 6));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 10));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 4 and 5. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[4], 6));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[4], 10));
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[5], 4));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[5], 12));
+ tmp = v128_add_16(lines[4], lines[5]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 6));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 10));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 8));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 8));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Partial sums for lines 6 and 7. */
+ partial4a = v128_add_16(partial4a, v128_shl_n_byte(lines[6], 2));
+ partial4b = v128_add_16(partial4b, v128_shr_n_byte(lines[6], 14));
+ /* Line 7 is added unshifted, so it contributes nothing to partial4b. */
+ partial4a = v128_add_16(partial4a, lines[7]);
+ tmp = v128_add_16(lines[6], lines[7]);
+ partial5a = v128_add_16(partial5a, v128_shl_n_byte(tmp, 4));
+ partial5b = v128_add_16(partial5b, v128_shr_n_byte(tmp, 12));
+ partial7a = v128_add_16(partial7a, v128_shl_n_byte(tmp, 10));
+ partial7b = v128_add_16(partial7b, v128_shr_n_byte(tmp, 6));
+ partial6 = v128_add_16(partial6, tmp);
+
+ /* Compute costs in terms of partial sums. */
+ /* The weights are presumably 840 / n, n being the number of samples that
+ feed each partial sum (840, 420, 280, 210, 168, 140, 120, 105 for
+ n = 1..8) -- TODO confirm. */
+ partial4a =
+ fold_mul_and_sum(partial4a, partial4b, v128_from_32(210, 280, 420, 840),
+ v128_from_32(105, 120, 140, 168));
+ partial7a =
+ fold_mul_and_sum(partial7a, partial7b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ partial5a =
+ fold_mul_and_sum(partial5a, partial5b, v128_from_32(210, 420, 0, 0),
+ v128_from_32(105, 105, 105, 140));
+ /* Direction 6 needs no folding: square-and-add adjacent lanes, weight 105. */
+ partial6 = v128_madd_s16(partial6, partial6);
+ partial6 = v128_mullo_s32(partial6, v128_dup_32(105));
+
+ /* Reduce each of the four cost vectors to a single 32-bit lane. */
+ partial4a = hsum4(partial4a, partial5a, partial6, partial7a);
+ v128_store_unaligned(tmp_cost1, partial4a);
+ return partial4a;
+}
+
+/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
+ counter-clockwise rotation of the pixels.
+ in: eight input rows of eight 16-bit values each.
+ res: eight output rows. Every element of in[] is consumed into the tr0_*
+ temporaries before the first write to res[], so in and res may point
+ to the same array (cdef_find_dir in this file calls it that way). */
+static INLINE void array_reverse_transpose_8x8(v128 *in, v128 *res) {
+ /* Stage 1: interleave 16-bit lanes of adjacent row pairs. */
+ const v128 tr0_0 = v128_ziplo_16(in[1], in[0]);
+ const v128 tr0_1 = v128_ziplo_16(in[3], in[2]);
+ const v128 tr0_2 = v128_ziphi_16(in[1], in[0]);
+ const v128 tr0_3 = v128_ziphi_16(in[3], in[2]);
+ const v128 tr0_4 = v128_ziplo_16(in[5], in[4]);
+ const v128 tr0_5 = v128_ziplo_16(in[7], in[6]);
+ const v128 tr0_6 = v128_ziphi_16(in[5], in[4]);
+ const v128 tr0_7 = v128_ziphi_16(in[7], in[6]);
+
+ /* Stage 2: interleave 32-bit lanes of the stage-1 results. */
+ const v128 tr1_0 = v128_ziplo_32(tr0_1, tr0_0);
+ const v128 tr1_1 = v128_ziplo_32(tr0_5, tr0_4);
+ const v128 tr1_2 = v128_ziphi_32(tr0_1, tr0_0);
+ const v128 tr1_3 = v128_ziphi_32(tr0_5, tr0_4);
+ const v128 tr1_4 = v128_ziplo_32(tr0_3, tr0_2);
+ const v128 tr1_5 = v128_ziplo_32(tr0_7, tr0_6);
+ const v128 tr1_6 = v128_ziphi_32(tr0_3, tr0_2);
+ const v128 tr1_7 = v128_ziphi_32(tr0_7, tr0_6);
+
+ /* Stage 3: interleave 64-bit halves; results are written to res[7] down to
+ res[0], which is what reverses the row order. */
+ res[7] = v128_ziplo_64(tr1_1, tr1_0);
+ res[6] = v128_ziphi_64(tr1_1, tr1_0);
+ res[5] = v128_ziplo_64(tr1_3, tr1_2);
+ res[4] = v128_ziphi_64(tr1_3, tr1_2);
+ res[3] = v128_ziplo_64(tr1_5, tr1_4);
+ res[2] = v128_ziphi_64(tr1_5, tr1_4);
+ res[1] = v128_ziplo_64(tr1_7, tr1_6);
+ res[0] = v128_ziphi_64(tr1_7, tr1_6);
+}
+
+/* Finds the direction (0..7) with the largest cost for an 8x8 block.
+ img, stride: first sample of the block and the distance, in uint16_t
+ elements, between consecutive rows; eight 16-bit samples are
+ loaded from each of eight rows.
+ var: output; receives (best cost - cost[(best_dir + 4) & 7]) >> 10.
+ coeff_shift: right shift applied to every sample before 128 is subtracted.
+ Returns the selected direction index. */
+int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var,
+ int coeff_shift) {
+ int i;
+ int32_t cost[8];
+ int32_t best_cost = 0;
+ int best_dir = 0;
+ v128 lines[8];
+ /* Load the rows, scale them down by coeff_shift and re-centre by 128. */
+ for (i = 0; i < 8; i++) {
+ lines[i] = v128_load_unaligned(&img[i * stride]);
+ lines[i] =
+ v128_sub_16(v128_shr_s16(lines[i], coeff_shift), v128_dup_16(128));
+ }
+
+ /* Compute "mostly vertical" directions. */
+ /* Fills cost[4..7]. */
+ v128 dir47 = compute_directions(lines, cost + 4);
+
+ /* Rotate the block in place so the same routine yields the other four. */
+ array_reverse_transpose_8x8(lines, lines);
+
+ /* Compute "mostly horizontal" directions. */
+ /* Fills cost[0..3]. */
+ v128 dir03 = compute_directions(lines, cost);
+
+ /* Lane-wise max of both halves, then fold so the overall maximum of the
+ eight costs ends up in (at least) the low 32-bit lane. */
+ v128 max = v128_max_s32(dir03, dir47);
+ max = v128_max_s32(max, v128_align(max, max, 8));
+ max = v128_max_s32(max, v128_align(max, max, 4));
+ best_cost = v128_low_u32(max);
+ /* Build a bit mask of the lanes whose cost equals the maximum; dir03 is
+ presumably packed into the low bits and dir47 above it, so that on a tie
+ the lowest direction index wins -- TODO confirm against the pack and
+ movemask lane order. */
+ v128 t =
+ v128_pack_s32_s16(v128_cmpeq_32(max, dir47), v128_cmpeq_32(max, dir03));
+ best_dir = v128_movemask_8(v128_pack_s16_s8(t, t));
+ /* x ^ (x - 1) keeps the lowest set bit and everything below it, so its most
+ significant bit is at the position of the lowest set bit of x. */
+ best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros
+
+ /* Difference between the optimal variance and the variance along the
+ orthogonal direction. Again, the sum(x^2) terms cancel out. */
+ *var = best_cost - cost[(best_dir + 4) & 7];
+ /* We'd normally divide by 840, but dividing by 1024 is close enough
+ for what we're going to do with this. */
+ *var >>= 10;
+ return best_dir;
+}
+
+// Per 16-bit lane:
+//   sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
+SIMD_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold,
+                             unsigned int adjdamp) {
+  const v256 delta = v256_sub_16(a, b);
+  // Arithmetic shift by 15: all-ones for negative lanes, zero otherwise.
+  const v256 neg_mask = v256_shr_n_s16(delta, 15);
+  const v256 magnitude = v256_abs_s16(delta);
+  // The saturating unsigned subtract supplies the max(0, ...) part.
+  const v256 limit = v256_ssub_u16(v256_dup_16(threshold),
+                                   v256_shr_u16(magnitude, adjdamp));
+  const v256 clipped = v256_min_s16(magnitude, limit);
+  // (mask + x) ^ mask negates x where mask is all-ones and keeps it elsewhere.
+  return v256_xor(v256_add_16(neg_mask, clipped), neg_mask);
+}
+
+// Per lane, with the 16-bit difference first packed down to 8 bits:
+//   sign(a - b) * min(abs(a - b), max(0, strength - (abs(a - b) >> adjdamp)))
+SIMD_INLINE v128 constrain(v256 a, v256 b, unsigned int strength,
+                           unsigned int adjdamp) {
+  const v256 delta16 = v256_sub_16(a, b);
+  // Narrow both halves of the 16-bit difference into one vector of 8-bit lanes.
+  const v128 delta8 =
+      v128_pack_s16_s8(v256_high_v128(delta16), v256_low_v128(delta16));
+  // All-ones in lanes where the difference is negative.
+  const v128 neg_mask = v128_cmplt_s8(delta8, v128_zero());
+  const v128 magnitude = v128_abs_s8(delta8);
+  // The saturating unsigned subtract supplies the max(0, ...) part.
+  const v128 limit =
+      v128_ssub_u8(v128_dup_8(strength), v128_shr_u8(magnitude, adjdamp));
+  const v128 clipped = v128_min_u8(magnitude, limit);
+  // (mask + x) ^ mask negates x where mask is all-ones and keeps it elsewhere.
+  return v128_xor(v128_add_8(neg_mask, clipped), neg_mask);
+}
+
+/* Filters one 4x4 block of 16-bit input samples and writes 8-bit output.
+ dst, dstride: output block and its row stride in uint8_t elements.
+ in: input samples with a row stride of CDEF_BSTRIDE elements. Taps are read
+ at +/- cdef_directions[...] element offsets from each row start, so the
+ caller must provide readable samples around the block.
+ pri_strength, sec_strength: primary / secondary strengths; a strength of 0
+ skips the corresponding filter stage entirely.
+ dir: direction index used to look up the tap offsets in cdef_directions.
+ pri_damping, sec_damping: damping values; each is reduced by the MSB
+ position of its (non-zero) strength, clamped at 0.
+ max_unused: ignored.
+ coeff_shift: used only when selecting the tap weights from pri_strength. */
+void SIMD_FUNC(cdef_filter_block_4x4_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ v128 p0, p1, p2, p3;
+ v256 sum, row, tap, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ /* Element offsets of the near ([0]) and far ([1]) taps: po* along dir,
+ s1o* along (dir + 2) & 7 and s2o* along (dir + 6) & 7. */
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ /* NOTE(review): sec_taps is selected with pri_strength, not sec_strength --
+ confirm this is intended. */
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ sum = v256_zero();
+ /* All four 4-sample rows are packed into one 256-bit vector. */
+ row = v256_from_v64(v64_load_aligned(&in[0 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[1 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[2 * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[3 * CDEF_BSTRIDE]));
+ /* max/min track the range of the centre samples and every tap read; the
+ result is clamped to that range at the end. */
+ max = min = row;
+
+ if (pri_strength) {
+ // Primary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po1]));
+ /* Taps equal to CDEF_VERY_LARGE are masked out (presumably to 0, assuming
+ v256_andn(a, b) is a & ~b) so they cannot raise max. The same pattern
+ is repeated for every tap below. */
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ /* p0 and p1 are byte-interleaved so that one multiply-add per 16-bit lane
+ applies the tap weight to both and adds the two products. */
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+ }
+
+ if (sec_strength) {
+ // Secondary near taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ /* The four constrained taps are first reduced to two 8-bit sums so the
+ same interleave + multiply-add pattern as above can be used. */
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE + s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap = v256_from_v64(v64_load_unaligned(&in[0 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[1 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[2 * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[3 * CDEF_BSTRIDE - s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+ }
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ /* The compare yields all-ones (-1) in negative lanes, so adding it
+ subtracts 1 from negative sums before the rounding shift. */
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ /* Clamp to the [min, max] range gathered from the centre and tap samples,
+ then narrow to 8 bits. */
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ res = v256_pack_s16_u8(res, res);
+
+ /* Four output bytes per row; row 0 comes from the highest 32 bits of p0. */
+ p0 = v256_low_v128(res);
+ u32_store_aligned(&dst[0 * dstride], v64_high_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[1 * dstride], v64_low_u32(v128_high_v64(p0)));
+ u32_store_aligned(&dst[2 * dstride], v64_high_u32(v128_low_v64(p0)));
+ u32_store_aligned(&dst[3 * dstride], v64_low_u32(v128_low_v64(p0)));
+}
+
+/* Filters one 8x8 block of 16-bit input samples and writes 8-bit output,
+ two rows per loop iteration.
+ dst, dstride: output block and its row stride in uint8_t elements.
+ in: input samples with a row stride of CDEF_BSTRIDE elements. Taps are read
+ at +/- cdef_directions[...] element offsets from each row start, so the
+ caller must provide readable samples around the block.
+ pri_strength, sec_strength: primary / secondary strengths. Unlike
+ cdef_filter_block_4x4_8 in this file, the tap stages below are not
+ guarded by the strengths and always run.
+ dir: direction index used to look up the tap offsets in cdef_directions.
+ pri_damping, sec_damping: damping values; each is reduced by the MSB
+ position of its strength (clamped at 0) when that strength is non-zero.
+ max_unused: ignored.
+ coeff_shift: used only when selecting the tap weights from pri_strength. */
+void SIMD_FUNC(cdef_filter_block_8x8_8)(uint8_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v128 p0, p1, p2, p3;
+ v256 sum, row, res, tap;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ /* Element offsets of the near ([0]) and far ([1]) taps: po* along dir,
+ s1o* along (dir + 2) & 7 and s2o* along (dir + 6) & 7. */
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ /* NOTE(review): sec_taps is selected with pri_strength, not sec_strength --
+ confirm this is intended. */
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ for (i = 0; i < 8; i += 2) {
+ sum = v256_zero();
+ /* Rows i and i + 1 (eight samples each) share one 256-bit vector. */
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+ /* max/min track the range of the centre samples and every tap read; the
+ result is clamped to that range at the end of the iteration. */
+ max = min = row;
+ // Primary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ /* Taps equal to CDEF_VERY_LARGE are masked out (presumably to 0, assuming
+ v256_andn(a, b) is a & ~b) so they cannot raise max. The same pattern
+ is repeated for every tap below. */
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ /* p0 and p1 are byte-interleaved so that one multiply-add per 16-bit lane
+ applies the tap weight to both and adds the two products. */
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Primary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, pri_strength, pri_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(pri_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p1),
+ v128_ziplo_8(p0, p1))));
+
+ // Secondary near taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ /* The four constrained taps are first reduced to two 8-bit sums so the
+ same interleave + multiply-add pattern as above can be used. */
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[0]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // Secondary far taps
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p0 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p1 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p2 = constrain(tap, row, sec_strength, sec_damping);
+ tap =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+ max = v256_max_s16(max, v256_andn(tap, v256_cmpeq_16(tap, large)));
+ min = v256_min_s16(min, tap);
+ p3 = constrain(tap, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ p0 = v128_add_8(p0, p1);
+ p2 = v128_add_8(p2, p3);
+ sum = v256_add_16(sum, v256_madd_us8(v256_dup_8(sec_taps[1]),
+ v256_from_v128(v128_ziphi_8(p0, p2),
+ v128_ziplo_8(p0, p2))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ /* The compare yields all-ones (-1) in negative lanes, so adding it
+ subtracts 1 from negative sums before the rounding shift. */
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ /* Clamp to the [min, max] range gathered above, then narrow to 8 bits. */
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ res = v256_pack_s16_u8(res, res);
+
+ /* Eight output bytes per row: row i from the high half, row i + 1 from
+ the low half. */
+ p0 = v256_low_v128(res);
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(p0));
+ v64_store_aligned(&dst[(i + 1) * dstride], v128_low_v64(p0));
+ }
+}
+
+/* Filters one 4x4 block of 16-bit input samples and writes 16-bit output.
+ dst, dstride: output block and its row stride in uint16_t elements.
+ in: input samples with a row stride of CDEF_BSTRIDE elements. Taps are read
+ at +/- cdef_directions[...] element offsets from each row start, so the
+ caller must provide readable samples around the block.
+ pri_strength, sec_strength: primary / secondary strengths; the tap stages
+ below are not guarded by them and always run.
+ dir: direction index used to look up the tap offsets in cdef_directions.
+ pri_damping, sec_damping: damping values; each is reduced by the MSB
+ position of its strength (clamped at 0) when that strength is non-zero.
+ max_unused: ignored.
+ coeff_shift: used only when selecting the tap weights from pri_strength. */
+void SIMD_FUNC(cdef_filter_block_4x4_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v256 p0, p1, p2, p3, sum, row, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ /* Element offsets of the near ([0]) and far ([1]) taps: po* along dir,
+ s1o* along (dir + 2) & 7 and s2o* along (dir + 6) & 7. */
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ /* NOTE(review): sec_taps is selected with pri_strength, not sec_strength --
+ confirm this is intended. */
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+ /* With a bound of 4 and a step of 4 this loop body runs exactly once
+ (i == 0); all four rows are handled in a single 256-bit vector. */
+ for (i = 0; i < 4; i += 4) {
+ sum = v256_zero();
+ row = v256_from_v64(v64_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 2) * CDEF_BSTRIDE]),
+ v64_load_aligned(&in[(i + 3) * CDEF_BSTRIDE]));
+ /* max/min track the range of the centre samples and every tap read; the
+ result is clamped to that range at the end. */
+ min = max = row;
+
+ // Primary near taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po1]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po1]));
+ /* Taps equal to CDEF_VERY_LARGE are masked out (presumably to 0, assuming
+ v256_andn(a, b) is a & ~b) so they cannot raise max. The same pattern
+ is repeated for every tap group below. */
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ /* From here on p0/p1 hold the constrained differences, not the taps. */
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + po2]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - po2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - po2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+ // Secondary near taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o1]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o1]));
+ p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o1]));
+ p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o1]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s1o2]));
+ p1 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s1o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s1o2]));
+ p2 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE + s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE + s2o2]));
+ p3 = v256_from_v64(v64_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 2) * CDEF_BSTRIDE - s2o2]),
+ v64_load_unaligned(&in[(i + 3) * CDEF_BSTRIDE - s2o2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ /* The compare yields all-ones (-1) in negative lanes, so adding it
+ subtracts 1 from negative sums before the rounding shift. */
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ /* Clamp to the [min, max] range gathered above; no narrowing is needed
+ for 16-bit output. */
+ res = v256_min_s16(v256_max_s16(res, min), max);
+
+ /* Four 16-bit samples per row; row i comes from the highest 64 bits. */
+ v64_store_aligned(&dst[i * dstride], v128_high_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst[(i + 1) * dstride],
+ v128_low_v64(v256_high_v128(res)));
+ v64_store_aligned(&dst[(i + 2) * dstride],
+ v128_high_v64(v256_low_v128(res)));
+ v64_store_aligned(&dst[(i + 3) * dstride],
+ v128_low_v64(v256_low_v128(res)));
+ }
+}
+
+void SIMD_FUNC(cdef_filter_block_8x8_16)(uint16_t *dst, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir,
+ int pri_damping, int sec_damping,
+ AOM_UNUSED int max_unused,
+ int coeff_shift) {
+ int i;
+ v256 sum, p0, p1, p2, p3, row, res;
+ v256 max, min, large = v256_dup_16(CDEF_VERY_LARGE);
+ int po1 = cdef_directions[dir][0];
+ int po2 = cdef_directions[dir][1];
+ int s1o1 = cdef_directions[(dir + 2) & 7][0];
+ int s1o2 = cdef_directions[(dir + 2) & 7][1];
+ int s2o1 = cdef_directions[(dir + 6) & 7][0];
+ int s2o2 = cdef_directions[(dir + 6) & 7][1];
+
+ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
+ const int *sec_taps = cdef_sec_taps[(pri_strength >> coeff_shift) & 1];
+
+ if (pri_strength)
+ pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
+ if (sec_strength)
+ sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
+
+ for (i = 0; i < 8; i += 2) {
+ sum = v256_zero();
+ row = v256_from_v128(v128_load_aligned(&in[i * CDEF_BSTRIDE]),
+ v128_load_aligned(&in[(i + 1) * CDEF_BSTRIDE]));
+
+ min = max = row;
+ // Primary near taps
+ p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po1]));
+ p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[0] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[0]), v256_add_16(p0, p1)));
+
+ // Primary far taps
+ p0 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + po2]));
+ p1 = v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - po2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - po2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ min = v256_min_s16(v256_min_s16(min, p0), p1);
+ p0 = constrain16(p0, row, pri_strength, pri_damping);
+ p1 = constrain16(p1, row, pri_strength, pri_damping);
+
+ // sum += pri_taps[1] * (p0 + p1)
+ sum = v256_add_16(
+ sum, v256_mullo_s16(v256_dup_16(pri_taps[1]), v256_add_16(p0, p1)));
+
+ // Secondary near taps
+ p0 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o1]));
+ p1 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o1]));
+ p2 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o1]));
+ p3 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o1]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o1]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[0]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // Secondary far taps
+ p0 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s1o2]));
+ p1 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s1o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s1o2]));
+ p2 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE + s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE + s2o2]));
+ p3 =
+ v256_from_v128(v128_load_unaligned(&in[i * CDEF_BSTRIDE - s2o2]),
+ v128_load_unaligned(&in[(i + 1) * CDEF_BSTRIDE - s2o2]));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p0, v256_cmpeq_16(p0, large))),
+ v256_andn(p1, v256_cmpeq_16(p1, large)));
+ max =
+ v256_max_s16(v256_max_s16(max, v256_andn(p2, v256_cmpeq_16(p2, large))),
+ v256_andn(p3, v256_cmpeq_16(p3, large)));
+ min = v256_min_s16(
+ v256_min_s16(v256_min_s16(v256_min_s16(min, p0), p1), p2), p3);
+ p0 = constrain16(p0, row, sec_strength, sec_damping);
+ p1 = constrain16(p1, row, sec_strength, sec_damping);
+ p2 = constrain16(p2, row, sec_strength, sec_damping);
+ p3 = constrain16(p3, row, sec_strength, sec_damping);
+
+ // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
+ sum = v256_add_16(sum, v256_mullo_s16(v256_dup_16(sec_taps[1]),
+ v256_add_16(v256_add_16(p0, p1),
+ v256_add_16(p2, p3))));
+
+ // res = row + ((sum - (sum < 0) + 8) >> 4)
+ sum = v256_add_16(sum, v256_cmplt_s16(sum, v256_zero()));
+ res = v256_add_16(sum, v256_dup_16(8));
+ res = v256_shr_n_s16(res, 4);
+ res = v256_add_16(row, res);
+ res = v256_min_s16(v256_max_s16(res, min), max);
+ v128_store_unaligned(&dst[i * dstride], v256_high_v128(res));
+ v128_store_unaligned(&dst[(i + 1) * dstride], v256_low_v128(res));
+ }
+}
+
+// Filters one CDEF block by dispatching on output bit depth and block size.
+// A non-NULL dst8 selects the 8-bit kernels; otherwise the 16-bit kernels
+// write to dst16. BLOCK_8X8 uses the 8x8 kernel, BLOCK_4X8 and BLOCK_8X4 run
+// the 4x4 kernel once per 4x4 half, and any other bsize must be BLOCK_4X4.
+// All strength, damping, direction, max and coeff_shift arguments are passed
+// through to the selected kernel unchanged.
+void SIMD_FUNC(cdef_filter_block)(uint8_t *dst8, uint16_t *dst16, int dstride,
+ const uint16_t *in, int pri_strength,
+ int sec_strength, int dir, int pri_damping,
+ int sec_damping, int bsize, int max,
+ int coeff_shift) {
+ if (dst8) {
+ if (bsize == BLOCK_8X8) {
+ SIMD_FUNC(cdef_filter_block_8x8_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_4X8) {
+ // Top 4x4, then the 4x4 starting four rows further down.
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_8X4) {
+ // Left 4x4, then the 4x4 starting four columns to the right.
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else {
+ // Same precondition the 16-bit path below checks.
+ assert(bsize == BLOCK_4X4);
+ SIMD_FUNC(cdef_filter_block_4x4_8)
+ (dst8, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ }
+ } else {
+ if (bsize == BLOCK_8X8) {
+ SIMD_FUNC(cdef_filter_block_8x8_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_4X8) {
+ // Top 4x4, then the 4x4 starting four rows further down.
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16 + 4 * dstride, dstride, in + 4 * CDEF_BSTRIDE, pri_strength,
+ sec_strength, dir, pri_damping, sec_damping, max, coeff_shift);
+ } else if (bsize == BLOCK_8X4) {
+ // Left 4x4, then the 4x4 starting four columns to the right.
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16 + 4, dstride, in + 4, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ } else {
+ assert(bsize == BLOCK_4X4);
+ SIMD_FUNC(cdef_filter_block_4x4_16)
+ (dst16, dstride, in, pri_strength, sec_strength, dir, pri_damping,
+ sec_damping, max, coeff_shift);
+ }
+ }
+}
+
+void SIMD_FUNC(copy_rect8_8bit_to_16bit)(uint16_t *dst, int dstride,  // Copy a rectangle of 8-bit samples into a 16-bit buffer.
+ const uint8_t *src, int sstride, int v,  // v: number of rows to copy
+ int h) {  // h: number of samples per row
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {  // vector path: 8 samples per step over the multiple-of-8 prefix
+ v64 row = v64_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row));  // widen the loaded bytes to 16-bit lanes
+ }
+ for (; j < h; j++) {  // scalar tail for the samples left after the vector loop
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+void SIMD_FUNC(copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride,  // Copy a rectangle of 16-bit samples between two 16-bit buffers.
+ const uint16_t *src, int sstride,
+ int v, int h) {  // v: number of rows, h: number of samples per row
+ int i, j;
+ for (i = 0; i < v; i++) {
+ for (j = 0; j < (h & ~0x7); j += 8) {  // vector path: 8 samples per step over the multiple-of-8 prefix
+ v128 row = v128_load_unaligned(&src[i * sstride + j]);
+ v128_store_unaligned(&dst[i * dstride + j], row);
+ }
+ for (; j < h; j++) {  // scalar tail for the samples left after the vector loop
+ dst[i * dstride + j] = src[i * sstride + j];
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_CDEF_BLOCK_SIMD_H_
diff --git a/third_party/aom/av1/common/cdef_block_sse2.c b/third_party/aom/av1/common/cdef_block_sse2.c
new file mode 100644
index 0000000000..73f115d17c
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_sse2.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse2  // every SIMD_FUNC(x) in the header included below becomes x_sse2
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_sse4.c b/third_party/aom/av1/common/cdef_block_sse4.c
new file mode 100644
index 0000000000..349329af64
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_sse4.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_sse4_1  // every SIMD_FUNC(x) in the header included below becomes x_sse4_1
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cdef_block_ssse3.c b/third_party/aom/av1/common/cdef_block_ssse3.c
new file mode 100644
index 0000000000..3a93b150f3
--- /dev/null
+++ b/third_party/aom/av1/common/cdef_block_ssse3.c
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_simd.h"
+#define SIMD_FUNC(name) name##_ssse3  // every SIMD_FUNC(x) in the header included below becomes x_ssse3
+#include "av1/common/cdef_block_simd.h"
diff --git a/third_party/aom/av1/common/cfl.c b/third_party/aom/av1/common/cfl.c
new file mode 100644
index 0000000000..ccc59b4eb7
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.c
@@ -0,0 +1,448 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/cfl.h"
+#include "av1/common/common_data.h"
+#include "av1/common/onyxc_int.h"
+
+#include "config/av1_rtcd.h"
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params) {  // Reset the CfL context and copy the sequence's chroma subsampling into it.
+ assert(block_size_wide[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);  // buffer line length must match the largest CfL block width
+ assert(block_size_high[CFL_MAX_BLOCK_SIZE] == CFL_BUF_LINE);  // ...and its height
+
+ memset(&cfl->recon_buf_q3, 0, sizeof(cfl->recon_buf_q3));  // clear both working buffers
+ memset(&cfl->ac_buf_q3, 0, sizeof(cfl->ac_buf_q3));
+ cfl->subsampling_x = seq_params->subsampling_x;
+ cfl->subsampling_y = seq_params->subsampling_y;
+ cfl->are_parameters_computed = 0;
+ cfl->store_y = 0;
+ // The DC_PRED cache is disabled by default and is only enabled in
+ // cfl_rd_pick_alpha
+ cfl->use_dc_pred_cache = 0;
+ cfl->dc_pred_is_cached[CFL_PRED_U] = 0;
+ cfl->dc_pred_is_cached[CFL_PRED_V] = 0;
+}
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,  // Cache `width` samples from input as the DC prediction for pred_plane.
+ CFL_PRED_TYPE pred_plane, int width) {
+ assert(pred_plane < CFL_PRED_PLANES);
+ assert(width <= CFL_BUF_LINE);
+
+ if (get_bitdepth_data_path_index(xd)) {  // non-zero selects the 16-bit sample path
+ uint16_t *const input_16 = CONVERT_TO_SHORTPTR(input);
+ memcpy(xd->cfl.dc_pred_cache[pred_plane], input_16, width << 1);  // two bytes per sample
+ return;
+ }
+
+ memcpy(xd->cfl.dc_pred_cache[pred_plane], input, width);  // one byte per sample
+}
+
+static void cfl_load_dc_pred_lbd(const int16_t *dc_pred_cache, uint8_t *dst,  // Replicate the cached 8-bit DC row into `height` rows of dst.
+ int dst_stride, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ memcpy(dst, dc_pred_cache, width);  // copies `width` bytes: the cache is read as bytes here despite its int16_t type
+ dst += dst_stride;
+ }
+}
+
+static void cfl_load_dc_pred_hbd(const int16_t *dc_pred_cache, uint16_t *dst,  // Replicate the cached 16-bit DC row into `height` rows of dst.
+ int dst_stride, int width, int height) {
+ const size_t num_bytes = width << 1;  // two bytes per 16-bit sample
+ for (int j = 0; j < height; j++) {
+ memcpy(dst, dc_pred_cache, num_bytes);
+ dst += dst_stride;  // dst_stride is applied in uint16_t elements
+ }
+}
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,  // Fill a tx_size block at dst from the DC prediction cached for pred_plane.
+ TX_SIZE tx_size, CFL_PRED_TYPE pred_plane) {
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ assert(pred_plane < CFL_PRED_PLANES);
+ assert(width <= CFL_BUF_LINE);
+ assert(height <= CFL_BUF_LINE);
+ if (get_bitdepth_data_path_index(xd)) {  // non-zero selects the 16-bit sample path
+ uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+ cfl_load_dc_pred_hbd(xd->cfl.dc_pred_cache[pred_plane], dst_16, dst_stride,
+ width, height);
+ return;
+ }
+ cfl_load_dc_pred_lbd(xd->cfl.dc_pred_cache[pred_plane], dst, dst_stride,
+ width, height);
+}
+
+// Due to frame boundary issues, it is possible that the total area covered by
+// chroma exceeds that of luma. When this happens, we fill the missing pixels by
+// repeating the last columns and/or rows.
+static INLINE void cfl_pad(CFL_CTX *cfl, int width, int height) {  // Grows the stored area to width x height and updates buf_width/buf_height.
+ const int diff_width = width - cfl->buf_width;  // columns missing on the right
+ const int diff_height = height - cfl->buf_height;  // rows missing at the bottom
+
+ if (diff_width > 0) {
+ const int min_height = height - diff_height;  // equals cfl->buf_height: only rows already stored are widened here
+ uint16_t *recon_buf_q3 = cfl->recon_buf_q3 + (width - diff_width);  // first missing column (index buf_width) of row 0
+ for (int j = 0; j < min_height; j++) {
+ const uint16_t last_pixel = recon_buf_q3[-1];  // last stored sample of this row
+ assert(recon_buf_q3 + diff_width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+ for (int i = 0; i < diff_width; i++) {
+ recon_buf_q3[i] = last_pixel;
+ }
+ recon_buf_q3 += CFL_BUF_LINE;
+ }
+ cfl->buf_width = width;
+ }
+ if (diff_height > 0) {
+ uint16_t *recon_buf_q3 =
+ cfl->recon_buf_q3 + ((height - diff_height) * CFL_BUF_LINE);  // first missing row (index buf_height)
+ for (int j = 0; j < diff_height; j++) {
+ const uint16_t *last_row_q3 = recon_buf_q3 - CFL_BUF_LINE;  // the row directly above, already filled
+ assert(recon_buf_q3 + width <= cfl->recon_buf_q3 + CFL_BUF_SQUARE);
+ for (int i = 0; i < width; i++) {  // full target width, so columns padded above are copied down too
+ recon_buf_q3[i] = last_row_q3[i];
+ }
+ recon_buf_q3 += CFL_BUF_LINE;
+ }
+ cfl->buf_height = height;
+ }
+}
+
+static void subtract_average_c(const uint16_t *src, int16_t *dst, int width,  // Write src minus its block average into dst; both use a CFL_BUF_LINE row stride.
+ int height, int round_offset, int num_pel_log2) {
+ int sum = round_offset;  // seeding with the offset makes the shift below a rounded division
+ const uint16_t *recon = src;
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ sum += recon[i];
+ }
+ recon += CFL_BUF_LINE;
+ }
+ const int avg = sum >> num_pel_log2;  // divide by 2^num_pel_log2
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = src[i] - avg;
+ }
+ src += CFL_BUF_LINE;
+ dst += CFL_BUF_LINE;
+ }
+}
+
+CFL_SUB_AVG_FN(c)  // defines the subtract_average_WxH_c wrappers and get_subtract_average_fn_c() (macro in cfl.h)
+
+static INLINE int cfl_idx_to_alpha(int alpha_idx, int joint_sign,  // Decode the signed alpha for one chroma plane from the joint index and sign.
+ CFL_PRED_TYPE pred_type) {
+ const int alpha_sign = (pred_type == CFL_PRED_U) ? CFL_SIGN_U(joint_sign)
+ : CFL_SIGN_V(joint_sign);
+ if (alpha_sign == CFL_SIGN_ZERO) return 0;  // zero sign means alpha is exactly 0
+ const int abs_alpha_q3 =
+ (pred_type == CFL_PRED_U) ? CFL_IDX_U(alpha_idx) : CFL_IDX_V(alpha_idx);
+ return (alpha_sign == CFL_SIGN_POS) ? abs_alpha_q3 + 1 : -abs_alpha_q3 - 1;  // magnitude is the index plus one, so a non-zero sign never yields 0
+}
+
+static INLINE void cfl_predict_lbd_c(const int16_t *ac_buf_q3, uint8_t *dst,  // Add the alpha-scaled AC luma to the 8-bit samples already in dst, in place.
+ int dst_stride, int alpha_q3, int width,
+ int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = clip_pixel(get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i]);  // clip_pixel bounds the sum before the store
+ }
+ dst += dst_stride;
+ ac_buf_q3 += CFL_BUF_LINE;  // the AC buffer always uses a CFL_BUF_LINE row stride
+ }
+}
+
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *ac_buf_q3, uint8_t *dst,  // All arguments are ignored.
+ int dst_stride, int alpha_q3) {
+ (void)ac_buf_q3;
+ (void)dst;
+ (void)dst_stride;
+ (void)alpha_q3;
+ assert(0);  // reaching this is a bug; with asserts compiled out the call is a no-op
+}
+
+CFL_PREDICT_FN(c, lbd)  // NOTE(review): macro body is outside this view; presumably emits the size-specific lbd predictors
+
+void cfl_predict_hbd_c(const int16_t *ac_buf_q3, uint16_t *dst, int dst_stride,  // Add the alpha-scaled AC luma to the 16-bit samples already in dst, in place.
+ int alpha_q3, int bit_depth, int width, int height) {
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ dst[i] = clip_pixel_highbd(
+ get_scaled_luma_q0(alpha_q3, ac_buf_q3[i]) + dst[i], bit_depth);  // clipping uses the given bit_depth
+ }
+ dst += dst_stride;
+ ac_buf_q3 += CFL_BUF_LINE;  // the AC buffer always uses a CFL_BUF_LINE row stride
+ }
+}
+
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *ac_buf_q3, uint16_t *dst,  // All arguments are ignored.
+ int dst_stride, int alpha_q3, int bd) {
+ (void)ac_buf_q3;
+ (void)dst;
+ (void)dst_stride;
+ (void)alpha_q3;
+ (void)bd;
+ assert(0);  // reaching this is a bug; with asserts compiled out the call is a no-op
+}
+
+CFL_PREDICT_FN(c, hbd)  // NOTE(review): macro body is outside this view; presumably emits the size-specific hbd predictors
+
+static void cfl_compute_parameters(MACROBLOCKD *const xd, TX_SIZE tx_size) {  // Build ac_buf_q3 from recon_buf_q3 for a tx_size block.
+ CFL_CTX *const cfl = &xd->cfl;
+ // Do not call cfl_compute_parameters multiple times on the same values.
+ assert(cfl->are_parameters_computed == 0);
+
+ cfl_pad(cfl, tx_size_wide[tx_size], tx_size_high[tx_size]);  // extend the stored luma to the full transform size first
+ get_subtract_average_fn(tx_size)(cfl->recon_buf_q3, cfl->ac_buf_q3);  // ac = recon minus its block average
+ cfl->are_parameters_computed = 1;
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,  // Apply CfL prediction in place to the tx_size block at dst for the given plane.
+ TX_SIZE tx_size, int plane) {
+ CFL_CTX *const cfl = &xd->cfl;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(is_cfl_allowed(xd));
+
+ if (!cfl->are_parameters_computed) cfl_compute_parameters(xd, tx_size);  // computed lazily, at most once per stored luma
+
+ const int alpha_q3 =
+ cfl_idx_to_alpha(mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, plane - 1);  // plane - 1 is passed as the CFL_PRED_TYPE
+ assert((tx_size_high[tx_size] - 1) * CFL_BUF_LINE + tx_size_wide[tx_size] <=
+ CFL_BUF_SQUARE);
+ if (get_bitdepth_data_path_index(xd)) {  // non-zero selects the 16-bit sample path
+ uint16_t *dst_16 = CONVERT_TO_SHORTPTR(dst);
+ get_predict_hbd_fn(tx_size)(cfl->ac_buf_q3, dst_16, dst_stride, alpha_q3,
+ xd->bd);
+ return;
+ }
+ get_predict_lbd_fn(tx_size)(cfl->ac_buf_q3, dst, dst_stride, alpha_q3);
+}
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,  // All arguments are ignored.
+ uint16_t *output_q3) {
+ (void)input;
+ (void)input_stride;
+ (void)output_q3;
+ assert(0);  // reaching this is a bug; with asserts compiled out the call is a no-op
+}
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,  // All arguments are ignored.
+ uint16_t *output_q3) {
+ (void)input;
+ (void)input_stride;
+ (void)output_q3;
+ assert(0);  // reaching this is a bug; with asserts compiled out the call is a no-op
+}
+
+static void cfl_luma_subsampling_420_lbd_c(const uint8_t *input,  // 4:2:0: reduce each 2x2 group of 8-bit luma to one Q3 value.
+ int input_stride,
+ uint16_t *output_q3, int width,  // width/height are measured in input (luma) samples
+ int height) {
+ for (int j = 0; j < height; j += 2) {
+ for (int i = 0; i < width; i += 2) {
+ const int bot = i + input_stride;  // same column, next input row
+ output_q3[i >> 1] =
+ (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;  // sum of 4 times 2 = 8 times their mean
+ }
+ input += input_stride << 1;  // advance two input rows per output row
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_422_lbd_c(const uint8_t *input,  // 4:2:2: reduce each horizontal pair of 8-bit luma to one Q3 value.
+ int input_stride,
+ uint16_t *output_q3, int width,  // width/height are measured in input (luma) samples
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i += 2) {
+ output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;  // sum of 2 times 4 = 8 times their mean
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_444_lbd_c(const uint8_t *input,  // 4:4:4: no decimation, each 8-bit luma sample is scaled to Q3.
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ output_q3[i] = input[i] << 3;  // times 8
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_420_hbd_c(const uint16_t *input,  // 4:2:0: reduce each 2x2 group of 16-bit luma to one Q3 value.
+ int input_stride,
+ uint16_t *output_q3, int width,  // width/height are measured in input (luma) samples
+ int height) {
+ for (int j = 0; j < height; j += 2) {
+ for (int i = 0; i < width; i += 2) {
+ const int bot = i + input_stride;  // same column, next input row
+ output_q3[i >> 1] =
+ (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1;  // sum of 4 times 2 = 8 times their mean
+ }
+ input += input_stride << 1;  // advance two input rows per output row
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_422_hbd_c(const uint16_t *input,  // 4:2:2: reduce each horizontal pair of 16-bit luma to one Q3 value.
+ int input_stride,
+ uint16_t *output_q3, int width,  // width/height are measured in input (luma) samples
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i += 2) {
+ output_q3[i >> 1] = (input[i] + input[i + 1]) << 2;  // sum of 2 times 4 = 8 times their mean
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+static void cfl_luma_subsampling_444_hbd_c(const uint16_t *input,  // 4:4:4: no decimation, each 16-bit luma sample is scaled to Q3.
+ int input_stride,
+ uint16_t *output_q3, int width,
+ int height) {
+ assert((height - 1) * CFL_BUF_LINE + width <= CFL_BUF_SQUARE);
+ for (int j = 0; j < height; j++) {
+ for (int i = 0; i < width; i++) {
+ output_q3[i] = input[i] << 3;  // times 8
+ }
+ input += input_stride;
+ output_q3 += CFL_BUF_LINE;
+ }
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(c)  // defines the subsample_*_c wrappers and cfl_get_luma_subsampling_*_c getters (macro in cfl.h)
+
+static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,  // Pick the 16-bit luma subsampler for tx_size and the chroma format.
+ int sub_x, int sub_y) {
+ if (sub_x == 1) {
+ if (sub_y == 1) {
+ return cfl_get_luma_subsampling_420_hbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_422_hbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_444_hbd(tx_size);  // any sub_x other than 1 lands here, whatever sub_y is
+}
+
+static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,  // Pick the 8-bit luma subsampler for tx_size and the chroma format.
+ int sub_x, int sub_y) {
+ if (sub_x == 1) {
+ if (sub_y == 1) {
+ return cfl_get_luma_subsampling_420_lbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_422_lbd(tx_size);
+ }
+ return cfl_get_luma_subsampling_444_lbd(tx_size);  // any sub_x other than 1 lands here, whatever sub_y is
+}
+
+static void cfl_store(CFL_CTX *cfl, const uint8_t *input, int input_stride,  // Subsample one luma transform block into recon_buf_q3 at (row, col).
+ int row, int col, TX_SIZE tx_size, int use_hbd) {
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const int tx_off_log2 = tx_size_wide_log2[0];  // row/col are counted in units of the index-0 transform width
+ const int sub_x = cfl->subsampling_x;
+ const int sub_y = cfl->subsampling_y;
+ const int store_row = row << (tx_off_log2 - sub_y);  // destination offset in subsampled samples
+ const int store_col = col << (tx_off_log2 - sub_x);
+ const int store_height = height >> sub_y;  // destination size in subsampled samples
+ const int store_width = width >> sub_x;
+
+ // Invalidate current parameters
+ cfl->are_parameters_computed = 0;
+
+ // Store the surface of the pixel buffer that was written to, this way we
+ // can manage chroma overrun (e.g. when the chroma surface goes beyond the
+ // frame boundary)
+ if (col == 0 && row == 0) {  // a store at the origin restarts the tracked extent
+ cfl->buf_width = store_width;
+ cfl->buf_height = store_height;
+ } else {  // otherwise the extent only grows
+ cfl->buf_width = OD_MAXI(store_col + store_width, cfl->buf_width);
+ cfl->buf_height = OD_MAXI(store_row + store_height, cfl->buf_height);
+ }
+
+ // Check that we will remain inside the pixel buffer.
+ assert(store_row + store_height <= CFL_BUF_LINE);
+ assert(store_col + store_width <= CFL_BUF_LINE);
+
+ // Store the input into the CfL pixel buffer
+ uint16_t *recon_buf_q3 =
+ cfl->recon_buf_q3 + (store_row * CFL_BUF_LINE + store_col);
+
+ if (use_hbd) {  // non-zero: input is reinterpreted through CONVERT_TO_SHORTPTR as 16-bit samples
+ cfl_subsampling_hbd(tx_size, sub_x, sub_y)(CONVERT_TO_SHORTPTR(input),
+ input_stride, recon_buf_q3);
+ } else {
+ cfl_subsampling_lbd(tx_size, sub_x, sub_y)(input, input_stride,
+ recon_buf_q3);
+ }
+}
+
+// Adjust the row and column of blocks smaller than 8X8, as chroma-referenced
+// and non-chroma-referenced blocks are stored together in the CfL buffer.
+static INLINE void sub8x8_adjust_offset(const CFL_CTX *cfl, int *row_out,  // Each output is incremented in place by at most one.
+ int *col_out) {
+ // Increment row index for bottom: 8x4, 16x4 or both bottom 4x4s.
+ if ((cfl->mi_row & 0x01) && cfl->subsampling_y) {  // odd mi_row and vertically subsampled chroma
+ assert(*row_out == 0);
+ (*row_out)++;
+ }
+
+ // Increment col index for right: 4x8, 4x16 or both right 4x4s.
+ if ((cfl->mi_col & 0x01) && cfl->subsampling_x) {  // odd mi_col and horizontally subsampled chroma
+ assert(*col_out == 0);
+ (*col_out)++;
+ }
+}
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,  // Store the luma transform block at (row, col) of the Y plane into the CfL buffer.
+ BLOCK_SIZE bsize) {
+ CFL_CTX *const cfl = &xd->cfl;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ uint8_t *dst =
+ &pd->dst.buf[(row * pd->dst.stride + col) << tx_size_wide_log2[0]];  // row/col are scaled by the index-0 transform width to address samples
+
+ if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {  // blocks with a 4-sample side may need their offset shifted
+ // Only dimensions of size 4 can have an odd offset.
+ assert(!((col & 1) && tx_size_wide[tx_size] != 4));
+ assert(!((row & 1) && tx_size_high[tx_size] != 4));
+ sub8x8_adjust_offset(cfl, &row, &col);
+ }
+ cfl_store(cfl, dst, pd->dst.stride, row, col, tx_size,
+ get_bitdepth_data_path_index(xd));
+}
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size) {  // Store the Y plane's luma for the whole block into the CfL buffer.
+ CFL_CTX *const cfl = &xd->cfl;
+ struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_Y];
+ int row = 0;
+ int col = 0;
+
+ if (block_size_high[bsize] == 4 || block_size_wide[bsize] == 4) {  // blocks with a 4-sample side may need their offset shifted
+ sub8x8_adjust_offset(cfl, &row, &col);
+ }
+ const int width = max_intra_block_width(xd, bsize, AOM_PLANE_Y, tx_size);
+ const int height = max_intra_block_height(xd, bsize, AOM_PLANE_Y, tx_size);
+ tx_size = get_tx_size(width, height);  // the incoming tx_size is replaced by the one matching that width/height
+ cfl_store(cfl, pd->dst.buf, pd->dst.stride, row, col, tx_size,
+ get_bitdepth_data_path_index(xd));
+}
diff --git a/third_party/aom/av1/common/cfl.h b/third_party/aom/av1/common/cfl.h
new file mode 100644
index 0000000000..d627891bf5
--- /dev/null
+++ b/third_party/aom/av1/common/cfl.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CFL_H_
+#define AOM_AV1_COMMON_CFL_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+// Can we use CfL for the current block?
+static INLINE CFL_ALLOWED_TYPE is_cfl_allowed(const MACROBLOCKD *xd) {  // Decided from the block size of xd->mi[0] and its segment's lossless flag.
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ assert(bsize < BLOCK_SIZES_ALL);
+ if (xd->lossless[mbmi->segment_id]) {
+ // In lossless, CfL is available when the partition size is equal to the
+ // transform size.
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+ const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+ const int plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ return (CFL_ALLOWED_TYPE)(plane_bsize == BLOCK_4X4);  // i.e. only when the chroma block is 4x4
+ }
+ // Spec: CfL is available to luma partitions smaller than or equal to 32x32
+ return (CFL_ALLOWED_TYPE)(block_size_wide[bsize] <= 32 &&
+ block_size_high[bsize] <= 32);
+}
+
+// Do we need to save the luma pixels from the current block,
+// for a possible future CfL prediction?
+static INLINE CFL_ALLOWED_TYPE store_cfl_required(const AV1_COMMON *cm,  // Returns CFL_ALLOWED when luma must be stored, else CFL_DISALLOWED.
+ const MACROBLOCKD *xd) {
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+
+ if (cm->seq_params.monochrome) return CFL_DISALLOWED;  // monochrome sequences never need the stored luma
+
+ if (!xd->cfl.is_chroma_reference) {
+ // For non-chroma-reference blocks, we should always store the luma pixels,
+ // in case the corresponding chroma-reference block uses CfL.
+ // Note that this can only happen for block sizes which are <8 on
+ // their shortest side, as otherwise they would be chroma reference
+ // blocks.
+ return CFL_ALLOWED;
+ }
+
+ // If this block has chroma information, we know whether we're
+ // actually going to perform a CfL prediction
+ return (CFL_ALLOWED_TYPE)(!is_inter_block(mbmi) &&
+ mbmi->uv_mode == UV_CFL_PRED);
+}
+
+static INLINE int get_scaled_luma_q0(int alpha_q3, int16_t pred_buf_q3) {  // Multiply a Q3 luma value by a Q3 alpha and round the product to an integer.
+ int scaled_luma_q6 = alpha_q3 * pred_buf_q3;  // Q3 times Q3 gives Q6
+ return ROUND_POWER_OF_TWO_SIGNED(scaled_luma_q6, 6);  // drop the 6 fractional bits with rounding
+}
+
+static INLINE CFL_PRED_TYPE get_cfl_pred_type(PLANE_TYPE plane) {  // Map a chroma plane index to its CFL_PRED_TYPE.
+ assert(plane > 0);  // plane 0 has no CfL prediction type
+ return (CFL_PRED_TYPE)(plane - 1);
+}
+
+void cfl_predict_block(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int plane);
+
+void cfl_store_block(MACROBLOCKD *const xd, BLOCK_SIZE bsize, TX_SIZE tx_size);
+
+void cfl_store_tx(MACROBLOCKD *const xd, int row, int col, TX_SIZE tx_size,
+ BLOCK_SIZE bsize);
+
+void cfl_store_dc_pred(MACROBLOCKD *const xd, const uint8_t *input,
+ CFL_PRED_TYPE pred_plane, int width);
+
+void cfl_load_dc_pred(MACROBLOCKD *const xd, uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, CFL_PRED_TYPE pred_plane);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_lbd_null(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_subsample_hbd_null(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// Allows the CFL_SUBSAMPLE function to switch types depending on the bitdepth.
+#define CFL_lbd_TYPE uint8_t *cfl_type
+#define CFL_hbd_TYPE uint16_t *cfl_type
+
+// Defines subsample_<bd>_<sub>_<W>x<H>_<arch>(), a fixed-size wrapper around
+// cfl_luma_subsampling_<sub>_<bd>_<arch>(). CFL_<bd>_TYPE supplies the input
+// parameter, which is named cfl_type. Because width and height are constants
+// here, the compiler can inline and unroll the size-generic function.
+#define CFL_SUBSAMPLE(arch, sub, bd, width, height) \
+ void subsample_##bd##_##sub##_##width##x##height##_##arch( \
+ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \
+ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \
+ output_q3, width, height); \
+ }
+
+// Define the wrappers for every valid CfL size, plus the getter that maps a TX_SIZE to its wrapper.
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+ TX_SIZE tx_size) { \
+ CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ return subfn_##sub[tx_size]; /* tx_size is not range-checked here */ \
+ }
+
+// Declare the per-architecture table of size-specific wrappers, indexed by
+// TX_SIZE. Sizes CfL does not support map to the asserting null function.
+#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ };
+
+// The RTCD script does not support passing in an array, so each table is
+// wrapped in a getter. This emits all six: 420/422/444 for both lbd and hbd.
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
+
+// Null function used for invalid tx_sizes
+static INLINE void cfl_subtract_average_null(const uint16_t *src,  // Both arguments are ignored.
+ int16_t *dst) {
+ (void)dst;
+ (void)src;
+ assert(0);  // reaching this is a bug; with asserts compiled out the call is a no-op
+}
+
+// Defines subtract_average_<W>x<H>_<arch>(), a fixed-size wrapper around the
+// size-generic subtract_average_<arch>(). round_offset and num_pel_log2 are
+// forwarded unchanged. Because every argument is a constant here, the compiler
+// can inline and unroll the size-generic function.
+#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \
+ void subtract_average_##width##x##height##_##arch(const uint16_t *src, \
+ int16_t *dst) { \
+ subtract_average_##arch(src, dst, width, height, round_offset, \
+ num_pel_log2); \
+ }
+
+// Define the wrappers for every valid CfL size, plus get_subtract_average_fn_<arch>(), which maps a TX_SIZE to its wrapper.
+#define CFL_SUB_AVG_FN(arch) \
+ CFL_SUB_AVG_X(arch, 4, 4, 8, 4) /* args: W, H, W*H/2, log2(W*H) */ \
+ CFL_SUB_AVG_X(arch, 4, 8, 16, 5) \
+ CFL_SUB_AVG_X(arch, 4, 16, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 4, 16, 5) \
+ CFL_SUB_AVG_X(arch, 8, 8, 32, 6) \
+ CFL_SUB_AVG_X(arch, 8, 16, 64, 7) \
+ CFL_SUB_AVG_X(arch, 8, 32, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 4, 32, 6) \
+ CFL_SUB_AVG_X(arch, 16, 8, 64, 7) \
+ CFL_SUB_AVG_X(arch, 16, 16, 128, 8) \
+ CFL_SUB_AVG_X(arch, 16, 32, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 8, 128, 8) \
+ CFL_SUB_AVG_X(arch, 32, 16, 256, 9) \
+ CFL_SUB_AVG_X(arch, 32, 32, 512, 10) \
+ cfl_subtract_average_fn get_subtract_average_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \
+ subtract_average_4x4_##arch, /* 4x4 */ \
+ subtract_average_8x8_##arch, /* 8x8 */ \
+ subtract_average_16x16_##arch, /* 16x16 */ \
+ subtract_average_32x32_##arch, /* 32x32 */ \
+ cfl_subtract_average_null, /* 64x64 (invalid CFL size) */ \
+ subtract_average_4x8_##arch, /* 4x8 */ \
+ subtract_average_8x4_##arch, /* 8x4 */ \
+ subtract_average_8x16_##arch, /* 8x16 */ \
+ subtract_average_16x8_##arch, /* 16x8 */ \
+ subtract_average_16x32_##arch, /* 16x32 */ \
+ subtract_average_32x16_##arch, /* 32x16 */ \
+ cfl_subtract_average_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subtract_average_null, /* 64x32 (invalid CFL size) */ \
+ subtract_average_4x16_##arch, /* 4x16 */ \
+ subtract_average_16x4_##arch, /* 16x4 */ \
+ subtract_average_8x32_##arch, /* 8x32 */ \
+ subtract_average_32x8_##arch, /* 32x8 */ \
+ cfl_subtract_average_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subtract_average_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return sub_avg[tx_size % TX_SIZES_ALL]; \
+ }
+
+// For VSX SIMD optimization, the C versions of width == 4 subtract are
+// faster than the VSX. As such, the VSX code calls the C versions.
+void subtract_average_4x4_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_c(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_c(const uint16_t *src, int16_t *dst);
+
+#define CFL_PREDICT_lbd(arch, width, height) \
+ void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
+ uint8_t *dst, int dst_stride, \
+ int alpha_q3) { \
+ cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \
+ height); \
+ }
+
+#define CFL_PREDICT_hbd(arch, width, height) \
+ void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \
+ uint16_t *dst, int dst_stride, \
+ int alpha_q3, int bd) { \
+ cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \
+ height); \
+ }
+
+// This wrapper exists because clang format does not like calling macros with
+// lowercase letters.
+#define CFL_PREDICT_X(arch, width, height, bd) \
+ CFL_PREDICT_##bd(arch, width, height)
+
+// Null function used for invalid tx_sizes
+void cfl_predict_lbd_null(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+// Null function used for invalid tx_sizes
+void cfl_predict_hbd_null(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#define CFL_PREDICT_FN(arch, bd) \
+ CFL_PREDICT_X(arch, 4, 4, bd) \
+ CFL_PREDICT_X(arch, 4, 8, bd) \
+ CFL_PREDICT_X(arch, 4, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 4, bd) \
+ CFL_PREDICT_X(arch, 8, 8, bd) \
+ CFL_PREDICT_X(arch, 8, 16, bd) \
+ CFL_PREDICT_X(arch, 8, 32, bd) \
+ CFL_PREDICT_X(arch, 16, 4, bd) \
+ CFL_PREDICT_X(arch, 16, 8, bd) \
+ CFL_PREDICT_X(arch, 16, 16, bd) \
+ CFL_PREDICT_X(arch, 16, 32, bd) \
+ CFL_PREDICT_X(arch, 32, 8, bd) \
+ CFL_PREDICT_X(arch, 32, 16, bd) \
+ CFL_PREDICT_X(arch, 32, 32, bd) \
+ cfl_predict_##bd##_fn get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \
+ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \
+ predict_##bd##_4x4_##arch, /* 4x4 */ \
+ predict_##bd##_8x8_##arch, /* 8x8 */ \
+ predict_##bd##_16x16_##arch, /* 16x16 */ \
+ predict_##bd##_32x32_##arch, /* 32x32 */ \
+ cfl_predict_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ predict_##bd##_4x8_##arch, /* 4x8 */ \
+ predict_##bd##_8x4_##arch, /* 8x4 */ \
+ predict_##bd##_8x16_##arch, /* 8x16 */ \
+ predict_##bd##_16x8_##arch, /* 16x8 */ \
+ predict_##bd##_16x32_##arch, /* 16x32 */ \
+ predict_##bd##_32x16_##arch, /* 32x16 */ \
+ cfl_predict_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ predict_##bd##_4x16_##arch, /* 4x16 */ \
+ predict_##bd##_16x4_##arch, /* 16x4 */ \
+ predict_##bd##_8x32_##arch, /* 8x32 */ \
+ predict_##bd##_32x8_##arch, /* 32x8 */ \
+ cfl_predict_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_predict_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \
+ /* index the function pointer array out of bounds. */ \
+ return pred[tx_size % TX_SIZES_ALL]; \
+ }
+
+#endif // AOM_AV1_COMMON_CFL_H_
diff --git a/third_party/aom/av1/common/common.h b/third_party/aom/av1/common/common.h
new file mode 100644
index 0000000000..bed6083db2
--- /dev/null
+++ b/third_party/aom/av1/common/common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_H_
+#define AOM_AV1_COMMON_COMMON_H_
+
+/* Interface header for common constant data structures and lookup tables */
+
+#include <assert.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom/aom_integer.h"
+#include "aom_ports/bitops.h"
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+// Only need this for fixed-size arrays, for structs just assign.
+#define av1_copy(dest, src) \
+ { \
+ assert(sizeof(dest) == sizeof(src)); \
+ memcpy(dest, src, sizeof(src)); \
+ }
+
+// Copies n elements from src to dest; use this for variably-sized arrays.
+#define av1_copy_array(dest, src, n) \
+ { \
+ assert(sizeof(*(dest)) == sizeof(*(src))); /* element sizes must match */ \
+ memcpy(dest, src, (n) * sizeof(*(src))); /* (n): safe for n = a + b */ \
+ }
+
+#define av1_zero(dest) memset(&(dest), 0, sizeof(dest))
+#define av1_zero_array(dest, n) memset(dest, 0, (n) * sizeof(*(dest)))  // zeroes n elements; (n) keeps expression arguments intact
+
+static INLINE int get_unsigned_bits(unsigned int num_values) {  // 0 maps to 0
+ return (num_values == 0) ? 0 : 1 + get_msb(num_values);
+}
+
+#define CHECK_MEM_ERROR(cm, lval, expr) \
+ AOM_CHECK_MEM_ERROR(&cm->error, lval, expr)
+
+#define AOM_FRAME_MARKER 0x2
+
+#define AV1_MIN_TILE_SIZE_BYTES 1
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_H_
diff --git a/third_party/aom/av1/common/common_data.h b/third_party/aom/av1/common/common_data.h
new file mode 100644
index 0000000000..46e455fdb1
--- /dev/null
+++ b/third_party/aom/av1/common/common_data.h
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_COMMON_DATA_H_
+#define AOM_AV1_COMMON_COMMON_DATA_H_
+
+#include "av1/common/enums.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Log 2 conversion lookup tables in units of mode info (4x4).
+// The Mi_Width_Log2 table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_wide_log2[BLOCK_SIZES_ALL] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 0, 2, 1, 3, 2, 4
+};
+// The Mi_Height_Log2 table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_high_log2[BLOCK_SIZES_ALL] = {
+ 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 2, 0, 3, 1, 4, 2
+};
+
+// Width/height lookup tables in units of mode info (4x4).
+// The Num_4x4_Blocks_Wide table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_wide[BLOCK_SIZES_ALL] = {
+ 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 1, 4, 2, 8, 4, 16
+};
+
+// The Num_4x4_Blocks_High table in the spec (Section 9.3. Conversion tables).
+static const uint8_t mi_size_high[BLOCK_SIZES_ALL] = {
+ 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 4, 1, 8, 2, 16, 4
+};
+
+// Width/height lookup tables in units of samples.
+// The Block_Width table in the spec (Section 9.3. Conversion tables).
+static const uint8_t block_size_wide[BLOCK_SIZES_ALL] = {
+ 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32,
+ 64, 64, 64, 128, 128, 4, 16, 8, 32, 16, 64
+};
+
+// The Block_Height table in the spec (Section 9.3. Conversion tables).
+static const uint8_t block_size_high[BLOCK_SIZES_ALL] = {
+ 4, 8, 4, 8, 16, 8, 16, 32, 16, 32, 64,
+ 32, 64, 128, 64, 128, 16, 4, 32, 8, 64, 16
+};
+
+// Maps a block size to a context.
+// The Size_Group table in the spec (Section 9.3. Conversion tables).
+// AOMMIN(3, AOMMIN(mi_size_wide_log2(bsize), mi_size_high_log2(bsize)))
+static const uint8_t size_group_lookup[BLOCK_SIZES_ALL] = {
+ 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 0, 0, 1, 1, 2, 2
+};
+
+static const uint8_t num_pels_log2_lookup[BLOCK_SIZES_ALL] = {
+ 4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, 13, 13, 14, 6, 6, 8, 8, 10, 10
+};
+
+// A compressed version of the Partition_Subsize table in the spec (9.3.
+// Conversion tables), for square block sizes only.
+/* clang-format off */
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][SQR_BLOCK_SIZES] = {
+ { // PARTITION_NONE
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16,
+ BLOCK_32X32, BLOCK_64X64, BLOCK_128X128
+ }, { // PARTITION_HORZ
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_SPLIT
+ BLOCK_INVALID, BLOCK_4X4, BLOCK_8X8,
+ BLOCK_16X16, BLOCK_32X32, BLOCK_64X64
+ }, { // PARTITION_HORZ_A
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_HORZ_B
+ BLOCK_INVALID, BLOCK_8X4, BLOCK_16X8,
+ BLOCK_32X16, BLOCK_64X32, BLOCK_128X64
+ }, { // PARTITION_VERT_A
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_VERT_B
+ BLOCK_INVALID, BLOCK_4X8, BLOCK_8X16,
+ BLOCK_16X32, BLOCK_32X64, BLOCK_64X128
+ }, { // PARTITION_HORZ_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X4,
+ BLOCK_32X8, BLOCK_64X16, BLOCK_INVALID
+ }, { // PARTITION_VERT_4
+ BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+ BLOCK_8X32, BLOCK_16X64, BLOCK_INVALID
+ }
+};
+
+static const TX_SIZE max_txsize_lookup[BLOCK_SIZES_ALL] = {  // Largest square transform fitting inside each block size (at most 64x64).
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X4, TX_4X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X8, TX_8X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X16, TX_16X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X32, TX_32X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4, 8x32
+ TX_4X4, TX_4X4, TX_8X8,
+ // 32x8, 16x64, 64x16
+ TX_8X8, TX_16X16, TX_16X16
+};
+
+static const TX_SIZE max_txsize_rect_lookup[BLOCK_SIZES_ALL] = {
+ // 4X4
+ TX_4X4,
+ // 4X8, 8X4, 8X8
+ TX_4X8, TX_8X4, TX_8X8,
+ // 8X16, 16X8, 16X16
+ TX_8X16, TX_16X8, TX_16X16,
+ // 16X32, 32X16, 32X32
+ TX_16X32, TX_32X16, TX_32X32,
+ // 32X64, 64X32,
+ TX_32X64, TX_64X32,
+ // 64X64
+ TX_64X64,
+ // 64x128, 128x64, 128x128
+ TX_64X64, TX_64X64, TX_64X64,
+ // 4x16, 16x4,
+ TX_4X16, TX_16X4,
+ // 8x32, 32x8
+ TX_8X32, TX_32X8,
+ // 16x64, 64x16
+ TX_16X64, TX_64X16
+};
+
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+ DCT_1D, ADST_1D, DCT_1D, ADST_1D,
+ FLIPADST_1D, DCT_1D, FLIPADST_1D, ADST_1D, FLIPADST_1D, IDTX_1D,
+ DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D, IDTX_1D,
+};
+
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+ DCT_1D, DCT_1D, ADST_1D, ADST_1D,
+ DCT_1D, FLIPADST_1D, FLIPADST_1D, FLIPADST_1D, ADST_1D, IDTX_1D,
+ IDTX_1D, DCT_1D, IDTX_1D, ADST_1D, IDTX_1D, FLIPADST_1D,
+};
+
+#define TXSIZE_CAT_INVALID (-1)
+
+/* clang-format on */
+
+static const TX_SIZE sub_tx_size_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_4X4, // TX_8X8
+ TX_8X8, // TX_16X16
+ TX_16X16, // TX_32X32
+ TX_32X32, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X8, // TX_4X16
+ TX_8X4, // TX_16X4
+ TX_8X16, // TX_8X32
+ TX_16X8, // TX_32X8
+ TX_16X32, // TX_16X64
+ TX_32X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_horz_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const TX_SIZE txsize_vert_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+#define TX_SIZE_W_MIN 4
+
+// Transform block width in pixels
+static const int tx_size_wide[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64,
+};
+
+#define TX_SIZE_H_MIN 4
+
+// Transform block height in pixels
+static const int tx_size_high[TX_SIZES_ALL] = {
+ 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16,
+};
+
+// Transform block width in unit
+static const int tx_size_wide_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 1, 2, 2, 4, 4, 8, 8, 16, 1, 4, 2, 8, 4, 16,
+};
+
+// Transform block height in unit
+static const int tx_size_high_unit[TX_SIZES_ALL] = {
+ 1, 2, 4, 8, 16, 2, 1, 4, 2, 8, 4, 16, 8, 4, 1, 8, 2, 16, 4,
+};
+
+// Transform block width in log2
+static const int tx_size_wide_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6,
+};
+
+// Transform block height in log2
+static const int tx_size_high_log2[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4,
+};
+
+static const int tx_size_2d[TX_SIZES_ALL + 1] = {  // Sample count (width * height) per transform size.
+ 16, 64, 256, 1024, 4096, 32, 32, 128, 128, 512,
+ 512, 2048, 2048, 64, 64, 256, 256, 1024, 1024,  // 19 values are listed; any remaining slot is zero-initialized.
+};
+
+static const BLOCK_SIZE txsize_to_bsize[TX_SIZES_ALL] = {
+ BLOCK_4X4, // TX_4X4
+ BLOCK_8X8, // TX_8X8
+ BLOCK_16X16, // TX_16X16
+ BLOCK_32X32, // TX_32X32
+ BLOCK_64X64, // TX_64X64
+ BLOCK_4X8, // TX_4X8
+ BLOCK_8X4, // TX_8X4
+ BLOCK_8X16, // TX_8X16
+ BLOCK_16X8, // TX_16X8
+ BLOCK_16X32, // TX_16X32
+ BLOCK_32X16, // TX_32X16
+ BLOCK_32X64, // TX_32X64
+ BLOCK_64X32, // TX_64X32
+ BLOCK_4X16, // TX_4X16
+ BLOCK_16X4, // TX_16X4
+ BLOCK_8X32, // TX_8X32
+ BLOCK_32X8, // TX_32X8
+ BLOCK_16X64, // TX_16X64
+ BLOCK_64X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_4X4, // TX_4X8
+ TX_4X4, // TX_8X4
+ TX_8X8, // TX_8X16
+ TX_8X8, // TX_16X8
+ TX_16X16, // TX_16X32
+ TX_16X16, // TX_32X16
+ TX_32X32, // TX_32X64
+ TX_32X32, // TX_64X32
+ TX_4X4, // TX_4X16
+ TX_4X4, // TX_16X4
+ TX_8X8, // TX_8X32
+ TX_8X8, // TX_32X8
+ TX_16X16, // TX_16X64
+ TX_16X16, // TX_64X16
+};
+
+static const TX_SIZE txsize_sqr_up_map[TX_SIZES_ALL] = {
+ TX_4X4, // TX_4X4
+ TX_8X8, // TX_8X8
+ TX_16X16, // TX_16X16
+ TX_32X32, // TX_32X32
+ TX_64X64, // TX_64X64
+ TX_8X8, // TX_4X8
+ TX_8X8, // TX_8X4
+ TX_16X16, // TX_8X16
+ TX_16X16, // TX_16X8
+ TX_32X32, // TX_16X32
+ TX_32X32, // TX_32X16
+ TX_64X64, // TX_32X64
+ TX_64X64, // TX_64X32
+ TX_16X16, // TX_4X16
+ TX_16X16, // TX_16X4
+ TX_32X32, // TX_8X32
+ TX_32X32, // TX_32X8
+ TX_64X64, // TX_16X64
+ TX_64X64, // TX_64X16
+};
+
+static const int8_t txsize_log2_minus4[TX_SIZES_ALL] = {  // log2(w * h) - 4, with every 64-sample dimension counted as 32.
+ 0, // TX_4X4
+ 2, // TX_8X8
+ 4, // TX_16X16
+ 6, // TX_32X32
+ 6, // TX_64X64
+ 1, // TX_4X8
+ 1, // TX_8X4
+ 3, // TX_8X16
+ 3, // TX_16X8
+ 5, // TX_16X32
+ 5, // TX_32X16
+ 6, // TX_32X64
+ 6, // TX_64X32
+ 2, // TX_4X16
+ 2, // TX_16X4
+ 4, // TX_8X32
+ 4, // TX_32X8
+ 5, // TX_16X64
+ 5, // TX_64X16
+};
+
+/* clang-format off */
+static const TX_SIZE tx_mode_to_biggest_tx_size[TX_MODES] = {
+ TX_4X4, // ONLY_4X4
+ TX_64X64, // TX_MODE_LARGEST
+ TX_64X64, // TX_MODE_SELECT
+};
+
+// The Subsampled_Size table in the spec (Section 5.11.38. Get plane residual
+// size function).
+static const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES_ALL][2][2] = {
+ // ss_x == 0 ss_x == 0 ss_x == 1 ss_x == 1
+ // ss_y == 0 ss_y == 1 ss_y == 0 ss_y == 1
+ { { BLOCK_4X4, BLOCK_4X4 }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_4X8, BLOCK_4X4 }, { BLOCK_INVALID, BLOCK_4X4 } },
+ { { BLOCK_8X4, BLOCK_INVALID }, { BLOCK_4X4, BLOCK_4X4 } },
+ { { BLOCK_8X8, BLOCK_8X4 }, { BLOCK_4X8, BLOCK_4X4 } },
+ { { BLOCK_8X16, BLOCK_8X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X8, BLOCK_INVALID }, { BLOCK_8X8, BLOCK_8X4 } },
+ { { BLOCK_16X16, BLOCK_16X8 }, { BLOCK_8X16, BLOCK_8X8 } },
+ { { BLOCK_16X32, BLOCK_16X16 }, { BLOCK_INVALID, BLOCK_8X16 } },
+ { { BLOCK_32X16, BLOCK_INVALID }, { BLOCK_16X16, BLOCK_16X8 } },
+ { { BLOCK_32X32, BLOCK_32X16 }, { BLOCK_16X32, BLOCK_16X16 } },
+ { { BLOCK_32X64, BLOCK_32X32 }, { BLOCK_INVALID, BLOCK_16X32 } },
+ { { BLOCK_64X32, BLOCK_INVALID }, { BLOCK_32X32, BLOCK_32X16 } },
+ { { BLOCK_64X64, BLOCK_64X32 }, { BLOCK_32X64, BLOCK_32X32 } },
+ { { BLOCK_64X128, BLOCK_64X64 }, { BLOCK_INVALID, BLOCK_32X64 } },
+ { { BLOCK_128X64, BLOCK_INVALID }, { BLOCK_64X64, BLOCK_64X32 } },
+ { { BLOCK_128X128, BLOCK_128X64 }, { BLOCK_64X128, BLOCK_64X64 } },
+ { { BLOCK_4X16, BLOCK_4X8 }, { BLOCK_INVALID, BLOCK_4X8 } },
+ { { BLOCK_16X4, BLOCK_INVALID }, { BLOCK_8X4, BLOCK_8X4 } },
+ { { BLOCK_8X32, BLOCK_8X16 }, { BLOCK_INVALID, BLOCK_4X16 } },
+ { { BLOCK_32X8, BLOCK_INVALID }, { BLOCK_16X8, BLOCK_16X4 } },
+ { { BLOCK_16X64, BLOCK_16X32 }, { BLOCK_INVALID, BLOCK_8X32 } },
+ { { BLOCK_64X16, BLOCK_INVALID }, { BLOCK_32X16, BLOCK_32X8 } }
+};
+/* clang-format on */
+
+// Generates a 5-bit field in which each bit set to 1 represents
+// a blocksize partition. 11111 means we split 128x128, 64x64, 32x32, 16x16
+// and 8x8. 10000 means we just split the 128x128 to 64x64.
+/* clang-format off */
+static const struct {
+ PARTITION_CONTEXT above;
+ PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES_ALL] = {
+ { 31, 31 }, // 4X4 - {0b11111, 0b11111}
+ { 31, 30 }, // 4X8 - {0b11111, 0b11110}
+ { 30, 31 }, // 8X4 - {0b11110, 0b11111}
+ { 30, 30 }, // 8X8 - {0b11110, 0b11110}
+ { 30, 28 }, // 8X16 - {0b11110, 0b11100}
+ { 28, 30 }, // 16X8 - {0b11100, 0b11110}
+ { 28, 28 }, // 16X16 - {0b11100, 0b11100}
+ { 28, 24 }, // 16X32 - {0b11100, 0b11000}
+ { 24, 28 }, // 32X16 - {0b11000, 0b11100}
+ { 24, 24 }, // 32X32 - {0b11000, 0b11000}
+ { 24, 16 }, // 32X64 - {0b11000, 0b10000}
+ { 16, 24 }, // 64X32 - {0b10000, 0b11000}
+ { 16, 16 }, // 64X64 - {0b10000, 0b10000}
+ { 16, 0 }, // 64X128- {0b10000, 0b00000}
+ { 0, 16 }, // 128X64- {0b00000, 0b10000}
+ { 0, 0 }, // 128X128-{0b00000, 0b00000}
+ { 31, 28 }, // 4X16 - {0b11111, 0b11100}
+ { 28, 31 }, // 16X4 - {0b11100, 0b11111}
+ { 30, 24 }, // 8X32 - {0b11110, 0b11000}
+ { 24, 30 }, // 32X8 - {0b11000, 0b11110}
+ { 28, 16 }, // 16X64 - {0b11100, 0b10000}
+ { 16, 28 }, // 64X16 - {0b10000, 0b11100}
+};
+/* clang-format on */
+
+static const int intra_mode_context[INTRA_MODES] = {
+ 0, 1, 2, 3, 4, 4, 4, 4, 3, 0, 1, 2, 0,
+};
+
+// Note: this is also used in unit tests. So whenever one changes the table,
+// the unit tests need to be changed accordingly.
+static const int quant_dist_weight[4][2] = {
+ { 2, 3 }, { 2, 5 }, { 2, 7 }, { 1, MAX_FRAME_DISTANCE }
+};
+static const int quant_dist_lookup_table[2][4][2] = {
+ { { 9, 7 }, { 11, 5 }, { 12, 4 }, { 13, 3 } },
+ { { 7, 9 }, { 5, 11 }, { 4, 12 }, { 3, 13 } },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_COMMON_DATA_H_
diff --git a/third_party/aom/av1/common/convolve.c b/third_party/aom/av1/common/convolve.c
new file mode 100644
index 0000000000..1f11126fc3
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.c
@@ -0,0 +1,1295 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {  // Horizontal-only resampling of uint8_t rows into a w x h output.
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;  // Filter window starts taps/2 - 1 samples to the left.
+ for (int y = 0; y < h; ++y) {
+ int x_qn = x0_qn;  // Fixed-point source position, restarted on every row.
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];  // Integer part picks the first tap's sample.
+ const int x_filter_idx =
+ (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;  // Fractional part picks the kernel.
+ assert(x_filter_idx <= RS_SUBPEL_MASK);
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];  // Kernels are stored back to back.
+ int sum = 0;
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_qn += x_step_qn;  // Advance the source position by one output pixel.
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn, int bd) {  // uint16_t variant of the horizontal resampler; bd is passed to the final clip.
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;  // Filter window starts taps/2 - 1 samples to the left.
+ for (int y = 0; y < h; ++y) {
+ int x_qn = x0_qn;  // Fixed-point source position, restarted on every row.
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];  // Integer part picks the first tap's sample.
+ const int x_filter_idx =
+ (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;  // Fractional part picks the kernel.
+ assert(x_filter_idx <= RS_SUBPEL_MASK);
+ const int16_t *const x_filter =
+ &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];  // Kernels are stored back to back.
+ int sum = 0;
+ for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+ sum += src_x[k] * x_filter[k];
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_qn += x_step_qn;  // Advance the source position by one output pixel.
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Separable 2D filter: horizontal pass into im_block, vertical pass into dst.
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;  // Extra rows feed the vertical taps.
+ int im_stride = w;  // Intermediate rows are packed at the block width.
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;  // Bit depth used to size the offsets below.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;  // Rounding left for the final shift.
+
+ // Horizontal filter: fills im_h rows of im_block, starting fo_vert rows above src.
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // Offset; the assert below expects sum >= 0.
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // Vertical filter: reads im_block and writes clipped pixels to dst.
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // Row that lines up with output row 0.
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;  // Offset; the assert below expects sum >= 0.
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));  // NOTE(review): presumably cancels the offsets added in both passes.
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+ }
+ }
+}
+
+void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Vertical-only filter writing clipped pixels to dst.
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ (void)filter_params_x;  // The horizontal parameters are not used here.
+ (void)subpel_x_q4;
+ (void)conv_params;  // Only read by the asserts below.
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ // Vertical filter: each output reads taps rows starting fo_vert rows above it.
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ dst[y * dst_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));  // Single rounding by FILTER_BITS.
+ }
+ }
+}
+
+void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Horizontal-only filter writing clipped pixels to dst.
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;  // Rounding left after the round_0 shift.
+ (void)filter_params_y;  // The vertical parameters are not used here.
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ // Horizontal filter: each output reads taps samples starting fo_horiz to its left.
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_0);  // Rounded in two steps: round_0, then bits.
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
+ }
+ }
+}
+
+void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Unfiltered copy of a w x h block from src to dst.
+ (void)filter_params_x;  // No filter or rounding parameter is used.
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+ (void)conv_params;
+
+ for (int y = 0; y < h; ++y) {
+ memcpy(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));  // One row per call; strides may differ.
+ }
+}
+
+void av1_jnt_convolve_2d_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Separable 2D filter that stores to conv_params->dst or blends into dst8.
+ CONV_BUF_TYPE *dst = conv_params->dst;  // Intermediate buffer; dst8 is written only when do_average is set.
+ int dst_stride = conv_params->dst_stride;
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
+ int im_h = h + filter_params_y->taps - 1;  // Extra rows feed the vertical taps.
+ int im_stride = w;  // Intermediate rows are packed at the block width.
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;  // Bit depth used to size the offsets below.
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;  // Rounding left for the final shift.
+
+ // Horizontal filter: fills im_h rows of im_block, starting fo_vert rows above src.
+ const uint8_t *src_horiz = src - fo_vert * src_stride;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // Offset; the assert below expects sum >= 0.
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // Vertical filter: reads im_block, then either stores or blends the result.
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // Row that lines up with output row 0.
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;  // Offset; the assert below expects sum >= 0.
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);  // Still carries the offsets.
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];  // Value already stored in the intermediate buffer.
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // Weighted blend.
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // Plain average of the two values.
+ tmp = tmp >> 1;
+ }
+ tmp -= (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));  // NOTE(review): presumably cancels the offsets added in both passes.
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;  // Stored with offsets; presumably blended by a later do_average call.
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_y_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Vertical-only filter that stores to conv_params->dst or blends into dst8.
+ CONV_BUF_TYPE *dst = conv_params->dst;  // Intermediate buffer; dst8 is written only when do_average is set.
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int bd = 8;  // Bit depth used to size round_offset.
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));  // Added to every result, removed before the final clip.
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;  // Rounding left for the final shift.
+ (void)filter_params_x;  // The horizontal parameters are not used here.
+ (void)subpel_x_q4;
+
+ // Vertical filter: each output reads taps rows starting fo_vert rows above it.
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ res *= (1 << bits);  // Scale up by FILTER_BITS - round_0 before rounding by round_1.
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];  // Value already stored in the intermediate buffer.
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // Weighted blend.
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // Plain average of the two values.
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;  // Stored with round_offset; presumably blended by a later do_average call.
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_x_c(const uint8_t *src, int src_stride, uint8_t *dst8,
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Horizontal-only filter that stores to conv_params->dst or blends into dst8.
+ CONV_BUF_TYPE *dst = conv_params->dst;  // Intermediate buffer; dst8 is written only when do_average is set.
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits = FILTER_BITS - conv_params->round_1;  // Note: derived from round_1 here, not round_0.
+ const int bd = 8;  // Bit depth used to size round_offset.
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));  // Added to every result, removed before the final clip.
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;  // Rounding left for the final shift.
+ (void)filter_params_y;  // The vertical parameters are not used here.
+ (void)subpel_y_q4;
+
+ // Horizontal filter: each output reads taps samples starting fo_horiz to its left.
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);  // Round by round_0, then scale up by bits.
+ res += round_offset;
+
+ if (conv_params->do_average) {
+ int32_t tmp = dst[y * dst_stride + x];  // Value already stored in the intermediate buffer.
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // Weighted blend.
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // Plain average of the two values.
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] =
+ clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
+ } else {
+ dst[y * dst_stride + x] = res;  // Stored with round_offset; presumably blended by a later do_average call.
+ }
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_copy_c(const uint8_t *src, int src_stride,  // 8-bit compound path with no filtering: source pixels are only scaled and biased.
+ uint8_t *dst8, int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer: written when !do_average, read when do_average
+ int dst_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +  // same bias expression as the filtering compound kernels in this file
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ (void)filter_params_x;  // no filter or sub-pixel parameter is used by the copy path
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ CONV_BUF_TYPE res = src[y * src_stride + x] << bits;  // shift the pixel up to the intermediate precision
+ res += round_offset;
+
+ if (conv_params->do_average) {  // combine with the value already stored in dst and emit final pixels
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // weighted blend: fwd_offset on the stored value, bck_offset on the new one
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // plain average of the two values
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst[y * dst_stride + x] = res;  // store the biased intermediate; dst8 is not written in this mode
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,  // 8-bit separable convolution where the source position advances by x_step_qn / y_step_qn per output pixel.
+ int dst8_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params) {
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];  // horizontally filtered rows, im_stride (= w) samples per row
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +  // source rows to filter: vertical span of the h output rows plus the filter taps
+ filter_params_y->taps;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bd = 8;
+
+ // horizontal filter
+ const uint8_t *src_horiz = src - fo_vert * src_stride;  // start fo_vert rows above the block so the vertical pass has its upper taps
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];  // integer part of x_qn selects the source column
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part of x_qn selects the kernel
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // row of im_block that corresponds to the first output row
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {  // processed one column at a time; src_vert advances by one sample per column
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];  // integer part of y_qn selects the intermediate row
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part of y_qn selects the kernel
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {  // combine with the value already stored in dst16 and emit final pixels
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;  // store the biased intermediate; dst8 is not written in this mode
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ src_vert++;
+ }
+}
+
+static void convolve_2d_scale_wrapper(
+    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params) {
+  // Thin forwarding layer around av1_convolve_2d_scale(). In debug builds it
+  // first checks that a compound call supplies an intermediate buffer.
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                        filter_params_x, filter_params_y,
+                        subpel_x_qn, x_step_qn,
+                        subpel_y_qn, y_step_qn,
+                        conv_params);
+}
+
+// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
+// we may create optimized code to do 2-tap filtering for all bilinear filtering
+// usages, not just IntraBC.
+static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
+                                    uint8_t *dst, int dst_stride, int w, int h,
+                                    int subpel_x_q4, int subpel_y_q4,
+                                    ConvolveParams *conv_params) {
+  // IntraBC filtering: a direction with a non-zero sub-pixel offset gets the
+  // fixed av1_intrabc_filter_params, a direction without one gets NULL. The
+  // kernels are always called with sub-pixel arguments of 0.
+  const int has_x = (subpel_x_q4 != 0);
+  const int has_y = (subpel_y_q4 != 0);
+
+  const InterpFilterParams *filter_params_x = NULL;
+  const InterpFilterParams *filter_params_y = NULL;
+  if (has_x) filter_params_x = &av1_intrabc_filter_params;
+  if (has_y) filter_params_y = &av1_intrabc_filter_params;
+
+  if (has_x && has_y) {
+    // Both directions are fractional: full 2D filter.
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, 0, 0, conv_params);
+    return;
+  }
+  if (has_x) {
+    // Only the horizontal direction is fractional.
+    av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                        filter_params_x, filter_params_y, 0, 0, conv_params);
+    return;
+  }
+  // Remaining case: the vertical-only kernel.
+  av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                      filter_params_x, filter_params_y, 0, 0, conv_params);
+}
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            InterpFilters interp_filters, const int subpel_x_q4,
+                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
+                            int scaled, ConvolveParams *conv_params,
+                            const struct scale_factors *sf, int is_intrabc) {
+  // 8-bit dispatch: picks the IntraBC path, the scaled path, or one of the
+  // kernels in sf->convolve according to the arguments.
+  assert(IMPLIES(is_intrabc, !scaled));
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst;
+  (void)dst_stride;
+
+  const int has_subpel_x = (subpel_x_q4 != 0);
+  const int has_subpel_y = (subpel_y_q4 != 0);
+
+  // IntraBC with a fractional offset has its own filter selection.
+  if (is_intrabc && (has_subpel_x || has_subpel_y)) {
+    convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
+                            subpel_y_q4, conv_params);
+    return;
+  }
+
+  // A direction gets filter parameters when it has a fractional offset or
+  // when scaling is in use; otherwise NULL is handed to the kernel.
+  const InterpFilterParams *filter_params_x = NULL;
+  const InterpFilterParams *filter_params_y = NULL;
+  if (has_subpel_x | scaled) {
+    const InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+    filter_params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
+  }
+  if (has_subpel_y | scaled) {
+    const InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+    filter_params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
+  }
+
+  if (!scaled) {
+    // Kernel table indexed by [x fractional][y fractional][compound].
+    sf->convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
+    return;
+  }
+
+  convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
+                            filter_params_x, filter_params_y, subpel_x_q4,
+                            x_step_q4, subpel_y_q4, y_step_q4, conv_params);
+}
+
+void av1_highbd_convolve_2d_copy_sr_c(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  // Straight copy of a w x h block of 16-bit samples; the filter, sub-pixel,
+  // convolve and bit-depth arguments are accepted but not used.
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+
+  for (int row = 0; row < h; ++row) {
+    const uint16_t *const src_row = src + row * src_stride;
+    uint16_t *const dst_row = dst + row * dst_stride;
+    memcpy(dst_row, src_row, w * sizeof(src[0]));
+  }
+}
+
+void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,  // High-bitdepth horizontal-only filter that writes clipped pixels straight to dst.
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ const int fo_horiz = filter_params_x->taps / 2 - 1;  // number of columns left of the output column that the kernel reaches
+ const int bits = FILTER_BITS - conv_params->round_0;  // shift still owed after the round_0 stage
+ (void)filter_params_y;  // the vertical parameters are not used by this kernel
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_0);  // two-step rounding: round_0 here, the remaining bits below
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ }
+}
+
+void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,  // High-bitdepth vertical-only filter that writes clipped pixels straight to dst.
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ const int fo_vert = filter_params_y->taps / 2 - 1;  // number of rows above the output row that the kernel reaches
+ (void)filter_params_x;  // the horizontal parameters are not used by this kernel
+ (void)subpel_x_q4;
+ (void)conv_params;  // only referenced by the asserts below
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);  // single rounding by FILTER_BITS; round_0 / round_1 are not applied here
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,  // High-bitdepth separable 2D filter: horizontal pass into im_block, then vertical pass to dst.
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];  // horizontally filtered rows, im_stride (= w) samples per row
+ int im_h = h + filter_params_y->taps - 1;  // output rows plus the extra rows the vertical kernel reaches
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;  // start fo_vert rows above the block so the vertical pass has its upper taps
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < im_h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // row of im_block that corresponds to the first output row
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -  // round by round_1, then remove the two offset terms below
+ ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int src_stride,  // High-bitdepth compound 2D filter: stores to conv_params->dst, or blends with it and writes pixels to dst16.
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ int x, y, k;
+ int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];  // horizontally filtered rows, im_stride (= w) samples per row
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer: written when !do_average, read when do_average
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;  // output rows plus the extra rows the vertical kernel reaches
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;  // start fo_vert rows above the block so the vertical pass has its upper taps
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (y = 0; y < im_h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // bias; the assert below expects the biased sum to be non-negative
+ for (k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ (void)bd;
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // row of im_block that corresponds to the first output row
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ int32_t sum = 1 << offset_bits;  // bias; the assert below expects the biased sum to be non-negative
+ for (k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->do_average) {  // combine with the value already stored in dst and emit final pixels
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // weighted blend: fwd_offset on the stored value, bck_offset on the new one
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // plain average of the two values
+ tmp = tmp >> 1;
+ }
+ tmp -= (1 << (offset_bits - conv_params->round_1)) +  // remove the offset that both blended values still carry
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;  // store the biased intermediate; dst16 is not written in this mode
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_x_c(const uint16_t *src, int src_stride,  // High-bitdepth compound convolution using only the horizontal filter.
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer: written when !do_average, read when do_average
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;  // number of columns left of the output column that the kernel reaches
+ const int bits = FILTER_BITS - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +  // bias added to every stored value and subtracted again before the final rounding
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ (void)filter_params_y;  // the vertical parameters are not used by this kernel
+ (void)subpel_y_q4;
+ assert(bits >= 0);
+ // horizontal filter
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
+ }
+ res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);  // round by round_0 first, then scale up by (FILTER_BITS - round_1)
+ res += round_offset;
+
+ if (conv_params->do_average) {  // combine with the value already stored in dst and emit final pixels
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // weighted blend: fwd_offset on the stored value, bck_offset on the new one
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // plain average of the two values
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;  // store the biased intermediate; dst16 is not written in this mode
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_y_c(const uint16_t *src, int src_stride,  // High-bitdepth compound convolution using only the vertical filter.
+ uint16_t *dst16, int dst16_stride, int w,
+ int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer: written when !do_average, read when do_average
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;  // number of rows above the output row that the kernel reaches
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +  // bias added to every stored value and subtracted again before the final rounding
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ assert(round_bits >= 0);
+ (void)filter_params_x;  // the horizontal parameters are not used by this kernel
+ (void)subpel_x_q4;
+ assert(bits >= 0);
+ // vertical filter
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ int32_t res = 0;
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
+ }
+ res *= (1 << bits);  // scale up by the bits a round_0 stage would have left in place
+ res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
+
+ if (conv_params->do_average) {  // combine with the value already stored in dst and emit final pixels
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // weighted blend: fwd_offset on the stored value, bck_offset on the new one
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // plain average of the two values
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;  // store the biased intermediate; dst16 is not written in this mode
+ }
+ }
+ }
+}
+
+void av1_highbd_jnt_convolve_2d_copy_c(  // High-bitdepth compound path with no filtering: source samples are only scaled and biased.
+ const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
+ int w, int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer: written when !do_average, read when do_average
+ int dst_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +  // same bias expression as the filtering compound kernels in this file
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ assert(bits >= 0);
+ (void)filter_params_x;  // no filter or sub-pixel parameter is used by the copy path
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ for (int y = 0; y < h; ++y) {
+ for (int x = 0; x < w; ++x) {
+ CONV_BUF_TYPE res = src[y * src_stride + x] << bits;  // shift the sample up to the intermediate precision
+ res += round_offset;
+ if (conv_params->do_average) {  // combine with the value already stored in dst and emit final pixels
+ int32_t tmp = dst[y * dst_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;  // weighted blend: fwd_offset on the stored value, bck_offset on the new one
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;  // plain average of the two values
+ tmp = tmp >> 1;
+ }
+ tmp -= round_offset;
+ dst16[y * dst16_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst[y * dst_stride + x] = res;  // store the biased intermediate; dst16 is not written in this mode
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,  // High-bitdepth separable convolution where the source position advances by x_step_qn / y_step_qn per output pixel.
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int x_step_qn,
+ const int subpel_y_qn, const int y_step_qn,
+ ConvolveParams *conv_params, int bd) {
+ int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];  // horizontally filtered rows, im_stride (= w) samples per row
+ int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +  // source rows to filter: vertical span of the h output rows plus the filter taps
+ filter_params_y->taps;
+ int im_stride = w;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ CONV_BUF_TYPE *dst16 = conv_params->dst;  // compound intermediate buffer; here `dst` is the pixel output
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ assert(bits >= 0);
+ // horizontal filter
+ const uint16_t *src_horiz = src - fo_vert * src_stride;  // start fo_vert rows above the block so the vertical pass has its upper taps
+ for (int y = 0; y < im_h; ++y) {
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];  // integer part of x_qn selects the source column
+ const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part of x_qn selects the kernel
+ assert(x_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *x_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_x->taps; ++k) {
+ sum += x_filter[k] * src_x[k - fo_horiz];
+ }
+ assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
+ im_block[y * im_stride + x] =
+ (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
+ }
+ src_horiz += src_stride;
+ }
+
+ // vertical filter
+ int16_t *src_vert = im_block + fo_vert * im_stride;  // row of im_block that corresponds to the first output row
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ for (int x = 0; x < w; ++x) {  // processed one column at a time; src_vert advances by one sample per column
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];  // integer part of y_qn selects the intermediate row
+ const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;  // fractional part of y_qn selects the kernel
+ assert(y_filter_idx < SUBPEL_SHIFTS);
+ const int16_t *y_filter =
+ av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
+ int32_t sum = 1 << offset_bits;  // bias; the assert below expects the biased sum to be non-negative
+ for (int k = 0; k < filter_params_y->taps; ++k) {
+ sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
+ }
+ assert(0 <= sum && sum < (1 << (offset_bits + 2)));
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {  // combine with the value already stored in dst16 and emit final pixels
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ } else {
+ dst16[y * dst16_stride + x] = res;  // store the biased intermediate; dst is not written in this mode
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+ }
+ }
+ src_vert++;
+ }
+}
+
+static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
+                                           uint16_t *dst, int dst_stride, int w,
+                                           int h, int subpel_x_q4,
+                                           int subpel_y_q4,
+                                           ConvolveParams *conv_params,
+                                           int bd) {
+  // High-bitdepth IntraBC filtering: a direction with a non-zero sub-pixel
+  // offset gets the fixed av1_intrabc_filter_params, a direction without one
+  // gets NULL. The kernels are always called with sub-pixel arguments of 0.
+  const int has_x = (subpel_x_q4 != 0);
+  const int has_y = (subpel_y_q4 != 0);
+
+  const InterpFilterParams *filter_params_x = NULL;
+  const InterpFilterParams *filter_params_y = NULL;
+  if (has_x) filter_params_x = &av1_intrabc_filter_params;
+  if (has_y) filter_params_y = &av1_intrabc_filter_params;
+
+  if (has_x && has_y) {
+    // Both directions are fractional: full 2D filter.
+    av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params_x, filter_params_y, 0, 0,
+                                conv_params, bd);
+    return;
+  }
+  if (has_x) {
+    // Only the horizontal direction is fractional.
+    av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, 0, 0,
+                               conv_params, bd);
+    return;
+  }
+  // Remaining case: the vertical-only kernel.
+  av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                             filter_params_x, filter_params_y, 0, 0,
+                             conv_params, bd);
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+                                   uint8_t *dst8, int dst_stride, int w, int h,
+                                   InterpFilters interp_filters,
+                                   const int subpel_x_q4, int x_step_q4,
+                                   const int subpel_y_q4, int y_step_q4,
+                                   int scaled, ConvolveParams *conv_params,
+                                   const struct scale_factors *sf,
+                                   int is_intrabc, int bd) {
+  // High-bitdepth dispatch: converts the 8-bit-typed pointers with
+  // CONVERT_TO_SHORTPTR and picks the IntraBC path, the scaled path, or one
+  // of the kernels in sf->highbd_convolve according to the arguments.
+  assert(IMPLIES(is_intrabc, !scaled));
+  (void)x_step_q4;
+  (void)y_step_q4;
+  (void)dst_stride;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+  const int has_subpel_x = (subpel_x_q4 != 0);
+  const int has_subpel_y = (subpel_y_q4 != 0);
+
+  // IntraBC with a fractional offset has its own filter selection.
+  if (is_intrabc && (has_subpel_x || has_subpel_y)) {
+    highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
+                                   subpel_x_q4, subpel_y_q4, conv_params, bd);
+    return;
+  }
+
+  // A direction gets filter parameters when it has a fractional offset or
+  // when scaling is in use; otherwise NULL is handed to the kernel.
+  const InterpFilterParams *filter_params_x = NULL;
+  const InterpFilterParams *filter_params_y = NULL;
+  if (has_subpel_x | scaled) {
+    const InterpFilter filter_x = av1_extract_interp_filter(interp_filters, 1);
+    filter_params_x = av1_get_interp_filter_params_with_block_size(filter_x, w);
+  }
+  if (has_subpel_y | scaled) {
+    const InterpFilter filter_y = av1_extract_interp_filter(interp_filters, 0);
+    filter_params_y = av1_get_interp_filter_params_with_block_size(filter_y, h);
+  }
+
+  if (!scaled) {
+    // Kernel table indexed by [x fractional][y fractional][compound].
+    sf->highbd_convolve[has_subpel_x][has_subpel_y][conv_params->is_compound](
+        src, src_stride, dst, dst_stride, w, h, filter_params_x,
+        filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
+    return;
+  }
+
+  // Compound prediction needs an intermediate buffer to write into.
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+  av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
+                               filter_params_x, filter_params_y, subpel_x_q4,
+                               x_step_q4, subpel_y_q4, y_step_q4, conv_params,
+                               bd);
+}
+
+// Note: Fixed-size intermediate buffers place limits on the parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+// (1) Interpolate horizontally into an intermediate buffer, temp.
+// (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (263):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 128x128 pixels.
+// --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
+// original frame (in 1/16th pixel units).
+// --Must round-up because block may be located at sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --ceil(((128 - 1) * 32 + 15) / 16) + 8 = 255 + 8 = 263.
+#define WIENER_MAX_EXT_SIZE 263
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+  // Dot product of SUBPEL_TAPS consecutive 8-bit samples with the taps in b.
+  int acc = 0;
+  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
+    acc += a[tap] * b[tap];
+  }
+  return acc;
+}
+
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+                                             const int16_t *b) {
+  // Dot product of SUBPEL_TAPS consecutive 16-bit samples with the taps in b.
+  int acc = 0;
+  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
+    acc += a[tap] * b[tap];
+  }
+  return acc;
+}
+
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+                                             ptrdiff_t a_stride,
+                                             const int16_t *b) {
+  // Dot product of SUBPEL_TAPS 16-bit samples spaced a_stride apart with the
+  // taps in b.
+  int acc = 0;
+  for (int tap = 0; tap < SUBPEL_TAPS; ++tap) {
+    acc += a[tap * a_stride] * b[tap];
+  }
+  return acc;
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+  // NOTE: This assumes that the filter table is 256-byte aligned.
+  // TODO(agrange) Modify to make independent of table alignment.
+  // Clearing the low 8 address bits yields the start of the containing table.
+  const intptr_t low_bits = (intptr_t)0xFF;
+  const intptr_t address = (intptr_t)filter;
+  return (const InterpKernel *)(address & ~low_bits);
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+  // Distance from base to f, counted in whole InterpKernel entries.
+  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
+  return (int)(kernel - base);
+}
+
+static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,  // Horizontal pass of the 8-bit Wiener "add src" filter; output is 16-bit intermediate data.
+ uint16_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h,
+ int round0_bits) {
+ const int bd = 8;
+ src -= SUBPEL_TAPS / 2 - 1;  // back up so the kernel window starts left of the output column
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  // integer part of x_q4 selects the source column
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];  // fractional part of x_q4 selects the kernel
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +  // the window sample at index SUBPEL_TAPS / 2 - 1 scaled by 1 << FILTER_BITS, plus a fixed bias
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,  // clamp into [0, WIENER_CLAMP_LIMIT - 1] before narrowing to uint16_t
+ WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,  // Vertical pass of the 8-bit Wiener "add src" filter: 16-bit intermediate in, 8-bit pixels out.
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h,
+ int round1_bits) {
+ const int bd = 8;
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up so the kernel window starts above the output row
+
+ for (int x = 0; x < w; ++x) {  // processed one column at a time; src and dst advance by one sample per column
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];  // integer part of y_q4 selects the source row
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];  // fractional part of y_q4 selects the kernel
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -  // the window sample at index SUBPEL_TAPS / 2 - 1 scaled by 1 << FILTER_BITS, minus a fixed bias
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h,
+                                   const ConvolveParams *conv_params) {
+  // 8-bit Wiener "add src" convolution in two passes: a horizontal pass from
+  // src into the 16-bit buffer temp (rounded by conv_params->round_0), then a
+  // vertical pass from temp into dst (rounded by conv_params->round_1).
+  //
+  // filter_x / filter_y point into kernel tables; the table base and the
+  // starting phase are recovered from the pointer value (see
+  // get_filter_base(), which assumes a 256-byte aligned table).
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
+  const int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
+  // The horizontal pass fills rows [0, intermediate_height), but the vertical
+  // pass reads SUBPEL_TAPS rows per output and therefore touches row
+  // intermediate_height as well. Zero that whole row. memset() counts bytes
+  // and temp holds uint16_t, so the length must be scaled by the element
+  // size; passing MAX_SB_SIZE alone cleared only the first half of the row.
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0,
+         MAX_SB_SIZE * sizeof(temp[0]));
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  // Start SUBPEL_TAPS / 2 - 1 rows above the block so the vertical kernel has
+  // its upper taps available in temp.
+  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                             src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
+                             x_step_q4, w, intermediate_height,
+                             conv_params->round_0);
+  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
+                            y_step_q4, w, h, conv_params->round_1);
+}
+
+static void highbd_convolve_add_src_horiz_hip(  // Horizontal pass of the high-bitdepth Wiener "add src" filter; output is 16-bit intermediate data.
+ const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int round0_bits, int bd) {
+ const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);  // exclusive upper bound for the stored intermediate values
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);  // src8 is converted to a uint16_t sample pointer by the macro
+ src -= SUBPEL_TAPS / 2 - 1;  // back up so the kernel window starts left of the output column
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  // integer part of x_q4 selects the source column
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];  // fractional part of x_q4 selects the kernel
+ const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +  // the window sample at index SUBPEL_TAPS / 2 - 1 scaled by 1 << FILTER_BITS, plus a fixed bias
+ (1 << (bd + FILTER_BITS - 1));
+ const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
+ dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
+ extraprec_clamp_limit - 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_add_src_vert_hip(  // Vertical pass of the high-bitdepth Wiener "add src" filter: 16-bit intermediate in, bd-bit pixels out.
+ const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int round1_bits, int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);  // dst8 is converted to a uint16_t sample pointer by the macro
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // back up so the kernel window starts above the output row
+ for (int x = 0; x < w; ++x) {  // processed one column at a time; src and dst advance by one sample per column
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];  // integer part of y_q4 selects the source row
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];  // fractional part of y_q4 selects the kernel
+ const int rounding =
+ ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -  // the window sample at index SUBPEL_TAPS / 2 - 1 scaled by 1 << FILTER_BITS, minus a fixed bias
+ (1 << (bd + round1_bits - 1));
+ const int sum =
+ highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void av1_highbd_wiener_convolve_add_src_c(  // High-bitdepth Wiener "add src" convolution: horizontal pass into temp, then vertical pass into dst.
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);  // table base recovered from the pointer value (assumes a 256-byte aligned table)
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);  // starting phase = index of filter_x within its table
+
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];  // intermediate rows, MAX_SB_SIZE samples per row
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;  // a full SUBPEL_TAPS extra rows, so every row the vertical pass reads is filled by the horizontal pass
+
+ assert(w <= MAX_SB_SIZE);
+ assert(h <= MAX_SB_SIZE);
+ assert(y_step_q4 <= 32);
+ assert(x_step_q4 <= 32);
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
+
+ highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),  // start SUBPEL_TAPS / 2 - 1 rows above the block
+ src_stride, temp, MAX_SB_SIZE, filters_x,
+ x0_q4, x_step_q4, w, intermediate_height,
+ conv_params->round_0, bd);
+ highbd_convolve_add_src_vert_hip(
+ temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
+ filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
+}
diff --git a/third_party/aom/av1/common/convolve.h b/third_party/aom/av1/common/convolve.h
new file mode 100644
index 0000000000..4109dd8433
--- /dev/null
+++ b/third_party/aom/av1/common/convolve.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_CONVOLVE_H_
+#define AOM_AV1_COMMON_CONVOLVE_H_
+#include "av1/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Element type of the buffer that ConvolveParams::dst points to.
+typedef uint16_t CONV_BUF_TYPE;
+// Settings bundle passed to the convolve routines. Instances are built by the
+// get_conv_params*() helpers in this header.
+typedef struct ConvolveParams {
+ int do_average;  // get_conv_params_no_round() asserts this implies is_compound
+ CONV_BUF_TYPE *dst;  // caller-supplied buffer; NULL in get_conv_params()
+ int dst_stride;  // stride that goes with dst; 0 when dst is NULL
+ int round_0;  // rounding shift in bits; starts at ROUND0_BITS/WIENER_ROUND0_BITS
+ int round_1;  // rounding shift in bits; derived from round_0 or COMPOUND_ROUND1_BITS
+ int plane;
+ int is_compound;  // selects COMPOUND_ROUND1_BITS for round_1
+ int use_jnt_comp_avg;  // NOTE(review): usage is not visible in this header
+ int fwd_offset;  // NOTE(review): usage is not visible in this header
+ int bck_offset;  // NOTE(review): usage is not visible in this header
+} ConvolveParams;
+
+// Initial value of ConvolveParams::round_0 for the regular convolve path.
+#define ROUND0_BITS 3
+// Value of ConvolveParams::round_1 when is_compound is set.
+#define COMPOUND_ROUND1_BITS 7
+// Initial value of ConvolveParams::round_0 for the Wiener filter path.
+#define WIENER_ROUND0_BITS 3
+
+// Evaluates to 1 << (bd + 1 + FILTER_BITS - r0). Both arguments are
+// parenthesized so that an expression argument (e.g. `a + b` passed as r0)
+// is subtracted as a whole instead of being split by operator precedence.
+#define WIENER_CLAMP_LIMIT(r0, bd) (1 << ((bd) + 1 + FILTER_BITS - (r0)))
+
+// Function-pointer type for convolve kernels operating on uint8_t pixels.
+// NOTE(review): the meaning of each argument is inferred from its name only;
+// confirm against the kernel implementations.
+typedef void (*aom_convolve_fn_t)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params);
+
+// Same shape as aom_convolve_fn_t but for uint16_t pixels, with an extra
+// trailing `bd` argument (presumably the bit depth — confirm).
+typedef void (*aom_highbd_convolve_fn_t)(
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd);
+
+struct AV1Common;
+struct scale_factors;
+
+void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ InterpFilters interp_filters, const int subpel_x_q4,
+ int x_step_q4, const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf, int is_intrabc);
+
+// Builds a fully initialized ConvolveParams.
+//
+// round_0 starts at ROUND0_BITS; round_1 is COMPOUND_ROUND1_BITS when
+// is_compound is set, otherwise 2 * FILTER_BITS - round_0. If the
+// intermediate range (bd + FILTER_BITS - round_0 + 2) exceeds 16 bits, the
+// excess is moved into round_0 and, for the non-compound case, taken out of
+// round_1.
+//
+// do_average may only be nonzero when is_compound is nonzero (asserted).
+// dst/dst_stride/plane are stored as given. The remaining members
+// (use_jnt_comp_avg, fwd_offset, bck_offset) are set to 0 so that the struct
+// returned by value never carries indeterminate members; callers that need
+// other values overwrite them afterwards.
+static INLINE ConvolveParams get_conv_params_no_round(int do_average, int plane,
+                                                      CONV_BUF_TYPE *dst,
+                                                      int dst_stride,
+                                                      int is_compound, int bd) {
+  ConvolveParams conv_params;
+  conv_params.do_average = do_average;
+  assert(IMPLIES(do_average, is_compound));
+  conv_params.is_compound = is_compound;
+  conv_params.round_0 = ROUND0_BITS;
+  conv_params.round_1 = is_compound ? COMPOUND_ROUND1_BITS
+                                    : 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    if (!is_compound) conv_params.round_1 -= intbufrange - 16;
+  }
+  // TODO(yunqing): The following dst should only be valid while
+  // is_compound = 1;
+  conv_params.dst = dst;
+  conv_params.dst_stride = dst_stride;
+  conv_params.plane = plane;
+  // Defaults for the members this helper has no argument for.
+  conv_params.use_jnt_comp_avg = 0;
+  conv_params.fwd_offset = 0;
+  conv_params.bck_offset = 0;
+  return conv_params;
+}
+
+// Shorthand for get_conv_params_no_round() without a destination buffer
+// (dst = NULL, dst_stride = 0) and with is_compound = 0.
+static INLINE ConvolveParams get_conv_params(int do_average, int plane,
+                                             int bd) {
+  CONV_BUF_TYPE *const no_dst = NULL;
+  const int no_dst_stride = 0;
+  const int not_compound = 0;
+  return get_conv_params_no_round(do_average, plane, no_dst, no_dst_stride,
+                                  not_compound, bd);
+}
+
+// Builds a fully initialized ConvolveParams for the Wiener filter path.
+//
+// round_0 starts at WIENER_ROUND0_BITS and round_1 at
+// 2 * FILTER_BITS - round_0. If the intermediate range
+// (bd + FILTER_BITS - round_0 + 2) exceeds 16 bits, the excess is moved from
+// round_1 into round_0. All other members are set to 0/NULL, including
+// use_jnt_comp_avg, fwd_offset and bck_offset, so the struct returned by
+// value has no indeterminate members.
+static INLINE ConvolveParams get_conv_params_wiener(int bd) {
+  ConvolveParams conv_params;
+  conv_params.do_average = 0;
+  conv_params.is_compound = 0;
+  conv_params.round_0 = WIENER_ROUND0_BITS;
+  conv_params.round_1 = 2 * FILTER_BITS - conv_params.round_0;
+  const int intbufrange = bd + FILTER_BITS - conv_params.round_0 + 2;
+  assert(IMPLIES(bd < 12, intbufrange <= 16));
+  if (intbufrange > 16) {
+    conv_params.round_0 += intbufrange - 16;
+    conv_params.round_1 -= intbufrange - 16;
+  }
+  conv_params.dst = NULL;
+  conv_params.dst_stride = 0;
+  conv_params.plane = 0;
+  conv_params.use_jnt_comp_avg = 0;
+  conv_params.fwd_offset = 0;
+  conv_params.bck_offset = 0;
+  return conv_params;
+}
+
+void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ InterpFilters interp_filters,
+ const int subpel_x_q4, int x_step_q4,
+ const int subpel_y_q4, int y_step_q4,
+ int scaled, ConvolveParams *conv_params,
+ const struct scale_factors *sf,
+ int is_intrabc, int bd);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_CONVOLVE_H_
diff --git a/third_party/aom/av1/common/debugmodes.c b/third_party/aom/av1/common/debugmodes.c
new file mode 100644
index 0000000000..868f341b5b
--- /dev/null
+++ b/third_party/aom/av1/common/debugmodes.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+
+// Writes `str` to `f`, followed by a "(Frame N, Show:S, Q:Q): " banner and a
+// newline built from cm->current_video_frame, cm->show_frame and
+// cm->base_qindex.
+static void log_frame_info(AV1_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+ cm->show_frame, cm->base_qindex);
+}
+/* Prints one member of every MB_MODE_INFO reachable through
+ * cm->mi_grid_visible, as a grid of cm->mi_rows lines with cm->mi_cols values
+ * each. The member is located by its byte offset inside MB_MODE_INFO
+ * (member_offset) and is read as a single char, printed as an integer.
+ * The frame banner is printed first, and every output row is prefixed with
+ * the first character of `descriptor`.
+ */
+static void print_mi_data(AV1_COMMON *cm, FILE *file, const char *descriptor,
+                          size_t member_offset) {
+  MB_MODE_INFO **grid = cm->mi_grid_visible;
+  const int num_rows = cm->mi_rows;
+  const int num_cols = cm->mi_cols;
+  const char row_tag = descriptor[0];
+
+  log_frame_info(cm, descriptor, file);
+  for (int r = 0; r < num_rows; ++r) {
+    fprintf(file, "%c ", row_tag);
+    for (int c = 0; c < num_cols; ++c, ++grid) {
+      // Address of the requested member inside the current mode-info entry.
+      const char *const member = (const char *)grid[0] + member_offset;
+      fprintf(file, "%2d ", *member);
+    }
+    fprintf(file, "\n");
+    // NOTE(review): after walking num_cols entries the pointer skips a further
+    // MAX_MIB_SIZE entries; verify that this matches the grid's row stride.
+    grid += MAX_MIB_SIZE;
+  }
+  fprintf(file, "\n");
+}
+
+// Appends a textual dump of the current frame's mode info to the file named
+// `file`: partition sizes, prediction modes, first reference frame, transform
+// sizes, UV modes, skip flags and the first motion vector of every entry in
+// cm->mi_grid_visible.
+//
+// If the file cannot be opened the function returns without writing anything
+// (previously a NULL FILE pointer was passed on to fprintf/fclose).
+void av1_print_modes_and_motion_vectors(AV1_COMMON *cm, const char *file) {
+  FILE *mvs = fopen(file, "a");
+  if (mvs == NULL) return;
+
+  MB_MODE_INFO **mi = cm->mi_grid_visible;
+  const int rows = cm->mi_rows;
+  const int cols = cm->mi_cols;
+
+  print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+  print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+  print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+  print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, tx_size));
+  print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+  // output skip information.
+  log_frame_info(cm, "Skips:", mvs);
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "S ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%2d ", mi[0]->skip);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += MAX_MIB_SIZE;
+  }
+  fprintf(mvs, "\n");
+
+  // output motion vectors.
+  log_frame_info(cm, "Vectors ", mvs);
+  // Rewind to the start of the grid; the skip loop above advanced `mi`.
+  mi = cm->mi_grid_visible;
+  for (int mi_row = 0; mi_row < rows; mi_row++) {
+    fprintf(mvs, "V ");
+    for (int mi_col = 0; mi_col < cols; mi_col++) {
+      fprintf(mvs, "%4d:%4d ", mi[0]->mv[0].as_mv.row, mi[0]->mv[0].as_mv.col);
+      mi++;
+    }
+    fprintf(mvs, "\n");
+    mi += MAX_MIB_SIZE;
+  }
+  fprintf(mvs, "\n");
+
+  fclose(mvs);
+}
+
+// Writes the `size` bytes at `data` to the file named `filename`, replacing
+// any previous contents.
+//
+// The file is opened in binary mode so the raw bytes are not subject to
+// text-mode newline translation. Nothing is written when the file cannot be
+// opened; an empty file is produced when `data` is NULL or `size` is not
+// positive (a negative size would otherwise convert to a huge size_t).
+void av1_print_uncompressed_frame_header(const uint8_t *data, int size,
+                                         const char *filename) {
+  FILE *hdrFile = fopen(filename, "wb");
+  if (hdrFile == NULL) return;
+
+  if (data != NULL && size > 0) {
+    // fwrite(ptr, element_size, element_count, stream)
+    fwrite(data, sizeof(uint8_t), (size_t)size, hdrFile);
+  }
+  fclose(hdrFile);
+}
+
+// Dumps the frame context to the file named `filename` as a sequence of
+// space-separated decimal numbers, treating *fc as an array of
+// sizeof(FRAME_CONTEXT) / sizeof(uint16_t) uint16_t values.
+//
+// Returns without writing if the file cannot be opened.
+void av1_print_frame_contexts(const FRAME_CONTEXT *fc, const char *filename) {
+  FILE *fcFile = fopen(filename, "w");
+  if (fcFile == NULL) return;
+
+  // Keep the const qualifier of `fc` when viewing it as uint16_t words.
+  const uint16_t *fcp = (const uint16_t *)fc;
+  const unsigned int n_contexts = sizeof(FRAME_CONTEXT) / sizeof(uint16_t);
+
+  for (unsigned int i = 0; i < n_contexts; ++i) fprintf(fcFile, "%d ", *fcp++);
+  fclose(fcFile);
+}
diff --git a/third_party/aom/av1/common/entropy.c b/third_party/aom/av1/common/entropy.c
new file mode 100644
index 0000000000..4f95ef69b0
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/scan.h"
+#include "av1/common/token_cdfs.h"
+#include "av1/common/txb_common.h"
+
+// Maps q to a context in 0..3:
+//   q <= 20 -> 0, q <= 60 -> 1, q <= 120 -> 2, otherwise 3.
+static int get_q_ctx(int q) {
+  // Every threshold that q exceeds raises the context by one.
+  return (q > 20) + (q > 60) + (q > 120);
+}
+
+// Copies the default coefficient CDF tables into cm->fc. Each default table
+// is indexed by the context that get_q_ctx() derives from cm->base_qindex.
+void av1_default_coef_probs(AV1_COMMON *cm) {
+  const int q_ctx = get_q_ctx(cm->base_qindex);
+#if CONFIG_ENTROPY_STATS
+  cm->coef_cdf_category = q_ctx;
+#endif
+
+  // EOB flag CDFs, one table per size suffix.
+  av1_copy(cm->fc->eob_flag_cdf16, av1_default_eob_multi16_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf32, av1_default_eob_multi32_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf64, av1_default_eob_multi64_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf128, av1_default_eob_multi128_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf256, av1_default_eob_multi256_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf512, av1_default_eob_multi512_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_flag_cdf1024, av1_default_eob_multi1024_cdfs[q_ctx]);
+  av1_copy(cm->fc->eob_extra_cdf, av1_default_eob_extra_cdfs[q_ctx]);
+
+  // Coefficient level CDFs.
+  av1_copy(cm->fc->coeff_base_eob_cdf,
+           av1_default_coeff_base_eob_multi_cdfs[q_ctx]);
+  av1_copy(cm->fc->coeff_base_cdf, av1_default_coeff_base_multi_cdfs[q_ctx]);
+  av1_copy(cm->fc->coeff_br_cdf, av1_default_coeff_lps_multi_cdfs[q_ctx]);
+
+  // Block skip and DC sign CDFs.
+  av1_copy(cm->fc->txb_skip_cdf, av1_default_txb_skip_cdfs[q_ctx]);
+  av1_copy(cm->fc->dc_sign_cdf, av1_default_dc_sign_cdfs[q_ctx]);
+}
+
+// Treats cdf_ptr as num_cdfs consecutive CDFs spaced cdf_stride elements
+// apart and writes 0 to element `nsymbs` of each of them.
+static void reset_cdf_symbol_counter(aom_cdf_prob *cdf_ptr, int num_cdfs,
+                                     int cdf_stride, int nsymbs) {
+  int counter_pos = nsymbs;  // index of the current CDF's counter slot
+  int remaining = num_cdfs;
+  while (remaining > 0) {
+    cdf_ptr[counter_pos] = 0;
+    counter_pos += cdf_stride;
+    --remaining;
+  }
+}
+
+// Zeroes the counter slot (element `nsymbs`) of every CDF stored in the array
+// `cname`, assuming consecutive CDFs are CDF_SIZE(nsymbs) elements apart.
+#define RESET_CDF_COUNTER(cname, nsymbs) \
+  RESET_CDF_COUNTER_STRIDE(cname, nsymbs, CDF_SIZE(nsymbs))
+
+// Same as RESET_CDF_COUNTER, but with an explicit element stride between
+// consecutive CDFs, for arrays whose rows are wider than CDF_SIZE(nsymbs).
+// `cname` must be an actual array (not a pointer): the number of CDFs is
+// derived from sizeof(cname). Macro arguments are parenthesized so expression
+// arguments keep their meaning, and the int cast covers the whole
+// element-count quotient.
+#define RESET_CDF_COUNTER_STRIDE(cname, nsymbs, cdf_stride)              \
+  do {                                                                   \
+    aom_cdf_prob *cdf_ptr = (aom_cdf_prob *)(cname);                     \
+    int array_size = (int)(sizeof(cname) / sizeof(aom_cdf_prob));        \
+    int num_cdfs = array_size / (cdf_stride);                            \
+    reset_cdf_symbol_counter(cdf_ptr, num_cdfs, (cdf_stride), (nsymbs)); \
+  } while (0)
+
+// Resets the counter slot of every CDF in a motion-vector context: the joints
+// CDF, then the per-component CDFs of both entries of nmv->comps. The second
+// macro argument is the symbol count of the CDF being reset.
+static void reset_nmv_counter(nmv_context *nmv) {
+ RESET_CDF_COUNTER(nmv->joints_cdf, 4);
+ for (int i = 0; i < 2; i++) {
+ RESET_CDF_COUNTER(nmv->comps[i].classes_cdf, MV_CLASSES);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].fp_cdf, MV_FP_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].sign_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].hp_cdf, 2);
+ RESET_CDF_COUNTER(nmv->comps[i].class0_cdf, CLASS0_SIZE);
+ RESET_CDF_COUNTER(nmv->comps[i].bits_cdf, 2);
+ }
+}
+
+// Resets the counter slot of every CDF in the frame context `fc`.
+//
+// Each RESET_CDF_COUNTER(array, nsymbs) call zeroes element `nsymbs` of every
+// CDF in `array`, with CDFs spaced CDF_SIZE(nsymbs) elements apart.
+// RESET_CDF_COUNTER_STRIDE is used where a CDF has fewer symbols than the
+// array row was sized for, so the row stride must be given explicitly.
+void av1_reset_cdf_symbol_counters(FRAME_CONTEXT *fc) {
+ RESET_CDF_COUNTER(fc->txb_skip_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_extra_cdf, 2);
+ RESET_CDF_COUNTER(fc->dc_sign_cdf, 2);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf16, 5);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf32, 6);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf64, 7);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf128, 8);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf256, 9);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf512, 10);
+ RESET_CDF_COUNTER(fc->eob_flag_cdf1024, 11);
+ RESET_CDF_COUNTER(fc->coeff_base_eob_cdf, 3);
+ RESET_CDF_COUNTER(fc->coeff_base_cdf, 4);
+ RESET_CDF_COUNTER(fc->coeff_br_cdf, BR_CDF_SIZE);
+ RESET_CDF_COUNTER(fc->newmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->zeromv_cdf, 2);
+ RESET_CDF_COUNTER(fc->refmv_cdf, 2);
+ RESET_CDF_COUNTER(fc->drl_cdf, 2);
+ RESET_CDF_COUNTER(fc->inter_compound_mode_cdf, INTER_COMPOUND_MODES);
+ RESET_CDF_COUNTER(fc->compound_type_cdf, COMPOUND_TYPES - 1);
+ RESET_CDF_COUNTER(fc->wedge_idx_cdf, 16);
+ RESET_CDF_COUNTER(fc->interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->wedge_interintra_cdf, 2);
+ RESET_CDF_COUNTER(fc->interintra_mode_cdf, INTERINTRA_MODES);
+ RESET_CDF_COUNTER(fc->motion_mode_cdf, MOTION_MODES);
+ RESET_CDF_COUNTER(fc->obmc_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_y_size_cdf, PALETTE_SIZES);
+ RESET_CDF_COUNTER(fc->palette_uv_size_cdf, PALETTE_SIZES);
+ // Entry j of the palette color-index CDFs uses j + PALETTE_MIN_SIZE symbols,
+ // while every row is CDF_SIZE(PALETTE_COLORS) elements wide.
+ for (int j = 0; j < PALETTE_SIZES; j++) {
+ int nsymbs = j + PALETTE_MIN_SIZE;
+ RESET_CDF_COUNTER_STRIDE(fc->palette_y_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ RESET_CDF_COUNTER_STRIDE(fc->palette_uv_color_index_cdf[j], nsymbs,
+ CDF_SIZE(PALETTE_COLORS));
+ }
+ RESET_CDF_COUNTER(fc->palette_y_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->palette_uv_mode_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_inter_cdf, 2);
+ RESET_CDF_COUNTER(fc->single_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_type_cdf, 2);
+ RESET_CDF_COUNTER(fc->uni_comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_ref_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_bwdref_cdf, 2);
+ RESET_CDF_COUNTER(fc->txfm_partition_cdf, 2);
+ RESET_CDF_COUNTER(fc->compound_index_cdf, 2);
+ RESET_CDF_COUNTER(fc->comp_group_idx_cdf, 2);
+ RESET_CDF_COUNTER(fc->skip_mode_cdfs, 2);
+ RESET_CDF_COUNTER(fc->skip_cdfs, 2);
+ RESET_CDF_COUNTER(fc->intra_inter_cdf, 2);
+ // Both motion-vector contexts in fc (nmvc and ndvc) share one reset routine.
+ reset_nmv_counter(&fc->nmvc);
+ reset_nmv_counter(&fc->ndvc);
+ RESET_CDF_COUNTER(fc->intrabc_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.tree_cdf, MAX_SEGMENTS);
+ RESET_CDF_COUNTER(fc->seg.pred_cdf, 2);
+ RESET_CDF_COUNTER(fc->seg.spatial_pred_seg_cdf, MAX_SEGMENTS);
+ RESET_CDF_COUNTER(fc->filter_intra_cdfs, 2);
+ RESET_CDF_COUNTER(fc->filter_intra_mode_cdf, FILTER_INTRA_MODES);
+ RESET_CDF_COUNTER(fc->switchable_restore_cdf, RESTORE_SWITCHABLE_TYPES);
+ RESET_CDF_COUNTER(fc->wiener_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->sgrproj_restore_cdf, 2);
+ RESET_CDF_COUNTER(fc->y_mode_cdf, INTRA_MODES);
+ // uv_mode_cdf[0] holds CDFs with one symbol fewer than uv_mode_cdf[1], but
+ // both use rows of CDF_SIZE(UV_INTRA_MODES) elements.
+ RESET_CDF_COUNTER_STRIDE(fc->uv_mode_cdf[0], UV_INTRA_MODES - 1,
+ CDF_SIZE(UV_INTRA_MODES));
+ RESET_CDF_COUNTER(fc->uv_mode_cdf[1], UV_INTRA_MODES);
+ // Partition CDFs: contexts 0..3 use 4 symbols, 4..15 use 10 symbols and the
+ // remaining contexts use 8 symbols; every row is CDF_SIZE(10) elements wide.
+ for (int i = 0; i < PARTITION_CONTEXTS; i++) {
+ if (i < 4) {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 4, CDF_SIZE(10));
+ } else if (i < 16) {
+ RESET_CDF_COUNTER(fc->partition_cdf[i], 10);
+ } else {
+ RESET_CDF_COUNTER_STRIDE(fc->partition_cdf[i], 8, CDF_SIZE(10));
+ }
+ }
+ RESET_CDF_COUNTER(fc->switchable_interp_cdf, SWITCHABLE_FILTERS);
+ RESET_CDF_COUNTER(fc->kf_y_cdf, INTRA_MODES);
+ RESET_CDF_COUNTER(fc->angle_delta_cdf, 2 * MAX_ANGLE_DELTA + 1);
+ // tx_size_cdf[0] uses MAX_TX_DEPTH symbols; entries 1..3 use one more.
+ RESET_CDF_COUNTER_STRIDE(fc->tx_size_cdf[0], MAX_TX_DEPTH,
+ CDF_SIZE(MAX_TX_DEPTH + 1));
+ RESET_CDF_COUNTER(fc->tx_size_cdf[1], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[2], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->tx_size_cdf[3], MAX_TX_DEPTH + 1);
+ RESET_CDF_COUNTER(fc->delta_q_cdf, DELTA_Q_PROBS + 1);
+ RESET_CDF_COUNTER(fc->delta_lf_cdf, DELTA_LF_PROBS + 1);
+ for (int i = 0; i < FRAME_LF_COUNT; i++) {
+ RESET_CDF_COUNTER(fc->delta_lf_multi_cdf[i], DELTA_LF_PROBS + 1);
+ }
+ // Extended transform-type CDFs: each set has its own symbol count but all
+ // rows are CDF_SIZE(TX_TYPES) wide. Set index 0 is not reset here.
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[1], 7, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->intra_ext_tx_cdf[2], 5, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[1], 16, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[2], 12, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER_STRIDE(fc->inter_ext_tx_cdf[3], 2, CDF_SIZE(TX_TYPES));
+ RESET_CDF_COUNTER(fc->cfl_sign_cdf, CFL_JOINT_SIGNS);
+ RESET_CDF_COUNTER(fc->cfl_alpha_cdf, CFL_ALPHABET_SIZE);
+}
diff --git a/third_party/aom/av1/common/entropy.h b/third_party/aom/av1/common/entropy.h
new file mode 100644
index 0000000000..991692c2f0
--- /dev/null
+++ b/third_party/aom/av1/common/entropy.h
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPY_H_
+#define AOM_AV1_COMMON_ENTROPY_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/prob.h"
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define TOKEN_CDF_Q_CTXS 4
+
+#define TXB_SKIP_CONTEXTS 13
+
+#define EOB_COEF_CONTEXTS 9
+
+#define SIG_COEF_CONTEXTS_2D 26
+#define SIG_COEF_CONTEXTS_1D 16
+#define SIG_COEF_CONTEXTS_EOB 4
+#define SIG_COEF_CONTEXTS (SIG_COEF_CONTEXTS_2D + SIG_COEF_CONTEXTS_1D)
+
+#define COEFF_BASE_CONTEXTS (SIG_COEF_CONTEXTS)
+#define DC_SIGN_CONTEXTS 3
+
+#define BR_TMP_OFFSET 12
+#define BR_REF_CAT 4
+#define LEVEL_CONTEXTS 21
+
+#define NUM_BASE_LEVELS 2
+
+#define BR_CDF_SIZE (4)
+#define COEFF_BASE_RANGE (4 * (BR_CDF_SIZE - 1))
+
+#define COEFF_CONTEXT_BITS 6
+#define COEFF_CONTEXT_MASK ((1 << COEFF_CONTEXT_BITS) - 1)
+#define MAX_BASE_BR_RANGE (COEFF_BASE_RANGE + NUM_BASE_LEVELS + 1)
+
+#define BASE_CONTEXT_POSITION_NUM 12
+
+// Transform class identifiers. TX_CLASSES is the number of classes, not a
+// class of its own. NOTE(review): judging by the names these distinguish 2-D
+// transforms from 1-D horizontal/vertical ones — confirm against the users.
+typedef enum TX_CLASS {
+ TX_CLASS_2D = 0,
+ TX_CLASS_HORIZ = 1,
+ TX_CLASS_VERT = 2,
+ TX_CLASSES = 3,
+} TX_CLASS;
+
+#define DCT_MAX_VALUE 16384
+#define DCT_MAX_VALUE_HIGH10 65536
+#define DCT_MAX_VALUE_HIGH12 262144
+
+/* Coefficients are predicted via a 3-dimensional probability table indexed on
+ * REF_TYPES, COEF_BANDS and COEF_CONTEXTS. */
+#define REF_TYPES 2 // intra=0, inter=1
+
+struct AV1Common;
+struct frame_contexts;
+void av1_reset_cdf_symbol_counters(struct frame_contexts *fc);
+void av1_default_coef_probs(struct AV1Common *cm);
+
+struct frame_contexts;
+
+typedef char ENTROPY_CONTEXT;
+
+// Returns how many of the two contexts are nonzero (0, 1 or 2).
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  int nonzero_count = 0;
+  if (a) ++nonzero_count;
+  if (b) ++nonzero_count;
+  return nonzero_count;
+}
+
+// Returns 1 if any of the first `n` entries of `ctx` is nonzero, else 0.
+// The entries are read one at a time through their own type, so no alignment
+// or aliasing assumptions are made about the context arrays.
+static INLINE int has_nonzero_entropy_ctx(const ENTROPY_CONTEXT *ctx, int n) {
+  for (int i = 0; i < n; ++i) {
+    if (ctx[i] != 0) return 1;
+  }
+  return 0;
+}
+
+// Computes the combined entropy context (0, 1 or 2) for a transform block.
+//
+// `a` and `l` point into the above and left context arrays. Depending on
+// tx_size, 1, 2, 4, 8 or 16 entries are inspected on each side (the first
+// dimension in the TX_WxH name selects the count for `a`, the second for
+// `l`); a side contributes 1 when any inspected entry is nonzero.
+//
+// The entries were previously read through uint16_t/uint32_t/uint64_t pointer
+// casts, which accesses char objects through an incompatible type and may be
+// misaligned. The byte-wise check yields the same result without either
+// problem.
+static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
+                                      const ENTROPY_CONTEXT *l) {
+  ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
+
+  switch (tx_size) {
+    case TX_4X4:
+      above_ec = has_nonzero_entropy_ctx(a, 1);
+      left_ec = has_nonzero_entropy_ctx(l, 1);
+      break;
+    case TX_4X8:
+      above_ec = has_nonzero_entropy_ctx(a, 1);
+      left_ec = has_nonzero_entropy_ctx(l, 2);
+      break;
+    case TX_8X4:
+      above_ec = has_nonzero_entropy_ctx(a, 2);
+      left_ec = has_nonzero_entropy_ctx(l, 1);
+      break;
+    case TX_8X16:
+      above_ec = has_nonzero_entropy_ctx(a, 2);
+      left_ec = has_nonzero_entropy_ctx(l, 4);
+      break;
+    case TX_16X8:
+      above_ec = has_nonzero_entropy_ctx(a, 4);
+      left_ec = has_nonzero_entropy_ctx(l, 2);
+      break;
+    case TX_16X32:
+      above_ec = has_nonzero_entropy_ctx(a, 4);
+      left_ec = has_nonzero_entropy_ctx(l, 8);
+      break;
+    case TX_32X16:
+      above_ec = has_nonzero_entropy_ctx(a, 8);
+      left_ec = has_nonzero_entropy_ctx(l, 4);
+      break;
+    case TX_8X8:
+      above_ec = has_nonzero_entropy_ctx(a, 2);
+      left_ec = has_nonzero_entropy_ctx(l, 2);
+      break;
+    case TX_16X16:
+      above_ec = has_nonzero_entropy_ctx(a, 4);
+      left_ec = has_nonzero_entropy_ctx(l, 4);
+      break;
+    case TX_32X32:
+      above_ec = has_nonzero_entropy_ctx(a, 8);
+      left_ec = has_nonzero_entropy_ctx(l, 8);
+      break;
+    case TX_64X64:
+      above_ec = has_nonzero_entropy_ctx(a, 16);
+      left_ec = has_nonzero_entropy_ctx(l, 16);
+      break;
+    case TX_32X64:
+      above_ec = has_nonzero_entropy_ctx(a, 8);
+      left_ec = has_nonzero_entropy_ctx(l, 16);
+      break;
+    case TX_64X32:
+      above_ec = has_nonzero_entropy_ctx(a, 16);
+      left_ec = has_nonzero_entropy_ctx(l, 8);
+      break;
+    case TX_4X16:
+      above_ec = has_nonzero_entropy_ctx(a, 1);
+      left_ec = has_nonzero_entropy_ctx(l, 4);
+      break;
+    case TX_16X4:
+      above_ec = has_nonzero_entropy_ctx(a, 4);
+      left_ec = has_nonzero_entropy_ctx(l, 1);
+      break;
+    case TX_8X32:
+      above_ec = has_nonzero_entropy_ctx(a, 2);
+      left_ec = has_nonzero_entropy_ctx(l, 8);
+      break;
+    case TX_32X8:
+      above_ec = has_nonzero_entropy_ctx(a, 8);
+      left_ec = has_nonzero_entropy_ctx(l, 2);
+      break;
+    case TX_16X64:
+      above_ec = has_nonzero_entropy_ctx(a, 4);
+      left_ec = has_nonzero_entropy_ctx(l, 16);
+      break;
+    case TX_64X16:
+      above_ec = has_nonzero_entropy_ctx(a, 16);
+      left_ec = has_nonzero_entropy_ctx(l, 4);
+      break;
+    default: assert(0 && "Invalid transform size."); break;
+  }
+  return combine_entropy_contexts(above_ec, left_ec);
+}
+
+// Returns the rounded-up average of txsize_sqr_map[txsize] and
+// txsize_sqr_up_map[txsize], as a TX_SIZE.
+static INLINE TX_SIZE get_txsize_entropy_ctx(TX_SIZE txsize) {
+  const int sqr = txsize_sqr_map[txsize];
+  const int sqr_up = txsize_sqr_up_map[txsize];
+  const int rounded_mean = (sqr + sqr_up + 1) >> 1;
+  return (TX_SIZE)rounded_mean;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPY_H_
diff --git a/third_party/aom/av1/common/entropymode.c b/third_party/aom/av1/common/entropymode.c
new file mode 100644
index 0000000000..41dc30ddb8
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/reconinter.h"
+#include "av1/common/scan.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+
+static const aom_cdf_prob
+ default_kf_y_mode_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][CDF_SIZE(
+ INTRA_MODES)] = {
+ { { AOM_CDF13(15588, 17027, 19338, 20218, 20682, 21110, 21825, 23244,
+ 24189, 28165, 29093, 30466) },
+ { AOM_CDF13(12016, 18066, 19516, 20303, 20719, 21444, 21888, 23032,
+ 24434, 28658, 30172, 31409) },
+ { AOM_CDF13(10052, 10771, 22296, 22788, 23055, 23239, 24133, 25620,
+ 26160, 29336, 29929, 31567) },
+ { AOM_CDF13(14091, 15406, 16442, 18808, 19136, 19546, 19998, 22096,
+ 24746, 29585, 30958, 32462) },
+ { AOM_CDF13(12122, 13265, 15603, 16501, 18609, 20033, 22391, 25583,
+ 26437, 30261, 31073, 32475) } },
+ { { AOM_CDF13(10023, 19585, 20848, 21440, 21832, 22760, 23089, 24023,
+ 25381, 29014, 30482, 31436) },
+ { AOM_CDF13(5983, 24099, 24560, 24886, 25066, 25795, 25913, 26423,
+ 27610, 29905, 31276, 31794) },
+ { AOM_CDF13(7444, 12781, 20177, 20728, 21077, 21607, 22170, 23405,
+ 24469, 27915, 29090, 30492) },
+ { AOM_CDF13(8537, 14689, 15432, 17087, 17408, 18172, 18408, 19825,
+ 24649, 29153, 31096, 32210) },
+ { AOM_CDF13(7543, 14231, 15496, 16195, 17905, 20717, 21984, 24516,
+ 26001, 29675, 30981, 31994) } },
+ { { AOM_CDF13(12613, 13591, 21383, 22004, 22312, 22577, 23401, 25055,
+ 25729, 29538, 30305, 32077) },
+ { AOM_CDF13(9687, 13470, 18506, 19230, 19604, 20147, 20695, 22062,
+ 23219, 27743, 29211, 30907) },
+ { AOM_CDF13(6183, 6505, 26024, 26252, 26366, 26434, 27082, 28354, 28555,
+ 30467, 30794, 32086) },
+ { AOM_CDF13(10718, 11734, 14954, 17224, 17565, 17924, 18561, 21523,
+ 23878, 28975, 30287, 32252) },
+ { AOM_CDF13(9194, 9858, 16501, 17263, 18424, 19171, 21563, 25961, 26561,
+ 30072, 30737, 32463) } },
+ { { AOM_CDF13(12602, 14399, 15488, 18381, 18778, 19315, 19724, 21419,
+ 25060, 29696, 30917, 32409) },
+ { AOM_CDF13(8203, 13821, 14524, 17105, 17439, 18131, 18404, 19468,
+ 25225, 29485, 31158, 32342) },
+ { AOM_CDF13(8451, 9731, 15004, 17643, 18012, 18425, 19070, 21538, 24605,
+ 29118, 30078, 32018) },
+ { AOM_CDF13(7714, 9048, 9516, 16667, 16817, 16994, 17153, 18767, 26743,
+ 30389, 31536, 32528) },
+ { AOM_CDF13(8843, 10280, 11496, 15317, 16652, 17943, 19108, 22718,
+ 25769, 29953, 30983, 32485) } },
+ { { AOM_CDF13(12578, 13671, 15979, 16834, 19075, 20913, 22989, 25449,
+ 26219, 30214, 31150, 32477) },
+ { AOM_CDF13(9563, 13626, 15080, 15892, 17756, 20863, 22207, 24236,
+ 25380, 29653, 31143, 32277) },
+ { AOM_CDF13(8356, 8901, 17616, 18256, 19350, 20106, 22598, 25947, 26466,
+ 29900, 30523, 32261) },
+ { AOM_CDF13(10835, 11815, 13124, 16042, 17018, 18039, 18947, 22753,
+ 24615, 29489, 30883, 32482) },
+ { AOM_CDF13(7618, 8288, 9859, 10509, 15386, 18657, 22903, 28776, 29180,
+ 31355, 31802, 32593) } }
+ };
+
+static const aom_cdf_prob default_angle_delta_cdf[DIRECTIONAL_MODES][CDF_SIZE(
+ 2 * MAX_ANGLE_DELTA + 1)] = {
+ { AOM_CDF7(2180, 5032, 7567, 22776, 26989, 30217) },
+ { AOM_CDF7(2301, 5608, 8801, 23487, 26974, 30330) },
+ { AOM_CDF7(3780, 11018, 13699, 19354, 23083, 31286) },
+ { AOM_CDF7(4581, 11226, 15147, 17138, 21834, 28397) },
+ { AOM_CDF7(1737, 10927, 14509, 19588, 22745, 28823) },
+ { AOM_CDF7(2664, 10176, 12485, 17650, 21600, 30495) },
+ { AOM_CDF7(2240, 11096, 15453, 20341, 22561, 28917) },
+ { AOM_CDF7(3605, 10428, 12459, 17676, 21244, 30655) }
+};
+
+static const aom_cdf_prob default_if_y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ INTRA_MODES)] = { { AOM_CDF13(22801, 23489, 24293, 24756, 25601, 26123,
+ 26606, 27418, 27945, 29228, 29685, 30349) },
+ { AOM_CDF13(18673, 19845, 22631, 23318, 23950, 24649,
+ 25527, 27364, 28152, 29701, 29984, 30852) },
+ { AOM_CDF13(19770, 20979, 23396, 23939, 24241, 24654,
+ 25136, 27073, 27830, 29360, 29730, 30659) },
+ { AOM_CDF13(20155, 21301, 22838, 23178, 23261, 23533,
+ 23703, 24804, 25352, 26575, 27016, 28049) } };
+
+static const aom_cdf_prob
+ default_uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES][CDF_SIZE(
+ UV_INTRA_MODES)] = {
+ { { AOM_CDF13(22631, 24152, 25378, 25661, 25986, 26520, 27055, 27923,
+ 28244, 30059, 30941, 31961) },
+ { AOM_CDF13(9513, 26881, 26973, 27046, 27118, 27664, 27739, 27824,
+ 28359, 29505, 29800, 31796) },
+ { AOM_CDF13(9845, 9915, 28663, 28704, 28757, 28780, 29198, 29822, 29854,
+ 30764, 31777, 32029) },
+ { AOM_CDF13(13639, 13897, 14171, 25331, 25606, 25727, 25953, 27148,
+ 28577, 30612, 31355, 32493) },
+ { AOM_CDF13(9764, 9835, 9930, 9954, 25386, 27053, 27958, 28148, 28243,
+ 31101, 31744, 32363) },
+ { AOM_CDF13(11825, 13589, 13677, 13720, 15048, 29213, 29301, 29458,
+ 29711, 31161, 31441, 32550) },
+ { AOM_CDF13(14175, 14399, 16608, 16821, 17718, 17775, 28551, 30200,
+ 30245, 31837, 32342, 32667) },
+ { AOM_CDF13(12885, 13038, 14978, 15590, 15673, 15748, 16176, 29128,
+ 29267, 30643, 31961, 32461) },
+ { AOM_CDF13(12026, 13661, 13874, 15305, 15490, 15726, 15995, 16273,
+ 28443, 30388, 30767, 32416) },
+ { AOM_CDF13(19052, 19840, 20579, 20916, 21150, 21467, 21885, 22719,
+ 23174, 28861, 30379, 32175) },
+ { AOM_CDF13(18627, 19649, 20974, 21219, 21492, 21816, 22199, 23119,
+ 23527, 27053, 31397, 32148) },
+ { AOM_CDF13(17026, 19004, 19997, 20339, 20586, 21103, 21349, 21907,
+ 22482, 25896, 26541, 31819) },
+ { AOM_CDF13(12124, 13759, 14959, 14992, 15007, 15051, 15078, 15166,
+ 15255, 15753, 16039, 16606) } },
+ { { AOM_CDF14(10407, 11208, 12900, 13181, 13823, 14175, 14899, 15656,
+ 15986, 20086, 20995, 22455, 24212) },
+ { AOM_CDF14(4532, 19780, 20057, 20215, 20428, 21071, 21199, 21451,
+ 22099, 24228, 24693, 27032, 29472) },
+ { AOM_CDF14(5273, 5379, 20177, 20270, 20385, 20439, 20949, 21695, 21774,
+ 23138, 24256, 24703, 26679) },
+ { AOM_CDF14(6740, 7167, 7662, 14152, 14536, 14785, 15034, 16741, 18371,
+ 21520, 22206, 23389, 24182) },
+ { AOM_CDF14(4987, 5368, 5928, 6068, 19114, 20315, 21857, 22253, 22411,
+ 24911, 25380, 26027, 26376) },
+ { AOM_CDF14(5370, 6889, 7247, 7393, 9498, 21114, 21402, 21753, 21981,
+ 24780, 25386, 26517, 27176) },
+ { AOM_CDF14(4816, 4961, 7204, 7326, 8765, 8930, 20169, 20682, 20803,
+ 23188, 23763, 24455, 24940) },
+ { AOM_CDF14(6608, 6740, 8529, 9049, 9257, 9356, 9735, 18827, 19059,
+ 22336, 23204, 23964, 24793) },
+ { AOM_CDF14(5998, 7419, 7781, 8933, 9255, 9549, 9753, 10417, 18898,
+ 22494, 23139, 24764, 25989) },
+ { AOM_CDF14(10660, 11298, 12550, 12957, 13322, 13624, 14040, 15004,
+ 15534, 20714, 21789, 23443, 24861) },
+ { AOM_CDF14(10522, 11530, 12552, 12963, 13378, 13779, 14245, 15235,
+ 15902, 20102, 22696, 23774, 25838) },
+ { AOM_CDF14(10099, 10691, 12639, 13049, 13386, 13665, 14125, 15163,
+ 15636, 19676, 20474, 23519, 25208) },
+ { AOM_CDF14(3144, 5087, 7382, 7504, 7593, 7690, 7801, 8064, 8232, 9248,
+ 9875, 10521, 29048) } }
+ };
+
+static const aom_cdf_prob default_partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(
+ EXT_PARTITION_TYPES)] = {
+ { AOM_CDF4(19132, 25510, 30392) },
+ { AOM_CDF4(13928, 19855, 28540) },
+ { AOM_CDF4(12522, 23679, 28629) },
+ { AOM_CDF4(9896, 18783, 25853) },
+ { AOM_CDF10(15597, 20929, 24571, 26706, 27664, 28821, 29601, 30571, 31902) },
+ { AOM_CDF10(7925, 11043, 16785, 22470, 23971, 25043, 26651, 28701, 29834) },
+ { AOM_CDF10(5414, 13269, 15111, 20488, 22360, 24500, 25537, 26336, 32117) },
+ { AOM_CDF10(2662, 6362, 8614, 20860, 23053, 24778, 26436, 27829, 31171) },
+ { AOM_CDF10(18462, 20920, 23124, 27647, 28227, 29049, 29519, 30178, 31544) },
+ { AOM_CDF10(7689, 9060, 12056, 24992, 25660, 26182, 26951, 28041, 29052) },
+ { AOM_CDF10(6015, 9009, 10062, 24544, 25409, 26545, 27071, 27526, 32047) },
+ { AOM_CDF10(1394, 2208, 2796, 28614, 29061, 29466, 29840, 30185, 31899) },
+ { AOM_CDF10(20137, 21547, 23078, 29566, 29837, 30261, 30524, 30892, 31724) },
+ { AOM_CDF10(6732, 7490, 9497, 27944, 28250, 28515, 28969, 29630, 30104) },
+ { AOM_CDF10(5945, 7663, 8348, 28683, 29117, 29749, 30064, 30298, 32238) },
+ { AOM_CDF10(870, 1212, 1487, 31198, 31394, 31574, 31743, 31881, 32332) },
+ { AOM_CDF8(27899, 28219, 28529, 32484, 32539, 32619, 32639) },
+ { AOM_CDF8(6607, 6990, 8268, 32060, 32219, 32338, 32371) },
+ { AOM_CDF8(5429, 6676, 7122, 32027, 32227, 32531, 32582) },
+ { AOM_CDF8(711, 966, 1172, 32448, 32538, 32617, 32664) },
+};
+
+// Default CDFs for the intra extended transform type; init_mode_probs()
+// copies this table into fc->intra_ext_tx_cdf.
+// Indexed as [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]. In this
+// initializer the first set is all-zero, the second set uses AOM_CDF7 rows and
+// the third set uses AOM_CDF5 rows. Several size groups consist solely of
+// evenly spaced rows (4681, 9362, ... or 6554, 13107, ...).
+// NOTE(review): the evenly spaced rows look like uniform placeholder
+// distributions rather than trained statistics -- confirm.
+static const aom_cdf_prob default_intra_ext_tx_cdf
+ [EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES][CDF_SIZE(TX_TYPES)] = {
+ {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ },
+ {
+ {
+ { AOM_CDF7(1535, 8035, 9461, 12751, 23467, 27825) },
+ { AOM_CDF7(564, 3335, 9709, 10870, 18143, 28094) },
+ { AOM_CDF7(672, 3247, 3676, 11982, 19415, 23127) },
+ { AOM_CDF7(5279, 13885, 15487, 18044, 23527, 30252) },
+ { AOM_CDF7(4423, 6074, 7985, 10416, 25693, 29298) },
+ { AOM_CDF7(1486, 4241, 9460, 10662, 16456, 27694) },
+ { AOM_CDF7(439, 2838, 3522, 6737, 18058, 23754) },
+ { AOM_CDF7(1190, 4233, 4855, 11670, 20281, 24377) },
+ { AOM_CDF7(1045, 4312, 8647, 10159, 18644, 29335) },
+ { AOM_CDF7(202, 3734, 4747, 7298, 17127, 24016) },
+ { AOM_CDF7(447, 4312, 6819, 8884, 16010, 23858) },
+ { AOM_CDF7(277, 4369, 5255, 8905, 16465, 22271) },
+ { AOM_CDF7(3409, 5436, 10599, 15599, 19687, 24040) },
+ },
+ {
+ { AOM_CDF7(1870, 13742, 14530, 16498, 23770, 27698) },
+ { AOM_CDF7(326, 8796, 14632, 15079, 19272, 27486) },
+ { AOM_CDF7(484, 7576, 7712, 14443, 19159, 22591) },
+ { AOM_CDF7(1126, 15340, 15895, 17023, 20896, 30279) },
+ { AOM_CDF7(655, 4854, 5249, 5913, 22099, 27138) },
+ { AOM_CDF7(1299, 6458, 8885, 9290, 14851, 25497) },
+ { AOM_CDF7(311, 5295, 5552, 6885, 16107, 22672) },
+ { AOM_CDF7(883, 8059, 8270, 11258, 17289, 21549) },
+ { AOM_CDF7(741, 7580, 9318, 10345, 16688, 29046) },
+ { AOM_CDF7(110, 7406, 7915, 9195, 16041, 23329) },
+ { AOM_CDF7(363, 7974, 9357, 10673, 15629, 24474) },
+ { AOM_CDF7(153, 7647, 8112, 9936, 15307, 19996) },
+ { AOM_CDF7(3511, 6332, 11165, 15335, 19323, 23594) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ {
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ { AOM_CDF7(4681, 9362, 14043, 18725, 23406, 28087) },
+ },
+ },
+ {
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ {
+ { AOM_CDF5(1127, 12814, 22772, 27483) },
+ { AOM_CDF5(145, 6761, 11980, 26667) },
+ { AOM_CDF5(362, 5887, 11678, 16725) },
+ { AOM_CDF5(385, 15213, 18587, 30693) },
+ { AOM_CDF5(25, 2914, 23134, 27903) },
+ { AOM_CDF5(60, 4470, 11749, 23991) },
+ { AOM_CDF5(37, 3332, 14511, 21448) },
+ { AOM_CDF5(157, 6320, 13036, 17439) },
+ { AOM_CDF5(119, 6719, 12906, 29396) },
+ { AOM_CDF5(47, 5537, 12576, 21499) },
+ { AOM_CDF5(269, 6076, 11258, 23115) },
+ { AOM_CDF5(83, 5615, 12001, 17228) },
+ { AOM_CDF5(1968, 5556, 12023, 18547) },
+ },
+ {
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ { AOM_CDF5(6554, 13107, 19661, 26214) },
+ },
+ },
+ };
+
+// Default CDFs for the inter extended transform type; init_mode_probs()
+// copies this table into fc->inter_ext_tx_cdf.
+// Indexed as [EXT_TX_SETS_INTER][EXT_TX_SIZES]. In this initializer the first
+// set is all-zero and the remaining sets use AOM_CDF16, AOM_CDF12 and AOM_CDF2
+// rows respectively.
+static const aom_cdf_prob
+ default_inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES][CDF_SIZE(
+ TX_TYPES)] = {
+ {
+ { 0 },
+ { 0 },
+ { 0 },
+ { 0 },
+ },
+ {
+ { AOM_CDF16(4458, 5560, 7695, 9709, 13330, 14789, 17537, 20266, 21504,
+ 22848, 23934, 25474, 27727, 28915, 30631) },
+ { AOM_CDF16(1645, 2573, 4778, 5711, 7807, 8622, 10522, 15357, 17674,
+ 20408, 22517, 25010, 27116, 28856, 30749) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ },
+ {
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ { AOM_CDF12(770, 2421, 5225, 12907, 15819, 18927, 21561, 24089, 26595,
+ 28526, 30529) },
+ { AOM_CDF12(2731, 5461, 8192, 10923, 13653, 16384, 19115, 21845,
+ 24576, 27307, 30037) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(4167) },
+ { AOM_CDF2(1998) },
+ { AOM_CDF2(748) },
+ },
+ };
+
+// Default CDF over the CFL_JOINT_SIGNS joint sign symbols; init_mode_probs()
+// copies it into fc->cfl_sign_cdf.
+static const aom_cdf_prob default_cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)] = {
+ AOM_CDF8(1418, 2123, 13340, 18405, 26972, 28343, 32294)
+};
+
+// Default CFL alpha CDFs, one AOM_CDF16 row per context (CFL_ALPHA_CONTEXTS);
+// init_mode_probs() copies this table into fc->cfl_alpha_cdf.
+static const aom_cdf_prob
+ default_cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)] = {
+ { AOM_CDF16(7637, 20719, 31401, 32481, 32657, 32688, 32692, 32696, 32700,
+ 32704, 32708, 32712, 32716, 32720, 32724) },
+ { AOM_CDF16(14365, 23603, 28135, 31168, 32167, 32395, 32487, 32573, 32620,
+ 32647, 32668, 32672, 32676, 32680, 32684) },
+ { AOM_CDF16(11532, 22380, 28445, 31360, 32349, 32523, 32584, 32649, 32673,
+ 32677, 32681, 32685, 32689, 32693, 32697) },
+ { AOM_CDF16(26990, 31402, 32282, 32571, 32692, 32696, 32700, 32704, 32708,
+ 32712, 32716, 32720, 32724, 32728, 32732) },
+ { AOM_CDF16(17248, 26058, 28904, 30608, 31305, 31877, 32126, 32321, 32394,
+ 32464, 32516, 32560, 32576, 32593, 32622) },
+ { AOM_CDF16(14738, 21678, 25779, 27901, 29024, 30302, 30980, 31843, 32144,
+ 32413, 32520, 32594, 32622, 32656, 32660) }
+ };
+
+// Default switchable interpolation filter CDFs, one AOM_CDF3 row per context
+// (SWITCHABLE_FILTER_CONTEXTS); init_mode_probs() copies this table into
+// fc->switchable_interp_cdf.
+static const aom_cdf_prob
+ default_switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS][CDF_SIZE(
+ SWITCHABLE_FILTERS)] = {
+ { AOM_CDF3(31935, 32720) }, { AOM_CDF3(5568, 32719) },
+ { AOM_CDF3(422, 2938) }, { AOM_CDF3(28244, 32608) },
+ { AOM_CDF3(31206, 31953) }, { AOM_CDF3(4862, 32121) },
+ { AOM_CDF3(770, 1152) }, { AOM_CDF3(20889, 25637) },
+ { AOM_CDF3(31910, 32724) }, { AOM_CDF3(4120, 32712) },
+ { AOM_CDF3(305, 2247) }, { AOM_CDF3(27403, 32636) },
+ { AOM_CDF3(31022, 32009) }, { AOM_CDF3(2963, 32093) },
+ { AOM_CDF3(601, 943) }, { AOM_CDF3(14969, 21398) }
+ };
+
+// Default binary CDFs for the NEWMV decision, one per context
+// (NEWMV_MODE_CONTEXTS); init_mode_probs() copies them into fc->newmv_cdf.
+static const aom_cdf_prob default_newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)] =
+ { { AOM_CDF2(24035) }, { AOM_CDF2(16630) }, { AOM_CDF2(15339) },
+ { AOM_CDF2(8386) }, { AOM_CDF2(12222) }, { AOM_CDF2(4676) } };
+
+// Default binary CDFs, one per context (GLOBALMV_MODE_CONTEXTS);
+// init_mode_probs() copies them into fc->zeromv_cdf.
+static const aom_cdf_prob default_zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(2175) }, { AOM_CDF2(1054) } };
+
+// Default binary CDFs, one per context (REFMV_MODE_CONTEXTS);
+// init_mode_probs() copies them into fc->refmv_cdf.
+static const aom_cdf_prob default_refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)] =
+ { { AOM_CDF2(23974) }, { AOM_CDF2(24188) }, { AOM_CDF2(17848) },
+ { AOM_CDF2(28622) }, { AOM_CDF2(24312) }, { AOM_CDF2(19923) } };
+
+// Default binary CDFs, one per context (DRL_MODE_CONTEXTS); init_mode_probs()
+// copies them into fc->drl_cdf.
+static const aom_cdf_prob default_drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(13104) }, { AOM_CDF2(24560) }, { AOM_CDF2(18945) }
+};
+
+// Default CDFs over the inter compound modes, one AOM_CDF8 row per context
+// (INTER_MODE_CONTEXTS); init_mode_probs() copies this table into
+// fc->inter_compound_mode_cdf.
+static const aom_cdf_prob
+ default_inter_compound_mode_cdf[INTER_MODE_CONTEXTS][CDF_SIZE(
+ INTER_COMPOUND_MODES)] = {
+ { AOM_CDF8(7760, 13823, 15808, 17641, 19156, 20666, 26891) },
+ { AOM_CDF8(10730, 19452, 21145, 22749, 24039, 25131, 28724) },
+ { AOM_CDF8(10664, 20221, 21588, 22906, 24295, 25387, 28436) },
+ { AOM_CDF8(13298, 16984, 20471, 24182, 25067, 25736, 26422) },
+ { AOM_CDF8(18904, 23325, 25242, 27432, 27898, 28258, 30758) },
+ { AOM_CDF8(10725, 17454, 20124, 22820, 24195, 25168, 26046) },
+ { AOM_CDF8(17125, 24273, 25814, 27492, 28214, 28704, 30592) },
+ { AOM_CDF8(13046, 23214, 24505, 25942, 27435, 28442, 29330) }
+ };
+
+// Default binary CDFs for the interintra flag, one per block-size group
+// (BLOCK_SIZE_GROUPS); init_mode_probs() copies them into fc->interintra_cdf.
+static const aom_cdf_prob default_interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(16384) },
+ { AOM_CDF2(26887) },
+ { AOM_CDF2(27597) },
+ { AOM_CDF2(30237) } };
+
+// Default CDFs over the interintra modes, one AOM_CDF4 row per block-size
+// group (BLOCK_SIZE_GROUPS); init_mode_probs() copies this table into
+// fc->interintra_mode_cdf.
+static const aom_cdf_prob
+ default_interintra_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTERINTRA_MODES)] =
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(1875, 11082, 27332) },
+ { AOM_CDF4(2473, 9996, 26388) },
+ { AOM_CDF4(4238, 11537, 25926) } };
+
+// Default binary CDFs for the wedge interintra flag, one entry per block size
+// (BLOCK_SIZES_ALL); init_mode_probs() copies them into
+// fc->wedge_interintra_cdf. Only entries 3..9 differ from 16384 here.
+static const aom_cdf_prob
+ default_wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(20036) }, { AOM_CDF2(24957) }, { AOM_CDF2(26704) },
+ { AOM_CDF2(27530) }, { AOM_CDF2(29564) }, { AOM_CDF2(29444) },
+ { AOM_CDF2(26872) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+ };
+
+// Default compound type CDFs, one AOM_CDF2 entry per block size
+// (BLOCK_SIZES_ALL); init_mode_probs() copies them into fc->compound_type_cdf.
+// Only entries 3..9, 18 and 19 differ from 16384 here.
+static const aom_cdf_prob
+ default_compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23431) }, { AOM_CDF2(13171) }, { AOM_CDF2(11470) },
+ { AOM_CDF2(9770) }, { AOM_CDF2(9100) }, { AOM_CDF2(8233) },
+ { AOM_CDF2(6172) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(11820) }, { AOM_CDF2(7701) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }
+ };
+
+// Default CDFs over the 16 wedge indices, one AOM_CDF16 row per block size
+// (BLOCK_SIZES_ALL); init_mode_probs() copies this table into
+// fc->wedge_idx_cdf. Rows 3..9, 18 and 19 carry distinct values; every other
+// row is the evenly spaced 2048, 4096, ... sequence.
+static const aom_cdf_prob default_wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)] =
+ { { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2438, 4440, 6599, 8663, 11005, 12874, 15751, 18094, 20359,
+ 22362, 24127, 25702, 27752, 29450, 31171) },
+ { AOM_CDF16(806, 3266, 6005, 6738, 7218, 7367, 7771, 14588, 16323, 17367,
+ 18452, 19422, 22839, 26127, 29629) },
+ { AOM_CDF16(2779, 3738, 4683, 7213, 7775, 8017, 8655, 14357, 17939, 21332,
+ 24520, 27470, 29456, 30529, 31656) },
+ { AOM_CDF16(1684, 3625, 5675, 7108, 9302, 11274, 14429, 17144, 19163,
+ 20961, 22884, 24471, 26719, 28714, 30877) },
+ { AOM_CDF16(1142, 3491, 6277, 7314, 8089, 8355, 9023, 13624, 15369, 16730,
+ 18114, 19313, 22521, 26012, 29550) },
+ { AOM_CDF16(2742, 4195, 5727, 8035, 8980, 9336, 10146, 14124, 17270,
+ 20533, 23434, 25972, 27944, 29570, 31416) },
+ { AOM_CDF16(1727, 3948, 6101, 7796, 9841, 12344, 15766, 18944, 20638,
+ 22038, 23963, 25311, 26988, 28766, 31012) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(154, 987, 1925, 2051, 2088, 2111, 2151, 23033, 23703, 24284,
+ 24985, 25684, 27259, 28883, 30911) },
+ { AOM_CDF16(1135, 1322, 1493, 2635, 2696, 2737, 2770, 21016, 22935, 25057,
+ 27251, 29173, 30089, 30960, 31933) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) },
+ { AOM_CDF16(2048, 4096, 6144, 8192, 10240, 12288, 14336, 16384, 18432,
+ 20480, 22528, 24576, 26624, 28672, 30720) } };
+
+// Default CDFs over the motion modes, one AOM_CDF3 row per block size
+// (BLOCK_SIZES_ALL); init_mode_probs() copies this table into
+// fc->motion_mode_cdf.
+static const aom_cdf_prob default_motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(
+ MOTION_MODES)] = { { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(7651, 24760) },
+ { AOM_CDF3(4738, 24765) }, { AOM_CDF3(5391, 25528) },
+ { AOM_CDF3(19419, 26810) }, { AOM_CDF3(5123, 23606) },
+ { AOM_CDF3(11606, 24308) }, { AOM_CDF3(26260, 29116) },
+ { AOM_CDF3(20360, 28062) }, { AOM_CDF3(21679, 26830) },
+ { AOM_CDF3(29516, 30701) }, { AOM_CDF3(28898, 30397) },
+ { AOM_CDF3(30878, 31335) }, { AOM_CDF3(32507, 32558) },
+ { AOM_CDF3(10923, 21845) }, { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(28799, 31390) }, { AOM_CDF3(26431, 30774) },
+ { AOM_CDF3(28973, 31594) }, { AOM_CDF3(29742, 31203) } };
+
+// Default binary OBMC CDFs, one entry per block size (BLOCK_SIZES_ALL);
+// init_mode_probs() copies them into fc->obmc_cdf.
+static const aom_cdf_prob default_obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)] = {
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(10437) }, { AOM_CDF2(9371) }, { AOM_CDF2(9301) },
+ { AOM_CDF2(17432) }, { AOM_CDF2(14423) }, { AOM_CDF2(15142) },
+ { AOM_CDF2(25817) }, { AOM_CDF2(22823) }, { AOM_CDF2(22083) },
+ { AOM_CDF2(30128) }, { AOM_CDF2(31014) }, { AOM_CDF2(31560) },
+ { AOM_CDF2(32638) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(23664) }, { AOM_CDF2(20901) }, { AOM_CDF2(24008) },
+ { AOM_CDF2(26879) }
+};
+
+// Default binary intra/inter CDFs, one per context (INTRA_INTER_CONTEXTS);
+// init_mode_probs() copies them into fc->intra_inter_cdf.
+static const aom_cdf_prob default_intra_inter_cdf[INTRA_INTER_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(806) },
+ { AOM_CDF2(16662) },
+ { AOM_CDF2(20186) },
+ { AOM_CDF2(26538) }
+ };
+
+// Default binary CDFs, one per context (COMP_INTER_CONTEXTS);
+// init_mode_probs() copies them into fc->comp_inter_cdf.
+static const aom_cdf_prob default_comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(26828) },
+ { AOM_CDF2(24035) },
+ { AOM_CDF2(12031) },
+ { AOM_CDF2(10640) },
+ { AOM_CDF2(2901) } };
+
+// Default binary CDFs, one per context (COMP_REF_TYPE_CONTEXTS);
+// init_mode_probs() copies them into fc->comp_ref_type_cdf.
+static const aom_cdf_prob default_comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { AOM_CDF2(1198) },
+ { AOM_CDF2(2070) },
+ { AOM_CDF2(9166) },
+ { AOM_CDF2(7499) },
+ { AOM_CDF2(22475) }
+ };
+
+// Default binary CDFs indexed as [UNI_COMP_REF_CONTEXTS]
+// [UNIDIR_COMP_REFS - 1]; init_mode_probs() copies this table into
+// fc->uni_comp_ref_cdf.
+static const aom_cdf_prob
+ default_uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS -
+ 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(5284) }, { AOM_CDF2(3865) }, { AOM_CDF2(3128) } },
+ { { AOM_CDF2(23152) }, { AOM_CDF2(14173) }, { AOM_CDF2(15270) } },
+ { { AOM_CDF2(31774) }, { AOM_CDF2(25120) }, { AOM_CDF2(26710) } }
+ };
+
+// Default binary CDFs indexed as [REF_CONTEXTS][SINGLE_REFS - 1];
+// init_mode_probs() copies this table into fc->single_ref_cdf.
+static const aom_cdf_prob default_single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1]
+ [CDF_SIZE(2)] = {
+ { { AOM_CDF2(4897) },
+ { AOM_CDF2(1555) },
+ { AOM_CDF2(4236) },
+ { AOM_CDF2(8650) },
+ { AOM_CDF2(904) },
+ { AOM_CDF2(1444) } },
+ { { AOM_CDF2(16973) },
+ { AOM_CDF2(16751) },
+ { AOM_CDF2(19647) },
+ { AOM_CDF2(24773) },
+ { AOM_CDF2(11014) },
+ { AOM_CDF2(15087) } },
+ { { AOM_CDF2(29744) },
+ { AOM_CDF2(30279) },
+ { AOM_CDF2(31194) },
+ { AOM_CDF2(31895) },
+ { AOM_CDF2(26875) },
+ { AOM_CDF2(30304) } }
+ };
+
+// Default binary CDFs indexed as [REF_CONTEXTS][FWD_REFS - 1];
+// init_mode_probs() copies this table into fc->comp_ref_cdf.
+static const aom_cdf_prob
+ default_comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(4946) }, { AOM_CDF2(9468) }, { AOM_CDF2(1503) } },
+ { { AOM_CDF2(19891) }, { AOM_CDF2(22441) }, { AOM_CDF2(15160) } },
+ { { AOM_CDF2(30731) }, { AOM_CDF2(31059) }, { AOM_CDF2(27544) } }
+ };
+
+// Default binary CDFs indexed as [REF_CONTEXTS][BWD_REFS - 1];
+// init_mode_probs() copies this table into fc->comp_bwdref_cdf.
+static const aom_cdf_prob
+ default_comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)] = {
+ { { AOM_CDF2(2235) }, { AOM_CDF2(1423) } },
+ { { AOM_CDF2(17182) }, { AOM_CDF2(15175) } },
+ { { AOM_CDF2(30606) }, { AOM_CDF2(30489) } }
+ };
+
+// Default CDFs over the luma palette size, one AOM_CDF7 row per block-size
+// context (PALATTE_BSIZE_CTXS); init_mode_probs() copies this table into
+// fc->palette_y_size_cdf.
+static const aom_cdf_prob
+ default_palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(7952, 13000, 18149, 21478, 25527, 29241) },
+ { AOM_CDF7(7139, 11421, 16195, 19544, 23666, 28073) },
+ { AOM_CDF7(7788, 12741, 17325, 20500, 24315, 28530) },
+ { AOM_CDF7(8271, 14064, 18246, 21564, 25071, 28533) },
+ { AOM_CDF7(12725, 19180, 21863, 24839, 27535, 30120) },
+ { AOM_CDF7(9711, 14888, 16923, 21052, 25661, 27875) },
+ { AOM_CDF7(14940, 20797, 21678, 24186, 27033, 28999) }
+ };
+
+// Default CDFs over the chroma palette size, one AOM_CDF7 row per block-size
+// context (PALATTE_BSIZE_CTXS); init_mode_probs() copies this table into
+// fc->palette_uv_size_cdf.
+static const aom_cdf_prob
+ default_palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)] = {
+ { AOM_CDF7(8713, 19979, 27128, 29609, 31331, 32272) },
+ { AOM_CDF7(5839, 15573, 23581, 26947, 29848, 31700) },
+ { AOM_CDF7(4426, 11260, 17999, 21483, 25863, 29430) },
+ { AOM_CDF7(3228, 9464, 14993, 18089, 22523, 27420) },
+ { AOM_CDF7(3768, 8886, 13091, 17852, 22495, 27207) },
+ { AOM_CDF7(2464, 8451, 12861, 21632, 25525, 28555) },
+ { AOM_CDF7(1269, 5435, 10433, 18963, 21700, 25865) }
+ };
+
+// Default binary CDFs for the luma palette mode flag, indexed as
+// [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]; init_mode_probs() copies this
+// table into fc->palette_y_mode_cdf.
+static const aom_cdf_prob default_palette_y_mode_cdf
+ [PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { { AOM_CDF2(31676) }, { AOM_CDF2(3419) }, { AOM_CDF2(1261) } },
+ { { AOM_CDF2(31912) }, { AOM_CDF2(2859) }, { AOM_CDF2(980) } },
+ { { AOM_CDF2(31823) }, { AOM_CDF2(3400) }, { AOM_CDF2(781) } },
+ { { AOM_CDF2(32030) }, { AOM_CDF2(3561) }, { AOM_CDF2(904) } },
+ { { AOM_CDF2(32309) }, { AOM_CDF2(7337) }, { AOM_CDF2(1462) } },
+ { { AOM_CDF2(32265) }, { AOM_CDF2(4015) }, { AOM_CDF2(1521) } },
+ { { AOM_CDF2(32450) }, { AOM_CDF2(7946) }, { AOM_CDF2(129) } }
+ };
+
+// Default binary CDFs for the chroma palette mode flag, one per context
+// (PALETTE_UV_MODE_CONTEXTS); init_mode_probs() copies them into
+// fc->palette_uv_mode_cdf.
+static const aom_cdf_prob
+ default_palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(32461) }, { AOM_CDF2(21488) }
+ };
+
+// Default CDFs for luma palette color indices, indexed as
+// [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]; init_mode_probs() copies this
+// table into fc->palette_y_color_index_cdf.
+// Successive outer groups use AOM_CDF2 through AOM_CDF8 rows, so the row
+// width grows by one with each group while the storage per row stays
+// CDF_SIZE(PALETTE_COLORS).
+static const aom_cdf_prob default_palette_y_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(28710) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10553) },
+ { AOM_CDF2(27036) },
+ { AOM_CDF2(31603) },
+ },
+ {
+ { AOM_CDF3(27877, 30490) },
+ { AOM_CDF3(11532, 25697) },
+ { AOM_CDF3(6544, 30234) },
+ { AOM_CDF3(23018, 28072) },
+ { AOM_CDF3(31915, 32385) },
+ },
+ {
+ { AOM_CDF4(25572, 28046, 30045) },
+ { AOM_CDF4(9478, 21590, 27256) },
+ { AOM_CDF4(7248, 26837, 29824) },
+ { AOM_CDF4(19167, 24486, 28349) },
+ { AOM_CDF4(31400, 31825, 32250) },
+ },
+ {
+ { AOM_CDF5(24779, 26955, 28576, 30282) },
+ { AOM_CDF5(8669, 20364, 24073, 28093) },
+ { AOM_CDF5(4255, 27565, 29377, 31067) },
+ { AOM_CDF5(19864, 23674, 26716, 29530) },
+ { AOM_CDF5(31646, 31893, 32147, 32426) },
+ },
+ {
+ { AOM_CDF6(23132, 25407, 26970, 28435, 30073) },
+ { AOM_CDF6(7443, 17242, 20717, 24762, 27982) },
+ { AOM_CDF6(6300, 24862, 26944, 28784, 30671) },
+ { AOM_CDF6(18916, 22895, 25267, 27435, 29652) },
+ { AOM_CDF6(31270, 31550, 31808, 32059, 32353) },
+ },
+ {
+ { AOM_CDF7(23105, 25199, 26464, 27684, 28931, 30318) },
+ { AOM_CDF7(6950, 15447, 18952, 22681, 25567, 28563) },
+ { AOM_CDF7(7560, 23474, 25490, 27203, 28921, 30708) },
+ { AOM_CDF7(18544, 22373, 24457, 26195, 28119, 30045) },
+ { AOM_CDF7(31198, 31451, 31670, 31882, 32123, 32391) },
+ },
+ {
+ { AOM_CDF8(21689, 23883, 25163, 26352, 27506, 28827, 30195) },
+ { AOM_CDF8(6892, 15385, 17840, 21606, 24287, 26753, 29204) },
+ { AOM_CDF8(5651, 23182, 25042, 26518, 27982, 29392, 30900) },
+ { AOM_CDF8(19349, 22578, 24418, 25994, 27524, 29031, 30448) },
+ { AOM_CDF8(31028, 31270, 31504, 31705, 31927, 32153, 32392) },
+ },
+ };
+
+// Default CDFs for chroma palette color indices, indexed as
+// [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]; init_mode_probs() copies this
+// table into fc->palette_uv_color_index_cdf.
+// Successive outer groups use AOM_CDF2 through AOM_CDF8 rows, so the row
+// width grows by one with each group while the storage per row stays
+// CDF_SIZE(PALETTE_COLORS).
+static const aom_cdf_prob default_palette_uv_color_index_cdf
+ [PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS][CDF_SIZE(PALETTE_COLORS)] = {
+ {
+ { AOM_CDF2(29089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(8713) },
+ { AOM_CDF2(29257) },
+ { AOM_CDF2(31610) },
+ },
+ {
+ { AOM_CDF3(25257, 29145) },
+ { AOM_CDF3(12287, 27293) },
+ { AOM_CDF3(7033, 27960) },
+ { AOM_CDF3(20145, 25405) },
+ { AOM_CDF3(30608, 31639) },
+ },
+ {
+ { AOM_CDF4(24210, 27175, 29903) },
+ { AOM_CDF4(9888, 22386, 27214) },
+ { AOM_CDF4(5901, 26053, 29293) },
+ { AOM_CDF4(18318, 22152, 28333) },
+ { AOM_CDF4(30459, 31136, 31926) },
+ },
+ {
+ { AOM_CDF5(22980, 25479, 27781, 29986) },
+ { AOM_CDF5(8413, 21408, 24859, 28874) },
+ { AOM_CDF5(2257, 29449, 30594, 31598) },
+ { AOM_CDF5(19189, 21202, 25915, 28620) },
+ { AOM_CDF5(31844, 32044, 32281, 32518) },
+ },
+ {
+ { AOM_CDF6(22217, 24567, 26637, 28683, 30548) },
+ { AOM_CDF6(7307, 16406, 19636, 24632, 28424) },
+ { AOM_CDF6(4441, 25064, 26879, 28942, 30919) },
+ { AOM_CDF6(17210, 20528, 23319, 26750, 29582) },
+ { AOM_CDF6(30674, 30953, 31396, 31735, 32207) },
+ },
+ {
+ { AOM_CDF7(21239, 23168, 25044, 26962, 28705, 30506) },
+ { AOM_CDF7(6545, 15012, 18004, 21817, 25503, 28701) },
+ { AOM_CDF7(3448, 26295, 27437, 28704, 30126, 31442) },
+ { AOM_CDF7(15889, 18323, 21704, 24698, 26976, 29690) },
+ { AOM_CDF7(30988, 31204, 31479, 31734, 31983, 32325) },
+ },
+ {
+ { AOM_CDF8(21442, 23288, 24758, 26246, 27649, 28980, 30563) },
+ { AOM_CDF8(5863, 14933, 17552, 20668, 23683, 26411, 29273) },
+ { AOM_CDF8(3415, 25810, 26877, 27990, 29223, 30394, 31618) },
+ { AOM_CDF8(17965, 20084, 22232, 23974, 26274, 28402, 30390) },
+ { AOM_CDF8(31190, 31329, 31516, 31679, 31825, 32026, 32322) },
+ },
+ };
+
+// Default binary CDFs for the transform partition flag, one per context
+// (TXFM_PARTITION_CONTEXTS); init_mode_probs() copies them into
+// fc->txfm_partition_cdf.
+static const aom_cdf_prob
+ default_txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(28581) }, { AOM_CDF2(23846) }, { AOM_CDF2(20847) },
+ { AOM_CDF2(24315) }, { AOM_CDF2(18196) }, { AOM_CDF2(12133) },
+ { AOM_CDF2(18791) }, { AOM_CDF2(10887) }, { AOM_CDF2(11005) },
+ { AOM_CDF2(27179) }, { AOM_CDF2(20004) }, { AOM_CDF2(11281) },
+ { AOM_CDF2(26549) }, { AOM_CDF2(19308) }, { AOM_CDF2(14224) },
+ { AOM_CDF2(28015) }, { AOM_CDF2(21546) }, { AOM_CDF2(14400) },
+ { AOM_CDF2(28165) }, { AOM_CDF2(22401) }, { AOM_CDF2(16088) }
+ };
+
+// Default binary skip CDFs, one per context (SKIP_CONTEXTS);
+// init_mode_probs() copies them into fc->skip_cdfs.
+static const aom_cdf_prob default_skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(31671) }, { AOM_CDF2(16515) }, { AOM_CDF2(4576) }
+};
+
+// Default binary skip-mode CDFs, one per context (SKIP_MODE_CONTEXTS);
+// init_mode_probs() copies them into fc->skip_mode_cdfs.
+static const aom_cdf_prob default_skip_mode_cdfs[SKIP_MODE_CONTEXTS][CDF_SIZE(
+ 2)] = { { AOM_CDF2(32621) }, { AOM_CDF2(20708) }, { AOM_CDF2(8127) } };
+
+// Default binary CDFs, one per context (COMP_INDEX_CONTEXTS);
+// init_mode_probs() copies them into fc->compound_index_cdf.
+static const aom_cdf_prob
+ default_compound_idx_cdfs[COMP_INDEX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(18244) }, { AOM_CDF2(12865) }, { AOM_CDF2(7053) },
+ { AOM_CDF2(13259) }, { AOM_CDF2(9334) }, { AOM_CDF2(4644) }
+ };
+
+// Default binary CDFs, one per context (COMP_GROUP_IDX_CONTEXTS);
+// init_mode_probs() copies them into fc->comp_group_idx_cdf.
+static const aom_cdf_prob
+ default_comp_group_idx_cdfs[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)] = {
+ { AOM_CDF2(26607) }, { AOM_CDF2(22891) }, { AOM_CDF2(18840) },
+ { AOM_CDF2(24594) }, { AOM_CDF2(19934) }, { AOM_CDF2(22674) }
+ };
+
+// Default binary CDF (single, context-free); init_mode_probs() copies it into
+// fc->intrabc_cdf.
+static const aom_cdf_prob default_intrabc_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 30531) };
+
+// Default CDF over the filter-intra modes (single, context-free);
+// init_mode_probs() copies it into fc->filter_intra_mode_cdf.
+static const aom_cdf_prob default_filter_intra_mode_cdf[CDF_SIZE(
+ FILTER_INTRA_MODES)] = { AOM_CDF5(8949, 12776, 17211, 29558) };
+
+// Default binary filter-intra CDFs, one entry per block size
+// (BLOCK_SIZES_ALL); init_mode_probs() copies them into fc->filter_intra_cdfs.
+static const aom_cdf_prob default_filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(
+ 2)] = { { AOM_CDF2(4621) }, { AOM_CDF2(6743) }, { AOM_CDF2(5893) },
+ { AOM_CDF2(7866) }, { AOM_CDF2(12551) }, { AOM_CDF2(9394) },
+ { AOM_CDF2(12408) }, { AOM_CDF2(14301) }, { AOM_CDF2(12756) },
+ { AOM_CDF2(22343) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(16384) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) }, { AOM_CDF2(12770) }, { AOM_CDF2(10368) },
+ { AOM_CDF2(20229) }, { AOM_CDF2(18101) }, { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } };
+
+// Default CDF over the switchable restoration types (single, context-free);
+// init_mode_probs() copies it into fc->switchable_restore_cdf.
+static const aom_cdf_prob default_switchable_restore_cdf[CDF_SIZE(
+ RESTORE_SWITCHABLE_TYPES)] = { AOM_CDF3(9413, 22581) };
+
+// Default binary CDF (single, context-free); init_mode_probs() copies it into
+// fc->wiener_restore_cdf.
+static const aom_cdf_prob default_wiener_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 11570) };
+
+// Default binary CDF (single, context-free); init_mode_probs() copies it into
+// fc->sgrproj_restore_cdf.
+static const aom_cdf_prob default_sgrproj_restore_cdf[CDF_SIZE(2)] = { AOM_CDF2(
+ 16855) };
+
+// Default delta-q CDF (DELTA_Q_PROBS + 1 symbols); init_mode_probs() copies it
+// into fc->delta_q_cdf. The values match the delta-lf defaults in this file.
+static const aom_cdf_prob default_delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+// Default delta-lf CDFs, one identical AOM_CDF4 row per FRAME_LF_COUNT entry;
+// init_mode_probs() copies this table into fc->delta_lf_multi_cdf.
+static const aom_cdf_prob default_delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(
+ DELTA_LF_PROBS + 1)] = { { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) },
+ { AOM_CDF4(28160, 32120, 32677) } };
+// Default delta-lf CDF (DELTA_LF_PROBS + 1 symbols); init_mode_probs() copies
+// it into fc->delta_lf_cdf.
+static const aom_cdf_prob default_delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)] = {
+ AOM_CDF4(28160, 32120, 32677)
+};
+
+// FIXME(someone) need real defaults here
+// Default segment-id CDF over MAX_SEGMENTS symbols; init_mode_probs() copies
+// it into fc->seg.tree_cdf. The values are evenly spaced in steps of 4096,
+// i.e. the placeholder the FIXME above refers to.
+static const aom_cdf_prob default_seg_tree_cdf[CDF_SIZE(MAX_SEGMENTS)] = {
+ AOM_CDF8(4096, 8192, 12288, 16384, 20480, 24576, 28672)
+};
+
+// Default binary CDFs for segment-id temporal prediction, one per context
+// (SEG_TEMPORAL_PRED_CTXS); init_mode_probs() copies them into
+// fc->seg.pred_cdf. Every entry is 128 * 128 (= 16384).
+static const aom_cdf_prob
+ default_segment_pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)] = {
+ { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }, { AOM_CDF2(128 * 128) }
+ };
+
+// Default segment-id CDFs used with spatial prediction, one AOM_CDF8 row per
+// SPATIAL_PREDICTION_PROBS entry. init_mode_probs() copies the rows one at a
+// time into fc->seg.spatial_pred_seg_cdf[i].
+static const aom_cdf_prob
+ default_spatial_pred_seg_tree_cdf[SPATIAL_PREDICTION_PROBS][CDF_SIZE(
+ MAX_SEGMENTS)] = {
+ {
+ AOM_CDF8(5622, 7893, 16093, 18233, 27809, 28373, 32533),
+ },
+ {
+ AOM_CDF8(14274, 18230, 22557, 24935, 29980, 30851, 32344),
+ },
+ {
+ AOM_CDF8(27527, 28487, 28723, 28890, 32397, 32647, 32679),
+ },
+ };
+
+// Default transform size CDFs, indexed as [MAX_TX_CATS][TX_SIZE_CONTEXTS];
+// init_mode_probs() copies this table into fc->tx_size_cdf. The first
+// category uses AOM_CDF2 rows; the remaining categories use AOM_CDF3 rows.
+static const aom_cdf_prob default_tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)] = {
+ { { AOM_CDF2(19968) },
+ { AOM_CDF2(19968) },
+ { AOM_CDF2(24320) } },
+ { { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(12272, 30172) },
+ { AOM_CDF3(18677, 30848) } },
+ { { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(12986, 15180) },
+ { AOM_CDF3(24302, 25602) } },
+ { { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(5782, 11475) },
+ { AOM_CDF3(16803, 22759) } },
+ };
+
+// Largest hash value av1_get_palette_color_index_context() can produce; the
+// lookup table below has one slot per hash value in [0, MAX_COLOR_CONTEXT_HASH].
+#define MAX_COLOR_CONTEXT_HASH 8
+// Maps a neighbor-score hash to a palette color index context.
+// Only hashes 2, 5, 6, 7 and 8 are valid; they map to contexts 0, 4, 3, 2 and
+// 1 respectively.
+// Negative values are invalid
+static const int palette_color_index_context_lookup[MAX_COLOR_CONTEXT_HASH +
+ 1] = { -1, -1, 0, -1, -1,
+ 4, 3, 2, 1 };
+
+#define NUM_PALETTE_NEIGHBORS 3 // left, top-left and top.
+// Computes the entropy-coding context for the palette color index at position
+// (r, c) of 'color_map', based on the indices of its left, top-left and top
+// neighbors.
+//
+// color_map    - palette index map, addressed as color_map[row * stride + col].
+// stride       - distance (in elements) between consecutive rows of color_map.
+// r, c         - row and column of the current pixel; at least one must be > 0
+//                so that a neighbor exists (asserted).
+// palette_size - number of colors in use; must be <= PALETTE_MAX_SIZE.
+// color_order  - output; all PALETTE_MAX_SIZE entries are written. The leading
+//                entries hold the highest-scoring neighbor colors (largest
+//                score first); the remaining colors keep their relative order.
+// color_idx    - optional output (may be NULL); receives the position of the
+//                current pixel's color within color_order.
+//
+// Returns a context in [0, PALETTE_COLOR_INDEX_CONTEXTS).
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx) {
+ assert(palette_size <= PALETTE_MAX_SIZE);
+ assert(r > 0 || c > 0);
+
+ // Get color indices of neighbors.
+ // A neighbor that falls outside the map (first row/column) is marked as -1.
+ int color_neighbors[NUM_PALETTE_NEIGHBORS];
+ color_neighbors[0] = (c - 1 >= 0) ? color_map[r * stride + c - 1] : -1;
+ color_neighbors[1] =
+ (c - 1 >= 0 && r - 1 >= 0) ? color_map[(r - 1) * stride + c - 1] : -1;
+ color_neighbors[2] = (r - 1 >= 0) ? color_map[(r - 1) * stride + c] : -1;
+
+ // The +10 below should not be needed. But we get a warning "array subscript
+ // is above array bounds [-Werror=array-bounds]" without it, possibly due to
+ // this (or similar) bug: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124
+ int scores[PALETTE_MAX_SIZE + 10] = { 0 };
+ int i;
+ // Per-neighbor weights: left and top count 2, top-left counts 1. A color
+ // shared by several neighbors accumulates their weights.
+ static const int weights[NUM_PALETTE_NEIGHBORS] = { 2, 1, 2 };
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ if (color_neighbors[i] >= 0) {
+ scores[color_neighbors[i]] += weights[i];
+ }
+ }
+
+ // Start from the identity ordering; inverse_color_order[color] tracks the
+ // current position of 'color' inside color_order.
+ int inverse_color_order[PALETTE_MAX_SIZE];
+ for (i = 0; i < PALETTE_MAX_SIZE; ++i) {
+ color_order[i] = i;
+ inverse_color_order[i] = i;
+ }
+
+ // Get the top NUM_PALETTE_NEIGHBORS scores (sorted from large to small).
+ // The strict '>' comparison picks the earliest maximum, and the shift below
+ // (rather than a swap) keeps the relative order of the displaced entries.
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ int max = scores[i];
+ int max_idx = i;
+ for (int j = i + 1; j < palette_size; ++j) {
+ if (scores[j] > max) {
+ max = scores[j];
+ max_idx = j;
+ }
+ }
+ if (max_idx != i) {
+ // Move the score at index 'max_idx' to index 'i', and shift the scores
+ // from 'i' to 'max_idx - 1' by 1.
+ const int max_score = scores[max_idx];
+ const uint8_t max_color_order = color_order[max_idx];
+ for (int k = max_idx; k > i; --k) {
+ scores[k] = scores[k - 1];
+ color_order[k] = color_order[k - 1];
+ inverse_color_order[color_order[k]] = k;
+ }
+ scores[i] = max_score;
+ color_order[i] = max_color_order;
+ inverse_color_order[color_order[i]] = i;
+ }
+ }
+
+ // Report where the current pixel's color ended up in the reordered list.
+ if (color_idx != NULL)
+ *color_idx = inverse_color_order[color_map[r * stride + c]];
+
+ // Get hash value of context.
+ // The hash combines the three largest scores as s0 * 1 + s1 * 2 + s2 * 2.
+ int color_index_ctx_hash = 0;
+ static const int hash_multipliers[NUM_PALETTE_NEIGHBORS] = { 1, 2, 2 };
+ for (i = 0; i < NUM_PALETTE_NEIGHBORS; ++i) {
+ color_index_ctx_hash += scores[i] * hash_multipliers[i];
+ }
+ assert(color_index_ctx_hash > 0);
+ assert(color_index_ctx_hash <= MAX_COLOR_CONTEXT_HASH);
+
+ // Lookup context from hash.
+ const int color_index_ctx =
+ palette_color_index_context_lookup[color_index_ctx_hash];
+ assert(color_index_ctx >= 0);
+ assert(color_index_ctx < PALETTE_COLOR_INDEX_CONTEXTS);
+ return color_index_ctx;
+}
+#undef NUM_PALETTE_NEIGHBORS
+#undef MAX_COLOR_CONTEXT_HASH
+
+// Resets the mode-related CDF members of 'fc' to their compile-time defaults
+// by copying each default_* table of this file into the matching field.
+// 'fc' must point to a valid FRAME_CONTEXT; every listed field is overwritten.
+static void init_mode_probs(FRAME_CONTEXT *fc) {
+ av1_copy(fc->palette_y_size_cdf, default_palette_y_size_cdf);
+ av1_copy(fc->palette_uv_size_cdf, default_palette_uv_size_cdf);
+ av1_copy(fc->palette_y_color_index_cdf, default_palette_y_color_index_cdf);
+ av1_copy(fc->palette_uv_color_index_cdf, default_palette_uv_color_index_cdf);
+ av1_copy(fc->kf_y_cdf, default_kf_y_mode_cdf);
+ av1_copy(fc->angle_delta_cdf, default_angle_delta_cdf);
+ av1_copy(fc->comp_inter_cdf, default_comp_inter_cdf);
+ av1_copy(fc->comp_ref_type_cdf, default_comp_ref_type_cdf);
+ av1_copy(fc->uni_comp_ref_cdf, default_uni_comp_ref_cdf);
+ av1_copy(fc->palette_y_mode_cdf, default_palette_y_mode_cdf);
+ av1_copy(fc->palette_uv_mode_cdf, default_palette_uv_mode_cdf);
+ av1_copy(fc->comp_ref_cdf, default_comp_ref_cdf);
+ av1_copy(fc->comp_bwdref_cdf, default_comp_bwdref_cdf);
+ av1_copy(fc->single_ref_cdf, default_single_ref_cdf);
+ av1_copy(fc->txfm_partition_cdf, default_txfm_partition_cdf);
+ av1_copy(fc->compound_index_cdf, default_compound_idx_cdfs);
+ av1_copy(fc->comp_group_idx_cdf, default_comp_group_idx_cdfs);
+ av1_copy(fc->newmv_cdf, default_newmv_cdf);
+ av1_copy(fc->zeromv_cdf, default_zeromv_cdf);
+ av1_copy(fc->refmv_cdf, default_refmv_cdf);
+ av1_copy(fc->drl_cdf, default_drl_cdf);
+ av1_copy(fc->motion_mode_cdf, default_motion_mode_cdf);
+ av1_copy(fc->obmc_cdf, default_obmc_cdf);
+ av1_copy(fc->inter_compound_mode_cdf, default_inter_compound_mode_cdf);
+ av1_copy(fc->compound_type_cdf, default_compound_type_cdf);
+ av1_copy(fc->wedge_idx_cdf, default_wedge_idx_cdf);
+ av1_copy(fc->interintra_cdf, default_interintra_cdf);
+ av1_copy(fc->wedge_interintra_cdf, default_wedge_interintra_cdf);
+ av1_copy(fc->interintra_mode_cdf, default_interintra_mode_cdf);
+ av1_copy(fc->seg.pred_cdf, default_segment_pred_cdf);
+ av1_copy(fc->seg.tree_cdf, default_seg_tree_cdf);
+ av1_copy(fc->filter_intra_cdfs, default_filter_intra_cdfs);
+ av1_copy(fc->filter_intra_mode_cdf, default_filter_intra_mode_cdf);
+ av1_copy(fc->switchable_restore_cdf, default_switchable_restore_cdf);
+ av1_copy(fc->wiener_restore_cdf, default_wiener_restore_cdf);
+ av1_copy(fc->sgrproj_restore_cdf, default_sgrproj_restore_cdf);
+ av1_copy(fc->y_mode_cdf, default_if_y_mode_cdf);
+ av1_copy(fc->uv_mode_cdf, default_uv_mode_cdf);
+ av1_copy(fc->switchable_interp_cdf, default_switchable_interp_cdf);
+ av1_copy(fc->partition_cdf, default_partition_cdf);
+ av1_copy(fc->intra_ext_tx_cdf, default_intra_ext_tx_cdf);
+ av1_copy(fc->inter_ext_tx_cdf, default_inter_ext_tx_cdf);
+ av1_copy(fc->skip_mode_cdfs, default_skip_mode_cdfs);
+ av1_copy(fc->skip_cdfs, default_skip_cdfs);
+ av1_copy(fc->intra_inter_cdf, default_intra_inter_cdf);
+ // The spatial-prediction segment CDFs are copied row by row rather than as
+ // one whole-table copy.
+ for (int i = 0; i < SPATIAL_PREDICTION_PROBS; i++)
+ av1_copy(fc->seg.spatial_pred_seg_cdf[i],
+ default_spatial_pred_seg_tree_cdf[i]);
+ av1_copy(fc->tx_size_cdf, default_tx_size_cdf);
+ av1_copy(fc->delta_q_cdf, default_delta_q_cdf);
+ av1_copy(fc->delta_lf_cdf, default_delta_lf_cdf);
+ av1_copy(fc->delta_lf_multi_cdf, default_delta_lf_multi_cdf);
+ av1_copy(fc->cfl_sign_cdf, default_cfl_sign_cdf);
+ av1_copy(fc->cfl_alpha_cdf, default_cfl_alpha_cdf);
+ av1_copy(fc->intrabc_cdf, default_intrabc_cdf);
+}
+
+// Writes the default per-reference-frame deltas into 'ref_deltas':
+// 1 for INTRA_FRAME, 0 for LAST/LAST2/LAST3/BWDREF and -1 for
+// GOLDEN/ALTREF2/ALTREF. 'ref_deltas' must be non-NULL and indexable by each
+// of those frame constants.
+void av1_set_default_ref_deltas(int8_t *ref_deltas) {
+  assert(ref_deltas != NULL);
+
+  // Delta shared by LAST_FRAME and the references that follow it.
+  const int8_t last_delta = 0;
+  // Delta shared by the golden and alt-ref references.
+  const int8_t golden_alt_delta = -1;
+
+  ref_deltas[INTRA_FRAME] = 1;
+
+  ref_deltas[LAST_FRAME] = last_delta;
+  ref_deltas[LAST2_FRAME] = last_delta;
+  ref_deltas[LAST3_FRAME] = last_delta;
+  ref_deltas[BWDREF_FRAME] = last_delta;
+
+  ref_deltas[GOLDEN_FRAME] = golden_alt_delta;
+  ref_deltas[ALTREF2_FRAME] = golden_alt_delta;
+  ref_deltas[ALTREF_FRAME] = golden_alt_delta;
+}
+
+// Resets both entries of 'mode_deltas' to zero. 'mode_deltas' must be
+// non-NULL and hold at least two elements.
+void av1_set_default_mode_deltas(int8_t *mode_deltas) {
+  assert(mode_deltas != NULL);
+
+  for (int idx = 0; idx < 2; ++idx) {
+    mode_deltas[idx] = 0;
+  }
+}
+
+// Puts the loop filter delta state of 'lf' back to defaults: loads the default
+// reference and mode deltas and flags the deltas as both enabled and updated.
+static void set_default_lf_deltas(struct loopfilter *lf) {
+  av1_set_default_ref_deltas(lf->ref_deltas);
+  av1_set_default_mode_deltas(lf->mode_deltas);
+
+  lf->mode_ref_delta_update = 1;
+  lf->mode_ref_delta_enabled = 1;
+}
+
+// Saves the current frame context (*cm->fc) as the stored default context.
+//
+// The copy goes into a dedicated slot (FRAME_CONTEXT_DEFAULTS) that is not
+// tied to any reference buffer, so that cm->pre_fc can be set up correctly
+// later. Call this ONLY once cm->fc holds default probabilities, i.e. after
+// av1_setup_past_independence or after initializing them manually.
+void av1_setup_frame_contexts(AV1_COMMON *cm) {
+  cm->frame_contexts[FRAME_CONTEXT_DEFAULTS] = *cm->fc;
+
+  if (!cm->large_scale_tile) return;
+
+  // In large-scale-tile mode every stored context is reset to the defaults.
+  for (int slot = 0; slot < FRAME_CONTEXTS; ++slot) {
+    cm->frame_contexts[slot] = *cm->fc;
+  }
+}
+
+// Resets all state in 'cm' that could carry information over from earlier
+// frames: segmentation features and map, loop filter deltas, and the entropy
+// coding contexts (which are reloaded with defaults and then stored via
+// av1_setup_frame_contexts).
+void av1_setup_past_independence(AV1_COMMON *cm) {
+ // Reset the segment feature data to the default stats:
+ // Features disabled, 0, with delta coding (Default state).
+ av1_clearall_segfeatures(&cm->seg);
+
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+
+ // The segmentation map may be absent; clear it only when present.
+ if (cm->current_frame_seg_map)
+ memset(cm->current_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols));
+
+ // reset mode ref deltas
+ av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+ av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+ set_default_lf_deltas(&cm->lf);
+
+ // Load default probabilities into cm->fc first; av1_setup_frame_contexts()
+ // below requires cm->fc to hold defaults when it is called.
+ av1_default_coef_probs(cm);
+ init_mode_probs(cm->fc);
+ av1_init_mv_probs(cm);
+ av1_init_lv_map(cm);
+ cm->fc->initialized = 1;
+ av1_setup_frame_contexts(cm);
+
+ // prev_mip will only be allocated in encoder.
+ if (frame_is_intra_only(cm) && cm->prev_mip)
+ memset(cm->prev_mip, 0,
+ cm->mi_stride * cm->mi_rows * sizeof(*cm->prev_mip));
+}
diff --git a/third_party/aom/av1/common/entropymode.h b/third_party/aom/av1/common/entropymode.h
new file mode 100644
index 0000000000..7047f34d2b
--- /dev/null
+++ b/third_party/aom/av1/common/entropymode.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMODE_H_
+#define AOM_AV1_COMMON_ENTROPYMODE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/filter.h"
+#include "av1/common/seg_common.h"
+#include "aom_dsp/aom_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLOCK_SIZE_GROUPS 4
+
+#define TX_SIZE_CONTEXTS 3
+
+#define INTER_OFFSET(mode) ((mode)-NEARESTMV)
+#define INTER_COMPOUND_OFFSET(mode) (uint8_t)((mode)-NEAREST_NEARESTMV)
+
+// Number of possible contexts for a color index.
+// As can be seen from av1_get_palette_color_index_context(), the possible
+// contexts are (2,0,0), (2,2,1), (3,2,0), (4,1,0), (5,0,0). These are mapped to
+// a value from 0 to 4 using 'palette_color_index_context_lookup' table.
+#define PALETTE_COLOR_INDEX_CONTEXTS 5
+
+// Palette Y mode context for a block is determined by number of neighboring
+// blocks (top and/or left) using a palette for Y plane. So, possible Y mode
+// context values are:
+// 0 if neither left nor top block uses palette for Y plane,
+// 1 if exactly one of left or top block uses palette for Y plane, and
+// 2 if both left and top blocks use palette for Y plane.
+#define PALETTE_Y_MODE_CONTEXTS 3
+
+// Palette UV mode context for a block is determined by whether this block uses
+// palette for the Y plane. So, possible values are:
+// 0 if this block doesn't use palette for Y plane.
+// 1 if this block uses palette for Y plane (i.e. Y palette size > 0).
+#define PALETTE_UV_MODE_CONTEXTS 2
+
+// Map the number of pixels in a block size to a context
+// 64(BLOCK_8X8, BLOCK_4x16, BLOCK_16X4) -> 0
+// 128(BLOCK_8X16, BLOCK_16x8) -> 1
+// ...
+// 4096(BLOCK_64X64) -> 6
+#define PALATTE_BSIZE_CTXS 7
+
+#define KF_MODE_CONTEXTS 5
+
+struct AV1Common;
+
+typedef struct {
+ const int16_t *scan;
+ const int16_t *iscan;
+ const int16_t *neighbors;
+} SCAN_ORDER;
+
+typedef struct frame_contexts {
+ aom_cdf_prob txb_skip_cdf[TX_SIZES][TXB_SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_extra_cdf[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob dc_sign_cdf[PLANE_TYPES][DC_SIGN_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob eob_flag_cdf16[PLANE_TYPES][2][CDF_SIZE(5)];
+ aom_cdf_prob eob_flag_cdf32[PLANE_TYPES][2][CDF_SIZE(6)];
+ aom_cdf_prob eob_flag_cdf64[PLANE_TYPES][2][CDF_SIZE(7)];
+ aom_cdf_prob eob_flag_cdf128[PLANE_TYPES][2][CDF_SIZE(8)];
+ aom_cdf_prob eob_flag_cdf256[PLANE_TYPES][2][CDF_SIZE(9)];
+ aom_cdf_prob eob_flag_cdf512[PLANE_TYPES][2][CDF_SIZE(10)];
+ aom_cdf_prob eob_flag_cdf1024[PLANE_TYPES][2][CDF_SIZE(11)];
+ aom_cdf_prob coeff_base_eob_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB]
+ [CDF_SIZE(3)];
+ aom_cdf_prob coeff_base_cdf[TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(4)];
+ aom_cdf_prob coeff_br_cdf[TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)];
+
+ aom_cdf_prob newmv_cdf[NEWMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob zeromv_cdf[GLOBALMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob refmv_cdf[REFMV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob drl_cdf[DRL_MODE_CONTEXTS][CDF_SIZE(2)];
+
+ aom_cdf_prob inter_compound_mode_cdf[INTER_MODE_CONTEXTS]
+ [CDF_SIZE(INTER_COMPOUND_MODES)];
+ aom_cdf_prob compound_type_cdf[BLOCK_SIZES_ALL][CDF_SIZE(COMPOUND_TYPES - 1)];
+ aom_cdf_prob wedge_idx_cdf[BLOCK_SIZES_ALL][CDF_SIZE(16)];
+ aom_cdf_prob interintra_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(2)];
+ aom_cdf_prob wedge_interintra_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob interintra_mode_cdf[BLOCK_SIZE_GROUPS]
+ [CDF_SIZE(INTERINTRA_MODES)];
+ aom_cdf_prob motion_mode_cdf[BLOCK_SIZES_ALL][CDF_SIZE(MOTION_MODES)];
+ aom_cdf_prob obmc_cdf[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob palette_y_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_uv_size_cdf[PALATTE_BSIZE_CTXS][CDF_SIZE(PALETTE_SIZES)];
+ aom_cdf_prob palette_y_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_uv_color_index_cdf[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [CDF_SIZE(PALETTE_COLORS)];
+ aom_cdf_prob palette_y_mode_cdf[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS]
+ [CDF_SIZE(2)];
+ aom_cdf_prob palette_uv_mode_cdf[PALETTE_UV_MODE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_inter_cdf[COMP_INTER_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob single_ref_cdf[REF_CONTEXTS][SINGLE_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_type_cdf[COMP_REF_TYPE_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob uni_comp_ref_cdf[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ aom_cdf_prob comp_ref_cdf[REF_CONTEXTS][FWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob comp_bwdref_cdf[REF_CONTEXTS][BWD_REFS - 1][CDF_SIZE(2)];
+ aom_cdf_prob txfm_partition_cdf[TXFM_PARTITION_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob compound_index_cdf[COMP_INDEX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob comp_group_idx_cdf[COMP_GROUP_IDX_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_mode_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob skip_cdfs[SKIP_CONTEXTS][CDF_SIZE(2)];
+ aom_cdf_prob intra_inter_cdf[INTRA_INTER_CONTEXTS][CDF_SIZE(2)];
+ nmv_context nmvc;  // set from default_nmv_context by av1_init_mv_probs()
+ nmv_context ndvc;  // set from the same defaults as nmvc
+ aom_cdf_prob intrabc_cdf[CDF_SIZE(2)];
+ struct segmentation_probs seg;
+ aom_cdf_prob filter_intra_cdfs[BLOCK_SIZES_ALL][CDF_SIZE(2)];
+ aom_cdf_prob filter_intra_mode_cdf[CDF_SIZE(FILTER_INTRA_MODES)];
+ aom_cdf_prob switchable_restore_cdf[CDF_SIZE(RESTORE_SWITCHABLE_TYPES)];
+ aom_cdf_prob wiener_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob sgrproj_restore_cdf[CDF_SIZE(2)];
+ aom_cdf_prob y_mode_cdf[BLOCK_SIZE_GROUPS][CDF_SIZE(INTRA_MODES)];
+ aom_cdf_prob uv_mode_cdf[CFL_ALLOWED_TYPES][INTRA_MODES]
+ [CDF_SIZE(UV_INTRA_MODES)];
+ aom_cdf_prob partition_cdf[PARTITION_CONTEXTS][CDF_SIZE(EXT_PARTITION_TYPES)];
+ aom_cdf_prob switchable_interp_cdf[SWITCHABLE_FILTER_CONTEXTS]
+ [CDF_SIZE(SWITCHABLE_FILTERS)];
+ /* kf_y_cdf is discarded after use, so does not require persistent storage.
+ However, we keep it with the other CDFs in this struct since it needs to
+ be copied to each tile to support parallelism just like the others.
+ */
+ aom_cdf_prob kf_y_cdf[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS]
+ [CDF_SIZE(INTRA_MODES)];
+
+ aom_cdf_prob angle_delta_cdf[DIRECTIONAL_MODES]
+ [CDF_SIZE(2 * MAX_ANGLE_DELTA + 1)];
+
+ aom_cdf_prob tx_size_cdf[MAX_TX_CATS][TX_SIZE_CONTEXTS]
+ [CDF_SIZE(MAX_TX_DEPTH + 1)];
+ aom_cdf_prob delta_q_cdf[CDF_SIZE(DELTA_Q_PROBS + 1)];
+ aom_cdf_prob delta_lf_multi_cdf[FRAME_LF_COUNT][CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob delta_lf_cdf[CDF_SIZE(DELTA_LF_PROBS + 1)];
+ aom_cdf_prob intra_ext_tx_cdf[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob inter_ext_tx_cdf[EXT_TX_SETS_INTER][EXT_TX_SIZES]
+ [CDF_SIZE(TX_TYPES)];
+ aom_cdf_prob cfl_sign_cdf[CDF_SIZE(CFL_JOINT_SIGNS)];
+ aom_cdf_prob cfl_alpha_cdf[CFL_ALPHA_CONTEXTS][CDF_SIZE(CFL_ALPHABET_SIZE)];
+ int initialized;  // set to 1 by av1_setup_past_independence()
+} FRAME_CONTEXT;
+// For each tx set, maps a TX_TYPE to its index within that set.
+static const int av1_ext_tx_ind[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 1, 5, 6, 4, 0, 0, 0, 0, 0, 0, 2, 3, 0, 0, 0, 0 },
+ { 3, 4, 5, 8, 6, 7, 9, 10, 11, 0, 1, 2, 0, 0, 0, 0 },
+ { 7, 8, 9, 12, 10, 11, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6 },
+};
+// For each tx set, maps an index within that set back to its TX_TYPE.
+static const int av1_ext_tx_inv[EXT_TX_SET_TYPES][TX_TYPES] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 0, 10, 11, 3, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 9, 10, 11, 0, 1, 2, 4, 5, 3, 6, 7, 8, 0, 0, 0, 0 },
+ { 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 4, 5, 3, 6, 7, 8 },
+};
+
+void av1_set_default_ref_deltas(int8_t *ref_deltas);
+void av1_set_default_mode_deltas(int8_t *mode_deltas);
+void av1_setup_frame_contexts(struct AV1Common *cm);
+void av1_setup_past_independence(struct AV1Common *cm);
+
+// Returns (int)ceil(log2(n)); any n < 2 yields 0.
+// The probe 'p' is unsigned so that n > 2^30 cannot overflow it.
+static INLINE int av1_ceil_log2(int n) {
+ if (n < 2) return 0;
+ int i = 1;
+ // n >= 2 here, so the cast to unsigned preserves its value.
+ for (unsigned int p = 2; p < (unsigned int)n; p <<= 1) {
+ i++;
+ }
+ return i;
+}
+
+// Returns the context for palette color index at row 'r' and column 'c',
+// along with the 'color_order' of neighbors and the 'color_idx'.
+// The 'color_map' is a 2D array with the given 'stride'.
+int av1_get_palette_color_index_context(const uint8_t *color_map, int stride,
+ int r, int c, int palette_size,
+ uint8_t *color_order, int *color_idx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMODE_H_
diff --git a/third_party/aom/av1/common/entropymv.c b/third_party/aom/av1/common/entropymv.c
new file mode 100644
index 0000000000..4913373875
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropymv.h"
+
+static const nmv_context default_nmv_context = {
+ { AOM_CDF4(4096, 11264, 19328) }, // joints_cdf
+ { {
+ // Vertical component
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // classes_cdf
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ },
+ {
+ // Horizontal component
+ { AOM_CDF11(28672, 30976, 31858, 32320, 32551, 32656, 32740, 32757,
+ 32762, 32767) }, // classes_cdf
+ { { AOM_CDF4(16384, 24576, 26624) },
+ { AOM_CDF4(12288, 21248, 24128) } }, // class0_fp_cdf
+ { AOM_CDF4(8192, 17408, 21248) }, // fp_cdf
+ { AOM_CDF2(128 * 128) }, // sign_cdf
+ { AOM_CDF2(160 * 128) }, // class0_hp_cdf
+ { AOM_CDF2(128 * 128) }, // hp_cdf
+ { AOM_CDF2(216 * 128) }, // class0_cdf
+ { { AOM_CDF2(128 * 136) },
+ { AOM_CDF2(128 * 140) },
+ { AOM_CDF2(128 * 148) },
+ { AOM_CDF2(128 * 160) },
+ { AOM_CDF2(128 * 176) },
+ { AOM_CDF2(128 * 192) },
+ { AOM_CDF2(128 * 224) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 234) },
+ { AOM_CDF2(128 * 240) } }, // bits_cdf
+ } },
+};
+
+void av1_init_mv_probs(AV1_COMMON *cm) {
+ // Both nmvc and ndvc start from the same default_nmv_context CDF tables.
+ cm->fc->nmvc = default_nmv_context;
+ cm->fc->ndvc = default_nmv_context;
+}
diff --git a/third_party/aom/av1/common/entropymv.h b/third_party/aom/av1/common/entropymv.h
new file mode 100644
index 0000000000..fa818a2c16
--- /dev/null
+++ b/third_party/aom/av1/common/entropymv.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENTROPYMV_H_
+#define AOM_AV1_COMMON_ENTROPYMV_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/prob.h"
+
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+void av1_init_mv_probs(struct AV1Common *cm);
+
+#define MV_UPDATE_PROB 252
+
+/* Symbols for coding which components are zero jointly */
+#define MV_JOINTS 4
+typedef enum {
+ MV_JOINT_ZERO = 0, /* Zero vector */
+ MV_JOINT_HNZVZ = 1, /* Vert zero, hor nonzero */
+ MV_JOINT_HZVNZ = 2, /* Hor zero, vert nonzero */
+ MV_JOINT_HNZVNZ = 3, /* Both components nonzero */
+} MV_JOINT_TYPE;
+/* Nonzero when the joint type has a nonzero vertical component. */
+static INLINE int mv_joint_vertical(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HZVNZ || type == MV_JOINT_HNZVNZ;
+}
+/* Nonzero when the joint type has a nonzero horizontal component. */
+static INLINE int mv_joint_horizontal(MV_JOINT_TYPE type) {
+ return type == MV_JOINT_HNZVZ || type == MV_JOINT_HNZVNZ;
+}
+
+/* Symbols for coding magnitude class of nonzero components */
+#define MV_CLASSES 11
+typedef enum {
+ MV_CLASS_0 = 0, /* (0, 2] integer pel */
+ MV_CLASS_1 = 1, /* (2, 4] integer pel */
+ MV_CLASS_2 = 2, /* (4, 8] integer pel */
+ MV_CLASS_3 = 3, /* (8, 16] integer pel */
+ MV_CLASS_4 = 4, /* (16, 32] integer pel */
+ MV_CLASS_5 = 5, /* (32, 64] integer pel */
+ MV_CLASS_6 = 6, /* (64, 128] integer pel */
+ MV_CLASS_7 = 7, /* (128, 256] integer pel */
+ MV_CLASS_8 = 8, /* (256, 512] integer pel */
+ MV_CLASS_9 = 9, /* (512, 1024] integer pel */
+ MV_CLASS_10 = 10, /* (1024,2048] integer pel */
+} MV_CLASS_TYPE;
+
+#define CLASS0_BITS 1 /* bits at integer precision for class 0 */
+#define CLASS0_SIZE (1 << CLASS0_BITS)
+#define MV_OFFSET_BITS (MV_CLASSES + CLASS0_BITS - 2)
+#define MV_BITS_CONTEXTS 6
+#define MV_FP_SIZE 4
+
+#define MV_MAX_BITS (MV_CLASSES + CLASS0_BITS + 2)
+#define MV_MAX ((1 << MV_MAX_BITS) - 1)
+#define MV_VALS ((MV_MAX << 1) + 1)
+
+#define MV_IN_USE_BITS 14
+#define MV_UPP (1 << MV_IN_USE_BITS)
+#define MV_LOW (-(1 << MV_IN_USE_BITS))
+
+typedef struct {
+ aom_cdf_prob classes_cdf[CDF_SIZE(MV_CLASSES)];
+ aom_cdf_prob class0_fp_cdf[CLASS0_SIZE][CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob fp_cdf[CDF_SIZE(MV_FP_SIZE)];
+ aom_cdf_prob sign_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob hp_cdf[CDF_SIZE(2)];
+ aom_cdf_prob class0_cdf[CDF_SIZE(CLASS0_SIZE)];
+ aom_cdf_prob bits_cdf[MV_OFFSET_BITS][CDF_SIZE(2)];
+} nmv_component;
+
+typedef struct {
+ aom_cdf_prob joints_cdf[CDF_SIZE(MV_JOINTS)];
+ nmv_component comps[2];
+} nmv_context;
+
+typedef enum {
+ MV_SUBPEL_NONE = -1,
+ MV_SUBPEL_LOW_PRECISION = 0,
+ MV_SUBPEL_HIGH_PRECISION,
+} MvSubpelPrecision;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENTROPYMV_H_
diff --git a/third_party/aom/av1/common/enums.h b/third_party/aom/av1/common/enums.h
new file mode 100644
index 0000000000..869c06ef2f
--- /dev/null
+++ b/third_party/aom/av1/common/enums.h
@@ -0,0 +1,619 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ENUMS_H_
+#define AOM_AV1_COMMON_ENUMS_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef MAX_SB_SIZE
+
+// Max superblock size
+#define MAX_SB_SIZE_LOG2 7
+#define MAX_SB_SIZE (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
+// Pixels per Mode Info (MI) unit
+#define MI_SIZE_LOG2 2
+#define MI_SIZE (1 << MI_SIZE_LOG2)
+
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE (1 << MAX_MIB_SIZE_LOG2)
+
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK (MAX_MIB_SIZE - 1)
+
+// Maximum number of tile rows and tile columns
+#define MAX_TILE_ROWS 64
+#define MAX_TILE_COLS 64
+
+#define MAX_VARTX_DEPTH 2
+
+#define MI_SIZE_64X64 (64 >> MI_SIZE_LOG2)
+#define MI_SIZE_128X128 (128 >> MI_SIZE_LOG2)
+
+#define MAX_PALETTE_SQUARE (64 * 64)
+// Maximum number of colors in a palette.
+#define PALETTE_MAX_SIZE 8
+// Minimum number of colors in a palette.
+#define PALETTE_MIN_SIZE 2
+
+#define FRAME_OFFSET_BITS 5
+#define MAX_FRAME_DISTANCE ((1 << FRAME_OFFSET_BITS) - 1)
+
+#define REF_FRAMES_LOG2 3
+#define REF_FRAMES (1 << REF_FRAMES_LOG2)
+
+// 4 scratch frames for the new frames to support a maximum of 4 cores decoding
+// in parallel, 3 for scaled references on the encoder.
+// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number
+// of framebuffers.
+// TODO(jkoleszar): These 3 extra references could probably come from the
+// normal reference pool.
+#define FRAME_BUFFERS (REF_FRAMES + 7)
+
+// 4 frame filter levels: y plane vertical, y plane horizontal,
+// u plane, and v plane
+#define FRAME_LF_COUNT 4
+#define DEFAULT_DELTA_LF_MULTI 0
+#define MAX_MODE_LF_DELTAS 2
+
+#define DIST_PRECISION_BITS 4
+#define DIST_PRECISION (1 << DIST_PRECISION_BITS) // 16
+
+// TODO(chengchen): Temporary flag that serves as an experimental switch for
+// the WIP bitmask construction.
+// Shall be removed once the bitmask code is completely checked in.
+#define LOOP_FILTER_BITMASK 0
+
+#define PROFILE_BITS 3
+// The following three profiles are currently defined.
+// Profile 0. 8-bit and 10-bit 4:2:0 and 4:0:0 only.
+// Profile 1. 8-bit and 10-bit 4:4:4
+// Profile 2. 8-bit and 10-bit 4:2:2
+// 12-bit 4:0:0, 4:2:2 and 4:4:4
+// Since we have three bits for the profiles, it can be extended later.
+typedef enum BITSTREAM_PROFILE {
+ PROFILE_0,
+ PROFILE_1,
+ PROFILE_2,
+ MAX_PROFILES,
+} BITSTREAM_PROFILE;
+
+#define LEVEL_MAJOR_BITS 3
+#define LEVEL_MINOR_BITS 2
+#define LEVEL_BITS (LEVEL_MAJOR_BITS + LEVEL_MINOR_BITS)
+
+#define LEVEL_MAJOR_MIN 2
+#define LEVEL_MAJOR_MAX ((1 << LEVEL_MAJOR_BITS) - 1 + LEVEL_MAJOR_MIN)
+#define LEVEL_MINOR_MIN 0
+#define LEVEL_MINOR_MAX ((1 << LEVEL_MINOR_BITS) - 1)
+
+#define OP_POINTS_CNT_MINUS_1_BITS 5
+#define OP_POINTS_IDC_BITS 12
+
+// Note: Some enums use the attribute 'packed' to use smallest possible integer
+// type, so that we can save memory when they are used in structs/arrays.
+
+typedef enum ATTRIBUTE_PACKED {
+ BLOCK_4X4,
+ BLOCK_4X8,
+ BLOCK_8X4,
+ BLOCK_8X8,
+ BLOCK_8X16,
+ BLOCK_16X8,
+ BLOCK_16X16,
+ BLOCK_16X32,
+ BLOCK_32X16,
+ BLOCK_32X32,
+ BLOCK_32X64,
+ BLOCK_64X32,
+ BLOCK_64X64,
+ BLOCK_64X128,
+ BLOCK_128X64,
+ BLOCK_128X128,
+ BLOCK_4X16,
+ BLOCK_16X4,
+ BLOCK_8X32,
+ BLOCK_32X8,
+ BLOCK_16X64,
+ BLOCK_64X16,
+ BLOCK_SIZES_ALL,
+ BLOCK_SIZES = BLOCK_4X16,
+ BLOCK_INVALID = 255,
+ BLOCK_LARGEST = (BLOCK_SIZES - 1)
+} BLOCK_SIZE;
+
+// 4X4, 8X8, 16X16, 32X32, 64X64, 128X128
+#define SQR_BLOCK_SIZES 6
+
+typedef enum ATTRIBUTE_PACKED {
+ PARTITION_NONE,
+ PARTITION_HORZ,
+ PARTITION_VERT,
+ PARTITION_SPLIT,
+ PARTITION_HORZ_A, // HORZ split and the top partition is split again
+ PARTITION_HORZ_B, // HORZ split and the bottom partition is split again
+ PARTITION_VERT_A, // VERT split and the left partition is split again
+ PARTITION_VERT_B, // VERT split and the right partition is split again
+ PARTITION_HORZ_4, // 4:1 horizontal partition
+ PARTITION_VERT_4, // 4:1 vertical partition
+ EXT_PARTITION_TYPES,
+ PARTITION_TYPES = PARTITION_SPLIT + 1,
+ PARTITION_INVALID = 255
+} PARTITION_TYPE;
+
+typedef char PARTITION_CONTEXT;
+#define PARTITION_PLOFFSET 4 // number of probability models per block size
+#define PARTITION_BLOCK_SIZES 5
+#define PARTITION_CONTEXTS (PARTITION_BLOCK_SIZES * PARTITION_PLOFFSET)
+
+// block transform size
+#if defined(_MSC_VER)
+typedef uint8_t TX_SIZE;
+enum ATTRIBUTE_PACKED {
+#else
+typedef enum ATTRIBUTE_PACKED {
+#endif
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X64, // 32x64 transform
+ TX_64X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X64, // 16x64 transform
+ TX_64X16, // 64x16 transform
+ TX_SIZES_ALL, // Includes rectangular transforms
+ TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
+ TX_SIZES_LARGEST = TX_64X64,
+ TX_INVALID = 255 // Invalid transform size
+#if defined(_MSC_VER)
+};
+#else
+} TX_SIZE;
+#endif
+
+#define TX_SIZE_LUMA_MIN (TX_4X4)
+/* We don't need to code a transform size unless the allowed size is at least
+ one more than the minimum. */
+#define TX_SIZE_CTX_MIN (TX_SIZE_LUMA_MIN + 1)
+
+// Maximum tx_size categories
+#define MAX_TX_CATS (TX_SIZES - TX_SIZE_CTX_MIN)
+#define MAX_TX_DEPTH 2
+
+#define MAX_TX_SIZE_LOG2 (6)
+#define MAX_TX_SIZE (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2 2
+#define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Pad 4 extra columns to remove horizontal availability check.
+#define TX_PAD_HOR_LOG2 2
+#define TX_PAD_HOR 4
+// Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
+// check.
+#define TX_PAD_TOP 2
+#define TX_PAD_BOTTOM 4
+#define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
+// Pad 16 extra bytes to avoid reading overflow in SIMD optimization.
+#define TX_PAD_END 16
+#define TX_PAD_2D ((32 + TX_PAD_HOR) * (32 + TX_PAD_VER) + TX_PAD_END)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+// frame transform mode
+typedef enum ATTRIBUTE_PACKED {
+ ONLY_4X4, // use only 4x4 transform
+ TX_MODE_LARGEST, // transform size is the largest possible for pu size
+ TX_MODE_SELECT, // transform specified for each block
+ TX_MODES,
+} TX_MODE;
+
+// 1D tx types
+typedef enum ATTRIBUTE_PACKED {
+ DCT_1D,
+ ADST_1D,
+ FLIPADST_1D,
+ IDTX_1D,
+ TX_TYPES_1D,
+} TX_TYPE_1D;
+
+typedef enum ATTRIBUTE_PACKED {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT,
+ DCT_FLIPADST,
+ FLIPADST_FLIPADST,
+ ADST_FLIPADST,
+ FLIPADST_ADST,
+ IDTX,
+ V_DCT,
+ H_DCT,
+ V_ADST,
+ H_ADST,
+ V_FLIPADST,
+ H_FLIPADST,
+ TX_TYPES,
+} TX_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ REG_REG,
+ REG_SMOOTH,
+ REG_SHARP,
+ SMOOTH_REG,
+ SMOOTH_SMOOTH,
+ SMOOTH_SHARP,
+ SHARP_REG,
+ SHARP_SMOOTH,
+ SHARP_SHARP,
+} DUAL_FILTER_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ // DCT only
+ EXT_TX_SET_DCTONLY,
+ // DCT + Identity only
+ EXT_TX_SET_DCT_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1)
+ EXT_TX_SET_DTT4_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_TYPES
+} TxSetType;
+
+#define IS_2D_TRANSFORM(tx_type) ((tx_type) < IDTX)  // types listed before IDTX
+
+#define EXT_TX_SIZES 4 // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER 4 // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA 3 // Sets of transform selections for INTRA
+
+typedef enum ATTRIBUTE_PACKED {
+ AOM_LAST_FLAG = 1 << 0,
+ AOM_LAST2_FLAG = 1 << 1,
+ AOM_LAST3_FLAG = 1 << 2,
+ AOM_GOLD_FLAG = 1 << 3,
+ AOM_BWD_FLAG = 1 << 4,
+ AOM_ALT2_FLAG = 1 << 5,
+ AOM_ALT_FLAG = 1 << 6,
+ AOM_REFFRAME_ALL = (1 << 7) - 1
+} AOM_REFFRAME;
+
+typedef enum ATTRIBUTE_PACKED {
+ UNIDIR_COMP_REFERENCE,
+ BIDIR_COMP_REFERENCE,
+ COMP_REFERENCE_TYPES,
+} COMP_REFERENCE_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ PLANE_TYPE_Y,
+ PLANE_TYPE_UV,
+ PLANE_TYPES
+} PLANE_TYPE;
+
+#define CFL_ALPHABET_SIZE_LOG2 4
+#define CFL_ALPHABET_SIZE (1 << CFL_ALPHABET_SIZE_LOG2)
+#define CFL_MAGS_SIZE ((2 << CFL_ALPHABET_SIZE_LOG2) + 1)
+#define CFL_IDX_U(idx) ((idx) >> CFL_ALPHABET_SIZE_LOG2)  // bits above the low ones
+#define CFL_IDX_V(idx) ((idx) & (CFL_ALPHABET_SIZE - 1))  // low LOG2 bits
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_PRED_U,
+ CFL_PRED_V,
+ CFL_PRED_PLANES
+} CFL_PRED_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_SIGN_ZERO,
+ CFL_SIGN_NEG,
+ CFL_SIGN_POS,
+ CFL_SIGNS
+} CFL_SIGN_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ CFL_DISALLOWED,
+ CFL_ALLOWED,
+ CFL_ALLOWED_TYPES
+} CFL_ALLOWED_TYPE;
+
+// CFL_SIGN_ZERO,CFL_SIGN_ZERO is invalid
+#define CFL_JOINT_SIGNS (CFL_SIGNS * CFL_SIGNS - 1)
+// CFL_SIGN_U is equivalent to (js + 1) / 3 for js in 0 to 8
+#define CFL_SIGN_U(js) ((((js) + 1) * 11) >> 5)
+// CFL_SIGN_V is equivalent to (js + 1) % 3 for js in 0 to 8
+#define CFL_SIGN_V(js) (((js) + 1) - CFL_SIGNS * CFL_SIGN_U(js))
+
+// There is no context when the alpha for a given plane is zero.
+// So there are 2 fewer contexts than joint signs.
+#define CFL_ALPHA_CONTEXTS (CFL_JOINT_SIGNS + 1 - CFL_SIGNS)
+#define CFL_CONTEXT_U(js) ((js) + 1 - CFL_SIGNS)
+// Also, the contexts are symmetric under swapping the planes.
+#define CFL_CONTEXT_V(js) \
+ (CFL_SIGN_V(js) * CFL_SIGNS + CFL_SIGN_U(js) - CFL_SIGNS)
+
+typedef enum ATTRIBUTE_PACKED {
+ PALETTE_MAP,
+ COLOR_MAP_TYPES,
+} COLOR_MAP_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ TWO_COLORS,
+ THREE_COLORS,
+ FOUR_COLORS,
+ FIVE_COLORS,
+ SIX_COLORS,
+ SEVEN_COLORS,
+ EIGHT_COLORS,
+ PALETTE_SIZES
+} PALETTE_SIZE;
+
+typedef enum ATTRIBUTE_PACKED {
+ PALETTE_COLOR_ONE,
+ PALETTE_COLOR_TWO,
+ PALETTE_COLOR_THREE,
+ PALETTE_COLOR_FOUR,
+ PALETTE_COLOR_FIVE,
+ PALETTE_COLOR_SIX,
+ PALETTE_COLOR_SEVEN,
+ PALETTE_COLOR_EIGHT,
+ PALETTE_COLORS
+} PALETTE_COLOR;
+
+// Note: All directional predictors must be between V_PRED and D67_PRED (both
+// inclusive).
+typedef enum ATTRIBUTE_PACKED {
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 degree
+ D135_PRED, // Directional 135 degree
+ D113_PRED, // Directional 113 degree
+ D157_PRED, // Directional 157 degree
+ D203_PRED, // Directional 203 degree
+ D67_PRED, // Directional 67 degree
+ SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ SMOOTH_V_PRED, // Vertical interpolation
+ SMOOTH_H_PRED, // Horizontal interpolation
+ PAETH_PRED, // Predict from the direction of smallest gradient
+ NEARESTMV,
+ NEARMV,
+ GLOBALMV,
+ NEWMV,
+ // Compound ref compound modes
+ NEAREST_NEARESTMV,
+ NEAR_NEARMV,
+ NEAREST_NEWMV,
+ NEW_NEARESTMV,
+ NEAR_NEWMV,
+ NEW_NEARMV,
+ GLOBAL_GLOBALMV,
+ NEW_NEWMV,
+ MB_MODE_COUNT,
+ INTRA_MODE_START = DC_PRED,
+ INTRA_MODE_END = NEARESTMV,
+ INTRA_MODE_NUM = INTRA_MODE_END - INTRA_MODE_START,
+ SINGLE_INTER_MODE_START = NEARESTMV,
+ SINGLE_INTER_MODE_END = NEAREST_NEARESTMV,
+ SINGLE_INTER_MODE_NUM = SINGLE_INTER_MODE_END - SINGLE_INTER_MODE_START,
+ COMP_INTER_MODE_START = NEAREST_NEARESTMV,
+ COMP_INTER_MODE_END = MB_MODE_COUNT,
+ COMP_INTER_MODE_NUM = COMP_INTER_MODE_END - COMP_INTER_MODE_START,
+ INTER_MODE_START = NEARESTMV,
+ INTER_MODE_END = MB_MODE_COUNT,
+ INTRA_MODES = PAETH_PRED + 1, // PAETH_PRED has to be the last intra mode.
+ INTRA_INVALID = MB_MODE_COUNT // For uv_mode in inter blocks
+} PREDICTION_MODE;
+
+// TODO(ltrudeau) Do we really want to pack this?
+// TODO(ltrudeau) Do we match with PREDICTION_MODE?
+typedef enum ATTRIBUTE_PACKED {
+ UV_DC_PRED, // Average of above and left pixels
+ UV_V_PRED, // Vertical
+ UV_H_PRED, // Horizontal
+ UV_D45_PRED, // Directional 45 degree
+ UV_D135_PRED, // Directional 135 degree
+ UV_D113_PRED, // Directional 113 degree
+ UV_D157_PRED, // Directional 157 degree
+ UV_D203_PRED, // Directional 203 degree
+ UV_D67_PRED, // Directional 67 degree
+ UV_SMOOTH_PRED, // Combination of horizontal and vertical interpolation
+ UV_SMOOTH_V_PRED, // Vertical interpolation
+ UV_SMOOTH_H_PRED, // Horizontal interpolation
+ UV_PAETH_PRED, // Predict from the direction of smallest gradient
+ UV_CFL_PRED, // Chroma-from-Luma
+ UV_INTRA_MODES,
+ UV_MODE_INVALID, // For uv_mode in inter blocks
+} UV_PREDICTION_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ SIMPLE_TRANSLATION,
+ OBMC_CAUSAL, // 2-sided OBMC
+ WARPED_CAUSAL, // 2-sided WARPED
+ MOTION_MODES
+} MOTION_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ II_DC_PRED,
+ II_V_PRED,
+ II_H_PRED,
+ II_SMOOTH_PRED,
+ INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+typedef enum ATTRIBUTE_PACKED {
+ COMPOUND_AVERAGE,
+ COMPOUND_WEDGE,
+ COMPOUND_DIFFWTD,
+ COMPOUND_TYPES,
+} COMPOUND_TYPE;
+
+typedef enum ATTRIBUTE_PACKED {
+ FILTER_DC_PRED,
+ FILTER_V_PRED,
+ FILTER_H_PRED,
+ FILTER_D157_PRED,
+ FILTER_PAETH_PRED,
+ FILTER_INTRA_MODES,
+} FILTER_INTRA_MODE;
+
+#define DIRECTIONAL_MODES 8
+#define MAX_ANGLE_DELTA 3
+#define ANGLE_STEP 3
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define SKIP_MODE_CONTEXTS 3
+
+#define COMP_INDEX_CONTEXTS 6
+#define COMP_GROUP_IDX_CONTEXTS 6
+
+#define NMV_CONTEXTS 3
+
+#define NEWMV_MODE_CONTEXTS 6
+#define GLOBALMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS 6
+#define DRL_MODE_CONTEXTS 3
+
+#define GLOBALMV_OFFSET 3
+#define REFMV_OFFSET 4
+
+#define NEWMV_CTX_MASK ((1 << GLOBALMV_OFFSET) - 1)
+#define GLOBALMV_CTX_MASK ((1 << (REFMV_OFFSET - GLOBALMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define COMP_NEWMV_CTXS 5
+#define INTER_MODE_CONTEXTS 8
+
+#define DELTA_Q_SMALL 3
+#define DELTA_Q_PROBS (DELTA_Q_SMALL)
+#define DEFAULT_DELTA_Q_RES 4
+#define DELTA_LF_SMALL 3
+#define DELTA_LF_PROBS (DELTA_LF_SMALL)
+#define DEFAULT_DELTA_LF_RES 2
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define MAX_REF_MV_STACK_SIZE 8
+#define REF_CAT_LEVEL 640
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 3
+
+#define COMP_REF_TYPE_CONTEXTS 5
+#define UNI_COMP_REF_CONTEXTS 3
+
+#define TXFM_PARTITION_CONTEXTS ((TX_SIZES - TX_8X8) * 6 - 3)
+typedef uint8_t TXFM_CONTEXT;
+
+#define NONE_FRAME -1
+#define INTRA_FRAME 0
+#define LAST_FRAME 1
+#define LAST2_FRAME 2
+#define LAST3_FRAME 3
+#define GOLDEN_FRAME 4
+#define BWDREF_FRAME 5
+#define ALTREF2_FRAME 6
+#define ALTREF_FRAME 7
+#define EXTREF_FRAME REF_FRAMES
+#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
+
+#define INTER_REFS_PER_FRAME (ALTREF_FRAME - LAST_FRAME + 1)
+
+#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+#define FWD_RF_OFFSET(ref) ((ref) - LAST_FRAME)  // offset of ref from LAST_FRAME
+#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
+#define BWD_RF_OFFSET(ref) ((ref) - BWDREF_FRAME)  // offset from BWDREF_FRAME
+
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+
+typedef enum ATTRIBUTE_PACKED {
+ LAST_LAST2_FRAMES, // { LAST_FRAME, LAST2_FRAME }
+ LAST_LAST3_FRAMES, // { LAST_FRAME, LAST3_FRAME }
+ LAST_GOLDEN_FRAMES, // { LAST_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF_FRAMES, // { BWDREF_FRAME, ALTREF_FRAME }
+ LAST2_LAST3_FRAMES, // { LAST2_FRAME, LAST3_FRAME }
+ LAST2_GOLDEN_FRAMES, // { LAST2_FRAME, GOLDEN_FRAME }
+ LAST3_GOLDEN_FRAMES, // { LAST3_FRAME, GOLDEN_FRAME }
+ BWDREF_ALTREF2_FRAMES, // { BWDREF_FRAME, ALTREF2_FRAME }
+ ALTREF2_ALTREF_FRAMES, // { ALTREF2_FRAME, ALTREF_FRAME }
+ TOTAL_UNIDIR_COMP_REFS,
+ // NOTE: UNIDIR_COMP_REFS is the number of uni-directional reference pairs
+ // that are explicitly signaled.
+ UNIDIR_COMP_REFS = BWDREF_ALTREF_FRAMES + 1,
+} UNIDIR_COMP_REF;
+
+#define TOTAL_COMP_REFS (FWD_REFS * BWD_REFS + TOTAL_UNIDIR_COMP_REFS)
+
+#define COMP_REFS (FWD_REFS * BWD_REFS + UNIDIR_COMP_REFS)
+
+// NOTE: A limited number of unidirectional reference pairs can be signalled for
+// compound prediction. The use of skip mode, on the other hand, makes it
+// possible to have a reference pair not listed for explicit signaling.
+#define MODE_CTX_REF_FRAMES (REF_FRAMES + TOTAL_COMP_REFS)
+
+typedef enum ATTRIBUTE_PACKED {
+ RESTORE_NONE,
+ RESTORE_WIENER,
+ RESTORE_SGRPROJ,
+ RESTORE_SWITCHABLE,
+ RESTORE_SWITCHABLE_TYPES = RESTORE_SWITCHABLE,
+ RESTORE_TYPES = 4,
+} RestorationType;
+
+#define SUPERRES_SCALE_BITS 3
+#define SUPERRES_SCALE_DENOMINATOR_MIN (SCALE_NUMERATOR + 1)
+
+// In large_scale_tile coding, external references are used.
+#define MAX_EXTERNAL_REFERENCES 128
+#define MAX_TILES 512
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ENUMS_H_
diff --git a/third_party/aom/av1/common/filter.h b/third_party/aom/av1/common/filter.h
new file mode 100644
index 0000000000..571422d114
--- /dev/null
+++ b/third_party/aom/av1/common/filter.h
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FILTER_H_
+#define AOM_AV1_COMMON_FILTER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_FILTER_TAP 8
+
+// Interpolation filter identifiers. EIGHTTAP_REGULAR..BILINEAR are the values
+// used to index av1_interp_filter_params_list and av1_interp_4tap.
+typedef enum ATTRIBUTE_PACKED {
+ EIGHTTAP_REGULAR,
+ EIGHTTAP_SMOOTH,
+ MULTITAP_SHARP,
+ BILINEAR,
+ INTERP_FILTERS_ALL,
+ SWITCHABLE_FILTERS = BILINEAR,
+ // SWITCHABLE has no entry of its own in the filter parameter tables;
+ // av1_unswitchable_filter() maps it to EIGHTTAP_REGULAR.
+ SWITCHABLE = SWITCHABLE_FILTERS + 1,
+ EXTRA_FILTERS = INTERP_FILTERS_ALL - SWITCHABLE_FILTERS,
+} InterpFilter;
+
+// Pack two InterpFilter's into a uint32_t: each filter is masked to 4 bits,
+// the y filter is stored in the low 16 bits and the x filter in the high 16
+// bits, which is more than enough space. This reduces argument passing and
+// unifies the operation of setting a (pair of) filters.
+//
+// There is no CONFIG_DUAL_FILTER conditional in this header; the packed
+// representation is always used.
+typedef uint32_t InterpFilters;
+// Returns one filter from a packed pair: the x filter (bits 16..19) when
+// |x_filter| is non-zero, otherwise the y filter (bits 0..3).
+static INLINE InterpFilter av1_extract_interp_filter(InterpFilters filters,
+                                                     int x_filter) {
+  const int shift = x_filter ? 16 : 0;
+  return (InterpFilter)((filters >> shift) & 0xf);
+}
+
+// Packs |y_filter| into bits 0..3 and |x_filter| into bits 16..19 of the
+// result; each filter value is truncated to its low 4 bits first.
+static INLINE InterpFilters av1_make_interp_filters(InterpFilter y_filter,
+                                                    InterpFilter x_filter) {
+  const uint32_t low_half = (uint32_t)(y_filter & 0xf);
+  const uint32_t high_half = (uint32_t)(x_filter & 0xf) << 16;
+  return high_half | low_half;
+}
+
+// Builds a packed filter pair that uses |filter| for both the y and the x
+// direction.
+static INLINE InterpFilters av1_broadcast_interp_filter(InterpFilter filter) {
+  const InterpFilter both_dirs = filter;
+  return av1_make_interp_filters(both_dirs, both_dirs);
+}
+
+// Maps SWITCHABLE to EIGHTTAP_REGULAR; every other value is returned
+// unchanged.
+static INLINE InterpFilter av1_unswitchable_filter(InterpFilter filter) {
+  if (filter != SWITCHABLE) return filter;
+  return EIGHTTAP_REGULAR;
+}
+
+/* (1 << LOG_SWITCHABLE_FILTERS) > SWITCHABLE_FILTERS */
+#define LOG_SWITCHABLE_FILTERS 2
+
+#define MAX_SUBPEL_TAPS 12
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET ((SWITCHABLE_FILTERS + 1) * 2)
+
+// Describes one bank of interpolation kernels.
+typedef struct InterpFilterParams {
+ // First coefficient of the kernel table; kernel |k| starts at
+ // filter_ptr + taps * k (see av1_get_interp_filter_subpel_kernel()).
+ const int16_t *filter_ptr;
+ // Number of coefficients stored per kernel.
+ uint16_t taps;
+ // Initialized to SUBPEL_SHIFTS for the tables in this header and to 0 for
+ // av1_intrabc_filter_params.
+ uint16_t subpel_shifts;
+ // Filter identifier recorded for this bank.
+ InterpFilter interp_filter;
+} InterpFilterParams;
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_bilinear_filters[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 }, { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 }, { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 }, { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 }, { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 }, { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 }, { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 }, { 0, 0, 0, 8, 120, 0, 0, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, -6, 126, 8, -2, 0, 0 },
+ { 0, 2, -10, 122, 18, -4, 0, 0 }, { 0, 2, -12, 116, 28, -8, 2, 0 },
+ { 0, 2, -14, 110, 38, -10, 2, 0 }, { 0, 2, -14, 102, 48, -12, 2, 0 },
+ { 0, 2, -16, 94, 58, -12, 2, 0 }, { 0, 2, -14, 84, 66, -12, 2, 0 },
+ { 0, 2, -14, 76, 76, -14, 2, 0 }, { 0, 2, -12, 66, 84, -14, 2, 0 },
+ { 0, 2, -12, 58, 94, -16, 2, 0 }, { 0, 2, -12, 48, 102, -14, 2, 0 },
+ { 0, 2, -10, 38, 110, -14, 2, 0 }, { 0, 2, -8, 28, 116, -12, 2, 0 },
+ { 0, 0, -4, 18, 122, -10, 2, 0 }, { 0, 0, -2, 8, 126, -6, 2, 0 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { -2, 2, -6, 126, 8, -2, 2, 0 },
+ { -2, 6, -12, 124, 16, -6, 4, -2 }, { -2, 8, -18, 120, 26, -10, 6, -2 },
+ { -4, 10, -22, 116, 38, -14, 6, -2 }, { -4, 10, -22, 108, 48, -18, 8, -2 },
+ { -4, 10, -24, 100, 60, -20, 8, -2 }, { -4, 10, -24, 90, 70, -22, 10, -2 },
+ { -4, 12, -24, 80, 80, -24, 12, -4 }, { -2, 10, -22, 70, 90, -24, 10, -4 },
+ { -2, 8, -20, 60, 100, -24, 10, -4 }, { -2, 8, -18, 48, 108, -22, 10, -4 },
+ { -2, 6, -14, 38, 116, -22, 10, -4 }, { -2, 6, -10, 26, 120, -18, 8, -2 },
+ { -2, 4, -6, 16, 124, -12, 6, -2 }, { 0, 2, -2, 8, 126, -6, 2, -2 }
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 2, 28, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, -2, 16, 54, 48, 12, 0, 0 },
+ { 0, -2, 14, 52, 52, 14, -2, 0 }, { 0, 0, 12, 48, 54, 16, -2, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 28, 2, 0 }
+};
+
+static const InterpFilterParams
+ av1_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_8smooth, SUBPEL_TAPS,
+ SUBPEL_SHIFTS, EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ MULTITAP_SHARP },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR }
+ };
+
+// A special 2-tap bilinear filter for IntraBC chroma. IntraBC uses full pixel
+// MV for luma. If sub-sampling exists, chroma may possibly use half-pel MV.
+DECLARE_ALIGNED(256, static const int16_t, av1_intrabc_bilinear_filter[2]) = {
+ 64,
+ 64,
+};
+
+static const InterpFilterParams av1_intrabc_filter_params = {
+ av1_intrabc_bilinear_filter, 2, 0, BILINEAR
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -4, 126, 8, -2, 0, 0 },
+ { 0, 0, -8, 122, 18, -4, 0, 0 }, { 0, 0, -10, 116, 28, -6, 0, 0 },
+ { 0, 0, -12, 110, 38, -8, 0, 0 }, { 0, 0, -12, 102, 48, -10, 0, 0 },
+ { 0, 0, -14, 94, 58, -10, 0, 0 }, { 0, 0, -12, 84, 66, -10, 0, 0 },
+ { 0, 0, -12, 76, 76, -12, 0, 0 }, { 0, 0, -10, 66, 84, -12, 0, 0 },
+ { 0, 0, -10, 58, 94, -14, 0, 0 }, { 0, 0, -10, 48, 102, -12, 0, 0 },
+ { 0, 0, -8, 38, 110, -12, 0, 0 }, { 0, 0, -6, 28, 116, -10, 0, 0 },
+ { 0, 0, -4, 18, 122, -8, 0, 0 }, { 0, 0, -2, 8, 126, -4, 0, 0 }
+};
+DECLARE_ALIGNED(256, static const InterpKernel,
+ av1_sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, 30, 62, 34, 2, 0, 0 },
+ { 0, 0, 26, 62, 36, 4, 0, 0 }, { 0, 0, 22, 62, 40, 4, 0, 0 },
+ { 0, 0, 20, 60, 42, 6, 0, 0 }, { 0, 0, 18, 58, 44, 8, 0, 0 },
+ { 0, 0, 16, 56, 46, 10, 0, 0 }, { 0, 0, 14, 54, 48, 12, 0, 0 },
+ { 0, 0, 12, 52, 52, 12, 0, 0 }, { 0, 0, 12, 48, 54, 14, 0, 0 },
+ { 0, 0, 10, 46, 56, 16, 0, 0 }, { 0, 0, 8, 44, 58, 18, 0, 0 },
+ { 0, 0, 6, 42, 60, 20, 0, 0 }, { 0, 0, 4, 40, 62, 22, 0, 0 },
+ { 0, 0, 4, 36, 62, 26, 0, 0 }, { 0, 0, 2, 34, 62, 30, 0, 0 }
+};
+
+// For w<=4, MULTITAP_SHARP is the same as EIGHTTAP_REGULAR
+static const InterpFilterParams av1_interp_4tap[SWITCHABLE_FILTERS + 1] = {
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_sub_pel_filters_4smooth, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_SMOOTH },
+ { (const int16_t *)av1_sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ EIGHTTAP_REGULAR },
+ { (const int16_t *)av1_bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS,
+ BILINEAR },
+};
+
+// Returns the parameters for |interp_filter|, taken from av1_interp_4tap when
+// |w| <= 4 and from av1_interp_filter_params_list otherwise. |interp_filter|
+// is used directly as the array index.
+static INLINE const InterpFilterParams *
+av1_get_interp_filter_params_with_block_size(const InterpFilter interp_filter,
+                                             const int w) {
+  const InterpFilterParams *const bank =
+      (w <= 4) ? av1_interp_4tap : av1_interp_filter_params_list;
+  return &bank[interp_filter];
+}
+
+// Returns the av1_interp_4tap entry selected by |interp_filter|.
+static INLINE const InterpFilterParams *av1_get_4tap_interp_filter_params(
+    const InterpFilter interp_filter) {
+  const InterpFilterParams *const entry = av1_interp_4tap + interp_filter;
+  return entry;
+}
+
+// Returns the start of the kernel table of the av1_interp_filter_params_list
+// entry selected by |interp_filter|.
+static INLINE const int16_t *av1_get_interp_filter_kernel(
+    const InterpFilter interp_filter) {
+  const InterpFilterParams *const params =
+      &av1_interp_filter_params_list[interp_filter];
+  return params->filter_ptr;
+}
+
+// Returns the kernel for sub-pixel position |subpel|: kernels are stored back
+// to back, |filter_params->taps| coefficients each.
+static INLINE const int16_t *av1_get_interp_filter_subpel_kernel(
+    const InterpFilterParams *const filter_params, const int subpel) {
+  const int coeff_offset = filter_params->taps * subpel;
+  return filter_params->filter_ptr + coeff_offset;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FILTER_H_
diff --git a/third_party/aom/av1/common/frame_buffers.c b/third_party/aom/av1/common/frame_buffers.c
new file mode 100644
index 0000000000..fd6c4bc799
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/frame_buffers.h"
+#include "aom_mem/aom_mem.h"
+
+// Allocates the zero-initialized array of internal frame buffer slots for
+// |list|, after releasing whatever |list| currently holds. Returns 0 on
+// success and 1 if the allocation fails. On failure the slot count is reset
+// to 0 so that code iterating over the list never indexes the NULL array.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+  av1_free_internal_frame_buffers(list);
+
+  const int num_buffers = AOM_MAXIMUM_REF_BUFFERS + AOM_MAXIMUM_WORK_BUFFERS;
+  list->int_fb =
+      (InternalFrameBuffer *)aom_calloc(num_buffers, sizeof(*list->int_fb));
+  if (list->int_fb == NULL) {
+    list->num_internal_frame_buffers = 0;
+    return 1;
+  }
+  list->num_internal_frame_buffers = num_buffers;
+  return 0;
+}
+
+// Frees every buffer owned by |list| and the slot array itself, then leaves
+// the list empty (NULL array, zero count). Because the count is reset and the
+// array is checked for NULL, calling this again, or calling it after a failed
+// allocation, does not index a NULL array.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+
+  if (list->int_fb != NULL) {
+    for (int i = 0; i < list->num_internal_frame_buffers; ++i) {
+      aom_free(list->int_fb[i].data);
+      list->int_fb[i].data = NULL;
+    }
+    aom_free(list->int_fb);
+    list->int_fb = NULL;
+  }
+  list->num_internal_frame_buffers = 0;
+}
+
+// Zero-fills the memory of every slot in |list| that has a buffer allocated
+// and is not marked in use; all other slots are left untouched.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list) {
+  assert(list != NULL);
+
+  for (int idx = 0; idx < list->num_internal_frame_buffers; ++idx) {
+    InternalFrameBuffer *const slot = &list->int_fb[idx];
+    if (slot->data == NULL || slot->in_use) continue;
+    memset(slot->data, 0, slot->size);
+  }
+}
+
+// Frame buffer callback backed by the InternalFrameBufferList that |cb_priv|
+// points to. Picks the first slot that is not in use, replaces its memory
+// with a zeroed allocation of |min_size| bytes if the slot is smaller than
+// that, marks the slot in use and describes it in |fb| (data, size, priv).
+// Returns 0 on success and -1 if |cb_priv| is NULL, every slot is in use, or
+// the allocation fails.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+                         aom_codec_frame_buffer_t *fb) {
+  InternalFrameBufferList *const int_fb_list =
+      (InternalFrameBufferList *)cb_priv;
+  if (int_fb_list == NULL) return -1;
+
+  // Find a free frame buffer.
+  int i;
+  for (i = 0; i < int_fb_list->num_internal_frame_buffers; ++i) {
+    if (!int_fb_list->int_fb[i].in_use) break;
+  }
+  if (i == int_fb_list->num_internal_frame_buffers) return -1;
+
+  InternalFrameBuffer *const int_fb = &int_fb_list->int_fb[i];
+  if (int_fb->size < min_size) {
+    aom_free(int_fb->data);
+    // The data must be zeroed to fix a valgrind error from the C loop filter
+    // due to access uninitialized memory in frame border. It could be
+    // skipped if border were totally removed.
+    int_fb->data = (uint8_t *)aom_calloc(1, min_size);
+    if (!int_fb->data) {
+      // The old allocation is already gone: drop its size as well, otherwise
+      // a later, smaller request would skip reallocation and hand out a NULL
+      // data pointer together with the stale size.
+      int_fb->size = 0;
+      return -1;
+    }
+    int_fb->size = min_size;
+  }
+
+  fb->data = int_fb->data;
+  fb->size = int_fb->size;
+  int_fb->in_use = 1;
+
+  // Set the frame buffer's private data to point at the internal frame buffer.
+  fb->priv = int_fb;
+  return 0;
+}
+
+// Release callback: clears the in-use flag of the internal buffer recorded in
+// |fb|->priv (if there is one) and resets |fb|->priv to NULL. |cb_priv| is
+// not used. Always returns 0.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb) {
+  (void)cb_priv;
+  InternalFrameBuffer *const owner = (InternalFrameBuffer *)fb->priv;
+  if (owner != NULL) owner->in_use = 0;
+  fb->priv = NULL;
+  return 0;
+}
diff --git a/third_party/aom/av1/common/frame_buffers.h b/third_party/aom/av1/common/frame_buffers.h
new file mode 100644
index 0000000000..16188e51c7
--- /dev/null
+++ b/third_party/aom/av1/common/frame_buffers.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_FRAME_BUFFERS_H_
+#define AOM_AV1_COMMON_FRAME_BUFFERS_H_
+
+#include "aom/aom_frame_buffer.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One slot of an InternalFrameBufferList.
+typedef struct InternalFrameBuffer {
+ uint8_t *data; // buffer memory; NULL until av1_get_frame_buffer() allocates it
+ size_t size; // size of |data| in bytes
+ int in_use; // set to 1 by av1_get_frame_buffer(), 0 by av1_release_frame_buffer()
+} InternalFrameBuffer;
+
+// Array of internal frame buffer slots, set up by
+// av1_alloc_internal_frame_buffers().
+typedef struct InternalFrameBufferList {
+ int num_internal_frame_buffers; // number of entries in |int_fb|
+ InternalFrameBuffer *int_fb; // slot array
+} InternalFrameBufferList;
+
+// Initializes |list|. Returns 0 on success.
+int av1_alloc_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Free any data allocated to the frame buffers.
+void av1_free_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Zeros all unused internal frame buffers. In particular, this zeros the
+// frame borders. Call this function after a sequence header change to
+// re-initialize the frame borders for the different width, height, or bit
+// depth.
+void av1_zero_unused_internal_frame_buffers(InternalFrameBufferList *list);
+
+// Callback used by libaom to request an external frame buffer. |cb_priv| is
+// the callback private data, which points to an InternalFrameBufferList.
+// |min_size| is the minimum size in bytes needed to decode the next frame.
+// |fb| is a pointer to the frame buffer.
+int av1_get_frame_buffer(void *cb_priv, size_t min_size,
+ aom_codec_frame_buffer_t *fb);
+
+// Callback used by libaom when there are no references to the frame buffer.
+// |cb_priv| is not used. |fb| is a pointer to the frame buffer.
+int av1_release_frame_buffer(void *cb_priv, aom_codec_frame_buffer_t *fb);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_FRAME_BUFFERS_H_
diff --git a/third_party/aom/av1/common/idct.c b/third_party/aom/av1/common/idct.c
new file mode 100644
index 0000000000..2c1cb98271
--- /dev/null
+++ b/third_party/aom/av1/common/idct.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+#include "av1/common/idct.h"
+
+// Returns a scale value derived from the tx_size_2d entry (pels) of
+// |tx_size|: 0 for up to 256 pels, 1 for up to 1024 pels, 2 above that.
+int av1_get_tx_scale(const TX_SIZE tx_size) {
+  // Largest possible pels is 4096 (64x64).
+  const int pels = tx_size_2d[tx_size];
+  if (pels > 1024) return 2;
+  return pels > 256 ? 1 : 0;
+}
+
+// NOTE: The implementation of all inverses need to be aware of the fact
+// that input and output could be the same buffer.
+
+// idct
+// Selects the 4x4 iwht add routine by |eob|: av1_highbd_iwht4x4_1_add when
+// |eob| <= 1, av1_highbd_iwht4x4_16_add otherwise.
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd) {
+  if (eob <= 1) {
+    av1_highbd_iwht4x4_1_add(input, dest, stride, bd);
+    return;
+  }
+  av1_highbd_iwht4x4_16_add(input, dest, stride, bd);
+}
+
+// High-bitdepth 4x4 inverse transform + add. In the lossless case this goes
+// through av1_highbd_iwht4x4_add (and expects DCT_DCT); otherwise it forwards
+// to av1_inv_txfm2d_add_4x4_c.
+void av1_highbd_inv_txfm_add_4x4_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  const int32_t *src = cast_to_int32(input);
+
+  if (!txfm_param->lossless) {
+    av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             bd);
+    return;
+  }
+
+  assert(tx_type == DCT_DCT);
+  av1_highbd_iwht4x4_add(input, dest, stride, txfm_param->eob, bd);
+}
+
+// High-bitdepth 4x8 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_4x8_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_4x8(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_4x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, tx_type, bd);
+}
+
+// High-bitdepth 8x4 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_8x4_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_8x4(const tran_low_t *input, uint8_t *dest,
+                                 int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_8x4_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, tx_type, bd);
+}
+
+// High-bitdepth 16x32 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_16x32_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_16x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 32x16 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_32x16_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_32x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 16x4 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_16x4_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_16x4(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_16x4_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 4x16 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_4x16_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_4x16(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_4x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 32x8 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_32x8_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_32x8(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_32x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 8x32 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_8x32_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_8x32(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_8x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 32x64 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_32x64_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_32x64_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 64x32 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_64x32_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_64x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 16x64 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_16x64_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_16x64(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_16x64_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 64x16 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_64x16_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_64x16(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_64x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, tx_type, bd);
+}
+
+// High-bitdepth 8x8 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_8x8_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_8x8_c(const tran_low_t *input, uint8_t *dest,
+                                   int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_8x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                           stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+// High-bitdepth 16x16 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_16x16_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_16x16_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_16x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+// High-bitdepth 8x16 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_8x16_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_8x16_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_8x16_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 16x8 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_16x8_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_16x8_c(const tran_low_t *input, uint8_t *dest,
+                                    int stride, const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  av1_inv_txfm2d_add_16x8_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                            stride, tx_type, bd);
+}
+
+// High-bitdepth 32x32 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_32x32_c with the transform type and bit depth taken from
+// |txfm_param|.
+void av1_highbd_inv_txfm_add_32x32_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  av1_inv_txfm2d_add_32x32_c(cast_to_int32(input), CONVERT_TO_SHORTPTR(dest),
+                             stride, txfm_param->tx_type, txfm_param->bd);
+}
+
+// High-bitdepth 64x64 inverse transform + add: forwards to
+// av1_inv_txfm2d_add_64x64_c with the transform type and bit depth taken from
+// |txfm_param|. The transform type is asserted to be DCT_DCT.
+void av1_highbd_inv_txfm_add_64x64_c(const tran_low_t *input, uint8_t *dest,
+                                     int stride, const TxfmParam *txfm_param) {
+  const int32_t *src = cast_to_int32(input);
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  assert(tx_type == DCT_DCT);
+  av1_inv_txfm2d_add_64x64_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+                             txfm_param->bd);
+}
+
+// Fills |txfm_param| from the arguments and from |xd|: tx_type, tx_size and
+// eob are copied from the arguments; lossless, bd, is_hbd and tx_set_type are
+// derived from |xd| (using the first entry of xd->mi). |plane| is unused.
+static void init_txfm_param(const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+ TX_TYPE tx_type, int eob, int reduced_tx_set,
+ TxfmParam *txfm_param) {
+ (void)plane;
+ txfm_param->tx_type = tx_type;
+ txfm_param->tx_size = tx_size;
+ txfm_param->eob = eob;
+ // Lossless flag is looked up by the segment id of xd->mi[0].
+ txfm_param->lossless = xd->lossless[xd->mi[0]->segment_id];
+ txfm_param->bd = xd->bd;
+ txfm_param->is_hbd = get_bitdepth_data_path_index(xd);
+ // The transform set depends on the size, on whether xd->mi[0] is an inter
+ // block, and on |reduced_tx_set|.
+ txfm_param->tx_set_type = av1_get_ext_tx_set_type(
+ txfm_param->tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+}
+
+// Dispatches on txfm_param->tx_size to the matching
+// av1_highbd_inv_txfm_add_* routine. A size without a case trips the assert
+// in the default branch.
+void av1_highbd_inv_txfm_add_c(const tran_low_t *input, uint8_t *dest,
+                               int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  switch (txfm_param->tx_size) {
+    // Square sizes.
+    case TX_4X4:
+      // this is like av1_short_idct4x4 but has a special case around eob<=1
+      // which is significant (not just an optimization) for the lossless
+      // case.
+      av1_highbd_inv_txfm_add_4x4_c(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_c(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_c(input, dest, stride, txfm_param);
+      break;
+    case TX_32X32:
+      av1_highbd_inv_txfm_add_32x32_c(input, dest, stride, txfm_param);
+      break;
+    case TX_64X64:
+      av1_highbd_inv_txfm_add_64x64_c(input, dest, stride, txfm_param);
+      break;
+
+    // 1:2 and 2:1 sizes.
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_c(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_c(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+
+    // 1:4 and 4:1 sizes.
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+    case TX_16X64:
+      av1_highbd_inv_txfm_add_16x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X16:
+      av1_highbd_inv_txfm_add_64x16(input, dest, stride, txfm_param);
+      break;
+
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
+
+// 8-bit inverse transform + add built on the high-bitdepth routine: the
+// destination block is copied into a 16-bit scratch buffer,
+// av1_highbd_inv_txfm_add runs on that buffer, and the result is copied back
+// with a cast to uint8_t.
+void av1_inv_txfm_add_c(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                        const TxfmParam *txfm_param) {
+  DECLARE_ALIGNED(32, uint16_t, tmp[MAX_TX_SQUARE]);
+  const int tmp_stride = MAX_TX_SIZE;
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  const int cols = tx_size_wide[tx_size];
+  const int rows = tx_size_high[tx_size];
+
+  // Widen the destination block into the scratch buffer.
+  for (int y = 0; y < rows; ++y) {
+    const uint8_t *const dst_row = dst + y * stride;
+    uint16_t *const tmp_row = tmp + y * tmp_stride;
+    for (int x = 0; x < cols; ++x) tmp_row[x] = dst_row[x];
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          txfm_param);
+
+  // Narrow the reconstructed block back into the 8-bit destination.
+  for (int y = 0; y < rows; ++y) {
+    uint8_t *const dst_row = dst + y * stride;
+    const uint16_t *const tmp_row = tmp + y * tmp_stride;
+    for (int x = 0; x < cols; ++x) dst_row[x] = (uint8_t)tmp_row[x];
+  }
+}
+
+// Inverse-transforms one block of coefficients into |dst|. Returns
+// immediately when |eob| is 0. Otherwise a TxfmParam is filled in via
+// init_txfm_param() and the block goes through av1_highbd_inv_txfm_add when
+// txfm_param.is_hbd is set, and through av1_inv_txfm_add when it is not.
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+                                 const tran_low_t *dqcoeff, int plane,
+                                 TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+                                 int stride, int eob, int reduced_tx_set) {
+  if (eob == 0) return;
+
+  assert(eob <= av1_get_max_eob(tx_size));
+
+  TxfmParam txfm_param;
+  init_txfm_param(xd, plane, tx_size, tx_type, eob, reduced_tx_set,
+                  &txfm_param);
+  assert(av1_ext_tx_used[txfm_param.tx_set_type][txfm_param.tx_type]);
+
+  if (!txfm_param.is_hbd) {
+    av1_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+    return;
+  }
+  av1_highbd_inv_txfm_add(dqcoeff, dst, stride, &txfm_param);
+}
diff --git a/third_party/aom/av1/common/idct.h b/third_party/aom/av1/common/idct.h
new file mode 100644
index 0000000000..d9454e73fe
--- /dev/null
+++ b/third_party/aom/av1/common/idct.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_IDCT_H_
+#define AOM_AV1_COMMON_IDCT_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+#include "aom_dsp/txfm_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*transform_1d)(const tran_low_t *, tran_low_t *);
+
+typedef struct {
+ transform_1d cols, rows; // vertical and horizontal
+} transform_2d;
+
+// NOTE(review): av1_get_tx_scale() in idct.c returns 2 for transform sizes
+// with more than 1024 pels, which is larger than MAX_TX_SCALE -- confirm that
+// users of MAX_TX_SCALE expect this.
+#define MAX_TX_SCALE 1
+int av1_get_tx_scale(const TX_SIZE tx_size);
+
+void av1_inverse_transform_block(const MACROBLOCKD *xd,
+ const tran_low_t *dqcoeff, int plane,
+ TX_TYPE tx_type, TX_SIZE tx_size, uint8_t *dst,
+ int stride, int eob, int reduced_tx_set);
+void av1_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+ int eob, int bd);
+
+// Reinterprets a tran_low_t coefficient pointer as an int32_t pointer. The
+// assert checks that both element types have the same size.
+static INLINE const int32_t *cast_to_int32(const tran_low_t *input) {
+ assert(sizeof(int32_t) == sizeof(tran_low_t));
+ return (const int32_t *)input;
+}
+
+typedef void(highbd_inv_txfm_add)(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *param);
+
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x8;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x64;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_64x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_16x4;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_4x16;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_8x32;
+highbd_inv_txfm_add av1_highbd_inv_txfm_add_32x8;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_IDCT_H_
diff --git a/third_party/aom/av1/common/mv.h b/third_party/aom/av1/common/mv.h
new file mode 100644
index 0000000000..5b0225192a
--- /dev/null
+++ b/third_party/aom/av1/common/mv.h
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_MV_H_
+#define AOM_AV1_COMMON_MV_H_
+
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "aom_dsp/aom_filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Sentinel compared against int_mv.as_int. Both 16-bit halves are 0x8000, so
+// the value is the same whichever half maps to row and whichever to col.
+#define INVALID_MV 0x80008000
+
+// Motion vector with 16-bit signed row (vertical) and column (horizontal)
+// components.
+typedef struct mv {
+ int16_t row;
+ int16_t col;
+} MV;
+
+// Convenience constant for an all-zero motion vector.
+static const MV kZeroMv = { 0, 0 };
+
+// The same 32 bits viewed either as one integer or as an MV.
+typedef union int_mv {
+ uint32_t as_int;
+ MV as_mv;
+} int_mv; /* facilitates faster equality tests and copies */
+
+// Motion vector with 32-bit components, for values that need more range than
+// MV offers.
+typedef struct mv32 {
+ int32_t row;
+ int32_t col;
+} MV32;
+
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS 16
+#define WARPEDMODEL_ROW3HOMO_PREC_BITS 16
+
+// Clamp magnitudes expressed in model precision:
+// 128 << 16, 1 << 13 and 1 << 14 respectively.
+#define WARPEDMODEL_TRANS_CLAMP (128 << WARPEDMODEL_PREC_BITS)
+#define WARPEDMODEL_NONDIAGAFFINE_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 3))
+#define WARPEDMODEL_ROW3HOMO_CLAMP (1 << (WARPEDMODEL_PREC_BITS - 2))
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS 6
+// Number of sub-pixel positions: 1 << 6 == 64.
+#define WARPEDPIXEL_PREC_SHIFTS (1 << WARPEDPIXEL_PREC_BITS)
+
+#define WARP_PARAM_REDUCE_BITS 6
+
+// Precision gap between the model and the interpolation grid: 16 - 6 == 10.
+#define WARPEDDIFF_PREC_BITS (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+/* clang-format off */
+// Motion model kinds, ordered by increasing parameter count. The numeric
+// value doubles as the index into trans_model_params[], and TRANS_TYPES is
+// the number of real entries.
+typedef enum ATTRIBUTE_PACKED {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ TRANS_TYPES,
+} TransformationType;
+/* clang-format on */
+
+// Number of types used for global motion (must be >= 3 and <= TRANS_TYPES)
+// The following can be useful:
+// GLOBAL_TRANS_TYPES 3 - up to rotation-zoom
+// GLOBAL_TRANS_TYPES 4 - up to affine
+// GLOBAL_TRANS_TYPES 6 - up to hor/ver trapezoids
+// GLOBAL_TRANS_TYPES 7 - up to full homography
+// NOTE(review): TransformationType above stops at AFFINE (TRANS_TYPES == 4),
+// so the values 6 and 7 listed here exceed the stated upper bound and refer
+// to model types that this enum does not define.
+#define GLOBAL_TRANS_TYPES 4
+
+// Pair of flags recording whether global warping and local warping are
+// permitted. NOTE(review): the consumers are outside this header — confirm
+// the exact meaning of each flag there.
+typedef struct {
+ int global_warp_allowed;
+ int local_warp_allowed;
+} WarpTypesAllowed;
+
+// Number of parameters used by each transformation in TransformationType,
+// indexed by the enum value (IDENTITY, TRANSLATION, ROTZOOM, AFFINE).
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+
+// The order of values in the wmmat matrix below is best described
+// by the homography:
+// [x' (m2 m3 m0 [x
+// z . y' = m4 m5 m1 * y
+// 1] m6 m7 1) 1]
+//
+// That is, wmmat[0] and wmmat[1] are the translation terms and
+// wmmat[2..5] the 2x2 linear part. wmtype records which model the
+// coefficients describe.
+// NOTE(review): alpha/beta/gamma/delta and `invalid` are filled in and
+// consumed outside this header — confirm their meaning at those sites.
+typedef struct {
+ TransformationType wmtype;
+ int32_t wmmat[8];
+ int16_t alpha, beta, gamma, delta;
+ int8_t invalid;
+} WarpedMotionParams;
+
+/* clang-format off */
+// Identity model: the two diagonal terms wmmat[2] and wmmat[5] equal 1.0 in
+// WARPEDMODEL_PREC_BITS fixed point, every other coefficient is zero, and
+// alpha/beta/gamma/delta and `invalid` are zero.
+static const WarpedMotionParams default_warp_params = {
+ IDENTITY,
+ { 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0, 0, (1 << WARPEDMODEL_PREC_BITS), 0,
+ 0 },
+ 0, 0, 0, 0,
+ 0,
+};
+/* clang-format on */
+
+// The following constants describe the various precisions
+// of different parameters in the global motion experiment.
+//
+// Given the general homography:
+// [x' (a b c [x
+// z . y' = d e f * y
+// 1] g h i) 1]
+//
+// Constants using the name ALPHA here are related to parameters
+// a, b, d, e. Constants using the name TRANS are related
+// to parameters c and f.
+//
+// Anything ending in PREC_BITS is the number of bits of precision
+// to maintain when converting from double to integer.
+//
+// The ABS parameters are used to create an upper and lower bound
+// for each parameter. In other words, after a parameter is integerized
+// it is clamped between -(1 << ABS_XXX_BITS) and (1 << ABS_XXX_BITS).
+//
+// XXX_PREC_DIFF and XXX_DECODE_FACTOR
+// are computed once here to prevent repetitive
+// computation on the decoder side. These are
+// to allow the global motion parameters to be encoded in a lower
+// precision than the warped model precision. This means that they
+// need to be changed to warped precision when they are decoded.
+//
+// XX_MIN, XX_MAX are also computed to avoid repeated computation
+
+// NOTE(review): SUBEXPFIN_K is not used in this header; presumably a
+// parameter of the sub-exponential coding of these values — confirm.
+#define SUBEXPFIN_K 3
+// Translation terms (parameters c and f).
+#define GM_TRANS_PREC_BITS 6
+#define GM_ABS_TRANS_BITS 12
+// 12 - 6 + 3 == 9
+#define GM_ABS_TRANS_ONLY_BITS (GM_ABS_TRANS_BITS - GM_TRANS_PREC_BITS + 3)
+// 16 - 6 == 10
+#define GM_TRANS_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_TRANS_PREC_BITS)
+// 16 - 3 == 13
+#define GM_TRANS_ONLY_PREC_DIFF (WARPEDMODEL_PREC_BITS - 3)
+// 1 << 10 and 1 << 13
+#define GM_TRANS_DECODE_FACTOR (1 << GM_TRANS_PREC_DIFF)
+#define GM_TRANS_ONLY_DECODE_FACTOR (1 << GM_TRANS_ONLY_PREC_DIFF)
+
+// Linear terms (parameters a, b, d, e).
+#define GM_ALPHA_PREC_BITS 15
+#define GM_ABS_ALPHA_BITS 12
+// 16 - 15 == 1, so the decode factor is 2.
+#define GM_ALPHA_PREC_DIFF (WARPEDMODEL_PREC_BITS - GM_ALPHA_PREC_BITS)
+#define GM_ALPHA_DECODE_FACTOR (1 << GM_ALPHA_PREC_DIFF)
+
+// Third-row (projective) terms. The precision difference is 16 - 16 == 0, so
+// the decode factor is 1.
+#define GM_ROW3HOMO_PREC_BITS 16
+#define GM_ABS_ROW3HOMO_BITS 11
+#define GM_ROW3HOMO_PREC_DIFF \
+ (WARPEDMODEL_ROW3HOMO_PREC_BITS - GM_ROW3HOMO_PREC_BITS)
+#define GM_ROW3HOMO_DECODE_FACTOR (1 << GM_ROW3HOMO_PREC_DIFF)
+
+// Upper bounds: 4096, 4096 and 2048.
+#define GM_TRANS_MAX (1 << GM_ABS_TRANS_BITS)
+#define GM_ALPHA_MAX (1 << GM_ABS_ALPHA_BITS)
+#define GM_ROW3HOMO_MAX (1 << GM_ABS_ROW3HOMO_BITS)
+
+// Lower bounds: the negation of the matching *_MAX value. Each expansion is
+// parenthesized so that it remains a single operand wherever the macro is
+// substituted.
+#define GM_TRANS_MIN (-GM_TRANS_MAX)
+#define GM_ALPHA_MIN (-GM_ALPHA_MAX)
+#define GM_ROW3HOMO_MIN (-GM_ROW3HOMO_MAX)
+
+// Horizontal pixel coordinate used as the centre of a block: the left edge
+// (mi_col in MI_SIZE units) plus half the block width, minus one.
+static INLINE int block_center_x(int mi_col, BLOCK_SIZE bs) {
+  return mi_col * MI_SIZE + block_size_wide[bs] / 2 - 1;
+}
+
+// Vertical pixel coordinate used as the centre of a block: the top edge
+// (mi_row in MI_SIZE units) plus half the block height, minus one.
+static INLINE int block_center_y(int mi_row, BLOCK_SIZE bs) {
+  return mi_row * MI_SIZE + block_size_high[bs] / 2 - 1;
+}
+
+// Reduces a coordinate held in WARPEDMODEL_PREC_BITS fixed point to motion
+// vector precision. Without allow_hp the value is rounded one bit coarser and
+// then doubled, which leaves the lowest bit of the result clear.
+static INLINE int convert_to_trans_prec(int allow_hp, int coor) {
+  if (!allow_hp) {
+    return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 2) * 2;
+  }
+  return ROUND_POWER_OF_TWO_SIGNED(coor, WARPEDMODEL_PREC_BITS - 3);
+}
+// Rounds both components of *mv, in place, to the nearest multiple of 8.
+// A remainder of exactly 4 (in magnitude) rounds toward zero; anything larger
+// rounds away from zero. The row is processed first, then the column.
+static INLINE void integer_mv_precision(MV *mv) {
+  int16_t *const components[2] = { &mv->row, &mv->col };
+
+  for (int c = 0; c < 2; ++c) {
+    int16_t *const comp = components[c];
+    const int rem = *comp % 8;
+    if (rem == 0) continue;
+
+    // Strip the remainder; C's % truncates, so this moves toward zero.
+    *comp -= rem;
+    if (abs(rem) <= 4) continue;
+
+    // More than half a step was discarded: move one full step away from zero.
+    if (rem > 0)
+      *comp += 8;
+    else
+      *comp -= 8;
+  }
+}
+// Convert a global motion vector into a motion vector at the centre of the
+// given block.
+//
+// The resulting motion vector will have three fractional bits of precision. If
+// allow_hp is zero, the bottom bit will always be zero. If CONFIG_AMVR and
+// is_integer is true, the bottom three bits will be zero (so the motion vector
+// represents an integer)
+//
+// Parameters:
+//   gm              global motion model to evaluate
+//   allow_hp        nonzero to keep the lowest fractional bit
+//   bsize           block size, used to locate the block centre
+//   mi_col, mi_row  block position, passed to block_center_x/y
+//   is_integer      nonzero to round the result with integer_mv_precision()
+// Returns the motion vector; an all-zero vector for an IDENTITY model.
+static INLINE int_mv gm_get_motion_vector(const WarpedMotionParams *gm,
+ int allow_hp, BLOCK_SIZE bsize,
+ int mi_col, int mi_row,
+ int is_integer) {
+ int_mv res;
+
+ // Identity model: no motion, and no rounding is needed.
+ if (gm->wmtype == IDENTITY) {
+ res.as_int = 0;
+ return res;
+ }
+
+ const int32_t *mat = gm->wmmat;
+ int x, y, tx, ty;
+
+ if (gm->wmtype == TRANSLATION) {
+ // All global motion vectors are stored with WARPEDMODEL_PREC_BITS (16)
+ // bits of fractional precision. The offset for a translation is stored in
+ // entries 0 and 1. For translations, all but the top three (two if
+ // cm->allow_high_precision_mv is false) fractional bits are always zero.
+ //
+ // After the right shifts, there are 3 fractional bits of precision. If
+ // allow_hp is false, the bottom bit is always zero (so we don't need a
+ // call to convert_to_trans_prec here)
+ res.as_mv.row = gm->wmmat[0] >> GM_TRANS_ONLY_PREC_DIFF;
+ res.as_mv.col = gm->wmmat[1] >> GM_TRANS_ONLY_PREC_DIFF;
+ assert(IMPLIES(1 & (res.as_mv.row | res.as_mv.col), allow_hp));
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+ }
+
+ // ROTZOOM / AFFINE: evaluate the model at the block centre.
+ x = block_center_x(mi_col, bsize);
+ y = block_center_y(mi_row, bsize);
+
+ // A rotation-zoom model must have the constrained form
+ // m5 == m2 and m4 == -m3.
+ if (gm->wmtype == ROTZOOM) {
+ assert(gm->wmmat[5] == gm->wmmat[2]);
+ assert(gm->wmmat[4] == -gm->wmmat[3]);
+ }
+
+ // Displacement = (model - identity) applied to (x, y), plus translation.
+ // Subtracting 1.0 (1 << WARPEDMODEL_PREC_BITS) from the diagonal terms
+ // removes the identity part.
+ const int xc =
+ (mat[2] - (1 << WARPEDMODEL_PREC_BITS)) * x + mat[3] * y + mat[0];
+ const int yc =
+ mat[4] * x + (mat[5] - (1 << WARPEDMODEL_PREC_BITS)) * y + mat[1];
+ tx = convert_to_trans_prec(allow_hp, xc);
+ ty = convert_to_trans_prec(allow_hp, yc);
+
+ // The x-derived value becomes the column and the y-derived value the row.
+ res.as_mv.row = ty;
+ res.as_mv.col = tx;
+
+ if (is_integer) {
+ integer_mv_precision(&res.as_mv);
+ }
+ return res;
+}
+
+// Classifies a warp matrix by inspecting its coefficients:
+//  - identity linear part with zero translation     -> IDENTITY
+//  - identity linear part with non-zero translation -> TRANSLATION
+//  - m2 == m5 and m3 == -m4                         -> ROTZOOM
+//  - anything else                                  -> AFFINE
+static INLINE TransformationType get_gmtype(const WarpedMotionParams *gm) {
+  const int32_t *const m = gm->wmmat;
+  const int32_t unity = (1 << WARPEDMODEL_PREC_BITS);
+
+  const int linear_is_identity =
+      m[5] == unity && !m[4] && m[2] == unity && !m[3];
+  if (linear_is_identity) {
+    return (m[1] || m[0]) ? TRANSLATION : IDENTITY;
+  }
+
+  return (m[2] == m[5] && m[3] == -m[4]) ? ROTZOOM : AFFINE;
+}
+
+// One entry of a reference motion vector candidate stack: the motion vector
+// for the first reference, the motion vector for the second reference (only
+// written for compound candidates), and an accumulated weight used to rank
+// entries.
+typedef struct candidate_mv {
+ int_mv this_mv;
+ int_mv comp_mv;
+ int weight;
+} CANDIDATE_MV;
+
+// Returns nonzero when both components of *mv are zero.
+//
+// The fields are compared directly instead of reading the struct through a
+// uint32_t pointer: MV consists of two int16_t members, so such an access is
+// not through a compatible type and is not guaranteed to be suitably aligned.
+static INLINE int is_zero_mv(const MV *mv) {
+  return mv->row == 0 && mv->col == 0;
+}
+
+// Returns nonzero when *a and *b have identical row and column components.
+//
+// The fields are compared directly instead of reading each struct through a
+// uint32_t pointer: MV consists of two int16_t members, so such an access is
+// not through a compatible type and is not guaranteed to be suitably aligned.
+static INLINE int is_equal_mv(const MV *a, const MV *b) {
+  return a->row == b->row && a->col == b->col;
+}
+
+// Restricts *mv, in place, to the box [min_col, max_col] x [min_row, max_row]
+// (bounds are inclusive, as implemented by clamp()).
+static INLINE void clamp_mv(MV *mv, int min_col, int max_col, int min_row,
+                            int max_row) {
+  const int bounded_col = clamp(mv->col, min_col, max_col);
+  const int bounded_row = clamp(mv->row, min_row, max_row);
+  mv->col = (int16_t)bounded_col;
+  mv->row = (int16_t)bounded_row;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MV_H_
diff --git a/third_party/aom/av1/common/mvref_common.c b/third_party/aom/av1/common/mvref_common.c
new file mode 100644
index 0000000000..7f24ab4e6d
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.c
@@ -0,0 +1,1523 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "av1/common/mvref_common.h"
+#include "av1/common/warped_motion.h"
+
+// Fixed-point reciprocal table: div_mult[i] == (1 << 14) / i (truncated) for
+// i in 1..31, with a 0 placeholder at index 0. The entries are stored as int,
+// but the largest one is 1 << 14, so every value fits in 15 bits.
+// This is read-only lookup data, hence const.
+static const int div_mult[32] = { 0,    16384, 8192, 5461, 4096, 3276, 2730, 2340,
+                                  2048, 1820,  1638, 1489, 1365, 1260, 1170, 1092,
+                                  1024, 963,   910,  862,  819,  780,  744,  712,
+                                  682,  655,   630,  606,  585,  564,  546,  528 };
+
+// TODO(jingning): Consider the use of lookup table for (num / den)
+// altogether.
+//
+// Scales `ref` by num / den using the div_mult reciprocal table and stores
+// the result, clamped to (MV_LOW, MV_UPP) exclusive, in *output.
+// den is capped at MAX_FRAME_DISTANCE and num is limited to
+// [-MAX_FRAME_DISTANCE, MAX_FRAME_DISTANCE] before use.
+static void get_mv_projection(MV *output, MV ref, int num, int den) {
+  const int lowest = MV_LOW + 1;
+  const int highest = MV_UPP - 1;
+
+  den = AOMMIN(den, MAX_FRAME_DISTANCE);
+  if (num > 0)
+    num = AOMMIN(num, MAX_FRAME_DISTANCE);
+  else
+    num = AOMMAX(num, -MAX_FRAME_DISTANCE);
+
+  // div_mult[] holds reciprocals with 14 fractional bits, hence the shift.
+  const int projected_row =
+      ROUND_POWER_OF_TWO_SIGNED(ref.row * num * div_mult[den], 14);
+  const int projected_col =
+      ROUND_POWER_OF_TWO_SIGNED(ref.col * num * div_mult[den], 14);
+
+  output->row = (int16_t)clamp(projected_row, lowest, highest);
+  output->col = (int16_t)clamp(projected_col, lowest, highest);
+}
+
+// Records the motion information of one block into the current frame's
+// motion vector buffer (cm->cur_frame->mvs), which is stored at half the
+// mode-info resolution in each direction.
+//
+//   mi              mode info of the block being recorded
+//   mi_row, mi_col  block position in mode-info units
+//   x_mis, y_mis    block extent in mode-info units (halved, rounding up,
+//                   before use)
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis) {
+ const int frame_mvs_stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+ MV_REF *frame_mvs =
+ cm->cur_frame->mvs + (mi_row >> 1) * frame_mvs_stride + (mi_col >> 1);
+ x_mis = ROUND_POWER_OF_TWO(x_mis, 1);
+ y_mis = ROUND_POWER_OF_TWO(y_mis, 1);
+ int w, h;
+
+ for (h = 0; h < y_mis; h++) {
+ MV_REF *mv = frame_mvs;
+ for (w = 0; w < x_mis; w++) {
+ // Default: nothing stored for this cell.
+ mv->ref_frame = NONE_FRAME;
+ mv->mv.as_int = 0;
+
+ // Look at both reference slots; a later qualifying slot overwrites an
+ // earlier one.
+ for (int idx = 0; idx < 2; ++idx) {
+ MV_REFERENCE_FRAME ref_frame = mi->ref_frame[idx];
+ if (ref_frame > INTRA_FRAME) {
+ // Skip references whose ref_frame_side entry is non-zero.
+ int8_t ref_idx = cm->ref_frame_side[ref_frame];
+ if (ref_idx) continue;
+ // Skip vectors whose magnitude exceeds REFMVS_LIMIT.
+ if ((abs(mi->mv[idx].as_mv.row) > REFMVS_LIMIT) ||
+ (abs(mi->mv[idx].as_mv.col) > REFMVS_LIMIT))
+ continue;
+ mv->ref_frame = ref_frame;
+ mv->mv.as_int = mi->mv[idx].as_int;
+ }
+ }
+ mv++;
+ }
+ frame_mvs += frame_mvs_stride;
+ }
+}
+
+// Merges the motion vector(s) of one neighbouring block into the candidate
+// stack for the reference frame pair rf.
+//
+//   candidate         neighbouring block's mode info
+//   rf                target reference frame(s); rf[1] == NONE_FRAME means
+//                     single reference
+//   refmv_count       in/out: number of entries in ref_mv_stack
+//   ref_match_count   in/out: incremented for every reference match
+//   newmv_count       in/out: incremented when the matching candidate's
+//                     mode satisfies have_newmv_in_inter_mode()
+//   gm_mv_candidates  global motion vectors substituted for global-MV blocks
+//   gm_params         global motion models, indexed by reference frame
+//   col               forwarded to get_sub_block_mv()
+//   weight            weight added for this candidate; must be even
+//
+// A motion vector already in the stack has its weight increased; otherwise
+// it is appended if the stack has fewer than MAX_REF_MV_STACK_SIZE entries.
+// The match counters are updated even when the stack is full.
+static void add_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const MV_REFERENCE_FRAME rf[2],
+ uint8_t *refmv_count, uint8_t *ref_match_count, uint8_t *newmv_count,
+ CANDIDATE_MV *ref_mv_stack, int_mv *gm_mv_candidates,
+ const WarpedMotionParams *gm_params, int col, int weight) {
+ if (!is_inter_block(candidate)) return; // for intrabc
+ int index = 0, ref;
+ assert(weight % 2 == 0);
+
+ if (rf[1] == NONE_FRAME) {
+ // single reference frame
+ // Either of the candidate's two reference slots may match rf[0].
+ for (ref = 0; ref < 2; ++ref) {
+ if (candidate->ref_frame[ref] == rf[0]) {
+ int_mv this_refmv;
+ if (is_global_mv_block(candidate, gm_params[rf[0]].wmtype))
+ this_refmv = gm_mv_candidates[0];
+ else
+ this_refmv = get_sub_block_mv(candidate, ref, col);
+
+ // Look for an identical vector already in the stack.
+ for (index = 0; index < *refmv_count; ++index)
+ if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int) break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv;
+ ref_mv_stack[index].weight = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+ } else {
+ // compound reference frame
+ // Both references must match, in the same order.
+ if (candidate->ref_frame[0] == rf[0] && candidate->ref_frame[1] == rf[1]) {
+ int_mv this_refmv[2];
+
+ for (ref = 0; ref < 2; ++ref) {
+ if (is_global_mv_block(candidate, gm_params[rf[ref]].wmtype))
+ this_refmv[ref] = gm_mv_candidates[ref];
+ else
+ this_refmv[ref] = get_sub_block_mv(candidate, ref, col);
+ }
+
+ // A compound entry matches only if both vectors are identical.
+ for (index = 0; index < *refmv_count; ++index)
+ if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
+ (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+ break;
+
+ if (index < *refmv_count) ref_mv_stack[index].weight += weight;
+
+ // Add a new item to the list.
+ if (index == *refmv_count && *refmv_count < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[index].this_mv = this_refmv[0];
+ ref_mv_stack[index].comp_mv = this_refmv[1];
+ ref_mv_stack[index].weight = weight;
+ ++(*refmv_count);
+ }
+ if (have_newmv_in_inter_mode(candidate->mode)) ++*newmv_count;
+ ++*ref_match_count;
+ }
+ }
+}
+
+// Walks along the row of mode info located row_offset rows from the current
+// block (negative means above) and feeds every candidate it visits to
+// add_ref_mv_candidate().
+//
+//   row_offset      row to scan, relative to the current block
+//   max_row_offset  furthest row offset the caller may scan; used for the
+//                   weight and for *processed_rows
+//   processed_rows  out: updated when the candidate is at least as wide as
+//                   the current block
+// The remaining parameters are forwarded to add_ref_mv_candidate(); mi_row
+// is unused.
+static void scan_row_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates, int max_row_offset,
+ int *processed_rows) {
+ // Scan width: the block width, limited by the frame's right edge and by the
+ // width of a 64x64 block.
+ int end_mi = AOMMIN(xd->n4_w, cm->mi_cols - mi_col);
+ end_mi = AOMMIN(end_mi, mi_size_wide[BLOCK_64X64]);
+ const int n8_w_8 = mi_size_wide[BLOCK_8X8];
+ const int n8_w_16 = mi_size_wide[BLOCK_16X16];
+ int i;
+ int col_offset = 0;
+ // TODO(jingning): Revisit this part after cb4x4 is stable.
+ // For rows beyond the adjacent one, start one unit to the right, unless the
+ // block is narrower than 8x8 and sits at an odd column.
+ if (abs(row_offset) > 1) {
+ col_offset = 1;
+ if ((mi_col & 0x01) && xd->n4_w < n8_w_8) --col_offset;
+ }
+ const int use_step_16 = (xd->n4_w >= 16);
+ MB_MODE_INFO **const candidate_mi0 = xd->mi + row_offset * xd->mi_stride;
+ (void)mi_row;
+
+ for (i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate = candidate_mi0[col_offset + i];
+ const int candidate_bsize = candidate->sb_type;
+ const int n4_w = mi_size_wide[candidate_bsize];
+ // Step by the narrower of the two widths, with a minimum of a 16x16 width
+ // for wide blocks or an 8x8 width for non-adjacent rows.
+ int len = AOMMIN(xd->n4_w, n4_w);
+ if (use_step_16)
+ len = AOMMAX(n8_w_16, len);
+ else if (abs(row_offset) > 1)
+ len = AOMMAX(len, n8_w_8);
+
+ int weight = 2;
+ if (xd->n4_w >= n8_w_8 && xd->n4_w <= n4_w) {
+ int inc = AOMMIN(-max_row_offset + row_offset + 1,
+ mi_size_high[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed rows.
+ *processed_rows = inc - row_offset - 1;
+ }
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, col_offset + i, len * weight);
+
+ i += len;
+ }
+}
+
+// Column counterpart of the row scan: walks down the column of mode info
+// located col_offset columns from the current block (negative means left)
+// and feeds every candidate it visits to add_ref_mv_candidate().
+//
+//   col_offset      column to scan, relative to the current block
+//   max_col_offset  furthest column offset the caller may scan; used for the
+//                   weight and for *processed_cols
+//   processed_cols  out: updated when the candidate is at least as tall as
+//                   the current block
+// The remaining parameters are forwarded to add_ref_mv_candidate(); mi_col
+// is unused.
+static void scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int col_offset,
+ CANDIDATE_MV *ref_mv_stack, uint8_t *refmv_count,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates, int max_col_offset,
+ int *processed_cols) {
+ // Scan height: the block height, limited by the frame's bottom edge and by
+ // the height of a 64x64 block.
+ int end_mi = AOMMIN(xd->n4_h, cm->mi_rows - mi_row);
+ end_mi = AOMMIN(end_mi, mi_size_high[BLOCK_64X64]);
+ const int n8_h_8 = mi_size_high[BLOCK_8X8];
+ const int n8_h_16 = mi_size_high[BLOCK_16X16];
+ int i;
+ int row_offset = 0;
+ // For columns beyond the adjacent one, start one unit lower, unless the
+ // block is shorter than 8x8 and sits at an odd row.
+ if (abs(col_offset) > 1) {
+ row_offset = 1;
+ if ((mi_row & 0x01) && xd->n4_h < n8_h_8) --row_offset;
+ }
+ const int use_step_16 = (xd->n4_h >= 16);
+ (void)mi_col;
+
+ for (i = 0; i < end_mi;) {
+ const MB_MODE_INFO *const candidate =
+ xd->mi[(row_offset + i) * xd->mi_stride + col_offset];
+ const int candidate_bsize = candidate->sb_type;
+ const int n4_h = mi_size_high[candidate_bsize];
+ // Step by the shorter of the two heights, with a minimum of a 16x16 height
+ // for tall blocks or an 8x8 height for non-adjacent columns.
+ int len = AOMMIN(xd->n4_h, n4_h);
+ if (use_step_16)
+ len = AOMMAX(n8_h_16, len);
+ else if (abs(col_offset) > 1)
+ len = AOMMAX(len, n8_h_8);
+
+ int weight = 2;
+ if (xd->n4_h >= n8_h_8 && xd->n4_h <= n4_h) {
+ int inc = AOMMIN(-max_col_offset + col_offset + 1,
+ mi_size_wide[candidate_bsize]);
+ // Obtain range used in weight calculation.
+ weight = AOMMAX(weight, inc);
+ // Update processed cols.
+ *processed_cols = inc - col_offset - 1;
+ }
+
+ // The column passed on is col_offset itself; it does not vary with i.
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, col_offset, len * weight);
+
+ i += len;
+ }
+}
+
+// Examines the single mode-info position at (row_offset, col_offset) relative
+// to the current block and, if it lies inside the tile, feeds it to
+// add_ref_mv_candidate() with a weight of twice the 8x8 block width.
+// Positions outside the tile are ignored.
+//
+// refmv_count is passed straight through as the stack-size counter, so the
+// caller supplies a pointer to the counter for the stack in ref_mv_stack.
+static void scan_blk_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ const int mi_row, const int mi_col,
+ const MV_REFERENCE_FRAME rf[2], int row_offset,
+ int col_offset, CANDIDATE_MV *ref_mv_stack,
+ uint8_t *ref_match_count, uint8_t *newmv_count,
+ int_mv *gm_mv_candidates,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES]) {
+ const TileInfo *const tile = &xd->tile;
+ POSITION mi_pos;
+
+ mi_pos.row = row_offset;
+ mi_pos.col = col_offset;
+
+ if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
+ const MB_MODE_INFO *const candidate =
+ xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+ const int len = mi_size_wide[BLOCK_8X8];
+
+ add_ref_mv_candidate(candidate, rf, refmv_count, ref_match_count,
+ newmv_count, ref_mv_stack, gm_mv_candidates,
+ cm->global_motion, mi_pos.col, 2 * len);
+ } // Analyze a single 8x8 block motion information.
+}
+
+// Decides whether the block above and to the right of the current block can
+// be used as a candidate, based on the block's position inside its
+// superblock and on its partition shape.
+//
+//   mi_row, mi_col  block position in mode-info units
+//   bs              block size in mode-info units; must be a power of two
+// Returns 1 if the top-right neighbour is usable, 0 otherwise. Blocks larger
+// than a 64x64 width always return 0.
+static int has_top_right(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, int bs) {
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+ // Position of the block within its superblock.
+ const int mask_row = mi_row & (sb_mi_size - 1);
+ const int mask_col = mi_col & (sb_mi_size - 1);
+
+ if (bs > mi_size_wide[BLOCK_64X64]) return 0;
+
+ // In a split partition all apart from the bottom right has a top right
+ int has_tr = !((mask_row & bs) && (mask_col & bs));
+
+ // bs > 0 and bs is a power of 2
+ assert(bs > 0 && !(bs & (bs - 1)));
+
+ // For each 4x4 group of blocks, when the bottom right is decoded the blocks
+ // to the right have not been decoded therefore the bottom right does
+ // not have a top right
+ // The loop repeats the check at each coarser level while the block sits in
+ // the right-hand half at the current level.
+ while (bs < sb_mi_size) {
+ if (mask_col & bs) {
+ if ((mask_col & (2 * bs)) && (mask_row & (2 * bs))) {
+ has_tr = 0;
+ break;
+ }
+ } else {
+ break;
+ }
+ bs <<= 1;
+ }
+
+ // The left hand of two vertical rectangles always has a top right (as the
+ // block above will have been decoded)
+ if (xd->n4_w < xd->n4_h)
+ if (!xd->is_sec_rect) has_tr = 1;
+
+ // The bottom of two horizontal rectangles never has a top right (as the block
+ // to the right won't have been decoded)
+ if (xd->n4_w > xd->n4_h)
+ if (xd->is_sec_rect) has_tr = 0;
+
+ // The bottom left square of a Vertical A (in the old format) does
+ // not have a top right as it is decoded before the right hand
+ // rectangle of the partition
+ // Note that bs here is the value left by the loop above, not necessarily
+ // the value passed in.
+ if (xd->mi[0]->partition == PARTITION_VERT_A) {
+ if (xd->n4_w == xd->n4_h)
+ if (mask_row & bs) has_tr = 0;
+ }
+
+ return has_tr;
+}
+
+// Returns 1 when the position (mi_row + row_offset, mi_col + col_offset) stays
+// inside the 64x64 region containing (mi_row, mi_col), and 0 when either
+// offset crosses that region's border.
+static int check_sb_border(const int mi_row, const int mi_col,
+                           const int row_offset, const int col_offset) {
+  const int sb_mi_size = mi_size_wide[BLOCK_64X64];
+
+  // Target position relative to the top-left corner of the 64x64 region.
+  const int target_row = (mi_row & (sb_mi_size - 1)) + row_offset;
+  const int target_col = (mi_col & (sb_mi_size - 1)) + col_offset;
+
+  const int row_inside = target_row >= 0 && target_row < sb_mi_size;
+  const int col_inside = target_col >= 0 && target_col < sb_mi_size;
+
+  return row_inside && col_inside;
+}
+
+// Adds a temporal candidate: takes the motion vector stored in cm->tpl_mvs
+// for the position (blk_row, blk_col) relative to the current block, scales
+// it to the distance of the target reference frame(s), and merges it into
+// the candidate stack for ref_frame.
+//
+//   ref_frame         target reference (single or compound index)
+//   blk_row, blk_col  sample position relative to the current block
+//   gm_mv_candidates  global motion vectors, compared against the projection
+//   refmv_count       in/out: per-reference stack sizes
+//   ref_mv_stacks     in/out: per-reference candidate stacks
+//   mode_context      in/out: the GLOBALMV_OFFSET bit may be set
+// Returns 1 if a stored motion vector was available at that position, 0 if
+// the position is outside the tile or holds INVALID_MV.
+static int add_tpl_ref_mv(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int mi_row, int mi_col, MV_REFERENCE_FRAME ref_frame,
+ int blk_row, int blk_col, int_mv *gm_mv_candidates,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stacks[][MAX_REF_MV_STACK_SIZE],
+ int16_t *mode_context) {
+ POSITION mi_pos;
+ int idx;
+ const int weight_unit = 1; // mi_size_wide[BLOCK_8X8];
+
+ // Add one to the offset when the block position is even in that direction.
+ mi_pos.row = (mi_row & 0x01) ? blk_row : blk_row + 1;
+ mi_pos.col = (mi_col & 0x01) ? blk_col : blk_col + 1;
+
+ if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos)) return 0;
+
+ // tpl_mvs is indexed at half the mode-info resolution in each direction.
+ const TPL_MV_REF *prev_frame_mvs =
+ cm->tpl_mvs + ((mi_row + mi_pos.row) >> 1) * (cm->mi_stride >> 1) +
+ ((mi_col + mi_pos.col) >> 1);
+
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);
+
+ if (rf[1] == NONE_FRAME) {
+ // Single reference: distance from the current frame to rf[0].
+ int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+ int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+ int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+ CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[rf[0]];
+
+ if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+ int_mv this_refmv;
+
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+
+ // Only for the block's own position: flag a projection that differs from
+ // the global motion vector by 16 or more in either component.
+ if (blk_row == 0 && blk_col == 0)
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ // Merge into the stack: bump the weight of an identical entry, or append.
+ for (idx = 0; idx < refmv_count[rf[0]]; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int) break;
+
+ if (idx < refmv_count[rf[0]]) ref_mv_stack[idx].weight += 2 * weight_unit;
+
+ if (idx == refmv_count[rf[0]] &&
+ refmv_count[rf[0]] < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].weight = 2 * weight_unit;
+ ++(refmv_count[rf[0]]);
+ }
+ return 1;
+ }
+ } else {
+ // Process compound inter mode
+ // The same stored vector is projected once per reference frame.
+ int cur_frame_index = cm->cur_frame->cur_frame_offset;
+ int buf_idx_0 = cm->frame_refs[FWD_RF_OFFSET(rf[0])].idx;
+ int frame0_index = cm->buffer_pool->frame_bufs[buf_idx_0].cur_frame_offset;
+
+ int cur_offset_0 = get_relative_dist(cm, cur_frame_index, frame0_index);
+ int buf_idx_1 = cm->frame_refs[FWD_RF_OFFSET(rf[1])].idx;
+ int frame1_index = cm->buffer_pool->frame_bufs[buf_idx_1].cur_frame_offset;
+ int cur_offset_1 = get_relative_dist(cm, cur_frame_index, frame1_index);
+ CANDIDATE_MV *ref_mv_stack = ref_mv_stacks[ref_frame];
+
+ if (prev_frame_mvs->mfmv0.as_int != INVALID_MV) {
+ int_mv this_refmv;
+ int_mv comp_refmv;
+ get_mv_projection(&this_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_0, prev_frame_mvs->ref_frame_offset);
+ get_mv_projection(&comp_refmv.as_mv, prev_frame_mvs->mfmv0.as_mv,
+ cur_offset_1, prev_frame_mvs->ref_frame_offset);
+
+ lower_mv_precision(&this_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+ lower_mv_precision(&comp_refmv.as_mv, cm->allow_high_precision_mv,
+ cm->cur_frame_force_integer_mv);
+
+ // Same global-motion comparison as the single case, applied to both
+ // projected vectors.
+ if (blk_row == 0 && blk_col == 0)
+ if (abs(this_refmv.as_mv.row - gm_mv_candidates[0].as_mv.row) >= 16 ||
+ abs(this_refmv.as_mv.col - gm_mv_candidates[0].as_mv.col) >= 16 ||
+ abs(comp_refmv.as_mv.row - gm_mv_candidates[1].as_mv.row) >= 16 ||
+ abs(comp_refmv.as_mv.col - gm_mv_candidates[1].as_mv.col) >= 16)
+ mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ // A compound entry matches only if both vectors are identical.
+ for (idx = 0; idx < refmv_count[ref_frame]; ++idx)
+ if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int &&
+ comp_refmv.as_int == ref_mv_stack[idx].comp_mv.as_int)
+ break;
+
+ if (idx < refmv_count[ref_frame])
+ ref_mv_stack[idx].weight += 2 * weight_unit;
+
+ if (idx == refmv_count[ref_frame] &&
+ refmv_count[ref_frame] < MAX_REF_MV_STACK_SIZE) {
+ ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+ ref_mv_stack[idx].comp_mv.as_int = comp_refmv.as_int;
+ ref_mv_stack[idx].weight = 2 * weight_unit;
+ ++(refmv_count[ref_frame]);
+ }
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// Sorts the motion vectors of one neighbouring block into two small lists per
+// target reference (rf[0] and rf[1]):
+//   ref_id    vectors whose reference frame equals the target
+//   ref_diff  vectors from any other inter reference, negated when the sign
+//             bias of that reference differs from the target's
+// Each list holds at most two entries per target; the *_count arrays track
+// how many are filled and are updated in place.
+static void process_compound_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ const MV_REFERENCE_FRAME *const rf, int_mv ref_id[2][2],
+ int ref_id_count[2], int_mv ref_diff[2][2], int ref_diff_count[2]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ MV_REFERENCE_FRAME can_rf = candidate->ref_frame[rf_idx];
+
+ for (int cmp_idx = 0; cmp_idx < 2; ++cmp_idx) {
+ if (can_rf == rf[cmp_idx] && ref_id_count[cmp_idx] < 2) {
+ ref_id[cmp_idx][ref_id_count[cmp_idx]] = candidate->mv[rf_idx];
+ ++ref_id_count[cmp_idx];
+ } else if (can_rf > INTRA_FRAME && ref_diff_count[cmp_idx] < 2) {
+ // This branch is also taken for a matching reference once the ref_id
+ // list for this target is full.
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[can_rf] !=
+ cm->ref_frame_sign_bias[rf[cmp_idx]]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ ref_diff[cmp_idx][ref_diff_count[cmp_idx]] = this_mv;
+ ++ref_diff_count[cmp_idx];
+ }
+ }
+ }
+}
+
+// Appends the motion vectors of one neighbouring block to the candidate stack
+// for ref_frame, whatever inter reference they point to. A vector is negated
+// when the sign bias of its reference differs from that of ref_frame, and is
+// skipped if an identical vector is already in the stack.
+//
+// NOTE(review): the append is not checked against MAX_REF_MV_STACK_SIZE here;
+// presumably the caller only invokes this while the stack has room — confirm.
+static void process_single_ref_mv_candidate(
+ const MB_MODE_INFO *const candidate, const AV1_COMMON *const cm,
+ MV_REFERENCE_FRAME ref_frame, uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE]) {
+ for (int rf_idx = 0; rf_idx < 2; ++rf_idx) {
+ if (candidate->ref_frame[rf_idx] > INTRA_FRAME) {
+ int_mv this_mv = candidate->mv[rf_idx];
+ if (cm->ref_frame_sign_bias[candidate->ref_frame[rf_idx]] !=
+ cm->ref_frame_sign_bias[ref_frame]) {
+ this_mv.as_mv.row = -this_mv.as_mv.row;
+ this_mv.as_mv.col = -this_mv.as_mv.col;
+ }
+ // Search for a duplicate; stack_idx ends at the count if none is found.
+ int stack_idx;
+ for (stack_idx = 0; stack_idx < refmv_count[ref_frame]; ++stack_idx) {
+ const int_mv stack_mv = ref_mv_stack[ref_frame][stack_idx].this_mv;
+ if (this_mv.as_int == stack_mv.as_int) break;
+ }
+
+ if (stack_idx == refmv_count[ref_frame]) {
+ ref_mv_stack[ref_frame][stack_idx].this_mv = this_mv;
+
+ // TODO(jingning): Set an arbitrary small number here. The weight
+ // doesn't matter as long as it is properly initialized.
+ ref_mv_stack[ref_frame][stack_idx].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+}
+
+static void setup_ref_mv_list(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, MV_REFERENCE_FRAME ref_frame,
+ uint8_t refmv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES], int_mv *gm_mv_candidates,
+ int mi_row, int mi_col, int16_t *mode_context) {
+ const int bs = AOMMAX(xd->n4_w, xd->n4_h);
+ const int has_tr = has_top_right(cm, xd, mi_row, mi_col, bs);
+ MV_REFERENCE_FRAME rf[2];
+
+ const TileInfo *const tile = &xd->tile;
+ int max_row_offset = 0, max_col_offset = 0;
+ const int row_adj = (xd->n4_h < mi_size_high[BLOCK_8X8]) && (mi_row & 0x01);
+ const int col_adj = (xd->n4_w < mi_size_wide[BLOCK_8X8]) && (mi_col & 0x01);
+ int processed_rows = 0;
+ int processed_cols = 0;
+
+ av1_set_ref_frame(rf, ref_frame);
+ mode_context[ref_frame] = 0;
+ refmv_count[ref_frame] = 0;
+
+ // Find valid maximum row/col offset.
+ if (xd->up_available) {
+ max_row_offset = -(MVREF_ROW_COLS << 1) + row_adj;
+
+ if (xd->n4_h < mi_size_high[BLOCK_8X8])
+ max_row_offset = -(2 << 1) + row_adj;
+
+ max_row_offset = find_valid_row_offset(tile, mi_row, max_row_offset);
+ }
+
+ if (xd->left_available) {
+ max_col_offset = -(MVREF_ROW_COLS << 1) + col_adj;
+
+ if (xd->n4_w < mi_size_wide[BLOCK_8X8])
+ max_col_offset = -(2 << 1) + col_adj;
+
+ max_col_offset = find_valid_col_offset(tile, mi_col, max_col_offset);
+ }
+
+ uint8_t col_match_count = 0;
+ uint8_t row_match_count = 0;
+ uint8_t newmv_count = 0;
+
+ // Scan the first above row mode info. row_offset = -1;
+ if (abs(max_row_offset) >= 1)
+ scan_row_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+ &refmv_count[ref_frame], &row_match_count, &newmv_count,
+ gm_mv_candidates, max_row_offset, &processed_rows);
+ // Scan the first left column mode info. col_offset = -1;
+ if (abs(max_col_offset) >= 1)
+ scan_col_mbmi(cm, xd, mi_row, mi_col, rf, -1, ref_mv_stack[ref_frame],
+ &refmv_count[ref_frame], &col_match_count, &newmv_count,
+ gm_mv_candidates, max_col_offset, &processed_cols);
+ // Check top-right boundary
+ if (has_tr)
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, xd->n4_w,
+ ref_mv_stack[ref_frame], &row_match_count, &newmv_count,
+ gm_mv_candidates, &refmv_count[ref_frame]);
+
+ const uint8_t nearest_match = (row_match_count > 0) + (col_match_count > 0);
+ const uint8_t nearest_refmv_count = refmv_count[ref_frame];
+
+ // TODO(yunqing): for comp_search, do it for all 3 cases.
+ for (int idx = 0; idx < nearest_refmv_count; ++idx)
+ ref_mv_stack[ref_frame][idx].weight += REF_CAT_LEVEL;
+
+ if (cm->allow_ref_frame_mvs) {
+ int is_available = 0;
+ const int voffset = AOMMAX(mi_size_high[BLOCK_8X8], xd->n4_h);
+ const int hoffset = AOMMAX(mi_size_wide[BLOCK_8X8], xd->n4_w);
+ const int blk_row_end = AOMMIN(xd->n4_h, mi_size_high[BLOCK_64X64]);
+ const int blk_col_end = AOMMIN(xd->n4_w, mi_size_wide[BLOCK_64X64]);
+
+ const int tpl_sample_pos[3][2] = {
+ { voffset, -2 },
+ { voffset, hoffset },
+ { voffset - 2, hoffset },
+ };
+ const int allow_extension = (xd->n4_h >= mi_size_high[BLOCK_8X8]) &&
+ (xd->n4_h < mi_size_high[BLOCK_64X64]) &&
+ (xd->n4_w >= mi_size_wide[BLOCK_8X8]) &&
+ (xd->n4_w < mi_size_wide[BLOCK_64X64]);
+
+ const int step_h = (xd->n4_h >= mi_size_high[BLOCK_64X64])
+ ? mi_size_high[BLOCK_16X16]
+ : mi_size_high[BLOCK_8X8];
+ const int step_w = (xd->n4_w >= mi_size_wide[BLOCK_64X64])
+ ? mi_size_wide[BLOCK_16X16]
+ : mi_size_wide[BLOCK_8X8];
+
+ for (int blk_row = 0; blk_row < blk_row_end; blk_row += step_h) {
+ for (int blk_col = 0; blk_col < blk_col_end; blk_col += step_w) {
+ int ret = add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row,
+ blk_col, gm_mv_candidates, refmv_count,
+ ref_mv_stack, mode_context);
+ if (blk_row == 0 && blk_col == 0) is_available = ret;
+ }
+ }
+
+ if (is_available == 0) mode_context[ref_frame] |= (1 << GLOBALMV_OFFSET);
+
+ for (int i = 0; i < 3 && allow_extension; ++i) {
+ const int blk_row = tpl_sample_pos[i][0];
+ const int blk_col = tpl_sample_pos[i][1];
+
+ if (!check_sb_border(mi_row, mi_col, blk_row, blk_col)) continue;
+ add_tpl_ref_mv(cm, xd, mi_row, mi_col, ref_frame, blk_row, blk_col,
+ gm_mv_candidates, refmv_count, ref_mv_stack, mode_context);
+ }
+ }
+
+ uint8_t dummy_newmv_count = 0;
+
+ // Scan the second outer area.
+ scan_blk_mbmi(cm, xd, mi_row, mi_col, rf, -1, -1, ref_mv_stack[ref_frame],
+ &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+ &refmv_count[ref_frame]);
+
+ for (int idx = 2; idx <= MVREF_ROW_COLS; ++idx) {
+ const int row_offset = -(idx << 1) + 1 + row_adj;
+ const int col_offset = -(idx << 1) + 1 + col_adj;
+
+ if (abs(row_offset) <= abs(max_row_offset) &&
+ abs(row_offset) > processed_rows)
+ scan_row_mbmi(cm, xd, mi_row, mi_col, rf, row_offset,
+ ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+ &row_match_count, &dummy_newmv_count, gm_mv_candidates,
+ max_row_offset, &processed_rows);
+
+ if (abs(col_offset) <= abs(max_col_offset) &&
+ abs(col_offset) > processed_cols)
+ scan_col_mbmi(cm, xd, mi_row, mi_col, rf, col_offset,
+ ref_mv_stack[ref_frame], &refmv_count[ref_frame],
+ &col_match_count, &dummy_newmv_count, gm_mv_candidates,
+ max_col_offset, &processed_cols);
+ }
+
+ const uint8_t ref_match_count = (row_match_count > 0) + (col_match_count > 0);
+
+ switch (nearest_match) {
+ case 0:
+ mode_context[ref_frame] |= 0;
+ if (ref_match_count >= 1) mode_context[ref_frame] |= 1;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+ break;
+ case 1:
+ mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+ if (ref_match_count == 1)
+ mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+ else if (ref_match_count >= 2)
+ mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+ break;
+ case 2:
+ default:
+ if (newmv_count >= 1)
+ mode_context[ref_frame] |= 4;
+ else
+ mode_context[ref_frame] |= 5;
+
+ mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+ break;
+ }
+
+ // Rank the likelihood and assign nearest and near mvs.
+ int len = nearest_refmv_count;
+ while (len > 0) {
+ int nr_len = 0;
+ for (int idx = 1; idx < len; ++idx) {
+ if (ref_mv_stack[ref_frame][idx - 1].weight <
+ ref_mv_stack[ref_frame][idx].weight) {
+ CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+ ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+ ref_mv_stack[ref_frame][idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ len = refmv_count[ref_frame];
+ while (len > nearest_refmv_count) {
+ int nr_len = nearest_refmv_count;
+ for (int idx = nearest_refmv_count + 1; idx < len; ++idx) {
+ if (ref_mv_stack[ref_frame][idx - 1].weight <
+ ref_mv_stack[ref_frame][idx].weight) {
+ CANDIDATE_MV tmp_mv = ref_mv_stack[ref_frame][idx - 1];
+ ref_mv_stack[ref_frame][idx - 1] = ref_mv_stack[ref_frame][idx];
+ ref_mv_stack[ref_frame][idx] = tmp_mv;
+ nr_len = idx;
+ }
+ }
+ len = nr_len;
+ }
+
+ if (rf[1] > NONE_FRAME) {
+ // TODO(jingning, yunqing): Refactor and consolidate the compound and
+ // single reference frame modes. Reduce unnecessary redundancy.
+ if (refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES) {
+ int_mv ref_id[2][2], ref_diff[2][2];
+ int ref_id_count[2] = { 0 }, ref_diff_count[2] = { 0 };
+
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
+ mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
+ mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ int mi_size = AOMMIN(mi_width, mi_height);
+
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_wide[candidate->sb_type];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_compound_ref_mv_candidate(
+ candidate, cm, rf, ref_id, ref_id_count, ref_diff, ref_diff_count);
+ idx += mi_size_high[candidate->sb_type];
+ }
+
+ // Build up the compound mv predictor
+ int_mv comp_list[3][2];
+
+ for (int idx = 0; idx < 2; ++idx) {
+ int comp_idx = 0;
+ for (int list_idx = 0; list_idx < ref_id_count[idx] && comp_idx < 2;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_id[idx][list_idx];
+ for (int list_idx = 0; list_idx < ref_diff_count[idx] && comp_idx < 2;
+ ++list_idx, ++comp_idx)
+ comp_list[comp_idx][idx] = ref_diff[idx][list_idx];
+ for (; comp_idx < 3; ++comp_idx)
+ comp_list[comp_idx][idx] = gm_mv_candidates[idx];
+ }
+
+ if (refmv_count[ref_frame]) {
+ assert(refmv_count[ref_frame] == 1);
+ if (comp_list[0][0].as_int ==
+ ref_mv_stack[ref_frame][0].this_mv.as_int &&
+ comp_list[0][1].as_int ==
+ ref_mv_stack[ref_frame][0].comp_mv.as_int) {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[1][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[1][1];
+ } else {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[0][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[0][1];
+ }
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+ ++refmv_count[ref_frame];
+ } else {
+ for (int idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx) {
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].this_mv =
+ comp_list[idx][0];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].comp_mv =
+ comp_list[idx][1];
+ ref_mv_stack[ref_frame][refmv_count[ref_frame]].weight = 2;
+ ++refmv_count[ref_frame];
+ }
+ }
+ }
+
+ assert(refmv_count[ref_frame] >= 2);
+
+ for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].comp_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ }
+ } else {
+ // Handle single reference frame extension
+ int mi_width = AOMMIN(mi_size_wide[BLOCK_64X64], xd->n4_w);
+ mi_width = AOMMIN(mi_width, cm->mi_cols - mi_col);
+ int mi_height = AOMMIN(mi_size_high[BLOCK_64X64], xd->n4_h);
+ mi_height = AOMMIN(mi_height, cm->mi_rows - mi_row);
+ int mi_size = AOMMIN(mi_width, mi_height);
+
+ for (int idx = 0; abs(max_row_offset) >= 1 && idx < mi_size &&
+ refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[-xd->mi_stride + idx];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_wide[candidate->sb_type];
+ }
+
+ for (int idx = 0; abs(max_col_offset) >= 1 && idx < mi_size &&
+ refmv_count[ref_frame] < MAX_MV_REF_CANDIDATES;) {
+ const MB_MODE_INFO *const candidate = xd->mi[idx * xd->mi_stride - 1];
+ process_single_ref_mv_candidate(candidate, cm, ref_frame, refmv_count,
+ ref_mv_stack);
+ idx += mi_size_high[candidate->sb_type];
+ }
+
+ for (int idx = 0; idx < refmv_count[ref_frame]; ++idx) {
+ clamp_mv_ref(&ref_mv_stack[ref_frame][idx].this_mv.as_mv,
+ xd->n4_w << MI_SIZE_LOG2, xd->n4_h << MI_SIZE_LOG2, xd);
+ }
+
+ if (mv_ref_list != NULL) {
+ for (int idx = refmv_count[ref_frame]; idx < MAX_MV_REF_CANDIDATES; ++idx)
+ mv_ref_list[rf[0]][idx].as_int = gm_mv_candidates[0].as_int;
+
+ for (int idx = 0;
+ idx < AOMMIN(MAX_MV_REF_CANDIDATES, refmv_count[ref_frame]); ++idx) {
+ mv_ref_list[rf[0]][idx].as_int =
+ ref_mv_stack[ref_frame][idx].this_mv.as_int;
+ }
+ }
+ }
+}
+/* Computes the global-motion vectors for ref_frame and then calls
+ * setup_ref_mv_list() to fill ref_mv_count, ref_mv_stack, mv_ref_list and
+ * mode_context for the block described by mi at (mi_row, mi_col).
+ * For ref_frame values below REF_FRAMES, global_mvs[ref_frame] is also
+ * written: the global motion vector, or INVALID_MV for INTRA_FRAME.
+ * Only mi->sb_type is read from mi.
+ */
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+ int_mv *global_mvs, int mi_row, int mi_col,
+ int16_t *mode_context) {
+ int_mv zeromv[2];
+ BLOCK_SIZE bsize = mi->sb_type;
+ MV_REFERENCE_FRAME rf[2];
+ av1_set_ref_frame(rf, ref_frame);  // fills rf[0] / rf[1] from ref_frame
+
+ if (ref_frame < REF_FRAMES) {
+ if (ref_frame != INTRA_FRAME) {
+ global_mvs[ref_frame] = gm_get_motion_vector(
+ &cm->global_motion[ref_frame], cm->allow_high_precision_mv, bsize,
+ mi_col, mi_row, cm->cur_frame_force_integer_mv);
+ } else {
+ global_mvs[ref_frame].as_int = INVALID_MV;
+ }
+ }
+ // zeromv[i]: global MV for rf[i]; 0 if intra or rf[1] == NONE_FRAME.
+ if (ref_frame != INTRA_FRAME) {
+ zeromv[0].as_int =
+ gm_get_motion_vector(&cm->global_motion[rf[0]],
+ cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+ cm->cur_frame_force_integer_mv)
+ .as_int;
+ zeromv[1].as_int =
+ (rf[1] != NONE_FRAME)
+ ? gm_get_motion_vector(&cm->global_motion[rf[1]],
+ cm->allow_high_precision_mv, bsize, mi_col,
+ mi_row, cm->cur_frame_force_integer_mv)
+ .as_int
+ : 0;
+ } else {
+ zeromv[0].as_int = zeromv[1].as_int = 0;
+ }
+
+ setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack, mv_ref_list,
+ zeromv, mi_row, mi_col, mode_context);
+}
+/* Lowers the precision of the first MAX_MV_REF_CANDIDATES entries of mvlist
+ * in place (lower_mv_precision with allow_hp / is_integer), then returns
+ * mvlist[0] through nearest_mv and mvlist[1] through near_mv.
+ */
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv, int is_integer) {
+ int i;
+ // Reduce every candidate to the precision selected by allow_hp / is_integer.
+ for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) {
+ lower_mv_precision(&mvlist[i].as_mv, allow_hp, is_integer);
+ }
+ *nearest_mv = mvlist[0];
+ *near_mv = mvlist[1];
+}
+/* Stores cm->frame_offset in the current frame buffer and, for every
+ * reference from LAST_FRAME to ALTREF_FRAME with a non-negative buffer index,
+ * copies that buffer's cur_frame_offset into cur_frame->ref_frame_offset[].
+ * Entries whose buffer index is negative are left unchanged.
+ */
+void av1_setup_frame_buf_refs(AV1_COMMON *cm) {
+ cm->cur_frame->cur_frame_offset = cm->frame_offset;
+
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (buf_idx >= 0)
+ cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME] =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ }
+}
+/* Sets cm->ref_frame_sign_bias[] for LAST_FRAME..ALTREF_FRAME: 1 when order
+ * hints are enabled, the reference has a valid buffer, and its frame offset
+ * is after the current frame's (relative distance > 0); 0 in every other
+ * case.
+ */
+void av1_setup_frame_sign_bias(AV1_COMMON *cm) {
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (cm->seq_params.enable_order_hint && buf_idx != INVALID_IDX) {
+ const int ref_frame_offset =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ cm->ref_frame_sign_bias[ref_frame] =
+ (get_relative_dist(cm, ref_frame_offset, (int)cm->frame_offset) <= 0)
+ ? 0
+ : 1;
+ } else {
+ cm->ref_frame_sign_bias[ref_frame] = 0;
+ }
+ }
+}
+/* Extra margin, applied as (value >> 3) in get_block_position(), by which a
+ * projected position may leave the 8-aligned group of its source block:
+ * 64 >> 3 = 8 columns on either side, and 0 rows.
+ */
+#define MAX_OFFSET_WIDTH 64
+#define MAX_OFFSET_HEIGHT 0
+/* Projects block (blk_row, blk_col) along mv and reports where it lands.
+ * Positions are in the units in which the frame spans (cm->mi_rows >> 1) rows
+ * and (cm->mi_cols >> 1) columns. Each MV component is reduced to a whole
+ * position offset by a shift of (4 + MI_SIZE_LOG2), truncating toward zero;
+ * the offset is subtracted when sign_bias == 1 and added otherwise.
+ * Returns 1 and writes *mi_r / *mi_c on success. Returns 0, leaving them
+ * untouched, if the result is outside the frame or outside the allowed window
+ * around the 8-aligned group that contains the block.
+ */
+static int get_block_position(AV1_COMMON *cm, int *mi_r, int *mi_c, int blk_row,
+ int blk_col, MV mv, int sign_bias) {
+ const int base_blk_row = (blk_row >> 3) << 3;
+ const int base_blk_col = (blk_col >> 3) << 3;
+
+ const int row_offset = (mv.row >= 0) ? (mv.row >> (4 + MI_SIZE_LOG2))
+ : -((-mv.row) >> (4 + MI_SIZE_LOG2));
+
+ const int col_offset = (mv.col >= 0) ? (mv.col >> (4 + MI_SIZE_LOG2))
+ : -((-mv.col) >> (4 + MI_SIZE_LOG2));
+
+ const int row =
+ (sign_bias == 1) ? blk_row - row_offset : blk_row + row_offset;
+ const int col =
+ (sign_bias == 1) ? blk_col - col_offset : blk_col + col_offset;
+ // Reject positions outside the frame.
+ if (row < 0 || row >= (cm->mi_rows >> 1) || col < 0 ||
+ col >= (cm->mi_cols >> 1))
+ return 0;
+ // Reject positions outside the window around the block's 8-aligned group.
+ if (row < base_blk_row - (MAX_OFFSET_HEIGHT >> 3) ||
+ row >= base_blk_row + 8 + (MAX_OFFSET_HEIGHT >> 3) ||
+ col < base_blk_col - (MAX_OFFSET_WIDTH >> 3) ||
+ col >= base_blk_col + 8 + (MAX_OFFSET_WIDTH >> 3))
+ return 0;
+
+ *mi_r = row;
+ *mi_c = col;
+
+ return 1;
+}
+
+// Note: motion_field_projection finds the motion vectors of one of the current
+// frame's reference frames and projects them onto the current frame. Call that
+// reference frame the start frame, call the start frame's own reference frames
+// the reference frames, and call the frame distances between the start frame
+// and its reference frames ref_offset. Returns 0 when the start frame's buffer
+// index is negative, it is intra-only, or its mi_rows/mi_cols differ; else 1.
+static int motion_field_projection(AV1_COMMON *cm,
+ MV_REFERENCE_FRAME start_frame, int dir) {
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int ref_offset[REF_FRAMES] = { 0 };
+
+ (void)dir;  // redundant: dir is read further down in this function
+
+ const int start_frame_idx = cm->frame_refs[FWD_RF_OFFSET(start_frame)].idx;
+ if (start_frame_idx < 0) return 0;
+
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].intra_only) return 0;
+
+ if (cm->buffer_pool->frame_bufs[start_frame_idx].mi_rows != cm->mi_rows ||
+ cm->buffer_pool->frame_bufs[start_frame_idx].mi_cols != cm->mi_cols)
+ return 0;
+
+ const int start_frame_offset =
+ cm->buffer_pool->frame_bufs[start_frame_idx].cur_frame_offset;
+ const unsigned int *const ref_frame_offsets =
+ &cm->buffer_pool->frame_bufs[start_frame_idx].ref_frame_offset[0];
+ const int cur_frame_offset = cm->cur_frame->cur_frame_offset;
+ int start_to_current_frame_offset =
+ get_relative_dist(cm, start_frame_offset, cur_frame_offset);
+ // ref_offset[rf]: start frame offset minus the offset of its reference rf.
+ for (MV_REFERENCE_FRAME rf = LAST_FRAME; rf <= INTER_REFS_PER_FRAME; ++rf) {
+ ref_offset[rf] = get_relative_dist(cm, start_frame_offset,
+ ref_frame_offsets[rf - LAST_FRAME]);
+ }
+ // For dir == 2 the start-to-current distance is negated.
+ if (dir == 2) start_to_current_frame_offset = -start_to_current_frame_offset;
+
+ MV_REF *mv_ref_base = cm->buffer_pool->frame_bufs[start_frame_idx].mvs;
+ const int mvs_rows = (cm->mi_rows + 1) >> 1;
+ const int mvs_cols = (cm->mi_cols + 1) >> 1;
+
+ for (int blk_row = 0; blk_row < mvs_rows; ++blk_row) {
+ for (int blk_col = 0; blk_col < mvs_cols; ++blk_col) {
+ MV_REF *mv_ref = &mv_ref_base[blk_row * mvs_cols + blk_col];
+ MV fwd_mv = mv_ref->mv.as_mv;
+
+ if (mv_ref->ref_frame > INTRA_FRAME) {
+ int_mv this_mv;
+ int mi_r, mi_c;
+ const int ref_frame_offset = ref_offset[mv_ref->ref_frame];
+ /* Project only if the reference distance is positive and both distances
+ * are within MAX_FRAME_DISTANCE. */
+ int pos_valid =
+ abs(ref_frame_offset) <= MAX_FRAME_DISTANCE &&
+ ref_frame_offset > 0 &&
+ abs(start_to_current_frame_offset) <= MAX_FRAME_DISTANCE;
+
+ if (pos_valid) {
+ get_mv_projection(&this_mv.as_mv, fwd_mv,
+ start_to_current_frame_offset, ref_frame_offset);
+ pos_valid = get_block_position(cm, &mi_r, &mi_c, blk_row, blk_col,
+ this_mv.as_mv, dir >> 1);
+ }
+ /* Store the start frame's original MV (fwd_mv), not the projected
+ * this_mv, at the position the projection landed on. */
+ if (pos_valid) {
+ const int mi_offset = mi_r * (cm->mi_stride >> 1) + mi_c;
+
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.row = fwd_mv.row;
+ tpl_mvs_base[mi_offset].mfmv0.as_mv.col = fwd_mv.col;
+ tpl_mvs_base[mi_offset].ref_frame_offset = ref_frame_offset;
+ }
+ }
+ }
+ }
+
+ return 1;
+}
+/* Rebuilds the projected motion field (cm->tpl_mvs) and cm->ref_frame_side
+ * for the current frame. ref_frame_side is cleared first; when order hints
+ * are disabled nothing else happens. Otherwise every tpl_mvs entry is
+ * invalidated, ref_frame_side[ref] becomes 1 for references later than the
+ * current frame and -1 for references with the same order hint, and
+ * motion_field_projection() is run for LAST, BWDREF, ALTREF2, ALTREF and
+ * LAST2 in that order. ref_stamp starts at MFMV_STACK_SIZE - 1, is decremented
+ * as projections are used, and gates the ALTREF and LAST2 projections.
+ */
+void av1_setup_motion_field(AV1_COMMON *cm) {
+ memset(cm->ref_frame_side, 0, sizeof(cm->ref_frame_side));
+ if (!cm->seq_params.enable_order_hint) return;
+ // Invalidate every projected-MV entry before projecting.
+ TPL_MV_REF *tpl_mvs_base = cm->tpl_mvs;
+ int size = ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+ for (int idx = 0; idx < size; ++idx) {
+ tpl_mvs_base[idx].mfmv0.as_int = INVALID_MV;
+ tpl_mvs_base[idx].ref_frame_offset = 0;
+ }
+
+ const int cur_order_hint = cm->cur_frame->cur_frame_offset;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+ int ref_buf_idx[INTER_REFS_PER_FRAME];
+ int ref_order_hint[INTER_REFS_PER_FRAME];
+
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ const int ref_idx = ref_frame - LAST_FRAME;
+ const int buf_idx = cm->frame_refs[ref_idx].idx;
+ int order_hint = 0;  // used as-is when the reference has no buffer
+
+ if (buf_idx >= 0) order_hint = frame_bufs[buf_idx].cur_frame_offset;
+
+ ref_buf_idx[ref_idx] = buf_idx;
+ ref_order_hint[ref_idx] = order_hint;
+
+ if (get_relative_dist(cm, order_hint, cur_order_hint) > 0)
+ cm->ref_frame_side[ref_frame] = 1;
+ else if (order_hint == cur_order_hint)
+ cm->ref_frame_side[ref_frame] = -1;
+ }
+
+ int ref_stamp = MFMV_STACK_SIZE - 1;
+ /* LAST_FRAME: not projected when its ALTREF order hint equals the current
+ * frame's GOLDEN order hint; ref_stamp is decremented either way. */
+ if (ref_buf_idx[LAST_FRAME - LAST_FRAME] >= 0) {
+ const int alt_of_lst_order_hint =
+ frame_bufs[ref_buf_idx[LAST_FRAME - LAST_FRAME]]
+ .ref_frame_offset[ALTREF_FRAME - LAST_FRAME];
+
+ const int is_lst_overlay =
+ (alt_of_lst_order_hint == ref_order_hint[GOLDEN_FRAME - LAST_FRAME]);
+ if (!is_lst_overlay) motion_field_projection(cm, LAST_FRAME, 2);
+ --ref_stamp;
+ }
+ /* BWDREF, ALTREF2 and ALTREF are projected only when they are later than
+ * the current frame; ref_stamp drops only if the projection returned 1. */
+ if (get_relative_dist(cm, ref_order_hint[BWDREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, BWDREF_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(cm, ref_order_hint[ALTREF2_FRAME - LAST_FRAME],
+ cur_order_hint) > 0) {
+ if (motion_field_projection(cm, ALTREF2_FRAME, 0)) --ref_stamp;
+ }
+
+ if (get_relative_dist(cm, ref_order_hint[ALTREF_FRAME - LAST_FRAME],
+ cur_order_hint) > 0 &&
+ ref_stamp >= 0)
+ if (motion_field_projection(cm, ALTREF_FRAME, 0)) --ref_stamp;
+
+ if (ref_stamp >= 0 && ref_buf_idx[LAST2_FRAME - LAST_FRAME] >= 0)
+ if (motion_field_projection(cm, LAST2_FRAME, 2)) --ref_stamp;
+}
+/* Writes one sample pair. pts[0..1] receives the (x, y) position computed
+ * from col_offset / row_offset (in MI units) and half the size of mbmi's
+ * block (at least MI_SIZE), minus 1, scaled by 8. pts_inref[0..1] receives
+ * the same position plus mbmi->mv[0] (col, row). sign_r / sign_c give the
+ * sign of the half-block displacement in each direction.
+ */
+static INLINE void record_samples(MB_MODE_INFO *mbmi, int *pts, int *pts_inref,
+ int row_offset, int sign_r, int col_offset,
+ int sign_c) {
+ int bw = block_size_wide[mbmi->sb_type];
+ int bh = block_size_high[mbmi->sb_type];
+ int x = col_offset * MI_SIZE + sign_c * AOMMAX(bw, MI_SIZE) / 2 - 1;
+ int y = row_offset * MI_SIZE + sign_r * AOMMAX(bh, MI_SIZE) / 2 - 1;
+
+ pts[0] = (x * 8);
+ pts[1] = (y * 8);
+ pts_inref[0] = (x * 8) + mbmi->mv[0].as_mv.col;
+ pts_inref[1] = (y * 8) + mbmi->mv[0].as_mv.row;
+}
+
+/* Select samples according to the motion vector difference.
+ * pts and pts_inref hold len interleaved (x, y) pairs. A sample is kept when
+ * the sum of absolute differences between its displacement
+ * (pts_inref - pts) and mv (x against mv->col, y against mv->row) does not
+ * exceed thresh, where thresh is max(block width, block height) of bsize
+ * clamped to [16, 112]. Kept samples are compacted to the front of both
+ * arrays. Returns the number kept; if none qualify, returns 1 and leaves the
+ * arrays unchanged.
+ */
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int thresh = clamp(AOMMAX(bw, bh), 16, 112);
+ int pts_mvd[SAMPLES_ARRAY_SIZE] = { 0 };
+ int i, j, k, l = len;
+ int ret = 0;
+ assert(len <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Obtain the motion vector difference; -1 marks a discarded sample.
+ for (i = 0; i < len; ++i) {
+ pts_mvd[i] = abs(pts_inref[2 * i] - pts[2 * i] - mv->col) +
+ abs(pts_inref[2 * i + 1] - pts[2 * i + 1] - mv->row);
+
+ if (pts_mvd[i] > thresh)
+ pts_mvd[i] = -1;
+ else
+ ret++;
+ }
+
+ // Keep at least 1 sample: report one sample with the arrays untouched.
+ if (!ret) return 1;
+
+ i = 0;
+ j = l - 1;
+ for (k = 0; k < l - ret; k++) {
+ while (pts_mvd[i] != -1) i++;  // next discarded slot from the front
+ while (pts_mvd[j] == -1) j--;  // next kept sample from the back
+ assert(i != j);
+ if (i > j) break;
+
+ // Replace the discarded sample at i with the kept sample at j.
+ pts_mvd[i] = pts_mvd[j];
+ pts[2 * i] = pts[2 * j];
+ pts[2 * i + 1] = pts[2 * j + 1];
+ pts_inref[2 * i] = pts_inref[2 * j];
+ pts_inref[2 * i + 1] = pts_inref[2 * j + 1];
+ i++;
+ j--;
+ }
+
+ return ret;
+}
+
+// Note: Samples returned are at 1/8-pel precision. Samples are the neighbor
+// blocks' center point coordinates relative to the top-left pixel of the
+// current block. Returns the number of samples, <= LEAST_SQUARES_SAMPLES_MAX.
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref) {
+ MB_MODE_INFO *const mbmi0 = xd->mi[0];
+ int ref_frame = mbmi0->ref_frame[0];
+ int up_available = xd->up_available;
+ int left_available = xd->left_available;
+ int i, mi_step = 1, np = 0;
+ // A neighbor contributes only if ref_frame is its one and only reference.
+ const TileInfo *const tile = &xd->tile;
+ int do_tl = 1;
+ int do_tr = 1;
+
+ // scan the nearest above rows
+ if (up_available) {
+ int mi_row_offset = -1;
+ MB_MODE_INFO *mbmi = xd->mi[mi_row_offset * xd->mi_stride];
+ uint8_t n4_w = mi_size_wide[mbmi->sb_type];
+
+ if (xd->n4_w <= n4_w) {
+ // Handle "current block width <= above block width" case.
+ int col_offset = -mi_col % n4_w;
+
+ if (col_offset < 0) do_tl = 0;
+ if (col_offset + n4_w > xd->n4_w) do_tr = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, col_offset, 1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+ // Handle "current block width > above block width" case.
+ for (i = 0; i < AOMMIN(xd->n4_w, cm->mi_cols - mi_col); i += mi_step) {
+ int mi_col_offset = i;
+ mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ n4_w = mi_size_wide[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_w, n4_w);
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, i, 1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // scan the nearest left columns
+ if (left_available) {
+ int mi_col_offset = -1;
+
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset];
+ uint8_t n4_h = mi_size_high[mbmi->sb_type];
+
+ if (xd->n4_h <= n4_h) {
+ // Handle "current block height <= left block height" case.
+ int row_offset = -mi_row % n4_h;
+
+ if (row_offset < 0) do_tl = 0;
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, row_offset, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ } else {
+ // Handle "current block height > left block height" case.
+ for (i = 0; i < AOMMIN(xd->n4_h, cm->mi_rows - mi_row); i += mi_step) {
+ int mi_row_offset = i;
+ mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+ n4_h = mi_size_high[mbmi->sb_type];
+ mi_step = AOMMIN(xd->n4_h, n4_h);
+
+ if (mbmi->ref_frame[0] == ref_frame &&
+ mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, i, 1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-left block
+ if (do_tl && left_available && up_available) {
+ int mi_row_offset = -1;
+ int mi_col_offset = -1;
+
+ MB_MODE_INFO *mbmi = xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, 0, -1);
+ pts += 2;
+ pts_inref += 2;
+ np++;
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ // Top-right block
+ if (do_tr &&
+ has_top_right(cm, xd, mi_row, mi_col, AOMMAX(xd->n4_w, xd->n4_h))) {
+ POSITION trb_pos = { -1, xd->n4_w };
+
+ if (is_inside(tile, mi_col, mi_row, &trb_pos)) {
+ int mi_row_offset = -1;
+ int mi_col_offset = xd->n4_w;
+
+ MB_MODE_INFO *mbmi =
+ xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride];
+
+ if (mbmi->ref_frame[0] == ref_frame && mbmi->ref_frame[1] == NONE_FRAME) {
+ record_samples(mbmi, pts, pts_inref, 0, -1, xd->n4_w, 1);
+ np++;  // last possible sample: pts / pts_inref are not advanced
+ if (np >= LEAST_SQUARES_SAMPLES_MAX) return LEAST_SQUARES_SAMPLES_MAX;
+ }
+ }
+ }
+ assert(np <= LEAST_SQUARES_SAMPLES_MAX);
+
+ return np;
+}
+/* Decides whether skip mode may be used for the current frame and which two
+ * entries of cm->frame_refs it uses. cm->is_skip_mode_allowed is reset to 0
+ * and both cm->ref_frame_idx_0/1 to INVALID_IDX; they stay that way when order
+ * hints are disabled, the frame is intra-only, or reference_mode is
+ * SINGLE_REFERENCE. Otherwise the nearest reference before the current frame
+ * and the nearest one after it are located. If both exist they are used; if
+ * only the earlier one exists, the second-nearest earlier reference is used
+ * as its partner. The smaller index goes to ref_frame_idx_0, the larger to
+ * ref_frame_idx_1.
+ */
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm) {
+ cm->is_skip_mode_allowed = 0;
+ cm->ref_frame_idx_0 = cm->ref_frame_idx_1 = INVALID_IDX;
+
+ if (!cm->seq_params.enable_order_hint || frame_is_intra_only(cm) ||
+ cm->reference_mode == SINGLE_REFERENCE)
+ return;
+
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int cur_frame_offset = cm->frame_offset;
+ int ref_frame_offset[2] = { -1, INT_MAX };  // "not found yet" sentinels
+ int ref_idx[2] = { INVALID_IDX, INVALID_IDX };
+
+ // Identify the nearest forward and backward references.
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const int buf_idx = cm->frame_refs[i].idx;
+ if (buf_idx == INVALID_IDX) continue;
+
+ const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+ if (get_relative_dist(cm, ref_offset, cur_frame_offset) < 0) {
+ // Forward reference
+ if (ref_frame_offset[0] == -1 ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[0]) > 0) {
+ ref_frame_offset[0] = ref_offset;
+ ref_idx[0] = i;
+ }
+ } else if (get_relative_dist(cm, ref_offset, cur_frame_offset) > 0) {
+ // Backward reference
+ if (ref_frame_offset[1] == INT_MAX ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[1]) < 0) {
+ ref_frame_offset[1] = ref_offset;
+ ref_idx[1] = i;
+ }
+ }
+ }
+
+ if (ref_idx[0] != INVALID_IDX && ref_idx[1] != INVALID_IDX) {
+ // == Bi-directional prediction ==
+ cm->is_skip_mode_allowed = 1;
+ cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ } else if (ref_idx[0] != INVALID_IDX && ref_idx[1] == INVALID_IDX) {
+ // == Forward prediction only ==
+ // Identify the second nearest forward reference.
+ ref_frame_offset[1] = -1;
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ const int buf_idx = cm->frame_refs[i].idx;
+ if (buf_idx == INVALID_IDX) continue;
+
+ const int ref_offset = frame_bufs[buf_idx].cur_frame_offset;
+ if ((ref_frame_offset[0] != -1 &&
+ get_relative_dist(cm, ref_offset, ref_frame_offset[0]) < 0) &&
+ (ref_frame_offset[1] == -1 ||
+ get_relative_dist(cm, ref_offset, ref_frame_offset[1]) > 0)) {
+ // Second closest forward reference
+ ref_frame_offset[1] = ref_offset;
+ ref_idx[1] = i;
+ }
+ }
+ if (ref_frame_offset[1] != -1) {
+ cm->is_skip_mode_allowed = 1;
+ cm->ref_frame_idx_0 = AOMMIN(ref_idx[0], ref_idx[1]);
+ cm->ref_frame_idx_1 = AOMMAX(ref_idx[0], ref_idx[1]);
+ }
+ }
+}
+/* One reference-map slot as seen by av1_set_frame_refs(), which sorts an
+ * array of these with compare_ref_frame_info(). sort_idx is -1 for a slot
+ * that cannot be used as a reference.
+ */
+typedef struct {
+ int map_idx; // frame map index
+ int buf_idx; // frame buffer index
+ int sort_idx; // index based on the offset to be used for sorting
+} REF_FRAME_INFO;
+/* qsort() comparator for REF_FRAME_INFO: ascending sort_idx, with ties broken
+ * by ascending map_idx. Returns -1, 0 or 1.
+ */
+static int compare_ref_frame_info(const void *arg_a, const void *arg_b) {
+ const REF_FRAME_INFO *info_a = (REF_FRAME_INFO *)arg_a;
+ const REF_FRAME_INFO *info_b = (REF_FRAME_INFO *)arg_b;
+
+ if (info_a->sort_idx < info_b->sort_idx) return -1;
+ if (info_a->sort_idx > info_b->sort_idx) return 1;
+ return (info_a->map_idx < info_b->map_idx)
+ ? -1
+ : ((info_a->map_idx > info_b->map_idx) ? 1 : 0);
+}
+/* Points cm->frame_refs[frame_idx] at the buffer described by ref_info: sets
+ * its idx, buf (address of that frame buffer's buf member) and map_idx.
+ * frame_idx must be in [0, INTER_REFS_PER_FRAME).
+ */
+static void set_ref_frame_info(AV1_COMMON *const cm, int frame_idx,
+ REF_FRAME_INFO *ref_info) {
+ assert(frame_idx >= 0 && frame_idx < INTER_REFS_PER_FRAME);
+
+ const int buf_idx = ref_info->buf_idx;
+
+ cm->frame_refs[frame_idx].idx = buf_idx;
+ cm->frame_refs[frame_idx].buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ cm->frame_refs[frame_idx].map_idx = ref_info->map_idx;
+}
+/* Assigns all INTER_REFS_PER_FRAME entries of cm->frame_refs from the
+ * reference frame map, given the map slots chosen for LAST_FRAME
+ * (lst_map_idx) and GOLDEN_FRAME (gld_map_idx). Order hints must be enabled.
+ * The map slots are sorted by their distance to the current frame. Among the
+ * backward references, ALTREF takes the farthest, BWDREF the nearest and
+ * ALTREF2 the next nearest. References still unset are then filled from the
+ * forward references, nearest first, skipping the LAST and GOLDEN slots; any
+ * left over fall back to the earliest forward reference.
+ * aom_internal_error() is called with AOM_CODEC_CORRUPT_FRAME if LAST or
+ * GOLDEN is not a valid forward reference.
+ */
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx,
+ int gld_map_idx) {
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+ int lst_frame_sort_idx = -1;
+ int gld_frame_sort_idx = -1;
+
+ assert(cm->seq_params.enable_order_hint);
+ assert(cm->seq_params.order_hint_bits_minus_1 >= 0);
+ const int cur_frame_offset = (int)cm->frame_offset;
+ const int cur_frame_sort_idx = 1 << cm->seq_params.order_hint_bits_minus_1;
+
+ REF_FRAME_INFO ref_frame_info[REF_FRAMES];
+ int ref_flag_list[INTER_REFS_PER_FRAME] = { 0, 0, 0, 0, 0, 0, 0 };
+
+ for (int i = 0; i < REF_FRAMES; ++i) {
+ const int map_idx = i;
+
+ ref_frame_info[i].map_idx = map_idx;
+ ref_frame_info[i].sort_idx = -1;  // stays -1 if the slot is skipped below
+
+ const int buf_idx = cm->ref_frame_map[map_idx];
+ ref_frame_info[i].buf_idx = buf_idx;
+
+ if (buf_idx < 0 || buf_idx >= FRAME_BUFFERS) continue;
+ // TODO(zoeliu@google.com): To verify the checking on ref_count.
+ if (frame_bufs[buf_idx].ref_count <= 0) continue;
+
+ const int offset = (int)frame_bufs[buf_idx].cur_frame_offset;
+ ref_frame_info[i].sort_idx =
+ (offset == -1) ? -1
+ : cur_frame_sort_idx +
+ get_relative_dist(cm, offset, cur_frame_offset);
+ assert(ref_frame_info[i].sort_idx >= -1);
+
+ if (map_idx == lst_map_idx) lst_frame_sort_idx = ref_frame_info[i].sort_idx;
+ if (map_idx == gld_map_idx) gld_frame_sort_idx = ref_frame_info[i].sort_idx;
+ }
+
+ // Confirm both LAST_FRAME and GOLDEN_FRAME are valid forward reference
+ // frames.
+ if (lst_frame_sort_idx == -1 || lst_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as LAST");
+ }
+ if (gld_frame_sort_idx == -1 || gld_frame_sort_idx >= cur_frame_sort_idx) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests a look-ahead frame as GOLDEN");
+ }
+
+ // Sort ref frames based on their frame_offset values.
+ qsort(ref_frame_info, REF_FRAMES, sizeof(REF_FRAME_INFO),
+ compare_ref_frame_info);
+
+ // Identify forward and backward reference frames.
+ // Forward reference: offset < cur_frame_offset
+ // Backward reference: offset >= cur_frame_offset
+ int fwd_start_idx = 0, fwd_end_idx = REF_FRAMES - 1;
+
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (ref_frame_info[i].sort_idx == -1) {
+ fwd_start_idx++;  // unusable slots sort first; step past them
+ continue;
+ }
+
+ if (ref_frame_info[i].sort_idx >= cur_frame_sort_idx) {
+ fwd_end_idx = i - 1;
+ break;
+ }
+ }
+
+ int bwd_start_idx = fwd_end_idx + 1;
+ int bwd_end_idx = REF_FRAMES - 1;
+
+ // === Backward Reference Frames ===
+
+ // == ALTREF_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, ALTREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_end_idx]);
+ ref_flag_list[ALTREF_FRAME - LAST_FRAME] = 1;
+ bwd_end_idx--;
+ }
+
+ // == BWDREF_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, BWDREF_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[BWDREF_FRAME - LAST_FRAME] = 1;
+ bwd_start_idx++;
+ }
+
+ // == ALTREF2_FRAME ==
+ if (bwd_start_idx <= bwd_end_idx) {
+ set_ref_frame_info(cm, ALTREF2_FRAME - LAST_FRAME,
+ &ref_frame_info[bwd_start_idx]);
+ ref_flag_list[ALTREF2_FRAME - LAST_FRAME] = 1;
+ }
+
+ // === Forward Reference Frames ===
+
+ for (int i = fwd_start_idx; i <= fwd_end_idx; ++i) {
+ // == LAST_FRAME ==
+ if (ref_frame_info[i].map_idx == lst_map_idx) {
+ set_ref_frame_info(cm, LAST_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ ref_flag_list[LAST_FRAME - LAST_FRAME] = 1;
+ }
+
+ // == GOLDEN_FRAME ==
+ if (ref_frame_info[i].map_idx == gld_map_idx) {
+ set_ref_frame_info(cm, GOLDEN_FRAME - LAST_FRAME, &ref_frame_info[i]);
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] = 1;
+ }
+ }
+
+ assert(ref_flag_list[LAST_FRAME - LAST_FRAME] == 1 &&
+ ref_flag_list[GOLDEN_FRAME - LAST_FRAME] == 1);
+
+ // == LAST2_FRAME ==
+ // == LAST3_FRAME ==
+ // == BWDREF_FRAME ==
+ // == ALTREF2_FRAME ==
+ // == ALTREF_FRAME ==
+
+ // Set up the reference frames in the anti-chronological order.
+ static const MV_REFERENCE_FRAME ref_frame_list[INTER_REFS_PER_FRAME - 2] = {
+ LAST2_FRAME, LAST3_FRAME, BWDREF_FRAME, ALTREF2_FRAME, ALTREF_FRAME
+ };
+
+ int ref_idx;
+ for (ref_idx = 0; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+
+ while (fwd_start_idx <= fwd_end_idx &&
+ (ref_frame_info[fwd_end_idx].map_idx == lst_map_idx ||
+ ref_frame_info[fwd_end_idx].map_idx == gld_map_idx)) {
+ fwd_end_idx--;  // slots already used for LAST / GOLDEN are skipped
+ }
+ if (fwd_start_idx > fwd_end_idx) break;
+
+ set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_end_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+
+ fwd_end_idx--;
+ }
+
+ // Assign all the remaining frame(s), if any, to the earliest reference frame.
+ for (; ref_idx < (INTER_REFS_PER_FRAME - 2); ref_idx++) {
+ const MV_REFERENCE_FRAME ref_frame = ref_frame_list[ref_idx];
+ if (ref_flag_list[ref_frame - LAST_FRAME] == 1) continue;
+ set_ref_frame_info(cm, ref_frame - LAST_FRAME,
+ &ref_frame_info[fwd_start_idx]);
+ ref_flag_list[ref_frame - LAST_FRAME] = 1;
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; i++) {
+ assert(ref_flag_list[i] == 1);
+ }
+}
diff --git a/third_party/aom/av1/common/mvref_common.h b/third_party/aom/av1/common/mvref_common.h
new file mode 100644
index 0000000000..83f7a1ac0d
--- /dev/null
+++ b/third_party/aom/av1/common/mvref_common.h
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_MVREF_COMMON_H_
+#define AOM_AV1_COMMON_MVREF_COMMON_H_
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MVREF_ROW_COLS 3
+
+// Set the upper limit of the motion vector component magnitude.
+// This would make a motion vector fit in 26 bits. Plus 3 bits for the
+// reference frame index. A tuple of motion vector can hence be stored within
+// 32 bit range for efficient load/store operations.
+#define REFMVS_LIMIT ((1 << 12) - 1)
+
+typedef struct position {  // Signed (row, col) offset pair; is_inside() adds it to an mi_row/mi_col position.
+ int row;  // Row offset.
+ int col;  // Column offset.
+} POSITION;
+
+// clamp_mv_ref
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+
+// Returns the wrapped, signed difference (a - b) between two order hints.
+// The difference is sign-extended from the sequence's order-hint bit width,
+// so the result lies in [-2^(bits-1), 2^(bits-1) - 1]. Returns 0 when order
+// hints are disabled for the sequence.
+static INLINE int get_relative_dist(const AV1_COMMON *cm, int a, int b) {
+  if (cm->seq_params.enable_order_hint == 0) return 0;
+
+  const int hint_bits = cm->seq_params.order_hint_bits_minus_1 + 1;
+  assert(hint_bits >= 1);
+  assert(a >= 0 && a < (1 << hint_bits));
+  assert(b >= 0 && b < (1 << hint_bits));
+
+  // Keep the low (hint_bits - 1) bits as the magnitude and subtract the top
+  // bit, which sign-extends the hint_bits-wide difference.
+  const int sign_bit = 1 << (hint_bits - 1);
+  const int delta = a - b;
+  return (delta & (sign_bit - 1)) - (delta & sign_bit);
+}
+
+// Clamps *mv with clamp_mv() against the mb_to_*_edge values held in xd,
+// each widened by the block dimension (bw or bh, times 8) plus MV_BORDER.
+static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
+  const int left_limit = xd->mb_to_left_edge - bw * 8 - MV_BORDER;
+  const int right_limit = xd->mb_to_right_edge + bw * 8 + MV_BORDER;
+  const int top_limit = xd->mb_to_top_edge - bh * 8 - MV_BORDER;
+  const int bottom_limit = xd->mb_to_bottom_edge + bh * 8 + MV_BORDER;
+  clamp_mv(mv, left_limit, right_limit, top_limit, bottom_limit);
+}
+
+// Returns the candidate block's motion vector selected by which_mv, i.e.
+// candidate->mv[which_mv]. search_col is ignored.
+static INLINE int_mv get_sub_block_mv(const MB_MODE_INFO *candidate,
+                                      int which_mv, int search_col) {
+  (void)search_col;  // Intentionally unused.
+  const int_mv block_mv = candidate->mv[which_mv];
+  return block_mv;
+}
+
+// Returns candidate->mv[which_mv]; search_col is ignored.
+static INLINE int_mv get_sub_block_pred_mv(const MB_MODE_INFO *candidate,
+                                           int which_mv, int search_col) {
+  (void)search_col;  // Intentionally unused.
+  const int_mv pred_mv = candidate->mv[which_mv];
+  return pred_mv;
+}
+
+// Returns mbmi->mv[ref], with both components negated when the sign bias of
+// the block's own reference frame differs from that of this_ref_frame.
+static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
+                              const MV_REFERENCE_FRAME this_ref_frame,
+                              const int *ref_sign_bias) {
+  int_mv mv = mbmi->mv[ref];
+  const int block_bias = ref_sign_bias[mbmi->ref_frame[ref]];
+  const int target_bias = ref_sign_bias[this_ref_frame];
+
+  // Same direction: the vector can be used unchanged.
+  if (block_bias == target_bias) return mv;
+
+  mv.as_mv.row *= -1;
+  mv.as_mv.col *= -1;
+  return mv;
+}
+
+// Returns 1 when the position (mi_row, mi_col) displaced by *mi_pos lies
+// inside the tile, i.e. within [mi_row_start, mi_row_end) and
+// [mi_col_start, mi_col_end); returns 0 otherwise.
+static INLINE int is_inside(const TileInfo *const tile, int mi_col, int mi_row,
+                            const POSITION *mi_pos) {
+  const int row = mi_row + mi_pos->row;
+  const int col = mi_col + mi_pos->col;
+  return row >= tile->mi_row_start && col >= tile->mi_col_start &&
+         row < tile->mi_row_end && col < tile->mi_col_end;
+}
+
+// Clamps row_offset so that mi_row + row_offset stays within the tile's
+// [mi_row_start, mi_row_end - 1] range.
+static INLINE int find_valid_row_offset(const TileInfo *const tile, int mi_row,
+                                        int row_offset) {
+  const int lowest = tile->mi_row_start - mi_row;
+  const int highest = tile->mi_row_end - mi_row - 1;
+  return clamp(row_offset, lowest, highest);
+}
+
+// Clamps col_offset so that mi_col + col_offset stays within the tile's
+// [mi_col_start, mi_col_end - 1] range.
+static INLINE int find_valid_col_offset(const TileInfo *const tile, int mi_col,
+                                        int col_offset) {
+  const int lowest = tile->mi_col_start - mi_col;
+  const int highest = tile->mi_col_end - mi_col - 1;
+  return clamp(col_offset, lowest, highest);
+}
+
+// Reduces the precision of *mv in place.
+// - is_integer set: delegates to integer_mv_precision().
+// - otherwise, when allow_hp is 0: every odd component is moved one step
+//   toward zero so that it becomes even.
+// - otherwise: *mv is left unchanged.
+static INLINE void lower_mv_precision(MV *mv, int allow_hp, int is_integer) {
+  if (is_integer) {
+    integer_mv_precision(mv);
+    return;
+  }
+  if (allow_hp) return;
+
+  if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
+  if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
+}
+
+// Returns the index of the uni-directional compound pair matching
+// (rf[0], rf[1]) according to comp_ref0()/comp_ref1(), or -1 when the pair is
+// a single reference, a bi-directional compound, or not in the list.
+static INLINE int8_t get_uni_comp_ref_idx(const MV_REFERENCE_FRAME *const rf) {
+  // Single reference prediction.
+  const int is_single_ref = rf[1] <= INTRA_FRAME;
+  if (is_single_ref) return -1;
+
+  // Bi-directional compound prediction.
+  const int is_bidir = (rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME);
+  if (is_bidir) return -1;
+
+  for (int8_t idx = 0; idx < TOTAL_UNIDIR_COMP_REFS; ++idx) {
+    if (rf[0] != comp_ref0(idx) || rf[1] != comp_ref1(idx)) continue;
+    return idx;
+  }
+  return -1;
+}
+
+// Maps a reference frame pair to a single reference-type index.
+// - Single reference (rf[1] <= INTRA_FRAME): returns rf[0].
+// - Uni-directional compound: REF_FRAMES + FWD_REFS * BWD_REFS + its index.
+// - Otherwise: REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+//   BWD_RF_OFFSET(rf[1]) * FWD_REFS.
+static INLINE int8_t av1_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+  if (rf[1] <= INTRA_FRAME) return rf[0];
+
+  const int8_t uni_idx = get_uni_comp_ref_idx(rf);
+  if (uni_idx < 0) {
+    return REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+           BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+  }
+
+  assert((REF_FRAMES + FWD_REFS * BWD_REFS + uni_idx) < MODE_CTX_REF_FRAMES);
+  return REF_FRAMES + FWD_REFS * BWD_REFS + uni_idx;
+}
+
+// clang-format off
+// Lookup table from a compound reference type (ref_frame_type - REF_FRAMES)
+// to its pair of reference frames, as used by av1_set_ref_frame().
+// Declared const: it is a read-only table, and because this header-defined
+// static is instantiated in every translation unit that includes it, const
+// lets each copy live in read-only storage.
+static const MV_REFERENCE_FRAME ref_frame_map[TOTAL_COMP_REFS][2] = {
+  { LAST_FRAME, BWDREF_FRAME }, { LAST2_FRAME, BWDREF_FRAME },
+  { LAST3_FRAME, BWDREF_FRAME }, { GOLDEN_FRAME, BWDREF_FRAME },
+
+  { LAST_FRAME, ALTREF2_FRAME }, { LAST2_FRAME, ALTREF2_FRAME },
+  { LAST3_FRAME, ALTREF2_FRAME }, { GOLDEN_FRAME, ALTREF2_FRAME },
+
+  { LAST_FRAME, ALTREF_FRAME }, { LAST2_FRAME, ALTREF_FRAME },
+  { LAST3_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME },
+
+  { LAST_FRAME, LAST2_FRAME }, { LAST_FRAME, LAST3_FRAME },
+  { LAST_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF_FRAME },
+
+  // NOTE: Following reference frame pairs are not supported to be explicitly
+  //       signalled, but they are possibly chosen by the use of skip_mode,
+  //       which may use the most recent one-sided reference frame pair.
+  { LAST2_FRAME, LAST3_FRAME }, { LAST2_FRAME, GOLDEN_FRAME },
+  { LAST3_FRAME, GOLDEN_FRAME }, { BWDREF_FRAME, ALTREF2_FRAME },
+  { ALTREF2_FRAME, ALTREF_FRAME }
+};
+// clang-format on
+
+// Expands a reference-type index into a reference frame pair written to
+// rf[0] and rf[1]. Values below REF_FRAMES are single references (rf[1]
+// becomes NONE_FRAME); larger values index ref_frame_map.
+static INLINE void av1_set_ref_frame(MV_REFERENCE_FRAME *rf,
+                                     int8_t ref_frame_type) {
+  if (ref_frame_type < REF_FRAMES) {
+    rf[0] = ref_frame_type;
+    rf[1] = NONE_FRAME;
+    assert(ref_frame_type > NONE_FRAME);
+    return;
+  }
+
+  const int comp_idx = ref_frame_type - REF_FRAMES;
+  rf[0] = ref_frame_map[comp_idx][0];
+  rf[1] = ref_frame_map[comp_idx][1];
+}
+
+// Compound mode context lookup, indexed as [refmv_ctx >> 1][newmv_ctx] by
+// av1_mode_context_analyzer(). Declared const: it is a read-only table, and
+// this header-defined static is instantiated in every including translation
+// unit.
+static const uint16_t compound_mode_ctx_map[3][COMP_NEWMV_CTXS] = {
+  { 0, 1, 1, 1, 1 },
+  { 1, 2, 3, 4, 4 },
+  { 4, 4, 5, 6, 7 },
+};
+
+// Returns the mode context for the reference pair rf.
+// For a single reference this is mode_context[] at the pair's reference type.
+// For a compound pair, the NEWMV and REFMV sub-contexts are extracted from
+// that entry and combined through compound_mode_ctx_map.
+static INLINE int16_t av1_mode_context_analyzer(
+    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf) {
+  const int8_t ref_type = av1_ref_frame_type(rf);
+  const int16_t packed_ctx = mode_context[ref_type];
+
+  if (rf[1] <= INTRA_FRAME) return packed_ctx;
+
+  const int16_t newmv_ctx = packed_ctx & NEWMV_CTX_MASK;
+  const int16_t refmv_ctx = (packed_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  // The NEWMV context is capped to the last column of the table.
+  const int column = AOMMIN(newmv_ctx, COMP_NEWMV_CTXS - 1);
+  return compound_mode_ctx_map[refmv_ctx >> 1][column];
+}
+
+// Derives a context (0, 1 or 2) from the weights of the stack entries at
+// ref_idx and ref_idx + 1 compared against REF_CAT_LEVEL:
+//   next entry at or above the level          -> 0
+//   current at or above, next below the level -> 1
+//   both below the level                      -> 2
+static INLINE uint8_t av1_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
+                                  int ref_idx) {
+  const int cur_high = ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL;
+  const int next_high = ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL;
+
+  if (next_high) return 0;
+  return cur_high ? 1 : 2;
+}
+
+void av1_setup_frame_buf_refs(AV1_COMMON *cm);
+void av1_setup_frame_sign_bias(AV1_COMMON *cm);
+void av1_setup_skip_mode_allowed(AV1_COMMON *cm);
+void av1_setup_motion_field(AV1_COMMON *cm);
+void av1_set_frame_refs(AV1_COMMON *const cm, int lst_map_idx, int gld_map_idx);
+
+// Rebuilds xd->neighbors_ref_counts: clears it, then for the above and left
+// neighbors (each only when available and inter-coded) increments the counter
+// of every reference frame that neighbor uses.
+static INLINE void av1_collect_neighbors_ref_counts(MACROBLOCKD *const xd) {
+  av1_zero(xd->neighbors_ref_counts);
+
+  uint8_t *const ref_counts = xd->neighbors_ref_counts;
+
+  // Index 0 is the above neighbor, index 1 the left neighbor.
+  const MB_MODE_INFO *const neighbors[2] = { xd->above_mbmi, xd->left_mbmi };
+  const int in_image[2] = { xd->up_available, xd->left_available };
+
+  for (int n = 0; n < 2; ++n) {
+    const MB_MODE_INFO *const nb = neighbors[n];
+    if (!in_image[n] || !is_inter_block(nb)) continue;
+
+    ref_counts[nb->ref_frame[0]]++;
+    if (has_second_ref(nb)) ref_counts[nb->ref_frame[1]]++;
+  }
+}
+
+void av1_copy_frame_mvs(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *const mi, int mi_row, int mi_col,
+ int x_mis, int y_mis);
+
+void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ MB_MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES],
+ CANDIDATE_MV ref_mv_stack[][MAX_REF_MV_STACK_SIZE],
+ int_mv mv_ref_list[][MAX_MV_REF_CANDIDATES],
+ int_mv *global_mvs, int mi_row, int mi_col,
+ int16_t *mode_context);
+
+// Checks a list of motion vectors by SAD score, using a number of rows of
+// pixels above and a number of columns of pixels to the left, to select the
+// one with the best score to use as the reference motion vector.
+void av1_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *nearest_mv,
+ int_mv *near_mv, int is_integer);
+
+int selectSamples(MV *mv, int *pts, int *pts_inref, int len, BLOCK_SIZE bsize);
+int findSamples(const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+ int *pts, int *pts_inref);
+
+#define INTRABC_DELAY_PIXELS 256 // Delay of 256 pixels
+#define INTRABC_DELAY_SB64 (INTRABC_DELAY_PIXELS / 64)
+
+// Writes a default displacement vector to *ref_dv.
+// When there is no full superblock row above mi_row inside the tile, the
+// vector points left by one superblock plus INTRABC_DELAY_PIXELS; otherwise
+// it points up by one superblock. Both components are then multiplied by 8.
+// mi_col is unused.
+static INLINE void av1_find_ref_dv(int_mv *ref_dv, const TileInfo *const tile,
+                                   int mib_size, int mi_row, int mi_col) {
+  (void)mi_col;
+
+  const int no_sb_row_above = (mi_row - mib_size < tile->mi_row_start);
+  ref_dv->as_mv.row = no_sb_row_above ? 0 : -MI_SIZE * mib_size;
+  ref_dv->as_mv.col =
+      no_sb_row_above ? -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS : 0;
+
+  ref_dv->as_mv.row *= 8;
+  ref_dv->as_mv.col *= 8;
+}
+
+static INLINE int av1_is_dv_valid(const MV dv, const AV1_COMMON *cm,  // Returns 1 when dv passes every check below, 0 otherwise.
+ const MACROBLOCKD *xd, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int mib_size_log2) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int SCALE_PX_TO_MV = 8;  // Pixel positions are multiplied by 8 before being combined with dv.
+ // Disallow subpixel for now
+ // SUBPEL_MASK is not the correct scale
+ if (((dv.row & (SCALE_PX_TO_MV - 1)) || (dv.col & (SCALE_PX_TO_MV - 1))))
+ return 0;
+
+ const TileInfo *const tile = &xd->tile;
+ // Is the source top-left inside the current tile?
+ const int src_top_edge = mi_row * MI_SIZE * SCALE_PX_TO_MV + dv.row;
+ const int tile_top_edge = tile->mi_row_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_top_edge < tile_top_edge) return 0;
+ const int src_left_edge = mi_col * MI_SIZE * SCALE_PX_TO_MV + dv.col;
+ const int tile_left_edge = tile->mi_col_start * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_left_edge < tile_left_edge) return 0;
+ // Is the bottom right inside the current tile?
+ const int src_bottom_edge = (mi_row * MI_SIZE + bh) * SCALE_PX_TO_MV + dv.row;
+ const int tile_bottom_edge = tile->mi_row_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_bottom_edge > tile_bottom_edge) return 0;
+ const int src_right_edge = (mi_col * MI_SIZE + bw) * SCALE_PX_TO_MV + dv.col;
+ const int tile_right_edge = tile->mi_col_end * MI_SIZE * SCALE_PX_TO_MV;
+ if (src_right_edge > tile_right_edge) return 0;
+
+ // Special case for sub 8x8 chroma cases, to prevent referring to chroma
+ // pixels outside current tile.
+ for (int plane = 1; plane < av1_num_planes(cm); ++plane) {  // Plane 0 is skipped; only the remaining planes are checked.
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y)) {
+ if (bw < 8 && pd->subsampling_x)
+ if (src_left_edge < tile_left_edge + 4 * SCALE_PX_TO_MV) return 0;  // Require 4 extra pixels to the left of the source.
+ if (bh < 8 && pd->subsampling_y)
+ if (src_top_edge < tile_top_edge + 4 * SCALE_PX_TO_MV) return 0;  // Require 4 extra pixels above the source.
+ }
+ }
+
+ // Is the bottom right within an already coded SB? Also consider additional
+ // constraints to facilitate HW decoder.
+ const int max_mib_size = 1 << mib_size_log2;
+ const int active_sb_row = mi_row >> mib_size_log2;
+ const int active_sb64_col = (mi_col * MI_SIZE) >> 6;  // Column of the current block in 64-pixel units.
+ const int sb_size = max_mib_size * MI_SIZE;  // Superblock size in pixels.
+ const int src_sb_row = ((src_bottom_edge >> 3) - 1) / sb_size;  // >> 3 undoes the x8 scale; - 1 selects the last covered pixel row.
+ const int src_sb64_col = ((src_right_edge >> 3) - 1) >> 6;  // Same for the last covered pixel column, in 64-pixel units.
+ const int total_sb64_per_row =
+ ((tile->mi_col_end - tile->mi_col_start - 1) >> 4) + 1;  // Tile width rounded up to whole 16-mi groups; presumably 16 mi == 64 pixels (MI_SIZE == 4) -- confirm.
+ const int active_sb64 = active_sb_row * total_sb64_per_row + active_sb64_col;
+ const int src_sb64 = src_sb_row * total_sb64_per_row + src_sb64_col;
+ if (src_sb64 >= active_sb64 - INTRABC_DELAY_SB64) return 0;  // Source must trail the current unit by more than INTRABC_DELAY_SB64 units.
+
+ // Wavefront constraint: use only top left area of frame for reference.
+ const int gradient = 1 + INTRABC_DELAY_SB64 + (sb_size > 64);
+ const int wf_offset = gradient * (active_sb_row - src_sb_row);  // Allowed column range widens by 'gradient' per superblock row above.
+ if (src_sb_row > active_sb_row ||
+ src_sb64_col >= active_sb64_col - INTRABC_DELAY_SB64 + wf_offset)
+ return 0;
+
+ return 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_MVREF_COMMON_H_
diff --git a/third_party/aom/av1/common/obmc.h b/third_party/aom/av1/common/obmc.h
new file mode 100644
index 0000000000..1c90cd93fc
--- /dev/null
+++ b/third_party/aom/av1/common/obmc.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_OBMC_H_
+#define AOM_AV1_COMMON_OBMC_H_
+
+typedef void (*overlappable_nb_visitor_t)(MACROBLOCKD *xd, int rel_mi_pos,  // Callback run once per overlappable neighbor; rel_mi_pos is the neighbor's mi position minus the current block's.
+ uint8_t nb_mi_size,  // The foreach_* callers pass AOMMIN(current block n4_w or n4_h, neighbor step).
+ MB_MODE_INFO *nb_mi, void *fun_ctxt,  // nb_mi: the neighbor's mode info; fun_ctxt: caller-supplied pointer forwarded unchanged.
+ const int num_planes);
+
+static INLINE void foreach_overlappable_nb_above(const AV1_COMMON *cm,  // Calls fun for up to nb_max overlappable neighbors in the row above the current block.
+ MACROBLOCKD *xd, int mi_col,
+ int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ const int num_planes = av1_num_planes(cm);
+ if (!xd->up_available) return;  // No row above: nothing to visit.
+
+ int nb_count = 0;
+
+ // prev_row_mi points into the mi array, starting at the beginning of the
+ // previous row.
+ MB_MODE_INFO **prev_row_mi = xd->mi - mi_col - 1 * xd->mi_stride;
+ const int end_col = AOMMIN(mi_col + xd->n4_w, cm->mi_cols);  // Stop at the block's right edge or at cm->mi_cols, whichever comes first.
+ uint8_t mi_step;
+ for (int above_mi_col = mi_col; above_mi_col < end_col && nb_count < nb_max;
+ above_mi_col += mi_step) {
+ MB_MODE_INFO **above_mi = prev_row_mi + above_mi_col;
+ mi_step =
+ AOMMIN(mi_size_wide[above_mi[0]->sb_type], mi_size_wide[BLOCK_64X64]);  // Step by the neighbor's width, capped at a 64x64 block's width.
+ // If we're considering a block with width 4, it should be treated as
+ // half of a pair of blocks with chroma information in the second. Move
+ // above_mi_col back to the start of the pair if needed, set above_mbmi
+ // to point at the block with chroma information, and set mi_step to 2 to
+ // step over the entire pair at the end of the iteration.
+ if (mi_step == 1) {
+ above_mi_col &= ~1;
+ above_mi = prev_row_mi + above_mi_col + 1;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*above_mi)) {
+ ++nb_count;  // Only overlappable neighbors count toward nb_max.
+ fun(xd, above_mi_col - mi_col, AOMMIN(xd->n4_w, mi_step), *above_mi,
+ fun_ctxt, num_planes);
+ }
+ }
+}
+
+static INLINE void foreach_overlappable_nb_left(const AV1_COMMON *cm,  // Calls fun for up to nb_max overlappable neighbors in the column left of the current block.
+ MACROBLOCKD *xd, int mi_row,
+ int nb_max,
+ overlappable_nb_visitor_t fun,
+ void *fun_ctxt) {
+ const int num_planes = av1_num_planes(cm);
+ if (!xd->left_available) return;  // No column to the left: nothing to visit.
+
+ int nb_count = 0;
+
+ // prev_col_mi points into the mi array, starting at the top of the
+ // previous column
+ MB_MODE_INFO **prev_col_mi = xd->mi - 1 - mi_row * xd->mi_stride;
+ const int end_row = AOMMIN(mi_row + xd->n4_h, cm->mi_rows);  // Stop at the block's bottom edge or at cm->mi_rows, whichever comes first.
+ uint8_t mi_step;
+ for (int left_mi_row = mi_row; left_mi_row < end_row && nb_count < nb_max;
+ left_mi_row += mi_step) {
+ MB_MODE_INFO **left_mi = prev_col_mi + left_mi_row * xd->mi_stride;
+ mi_step =
+ AOMMIN(mi_size_high[left_mi[0]->sb_type], mi_size_high[BLOCK_64X64]);  // Step by the neighbor's height, capped at a 64x64 block's height.
+ if (mi_step == 1) {  // Height-1 neighbor: realign to the even row, use the second block of the pair, and step over both.
+ left_mi_row &= ~1;
+ left_mi = prev_col_mi + (left_mi_row + 1) * xd->mi_stride;
+ mi_step = 2;
+ }
+ if (is_neighbor_overlappable(*left_mi)) {
+ ++nb_count;  // Only overlappable neighbors count toward nb_max.
+ fun(xd, left_mi_row - mi_row, AOMMIN(xd->n4_h, mi_step), *left_mi,
+ fun_ctxt, num_planes);
+ }
+ }
+}
+
+#endif // AOM_AV1_COMMON_OBMC_H_
diff --git a/third_party/aom/av1/common/obu_util.c b/third_party/aom/av1/common/obu_util.c
new file mode 100644
index 0000000000..823b700b13
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/common/obu_util.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+
+// Returns 1 when obu_type is one of the OBU types recognised below, and 0
+// for any other value.
+static int valid_obu_type(int obu_type) {
+  switch (obu_type) {
+    case OBU_SEQUENCE_HEADER:
+    case OBU_TEMPORAL_DELIMITER:
+    case OBU_FRAME_HEADER:
+    case OBU_TILE_GROUP:
+    case OBU_METADATA:
+    case OBU_FRAME:
+    case OBU_REDUNDANT_FRAME_HEADER:
+    case OBU_TILE_LIST:
+    case OBU_PADDING: return 1;
+    default: return 0;
+  }
+}
+
+// Decodes a LEB128-coded size from 'data' using aom_uleb_decode().
+// On success stores the value in *obu_size (the number of bytes used by the
+// size field is reported through length_field_size) and returns AOM_CODEC_OK.
+// Returns AOM_CODEC_CORRUPT_FRAME when decoding fails or the decoded value
+// exceeds UINT32_MAX.
+static aom_codec_err_t read_obu_size(const uint8_t *data,
+                                     size_t bytes_available,
+                                     size_t *const obu_size,
+                                     size_t *const length_field_size) {
+  uint64_t decoded_size = 0;
+  const int decode_failed =
+      aom_uleb_decode(data, bytes_available, &decoded_size,
+                      length_field_size) != 0;
+  if (decode_failed) return AOM_CODEC_CORRUPT_FRAME;
+
+  // Sizes that do not fit in 32 bits are treated as corruption.
+  if (decoded_size > UINT32_MAX) return AOM_CODEC_CORRUPT_FRAME;
+
+  *obu_size = (size_t)decoded_size;
+  return AOM_CODEC_OK;
+}
+
+// Parses the OBU header (and optional extension byte) from 'rb' into 'header'; returns AOM_CODEC_OK or an error code.
+static aom_codec_err_t read_obu_header(struct aom_read_bit_buffer *rb,
+ int is_annexb, ObuHeader *header) {
+ if (!rb || !header) return AOM_CODEC_INVALID_PARAM;
+
+ const ptrdiff_t bit_buffer_byte_length = rb->bit_buffer_end - rb->bit_buffer;
+ if (bit_buffer_byte_length < 1) return AOM_CODEC_CORRUPT_FRAME;  // Need at least the one mandatory header byte.
+
+ header->size = 1;  // One header byte so far; becomes 2 if an extension is present.
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // Forbidden bit. Must not be set.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ header->type = (OBU_TYPE)aom_rb_read_literal(rb, 4);  // 4-bit OBU type.
+
+ if (!valid_obu_type(header->type)) return AOM_CODEC_CORRUPT_FRAME;
+
+ header->has_extension = aom_rb_read_bit(rb);
+ header->has_size_field = aom_rb_read_bit(rb);
+
+ if (!header->has_size_field && !is_annexb) {
+ // section 5 obu streams must have obu_size field set.
+ return AOM_CODEC_UNSUP_BITSTREAM;
+ }
+
+ if (aom_rb_read_bit(rb) != 0) {
+ // obu_reserved_1bit must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+
+ if (header->has_extension) {
+ if (bit_buffer_byte_length == 1) return AOM_CODEC_CORRUPT_FRAME;  // No room for the extension byte.
+
+ header->size += 1;
+ header->temporal_layer_id = aom_rb_read_literal(rb, 3);  // 3 bits.
+ header->spatial_layer_id = aom_rb_read_literal(rb, 2);  // 2 bits.
+ if (aom_rb_read_literal(rb, 3) != 0) {
+ // extension_header_reserved_3bits must be set to 0.
+ return AOM_CODEC_CORRUPT_FRAME;
+ }
+ }
+
+ return AOM_CODEC_OK;
+}
+
+// Parses the OBU header at the start of 'buffer' ('buffer_length' bytes long)
+// into 'header'.
+// On success stores the header size in bytes in *consumed and returns
+// AOM_CODEC_OK; *consumed is left untouched on failure.
+// Returns AOM_CODEC_INVALID_PARAM for a NULL or empty buffer or a NULL output
+// pointer, otherwise the status reported by read_obu_header().
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+                                    size_t *consumed, ObuHeader *header,
+                                    int is_annexb) {
+  // Reject a NULL buffer as well: the bit reader below would dereference it.
+  if (!buffer || buffer_length < 1 || !consumed || !header) {
+    return AOM_CODEC_INVALID_PARAM;
+  }
+
+  // TODO(tomfinegan): Set the error handler here and throughout this file, and
+  // confirm parsing work done via aom_read_bit_buffer is successful.
+  struct aom_read_bit_buffer rb = { buffer, buffer + buffer_length, 0, NULL,
+                                    NULL };
+  aom_codec_err_t parse_result = read_obu_header(&rb, is_annexb, header);
+  if (parse_result == AOM_CODEC_OK) *consumed = header->size;
+  return parse_result;
+}
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,  // Parses one OBU header plus its size; sets *payload_size and *bytes_read on success.
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read) {
+ size_t length_field_size = 0, obu_size = 0;  // Both stay 0 until a size field has been decoded.
+ aom_codec_err_t status;
+
+ if (is_annexb) {
+ // Size field comes before the OBU header, and includes the OBU header
+ status =
+ read_obu_size(data, bytes_available, &obu_size, &length_field_size);
+
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ struct aom_read_bit_buffer rb = { data + length_field_size,  // In Annex B mode the header starts after the leading size field.
+ data + bytes_available, 0, NULL, NULL };
+
+ status = read_obu_header(&rb, is_annexb, obu_header);
+ if (status != AOM_CODEC_OK) return status;
+
+ if (is_annexb) {  // NOTE(review): obu_header->has_size_field is not consulted in this branch -- confirm callers handle an Annex B OBU that also carries a size field.
+ // Derive the payload size from the data we've already read
+ if (obu_size < obu_header->size) return AOM_CODEC_CORRUPT_FRAME;  // Declared size cannot even hold the header.
+
+ *payload_size = obu_size - obu_header->size;
+ } else {
+ // Size field comes after the OBU header, and is just the payload size
+ status = read_obu_size(data + obu_header->size,
+ bytes_available - obu_header->size, payload_size,
+ &length_field_size);
+ if (status != AOM_CODEC_OK) return status;
+ }
+
+ *bytes_read = length_field_size + obu_header->size;  // Size-field bytes plus header bytes; the payload is not included.
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/common/obu_util.h b/third_party/aom/av1/common/obu_util.h
new file mode 100644
index 0000000000..7c56904c84
--- /dev/null
+++ b/third_party/aom/av1/common/obu_util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_OBU_UTIL_H_
+#define AOM_AV1_COMMON_OBU_UTIL_H_
+
+#include "aom/aom_codec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {  // Parsed fields of an OBU header.
+ size_t size; // Size (1 or 2 bytes) of the OBU header (including the
+ // optional OBU extension header) in the bitstream.
+ OBU_TYPE type;  // OBU type value read from the header.
+ int has_size_field;  // Nonzero when the header's size-field flag is set.
+ int has_extension;  // Nonzero when the header's extension flag is set.
+ // The following fields come from the OBU extension header and therefore are
+ // only used if has_extension is true.
+ int temporal_layer_id;  // 3-bit field.
+ int spatial_layer_id;  // 2-bit field.
+} ObuHeader;
+
+aom_codec_err_t aom_read_obu_header(uint8_t *buffer, size_t buffer_length,
+ size_t *consumed, ObuHeader *header,
+ int is_annexb);
+
+aom_codec_err_t aom_read_obu_header_and_size(const uint8_t *data,
+ size_t bytes_available,
+ int is_annexb,
+ ObuHeader *obu_header,
+ size_t *const payload_size,
+ size_t *const bytes_read);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_OBU_UTIL_H_
diff --git a/third_party/aom/av1/common/odintrin.c b/third_party/aom/av1/common/odintrin.c
new file mode 100644
index 0000000000..7584b2e52f
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#include "av1/common/odintrin.h"
+
+/*Constants for use with OD_DIVU_SMALL().
+ See \cite{Rob05} for details on computing these constants.
+ @INPROCEEDINGS{Rob05,
+ author="Arch D. Robison",
+ title="{N}-bit Unsigned Division via {N}-bit Multiply-Add",
+ booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic
+ (ARITH'05)",
+ pages="131--139",
+ address="Cape Cod, MA",
+ month=Jun,
+ year=2005
+ }*/
+uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = {
+ { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 },
+ { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 },
+ { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 },
+ { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 },
+ { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 },
+ { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 },
+ { 0x8D3DCB09, 0 }, { 0x88888889, 0 },
+ { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 },
+ { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 },
+ { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 },
+ { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 },
+ { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED },
+ { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 },
+ { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 },
+ { 0x8AD8F2FC, 0 }, { 0x88888889, 0 },
+ { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 },
+ { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 },
+ { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 },
+ { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 },
+ { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 },
+ { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 },
+ { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 },
+ { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 },
+ { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 },
+ { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED },
+ { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F },
+ { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 },
+ { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 },
+ { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 },
+ { 0x89AE408A, 0 }, { 0x88888889, 0 },
+ { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 },
+ { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 },
+ { 0x83126E98, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 },
+ { 0xF6603D99, 0 }, { 0xF4898D60, 0 },
+ { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEF2EB720, 0 }, { 0xED7303B6, 0 },
+ { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 },
+ { 0xE525982B, 0 }, { 0xE38E38E4, 0 },
+ { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 },
+ { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 },
+ { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 },
+ { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 },
+ { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 },
+ { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC6980C6A, 0 }, { 0xC565C87C, 0 },
+ { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 },
+ { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 },
+ { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 },
+ { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB30F6353, 0 }, { 0xB21642C9, 0 },
+ { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 },
+ { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 },
+ { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 },
+ { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 },
+ { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 },
+ { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 },
+ { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 },
+ { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 },
+ { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 },
+ { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 },
+ { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 },
+ { 0x891AC73B, 0 }, { 0x88888889, 0 },
+ { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x86D90545, 0 }, { 0x864B8A7E, 0 },
+ { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 },
+ { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 },
+ { 0x83993052, 0x83993052 }, { 0x83126E98, 0 },
+ { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF00FF01, 0 }, { 0xFE03F810, 0 },
+ { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB188566, 0 }, { 0xFA232CF3, 0 },
+ { 0xF92FB222, 0 }, { 0xF83E0F84, 0 },
+ { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 },
+ { 0xF57403D6, 0 }, { 0xF4898D60, 0 },
+ { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 },
+ { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 },
+ { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 },
+ { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 },
+ { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9396520, 0 }, { 0xE865AC7C, 0 },
+ { 0xE79372E3, 0 }, { 0xE6C2B449, 0 },
+ { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 },
+ { 0xE45932D8, 0 }, { 0xE38E38E4, 0 },
+ { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 },
+ { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 },
+ { 0xD9BA4257, 0 }, { 0xD901B204, 0 },
+ { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 },
+ { 0xD578E97D, 0 }, { 0xD4C77B04, 0 },
+ { 0xD417328A, 0 }, { 0xD3680D37, 0 },
+ { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 },
+ { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 },
+ { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC29786D, 0 }, { 0xCB8727C1, 0 },
+ { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 },
+ { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 },
+ { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 },
+ { 0xC73293D8, 0 }, { 0xC6980C6A, 0 },
+ { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 },
+ { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 },
+ { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 },
+ { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBCDD535E, 0 }, { 0xBC52640C, 0 },
+ { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 },
+ { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA },
+ { 0xB89BC36D, 0 }, { 0xB81702E1, 0 },
+ { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 },
+ { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 },
+ { 0xB2927C2A, 0 }, { 0xB21642C9, 0 },
+ { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 },
+ { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 },
+ { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 },
+ { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA392F36, 0 }, { 0xA9C84A48, 0 },
+ { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA79C7B17, 0 }, { 0xA72F053A, 0 },
+ { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 },
+ { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 },
+ { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 },
+ { 0xA36E71A3, 0 }, { 0xA3065E40, 0 },
+ { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA1D13986, 0 }, { 0xA16B312F, 0 },
+ { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA03C1689, 0 }, { 0x9FD809FE, 0 },
+ { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 },
+ { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 },
+ { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 },
+ { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 },
+ { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 },
+ { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 },
+ { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED },
+ { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 },
+ { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 },
+ { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 },
+ { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 },
+ { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 },
+ { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 },
+ { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x89F8746A, 0 }, { 0x89AE408A, 0 },
+ { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 },
+ { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 },
+ { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 },
+ { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 },
+ { 0x869222B2, 0 }, { 0x864B8A7E, 0 },
+ { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 },
+ { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 },
+ { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 },
+ { 0x8355ACE4, 0 }, { 0x83126E98, 0 },
+ { 0x82CF7504, 0 }, { 0x828CBFBF, 0 },
+ { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 },
+ { 0x814327E4, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80C121B3, 0 }, { 0x80808081, 0 },
+ { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 },
+ { 0xFE823CA6, 0 }, { 0xFE03F810, 0 },
+ { 0xFD863087, 0 }, { 0xFD08E551, 0 },
+ { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB93E673, 0 }, { 0xFB188566, 0 },
+ { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 },
+ { 0xF9A9342D, 0 }, { 0xF92FB222, 0 },
+ { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 },
+ { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 },
+ { 0xF6D7054E, 0 }, { 0xF6603D99, 0 },
+ { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 },
+ { 0xF4FE9083, 0 }, { 0xF4898D60, 0 },
+ { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 },
+ { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 },
+ { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 },
+ { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 },
+ { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 },
+ { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 },
+ { 0xEDE155F4, 0 }, { 0xED7303B6, 0 },
+ { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 },
+ { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 },
+ { 0xEB5159A0, 0 }, { 0xEAE56404, 0 },
+ { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 },
+ { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 },
+ { 0xE7FC600F, 0 }, { 0xE79372E3, 0 },
+ { 0xE72AE476, 0 }, { 0xE6C2B449, 0 },
+ { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 },
+ { 0xE58C544A, 0 }, { 0xE525982B, 0 },
+ { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 },
+ { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 },
+ { 0xE32942FF, 0 }, { 0xE2C4A689, 0 },
+ { 0xE260630B, 0 }, { 0xE1FC780F, 0 },
+ { 0xE198E520, 0 }, { 0xE135A9CA, 0 },
+ { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 },
+ { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 },
+ { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 },
+ { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 },
+ { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 },
+ { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 },
+ { 0xD95DD300, 0 }, { 0xD901B204, 0 },
+ { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 },
+ { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 },
+ { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 },
+ { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 },
+ { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 },
+ { 0xD46F3235, 0 }, { 0xD417328A, 0 },
+ { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 },
+ { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 },
+ { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E },
+ { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 },
+ { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 },
+ { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 },
+ { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 },
+ { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 },
+ { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC7B0200, 0 }, { 0xCC29786D, 0 },
+ { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 },
+ { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F },
+ { 0xCA95906C, 0 }, { 0xCA4587E7, 0 },
+ { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 },
+ { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 },
+ { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 },
+ { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 },
+ { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 },
+ { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 },
+ { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 },
+ { 0xC5B200C6, 0 }, { 0xC565C87C, 0 },
+ { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 },
+ { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 },
+ { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 },
+ { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 },
+ { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C },
+ { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 },
+ { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF589280, 0 }, { 0xBF112A8B, 0 },
+ { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 },
+ { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBD231803, 0 }, { 0xBCDD535E, 0 },
+ { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 },
+ { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 },
+ { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 },
+ { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 },
+ { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A },
+ { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA },
+ { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 },
+ { 0xB8594B41, 0 }, { 0xB81702E1, 0 },
+ { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 },
+ { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 },
+ { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 },
+ { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 },
+ { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 },
+ { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 },
+ { 0xB34E1884, 0 }, { 0xB30F6353, 0 },
+ { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 },
+ { 0xB25449D7, 0 }, { 0xB21642C9, 0 },
+ { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 },
+ { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 },
+ { 0xB068BE31, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 },
+ { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 },
+ { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 },
+ { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 },
+ { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 },
+ { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 },
+ { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 },
+ { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 },
+ { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 },
+ { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 },
+ { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 },
+ { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 },
+ { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 },
+ { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 },
+ { 0xA765AE44, 0 }, { 0xA72F053A, 0 },
+ { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 },
+ { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED },
+ { 0xA5B4449D, 0 }, { 0xA57EB503, 0 },
+ { 0xA54947FE, 0 }, { 0xA513FD6C, 0 },
+ { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 },
+ { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 },
+ { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 },
+ { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 },
+ { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 },
+ { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 },
+ { 0xA19E2540, 0 }, { 0xA16B312F, 0 },
+ { 0xA1385D35, 0 }, { 0xA105A933, 0 },
+ { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 },
+ { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 },
+ { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 },
+ { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D },
+ { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 },
+ { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 },
+ { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 },
+ { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 },
+ { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 },
+ { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B },
+ { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E },
+ { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 },
+ { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 },
+ { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 },
+ { 0x9A056A31, 0 }, { 0x99D722DB, 0 },
+ { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B },
+ { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 },
+ { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 },
+ { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 },
+ { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97874039, 0 }, { 0x975A7510, 0 },
+ { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 },
+ { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 },
+ { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x95747844, 0 }, { 0x9548E498, 0 },
+ { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94C6C187, 0 }, { 0x949B92DE, 0 },
+ { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 },
+ { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 },
+ { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x9370049C, 0 }, { 0x93459BE7, 0 },
+ { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 },
+ { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 },
+ { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 },
+ { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 },
+ { 0x917952AF, 0 }, { 0x91500915, 0x91500915 },
+ { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 },
+ { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 },
+ { 0x9031910A, 0 }, { 0x90090090, 0x90090090 },
+ { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 },
+ { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 },
+ { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 },
+ { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 },
+ { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 },
+ { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF },
+ { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C7C057D, 0 }, { 0x8C55841D, 0 },
+ { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 },
+ { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B4A451A, 0 }, { 0x8B246A88, 0 },
+ { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 },
+ { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 },
+ { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 },
+ { 0x89D3507D, 0 }, { 0x89AE408A, 0 },
+ { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F },
+ { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 },
+ { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD },
+ { 0x88ACFAEE, 0 }, { 0x88888889, 0 },
+ { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 },
+ { 0x881BA59E, 0 }, { 0x87F78088, 0 },
+ { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 },
+ { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC },
+ { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 },
+ { 0x86B58AA8, 0 }, { 0x869222B2, 0 },
+ { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 },
+ { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 },
+ { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 },
+ { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 },
+ { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 },
+ { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x8487A2D1, 0 }, { 0x84655D9C, 0 },
+ { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+ { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+ { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+ { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+ { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+ { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+ { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+ { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+ { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+ { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+ { 0x8163D283, 0 }, { 0x814327E4, 0 },
+ { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+ { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+ { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+ { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/third_party/aom/av1/common/odintrin.h b/third_party/aom/av1/common/odintrin.h
new file mode 100644
index 0000000000..e1db0f44d8
--- /dev/null
+++ b/third_party/aom/av1/common/odintrin.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AV1_COMMON_ODINTRIN_H_
+#define AOM_AV1_COMMON_ODINTRIN_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d) \
+ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \
+ 32) >> \
+ (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
+
+/*Enable special features for gcc and compatible compilers (version >= maj.min.pat).*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat) \
+ ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 16) + ((min) << 8) + (pat)) // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+/** Copy n elements of memory from src to dst. The 0* term provides
+ compile-time type checking */
+#if !defined(OVERRIDE_OD_COPY)
+#define OD_COPY(dst, src, n) \
+ (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src))))
+#endif
+
+/** Copy n elements of memory from src to dst, allowing overlapping regions.
+ The 0* term provides compile-time type checking */
+#if !defined(OVERRIDE_OD_MOVE)
+# define OD_MOVE(dst, src, n) \
+ (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) ))
+#endif
+
+/*OD_SIGNMASK(a): -1 if a < 0, else 0. OD_FLIPSIGNI(a, b): -a for integer a if b < 0 (two's complement), else a.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ODINTRIN_H_
diff --git a/third_party/aom/av1/common/onyxc_int.h b/third_party/aom/av1/common/onyxc_int.h
new file mode 100644
index 0000000000..ff011c89e9
--- /dev/null
+++ b/third_party/aom/av1/common/onyxc_int.h
@@ -0,0 +1,1342 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_ONYXC_INT_H_
+#define AOM_AV1_COMMON_ONYXC_INT_H_
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/enums.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/mv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/timing.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/hash_motion.h"
+#include "aom_dsp/grain_synthesis.h"
+#include "aom_dsp/grain_table.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) && defined(__has_warning)
+#if __has_feature(cxx_attributes) && __has_warning("-Wimplicit-fallthrough")
+#define AOM_FALLTHROUGH_INTENDED [[clang::fallthrough]] // NOLINT
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 7
+#define AOM_FALLTHROUGH_INTENDED __attribute__((fallthrough)) // NOLINT
+#endif
+
+#ifndef AOM_FALLTHROUGH_INTENDED
+#define AOM_FALLTHROUGH_INTENDED \
+ do { \
+ } while (0)
+#endif
+
+#define CDEF_MAX_STRENGTHS 16
+
+/* Constant values while waiting for the sequence header */
+#define FRAME_ID_LENGTH 15
+#define DELTA_FRAME_ID_LENGTH 14
+
+#define FRAME_CONTEXTS (FRAME_BUFFERS + 1)
+// Extra frame context which is always kept at default values
+#define FRAME_CONTEXT_DEFAULTS (FRAME_CONTEXTS - 1)
+#define PRIMARY_REF_BITS 3
+#define PRIMARY_REF_NONE 7
+
+#define NUM_PING_PONG_BUFFERS 2
+
+#define MAX_NUM_TEMPORAL_LAYERS 8
+#define MAX_NUM_SPATIAL_LAYERS 4
+/* clang-format off */
+// clang-format seems to think this is a pointer dereference and not a
+// multiplication. Parenthesized so it expands safely inside any expression.
+#define MAX_NUM_OPERATING_POINTS \
+ (MAX_NUM_TEMPORAL_LAYERS * MAX_NUM_SPATIAL_LAYERS)
+/* clang-format on */
+
+// TODO(jingning): Turning this on to set up transform coefficient
+// processing timer.
+#define TXCOEFF_TIMER 0
+#define TXCOEFF_COST_TIMER 0
+
+typedef enum {
+ SINGLE_REFERENCE = 0,
+ COMPOUND_REFERENCE = 1,
+ REFERENCE_MODE_SELECT = 2,
+ REFERENCE_MODES = 3,
+} REFERENCE_MODE;
+
+typedef enum {
+ /**
+ * Frame context updates are disabled
+ */
+ REFRESH_FRAME_CONTEXT_DISABLED,
+ /**
+ * Update frame context to values resulting from backward probability
+ * updates based on entropy/counts in the decoded frame
+ */
+ REFRESH_FRAME_CONTEXT_BACKWARD,
+} REFRESH_FRAME_CONTEXT_MODE;
+
+#define MFMV_STACK_SIZE 3
+typedef struct {
+ int_mv mfmv0;
+ uint8_t ref_frame_offset;
+} TPL_MV_REF;
+
+typedef struct {
+ int_mv mv;
+ MV_REFERENCE_FRAME ref_frame;
+} MV_REF;
+
+typedef struct {
+ int ref_count;
+
+ unsigned int cur_frame_offset;
+ unsigned int ref_frame_offset[INTER_REFS_PER_FRAME];
+
+ MV_REF *mvs;
+ uint8_t *seg_map;
+ struct segmentation seg;
+ int mi_rows;
+ int mi_cols;
+ // Width and height give the size of the buffer (before any upscaling, unlike
+ // the sizes that can be derived from the buf structure)
+ int width;
+ int height;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ int showable_frame; // frame can be used as show existing frame in future
+ int film_grain_params_present;
+ aom_film_grain_t film_grain_params;
+ aom_codec_frame_buffer_t raw_frame_buffer;
+ YV12_BUFFER_CONFIG buf;
+ hash_table hash_table;
+ uint8_t intra_only;
+ FRAME_TYPE frame_type;
+ // The Following variables will only be used in frame parallel decode.
+
+ // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means
+ // that no FrameWorker owns, or is decoding, this buffer.
+ AVxWorker *frame_worker_owner;
+
+ // row and col indicate which position frame has been decoded to in real
+ // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX
+ // when the frame is fully decoded.
+ int row;
+ int col;
+
+ // Inter frame reference frame delta for loop filter
+ int8_t ref_deltas[REF_FRAMES];
+
+ // 0 = ZERO_MV, MV
+ int8_t mode_deltas[MAX_MODE_LF_DELTAS];
+} RefCntBuffer;
+
+typedef struct BufferPool {
+// Protect BufferPool from being accessed by several FrameWorkers at
+// the same time during frame parallel decode.
+// TODO(hkuang): Try to use atomic variable instead of locking the whole pool.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t pool_mutex;
+#endif
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ RefCntBuffer frame_bufs[FRAME_BUFFERS];
+
+ // Frame buffers allocated internally by the codec.
+ InternalFrameBufferList int_frame_buffers;
+} BufferPool;
+
+typedef struct {
+ int base_ctx_table[2 /*row*/][2 /*col*/][3 /*sig_map*/]
+ [BASE_CONTEXT_POSITION_NUM + 1];
+} LV_MAP_CTX_TABLE;
+typedef int BASE_CTX_TABLE[2 /*col*/][3 /*sig_map*/]
+ [BASE_CONTEXT_POSITION_NUM + 1];
+
+typedef struct BitstreamLevel {
+ uint8_t major;
+ uint8_t minor;
+} BitstreamLevel;
+
+// Sequence header structure.
+// Note: All syntax elements of sequence_header_obu that need to be
+// bit-identical across multiple sequence headers must be part of this struct,
+// so that consistency is checked by are_seq_headers_consistent() function.
+typedef struct SequenceHeader {
+ int num_bits_width;
+ int num_bits_height;
+ int max_frame_width;
+ int max_frame_height;
+ int frame_id_numbers_present_flag;
+ int frame_id_length;
+ int delta_frame_id_length;
+ BLOCK_SIZE sb_size; // Size of the superblock used for this frame
+ int mib_size; // Size of the superblock in units of MI blocks
+ int mib_size_log2; // Log 2 of above.
+ int order_hint_bits_minus_1;
+ int force_screen_content_tools; // 0 - force off
+ // 1 - force on
+ // 2 - adaptive
+ int force_integer_mv; // 0 - Not to force. MV can be in 1/4 or 1/8
+ // 1 - force to integer
+ // 2 - adaptive
+ int still_picture; // Video is a single frame still picture
+ int reduced_still_picture_hdr; // Use reduced header for still picture
+ int enable_filter_intra; // enables/disables filterintra
+ int enable_intra_edge_filter; // enables/disables corner/edge/upsampling
+ int enable_interintra_compound; // enables/disables interintra_compound
+ int enable_masked_compound; // enables/disables masked compound
+ int enable_dual_filter; // 0 - disable dual interpolation filter
+ // 1 - enable vert/horiz filter selection
+ int enable_order_hint; // 0 - disable order hint, and related tools
+ // jnt_comp, ref_frame_mvs, frame_sign_bias
+ // if 0, enable_jnt_comp and
+ // enable_ref_frame_mvs must be set to 0.
+ int enable_jnt_comp; // 0 - disable joint compound modes
+ // 1 - enable it
+ int enable_ref_frame_mvs; // 0 - disable ref frame mvs
+ // 1 - enable it
+ int enable_warped_motion; // 0 - disable warped motion for sequence
+ // 1 - enable it for the sequence
+ int enable_superres; // 0 - Disable superres for the sequence, and disable
+ // transmitting per-frame superres enabled flag.
+ // 1 - Enable superres for the sequence, and also
+ // enable per-frame flag to denote if superres is
+ // enabled for that frame.
+ int enable_cdef; // To turn on/off CDEF
+ int enable_restoration; // To turn on/off loop restoration
+ BITSTREAM_PROFILE profile;
+
+ // Operating point info.
+ int operating_points_cnt_minus_1;
+ int operating_point_idc[MAX_NUM_OPERATING_POINTS];
+ int display_model_info_present_flag;
+ int decoder_model_info_present_flag;
+ BitstreamLevel level[MAX_NUM_OPERATING_POINTS];
+ uint8_t tier[MAX_NUM_OPERATING_POINTS]; // seq_tier in the spec. One bit: 0
+ // or 1.
+
+ // Color config.
+ aom_bit_depth_t bit_depth; // AOM_BITS_8 in profile 0 or 1,
+ // AOM_BITS_10 or AOM_BITS_12 in profile 2 or 3.
+ int use_highbitdepth; // If true, we need to use 16bit frame buffers.
+ int monochrome; // Monochrome video
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ int color_range;
+ int subsampling_x; // Chroma subsampling for x
+ int subsampling_y; // Chroma subsampling for y
+ aom_chroma_sample_position_t chroma_sample_position;
+ int separate_uv_delta_q;
+
+ int film_grain_params_present;
+} SequenceHeader;
+
+typedef struct AV1Common {
+ struct aom_internal_error_info error;
+ int width;
+ int height;
+ int render_width;
+ int render_height;
+ int last_width;
+ int last_height;
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ int buffer_removal_time_present;
+ aom_dec_model_info_t buffer_model;
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+ aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+ uint32_t frame_presentation_time;
+
+ int largest_tile_id;
+ size_t largest_tile_size;
+ int context_update_tile_id;
+
+ // Scale of the current frame with respect to itself.
+ struct scale_factors sf_identity;
+
+ YV12_BUFFER_CONFIG *frame_to_show;
+ RefCntBuffer *prev_frame;
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd.
+ RefCntBuffer *cur_frame;
+
+ int ref_frame_map[REF_FRAMES]; /* maps fb_idx to reference slot */
+
+ // Prepare ref_frame_map for the next frame.
+ // Only used in frame parallel decode.
+ int next_ref_frame_map[REF_FRAMES];
+
+ // TODO(jkoleszar): could expand active_ref_idx to 4, with 0 as intra, and
+ // roll new_fb_idx into it.
+
+ // Each Inter frame can reference INTER_REFS_PER_FRAME buffers
+ RefBuffer frame_refs[INTER_REFS_PER_FRAME];
+ int is_skip_mode_allowed;
+ int skip_mode_flag;
+ int ref_frame_idx_0;
+ int ref_frame_idx_1;
+
+ int new_fb_idx;
+
+ FRAME_TYPE last_frame_type; /* last frame's frame type for motion search.*/
+ FRAME_TYPE frame_type;
+
+ int show_frame;
+ int showable_frame; // frame can be used as show existing frame in future
+ int last_show_frame;
+ int show_existing_frame;
+ // Flag for a frame used as a reference - not written to the bitstream
+ int is_reference_frame;
+ int reset_decoder_state;
+
+ // Flag signaling that the frame is encoded using only INTRA modes.
+ uint8_t intra_only;
+ uint8_t last_intra_only;
+ uint8_t disable_cdf_update;
+ int allow_high_precision_mv;
+ int cur_frame_force_integer_mv; // 0 the default in AOM, 1 only integer
+
+ int allow_screen_content_tools;
+ int allow_intrabc;
+ int allow_warped_motion;
+
+ // MBs, mb_rows/cols is in 16-pixel units; mi_rows/cols is in
+ // MB_MODE_INFO (8-pixel) units.
+ int MBs;
+ int mb_rows, mi_rows;
+ int mb_cols, mi_cols;
+ int mi_stride;
+
+ /* profile settings */
+ TX_MODE tx_mode;
+
+#if CONFIG_ENTROPY_STATS
+ int coef_cdf_category;
+#endif
+
+ int base_qindex;
+ int y_dc_delta_q;
+ int u_dc_delta_q;
+ int v_dc_delta_q;
+ int u_ac_delta_q;
+ int v_ac_delta_q;
+
+ // The dequantizers below are true dequantizers used only in the
+ // dequantization process. They have the same coefficient
+ // shift/scale as TX.
+ int16_t y_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t u_dequant_QTX[MAX_SEGMENTS][2];
+ int16_t v_dequant_QTX[MAX_SEGMENTS][2];
+
+ // Global quant matrix tables
+ const qm_val_t *giqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+ const qm_val_t *gqmatrix[NUM_QM_LEVELS][3][TX_SIZES_ALL];
+
+ // Local quant matrix tables for each frame
+ const qm_val_t *y_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *u_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+ const qm_val_t *v_iqmatrix[MAX_SEGMENTS][TX_SIZES_ALL];
+
+ // Encoder
+ int using_qmatrix;
+ int qm_y;
+ int qm_u;
+ int qm_v;
+ int min_qmlevel;
+ int max_qmlevel;
+
+ /* We allocate a MB_MODE_INFO struct for each macroblock, together with
+ an extra row on top and column on the left to simplify prediction. */
+ int mi_alloc_size;
+ MB_MODE_INFO *mip; /* Base of allocated array */
+ MB_MODE_INFO *mi; /* Corresponds to upper left visible macroblock */
+
+ // TODO(agrange): Move prev_mi into encoder structure.
+ // prev_mip and prev_mi will only be allocated in encoder.
+ MB_MODE_INFO *prev_mip; /* MB_MODE_INFO array 'mip' from last decoded frame */
+ MB_MODE_INFO *prev_mi; /* 'mi' from last frame (points into prev_mip) */
+
+ // Separate mi functions between encoder and decoder.
+ int (*alloc_mi)(struct AV1Common *cm, int mi_size);
+ void (*free_mi)(struct AV1Common *cm);
+ void (*setup_mi)(struct AV1Common *cm);
+
+ // Grid of pointers to 8x8 MB_MODE_INFO structs. Any 8x8 not in the visible
+ // area will be NULL.
+ MB_MODE_INFO **mi_grid_base;
+ MB_MODE_INFO **mi_grid_visible;
+ MB_MODE_INFO **prev_mi_grid_base;
+ MB_MODE_INFO **prev_mi_grid_visible;
+
+ // Whether to use previous frames' motion vectors for prediction.
+ int allow_ref_frame_mvs;
+
+ uint8_t *last_frame_seg_map;
+ uint8_t *current_frame_seg_map;
+ int seg_map_alloc_size;
+
+ InterpFilter interp_filter;
+
+ int switchable_motion_mode;
+
+ loop_filter_info_n lf_info;
+ // The denominator of the superres scale; the numerator is fixed.
+ uint8_t superres_scale_denominator;
+ int superres_upscaled_width;
+ int superres_upscaled_height;
+ RestorationInfo rst_info[MAX_MB_PLANE];
+
+ // rst_end_stripe[i] is one more than the index of the bottom stripe
+ // for tile row i.
+ int rst_end_stripe[MAX_TILE_ROWS];
+
+ // Pointer to a scratch buffer used by self-guided restoration
+ int32_t *rst_tmpbuf;
+ RestorationLineBuffers *rlbs;
+
+ // Output of loop restoration
+ YV12_BUFFER_CONFIG rst_frame;
+
+ // Flag signaling how frame contexts should be updated at the end of
+ // a frame decode
+ REFRESH_FRAME_CONTEXT_MODE refresh_frame_context;
+
+ int ref_frame_sign_bias[REF_FRAMES]; /* Two state 0, 1 */
+
+ struct loopfilter lf;
+ struct segmentation seg;
+ int coded_lossless; // frame is fully lossless at the coded resolution.
+ int all_lossless; // frame is fully lossless at the upscaled resolution.
+
+ int reduced_tx_set_used;
+
+ // Context probabilities for reference frame prediction
+ MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
+ MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+ REFERENCE_MODE reference_mode;
+
+ FRAME_CONTEXT *fc; /* this frame entropy */
+ FRAME_CONTEXT *frame_contexts; // FRAME_CONTEXTS
+ unsigned int frame_context_idx; /* Context to use/update */
+ int fb_of_context_type[REF_FRAMES];
+ int primary_ref_frame;
+
+ unsigned int frame_offset;
+
+ unsigned int current_video_frame;
+
+ aom_bit_depth_t dequant_bit_depth; // bit_depth of current dequantizer
+
+ int error_resilient_mode;
+ int force_primary_ref_none;
+
+ int tile_cols, tile_rows;
+ int last_tile_cols, last_tile_rows;
+
+ int max_tile_width_sb;
+ int min_log2_tile_cols;
+ int max_log2_tile_cols;
+ int max_log2_tile_rows;
+ int min_log2_tile_rows;
+ int min_log2_tiles;
+ int max_tile_height_sb;
+ int uniform_tile_spacing_flag;
+ int log2_tile_cols; // only valid for uniform tiles
+ int log2_tile_rows; // only valid for uniform tiles
+ int tile_col_start_sb[MAX_TILE_COLS + 1]; // valid for 0 <= i <= tile_cols
+ int tile_row_start_sb[MAX_TILE_ROWS + 1]; // valid for 0 <= i <= tile_rows
+ int tile_width, tile_height; // In MI units
+
+ unsigned int large_scale_tile;
+ unsigned int single_tile_decoding;
+
+ int byte_alignment;
+ int skip_loop_filter;
+ int skip_film_grain;
+
+ // Private data associated with the frame buffer callbacks.
+ void *cb_priv;
+ aom_get_frame_buffer_cb_fn_t get_fb_cb;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb;
+
+ // Handles memory for the codec.
+ InternalFrameBufferList int_frame_buffers;
+
+ // External BufferPool passed from outside.
+ BufferPool *buffer_pool;
+
+ PARTITION_CONTEXT **above_seg_context;
+ ENTROPY_CONTEXT **above_context[MAX_MB_PLANE];
+ TXFM_CONTEXT **above_txfm_context;
+ WarpedMotionParams global_motion[REF_FRAMES];
+ aom_film_grain_t film_grain_params;
+
+ int cdef_pri_damping;
+ int cdef_sec_damping;
+ int nb_cdef_strengths;
+ int cdef_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_uv_strengths[CDEF_MAX_STRENGTHS];
+ int cdef_bits;
+
+ int delta_q_present_flag;
+ // Resolution of delta quant
+ int delta_q_res;
+ int delta_lf_present_flag;
+ // Resolution of delta lf level
+ int delta_lf_res;
+ // This is a flag for number of deltas of loop filter level
+ // 0: use 1 delta, for y_vertical, y_horizontal, u, and v
+ // 1: use separate deltas for each filter level
+ int delta_lf_multi;
+ int num_tg;
+ SequenceHeader seq_params;
+ int current_frame_id;
+ int ref_frame_id[REF_FRAMES];
+ int valid_for_referencing[REF_FRAMES];
+ int invalid_delta_frame_id_minus_1;
+ LV_MAP_CTX_TABLE coeff_ctx_table;
+ TPL_MV_REF *tpl_mvs;
+ int tpl_mvs_mem_size;
+ // TODO(jingning): This can be combined with sign_bias later.
+ int8_t ref_frame_side[REF_FRAMES];
+
+ int is_annexb;
+
+ int frame_refs_short_signaling;
+ int temporal_layer_id;
+ int spatial_layer_id;
+ unsigned int number_temporal_layers;
+ unsigned int number_spatial_layers;
+ int num_allocated_above_context_mi_col;
+ int num_allocated_above_contexts;
+ int num_allocated_above_context_planes;
+
+#if TXCOEFF_TIMER
+ int64_t cum_txcoeff_timer;
+ int64_t txcoeff_timer;
+ int txb_count;
+#endif
+
+#if TXCOEFF_COST_TIMER
+ int64_t cum_txcoeff_cost_timer;
+ int64_t txcoeff_cost_timer;
+ int64_t txcoeff_cost_count;
+#endif
+ const cfg_options_t *options;
+} AV1_COMMON;
+
+// TODO(hkuang): Don't need to lock the whole pool after implementing atomic
+// frame reference count.
+// Locks pool->pool_mutex when CONFIG_MULTITHREAD is enabled. Otherwise this is
+// a no-op; the cast to void only marks the parameter as used.
+static void lock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+// Unlocks pool->pool_mutex when CONFIG_MULTITHREAD is enabled. Otherwise this
+// is a no-op; the cast to void only marks the parameter as used.
+static void unlock_buffer_pool(BufferPool *const pool) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pool->pool_mutex);
+#else
+ (void)pool;
+#endif
+}
+
+// Returns the frame buffer that reference slot |index| maps to, or NULL when
+// |index| is outside [0, REF_FRAMES) or the slot holds a negative buffer index.
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame(AV1_COMMON *cm, int index) {
+  if (index < 0 || index >= REF_FRAMES) return NULL;
+
+  const int buf_idx = cm->ref_frame_map[index];
+  if (buf_idx < 0) return NULL;
+
+  assert(buf_idx < FRAME_BUFFERS);
+  return &cm->buffer_pool->frame_bufs[buf_idx].buf;
+}
+
+// Returns the pool buffer selected by cm->new_fb_idx.
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+    const AV1_COMMON *const cm) {
+  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  return &frame_bufs[cm->new_fb_idx].buf;
+}
+
+// Claims the first pool buffer whose ref_count is zero by setting its
+// ref_count to 1, and returns its index. Returns INVALID_IDX when no buffer is
+// free. The search and the claim both happen while the buffer pool is locked.
+static INLINE int get_free_fb(AV1_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+  int free_idx = INVALID_IDX;
+
+  lock_buffer_pool(pool);
+  for (int i = 0; i < FRAME_BUFFERS; ++i) {
+    if (frame_bufs[i].ref_count != 0) continue;
+
+    YV12_BUFFER_CONFIG *const ybf = &frame_bufs[i].buf;
+    if (ybf->use_external_reference_buffers) {
+      // The y/u/v pointers currently refer to external reference buffers.
+      // Point them back at the internally allocated memory recorded in
+      // store_buf_adr before the buffer is reused.
+      ybf->y_buffer = ybf->store_buf_adr[0];
+      ybf->u_buffer = ybf->store_buf_adr[1];
+      ybf->v_buffer = ybf->store_buf_adr[2];
+      ybf->use_external_reference_buffers = 0;
+    }
+
+    frame_bufs[i].ref_count = 1;
+    free_idx = i;
+    break;
+  }
+  unlock_buffer_pool(pool);
+
+  return free_idx;
+}
+
+// Re-points the reference stored in *idx at buffer new_idx. The previously
+// referenced buffer (if *idx was non-negative and its count was positive)
+// loses one reference, *idx becomes new_idx, and bufs[new_idx] gains one.
+static INLINE void ref_cnt_fb(RefCntBuffer *bufs, int *idx, int new_idx) {
+  const int old_idx = *idx;
+  if (old_idx >= 0) {
+    RefCntBuffer *const old_buf = &bufs[old_idx];
+    if (old_buf->ref_count > 0) --old_buf->ref_count;
+  }
+
+  *idx = new_idx;
+  ++bufs[new_idx].ref_count;
+}
+
+// Returns 1 for key frames and for frames with the intra_only flag set,
+// otherwise 0.
+static INLINE int frame_is_intra_only(const AV1_COMMON *const cm) {
+  if (cm->frame_type == KEY_FRAME) return 1;
+  return cm->intra_only != 0;
+}
+
+// Returns 1 when the frame type is S_FRAME, otherwise 0.
+static INLINE int frame_is_sframe(const AV1_COMMON *cm) {
+  const int is_sframe = (cm->frame_type == S_FRAME);
+  return is_sframe;
+}
+
+// Returns the pool buffer referenced by the primary reference frame, or NULL
+// when there is no primary reference (PRIMARY_REF_NONE) or that reference has
+// no buffer assigned (INVALID_IDX).
+static INLINE RefCntBuffer *get_prev_frame(const AV1_COMMON *const cm) {
+  const int primary = cm->primary_ref_frame;
+  if (primary == PRIMARY_REF_NONE) return NULL;
+
+  const int buf_idx = cm->frame_refs[primary].idx;
+  if (buf_idx == INVALID_IDX) return NULL;
+
+  return &cm->buffer_pool->frame_bufs[buf_idx];
+}
+
+// Returns 1 if this frame might allow mvs from some reference frame: error
+// resilient mode is off, the sequence enables both ref-frame mvs and order
+// hints, and the frame is not intra-only.
+static INLINE int frame_might_allow_ref_frame_mvs(const AV1_COMMON *cm) {
+  if (cm->error_resilient_mode) return 0;
+  if (!cm->seq_params.enable_ref_frame_mvs) return 0;
+  if (!cm->seq_params.enable_order_hint) return 0;
+  return !frame_is_intra_only(cm);
+}
+
+// Returns 1 if this frame might use warped_motion: error resilient mode is
+// off, the frame is not intra-only, and the sequence enables warped motion.
+static INLINE int frame_might_allow_warped_motion(const AV1_COMMON *cm) {
+  if (cm->error_resilient_mode) return 0;
+  if (frame_is_intra_only(cm)) return 0;
+  return cm->seq_params.enable_warped_motion != 0;
+}
+
+// Ensures |buf| owns motion-vector (mvs) and segmentation-map (seg_map)
+// storage matching the current frame dimensions in |cm|, and that cm->tpl_mvs
+// is at least as large as the current frame requires.
+//
+// buf->mvs and buf->seg_map are reallocated (zero-filled) when buf->mvs is
+// NULL or the dimensions recorded in |buf| differ from cm->mi_rows/mi_cols.
+// cm->tpl_mvs is reallocated only when it is NULL or smaller than needed.
+// Allocation results are passed through CHECK_MEM_ERROR (defined elsewhere).
+static INLINE void ensure_mv_buffer(RefCntBuffer *buf, AV1_COMMON *cm) {
+  const int buf_rows = buf->mi_rows;
+  const int buf_cols = buf->mi_cols;
+
+  if (buf->mvs == NULL || buf_rows != cm->mi_rows || buf_cols != cm->mi_cols) {
+    aom_free(buf->mvs);
+    buf->mi_rows = cm->mi_rows;
+    buf->mi_cols = cm->mi_cols;
+    // One MV_REF per 2x2 group of mi units, with both dimensions rounded up.
+    CHECK_MEM_ERROR(cm, buf->mvs,
+                    (MV_REF *)aom_calloc(
+                        ((cm->mi_rows + 1) >> 1) * ((cm->mi_cols + 1) >> 1),
+                        sizeof(*buf->mvs)));
+    aom_free(buf->seg_map);
+    // One segment id per mi unit.
+    CHECK_MEM_ERROR(cm, buf->seg_map,
+                    (uint8_t *)aom_calloc(cm->mi_rows * cm->mi_cols,
+                                          sizeof(*buf->seg_map)));
+  }
+
+  const int mem_size =
+      ((cm->mi_rows + MAX_MIB_SIZE) >> 1) * (cm->mi_stride >> 1);
+  // Named need_realloc (not realloc) so the local does not shadow the C
+  // library function of the same name.
+  const int need_realloc =
+      cm->tpl_mvs == NULL || cm->tpl_mvs_mem_size < mem_size;
+
+  if (need_realloc) {
+    aom_free(cm->tpl_mvs);
+    CHECK_MEM_ERROR(cm, cm->tpl_mvs,
+                    (TPL_MV_REF *)aom_calloc(mem_size, sizeof(*cm->tpl_mvs)));
+    cm->tpl_mvs_mem_size = mem_size;
+  }
+}
+
+void cfl_init(CFL_CTX *cfl, const SequenceHeader *seq_params);
+
+// Returns 1 for monochrome sequences and MAX_MB_PLANE otherwise.
+static INLINE int av1_num_planes(const AV1_COMMON *cm) {
+  if (cm->seq_params.monochrome) return 1;
+  return MAX_MB_PLANE;
+}
+
+// Points xd's above-context pointers (per-plane entropy context, segmentation
+// context and transform context) at the buffers cm holds for |tile_row|.
+static INLINE void av1_init_above_context(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          const int tile_row) {
+  const int plane_count = av1_num_planes(cm);
+  int plane = 0;
+  while (plane < plane_count) {
+    xd->above_context[plane] = cm->above_context[plane][tile_row];
+    ++plane;
+  }
+  xd->above_seg_context = cm->above_seg_context[tile_row];
+  xd->above_txfm_context = cm->above_txfm_context[tile_row];
+}
+
+// Initializes per-plane decode state in |xd| from |cm|: every active plane
+// gets the shared |dqcoeff| buffer plus a copy of the matching per-segment
+// dequantizer and inverse quant-matrix tables. Also copies mi_stride, hooks up
+// the error-info pointer and initializes the CfL context.
+static INLINE void av1_init_macroblockd(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        tran_low_t *dqcoeff) {
+  const int plane_count = av1_num_planes(cm);
+  for (int plane = 0; plane < plane_count; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const void *dequant_src;
+    const void *iqmatrix_src;
+
+    // Luma-type planes use the Y tables; otherwise pick U or V by plane index.
+    if (pd->plane_type == PLANE_TYPE_Y) {
+      dequant_src = cm->y_dequant_QTX;
+      iqmatrix_src = cm->y_iqmatrix;
+    } else if (plane == AOM_PLANE_U) {
+      dequant_src = cm->u_dequant_QTX;
+      iqmatrix_src = cm->u_iqmatrix;
+    } else {
+      dequant_src = cm->v_dequant_QTX;
+      iqmatrix_src = cm->v_iqmatrix;
+    }
+
+    pd->dqcoeff = dqcoeff;
+    // The Y, U and V tables are declared with identical types, so the Y sizes
+    // apply to whichever source was selected.
+    memcpy(pd->seg_dequant_QTX, dequant_src, sizeof(cm->y_dequant_QTX));
+    memcpy(pd->seg_iqmatrix, iqmatrix_src, sizeof(cm->y_iqmatrix));
+  }
+
+  xd->mi_stride = cm->mi_stride;
+  xd->error_info = &cm->error;
+  cfl_init(&xd->cfl, &cm->seq_params);
+}
+
+// Sets each plane's above/left entropy-context pointers for the block at
+// (mi_row, mi_col). For a subsampled plane, an odd position combined with a
+// block that is one mi unit in that dimension pulls the offset back by one.
+// Note that an adjusted offset stays in effect for the planes that follow.
+static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                    const int num_planes) {
+  int ctx_row = mi_row;
+  int ctx_col = mi_col;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+    if (pd->subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1)) {
+      ctx_row = mi_row - 1;
+    }
+    if (pd->subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1)) {
+      ctx_col = mi_col - 1;
+    }
+
+    const int above_idx = ctx_col >> pd->subsampling_x;
+    const int left_idx = (ctx_row & MAX_MIB_MASK) >> pd->subsampling_y;
+    pd->above_context = &xd->above_context[plane][above_idx];
+    pd->left_context = &xd->left_context[plane][left_idx];
+  }
+}
+
+// Rounds |len| (in mi units) up via ALIGN_POWER_OF_TWO with
+// MAX_MIB_SIZE_LOG2, i.e. to a multiple of SBs.
+static INLINE int calc_mi_size(int len) {
+  const int aligned_len = ALIGN_POWER_OF_TWO(len, MAX_MIB_SIZE_LOG2);
+  return aligned_len;
+}
+
+// Sets each plane's width/height for a block of bw x bh mi units: the size is
+// scaled by MI_SIZE, shifted down by the plane's subsampling, and clamped to
+// a minimum of 4 in each dimension.
+static INLINE void set_plane_n4(MACROBLOCKD *const xd, int bw, int bh,
+                                const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+
+    pd->width = (bw * MI_SIZE) >> pd->subsampling_x;
+    pd->height = (bh * MI_SIZE) >> pd->subsampling_y;
+
+    if (pd->width < 4) pd->width = 4;
+    if (pd->height < 4) pd->height = 4;
+  }
+}
+
+// Records the position-dependent state of the current block in |xd|:
+// distances to the frame edges, availability of the above/left neighbours
+// (for luma and chroma), pointers to those neighbours' mode info, the block
+// size in mi units (n4_w/n4_h) and the is_sec_rect flag.
+//   tile            - tile bounds; neighbours outside the tile are unavailable
+//   mi_row, mi_col  - block position in mi units
+//   bh, bw          - block height/width in mi units
+//   mi_rows, mi_cols- frame size in mi units
+// xd->mi and xd->mi_stride must already be set, since neighbours are read
+// relative to xd->mi.
+static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
+ int mi_row, int bh, int mi_col, int bw,
+ int mi_rows, int mi_cols) {
+ // Edge distances are mi distances scaled by MI_SIZE and then by 8. The
+ // top/left values are zero or negative.
+ xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+ xd->mb_to_bottom_edge = ((mi_rows - bh - mi_row) * MI_SIZE) * 8;
+ xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+ xd->mb_to_right_edge = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
+
+ // Are edges available for intra prediction?
+ xd->up_available = (mi_row > tile->mi_row_start);
+
+ // Chroma subsampling is taken from plane 1.
+ const int ss_x = xd->plane[1].subsampling_x;
+ const int ss_y = xd->plane[1].subsampling_y;
+
+ xd->left_available = (mi_col > tile->mi_col_start);
+ xd->chroma_up_available = xd->up_available;
+ xd->chroma_left_available = xd->left_available;
+ // For subsampled chroma and a block narrower/shorter than BLOCK_8X8, the
+ // chroma neighbour test is made one mi unit further left/up.
+ if (ss_x && bw < mi_size_wide[BLOCK_8X8])
+ xd->chroma_left_available = (mi_col - 1) > tile->mi_col_start;
+ if (ss_y && bh < mi_size_high[BLOCK_8X8])
+ xd->chroma_up_available = (mi_row - 1) > tile->mi_row_start;
+ if (xd->up_available) {
+ xd->above_mbmi = xd->mi[-xd->mi_stride];
+ } else {
+ xd->above_mbmi = NULL;
+ }
+
+ if (xd->left_available) {
+ xd->left_mbmi = xd->mi[-1];
+ } else {
+ xd->left_mbmi = NULL;
+ }
+
+ // Same condition as is_chroma_reference(), expressed with bw/bh directly.
+ // When it is false, chroma_above_mbmi and chroma_left_mbmi are left
+ // untouched.
+ const int chroma_ref = ((mi_row & 0x01) || !(bh & 0x01) || !ss_y) &&
+ ((mi_col & 0x01) || !(bw & 0x01) || !ss_x);
+ if (chroma_ref) {
+ // To help calculate the "above" and "left" chroma blocks, note that the
+ // current block may cover multiple luma blocks (eg, if partitioned into
+ // 4x4 luma blocks).
+ // First, find the top-left-most luma block covered by this chroma block
+ MB_MODE_INFO **base_mi =
+ &xd->mi[-(mi_row & ss_y) * xd->mi_stride - (mi_col & ss_x)];
+
+ // Then, we consider the luma region covered by the left or above 4x4 chroma
+ // prediction. We want to point to the chroma reference block in that
+ // region, which is the bottom-right-most mi unit.
+ // This leads to the following offsets:
+ MB_MODE_INFO *chroma_above_mi =
+ xd->chroma_up_available ? base_mi[-xd->mi_stride + ss_x] : NULL;
+ xd->chroma_above_mbmi = chroma_above_mi;
+
+ MB_MODE_INFO *chroma_left_mi =
+ xd->chroma_left_available ? base_mi[ss_y * xd->mi_stride - 1] : NULL;
+ xd->chroma_left_mbmi = chroma_left_mi;
+ }
+
+ xd->n4_h = bh;
+ xd->n4_w = bw;
+ xd->is_sec_rect = 0;
+ if (xd->n4_w < xd->n4_h) {
+ // Only mark is_sec_rect as 1 for the last block.
+ // For PARTITION_VERT_4, it would be (0, 0, 0, 1);
+ // For other partitions, it would be (0, 1).
+ if (!((mi_col + xd->n4_w) & (xd->n4_h - 1))) xd->is_sec_rect = 1;
+ }
+
+ // For wide blocks, the flag is set when mi_row is not a multiple of the
+ // block width (n4_w is used as a mask, so it is treated as a power of two).
+ if (xd->n4_w > xd->n4_h)
+ if (mi_row & (xd->n4_w - 1)) xd->is_sec_rect = 1;
+}
+
+// Returns the key-frame luma mode CDF selected by the contexts derived from
+// the above and left neighbours' prediction modes.
+static INLINE aom_cdf_prob *get_y_mode_cdf(FRAME_CONTEXT *tile_ctx,
+                                           const MB_MODE_INFO *above_mi,
+                                           const MB_MODE_INFO *left_mi) {
+  const int above_ctx = intra_mode_context[av1_above_block_mode(above_mi)];
+  const int left_ctx = intra_mode_context[av1_left_block_mode(left_mi)];
+  return tile_ctx->kf_y_cdf[above_ctx][left_ctx];
+}
+
+// Fills the above and left partition contexts covered by a |bsize| block at
+// (mi_row, mi_col) with the context values looked up for |subsize|.
+static INLINE void update_partition_context(MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BLOCK_SIZE subsize,
+                                            BLOCK_SIZE bsize) {
+  const int left_offset = mi_row & MAX_MIB_MASK;
+  PARTITION_CONTEXT *const above_dst = xd->above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_dst = xd->left_seg_context + left_offset;
+
+  memset(above_dst, partition_context_lookup[subsize].above,
+         mi_size_wide[bsize]);
+  memset(left_dst, partition_context_lookup[subsize].left,
+         mi_size_high[bsize]);
+}
+
+// Returns 1 when the block at (mi_row, mi_col) satisfies the chroma-reference
+// condition. In each dimension this holds if the position is odd, the block
+// size in mi units is even, or that dimension is not subsampled.
+static INLINE int is_chroma_reference(int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                      int subsampling_x, int subsampling_y) {
+  const int row_ok =
+      (mi_row & 0x01) || !(mi_size_high[bsize] & 0x01) || !subsampling_y;
+  const int col_ok =
+      (mi_col & 0x01) || !(mi_size_wide[bsize] & 0x01) || !subsampling_x;
+  return row_ok && col_ok;
+}
+
+// Maps a luma block size with a 4-sample dimension to the block size used
+// for chroma under the given subsampling: a 4-wide block widens to 8 when
+// subsampling_x is 1, and a 4-high block grows to 8 rows when subsampling_y
+// is 1. All other sizes, and subsampling values other than 1, return bsize
+// unchanged.
+static INLINE BLOCK_SIZE scale_chroma_bsize(BLOCK_SIZE bsize, int subsampling_x,
+                                            int subsampling_y) {
+  const int ss_x = (subsampling_x == 1);
+  const int ss_y = (subsampling_y == 1);
+
+  switch (bsize) {
+    case BLOCK_4X4:
+      if (ss_x) return ss_y ? BLOCK_8X8 : BLOCK_8X4;
+      if (ss_y) return BLOCK_4X8;
+      return bsize;
+    case BLOCK_4X8: return ss_x ? BLOCK_8X8 : bsize;
+    case BLOCK_8X4: return ss_y ? BLOCK_8X8 : bsize;
+    case BLOCK_4X16: return ss_x ? BLOCK_8X16 : bsize;
+    case BLOCK_16X4: return ss_y ? BLOCK_16X8 : bsize;
+    default: return bsize;
+  }
+}
+
+// Returns the difference between the CDF entry preceding |element|
+// (CDF_PROB_TOP for element 0) and the entry at |element|.
+static INLINE aom_cdf_prob cdf_element_prob(const aom_cdf_prob *cdf,
+                                            size_t element) {
+  assert(cdf != NULL);
+  if (element > 0) return cdf[element - 1] - cdf[element];
+  return CDF_PROB_TOP - cdf[element];
+}
+
+// Builds a two-entry CDF in |out| from the partition CDF |in|. The per-element
+// values of HORZ, SPLIT, HORZ_A, HORZ_B, VERT_A (and HORZ_4 unless bsize is
+// BLOCK_128X128) are subtracted from CDF_PROB_TOP, and the result is stored
+// through AOM_ICDF.
+static INLINE void partition_gather_horz_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  static const PARTITION_TYPE horz_alike[] = {
+    PARTITION_HORZ, PARTITION_SPLIT, PARTITION_HORZ_A, PARTITION_HORZ_B,
+    PARTITION_VERT_A
+  };
+  const size_t count = sizeof(horz_alike) / sizeof(horz_alike[0]);
+
+  out[0] = CDF_PROB_TOP;
+  for (size_t k = 0; k < count; ++k) {
+    out[0] -= cdf_element_prob(in, horz_alike[k]);
+  }
+  if (bsize != BLOCK_128X128) {
+    out[0] -= cdf_element_prob(in, PARTITION_HORZ_4);
+  }
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+// Builds a two-entry CDF in |out| from the partition CDF |in|. The per-element
+// values of VERT, SPLIT, HORZ_A, VERT_A, VERT_B (and VERT_4 unless bsize is
+// BLOCK_128X128) are subtracted from CDF_PROB_TOP, and the result is stored
+// through AOM_ICDF.
+static INLINE void partition_gather_vert_alike(aom_cdf_prob *out,
+                                               const aom_cdf_prob *const in,
+                                               BLOCK_SIZE bsize) {
+  static const PARTITION_TYPE vert_alike[] = {
+    PARTITION_VERT, PARTITION_SPLIT, PARTITION_HORZ_A, PARTITION_VERT_A,
+    PARTITION_VERT_B
+  };
+  const size_t count = sizeof(vert_alike) / sizeof(vert_alike[0]);
+
+  out[0] = CDF_PROB_TOP;
+  for (size_t k = 0; k < count; ++k) {
+    out[0] -= cdf_element_prob(in, vert_alike[k]);
+  }
+  if (bsize != BLOCK_128X128) {
+    out[0] -= cdf_element_prob(in, PARTITION_VERT_4);
+  }
+  out[0] = AOM_ICDF(out[0]);
+  out[1] = AOM_ICDF(CDF_PROB_TOP);
+}
+
+// Updates the above/left partition contexts for a |bsize| block at
+// (mi_row, mi_col) that was coded with |partition|, whose first sub-block has
+// size |subsize|. Does nothing for blocks smaller than BLOCK_8X8.
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd, int mi_row,
+ int mi_col, BLOCK_SIZE subsize,
+ BLOCK_SIZE bsize,
+ PARTITION_TYPE partition) {
+ if (bsize >= BLOCK_8X8) {
+ // hbs: half the block width in mi units; bsize2: the size one quadrant of
+ // the block would have under PARTITION_SPLIT.
+ const int hbs = mi_size_wide[bsize] / 2;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ switch (partition) {
+ case PARTITION_SPLIT:
+ // A split only updates the context here at 8x8; for larger blocks no
+ // update is made in this call.
+ if (bsize != BLOCK_8X8) break;
+ AOM_FALLTHROUGH_INTENDED;
+ case PARTITION_NONE:
+ case PARTITION_HORZ:
+ case PARTITION_VERT:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+ break;
+ // The A/B partitions update the two halves separately: one half with
+ // the context of the quadrant size (bsize2), the other with |subsize|.
+ case PARTITION_HORZ_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+ break;
+ case PARTITION_VERT_A:
+ update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+ break;
+ case PARTITION_VERT_B:
+ update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+ update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+ }
+}
+
+// Returns the partition context index for a square |bsize| block at
+// (mi_row, mi_col). It combines one bit each from the above and left
+// partition contexts with the block-size level relative to 8x8.
+static INLINE int partition_plane_context(const MACROBLOCKD *xd, int mi_row,
+                                          int mi_col, BLOCK_SIZE bsize) {
+  const PARTITION_CONTEXT above_bits = xd->above_seg_context[mi_col];
+  const PARTITION_CONTEXT left_bits =
+      xd->left_seg_context[mi_row & MAX_MIB_MASK];
+  // Minimum partition point is 8x8. Offset the bsl accordingly.
+  const int bsl = mi_size_wide_log2[bsize] - mi_size_wide_log2[BLOCK_8X8];
+  const int above = (above_bits >> bsl) & 1;
+  const int left = (left_bits >> bsl) & 1;
+
+  assert(mi_size_wide_log2[bsize] == mi_size_high_log2[bsize]);
+  assert(bsl >= 0);
+
+  return bsl * PARTITION_PLOFFSET + (2 * left + above);
+}
+
+// Return the number of elements in the partition CDF when
+// partitioning the (square) block with luma block size of bsize.
+static INLINE int partition_cdf_length(BLOCK_SIZE bsize) {
+  if (bsize <= BLOCK_8X8) return PARTITION_TYPES;
+  return (bsize == BLOCK_128X128) ? EXT_PARTITION_TYPES - 2
+                                  : EXT_PARTITION_TYPES;
+}
+
+// Returns the width of a |bsize| block in |plane|, reduced by any part lying
+// beyond the right frame edge (mb_to_right_edge < 0), in units of the
+// smallest transform width.
+static INLINE int max_block_wide(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                 int plane) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int clipped_width = block_size_wide[bsize];
+
+  if (xd->mb_to_right_edge < 0) {
+    clipped_width += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+  }
+
+  // Scale the width in the transform block unit.
+  return clipped_width >> tx_size_wide_log2[0];
+}
+
+// Returns the height of a |bsize| block in |plane|, reduced by any part lying
+// beyond the bottom frame edge (mb_to_bottom_edge < 0), in units of the
+// smallest transform height.
+static INLINE int max_block_high(const MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                 int plane) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int clipped_height = block_size_high[bsize];
+
+  if (xd->mb_to_bottom_edge < 0) {
+    clipped_height += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+  }
+
+  // Scale the height in the transform block unit.
+  return clipped_height >> tx_size_high_log2[0];
+}
+
+// Returns the frame-clipped block width from max_block_wide(), scaled back up
+// by the smallest transform width and rounded up to a multiple of the width
+// of |tx_size|.
+static INLINE int max_intra_block_width(const MACROBLOCKD *xd,
+                                        BLOCK_SIZE plane_bsize, int plane,
+                                        TX_SIZE tx_size) {
+  const int blocks_wide = max_block_wide(xd, plane_bsize, plane);
+  const int width = blocks_wide << tx_size_wide_log2[0];
+  return ALIGN_POWER_OF_TWO(width, tx_size_wide_log2[tx_size]);
+}
+
+// Returns the frame-clipped block height from max_block_high(), scaled back
+// up by the smallest transform height and rounded up to a multiple of the
+// height of |tx_size|.
+static INLINE int max_intra_block_height(const MACROBLOCKD *xd,
+                                         BLOCK_SIZE plane_bsize, int plane,
+                                         TX_SIZE tx_size) {
+  const int blocks_high = max_block_high(xd, plane_bsize, plane);
+  const int height = blocks_high << tx_size_high_log2[0];
+  return ALIGN_POWER_OF_TWO(height, tx_size_high_log2[tx_size]);
+}
+
+// Resets the above contexts of tile row |tile_row| for mi columns starting at
+// |mi_col_start|. The reset width is (mi_col_end - mi_col_start) rounded up
+// via ALIGN_POWER_OF_TWO with the sequence's mib_size_log2.
+// Entropy and segmentation contexts are zeroed, and the transform context is
+// filled with the width of the largest transform. Raises
+// AOM_CODEC_CORRUPT_FRAME through xd->error_info when chroma planes are
+// expected but their context buffers are NULL.
+static INLINE void av1_zero_above_context(AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ int mi_col_start, int mi_col_end, const int tile_row) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ const int width = mi_col_end - mi_col_start;
+ const int aligned_width =
+ ALIGN_POWER_OF_TWO(width, seq_params->mib_size_log2);
+
+ // Chroma offsets and widths are the luma ones shifted by subsampling_x.
+ const int offset_y = mi_col_start;
+ const int width_y = aligned_width;
+ const int offset_uv = offset_y >> seq_params->subsampling_x;
+ const int width_uv = width_y >> seq_params->subsampling_x;
+
+ av1_zero_array(cm->above_context[0][tile_row] + offset_y, width_y);
+ if (num_planes > 1) {
+ // Guard against missing chroma buffers before writing through them.
+ if (cm->above_context[1][tile_row] && cm->above_context[2][tile_row]) {
+ av1_zero_array(cm->above_context[1][tile_row] + offset_uv, width_uv);
+ av1_zero_array(cm->above_context[2][tile_row] + offset_uv, width_uv);
+ } else {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of planes");
+ }
+ }
+
+ av1_zero_array(cm->above_seg_context[tile_row] + mi_col_start, aligned_width);
+
+ // memset writes the value byte-wise across aligned_width TXFM_CONTEXT
+ // entries.
+ memset(cm->above_txfm_context[tile_row] + mi_col_start,
+ tx_size_wide[TX_SIZES_LARGEST],
+ aligned_width * sizeof(TXFM_CONTEXT));
+}
+
+// Resets the left contexts held in |xd|: entropy and segmentation contexts
+// are zeroed, and every byte of the left transform-context buffer is set to
+// the height of the largest transform.
+static INLINE void av1_zero_left_context(MACROBLOCKD *const xd) {
+ av1_zero(xd->left_context);
+ av1_zero(xd->left_seg_context);
+
+ memset(xd->left_txfm_context_buffer, tx_size_high[TX_SIZES_LARGEST],
+ sizeof(xd->left_txfm_context_buffer));
+}
+
+// Disable array-bounds checks as the TX_SIZE enum contains values larger than
+// TX_SIZES_ALL (TX_INVALID) which make extending the array as a workaround
+// infeasible. The assert is enough for static analysis, and this or other
+// tools (asan, valgrind) would catch oob access at runtime.
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#endif
+
+#if defined(__GNUC__) && __GNUC__ >= 4
+#pragma GCC diagnostic warning "-Warray-bounds"
+#endif
+
+// Writes |txs| into the first |len| entries of |txfm_ctx|. A non-positive
+// |len| writes nothing.
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx, uint8_t txs, int len) {
+  for (int remaining = len; remaining > 0; --remaining) {
+    *txfm_ctx++ = txs;
+  }
+}
+
+// Fills the above (n4_w entries) and left (n4_h entries) transform contexts
+// of |xd|. The value is the transform width/height of |tx_size|, or, when
+// |skip| is set, the block dimension n4_w/n4_h scaled by MI_SIZE.
+static INLINE void set_txfm_ctxs(TX_SIZE tx_size, int n4_w, int n4_h, int skip,
+                                 const MACROBLOCKD *xd) {
+  uint8_t above_val = tx_size_wide[tx_size];
+  uint8_t left_val = tx_size_high[tx_size];
+
+  if (skip) {
+    above_val = n4_w * MI_SIZE;
+    left_val = n4_h * MI_SIZE;
+  }
+
+  for (int col = 0; col < n4_w; ++col) xd->above_txfm_context[col] = above_val;
+  for (int row = 0; row < n4_h; ++row) xd->left_txfm_context[row] = left_val;
+}
+
+// Writes the width/height of |tx_size| into the above/left transform
+// contexts across the extent (in mi units) of the block size that
+// corresponds to |txb_size|.
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size, TX_SIZE txb_size) {
+  const BLOCK_SIZE bsize = txsize_to_bsize[txb_size];
+  const int rows = mi_size_high[bsize];
+  const int cols = mi_size_wide[bsize];
+  const uint8_t tx_w = tx_size_wide[tx_size];
+  const uint8_t tx_h = tx_size_high[tx_size];
+
+  set_txfm_ctx(left_ctx, tx_h, rows);
+  set_txfm_ctx(above_ctx, tx_w, cols);
+}
+
+// Maps a square dimension to its square transform size. 128 and 64 both map
+// to TX_64X64; any dimension other than 128/64/32/16/8 maps to TX_4X4.
+static INLINE TX_SIZE get_sqr_tx_size(int tx_dim) {
+  if (tx_dim == 128 || tx_dim == 64) return TX_64X64;
+  if (tx_dim == 32) return TX_32X32;
+  if (tx_dim == 16) return TX_16X16;
+  if (tx_dim == 8) return TX_8X8;
+  return TX_4X4;
+}
+
+// Maps a width x height pair to a transform size. Square sizes go through
+// get_sqr_tx_size(). For rectangular sizes the shorter side selects the
+// entry: from the 2:1 table when the longer side is exactly twice the
+// shorter, otherwise from the 4:1 table. Unsupported combinations trip
+// assert(0) and fall back to TX_4X4.
+static INLINE TX_SIZE get_tx_size(int width, int height) {
+  if (width == height) return get_sqr_tx_size(width);
+
+  const int tall = width < height;
+  const int short_side = tall ? width : height;
+  const int long_side = tall ? height : width;
+
+  if (short_side + short_side == long_side) {
+    switch (short_side) {
+      case 4: return tall ? TX_4X8 : TX_8X4;
+      case 8: return tall ? TX_8X16 : TX_16X8;
+      case 16: return tall ? TX_16X32 : TX_32X16;
+      case 32: return tall ? TX_32X64 : TX_64X32;
+      default: break;
+    }
+  } else {
+    switch (short_side) {
+      case 4: return tall ? TX_4X16 : TX_16X4;
+      case 8: return tall ? TX_8X32 : TX_32X8;
+      case 16: return tall ? TX_16X64 : TX_64X16;
+      default: break;
+    }
+  }
+
+  assert(0);
+  return TX_4X4;
+}
+
+// Returns the context index for coding a transform-partition decision. It
+// combines a category derived from the block's largest square transform size
+// with two flags: whether the above/left neighbours' recorded transform
+// dimensions are smaller than those of |tx_size|. Returns 0 when |tx_size|
+// is TX_4X4.
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const uint8_t tx_w = tx_size_wide[tx_size];
+  const uint8_t tx_h = tx_size_high[tx_size];
+  const int above_smaller = *above_ctx < tx_w;
+  const int left_smaller = *left_ctx < tx_h;
+
+  // dummy return, not used by others.
+  if (tx_size <= TX_4X4) return 0;
+
+  const TX_SIZE max_tx_size =
+      get_sqr_tx_size(AOMMAX(block_size_wide[bsize], block_size_high[bsize]));
+
+  int category = TXFM_PARTITION_CONTEXTS;
+  if (max_tx_size >= TX_8X8) {
+    const int below_max =
+        txsize_sqr_up_map[tx_size] != max_tx_size && max_tx_size > TX_8X8;
+    category = below_max + (TX_SIZES - 1 - max_tx_size) * 2;
+  }
+  assert(category != TXFM_PARTITION_CONTEXTS);
+
+  return category * 3 + above_smaller + left_smaller;
+}
+
+// Compute the next partition in the direction of the sb_type stored in the mi
+// array, starting with bsize.
+//
+// Returns PARTITION_INVALID when (mi_row, mi_col) lies outside the frame and
+// PARTITION_NONE when the stored block size already equals bsize. Otherwise
+// the partition type is inferred by comparing the stored size against bsize
+// and, for blocks larger than 8x8 whose centre lies inside the frame, by
+// inspecting the blocks to the right of and below the centre.
+// NOTE(review): callers presumably pass a square bsize. For square sizes the
+// bounds check and assert below are equivalent to the previous spelling,
+// which had bwide/bhigh swapped.
+static INLINE PARTITION_TYPE get_partition(const AV1_COMMON *const cm,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return PARTITION_INVALID;
+
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO **mi = cm->mi_grid_visible + offset;
+  const BLOCK_SIZE subsize = mi[0]->sb_type;
+
+  if (subsize == bsize) return PARTITION_NONE;
+
+  const int bhigh = mi_size_high[bsize];
+  const int bwide = mi_size_wide[bsize];
+  const int sshigh = mi_size_high[subsize];
+  const int sswide = mi_size_wide[subsize];
+
+  // mbmi_below is read bhigh / 2 rows down and mbmi_right bwide / 2 columns
+  // across, so the row bound uses bhigh and the column bound uses bwide.
+  if (bsize > BLOCK_8X8 && mi_row + bhigh / 2 < cm->mi_rows &&
+      mi_col + bwide / 2 < cm->mi_cols) {
+    // In this case, the block might be using an extended partition
+    // type.
+    const MB_MODE_INFO *const mbmi_right = mi[bwide / 2];
+    const MB_MODE_INFO *const mbmi_below = mi[bhigh / 2 * cm->mi_stride];
+
+    if (sswide == bwide) {
+      // Smaller height but same width. Is PARTITION_HORZ_4, PARTITION_HORZ or
+      // PARTITION_HORZ_B. To distinguish the latter two, check if the lower
+      // half was split.
+      if (sshigh * 4 == bhigh) return PARTITION_HORZ_4;
+      assert(sshigh * 2 == bhigh);
+
+      if (mbmi_below->sb_type == subsize)
+        return PARTITION_HORZ;
+      else
+        return PARTITION_HORZ_B;
+    } else if (sshigh == bhigh) {
+      // Smaller width but same height. Is PARTITION_VERT_4, PARTITION_VERT or
+      // PARTITION_VERT_B. To distinguish the latter two, check if the right
+      // half was split.
+      if (sswide * 4 == bwide) return PARTITION_VERT_4;
+      assert(sswide * 2 == bwide);
+
+      if (mbmi_right->sb_type == subsize)
+        return PARTITION_VERT;
+      else
+        return PARTITION_VERT_B;
+    } else {
+      // Smaller width and smaller height. Might be PARTITION_SPLIT or could be
+      // PARTITION_HORZ_A or PARTITION_VERT_A. If subsize isn't halved in both
+      // dimensions, we immediately know this is a split (which will recurse to
+      // get to subsize). Otherwise look down and to the right. With
+      // PARTITION_VERT_A, the right block will have height bhigh; with
+      // PARTITION_HORZ_A, the lower block will have width bwide. Otherwise
+      // it's PARTITION_SPLIT.
+      if (sswide * 2 != bwide || sshigh * 2 != bhigh) return PARTITION_SPLIT;
+
+      if (mi_size_wide[mbmi_below->sb_type] == bwide) return PARTITION_HORZ_A;
+      if (mi_size_high[mbmi_right->sb_type] == bhigh) return PARTITION_VERT_A;
+
+      return PARTITION_SPLIT;
+    }
+  }
+
+  // Basic partitions only: bit 1 is set when the stored block is narrower
+  // than bsize, bit 0 when it is shorter.
+  const int vert_split = sswide < bwide;
+  const int horz_split = sshigh < bhigh;
+  const int split_idx = (vert_split << 1) | horz_split;
+  assert(split_idx != 0);
+
+  static const PARTITION_TYPE base_partitions[4] = {
+    PARTITION_INVALID, PARTITION_HORZ, PARTITION_VERT, PARTITION_SPLIT
+  };
+
+  return base_partitions[split_idx];
+}
+
+// Stores |use| in the sequence header's frame_id_numbers_present_flag.
+static INLINE void set_use_reference_buffer(AV1_COMMON *const cm, int use) {
+ cm->seq_params.frame_id_numbers_present_flag = use;
+}
+
+// Records the superblock size in |seq_params| together with its width in mi
+// units (mib_size) and the log2 of that width (mib_size_log2).
+static INLINE void set_sb_size(SequenceHeader *const seq_params,
+                               BLOCK_SIZE sb_size) {
+  seq_params->sb_size = sb_size;
+
+  const BLOCK_SIZE stored = seq_params->sb_size;
+  seq_params->mib_size = mi_size_wide[stored];
+  seq_params->mib_size_log2 = mi_size_wide_log2[stored];
+}
+
+// Returns true if the frame is fully lossless at the coded resolution.
+// Note: If super-resolution is used, such a frame will still NOT be lossless at
+// the upscaled resolution.
+// With segmentation enabled every segment must be lossless; otherwise only
+// segment 0 is consulted.
+static INLINE int is_coded_lossless(const AV1_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
+  if (!cm->seg.enabled) return xd->lossless[0];
+
+  for (int seg = 0; seg < MAX_SEGMENTS; ++seg) {
+    if (!xd->lossless[seg]) return 0;
+  }
+  return 1;
+}
+
+// Returns 1 when |seq_level_idx| is below 24 or equal to 31, otherwise 0.
+static INLINE int is_valid_seq_level_idx(uint8_t seq_level_idx) {
+  if (seq_level_idx == 31) return 1;
+  return seq_level_idx < 24;
+}
+
+// Packs a (major, minor) level pair into a seq_level_idx: the major level
+// offset from LEVEL_MAJOR_MIN goes in the bits above LEVEL_MINOR_BITS and the
+// minor level in the low bits.
+static INLINE uint8_t major_minor_to_seq_level_idx(BitstreamLevel bl) {
+  assert(bl.major >= LEVEL_MAJOR_MIN && bl.major <= LEVEL_MAJOR_MAX);
+  // Since bl.minor is unsigned a comparison will return a warning:
+  // comparison is always true due to limited range of data type
+  assert(LEVEL_MINOR_MIN == 0);
+  assert(bl.minor <= LEVEL_MINOR_MAX);
+
+  return bl.minor + ((bl.major - LEVEL_MAJOR_MIN) << LEVEL_MINOR_BITS);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_ONYXC_INT_H_
diff --git a/third_party/aom/av1/common/ppc/cfl_ppc.c b/third_party/aom/av1/common/ppc/cfl_ppc.c
new file mode 100644
index 0000000000..026a078095
--- /dev/null
+++ b/third_party/aom/av1/common/ppc/cfl_ppc.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <altivec.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+// Offsets passed to vec_vsx_ld / vec_vsx_st below. OFF_n steps by 16, i.e.
+// one 128-bit vector per step.
+#define OFF_0 0
+#define OFF_1 16
+#define OFF_2 32
+#define OFF_3 48
+// CFL_LINE_n is n * CFL_BUF_LINE_BYTES and is added to OFF_n to address the
+// n-th following row.
+// NOTE(review): presumably 64 == CFL_BUF_LINE * sizeof(int16_t); CFL_BUF_LINE
+// is defined in cfl.h, confirm there.
+#define CFL_BUF_LINE_BYTES 64
+#define CFL_LINE_1 64
+#define CFL_LINE_2 128
+#define CFL_LINE_3 192
+
+// Short names for the AltiVec/VSX vector types used in this file.
+typedef vector signed char int8x16_t; // NOLINT(runtime/int)
+typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int)
+typedef vector signed short int16x8_t; // NOLINT(runtime/int)
+typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int)
+typedef vector signed int int32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int)
+typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int)
+
+// Computes the rounded average of a width x height block read from src_ptr
+// and writes (sample - average) for every sample to dst.
+//
+//   src_ptr       input samples; rows are CFL_BUF_LINE elements apart
+//   dst           output buffer with the same row layout (may alias src_ptr)
+//   width/height  block size; the loops handle widths 8, 16 and 32, sum two
+//                 rows per iteration and store four rows per iteration
+//   round_offset  added to the sum before the shift
+//   num_pel_log2  right shift that turns the sum into the average
+static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
+                                        int width, int height, int round_offset,
+                                        int num_pel_log2) {
+  const int16_t *src = (const int16_t *)src_ptr;
+  const int16_t *dst_end = dst + height * CFL_BUF_LINE;
+  const int16_t *sum_buf = src;
+  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
+  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
+  // Swaps the two 64-bit halves of a vector.
+  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
+  // Swaps the 32-bit lanes within each 64-bit half.
+  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };
+
+  // Two independent accumulators, one per row of the pair being summed. The
+  // rounding offset is seeded into a single lane so it is counted once.
+  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
+  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
+  do {
+    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
+    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    if (width >= 16) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+    if (width == 32) {
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
+      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
+      sum_32x4_1 =
+          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
+    }
+  } while ((sum_buf += (CFL_BUF_LINE * 2)) < end);
+  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);
+
+  // Horizontal reduction: after the two permute+add steps every lane holds
+  // the total of all four lanes.
+  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
+  sum_32x4 = vec_add(sum_32x4, perm_64);
+  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
+  sum_32x4 = vec_add(sum_32x4, perm_32);
+  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
+  const int16x8_t vec_avg = vec_pack(avg, avg);
+
+  // Read the samples from the source buffer (the one that was averaged) and
+  // write the differences to dst. Loading from dst instead would only be
+  // correct when the two buffers are the same.
+  do {
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, src), vec_avg), OFF_0, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, src), vec_avg),
+               OFF_0 + CFL_LINE_1, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, src), vec_avg),
+               OFF_0 + CFL_LINE_2, dst);
+    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, src), vec_avg),
+               OFF_0 + CFL_LINE_3, dst);
+    if (width >= 16) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, src), vec_avg), OFF_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, src), vec_avg),
+                 OFF_1 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, src), vec_avg),
+                 OFF_1 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, src), vec_avg),
+                 OFF_1 + CFL_LINE_3, dst);
+    }
+    if (width == 32) {
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, src), vec_avg), OFF_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, src), vec_avg),
+                 OFF_2 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, src), vec_avg),
+                 OFF_2 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, src), vec_avg),
+                 OFF_2 + CFL_LINE_3, dst);
+
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, src), vec_avg), OFF_3, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, src), vec_avg),
+                 OFF_3 + CFL_LINE_1, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, src), vec_avg),
+                 OFF_3 + CFL_LINE_2, dst);
+      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, src), vec_avg),
+                 OFF_3 + CFL_LINE_3, dst);
+    }
+    src += CFL_BUF_LINE * 4;
+  } while ((dst += CFL_BUF_LINE * 4) < dst_end);
+}
+
+// Declare wrappers for VSX sizes.
+// NOTE(review): CFL_SUB_AVG_X is defined in cfl.h (not shown here). The
+// arguments appear to be (arch, width, height, round_offset, num_pel_log2):
+// in every row below the fourth value is width * height / 2 and the fifth is
+// log2(width * height). Confirm against the macro definition.
+CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
+CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
+CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
+CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
+CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
+CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
+CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
+CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
+CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)
+
+// Returns the subtract-average routine for the given transform size.
+//
+// Based on observation, VSX does not outperform C for small blocks (there are
+// no 64-bit load and store intrinsics), so sizes that are 4 wide use the C
+// routines. Sizes with a 64 dimension are not valid for CfL and map to
+// cfl_subtract_average_null.
+cfl_subtract_average_fn get_subtract_average_fn_vsx(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn fn_by_tx_size[TX_SIZES_ALL] = {
+    subtract_average_4x4_c,     /* 4x4 */
+    subtract_average_8x8_vsx,   /* 8x8 */
+    subtract_average_16x16_vsx, /* 16x16 */
+    subtract_average_32x32_vsx, /* 32x32 */
+    cfl_subtract_average_null,  /* 64x64 (invalid CFL size) */
+    subtract_average_4x8_c,     /* 4x8 */
+    subtract_average_8x4_vsx,   /* 8x4 */
+    subtract_average_8x16_vsx,  /* 8x16 */
+    subtract_average_16x8_vsx,  /* 16x8 */
+    subtract_average_16x32_vsx, /* 16x32 */
+    subtract_average_32x16_vsx, /* 32x16 */
+    cfl_subtract_average_null,  /* 32x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x32 (invalid CFL size) */
+    subtract_average_4x16_c,    /* 4x16 */
+    subtract_average_16x4_vsx,  /* 16x4 */
+    subtract_average_8x32_vsx,  /* 8x32 */
+    subtract_average_32x8_vsx,  /* 32x8 */
+    cfl_subtract_average_null,  /* 16x64 (invalid CFL size) */
+    cfl_subtract_average_null,  /* 64x16 (invalid CFL size) */
+  };
+  // Reduce modulo TX_SIZES_ALL so that an attacker-controlled value can never
+  // index the function pointer table out of bounds.
+  const int table_idx = tx_size % TX_SIZES_ALL;
+  return fn_by_tx_size[table_idx];
+}
diff --git a/third_party/aom/av1/common/pred_common.c b/third_party/aom/av1/common/pred_common.c
new file mode 100644
index 0000000000..5952441d1f
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+// Returns the interpolation filter that ref_mbmi uses in direction `dir`
+// (only bit 0 of dir is used) when either of its reference frames equals
+// ref_frame; otherwise returns SWITCHABLE_FILTERS.
+static InterpFilter get_ref_filter_type(const MB_MODE_INFO *ref_mbmi,
+                                        const MACROBLOCKD *xd, int dir,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  (void)xd;
+
+  const int uses_ref_frame = ref_mbmi->ref_frame[0] == ref_frame ||
+                             ref_mbmi->ref_frame[1] == ref_frame;
+  if (!uses_ref_frame) return SWITCHABLE_FILTERS;
+  return av1_extract_interp_filter(ref_mbmi->interp_filters, dir & 0x01);
+}
+
+// Builds the context for the interpolation filter in direction `dir` (0 or 1)
+// of the current block. The context is the sum of
+//   - INTER_FILTER_COMP_OFFSET when the block has a second reference frame,
+//   - INTER_FILTER_DIR_OFFSET when dir is 1, and
+//   - a filter type merged from the left and above neighbours.
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(dir == 0 || dir == 1);
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+  const int has_second_ref_frame = mbmi->ref_frame[1] > INTRA_FRAME;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  // An unavailable neighbour contributes SWITCHABLE_FILTERS.
+  const int left_type =
+      xd->left_available
+          ? get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame)
+          : SWITCHABLE_FILTERS;
+  const int above_type =
+      xd->up_available
+          ? get_ref_filter_type(xd->mi[-xd->mi_stride], xd, dir, ref_frame)
+          : SWITCHABLE_FILTERS;
+
+  // Agreeing neighbours, or a single informative neighbour, decide the type;
+  // two different informative neighbours fall back to SWITCHABLE_FILTERS.
+  int merged_type;
+  if (left_type == above_type || above_type == SWITCHABLE_FILTERS) {
+    merged_type = left_type;
+  } else if (left_type == SWITCHABLE_FILTERS) {
+    merged_type = above_type;
+  } else {
+    merged_type = SWITCHABLE_FILTERS;
+  }
+
+  return has_second_ref_frame * INTER_FILTER_COMP_OFFSET +
+         (dir & 0x01) * INTER_FILTER_DIR_OFFSET + merged_type;
+}
+
+// Appends val to cache[0 .. *n) and bumps *n, unless val equals the entry
+// that was appended last (only the most recent entry is compared).
+static void palette_add_to_cache(uint16_t *cache, int *n, uint16_t val) {
+  const int count = *n;
+  const int repeats_last = count > 0 && cache[count - 1] == val;
+  if (repeats_last) return;
+
+  cache[count] = val;
+  *n = count + 1;
+}
+
+// Fills `cache` with the merged, de-duplicated palette colors of the above
+// and left neighbours for the given plane and returns how many were written
+// (at most 2 * PALETTE_MAX_SIZE). Both inputs are treated as sorted lists, so
+// a merge keeps the output in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+                          uint16_t *cache) {
+  const int row = -xd->mb_to_top_edge >> 3;
+  // Do not refer to above SB row when on SB boundary.
+  const int on_sb_boundary = (row % (1 << MIN_SB_SIZE_LOG2)) == 0;
+  const MB_MODE_INFO *const above_mi = on_sb_boundary ? NULL : xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+
+  // palette_size[] has one entry for plane 0 and one shared by the others.
+  const int size_idx = plane != 0;
+  const int above_n =
+      above_mi ? above_mi->palette_mode_info.palette_size[size_idx] : 0;
+  const int left_n =
+      left_mi ? left_mi->palette_mode_info.palette_size[size_idx] : 0;
+  if (above_n == 0 && left_n == 0) return 0;
+
+  const uint16_t *above_colors =
+      above_mi ? above_mi->palette_mode_info.palette_colors : NULL;
+  const uint16_t *left_colors =
+      left_mi ? left_mi->palette_mode_info.palette_colors : NULL;
+
+  // Each plane's colors start at plane * PALETTE_MAX_SIZE.
+  const int first = plane * PALETTE_MAX_SIZE;
+  const int above_end = first + above_n;
+  const int left_end = first + left_n;
+  int a = first;
+  int l = first;
+  int n = 0;
+
+  // Merge step: always emit the smaller head; on a tie emit it once and
+  // advance both lists.
+  while (a < above_end && l < left_end) {
+    const uint16_t v_above = above_colors[a];
+    const uint16_t v_left = left_colors[l];
+    if (v_left < v_above) {
+      palette_add_to_cache(cache, &n, v_left);
+      ++l;
+    } else {
+      palette_add_to_cache(cache, &n, v_above);
+      ++a;
+      if (v_left == v_above) ++l;
+    }
+  }
+  // Drain whichever list still has entries.
+  for (; a < above_end; ++a) palette_add_to_cache(cache, &n, above_colors[a]);
+  for (; l < left_end; ++l) palette_add_to_cache(cache, &n, left_colors[l]);
+
+  assert(n <= 2 * PALETTE_MAX_SIZE);
+  return n;
+}
+
+// Context for the intra/inter flag, derived from the above and left
+// neighbours ("--" means the neighbour is unavailable):
+//   0 - inter/inter, inter/--, --/inter, --/--
+//   1 - intra/inter, inter/intra
+//   2 - intra/--, --/intra
+//   3 - intra/intra
+// The mode info data structure has a one element border above and to the
+// left of the entries corresponding to real macroblocks.
+// The prediction flags in these dummy entries are initialized to 0.
+int av1_get_intra_inter_context(const MACROBLOCKD *xd) {
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  if (!has_above && !has_left) return 0;
+
+  if (has_above && has_left) {
+    const int above_intra = !is_inter_block(xd->above_mbmi);
+    const int left_intra = !is_inter_block(xd->left_mbmi);
+    if (above_intra && left_intra) return 3;
+    return above_intra || left_intra;
+  }
+
+  // Exactly one neighbour is available.
+  const MB_MODE_INFO *const edge_mbmi =
+      has_above ? xd->above_mbmi : xd->left_mbmi;
+  return is_inter_block(edge_mbmi) ? 0 : 2;
+}
+
+// Nonzero when ref_frame lies in the inclusive range
+// [BWDREF_FRAME, ALTREF_FRAME]. The argument is evaluated twice.
+#define CHECK_BACKWARD_REFS(ref_frame) \
+ (((ref_frame) >= BWDREF_FRAME) && ((ref_frame) <= ALTREF_FRAME))
+// Alias of CHECK_BACKWARD_REFS.
+#define IS_BACKWARD_REF_FRAME(ref_frame) CHECK_BACKWARD_REFS(ref_frame)
+
+// Context for the single/compound reference mode flag, derived from whether
+// the above and left neighbours use compound prediction and whether their
+// first reference is a backward reference. The result is always in
+// [0, COMP_INTER_CONTEXTS).
+int av1_get_reference_mode_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+  int ctx;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {
+    const int above_comp = has_second_ref(above_mbmi);
+    const int left_comp = has_second_ref(left_mbmi);
+
+    if (above_comp && left_comp) {
+      // both edges use comp pred (4)
+      ctx = 4;
+    } else if (above_comp || left_comp) {
+      // one of two edges uses comp pred (2/3); look at the other one
+      const MB_MODE_INFO *const single_mbmi =
+          above_comp ? left_mbmi : above_mbmi;
+      ctx = 2 + (IS_BACKWARD_REF_FRAME(single_mbmi->ref_frame[0]) ||
+                 !is_inter_block(single_mbmi));
+    } else {
+      // neither edge uses comp pred (0/1)
+      ctx = IS_BACKWARD_REF_FRAME(above_mbmi->ref_frame[0]) ^
+            IS_BACKWARD_REF_FRAME(left_mbmi->ref_frame[0]);
+    }
+  } else if (has_above || has_left) {
+    const MB_MODE_INFO *const edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    // edge uses comp pred (3), otherwise (0/1)
+    ctx = has_second_ref(edge_mbmi)
+              ? 3
+              : IS_BACKWARD_REF_FRAME(edge_mbmi->ref_frame[0]);
+  } else {
+    // no edges available (1)
+    ctx = 1;
+  }
+
+  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+  return ctx;
+}
+
+// Computes a context in [0, COMP_REF_TYPE_CONTEXTS) from the above and left
+// neighbours: whether each is intra, single-reference or compound, whether a
+// compound neighbour has uni-directional references (has_uni_comp_refs), and
+// whether the neighbours' first references agree in direction.
+// The value 2 is returned whenever the neighbours give no compound
+// information (no edges, intra only, or a lone single-reference edge).
+// NOTE(review): presumably used to code the uni- vs bi-directional compound
+// reference type; the consumer is not in this file.
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd) {
+ int pred_context;
+ const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+ const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+ const int above_in_image = xd->up_available;
+ const int left_in_image = xd->left_available;
+
+ if (above_in_image && left_in_image) { // both edges available
+ const int above_intra = !is_inter_block(above_mbmi);
+ const int left_intra = !is_inter_block(left_mbmi);
+
+ if (above_intra && left_intra) { // intra/intra
+ pred_context = 2;
+ } else if (above_intra || left_intra) { // intra/inter
+ const MB_MODE_INFO *inter_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+ if (!has_second_ref(inter_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 1 + 2 * has_uni_comp_refs(inter_mbmi);
+ } else { // inter/inter
+ const int a_sg = !has_second_ref(above_mbmi);
+ const int l_sg = !has_second_ref(left_mbmi);
+ const MV_REFERENCE_FRAME frfa = above_mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME frfl = left_mbmi->ref_frame[0];
+
+ if (a_sg && l_sg) { // single/single
+ // 3 when both first references have the same direction, else 1.
+ pred_context = 1 + 2 * (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));
+ } else if (l_sg || a_sg) { // single/comp
+ // uni_rfc describes the compound neighbour of the pair.
+ const int uni_rfc =
+ a_sg ? has_uni_comp_refs(left_mbmi) : has_uni_comp_refs(above_mbmi);
+
+ if (!uni_rfc) // comp bidir
+ pred_context = 1;
+ else // comp unidir
+ pred_context = 3 + (!(IS_BACKWARD_REF_FRAME(frfa) ^
+ IS_BACKWARD_REF_FRAME(frfl)));
+ } else { // comp/comp
+ const int a_uni_rfc = has_uni_comp_refs(above_mbmi);
+ const int l_uni_rfc = has_uni_comp_refs(left_mbmi);
+
+ if (!a_uni_rfc && !l_uni_rfc) // bidir/bidir
+ pred_context = 0;
+ else if (!a_uni_rfc || !l_uni_rfc) // unidir/bidir
+ pred_context = 2;
+ else // unidir/unidir
+ // 4 when both or neither first reference is BWDREF_FRAME, else 3.
+ pred_context =
+ 3 + (!((frfa == BWDREF_FRAME) ^ (frfl == BWDREF_FRAME)));
+ }
+ }
+ } else if (above_in_image || left_in_image) { // one edge available
+ const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+ if (!is_inter_block(edge_mbmi)) { // intra
+ pred_context = 2;
+ } else { // inter
+ if (!has_second_ref(edge_mbmi)) // single pred
+ pred_context = 2;
+ else // comp pred
+ pred_context = 4 * has_uni_comp_refs(edge_mbmi);
+ }
+ } else { // no edges available
+ pred_context = 2;
+ }
+
+ assert(pred_context >= 0 && pred_context < COMP_REF_TYPE_CONTEXTS);
+ return pred_context;
+}
+
+// Context for signalling whether a uni-directional compound reference pair
+// is (BWDREF, ALTREF) or one of (LAST, LAST2) / (LAST, LAST3) /
+// (LAST, GOLDEN), given that the pair is known to be uni-directional.
+//
+// 3 contexts, chosen by a vote among the spatial neighbours: 0 when backward
+// references outnumber forward ones, 1 on a tie, 2 otherwise.
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  // Forward references: L, L2, L3 and G.
+  const int fwd_votes = counts[LAST_FRAME] + counts[LAST2_FRAME] +
+                        counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  // Backward references: B, A2 and A.
+  const int bwd_votes = counts[BWDREF_FRAME] + counts[ALTREF2_FRAME] +
+                        counts[ALTREF_FRAME];
+
+  int pred_context = 1;
+  if (fwd_votes < bwd_votes) {
+    pred_context = 0;
+  } else if (fwd_votes > bwd_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a uni-directional compound reference pair
+// is (LAST, LAST2) or one of (LAST, LAST3) / (LAST, GOLDEN), given that it
+// is known to be one of those three.
+//
+// 3 contexts, chosen by comparing the neighbours' LAST2_FRAME count with
+// their combined LAST3/GOLDEN count: 0 when LAST2 has fewer, 1 on a tie,
+// 2 when LAST2 has more.
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int l2_votes = counts[LAST2_FRAME];
+  const int l3_gld_votes = counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+
+  int pred_context = 1;
+  if (l2_votes < l3_gld_votes) {
+    pred_context = 0;
+  } else if (l2_votes > l3_gld_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a uni-directional compound reference pair
+// is (LAST, LAST3) or (LAST, GOLDEN), given that it is known to be one of
+// those two.
+//
+// 3 contexts, chosen by comparing the neighbours' LAST3_FRAME count with
+// their GOLDEN_FRAME count: 0 when LAST3 has fewer, 1 on a tie, 2 when
+// LAST3 has more.
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int l3_votes = counts[LAST3_FRAME];
+  const int gld_votes = counts[GOLDEN_FRAME];
+
+  int pred_context = 1;
+  if (l3_votes < gld_votes) {
+    pred_context = 0;
+  } else if (l3_votes > gld_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < UNI_COMP_REF_CONTEXTS);
+  return pred_context;
+}
+
+// == Common context functions for both comp and single ref ==
+//
+// Context for signalling whether a reference frame is LAST/LAST2 or
+// LAST3/GOLDEN: 0 when the neighbours use LAST+LAST2 less often than
+// LAST3+GOLDEN, 1 on a tie, 2 when they use it more often.
+static int get_pred_context_ll2_or_l3gld(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int l_l2_votes = counts[LAST_FRAME] + counts[LAST2_FRAME];
+  const int l3_gld_votes = counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+
+  // Tie -> 1, minority -> 0, majority -> 2.
+  const int pred_context =
+      1 + (l_l2_votes > l3_gld_votes) - (l_l2_votes < l3_gld_votes);
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a reference frame is LAST or LAST2: 0 when
+// the neighbours use LAST less often than LAST2, 1 on a tie, 2 when they use
+// it more often.
+static int get_pred_context_last_or_last2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int l_votes = counts[LAST_FRAME];
+  const int l2_votes = counts[LAST2_FRAME];
+
+  // Tie -> 1, minority -> 0, majority -> 2.
+  const int pred_context = 1 + (l_votes > l2_votes) - (l_votes < l2_votes);
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a reference frame is LAST3 or GOLDEN: 0
+// when the neighbours use LAST3 less often than GOLDEN, 1 on a tie, 2 when
+// they use it more often.
+static int get_pred_context_last3_or_gld(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int l3_votes = counts[LAST3_FRAME];
+  const int gld_votes = counts[GOLDEN_FRAME];
+
+  // Tie -> 1, minority -> 0, majority -> 2.
+  const int pred_context = 1 + (l3_votes > gld_votes) - (l3_votes < gld_votes);
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a reference frame is BWDREF/ALTREF2 or
+// ALTREF: 0 when the neighbours use BWDREF+ALTREF2 less often than ALTREF,
+// 1 on a tie, 2 when they use it more often.
+static int get_pred_context_brfarf2_or_arf(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  // B and A2 together, versus A alone.
+  const int b_a2_votes = counts[BWDREF_FRAME] + counts[ALTREF2_FRAME];
+  const int a_votes = counts[ALTREF_FRAME];
+
+  int pred_context = 1;
+  if (b_a2_votes < a_votes) {
+    pred_context = 0;
+  } else if (b_a2_votes > a_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for signalling whether a reference frame is BWDREF or ALTREF2: 0
+// when the neighbours use BWDREF less often than ALTREF2, 1 on a tie, 2 when
+// they use it more often.
+static int get_pred_context_brf_or_arf2(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  const int b_votes = counts[BWDREF_FRAME];
+  const int a2_votes = counts[ALTREF2_FRAME];
+
+  int pred_context = 1;
+  if (b_votes < a2_votes) {
+    pred_context = 0;
+  } else if (b_votes > a2_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// == Context functions for comp ref ==
+//
+// Context for signalling whether the first reference frame of a compound
+// mode is GOLDEN/LAST3 or LAST/LAST2.
+// Forwards to get_pred_context_ll2_or_l3gld().
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// Context for signalling whether the first reference frame of a compound
+// mode is LAST or LAST2, given that it is known to be one of the two.
+// Forwards to get_pred_context_last_or_last2().
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// Context for signalling whether the first reference frame of a compound
+// mode is GOLDEN or LAST3, given that it is known to be one of the two.
+// Forwards to get_pred_context_last3_or_gld().
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// Context for signalling whether the second reference frame of a compound
+// mode is ALTREF or ALTREF2/BWDREF.
+// Forwards to get_pred_context_brfarf2_or_arf().
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// Context for signalling whether the second reference frame of a compound
+// mode is ALTREF2 or BWDREF.
+// Forwards to get_pred_context_brf_or_arf2().
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
+
+// == Context functions for single ref ==
+//
+// Context for the bit that signals whether the single reference is a forward
+// or a backward reference frame: 0 when the neighbours use forward
+// references less often than backward ones, 1 on a tie, 2 when they use them
+// more often.
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  const uint8_t *const counts = &xd->neighbors_ref_counts[0];
+
+  // Forward: LAST, LAST2, LAST3, GOLDEN.
+  const int fwd_votes = counts[LAST_FRAME] + counts[LAST2_FRAME] +
+                        counts[LAST3_FRAME] + counts[GOLDEN_FRAME];
+  // Backward: BWDREF, ALTREF2, ALTREF.
+  const int bwd_votes = counts[BWDREF_FRAME] + counts[ALTREF2_FRAME] +
+                        counts[ALTREF_FRAME];
+
+  int pred_context = 1;
+  if (fwd_votes < bwd_votes) {
+    pred_context = 0;
+  } else if (fwd_votes > bwd_votes) {
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// Context for the bit that signals whether the single reference is
+// ALTREF_FRAME or a non-ALTREF backward reference frame, given that it is
+// one of these two choices.
+// Forwards to get_pred_context_brfarf2_or_arf().
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+ return get_pred_context_brfarf2_or_arf(xd);
+}
+
+// Context for the bit that signals whether the single reference is
+// LAST3/GOLDEN or LAST2/LAST, given that it is one of these two choices.
+// Forwards to get_pred_context_ll2_or_l3gld().
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+ return get_pred_context_ll2_or_l3gld(xd);
+}
+
+// Context for the bit that signals whether the single reference is
+// LAST2_FRAME or LAST_FRAME, given that it is one of these two choices.
+// Forwards to get_pred_context_last_or_last2().
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+ return get_pred_context_last_or_last2(xd);
+}
+
+// Context for the bit that signals whether the single reference is
+// GOLDEN_FRAME or LAST3_FRAME, given that it is one of these two choices.
+// Forwards to get_pred_context_last3_or_gld().
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+ return get_pred_context_last3_or_gld(xd);
+}
+
+// Context for the bit that signals whether the single reference is
+// ALTREF2_FRAME or BWDREF_FRAME, given that it is one of these two choices.
+// Forwards to get_pred_context_brf_or_arf2().
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd) {
+ return get_pred_context_brf_or_arf2(xd);
+}
diff --git a/third_party/aom/av1/common/pred_common.h b/third_party/aom/av1/common/pred_common.h
new file mode 100644
index 0000000000..6dba2322d6
--- /dev/null
+++ b/third_party/aom/av1/common/pred_common.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_PRED_COMMON_H_
+#define AOM_AV1_COMMON_PRED_COMMON_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Returns the smallest segment id stored in `segment_ids` over the area that
+// a block of size `bsize` at (mi_row, mi_col) covers, clipped to the frame
+// dimensions cm->mi_rows x cm->mi_cols. The map is indexed row-major with a
+// stride of cm->mi_cols.
+static INLINE int get_segment_id(const AV1_COMMON *const cm,
+                                 const uint8_t *segment_ids, BLOCK_SIZE bsize,
+                                 int mi_row, int mi_col) {
+  const int stride = cm->mi_cols;
+  const int first = mi_row * stride + mi_col;
+  const int block_w = mi_size_wide[bsize];
+  const int block_h = mi_size_high[bsize];
+  // Clip the block to the part that lies inside the frame.
+  const int cols = AOMMIN(cm->mi_cols - mi_col, block_w);
+  const int rows = AOMMIN(cm->mi_rows - mi_row, block_h);
+  int segment_id = MAX_SEGMENTS;
+
+  for (int r = 0; r < rows; ++r) {
+    const int row_start = first + r * stride;
+    for (int c = 0; c < cols; ++c) {
+      const int id = segment_ids[row_start + c];
+      if (id < segment_id) segment_id = id;
+    }
+  }
+
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+  return segment_id;
+}
+
+// Predicts the segment id of the block at (mi_row, mi_col) from the ids
+// stored in cm->current_frame_seg_map for its top-left, top and left
+// neighbours, and writes a CDF index to *cdf_index:
+//   2 - all three neighbour ids are present and identical
+//   1 - all three are present and exactly two of them match
+//   0 - a neighbour is missing, or all three ids differ
+// Returns the predicted id (0 when neither the top nor the left neighbour is
+// available).
+static INLINE int av1_get_spatial_seg_pred(const AV1_COMMON *const cm,
+                                           const MACROBLOCKD *const xd,
+                                           int mi_row, int mi_col,
+                                           int *cdf_index) {
+  const int has_up = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // -1 marks a neighbour that is not available.
+  const int prev_ul =  // top left segment_id
+      (has_up && has_left)
+          ? get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                           mi_row - 1, mi_col - 1)
+          : -1;
+  const int prev_u =  // top segment_id
+      has_up ? get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                              mi_row - 1, mi_col)
+             : -1;
+  const int prev_l =  // left segment_id
+      has_left ? get_segment_id(cm, cm->current_frame_seg_map, BLOCK_4X4,
+                                mi_row, mi_col - 1)
+               : -1;
+
+  // Pick CDF index based on number of matching/out-of-bounds segment IDs.
+  if (prev_ul < 0 || prev_u < 0 || prev_l < 0) {
+    *cdf_index = 0;  // edge case
+  } else {
+    const int equal_pairs =
+        (prev_ul == prev_u) + (prev_ul == prev_l) + (prev_u == prev_l);
+    *cdf_index = (equal_pairs == 3) ? 2 : (equal_pairs > 0);
+  }
+
+  // If 2 or more are identical returns that as predictor, otherwise prev_l.
+  if (prev_u == -1) {  // edge case
+    return prev_l == -1 ? 0 : prev_l;
+  }
+  if (prev_l == -1) {  // edge case
+    return prev_u;
+  }
+  return (prev_ul == prev_u) ? prev_u : prev_l;
+}
+
+// Sums the seg_id_predicted flags of the above and left neighbours; a
+// missing neighbour contributes 0.
+static INLINE int av1_get_pred_context_seg_id(const MACROBLOCKD *xd) {
+  int ctx = 0;
+  if (xd->above_mbmi != NULL) ctx += xd->above_mbmi->seg_id_predicted;
+  if (xd->left_mbmi != NULL) ctx += xd->left_mbmi->seg_id_predicted;
+  return ctx;
+}
+
+// Context for the compound index of the current block. It adds
+//   - 3 when the block's two reference frames are equally far from the
+//     current frame (distances taken from cur_frame_offset), and
+//   - per available neighbour (above, left): its compound_idx if it has a
+//     second reference, else 1 if its first reference is ALTREF_FRAME.
+static INLINE int get_comp_index_context(const AV1_COMMON *cm,
+                                         const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+  const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+  const int cur_frame_index = cm->cur_frame->cur_frame_offset;
+
+  // A negative buffer index leaves the corresponding offset at 0.
+  int bck_frame_index = 0;
+  int fwd_frame_index = 0;
+  if (bck_idx >= 0) {
+    bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+  }
+  if (fwd_idx >= 0) {
+    fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+  }
+
+  const int fwd = abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index));
+  const int bck = abs(get_relative_dist(cm, cur_frame_index, bck_frame_index));
+
+  int ctx = 3 * (fwd == bck);
+  const MB_MODE_INFO *const neighbours[2] = { xd->above_mbmi, xd->left_mbmi };
+  for (int i = 0; i < 2; ++i) {
+    const MB_MODE_INFO *const mi = neighbours[i];
+    if (!mi) continue;
+    if (has_second_ref(mi)) {
+      ctx += mi->compound_idx;
+    } else if (mi->ref_frame[0] == ALTREF_FRAME) {
+      ctx += 1;
+    }
+  }
+  return ctx;
+}
+
+// Context for the compound group index. Each available neighbour (above,
+// left) contributes its comp_group_idx if it has a second reference, or 3 if
+// its first reference is ALTREF_FRAME; the sum is capped at 5.
+static INLINE int get_comp_group_idx_context(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const neighbours[2] = { xd->above_mbmi, xd->left_mbmi };
+  int ctx_sum = 0;
+
+  for (int i = 0; i < 2; ++i) {
+    const MB_MODE_INFO *const mi = neighbours[i];
+    if (!mi) continue;
+    if (has_second_ref(mi)) {
+      ctx_sum += mi->comp_group_idx;
+    } else if (mi->ref_frame[0] == ALTREF_FRAME) {
+      ctx_sum += 3;
+    }
+  }
+
+  return AOMMIN(5, ctx_sum);
+}
+
+// Returns the entry of segp->pred_cdf selected by
+// av1_get_pred_context_seg_id(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_seg_id(
+    struct segmentation_probs *segp, const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_seg_id(xd);
+  return segp->pred_cdf[ctx];
+}
+
+// Sums the skip_mode flags of the above and left neighbours; a missing
+// neighbour contributes 0.
+static INLINE int av1_get_skip_mode_context(const MACROBLOCKD *xd) {
+  int ctx = 0;
+  if (xd->above_mbmi) ctx += xd->above_mbmi->skip_mode;
+  if (xd->left_mbmi) ctx += xd->left_mbmi->skip_mode;
+  return ctx;
+}
+
+// Sums the skip flags of the above and left neighbours; a missing neighbour
+// contributes 0.
+static INLINE int av1_get_skip_context(const MACROBLOCKD *xd) {
+  int ctx = 0;
+  if (xd->above_mbmi) ctx += xd->above_mbmi->skip;
+  if (xd->left_mbmi) ctx += xd->left_mbmi->skip;
+  return ctx;
+}
+
+int av1_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+
+// Get a list of palette base colors that are used in the above and left blocks,
+// referred to as "color cache". The return value is the number of colors in the
+// cache (<= 2 * PALETTE_MAX_SIZE). The color values are stored in "cache"
+// in ascending order.
+int av1_get_palette_cache(const MACROBLOCKD *const xd, int plane,
+ uint16_t *cache);
+
+// Returns the difference between the num_pels_log2_lookup entries of `bsize`
+// and of BLOCK_8X8, so BLOCK_8X8 itself maps to 0.
+static INLINE int av1_get_palette_bsize_ctx(BLOCK_SIZE bsize) {
+  const int log2_pels = num_pels_log2_lookup[bsize];
+  const int log2_pels_8x8 = num_pels_log2_lookup[BLOCK_8X8];
+  return log2_pels - log2_pels_8x8;
+}
+
+// Counts how many of the above and left neighbours have a non-empty palette
+// in palette_size[0] (0, 1 or 2); a missing neighbour counts as none.
+static INLINE int av1_get_palette_mode_ctx(const MACROBLOCKD *xd) {
+  const MB_MODE_INFO *const above_mi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mi = xd->left_mbmi;
+  const int above_has_palette =
+      above_mi ? (above_mi->palette_mode_info.palette_size[0] > 0) : 0;
+  const int left_has_palette =
+      left_mi ? (left_mi->palette_mode_info.palette_size[0] > 0) : 0;
+  return above_has_palette + left_has_palette;
+}
+
+int av1_get_intra_inter_context(const MACROBLOCKD *xd);
+
+int av1_get_reference_mode_context(const MACROBLOCKD *xd);  // result indexes tile_ctx->comp_inter_cdf in av1_get_reference_mode_cdf()
+
+// Returns the entry of tile_ctx->comp_inter_cdf selected by
+// av1_get_reference_mode_context(xd).
+static INLINE aom_cdf_prob *av1_get_reference_mode_cdf(const MACROBLOCKD *xd) {
+  const int ctx = av1_get_reference_mode_context(xd);
+  return xd->tile_ctx->comp_inter_cdf[ctx];
+}
+
+int av1_get_comp_reference_type_context(const MACROBLOCKD *xd);  // result indexes tile_ctx->comp_ref_type_cdf
+
+// == Uni-directional contexts ==
+
+int av1_get_pred_context_uni_comp_ref_p(const MACROBLOCKD *xd);  // result selects tile_ctx->uni_comp_ref_cdf[ctx][0]
+
+int av1_get_pred_context_uni_comp_ref_p1(const MACROBLOCKD *xd);  // result selects tile_ctx->uni_comp_ref_cdf[ctx][1]
+
+int av1_get_pred_context_uni_comp_ref_p2(const MACROBLOCKD *xd);  // result selects tile_ctx->uni_comp_ref_cdf[ctx][2]
+
+// Returns the entry of tile_ctx->comp_ref_type_cdf selected by
+// av1_get_comp_reference_type_context(xd).
+static INLINE aom_cdf_prob *av1_get_comp_reference_type_cdf(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->comp_ref_type_cdf[av1_get_comp_reference_type_context(xd)];
+}
+
+// Returns tile_ctx->uni_comp_ref_cdf[ctx][0], where ctx comes from
+// av1_get_pred_context_uni_comp_ref_p(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->uni_comp_ref_cdf[av1_get_pred_context_uni_comp_ref_p(xd)][0];
+}
+
+// Returns tile_ctx->uni_comp_ref_cdf[ctx][1], where ctx comes from
+// av1_get_pred_context_uni_comp_ref_p1(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p1(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->uni_comp_ref_cdf[av1_get_pred_context_uni_comp_ref_p1(xd)][1];
+}
+
+// Returns tile_ctx->uni_comp_ref_cdf[ctx][2], where ctx comes from
+// av1_get_pred_context_uni_comp_ref_p2(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_uni_comp_ref_p2(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->uni_comp_ref_cdf[av1_get_pred_context_uni_comp_ref_p2(xd)][2];
+}
+
+// == Bi-directional contexts ==
+
+int av1_get_pred_context_comp_ref_p(const MACROBLOCKD *xd);  // result selects tile_ctx->comp_ref_cdf[ctx][0]
+
+int av1_get_pred_context_comp_ref_p1(const MACROBLOCKD *xd);  // result selects tile_ctx->comp_ref_cdf[ctx][1]
+
+int av1_get_pred_context_comp_ref_p2(const MACROBLOCKD *xd);  // result selects tile_ctx->comp_ref_cdf[ctx][2]
+
+int av1_get_pred_context_comp_bwdref_p(const MACROBLOCKD *xd);  // result selects tile_ctx->comp_bwdref_cdf[ctx][0]
+
+int av1_get_pred_context_comp_bwdref_p1(const MACROBLOCKD *xd);  // result selects tile_ctx->comp_bwdref_cdf[ctx][1]
+
+// Returns tile_ctx->comp_ref_cdf[ctx][0], where ctx comes from
+// av1_get_pred_context_comp_ref_p(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p(const MACROBLOCKD *xd) {
+  return xd->tile_ctx->comp_ref_cdf[av1_get_pred_context_comp_ref_p(xd)][0];
+}
+
+// Returns tile_ctx->comp_ref_cdf[ctx][1], where ctx comes from
+// av1_get_pred_context_comp_ref_p1(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p1(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx->comp_ref_cdf[av1_get_pred_context_comp_ref_p1(xd)][1];
+}
+
+// Returns tile_ctx->comp_ref_cdf[ctx][2], where ctx comes from
+// av1_get_pred_context_comp_ref_p2(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_ref_p2(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx->comp_ref_cdf[av1_get_pred_context_comp_ref_p2(xd)][2];
+}
+
+// Returns tile_ctx->comp_bwdref_cdf[ctx][0], where ctx comes from
+// av1_get_pred_context_comp_bwdref_p(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->comp_bwdref_cdf[av1_get_pred_context_comp_bwdref_p(xd)][0];
+}
+
+// Returns tile_ctx->comp_bwdref_cdf[ctx][1], where ctx comes from
+// av1_get_pred_context_comp_bwdref_p1(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_comp_bwdref_p1(
+    const MACROBLOCKD *xd) {
+  return xd->tile_ctx
+      ->comp_bwdref_cdf[av1_get_pred_context_comp_bwdref_p1(xd)][1];
+}
+
+// == Single contexts ==
+
+int av1_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][0]
+
+int av1_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][1]
+
+int av1_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][2]
+
+int av1_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][3]
+
+int av1_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][4]
+
+int av1_get_pred_context_single_ref_p6(const MACROBLOCKD *xd);  // result selects tile_ctx->single_ref_cdf[ctx][5]
+
+// Returns tile_ctx->single_ref_cdf[ctx][0], where ctx comes from
+// av1_get_pred_context_single_ref_p1(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p1(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p1(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][0];
+}
+// Returns tile_ctx->single_ref_cdf[ctx][1], where ctx comes from
+// av1_get_pred_context_single_ref_p2(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p2(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p2(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][1];
+}
+// Returns tile_ctx->single_ref_cdf[ctx][2], where ctx comes from
+// av1_get_pred_context_single_ref_p3(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p3(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p3(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][2];
+}
+// Returns tile_ctx->single_ref_cdf[ctx][3], where ctx comes from
+// av1_get_pred_context_single_ref_p4(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p4(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p4(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][3];
+}
+// Returns tile_ctx->single_ref_cdf[ctx][4], where ctx comes from
+// av1_get_pred_context_single_ref_p5(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p5(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p5(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][4];
+}
+// Returns tile_ctx->single_ref_cdf[ctx][5], where ctx comes from
+// av1_get_pred_context_single_ref_p6(xd).
+static INLINE aom_cdf_prob *av1_get_pred_cdf_single_ref_p6(
+    const MACROBLOCKD *xd) {
+  const int ctx = av1_get_pred_context_single_ref_p6(xd);
+  return xd->tile_ctx->single_ref_cdf[ctx][5];
+}
+
+// Derives the tx-size context (0, 1 or 2) for the current block xd->mi[0].
+// Each available neighbour (above / left) contributes 1 when it reaches the
+// largest transform dimension of the current block:
+//  - by default the first entry of the above/left txfm context is compared
+//    with the max transform width (above) or height (left);
+//  - when is_inter_block() is true for the neighbour, its block width/height
+//    is compared instead.
+// A neighbour that is not available contributes 0.
+static INLINE int get_tx_size_context(const MACROBLOCKD *xd) {
+  const TX_SIZE largest_tx = max_txsize_rect_lookup[xd->mi[0]->sb_type];
+  const int tx_w = tx_size_wide[largest_tx];
+  const int tx_h = tx_size_high[largest_tx];
+
+  // Defaults taken from the txfm context; overridden for inter neighbours.
+  int above_hit = xd->above_txfm_context[0] >= tx_w;
+  int left_hit = xd->left_txfm_context[0] >= tx_h;
+
+  if (xd->up_available && is_inter_block(xd->above_mbmi))
+    above_hit = block_size_wide[xd->above_mbmi->sb_type] >= tx_w;
+
+  if (xd->left_available && is_inter_block(xd->left_mbmi))
+    left_hit = block_size_high[xd->left_mbmi->sb_type] >= tx_h;
+
+  return (xd->up_available ? above_hit : 0) +
+         (xd->left_available ? left_hit : 0);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_PRED_COMMON_H_
diff --git a/third_party/aom/av1/common/quant_common.c b/third_party/aom/av1/common/quant_common.c
new file mode 100644
index 0000000000..0e14da7a38
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.c
@@ -0,0 +1,13676 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/entropy.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/blockd.h"
+
+static const int16_t dc_qlookup_Q3[QINDEX_RANGE] = {  // DC table for AOM_BITS_8; av1_dc_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 8, 8, 9, 10, 11, 12, 12, 13, 14, 15, 16, 17, 18,
+ 19, 19, 20, 21, 22, 23, 24, 25, 26, 26, 27, 28, 29, 30,
+ 31, 32, 32, 33, 34, 35, 36, 37, 38, 38, 39, 40, 41, 42,
+ 43, 43, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 53, 53,
+ 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65,
+ 66, 66, 67, 68, 69, 70, 70, 71, 72, 73, 74, 74, 75, 76,
+ 77, 78, 78, 79, 80, 81, 81, 82, 83, 84, 85, 85, 87, 88,
+ 90, 92, 93, 95, 96, 98, 99, 101, 102, 104, 105, 107, 108, 110,
+ 111, 113, 114, 116, 117, 118, 120, 121, 123, 125, 127, 129, 131, 134,
+ 136, 138, 140, 142, 144, 146, 148, 150, 152, 154, 156, 158, 161, 164,
+ 166, 169, 172, 174, 177, 180, 182, 185, 187, 190, 192, 195, 199, 202,
+ 205, 208, 211, 214, 217, 220, 223, 226, 230, 233, 237, 240, 243, 247,
+ 250, 253, 257, 261, 265, 269, 272, 276, 280, 284, 288, 292, 296, 300,
+ 304, 309, 313, 317, 322, 326, 330, 335, 340, 344, 349, 354, 359, 364,
+ 369, 374, 379, 384, 389, 395, 400, 406, 411, 417, 423, 429, 435, 441,
+ 447, 454, 461, 467, 475, 482, 489, 497, 505, 513, 522, 530, 539, 549,
+ 559, 569, 579, 590, 602, 614, 626, 640, 654, 668, 684, 700, 717, 736,
+ 755, 775, 796, 819, 843, 869, 896, 925, 955, 988, 1022, 1058, 1098, 1139,
+ 1184, 1232, 1282, 1336,
+};
+
+static const int16_t dc_qlookup_10_Q3[QINDEX_RANGE] = {  // DC table for AOM_BITS_10; av1_dc_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 9, 10, 13, 15, 17, 20, 22, 25, 28, 31, 34, 37,
+ 40, 43, 47, 50, 53, 57, 60, 64, 68, 71, 75, 78, 82,
+ 86, 90, 93, 97, 101, 105, 109, 113, 116, 120, 124, 128, 132,
+ 136, 140, 143, 147, 151, 155, 159, 163, 166, 170, 174, 178, 182,
+ 185, 189, 193, 197, 200, 204, 208, 212, 215, 219, 223, 226, 230,
+ 233, 237, 241, 244, 248, 251, 255, 259, 262, 266, 269, 273, 276,
+ 280, 283, 287, 290, 293, 297, 300, 304, 307, 310, 314, 317, 321,
+ 324, 327, 331, 334, 337, 343, 350, 356, 362, 369, 375, 381, 387,
+ 394, 400, 406, 412, 418, 424, 430, 436, 442, 448, 454, 460, 466,
+ 472, 478, 484, 490, 499, 507, 516, 525, 533, 542, 550, 559, 567,
+ 576, 584, 592, 601, 609, 617, 625, 634, 644, 655, 666, 676, 687,
+ 698, 708, 718, 729, 739, 749, 759, 770, 782, 795, 807, 819, 831,
+ 844, 856, 868, 880, 891, 906, 920, 933, 947, 961, 975, 988, 1001,
+ 1015, 1030, 1045, 1061, 1076, 1090, 1105, 1120, 1137, 1153, 1170, 1186, 1202,
+ 1218, 1236, 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, 1398, 1416, 1436,
+ 1456, 1476, 1496, 1516, 1537, 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717,
+ 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, 1958, 1990, 2021, 2054, 2088,
+ 2123, 2159, 2197, 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, 2616, 2675,
+ 2737, 2802, 2871, 2944, 3020, 3102, 3188, 3280, 3375, 3478, 3586, 3702, 3823,
+ 3953, 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347,
+};
+
+static const int16_t dc_qlookup_12_Q3[QINDEX_RANGE] = {  // DC table for AOM_BITS_12; av1_dc_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 12, 18, 25, 33, 41, 50, 60, 70, 80, 91,
+ 103, 115, 127, 140, 153, 166, 180, 194, 208, 222, 237,
+ 251, 266, 281, 296, 312, 327, 343, 358, 374, 390, 405,
+ 421, 437, 453, 469, 484, 500, 516, 532, 548, 564, 580,
+ 596, 611, 627, 643, 659, 674, 690, 706, 721, 737, 752,
+ 768, 783, 798, 814, 829, 844, 859, 874, 889, 904, 919,
+ 934, 949, 964, 978, 993, 1008, 1022, 1037, 1051, 1065, 1080,
+ 1094, 1108, 1122, 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234,
+ 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, 1368, 1393, 1419,
+ 1444, 1469, 1494, 1519, 1544, 1569, 1594, 1618, 1643, 1668, 1692,
+ 1717, 1741, 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, 1957,
+ 1992, 2027, 2061, 2096, 2130, 2165, 2199, 2233, 2267, 2300, 2334,
+ 2367, 2400, 2434, 2467, 2499, 2532, 2575, 2618, 2661, 2704, 2746,
+ 2788, 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, 3177, 3226,
+ 3275, 3324, 3373, 3421, 3469, 3517, 3565, 3621, 3677, 3733, 3788,
+ 3843, 3897, 3951, 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420,
+ 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, 5013, 5083, 5153,
+ 5222, 5291, 5367, 5442, 5517, 5591, 5665, 5745, 5825, 5905, 5984,
+ 6063, 6149, 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, 6966,
+ 7064, 7163, 7269, 7376, 7483, 7599, 7715, 7832, 7958, 8085, 8214,
+ 8352, 8492, 8635, 8788, 8945, 9104, 9275, 9450, 9639, 9832, 10031,
+ 10245, 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, 12750, 13118,
+ 13501, 13913, 14343, 14807, 15290, 15812, 16356, 16943, 17575, 18237, 18949,
+ 19718, 20521, 21387,
+};
+
+static const int16_t ac_qlookup_Q3[QINDEX_RANGE] = {  // AC table for AOM_BITS_8; av1_ac_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
+ 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144,
+ 146, 148, 150, 152, 155, 158, 161, 164, 167, 170, 173, 176, 179,
+ 182, 185, 188, 191, 194, 197, 200, 203, 207, 211, 215, 219, 223,
+ 227, 231, 235, 239, 243, 247, 251, 255, 260, 265, 270, 275, 280,
+ 285, 290, 295, 300, 305, 311, 317, 323, 329, 335, 341, 347, 353,
+ 359, 366, 373, 380, 387, 394, 401, 408, 416, 424, 432, 440, 448,
+ 456, 465, 474, 483, 492, 501, 510, 520, 530, 540, 550, 560, 571,
+ 582, 593, 604, 615, 627, 639, 651, 663, 676, 689, 702, 715, 729,
+ 743, 757, 771, 786, 801, 816, 832, 848, 864, 881, 898, 915, 933,
+ 951, 969, 988, 1007, 1026, 1046, 1066, 1087, 1108, 1129, 1151, 1173, 1196,
+ 1219, 1243, 1267, 1292, 1317, 1343, 1369, 1396, 1423, 1451, 1479, 1508, 1537,
+ 1567, 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828,
+};
+
+static const int16_t ac_qlookup_10_Q3[QINDEX_RANGE] = {  // AC table for AOM_BITS_10; av1_ac_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 9, 11, 13, 16, 18, 21, 24, 27, 30, 33, 37, 40,
+ 44, 48, 51, 55, 59, 63, 67, 71, 75, 79, 83, 88, 92,
+ 96, 100, 105, 109, 114, 118, 122, 127, 131, 136, 140, 145, 149,
+ 154, 158, 163, 168, 172, 177, 181, 186, 190, 195, 199, 204, 208,
+ 213, 217, 222, 226, 231, 235, 240, 244, 249, 253, 258, 262, 267,
+ 271, 275, 280, 284, 289, 293, 297, 302, 306, 311, 315, 319, 324,
+ 328, 332, 337, 341, 345, 349, 354, 358, 362, 367, 371, 375, 379,
+ 384, 388, 392, 396, 401, 409, 417, 425, 433, 441, 449, 458, 466,
+ 474, 482, 490, 498, 506, 514, 523, 531, 539, 547, 555, 563, 571,
+ 579, 588, 596, 604, 616, 628, 640, 652, 664, 676, 688, 700, 713,
+ 725, 737, 749, 761, 773, 785, 797, 809, 825, 841, 857, 873, 889,
+ 905, 922, 938, 954, 970, 986, 1002, 1018, 1038, 1058, 1078, 1098, 1118,
+ 1138, 1158, 1178, 1198, 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, 1411,
+ 1435, 1463, 1491, 1519, 1547, 1575, 1603, 1631, 1663, 1695, 1727, 1759, 1791,
+ 1823, 1859, 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, 2199, 2239, 2283,
+ 2327, 2371, 2415, 2459, 2507, 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915,
+ 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, 3455, 3523, 3591, 3659, 3731,
+ 3803, 3876, 3952, 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, 4692, 4784,
+ 4876, 4972, 5068, 5168, 5268, 5372, 5476, 5584, 5692, 5804, 5916, 6032, 6148,
+ 6268, 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312,
+};
+
+static const int16_t ac_qlookup_12_Q3[QINDEX_RANGE] = {  // AC table for AOM_BITS_12; av1_ac_quant_Q3() reads it at clamp(qindex + delta, 0, MAXQ)
+ 4, 13, 19, 27, 35, 44, 54, 64, 75, 87, 99,
+ 112, 126, 139, 154, 168, 183, 199, 214, 230, 247, 263,
+ 280, 297, 314, 331, 349, 366, 384, 402, 420, 438, 456,
+ 475, 493, 511, 530, 548, 567, 586, 604, 623, 642, 660,
+ 679, 698, 716, 735, 753, 772, 791, 809, 828, 846, 865,
+ 884, 902, 920, 939, 957, 976, 994, 1012, 1030, 1049, 1067,
+ 1085, 1103, 1121, 1139, 1157, 1175, 1193, 1211, 1229, 1246, 1264,
+ 1282, 1299, 1317, 1335, 1352, 1370, 1387, 1405, 1422, 1440, 1457,
+ 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, 1627, 1660, 1693,
+ 1725, 1758, 1791, 1824, 1856, 1889, 1922, 1954, 1987, 2020, 2052,
+ 2085, 2118, 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, 2411,
+ 2459, 2508, 2556, 2605, 2653, 2701, 2750, 2798, 2847, 2895, 2943,
+ 2992, 3040, 3088, 3137, 3185, 3234, 3298, 3362, 3426, 3491, 3555,
+ 3619, 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, 4230, 4310,
+ 4390, 4470, 4550, 4631, 4711, 4791, 4871, 4967, 5064, 5160, 5256,
+ 5352, 5448, 5544, 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410,
+ 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, 7579, 7723, 7867,
+ 8011, 8155, 8315, 8475, 8635, 8795, 8956, 9132, 9308, 9484, 9660,
+ 9836, 10028, 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, 11885,
+ 12109, 12333, 12573, 12813, 13053, 13309, 13565, 13821, 14093, 14365, 14637,
+ 14925, 15213, 15502, 15806, 16110, 16414, 16734, 17054, 17390, 17726, 18062,
+ 18414, 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, 21902, 22334,
+ 22766, 23214, 23662, 24126, 24590, 25070, 25551, 26047, 26559, 27071, 27599,
+ 28143, 28687, 29247,
+};
+
+// Coefficient scaling and quantization with AV1 TX are tailored to
+// the AV1 TX transforms. Regardless of the bit-depth of the input,
+// the transform stages scale the coefficient values up by a factor of
+// 8 (3 bits) over the scale of the pixel values. Thus, for 8-bit
+// input, the coefficients have effectively 11 bits of scale depth
+// (8+3), 10-bit input pixels result in 13-bit coefficient depth
+// (10+3) and 12-bit pixels yield 15-bit (12+3) coefficient depth.
+// All quantizers are built using this invariant of x8, 3-bit scaling,
+// thus the Q3 suffix.
+
+// A partial exception to this rule is large transforms; to avoid
+// overflow, TX blocks with > 256 pels (>16x16) are scaled only
+// 4-times unity (2 bits) over the pixel depth, and TX blocks with
+// over 1024 pixels (>32x32) are scaled up only 2x unity (1 bit).
+// This descaling is found via av1_tx_get_scale(). Thus, 16x32, 32x16
+// and 32x32 transforms actually return Q2 coefficients, and 32x64,
+// 64x32 and 64x64 transforms return Q1 coefficients. However, the
+// quantizers are de-scaled down on-the-fly by the same amount
+// (av1_tx_get_scale()) during quantization, and as such the
+// dequantized/decoded coefficients, even for large TX blocks, are always
+// effectively Q3. Meanwhile, quantized/coded coefficients are Q0
+// because Qn quantizers are applied to Qn tx coefficients.
+
+// Note that encoder decision making (which uses the quantizer to
+// generate several bespoke lambdas for RDO and other heuristics)
+// expects quantizers to be larger for higher-bitdepth input. In
+// addition, the minimum allowable quantizer is 4; smaller values will
+// underflow to 0 in the actual quantization routines.
+
+// Returns the DC quantizer for the given bit depth: the entry of the matching
+// dc_qlookup*_Q3 table at index (qindex + delta) clamped to [0, MAXQ].
+// An unsupported bit depth trips the assert and yields -1.
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int16_t *table;
+  switch (bit_depth) {
+    case AOM_BITS_8: table = dc_qlookup_Q3; break;
+    case AOM_BITS_10: table = dc_qlookup_10_Q3; break;
+    case AOM_BITS_12: table = dc_qlookup_12_Q3; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+  return table[clamp(qindex + delta, 0, MAXQ)];
+}
+
+// Returns the AC quantizer for the given bit depth: the entry of the matching
+// ac_qlookup*_Q3 table at index (qindex + delta) clamped to [0, MAXQ].
+// An unsupported bit depth trips the assert and yields -1.
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int16_t *table;
+  switch (bit_depth) {
+    case AOM_BITS_8: table = ac_qlookup_Q3; break;
+    case AOM_BITS_10: table = ac_qlookup_10_Q3; break;
+    case AOM_BITS_12: table = ac_qlookup_12_Q3; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+  return table[clamp(qindex + delta, 0, MAXQ)];
+}
+
+// In AV1 TX, the coefficients are always scaled up a factor of 8 (3
+// bits), so QTX == Q3.
+
+// QTX variant of the DC quantizer lookup; forwards its arguments unchanged to
+// av1_dc_quant_Q3() and returns that result.
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int16_t dc_q3 = av1_dc_quant_Q3(qindex, delta, bit_depth);
+  return dc_q3;
+}
+
+// QTX variant of the AC quantizer lookup; forwards its arguments unchanged to
+// av1_ac_quant_Q3() and returns that result.
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth) {
+  const int16_t ac_q3 = av1_ac_quant_Q3(qindex, delta, bit_depth);
+  return ac_q3;
+}
+
+// Resolves the qindex to use for a segment. When SEG_LVL_ALT_Q is not active
+// for segment_id, base_qindex is returned untouched; otherwise the segment's
+// SEG_LVL_ALT_Q data is added to base_qindex and the sum is clamped to
+// [0, MAXQ].
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+                   int base_qindex) {
+  if (!segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) return base_qindex;
+
+  const int q_delta = get_segdata(seg, segment_id, SEG_LVL_ALT_Q);
+  return clamp(base_qindex + q_delta, 0, MAXQ);
+}
+
+// Returns the matrix pointer stored in cm->giqmatrix for the given QM level,
+// plane and transform size. The stored pointer is handed back as-is, so the
+// result is NULL for entries that av1_qm_init() sets to NULL (the last QM
+// level).
+// The pointer is returned directly instead of as &ptr[0]: that form is
+// ptr + 0, which is pointer arithmetic on a null pointer (undefined behavior
+// in C) whenever the stored entry is NULL.
+const qm_val_t *av1_iqmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+                             TX_SIZE tx_size) {
+  return cm->giqmatrix[qmlevel][plane][tx_size];
+}
+// Returns the matrix pointer stored in cm->gqmatrix for the given QM level,
+// plane and transform size. The stored pointer is handed back as-is, so the
+// result is NULL for entries that av1_qm_init() sets to NULL (the last QM
+// level).
+// The pointer is returned directly instead of as &ptr[0]: that form is
+// ptr + 0, which is pointer arithmetic on a null pointer (undefined behavior
+// in C) whenever the stored entry is NULL.
+const qm_val_t *av1_qmatrix(AV1_COMMON *cm, int qmlevel, int plane,
+                            TX_SIZE tx_size) {
+  return cm->gqmatrix[qmlevel][plane][tx_size];
+}
+
+#define QM_TOTAL_SIZE 3344  // innermost length of the two reference tables; av1_qm_init() asserts its running offset stays within it
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];  // forward declaration; av1_qm_init() points cm->gqmatrix into it
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE];  // forward declaration; initializer follows av1_qm_init() below
+
+// Fills cm->gqmatrix / cm->giqmatrix for every QM level, plane and transform
+// size:
+//  - the last QM level (NUM_QM_LEVELS - 1) gets NULL for every entry;
+//  - a transform size that av1_get_adjusted_tx_size() maps to a different
+//    size reuses the pointers already stored for that adjusted size;
+//  - every other size points at the next tx_size_2d[t] elements of
+//    wt_matrix_ref / iwt_matrix_ref. Plane 0 uses row 0 of those tables and
+//    all other planes use row 1.
+void av1_qm_init(AV1_COMMON *cm) {
+  const int num_planes = av1_num_planes(cm);
+  int level, plane, tx;
+  for (level = 0; level < NUM_QM_LEVELS; ++level) {
+    const int is_last_level = (level == NUM_QM_LEVELS - 1);
+    for (plane = 0; plane < num_planes; ++plane) {
+      const int ref_row = plane >= 1;
+      int offset = 0;  // running position inside the reference row
+      for (tx = 0; tx < TX_SIZES_ALL; ++tx) {
+        const int num_coeffs = tx_size_2d[tx];
+        const int adjusted_tx = av1_get_adjusted_tx_size(tx);
+        if (is_last_level) {
+          cm->gqmatrix[level][plane][tx] = NULL;
+          cm->giqmatrix[level][plane][tx] = NULL;
+          continue;
+        }
+        if (tx != adjusted_tx) {
+          // Share the matrices already assigned to the adjusted size.
+          cm->gqmatrix[level][plane][tx] =
+              cm->gqmatrix[level][plane][adjusted_tx];
+          cm->giqmatrix[level][plane][tx] =
+              cm->giqmatrix[level][plane][adjusted_tx];
+          continue;
+        }
+        assert(offset + num_coeffs <= QM_TOTAL_SIZE);
+        cm->gqmatrix[level][plane][tx] = &wt_matrix_ref[level][ref_row][offset];
+        cm->giqmatrix[level][plane][tx] =
+            &iwt_matrix_ref[level][ref_row][offset];
+        offset += num_coeffs;
+      }
+    }
+  }
+}
+
+/* Provide 16 sets of quantization matrices for chroma and luma
+ and each TX size. Matrices for different TX sizes are in fact
+ sub-sampled from the 32x32 and 16x16 sizes, but explicitly
+ defined here for convenience. Intra and inter matrix sets are the
+ same but changing DEFAULT_QM_INTER_OFFSET from zero allows
+ for different matrices for inter and intra blocks in the same
+ frame.
+ Matrices for different QM levels have been rescaled in the
+ frequency domain according to different nominal viewing
+ distances.
+ */
+static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 43, 73, 97, 43, 67, 94, 110, 73, 94, 137, 150, 97, 110, 150, 200,
+ /* Size 8x8 */
+ 32, 32, 38, 51, 68, 84, 95, 109, 32, 35, 40, 49, 63, 76, 89, 102, 38,
+ 40, 54, 65, 78, 91, 98, 106, 51, 49, 65, 82, 97, 111, 113, 121, 68, 63,
+ 78, 97, 117, 134, 138, 142, 84, 76, 91, 111, 134, 152, 159, 168, 95, 89,
+ 98, 113, 138, 159, 183, 199, 109, 102, 106, 121, 142, 168, 199, 220,
+ /* Size 16x16 */
+ 32, 31, 31, 34, 36, 44, 48, 59, 65, 80, 83, 91, 97, 104, 111, 119, 31,
+ 32, 32, 33, 34, 41, 44, 54, 59, 72, 75, 83, 90, 97, 104, 112, 31, 32,
+ 33, 35, 36, 42, 45, 54, 59, 71, 74, 81, 86, 93, 100, 107, 34, 33, 35,
+ 39, 42, 47, 51, 58, 63, 74, 76, 81, 84, 90, 97, 105, 36, 34, 36, 42, 48,
+ 54, 57, 64, 68, 79, 81, 88, 91, 96, 102, 105, 44, 41, 42, 47, 54, 63,
+ 67, 75, 79, 90, 92, 95, 100, 102, 109, 112, 48, 44, 45, 51, 57, 67, 71,
+ 80, 85, 96, 99, 107, 108, 111, 117, 120, 59, 54, 54, 58, 64, 75, 80, 92,
+ 98, 110, 113, 115, 116, 122, 125, 130, 65, 59, 59, 63, 68, 79, 85, 98,
+ 105, 118, 121, 127, 130, 134, 135, 140, 80, 72, 71, 74, 79, 90, 96, 110,
+ 118, 134, 137, 140, 143, 144, 146, 152, 83, 75, 74, 76, 81, 92, 99, 113,
+ 121, 137, 140, 151, 152, 155, 158, 165, 91, 83, 81, 81, 88, 95, 107,
+ 115, 127, 140, 151, 159, 166, 169, 173, 179, 97, 90, 86, 84, 91, 100,
+ 108, 116, 130, 143, 152, 166, 174, 182, 189, 193, 104, 97, 93, 90, 96,
+ 102, 111, 122, 134, 144, 155, 169, 182, 191, 200, 210, 111, 104, 100,
+ 97, 102, 109, 117, 125, 135, 146, 158, 173, 189, 200, 210, 220, 119,
+ 112, 107, 105, 105, 112, 120, 130, 140, 152, 165, 179, 193, 210, 220,
+ 231,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 34, 35, 36, 39, 44, 46, 48, 54, 59, 62, 65, 71,
+ 80, 81, 83, 88, 91, 94, 97, 101, 104, 107, 111, 115, 119, 123, 31, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 42, 44, 46, 51, 56, 59, 62, 68, 76, 77,
+ 78, 84, 86, 89, 92, 95, 99, 102, 105, 109, 113, 116, 31, 32, 32, 32, 32,
+ 32, 33, 34, 34, 37, 41, 42, 44, 49, 54, 56, 59, 65, 72, 73, 75, 80, 83,
+ 86, 90, 93, 97, 101, 104, 108, 112, 116, 31, 32, 32, 32, 33, 33, 34, 35,
+ 35, 38, 41, 43, 45, 49, 54, 56, 59, 64, 72, 73, 74, 79, 82, 85, 88, 91,
+ 94, 97, 101, 104, 107, 111, 31, 32, 32, 33, 33, 34, 35, 36, 36, 39, 42,
+ 44, 45, 50, 54, 56, 59, 64, 71, 72, 74, 78, 81, 84, 86, 89, 93, 96, 100,
+ 104, 107, 111, 32, 32, 32, 33, 34, 35, 37, 37, 38, 40, 42, 44, 46, 49,
+ 53, 55, 58, 63, 69, 70, 72, 76, 79, 82, 85, 89, 93, 96, 99, 102, 106,
+ 109, 34, 34, 33, 34, 35, 37, 39, 41, 42, 45, 47, 49, 51, 54, 58, 60, 63,
+ 68, 74, 75, 76, 80, 81, 82, 84, 87, 90, 93, 97, 101, 105, 110, 35, 34,
+ 34, 35, 36, 37, 41, 43, 45, 47, 50, 52, 53, 57, 61, 63, 65, 70, 76, 77,
+ 79, 82, 84, 86, 89, 91, 92, 93, 96, 100, 103, 107, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 54, 55, 57, 60, 64, 66, 68, 73, 79, 80, 81, 85, 88,
+ 90, 91, 93, 96, 99, 102, 103, 105, 107, 39, 38, 37, 38, 39, 40, 45, 47,
+ 50, 54, 58, 59, 61, 65, 69, 71, 73, 78, 84, 85, 86, 91, 92, 92, 95, 98,
+ 100, 101, 103, 106, 110, 114, 44, 42, 41, 41, 42, 42, 47, 50, 54, 58,
+ 63, 65, 67, 71, 75, 77, 79, 84, 90, 91, 92, 95, 95, 97, 100, 101, 102,
+ 105, 109, 111, 112, 114, 46, 44, 42, 43, 44, 44, 49, 52, 55, 59, 65, 67,
+ 69, 74, 78, 80, 82, 87, 93, 94, 95, 98, 100, 103, 102, 105, 108, 110,
+ 111, 113, 117, 121, 48, 46, 44, 45, 45, 46, 51, 53, 57, 61, 67, 69, 71,
+ 76, 80, 83, 85, 90, 96, 97, 99, 103, 107, 105, 108, 111, 111, 113, 117,
+ 119, 120, 122, 54, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 74, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 111, 110, 111, 114, 113, 116, 120, 120,
+ 121, 125, 130, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 111, 113, 115, 115, 119, 116, 120, 122, 122, 125,
+ 129, 130, 130, 62, 59, 56, 56, 56, 55, 60, 63, 66, 71, 77, 80, 83, 89,
+ 95, 98, 101, 107, 114, 115, 117, 119, 123, 121, 125, 126, 125, 129, 131,
+ 131, 135, 140, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 127, 128, 130, 130, 134, 133,
+ 135, 140, 140, 140, 71, 68, 65, 64, 64, 63, 68, 70, 73, 78, 84, 87, 90,
+ 97, 103, 107, 111, 117, 125, 126, 128, 134, 132, 136, 133, 138, 137,
+ 140, 143, 142, 145, 150, 80, 76, 72, 72, 71, 69, 74, 76, 79, 84, 90, 93,
+ 96, 104, 110, 114, 118, 125, 134, 135, 137, 139, 140, 139, 143, 142,
+ 144, 146, 146, 151, 152, 151, 81, 77, 73, 73, 72, 70, 75, 77, 80, 85,
+ 91, 94, 97, 105, 111, 115, 119, 126, 135, 137, 138, 144, 147, 146, 148,
+ 149, 151, 150, 156, 155, 157, 163, 83, 78, 75, 74, 74, 72, 76, 79, 81,
+ 86, 92, 95, 99, 106, 113, 117, 121, 128, 137, 138, 140, 147, 151, 156,
+ 152, 157, 155, 161, 158, 162, 165, 164, 88, 84, 80, 79, 78, 76, 80, 82,
+ 85, 91, 95, 98, 103, 111, 115, 119, 126, 134, 139, 144, 147, 152, 154,
+ 158, 163, 159, 165, 163, 168, 168, 169, 176, 91, 86, 83, 82, 81, 79, 81,
+ 84, 88, 92, 95, 100, 107, 110, 115, 123, 127, 132, 140, 147, 151, 154,
+ 159, 161, 166, 171, 169, 173, 173, 176, 179, 177, 94, 89, 86, 85, 84,
+ 82, 82, 86, 90, 92, 97, 103, 105, 111, 119, 121, 128, 136, 139, 146,
+ 156, 158, 161, 166, 168, 174, 179, 178, 180, 183, 183, 190, 97, 92, 90,
+ 88, 86, 85, 84, 89, 91, 95, 100, 102, 108, 114, 116, 125, 130, 133, 143,
+ 148, 152, 163, 166, 168, 174, 176, 182, 187, 189, 188, 193, 191, 101,
+ 95, 93, 91, 89, 89, 87, 91, 93, 98, 101, 105, 111, 113, 120, 126, 130,
+ 138, 142, 149, 157, 159, 171, 174, 176, 183, 184, 191, 195, 199, 197,
+ 204, 104, 99, 97, 94, 93, 93, 90, 92, 96, 100, 102, 108, 111, 116, 122,
+ 125, 134, 137, 144, 151, 155, 165, 169, 179, 182, 184, 191, 193, 200,
+ 204, 210, 206, 107, 102, 101, 97, 96, 96, 93, 93, 99, 101, 105, 110,
+ 113, 120, 122, 129, 133, 140, 146, 150, 161, 163, 173, 178, 187, 191,
+ 193, 200, 202, 210, 214, 222, 111, 105, 104, 101, 100, 99, 97, 96, 102,
+ 103, 109, 111, 117, 120, 125, 131, 135, 143, 146, 156, 158, 168, 173,
+ 180, 189, 195, 200, 202, 210, 212, 220, 224, 115, 109, 108, 104, 104,
+ 102, 101, 100, 103, 106, 111, 113, 119, 121, 129, 131, 140, 142, 151,
+ 155, 162, 168, 176, 183, 188, 199, 204, 210, 212, 220, 222, 230, 119,
+ 113, 112, 107, 107, 106, 105, 103, 105, 110, 112, 117, 120, 125, 130,
+ 135, 140, 145, 152, 157, 165, 169, 179, 183, 193, 197, 210, 214, 220,
+ 222, 231, 232, 123, 116, 116, 111, 111, 109, 110, 107, 107, 114, 114,
+ 121, 122, 130, 130, 140, 140, 150, 151, 163, 164, 176, 177, 190, 191,
+ 204, 206, 222, 224, 230, 232, 242,
+ /* Size 4x8 */
+ 32, 42, 75, 91, 33, 42, 69, 86, 37, 58, 84, 91, 49, 71, 103, 110, 65,
+ 84, 125, 128, 80, 97, 142, 152, 91, 100, 145, 178, 104, 112, 146, 190,
+ /* Size 8x4 */
+ 32, 33, 37, 49, 65, 80, 91, 104, 42, 42, 58, 71, 84, 97, 100, 112, 75,
+ 69, 84, 103, 125, 142, 145, 146, 91, 86, 91, 110, 128, 152, 178, 190,
+ /* Size 8x16 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 33, 34, 49, 59, 78, 86, 93, 32, 34,
+ 36, 50, 59, 77, 82, 89, 34, 37, 42, 54, 63, 79, 80, 88, 36, 38, 48, 60,
+ 68, 84, 86, 90, 44, 43, 53, 71, 79, 95, 94, 97, 48, 46, 56, 76, 85, 102,
+ 105, 105, 58, 54, 63, 87, 98, 116, 112, 115, 65, 58, 68, 92, 105, 124,
+ 122, 124, 79, 70, 79, 104, 118, 141, 135, 135, 82, 72, 81, 106, 121,
+ 144, 149, 146, 91, 80, 88, 106, 130, 148, 162, 159, 97, 86, 94, 107,
+ 128, 157, 167, 171, 103, 93, 98, 114, 131, 150, 174, 186, 110, 100, 101,
+ 117, 138, 161, 183, 193, 118, 107, 105, 118, 136, 157, 182, 203,
+ /* Size 16x8 */
+ 32, 31, 32, 34, 36, 44, 48, 58, 65, 79, 82, 91, 97, 103, 110, 118, 32,
+ 33, 34, 37, 38, 43, 46, 54, 58, 70, 72, 80, 86, 93, 100, 107, 36, 34,
+ 36, 42, 48, 53, 56, 63, 68, 79, 81, 88, 94, 98, 101, 105, 53, 49, 50,
+ 54, 60, 71, 76, 87, 92, 104, 106, 106, 107, 114, 117, 118, 65, 59, 59,
+ 63, 68, 79, 85, 98, 105, 118, 121, 130, 128, 131, 138, 136, 87, 78, 77,
+ 79, 84, 95, 102, 116, 124, 141, 144, 148, 157, 150, 161, 157, 93, 86,
+ 82, 80, 86, 94, 105, 112, 122, 135, 149, 162, 167, 174, 183, 182, 99,
+ 93, 89, 88, 90, 97, 105, 115, 124, 135, 146, 159, 171, 186, 193, 203,
+ /* Size 16x32 */
+ 32, 31, 32, 34, 36, 44, 53, 59, 65, 79, 87, 90, 93, 96, 99, 102, 31, 32,
+ 32, 34, 35, 42, 51, 56, 62, 75, 82, 85, 88, 91, 94, 97, 31, 32, 33, 33,
+ 34, 41, 49, 54, 59, 72, 78, 82, 86, 90, 93, 97, 31, 32, 33, 34, 35, 41,
+ 49, 54, 59, 71, 78, 81, 84, 87, 90, 93, 32, 32, 34, 35, 36, 42, 50, 54,
+ 59, 71, 77, 80, 82, 86, 89, 93, 32, 33, 35, 37, 38, 42, 49, 53, 58, 69,
+ 75, 78, 82, 86, 89, 92, 34, 34, 37, 39, 42, 48, 54, 58, 63, 73, 79, 78,
+ 80, 83, 88, 92, 35, 34, 37, 41, 45, 50, 57, 61, 65, 76, 82, 83, 84, 84,
+ 87, 90, 36, 34, 38, 43, 48, 54, 60, 64, 68, 78, 84, 87, 86, 89, 90, 90,
+ 39, 37, 40, 45, 50, 58, 65, 69, 73, 84, 89, 89, 91, 91, 93, 96, 44, 41,
+ 43, 48, 53, 63, 71, 75, 79, 90, 95, 93, 94, 95, 97, 97, 46, 43, 44, 49,
+ 55, 65, 73, 78, 82, 93, 98, 100, 98, 100, 99, 103, 48, 45, 46, 51, 56,
+ 67, 76, 80, 85, 96, 102, 102, 105, 102, 105, 104, 53, 49, 50, 54, 60,
+ 71, 82, 87, 92, 103, 109, 107, 107, 110, 107, 111, 58, 54, 54, 58, 63,
+ 75, 87, 92, 98, 110, 116, 115, 112, 111, 115, 112, 61, 57, 56, 60, 66,
+ 77, 89, 95, 101, 114, 120, 118, 119, 118, 116, 120, 65, 60, 58, 63, 68,
+ 79, 92, 98, 105, 118, 124, 123, 122, 123, 124, 121, 71, 65, 63, 68, 73,
+ 84, 97, 103, 111, 125, 132, 132, 130, 128, 127, 130, 79, 72, 70, 74, 79,
+ 90, 104, 110, 118, 133, 141, 136, 135, 135, 135, 131, 81, 74, 71, 75,
+ 80, 91, 105, 112, 119, 135, 142, 140, 140, 138, 139, 142, 82, 75, 72,
+ 76, 81, 92, 106, 113, 121, 136, 144, 151, 149, 149, 146, 143, 88, 80,
+ 77, 80, 85, 97, 108, 115, 126, 142, 149, 153, 153, 152, 152, 154, 91,
+ 83, 80, 81, 88, 100, 106, 114, 130, 142, 148, 155, 162, 160, 159, 155,
+ 94, 85, 83, 82, 91, 100, 105, 118, 131, 137, 153, 160, 165, 167, 166,
+ 168, 97, 88, 86, 85, 94, 100, 107, 123, 128, 140, 157, 161, 167, 173,
+ 171, 169, 100, 91, 89, 87, 97, 100, 111, 121, 127, 145, 152, 164, 173,
+ 178, 182, 181, 103, 94, 93, 90, 98, 101, 114, 120, 131, 144, 150, 170,
+ 174, 180, 186, 183, 107, 97, 96, 93, 100, 104, 117, 119, 136, 142, 155,
+ 168, 177, 187, 191, 198, 110, 101, 100, 97, 101, 108, 117, 123, 138,
+ 141, 161, 165, 183, 188, 193, 200, 114, 104, 104, 100, 103, 112, 117,
+ 127, 137, 146, 159, 167, 185, 190, 201, 206, 118, 108, 107, 103, 105,
+ 115, 118, 131, 136, 151, 157, 172, 182, 197, 203, 208, 122, 111, 111,
+ 107, 107, 119, 119, 136, 136, 156, 156, 178, 179, 203, 204, 217,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 31, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 41, 43, 45, 49, 54, 57, 60, 65, 72, 74,
+ 75, 80, 83, 85, 88, 91, 94, 97, 101, 104, 108, 111, 32, 32, 33, 33, 34,
+ 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71, 72, 77, 80,
+ 83, 86, 89, 93, 96, 100, 104, 107, 111, 34, 34, 33, 34, 35, 37, 39, 41,
+ 43, 45, 48, 49, 51, 54, 58, 60, 63, 68, 74, 75, 76, 80, 81, 82, 85, 87,
+ 90, 93, 97, 100, 103, 107, 36, 35, 34, 35, 36, 38, 42, 45, 48, 50, 53,
+ 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88, 91, 94, 97, 98, 100,
+ 101, 103, 105, 107, 44, 42, 41, 41, 42, 42, 48, 50, 54, 58, 63, 65, 67,
+ 71, 75, 77, 79, 84, 90, 91, 92, 97, 100, 100, 100, 100, 101, 104, 108,
+ 112, 115, 119, 53, 51, 49, 49, 50, 49, 54, 57, 60, 65, 71, 73, 76, 82,
+ 87, 89, 92, 97, 104, 105, 106, 108, 106, 105, 107, 111, 114, 117, 117,
+ 117, 118, 119, 59, 56, 54, 54, 54, 53, 58, 61, 64, 69, 75, 78, 80, 87,
+ 92, 95, 98, 103, 110, 112, 113, 115, 114, 118, 123, 121, 120, 119, 123,
+ 127, 131, 136, 65, 62, 59, 59, 59, 58, 63, 65, 68, 73, 79, 82, 85, 92,
+ 98, 101, 105, 111, 118, 119, 121, 126, 130, 131, 128, 127, 131, 136,
+ 138, 137, 136, 136, 79, 75, 72, 71, 71, 69, 73, 76, 78, 84, 90, 93, 96,
+ 103, 110, 114, 118, 125, 133, 135, 136, 142, 142, 137, 140, 145, 144,
+ 142, 141, 146, 151, 156, 87, 82, 78, 78, 77, 75, 79, 82, 84, 89, 95, 98,
+ 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148, 153, 157, 152,
+ 150, 155, 161, 159, 157, 156, 90, 85, 82, 81, 80, 78, 78, 83, 87, 89,
+ 93, 100, 102, 107, 115, 118, 123, 132, 136, 140, 151, 153, 155, 160,
+ 161, 164, 170, 168, 165, 167, 172, 178, 93, 88, 86, 84, 82, 82, 80, 84,
+ 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153, 162,
+ 165, 167, 173, 174, 177, 183, 185, 182, 179, 96, 91, 90, 87, 86, 86, 83,
+ 84, 89, 91, 95, 100, 102, 110, 111, 118, 123, 128, 135, 138, 149, 152,
+ 160, 167, 173, 178, 180, 187, 188, 190, 197, 203, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204, 102, 97, 97, 93,
+ 93, 92, 92, 90, 90, 96, 97, 103, 104, 111, 112, 120, 121, 130, 131, 142,
+ 143, 154, 155, 168, 169, 181, 183, 198, 200, 206, 208, 217,
+ /* Size 4x16 */
+ 31, 44, 79, 96, 32, 41, 72, 90, 32, 42, 71, 86, 34, 48, 73, 83, 34, 54,
+ 78, 89, 41, 63, 90, 95, 45, 67, 96, 102, 54, 75, 110, 111, 60, 79, 118,
+ 123, 72, 90, 133, 135, 75, 92, 136, 149, 83, 100, 142, 160, 88, 100,
+ 140, 173, 94, 101, 144, 180, 101, 108, 141, 188, 108, 115, 151, 197,
+ /* Size 16x4 */
+ 31, 32, 32, 34, 34, 41, 45, 54, 60, 72, 75, 83, 88, 94, 101, 108, 44,
+ 41, 42, 48, 54, 63, 67, 75, 79, 90, 92, 100, 100, 101, 108, 115, 79, 72,
+ 71, 73, 78, 90, 96, 110, 118, 133, 136, 142, 140, 144, 141, 151, 96, 90,
+ 86, 83, 89, 95, 102, 111, 123, 135, 149, 160, 173, 180, 188, 197,
+ /* Size 8x32 */
+ 32, 32, 36, 53, 65, 87, 93, 99, 31, 32, 35, 51, 62, 82, 88, 94, 31, 33,
+ 34, 49, 59, 78, 86, 93, 31, 33, 35, 49, 59, 78, 84, 90, 32, 34, 36, 50,
+ 59, 77, 82, 89, 32, 35, 38, 49, 58, 75, 82, 89, 34, 37, 42, 54, 63, 79,
+ 80, 88, 35, 37, 45, 57, 65, 82, 84, 87, 36, 38, 48, 60, 68, 84, 86, 90,
+ 39, 40, 50, 65, 73, 89, 91, 93, 44, 43, 53, 71, 79, 95, 94, 97, 46, 44,
+ 55, 73, 82, 98, 98, 99, 48, 46, 56, 76, 85, 102, 105, 105, 53, 50, 60,
+ 82, 92, 109, 107, 107, 58, 54, 63, 87, 98, 116, 112, 115, 61, 56, 66,
+ 89, 101, 120, 119, 116, 65, 58, 68, 92, 105, 124, 122, 124, 71, 63, 73,
+ 97, 111, 132, 130, 127, 79, 70, 79, 104, 118, 141, 135, 135, 81, 71, 80,
+ 105, 119, 142, 140, 139, 82, 72, 81, 106, 121, 144, 149, 146, 88, 77,
+ 85, 108, 126, 149, 153, 152, 91, 80, 88, 106, 130, 148, 162, 159, 94,
+ 83, 91, 105, 131, 153, 165, 166, 97, 86, 94, 107, 128, 157, 167, 171,
+ 100, 89, 97, 111, 127, 152, 173, 182, 103, 93, 98, 114, 131, 150, 174,
+ 186, 107, 96, 100, 117, 136, 155, 177, 191, 110, 100, 101, 117, 138,
+ 161, 183, 193, 114, 104, 103, 117, 137, 159, 185, 201, 118, 107, 105,
+ 118, 136, 157, 182, 203, 122, 111, 107, 119, 136, 156, 179, 204,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 34, 35, 36, 39, 44, 46, 48, 53, 58, 61, 65, 71,
+ 79, 81, 82, 88, 91, 94, 97, 100, 103, 107, 110, 114, 118, 122, 32, 32,
+ 33, 33, 34, 35, 37, 37, 38, 40, 43, 44, 46, 50, 54, 56, 58, 63, 70, 71,
+ 72, 77, 80, 83, 86, 89, 93, 96, 100, 104, 107, 111, 36, 35, 34, 35, 36,
+ 38, 42, 45, 48, 50, 53, 55, 56, 60, 63, 66, 68, 73, 79, 80, 81, 85, 88,
+ 91, 94, 97, 98, 100, 101, 103, 105, 107, 53, 51, 49, 49, 50, 49, 54, 57,
+ 60, 65, 71, 73, 76, 82, 87, 89, 92, 97, 104, 105, 106, 108, 106, 105,
+ 107, 111, 114, 117, 117, 117, 118, 119, 65, 62, 59, 59, 59, 58, 63, 65,
+ 68, 73, 79, 82, 85, 92, 98, 101, 105, 111, 118, 119, 121, 126, 130, 131,
+ 128, 127, 131, 136, 138, 137, 136, 136, 87, 82, 78, 78, 77, 75, 79, 82,
+ 84, 89, 95, 98, 102, 109, 116, 120, 124, 132, 141, 142, 144, 149, 148,
+ 153, 157, 152, 150, 155, 161, 159, 157, 156, 93, 88, 86, 84, 82, 82, 80,
+ 84, 86, 91, 94, 98, 105, 107, 112, 119, 122, 130, 135, 140, 149, 153,
+ 162, 165, 167, 173, 174, 177, 183, 185, 182, 179, 99, 94, 93, 90, 89,
+ 89, 88, 87, 90, 93, 97, 99, 105, 107, 115, 116, 124, 127, 135, 139, 146,
+ 152, 159, 166, 171, 182, 186, 191, 193, 201, 203, 204 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 35, 46, 57, 66, 46, 60, 69, 71, 57, 69, 90, 90, 66, 71, 90, 109,
+ /* Size 8x8 */
+ 31, 38, 47, 50, 57, 63, 67, 71, 38, 47, 46, 47, 52, 57, 62, 67, 47, 46,
+ 54, 57, 61, 66, 67, 68, 50, 47, 57, 66, 72, 77, 75, 75, 57, 52, 61, 72,
+ 82, 88, 86, 84, 63, 57, 66, 77, 88, 96, 95, 95, 67, 62, 67, 75, 86, 95,
+ 104, 107, 71, 67, 68, 75, 84, 95, 107, 113,
+ /* Size 16x16 */
+ 32, 30, 33, 41, 49, 49, 50, 54, 57, 63, 65, 68, 70, 72, 74, 76, 30, 32,
+ 35, 42, 46, 45, 46, 49, 52, 57, 58, 62, 64, 67, 70, 72, 33, 35, 39, 45,
+ 47, 45, 46, 49, 51, 56, 57, 60, 62, 64, 66, 69, 41, 42, 45, 48, 50, 49,
+ 50, 52, 53, 57, 58, 59, 60, 61, 64, 67, 49, 46, 47, 50, 53, 53, 54, 55,
+ 56, 60, 61, 64, 64, 65, 66, 66, 49, 45, 45, 49, 53, 58, 60, 62, 63, 67,
+ 68, 67, 69, 68, 70, 70, 50, 46, 46, 50, 54, 60, 61, 65, 67, 71, 71, 74,
+ 73, 73, 74, 74, 54, 49, 49, 52, 55, 62, 65, 71, 73, 78, 79, 78, 77, 78,
+ 78, 78, 57, 52, 51, 53, 56, 63, 67, 73, 76, 82, 83, 84, 84, 84, 82, 83,
+ 63, 57, 56, 57, 60, 67, 71, 78, 82, 89, 90, 90, 89, 88, 87, 88, 65, 58,
+ 57, 58, 61, 68, 71, 79, 83, 90, 91, 94, 93, 93, 92, 93, 68, 62, 60, 59,
+ 64, 67, 74, 78, 84, 90, 94, 98, 99, 98, 98, 98, 70, 64, 62, 60, 64, 69,
+ 73, 77, 84, 89, 93, 99, 102, 103, 104, 104, 72, 67, 64, 61, 65, 68, 73,
+ 78, 84, 88, 93, 98, 103, 106, 108, 109, 74, 70, 66, 64, 66, 70, 74, 78,
+ 82, 87, 92, 98, 104, 108, 111, 112, 76, 72, 69, 67, 66, 70, 74, 78, 83,
+ 88, 93, 98, 104, 109, 112, 116,
+ /* Size 32x32 */
+ 32, 31, 30, 32, 33, 36, 41, 45, 49, 48, 49, 50, 50, 52, 54, 56, 57, 60,
+ 63, 64, 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 31, 31, 31, 33,
+ 34, 38, 42, 45, 47, 47, 47, 47, 48, 50, 52, 53, 54, 57, 60, 61, 61, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 30, 31, 32, 33, 35, 40, 42, 44,
+ 46, 45, 45, 45, 46, 47, 49, 51, 52, 54, 57, 58, 58, 61, 62, 63, 64, 66,
+ 67, 68, 70, 71, 72, 74, 32, 33, 33, 35, 37, 41, 43, 45, 47, 46, 45, 46,
+ 46, 47, 49, 50, 51, 54, 57, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 33, 34, 35, 37, 39, 43, 45, 46, 47, 46, 45, 46, 46, 47, 49, 50,
+ 51, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 68, 69, 70, 36, 38,
+ 40, 41, 43, 47, 47, 47, 48, 46, 45, 46, 46, 47, 48, 49, 50, 52, 54, 55,
+ 55, 57, 58, 59, 61, 62, 64, 65, 66, 67, 68, 69, 41, 42, 42, 43, 45, 47,
+ 48, 49, 50, 49, 49, 49, 50, 50, 52, 52, 53, 55, 57, 58, 58, 60, 59, 59,
+ 60, 61, 61, 63, 64, 66, 67, 69, 45, 45, 44, 45, 46, 47, 49, 50, 51, 51,
+ 51, 51, 52, 52, 53, 54, 55, 57, 59, 59, 60, 61, 61, 62, 63, 63, 63, 63,
+ 63, 64, 65, 66, 49, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 54, 54, 54,
+ 55, 56, 56, 58, 60, 61, 61, 63, 64, 64, 64, 64, 65, 66, 66, 66, 66, 66,
+ 48, 47, 45, 46, 46, 46, 49, 51, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61,
+ 63, 64, 64, 66, 66, 65, 66, 67, 67, 67, 67, 68, 69, 70, 49, 47, 45, 45,
+ 45, 45, 49, 51, 53, 55, 58, 59, 60, 61, 62, 63, 63, 65, 67, 67, 68, 69,
+ 67, 68, 69, 68, 68, 69, 70, 70, 70, 70, 50, 47, 45, 46, 46, 46, 49, 51,
+ 54, 56, 59, 60, 60, 62, 64, 64, 65, 67, 69, 69, 70, 70, 71, 71, 70, 70,
+ 71, 71, 71, 71, 72, 74, 50, 48, 46, 46, 46, 46, 50, 52, 54, 56, 60, 60,
+ 61, 63, 65, 66, 67, 68, 71, 71, 71, 73, 74, 72, 73, 74, 73, 73, 74, 74,
+ 74, 74, 52, 50, 47, 47, 47, 47, 50, 52, 54, 57, 61, 62, 63, 66, 68, 69,
+ 70, 72, 75, 75, 75, 77, 75, 75, 76, 75, 75, 76, 75, 75, 76, 77, 54, 52,
+ 49, 49, 49, 48, 52, 53, 55, 58, 62, 64, 65, 68, 71, 72, 73, 75, 78, 78,
+ 79, 79, 78, 79, 77, 78, 78, 77, 78, 79, 78, 78, 56, 53, 51, 50, 50, 49,
+ 52, 54, 56, 59, 63, 64, 66, 69, 72, 73, 75, 77, 80, 80, 81, 81, 82, 80,
+ 81, 81, 79, 81, 80, 79, 81, 82, 57, 54, 52, 51, 51, 50, 53, 55, 56, 60,
+ 63, 65, 67, 70, 73, 75, 76, 79, 82, 82, 83, 85, 84, 83, 84, 83, 84, 82,
+ 82, 84, 83, 82, 60, 57, 54, 54, 53, 52, 55, 57, 58, 61, 65, 67, 68, 72,
+ 75, 77, 79, 82, 85, 85, 86, 88, 86, 87, 85, 86, 85, 85, 86, 84, 85, 86,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 63, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 90, 90, 89, 89, 88, 88, 88, 87, 88, 88, 87, 64, 61, 58, 57,
+ 57, 55, 58, 59, 61, 64, 67, 69, 71, 75, 78, 80, 82, 85, 89, 90, 91, 92,
+ 93, 92, 92, 91, 91, 90, 91, 90, 90, 92, 65, 61, 58, 58, 57, 55, 58, 60,
+ 61, 64, 68, 70, 71, 75, 79, 81, 83, 86, 90, 91, 91, 94, 94, 96, 93, 94,
+ 93, 94, 92, 93, 93, 92, 67, 63, 61, 60, 59, 57, 60, 61, 63, 66, 69, 70,
+ 73, 77, 79, 81, 85, 88, 90, 92, 94, 96, 96, 97, 98, 95, 97, 95, 96, 95,
+ 95, 96, 68, 64, 62, 61, 60, 58, 59, 61, 64, 66, 67, 71, 74, 75, 78, 82,
+ 84, 86, 90, 93, 94, 96, 98, 98, 99, 100, 98, 99, 98, 98, 98, 97, 69, 65,
+ 63, 62, 61, 59, 59, 62, 64, 65, 68, 71, 72, 75, 79, 80, 83, 87, 89, 92,
+ 96, 97, 98, 100, 100, 101, 102, 101, 101, 101, 100, 102, 70, 66, 64, 63,
+ 62, 61, 60, 63, 64, 66, 69, 70, 73, 76, 77, 81, 84, 85, 89, 92, 93, 98,
+ 99, 100, 102, 102, 103, 104, 104, 103, 104, 102, 71, 67, 66, 64, 63, 62,
+ 61, 63, 64, 67, 68, 70, 74, 75, 78, 81, 83, 86, 88, 91, 94, 95, 100,
+ 101, 102, 104, 104, 105, 106, 107, 105, 107, 72, 68, 67, 65, 64, 64, 61,
+ 63, 65, 67, 68, 71, 73, 75, 78, 79, 84, 85, 88, 91, 93, 97, 98, 102,
+ 103, 104, 106, 106, 108, 108, 109, 107, 73, 69, 68, 66, 65, 65, 63, 63,
+ 66, 67, 69, 71, 73, 76, 77, 81, 82, 85, 88, 90, 94, 95, 99, 101, 104,
+ 105, 106, 109, 108, 110, 111, 112, 74, 70, 70, 67, 66, 66, 64, 63, 66,
+ 67, 70, 71, 74, 75, 78, 80, 82, 86, 87, 91, 92, 96, 98, 101, 104, 106,
+ 108, 108, 111, 111, 112, 113, 75, 71, 71, 68, 68, 67, 66, 64, 66, 68,
+ 70, 71, 74, 75, 79, 79, 84, 84, 88, 90, 93, 95, 98, 101, 103, 107, 108,
+ 110, 111, 113, 113, 115, 76, 72, 72, 69, 69, 68, 67, 65, 66, 69, 70, 72,
+ 74, 76, 78, 81, 83, 85, 88, 90, 93, 95, 98, 100, 104, 105, 109, 111,
+ 112, 113, 116, 115, 78, 74, 74, 70, 70, 69, 69, 66, 66, 70, 70, 74, 74,
+ 77, 78, 82, 82, 86, 87, 92, 92, 96, 97, 102, 102, 107, 107, 112, 113,
+ 115, 115, 118,
+ /* Size 4x8 */
+ 31, 47, 60, 66, 40, 45, 54, 61, 46, 56, 64, 64, 48, 61, 75, 73, 54, 65,
+ 85, 82, 61, 69, 92, 92, 64, 68, 90, 102, 68, 71, 87, 105,
+ /* Size 8x4 */
+ 31, 40, 46, 48, 54, 61, 64, 68, 47, 45, 56, 61, 65, 69, 68, 71, 60, 54,
+ 64, 75, 85, 92, 90, 87, 66, 61, 64, 73, 82, 92, 102, 105,
+ /* Size 8x16 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 30, 40, 46, 48, 52, 60, 63, 66, 33, 43,
+ 47, 47, 51, 59, 60, 63, 42, 47, 50, 50, 53, 60, 59, 62, 49, 48, 53, 54,
+ 57, 62, 62, 62, 49, 46, 53, 61, 64, 69, 66, 66, 50, 46, 54, 64, 67, 73,
+ 72, 70, 54, 49, 55, 68, 73, 80, 76, 75, 57, 50, 56, 70, 76, 84, 80, 79,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 56, 61, 75, 83, 93, 93, 89, 68, 59,
+ 64, 74, 86, 94, 98, 94, 70, 62, 66, 73, 83, 96, 99, 98, 72, 64, 66, 75,
+ 83, 92, 101, 104, 74, 67, 66, 74, 84, 94, 103, 106, 76, 69, 67, 73, 82,
+ 91, 101, 109,
+ /* Size 16x8 */
+ 32, 30, 33, 42, 49, 49, 50, 54, 57, 63, 64, 68, 70, 72, 74, 76, 37, 40,
+ 43, 47, 48, 46, 46, 49, 50, 55, 56, 59, 62, 64, 67, 69, 48, 46, 47, 50,
+ 53, 53, 54, 55, 56, 60, 61, 64, 66, 66, 66, 67, 52, 48, 47, 50, 54, 61,
+ 64, 68, 70, 75, 75, 74, 73, 75, 74, 73, 57, 52, 51, 53, 57, 64, 67, 73,
+ 76, 82, 83, 86, 83, 83, 84, 82, 66, 60, 59, 60, 62, 69, 73, 80, 84, 92,
+ 93, 94, 96, 92, 94, 91, 68, 63, 60, 59, 62, 66, 72, 76, 80, 87, 93, 98,
+ 99, 101, 103, 101, 71, 66, 63, 62, 62, 66, 70, 75, 79, 84, 89, 94, 98,
+ 104, 106, 109,
+ /* Size 16x32 */
+ 32, 31, 37, 42, 48, 49, 52, 54, 57, 63, 66, 67, 68, 69, 71, 72, 31, 31,
+ 38, 42, 47, 47, 50, 52, 54, 60, 63, 64, 65, 66, 67, 68, 30, 32, 40, 42,
+ 46, 45, 48, 50, 52, 57, 60, 62, 63, 65, 66, 68, 32, 34, 41, 44, 46, 45,
+ 48, 49, 51, 57, 59, 61, 62, 63, 64, 65, 33, 36, 43, 45, 47, 46, 47, 49,
+ 51, 56, 59, 60, 60, 62, 63, 65, 37, 40, 47, 47, 47, 45, 47, 48, 50, 54,
+ 57, 58, 60, 61, 62, 63, 42, 43, 47, 48, 50, 49, 50, 52, 53, 57, 60, 58,
+ 59, 60, 62, 63, 45, 44, 47, 49, 51, 51, 52, 54, 55, 59, 61, 61, 61, 60,
+ 61, 61, 49, 46, 48, 50, 53, 53, 54, 55, 57, 60, 62, 63, 62, 63, 62, 62,
+ 48, 46, 47, 50, 53, 56, 57, 59, 60, 64, 66, 65, 65, 64, 64, 65, 49, 45,
+ 46, 49, 53, 58, 61, 62, 64, 67, 69, 67, 66, 66, 66, 65, 49, 46, 46, 49,
+ 53, 59, 62, 64, 65, 69, 71, 70, 68, 68, 67, 68, 50, 46, 46, 50, 54, 59,
+ 64, 65, 67, 71, 73, 72, 72, 70, 70, 69, 52, 48, 47, 50, 54, 61, 66, 68,
+ 71, 75, 77, 74, 73, 73, 71, 72, 54, 50, 49, 52, 55, 62, 68, 71, 73, 78,
+ 80, 78, 76, 74, 75, 73, 55, 51, 49, 52, 56, 63, 69, 72, 75, 80, 82, 80,
+ 79, 78, 76, 77, 57, 52, 50, 53, 56, 64, 70, 73, 76, 82, 84, 82, 80, 80,
+ 79, 77, 60, 54, 52, 55, 58, 65, 72, 75, 79, 85, 88, 86, 84, 82, 81, 81,
+ 63, 57, 55, 58, 60, 67, 75, 78, 82, 89, 92, 88, 87, 85, 84, 81, 64, 58,
+ 55, 58, 61, 68, 75, 78, 82, 89, 92, 90, 89, 87, 86, 86, 64, 59, 56, 58,
+ 61, 68, 75, 79, 83, 90, 93, 95, 93, 91, 89, 87, 67, 61, 58, 60, 63, 69,
+ 76, 79, 85, 92, 95, 96, 94, 92, 91, 91, 68, 62, 59, 60, 64, 71, 74, 78,
+ 86, 91, 94, 96, 98, 96, 94, 91, 69, 62, 60, 60, 65, 70, 72, 79, 85, 88,
+ 95, 98, 99, 98, 97, 96, 70, 63, 62, 60, 66, 69, 73, 81, 83, 89, 96, 97,
+ 99, 101, 98, 97, 71, 64, 63, 61, 67, 68, 74, 79, 82, 90, 93, 98, 102,
+ 102, 102, 101, 72, 65, 64, 62, 66, 68, 75, 78, 83, 89, 92, 100, 101,
+ 103, 104, 102, 73, 66, 65, 63, 66, 69, 75, 76, 84, 87, 93, 98, 102, 105,
+ 106, 107, 74, 67, 67, 64, 66, 70, 74, 77, 84, 86, 94, 96, 103, 105, 106,
+ 107, 75, 68, 68, 65, 66, 71, 74, 78, 83, 87, 93, 96, 103, 105, 109, 109,
+ 76, 69, 69, 66, 67, 72, 73, 80, 82, 88, 91, 97, 101, 107, 109, 110, 77,
+ 70, 70, 67, 67, 73, 73, 81, 81, 90, 90, 99, 99, 108, 108, 113,
+ /* Size 32x16 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 31, 31, 32, 34,
+ 36, 40, 43, 44, 46, 46, 45, 46, 46, 48, 50, 51, 52, 54, 57, 58, 59, 61,
+ 62, 62, 63, 64, 65, 66, 67, 68, 69, 70, 37, 38, 40, 41, 43, 47, 47, 47,
+ 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58, 59, 60, 62, 63,
+ 64, 65, 67, 68, 69, 70, 42, 42, 42, 44, 45, 47, 48, 49, 50, 50, 49, 49,
+ 50, 50, 52, 52, 53, 55, 58, 58, 58, 60, 60, 60, 60, 61, 62, 63, 64, 65,
+ 66, 67, 48, 47, 46, 46, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 55, 56,
+ 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 66, 66, 66, 66, 67, 67, 49, 47,
+ 45, 45, 46, 45, 49, 51, 53, 56, 58, 59, 59, 61, 62, 63, 64, 65, 67, 68,
+ 68, 69, 71, 70, 69, 68, 68, 69, 70, 71, 72, 73, 52, 50, 48, 48, 47, 47,
+ 50, 52, 54, 57, 61, 62, 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72,
+ 73, 74, 75, 75, 74, 74, 73, 73, 54, 52, 50, 49, 49, 48, 52, 54, 55, 59,
+ 62, 64, 65, 68, 71, 72, 73, 75, 78, 78, 79, 79, 78, 79, 81, 79, 78, 76,
+ 77, 78, 80, 81, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71,
+ 73, 75, 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81,
+ 63, 60, 57, 57, 56, 54, 57, 59, 60, 64, 67, 69, 71, 75, 78, 80, 82, 85,
+ 89, 89, 90, 92, 91, 88, 89, 90, 89, 87, 86, 87, 88, 90, 66, 63, 60, 59,
+ 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92, 93, 95,
+ 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 67, 64, 62, 61, 60, 58, 58, 61,
+ 63, 65, 67, 70, 72, 74, 78, 80, 82, 86, 88, 90, 95, 96, 96, 98, 97, 98,
+ 100, 98, 96, 96, 97, 99, 68, 65, 63, 62, 60, 60, 59, 61, 62, 65, 66, 68,
+ 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99, 99, 102, 101, 102, 103,
+ 103, 101, 99, 69, 66, 65, 63, 62, 61, 60, 60, 63, 64, 66, 68, 70, 73,
+ 74, 78, 80, 82, 85, 87, 91, 92, 96, 98, 101, 102, 103, 105, 105, 105,
+ 107, 108, 71, 67, 66, 64, 63, 62, 62, 61, 62, 64, 66, 67, 70, 71, 75,
+ 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102, 104, 106, 106, 109, 109,
+ 108, 72, 68, 68, 65, 65, 63, 63, 61, 62, 65, 65, 68, 69, 72, 73, 77, 77,
+ 81, 81, 86, 87, 91, 91, 96, 97, 101, 102, 107, 107, 109, 110, 113,
+ /* Size 4x16 */
+ 31, 49, 63, 69, 32, 45, 57, 65, 36, 46, 56, 62, 43, 49, 57, 60, 46, 53,
+ 60, 63, 45, 58, 67, 66, 46, 59, 71, 70, 50, 62, 78, 74, 52, 64, 82, 80,
+ 57, 67, 89, 85, 59, 68, 90, 91, 62, 71, 91, 96, 63, 69, 89, 101, 65, 68,
+ 89, 103, 67, 70, 86, 105, 69, 72, 88, 107,
+ /* Size 16x4 */
+ 31, 32, 36, 43, 46, 45, 46, 50, 52, 57, 59, 62, 63, 65, 67, 69, 49, 45,
+ 46, 49, 53, 58, 59, 62, 64, 67, 68, 71, 69, 68, 70, 72, 63, 57, 56, 57,
+ 60, 67, 71, 78, 82, 89, 90, 91, 89, 89, 86, 88, 69, 65, 62, 60, 63, 66,
+ 70, 74, 80, 85, 91, 96, 101, 103, 105, 107,
+ /* Size 8x32 */
+ 32, 37, 48, 52, 57, 66, 68, 71, 31, 38, 47, 50, 54, 63, 65, 67, 30, 40,
+ 46, 48, 52, 60, 63, 66, 32, 41, 46, 48, 51, 59, 62, 64, 33, 43, 47, 47,
+ 51, 59, 60, 63, 37, 47, 47, 47, 50, 57, 60, 62, 42, 47, 50, 50, 53, 60,
+ 59, 62, 45, 47, 51, 52, 55, 61, 61, 61, 49, 48, 53, 54, 57, 62, 62, 62,
+ 48, 47, 53, 57, 60, 66, 65, 64, 49, 46, 53, 61, 64, 69, 66, 66, 49, 46,
+ 53, 62, 65, 71, 68, 67, 50, 46, 54, 64, 67, 73, 72, 70, 52, 47, 54, 66,
+ 71, 77, 73, 71, 54, 49, 55, 68, 73, 80, 76, 75, 55, 49, 56, 69, 75, 82,
+ 79, 76, 57, 50, 56, 70, 76, 84, 80, 79, 60, 52, 58, 72, 79, 88, 84, 81,
+ 63, 55, 60, 75, 82, 92, 87, 84, 64, 55, 61, 75, 82, 92, 89, 86, 64, 56,
+ 61, 75, 83, 93, 93, 89, 67, 58, 63, 76, 85, 95, 94, 91, 68, 59, 64, 74,
+ 86, 94, 98, 94, 69, 60, 65, 72, 85, 95, 99, 97, 70, 62, 66, 73, 83, 96,
+ 99, 98, 71, 63, 67, 74, 82, 93, 102, 102, 72, 64, 66, 75, 83, 92, 101,
+ 104, 73, 65, 66, 75, 84, 93, 102, 106, 74, 67, 66, 74, 84, 94, 103, 106,
+ 75, 68, 66, 74, 83, 93, 103, 109, 76, 69, 67, 73, 82, 91, 101, 109, 77,
+ 70, 67, 73, 81, 90, 99, 108,
+ /* Size 32x8 */
+ 32, 31, 30, 32, 33, 37, 42, 45, 49, 48, 49, 49, 50, 52, 54, 55, 57, 60,
+ 63, 64, 64, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 37, 38, 40, 41,
+ 43, 47, 47, 47, 48, 47, 46, 46, 46, 47, 49, 49, 50, 52, 55, 55, 56, 58,
+ 59, 60, 62, 63, 64, 65, 67, 68, 69, 70, 48, 47, 46, 46, 47, 47, 50, 51,
+ 53, 53, 53, 53, 54, 54, 55, 56, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67,
+ 66, 66, 66, 66, 67, 67, 52, 50, 48, 48, 47, 47, 50, 52, 54, 57, 61, 62,
+ 64, 66, 68, 69, 70, 72, 75, 75, 75, 76, 74, 72, 73, 74, 75, 75, 74, 74,
+ 73, 73, 57, 54, 52, 51, 51, 50, 53, 55, 57, 60, 64, 65, 67, 71, 73, 75,
+ 76, 79, 82, 82, 83, 85, 86, 85, 83, 82, 83, 84, 84, 83, 82, 81, 66, 63,
+ 60, 59, 59, 57, 60, 61, 62, 66, 69, 71, 73, 77, 80, 82, 84, 88, 92, 92,
+ 93, 95, 94, 95, 96, 93, 92, 93, 94, 93, 91, 90, 68, 65, 63, 62, 60, 60,
+ 59, 61, 62, 65, 66, 68, 72, 73, 76, 79, 80, 84, 87, 89, 93, 94, 98, 99,
+ 99, 102, 101, 102, 103, 103, 101, 99, 71, 67, 66, 64, 63, 62, 62, 61,
+ 62, 64, 66, 67, 70, 71, 75, 76, 79, 81, 84, 86, 89, 91, 94, 97, 98, 102,
+ 104, 106, 106, 109, 109, 108 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 41, 69, 92, 41, 63, 88, 103, 69, 88, 127, 140, 92, 103, 140, 184,
+ /* Size 8x8 */
+ 32, 32, 37, 47, 62, 78, 90, 102, 32, 35, 39, 46, 58, 72, 84, 96, 37, 39,
+ 51, 60, 71, 84, 93, 100, 47, 46, 60, 73, 87, 100, 106, 113, 62, 58, 71,
+ 87, 105, 121, 129, 132, 78, 72, 84, 100, 121, 140, 148, 155, 90, 84, 93,
+ 106, 129, 148, 169, 183, 102, 96, 100, 113, 132, 155, 183, 201,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 36, 39, 47, 54, 61, 71, 80, 86, 92, 98, 104, 111, 31,
+ 32, 32, 33, 34, 37, 44, 50, 56, 65, 73, 79, 85, 91, 98, 105, 31, 32, 33,
+ 34, 36, 39, 45, 50, 56, 64, 71, 77, 82, 88, 94, 100, 32, 33, 34, 36, 40,
+ 42, 47, 51, 57, 65, 71, 76, 80, 85, 91, 98, 36, 34, 36, 40, 48, 50, 56,
+ 60, 65, 73, 79, 84, 86, 90, 95, 98, 39, 37, 39, 42, 50, 54, 60, 65, 70,
+ 78, 84, 89, 95, 96, 102, 105, 47, 44, 45, 47, 56, 60, 69, 75, 81, 89,
+ 95, 100, 102, 104, 109, 112, 54, 50, 50, 51, 60, 65, 75, 82, 89, 97,
+ 104, 109, 110, 114, 117, 121, 61, 56, 56, 57, 65, 70, 81, 89, 97, 106,
+ 113, 119, 122, 126, 125, 130, 71, 65, 64, 65, 73, 78, 89, 97, 106, 117,
+ 125, 131, 134, 134, 136, 141, 80, 73, 71, 71, 79, 84, 95, 104, 113, 125,
+ 134, 140, 142, 145, 146, 152, 86, 79, 77, 76, 84, 89, 100, 109, 119,
+ 131, 140, 147, 154, 157, 160, 165, 92, 85, 82, 80, 86, 95, 102, 110,
+ 122, 134, 142, 154, 162, 168, 174, 178, 98, 91, 88, 85, 90, 96, 104,
+ 114, 126, 134, 145, 157, 168, 176, 184, 193, 104, 98, 94, 91, 95, 102,
+ 109, 117, 125, 136, 146, 160, 174, 184, 193, 201, 111, 105, 100, 98, 98,
+ 105, 112, 121, 130, 141, 152, 165, 178, 193, 201, 210,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 61, 65,
+ 71, 76, 80, 83, 86, 89, 92, 95, 98, 101, 104, 108, 111, 114, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 58, 62, 68, 72, 76,
+ 78, 82, 85, 88, 90, 93, 96, 99, 102, 105, 109, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 56, 60, 65, 70, 73, 76, 79, 82,
+ 85, 88, 91, 95, 98, 101, 105, 109, 31, 32, 32, 32, 32, 33, 33, 34, 35,
+ 36, 38, 41, 44, 45, 49, 54, 56, 59, 65, 69, 72, 75, 78, 81, 84, 86, 89,
+ 92, 95, 98, 101, 104, 31, 32, 32, 32, 33, 34, 34, 35, 36, 38, 39, 42,
+ 45, 46, 50, 54, 56, 59, 64, 68, 71, 74, 77, 79, 82, 85, 88, 91, 94, 97,
+ 100, 104, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 45, 46, 49,
+ 53, 55, 58, 63, 66, 69, 72, 74, 78, 81, 84, 87, 90, 93, 96, 99, 102, 32,
+ 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44, 47, 48, 51, 55, 57, 60, 65,
+ 68, 71, 73, 76, 78, 80, 82, 85, 88, 91, 95, 98, 102, 34, 34, 33, 34, 35,
+ 37, 38, 39, 42, 44, 45, 47, 50, 51, 54, 58, 60, 63, 68, 71, 74, 76, 79,
+ 82, 85, 86, 87, 88, 90, 93, 96, 99, 36, 35, 34, 35, 36, 38, 40, 42, 48,
+ 50, 50, 54, 56, 57, 60, 64, 65, 68, 73, 76, 79, 81, 84, 86, 86, 88, 90,
+ 93, 95, 97, 98, 100, 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58,
+ 60, 63, 67, 68, 71, 76, 79, 82, 84, 87, 87, 90, 93, 94, 95, 96, 100,
+ 103, 106, 39, 38, 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65,
+ 69, 70, 73, 78, 81, 84, 86, 89, 92, 95, 95, 96, 99, 102, 104, 105, 106,
+ 44, 42, 41, 41, 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 77, 79,
+ 84, 88, 90, 92, 95, 97, 97, 99, 102, 103, 103, 106, 109, 113, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 100, 102, 105, 104, 106, 109, 111, 112, 113, 49, 47, 46,
+ 45, 46, 46, 48, 51, 57, 60, 62, 68, 71, 73, 77, 81, 83, 87, 92, 95, 98,
+ 100, 103, 105, 107, 106, 109, 112, 112, 113, 117, 120, 54, 51, 50, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 110, 113, 114, 114, 117, 121, 121, 121, 59, 56, 54, 54,
+ 54, 53, 55, 58, 64, 67, 69, 75, 79, 81, 87, 92, 94, 98, 103, 107, 110,
+ 113, 116, 114, 117, 118, 117, 121, 122, 122, 125, 129, 61, 58, 56, 56,
+ 56, 55, 57, 60, 65, 68, 70, 77, 81, 83, 89, 94, 97, 101, 106, 110, 113,
+ 116, 119, 120, 122, 121, 126, 124, 125, 130, 130, 130, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 87, 92, 98, 101, 105, 111, 115, 118,
+ 121, 124, 128, 125, 129, 128, 131, 133, 132, 135, 139, 71, 68, 65, 65,
+ 64, 63, 65, 68, 73, 76, 78, 84, 89, 92, 97, 103, 106, 111, 117, 122,
+ 125, 128, 131, 131, 134, 132, 134, 136, 136, 140, 141, 140, 76, 72, 70,
+ 69, 68, 66, 68, 71, 76, 79, 81, 88, 92, 95, 101, 107, 110, 115, 122,
+ 127, 130, 133, 136, 136, 138, 139, 141, 140, 145, 143, 146, 151, 80, 76,
+ 73, 72, 71, 69, 71, 74, 79, 82, 84, 90, 95, 98, 104, 110, 113, 118, 125,
+ 130, 134, 137, 140, 146, 142, 146, 145, 149, 146, 150, 152, 151, 83, 78,
+ 76, 75, 74, 72, 73, 76, 81, 84, 86, 92, 97, 100, 106, 113, 116, 121,
+ 128, 133, 137, 140, 144, 147, 152, 148, 154, 151, 156, 155, 156, 162,
+ 86, 82, 79, 78, 77, 74, 76, 79, 84, 87, 89, 95, 100, 103, 109, 116, 119,
+ 124, 131, 136, 140, 144, 147, 150, 154, 159, 157, 160, 160, 162, 165,
+ 162, 89, 85, 82, 81, 79, 78, 78, 82, 86, 87, 92, 97, 100, 105, 112, 114,
+ 120, 128, 131, 136, 146, 147, 150, 155, 156, 161, 166, 165, 167, 169,
+ 169, 175, 92, 88, 85, 84, 82, 81, 80, 85, 86, 90, 95, 97, 102, 107, 110,
+ 117, 122, 125, 134, 138, 142, 152, 154, 156, 162, 163, 168, 173, 174,
+ 174, 178, 176, 95, 90, 88, 86, 85, 84, 82, 86, 88, 93, 95, 99, 105, 106,
+ 113, 118, 121, 129, 132, 139, 146, 148, 159, 161, 163, 169, 170, 176,
+ 180, 183, 181, 187, 98, 93, 91, 89, 88, 87, 85, 87, 90, 94, 96, 102,
+ 104, 109, 114, 117, 126, 128, 134, 141, 145, 154, 157, 166, 168, 170,
+ 176, 178, 184, 188, 193, 188, 101, 96, 95, 92, 91, 90, 88, 88, 93, 95,
+ 99, 103, 106, 112, 114, 121, 124, 131, 136, 140, 149, 151, 160, 165,
+ 173, 176, 178, 184, 186, 192, 196, 203, 104, 99, 98, 95, 94, 93, 91, 90,
+ 95, 96, 102, 103, 109, 112, 117, 122, 125, 133, 136, 145, 146, 156, 160,
+ 167, 174, 180, 184, 186, 193, 194, 201, 204, 108, 102, 101, 98, 97, 96,
+ 95, 93, 97, 100, 104, 106, 111, 113, 121, 122, 130, 132, 140, 143, 150,
+ 155, 162, 169, 174, 183, 188, 192, 194, 201, 202, 210, 111, 105, 105,
+ 101, 100, 99, 98, 96, 98, 103, 105, 109, 112, 117, 121, 125, 130, 135,
+ 141, 146, 152, 156, 165, 169, 178, 181, 193, 196, 201, 202, 210, 211,
+ 114, 109, 109, 104, 104, 102, 102, 99, 100, 106, 106, 113, 113, 120,
+ 121, 129, 130, 139, 140, 151, 151, 162, 162, 175, 176, 187, 188, 203,
+ 204, 210, 211, 219,
+ /* Size 4x8 */
+ 32, 42, 69, 88, 33, 42, 64, 83, 36, 56, 77, 88, 46, 67, 93, 105, 60, 79,
+ 112, 122, 75, 92, 130, 144, 86, 95, 136, 167, 98, 105, 136, 177,
+ /* Size 8x4 */
+ 32, 33, 36, 46, 60, 75, 86, 98, 42, 42, 56, 67, 79, 92, 95, 105, 69, 64,
+ 77, 93, 112, 130, 136, 136, 88, 83, 88, 105, 122, 144, 167, 177,
+ /* Size 8x16 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 44, 60, 72, 84, 90, 32, 34,
+ 36, 45, 59, 71, 80, 87, 32, 35, 40, 47, 60, 71, 78, 85, 36, 37, 48, 56,
+ 68, 78, 83, 87, 39, 40, 50, 60, 73, 84, 91, 94, 47, 45, 56, 69, 84, 95,
+ 101, 101, 53, 50, 60, 75, 92, 103, 108, 110, 61, 56, 65, 81, 100, 113,
+ 116, 118, 71, 64, 73, 89, 111, 125, 129, 129, 79, 70, 79, 95, 118, 133,
+ 142, 138, 86, 76, 84, 100, 124, 140, 153, 150, 92, 82, 89, 101, 121,
+ 148, 157, 161, 98, 88, 93, 108, 124, 141, 163, 174, 104, 94, 95, 110,
+ 129, 151, 171, 181, 110, 100, 98, 111, 127, 147, 169, 188,
+ /* Size 16x8 */
+ 32, 31, 32, 32, 36, 39, 47, 53, 61, 71, 79, 86, 92, 98, 104, 110, 32,
+ 32, 34, 35, 37, 40, 45, 50, 56, 64, 70, 76, 82, 88, 94, 100, 36, 35, 36,
+ 40, 48, 50, 56, 60, 65, 73, 79, 84, 89, 93, 95, 98, 47, 44, 45, 47, 56,
+ 60, 69, 75, 81, 89, 95, 100, 101, 108, 110, 111, 65, 60, 59, 60, 68, 73,
+ 84, 92, 100, 111, 118, 124, 121, 124, 129, 127, 79, 72, 71, 71, 78, 84,
+ 95, 103, 113, 125, 133, 140, 148, 141, 151, 147, 90, 84, 80, 78, 83, 91,
+ 101, 108, 116, 129, 142, 153, 157, 163, 171, 169, 96, 90, 87, 85, 87,
+ 94, 101, 110, 118, 129, 138, 150, 161, 174, 181, 188,
+ /* Size 16x32 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 65, 73, 79, 87, 90, 93, 96, 99, 31, 32,
+ 32, 33, 35, 42, 45, 51, 62, 69, 75, 83, 86, 88, 91, 94, 31, 32, 32, 33,
+ 35, 41, 44, 49, 60, 67, 72, 80, 84, 87, 90, 94, 31, 32, 33, 33, 35, 41,
+ 44, 49, 59, 66, 71, 79, 82, 84, 87, 90, 32, 32, 34, 34, 36, 42, 45, 50,
+ 59, 65, 71, 78, 80, 83, 87, 90, 32, 33, 35, 36, 38, 42, 45, 49, 58, 64,
+ 69, 76, 80, 83, 86, 88, 32, 33, 35, 36, 40, 44, 47, 51, 60, 66, 71, 76,
+ 78, 81, 85, 89, 34, 34, 36, 38, 42, 48, 50, 54, 63, 69, 73, 80, 82, 81,
+ 84, 86, 36, 34, 37, 40, 48, 54, 56, 60, 68, 74, 78, 84, 83, 86, 87, 87,
+ 38, 36, 39, 41, 49, 56, 58, 63, 71, 77, 81, 86, 88, 88, 90, 93, 39, 37,
+ 40, 42, 50, 58, 60, 65, 73, 79, 84, 90, 91, 92, 94, 93, 44, 41, 42, 45,
+ 53, 63, 66, 71, 79, 85, 90, 96, 94, 96, 96, 99, 47, 44, 45, 47, 56, 66,
+ 69, 75, 84, 90, 95, 99, 101, 98, 101, 99, 49, 46, 47, 48, 57, 67, 71,
+ 77, 86, 93, 97, 103, 103, 105, 102, 106, 53, 49, 50, 51, 60, 71, 75, 82,
+ 92, 99, 103, 111, 108, 107, 110, 107, 58, 54, 54, 55, 63, 75, 79, 87,
+ 98, 105, 110, 114, 114, 113, 111, 115, 61, 56, 56, 57, 65, 77, 81, 89,
+ 100, 107, 113, 118, 116, 117, 118, 116, 65, 60, 59, 60, 68, 79, 84, 92,
+ 105, 112, 118, 126, 124, 122, 121, 124, 71, 65, 64, 65, 73, 84, 89, 97,
+ 111, 119, 125, 130, 129, 129, 129, 125, 76, 69, 68, 69, 76, 88, 92, 101,
+ 115, 123, 130, 134, 134, 131, 132, 135, 79, 72, 70, 71, 79, 90, 95, 104,
+ 118, 127, 133, 143, 142, 141, 138, 136, 82, 75, 73, 74, 81, 92, 97, 106,
+ 121, 130, 136, 146, 145, 144, 144, 145, 86, 78, 76, 77, 84, 95, 100,
+ 109, 124, 133, 140, 147, 153, 151, 150, 146, 89, 81, 79, 78, 87, 95, 99,
+ 112, 124, 130, 145, 152, 156, 157, 156, 158, 92, 84, 82, 80, 89, 95,
+ 101, 116, 121, 132, 148, 151, 157, 163, 161, 159, 95, 86, 85, 83, 92,
+ 95, 105, 114, 120, 136, 143, 155, 163, 167, 171, 170, 98, 89, 88, 85,
+ 93, 95, 108, 113, 124, 136, 141, 160, 163, 169, 174, 171, 101, 92, 91,
+ 88, 94, 98, 110, 112, 128, 133, 146, 158, 166, 175, 179, 185, 104, 95,
+ 94, 91, 95, 101, 110, 115, 129, 132, 151, 154, 171, 175, 181, 186, 107,
+ 98, 97, 94, 96, 105, 110, 119, 128, 136, 149, 156, 173, 177, 188, 192,
+ 110, 101, 100, 97, 98, 108, 111, 123, 127, 141, 147, 161, 169, 183, 188,
+ 193, 114, 104, 104, 100, 100, 111, 111, 126, 127, 145, 145, 166, 166,
+ 189, 190, 201,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 31, 32, 32,
+ 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 56, 60, 65, 69, 72,
+ 75, 78, 81, 84, 86, 89, 92, 95, 98, 101, 104, 32, 32, 32, 33, 34, 35,
+ 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70, 73, 76, 79,
+ 82, 85, 88, 91, 94, 97, 100, 104, 32, 33, 33, 33, 34, 36, 36, 38, 40,
+ 41, 42, 45, 47, 48, 51, 55, 57, 60, 65, 69, 71, 74, 77, 78, 80, 83, 85,
+ 88, 91, 94, 97, 100, 36, 35, 35, 35, 36, 38, 40, 42, 48, 49, 50, 53, 56,
+ 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87, 89, 92, 93, 94, 95, 96, 98,
+ 100, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56, 58, 63, 66, 67, 71, 75, 77,
+ 79, 84, 88, 90, 92, 95, 95, 95, 95, 95, 98, 101, 105, 108, 111, 47, 45,
+ 44, 44, 45, 45, 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 81, 84, 89, 92,
+ 95, 97, 100, 99, 101, 105, 108, 110, 110, 110, 111, 111, 53, 51, 49, 49,
+ 50, 49, 51, 54, 60, 63, 65, 71, 75, 77, 82, 87, 89, 92, 97, 101, 104,
+ 106, 109, 112, 116, 114, 113, 112, 115, 119, 123, 126, 65, 62, 60, 59,
+ 59, 58, 60, 63, 68, 71, 73, 79, 84, 86, 92, 98, 100, 105, 111, 115, 118,
+ 121, 124, 124, 121, 120, 124, 128, 129, 128, 127, 127, 73, 69, 67, 66,
+ 65, 64, 66, 69, 74, 77, 79, 85, 90, 93, 99, 105, 107, 112, 119, 123,
+ 127, 130, 133, 130, 132, 136, 136, 133, 132, 136, 141, 145, 79, 75, 72,
+ 71, 71, 69, 71, 73, 78, 81, 84, 90, 95, 97, 103, 110, 113, 118, 125,
+ 130, 133, 136, 140, 145, 148, 143, 141, 146, 151, 149, 147, 145, 87, 83,
+ 80, 79, 78, 76, 76, 80, 84, 86, 90, 96, 99, 103, 111, 114, 118, 126,
+ 130, 134, 143, 146, 147, 152, 151, 155, 160, 158, 154, 156, 161, 166,
+ 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94, 101, 103, 108, 114, 116,
+ 124, 129, 134, 142, 145, 153, 156, 157, 163, 163, 166, 171, 173, 169,
+ 166, 93, 88, 87, 84, 83, 83, 81, 81, 86, 88, 92, 96, 98, 105, 107, 113,
+ 117, 122, 129, 131, 141, 144, 151, 157, 163, 167, 169, 175, 175, 177,
+ 183, 189, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90, 94, 96, 101, 102, 110,
+ 111, 118, 121, 129, 132, 138, 144, 150, 156, 161, 171, 174, 179, 181,
+ 188, 188, 190, 99, 94, 94, 90, 90, 88, 89, 86, 87, 93, 93, 99, 99, 106,
+ 107, 115, 116, 124, 125, 135, 136, 145, 146, 158, 159, 170, 171, 185,
+ 186, 192, 193, 201,
+ /* Size 4x16 */
+ 31, 44, 73, 93, 32, 41, 67, 87, 32, 42, 65, 83, 33, 44, 66, 81, 34, 54,
+ 74, 86, 37, 58, 79, 92, 44, 66, 90, 98, 49, 71, 99, 107, 56, 77, 107,
+ 117, 65, 84, 119, 129, 72, 90, 127, 141, 78, 95, 133, 151, 84, 95, 132,
+ 163, 89, 95, 136, 169, 95, 101, 132, 175, 101, 108, 141, 183,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 37, 44, 49, 56, 65, 72, 78, 84, 89, 95, 101, 44, 41,
+ 42, 44, 54, 58, 66, 71, 77, 84, 90, 95, 95, 95, 101, 108, 73, 67, 65,
+ 66, 74, 79, 90, 99, 107, 119, 127, 133, 132, 136, 132, 141, 93, 87, 83,
+ 81, 86, 92, 98, 107, 117, 129, 141, 151, 163, 169, 175, 183,
+ /* Size 8x32 */
+ 32, 32, 36, 47, 65, 79, 90, 96, 31, 32, 35, 45, 62, 75, 86, 91, 31, 32,
+ 35, 44, 60, 72, 84, 90, 31, 33, 35, 44, 59, 71, 82, 87, 32, 34, 36, 45,
+ 59, 71, 80, 87, 32, 35, 38, 45, 58, 69, 80, 86, 32, 35, 40, 47, 60, 71,
+ 78, 85, 34, 36, 42, 50, 63, 73, 82, 84, 36, 37, 48, 56, 68, 78, 83, 87,
+ 38, 39, 49, 58, 71, 81, 88, 90, 39, 40, 50, 60, 73, 84, 91, 94, 44, 42,
+ 53, 66, 79, 90, 94, 96, 47, 45, 56, 69, 84, 95, 101, 101, 49, 47, 57,
+ 71, 86, 97, 103, 102, 53, 50, 60, 75, 92, 103, 108, 110, 58, 54, 63, 79,
+ 98, 110, 114, 111, 61, 56, 65, 81, 100, 113, 116, 118, 65, 59, 68, 84,
+ 105, 118, 124, 121, 71, 64, 73, 89, 111, 125, 129, 129, 76, 68, 76, 92,
+ 115, 130, 134, 132, 79, 70, 79, 95, 118, 133, 142, 138, 82, 73, 81, 97,
+ 121, 136, 145, 144, 86, 76, 84, 100, 124, 140, 153, 150, 89, 79, 87, 99,
+ 124, 145, 156, 156, 92, 82, 89, 101, 121, 148, 157, 161, 95, 85, 92,
+ 105, 120, 143, 163, 171, 98, 88, 93, 108, 124, 141, 163, 174, 101, 91,
+ 94, 110, 128, 146, 166, 179, 104, 94, 95, 110, 129, 151, 171, 181, 107,
+ 97, 96, 110, 128, 149, 173, 188, 110, 100, 98, 111, 127, 147, 169, 188,
+ 114, 104, 100, 111, 127, 145, 166, 190,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 61, 65,
+ 71, 76, 79, 82, 86, 89, 92, 95, 98, 101, 104, 107, 110, 114, 32, 32, 32,
+ 33, 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 56, 59, 64, 68, 70,
+ 73, 76, 79, 82, 85, 88, 91, 94, 97, 100, 104, 36, 35, 35, 35, 36, 38,
+ 40, 42, 48, 49, 50, 53, 56, 57, 60, 63, 65, 68, 73, 76, 79, 81, 84, 87,
+ 89, 92, 93, 94, 95, 96, 98, 100, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58,
+ 60, 66, 69, 71, 75, 79, 81, 84, 89, 92, 95, 97, 100, 99, 101, 105, 108,
+ 110, 110, 110, 111, 111, 65, 62, 60, 59, 59, 58, 60, 63, 68, 71, 73, 79,
+ 84, 86, 92, 98, 100, 105, 111, 115, 118, 121, 124, 124, 121, 120, 124,
+ 128, 129, 128, 127, 127, 79, 75, 72, 71, 71, 69, 71, 73, 78, 81, 84, 90,
+ 95, 97, 103, 110, 113, 118, 125, 130, 133, 136, 140, 145, 148, 143, 141,
+ 146, 151, 149, 147, 145, 90, 86, 84, 82, 80, 80, 78, 82, 83, 88, 91, 94,
+ 101, 103, 108, 114, 116, 124, 129, 134, 142, 145, 153, 156, 157, 163,
+ 163, 166, 171, 173, 169, 166, 96, 91, 90, 87, 87, 86, 85, 84, 87, 90,
+ 94, 96, 101, 102, 110, 111, 118, 121, 129, 132, 138, 144, 150, 156, 161,
+ 171, 174, 179, 181, 188, 188, 190 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 45, 56, 64, 45, 58, 66, 69, 56, 66, 86, 87, 64, 69, 87, 105,
+ /* Size 8x8 */
+ 31, 38, 47, 48, 54, 61, 66, 69, 38, 47, 47, 46, 50, 55, 61, 65, 47, 47,
+ 53, 55, 58, 63, 65, 66, 48, 46, 55, 62, 67, 72, 73, 73, 54, 50, 58, 67,
+ 76, 83, 84, 82, 61, 55, 63, 72, 83, 91, 92, 92, 66, 61, 65, 73, 84, 92,
+ 101, 103, 69, 65, 66, 73, 82, 92, 103, 109,
+ /* Size 16x16 */
+ 32, 30, 33, 38, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 30, 31,
+ 35, 41, 46, 46, 46, 48, 51, 55, 58, 60, 63, 65, 68, 70, 33, 35, 39, 44,
+ 47, 46, 46, 47, 50, 53, 56, 58, 60, 62, 65, 67, 38, 41, 44, 47, 49, 48,
+ 47, 48, 50, 53, 55, 58, 58, 60, 62, 65, 49, 46, 47, 49, 53, 53, 54, 54,
+ 56, 58, 60, 62, 62, 63, 64, 64, 48, 46, 46, 48, 53, 54, 56, 57, 59, 61,
+ 63, 65, 67, 66, 68, 68, 50, 46, 46, 47, 54, 56, 61, 63, 65, 68, 70, 72,
+ 71, 71, 72, 72, 52, 48, 47, 48, 54, 57, 63, 66, 69, 72, 75, 76, 75, 76,
+ 76, 76, 55, 51, 50, 50, 56, 59, 65, 69, 73, 77, 79, 81, 81, 81, 80, 80,
+ 60, 55, 53, 53, 58, 61, 68, 72, 77, 82, 85, 87, 87, 85, 84, 85, 63, 58,
+ 56, 55, 60, 63, 70, 75, 79, 85, 89, 91, 91, 90, 89, 90, 66, 60, 58, 58,
+ 62, 65, 72, 76, 81, 87, 91, 94, 96, 95, 95, 95, 68, 63, 60, 58, 62, 67,
+ 71, 75, 81, 87, 91, 96, 99, 100, 100, 100, 70, 65, 62, 60, 63, 66, 71,
+ 76, 81, 85, 90, 95, 100, 103, 104, 105, 72, 68, 65, 62, 64, 68, 72, 76,
+ 80, 84, 89, 95, 100, 104, 107, 108, 74, 70, 67, 65, 64, 68, 72, 76, 80,
+ 85, 90, 95, 100, 105, 108, 111,
+ /* Size 32x32 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 31, 32,
+ 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 53, 54, 57, 59, 60, 61,
+ 63, 64, 65, 66, 67, 67, 68, 69, 70, 71, 30, 31, 31, 32, 35, 39, 41, 42,
+ 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 55, 57, 58, 59, 60, 62, 63, 64,
+ 65, 67, 68, 69, 70, 71, 31, 32, 32, 33, 36, 40, 41, 43, 46, 46, 45, 45,
+ 46, 46, 47, 49, 50, 51, 54, 56, 57, 58, 59, 61, 62, 63, 63, 64, 65, 66,
+ 67, 68, 33, 34, 35, 36, 39, 43, 44, 45, 47, 46, 46, 45, 46, 47, 47, 49,
+ 50, 51, 53, 55, 56, 57, 58, 59, 60, 61, 62, 63, 65, 66, 67, 68, 36, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 46, 45, 46, 46, 47, 48, 49, 50, 52, 53,
+ 54, 55, 56, 58, 59, 61, 62, 63, 64, 65, 66, 66, 38, 40, 41, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 47, 47, 48, 49, 50, 51, 53, 54, 55, 56, 58, 58,
+ 58, 59, 60, 61, 62, 64, 65, 66, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50,
+ 49, 49, 50, 50, 50, 52, 52, 53, 55, 56, 57, 58, 59, 60, 61, 61, 61, 61,
+ 62, 63, 63, 64, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54,
+ 54, 55, 56, 56, 58, 59, 60, 61, 62, 63, 62, 62, 63, 64, 64, 64, 64, 64,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 58, 58,
+ 60, 61, 62, 63, 64, 64, 64, 65, 65, 65, 65, 66, 67, 68, 48, 47, 46, 45,
+ 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 63, 63, 64,
+ 65, 66, 67, 66, 66, 67, 68, 68, 68, 68, 49, 47, 45, 45, 45, 45, 47, 49,
+ 53, 55, 55, 58, 59, 60, 61, 62, 63, 63, 65, 66, 67, 68, 69, 69, 68, 68,
+ 69, 69, 69, 69, 70, 71, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 64, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 71, 71, 72, 72,
+ 72, 71, 51, 48, 47, 46, 47, 46, 47, 50, 54, 55, 56, 60, 61, 62, 64, 66,
+ 66, 67, 69, 70, 71, 72, 73, 73, 74, 73, 73, 74, 73, 73, 74, 75, 52, 50,
+ 48, 47, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64, 66, 68, 69, 70, 72, 74,
+ 75, 75, 76, 77, 75, 76, 76, 75, 76, 77, 76, 75, 54, 52, 50, 49, 49, 48,
+ 49, 52, 55, 57, 58, 62, 64, 66, 68, 71, 72, 73, 75, 77, 78, 79, 80, 78,
+ 79, 78, 77, 78, 78, 77, 78, 79, 55, 53, 51, 50, 50, 49, 50, 52, 56, 58,
+ 59, 63, 65, 66, 69, 72, 73, 74, 77, 78, 79, 80, 81, 81, 81, 80, 81, 80,
+ 80, 81, 80, 79, 57, 54, 52, 51, 51, 50, 51, 53, 56, 58, 60, 63, 66, 67,
+ 70, 73, 74, 76, 79, 80, 82, 83, 84, 85, 83, 84, 83, 83, 83, 82, 82, 83,
+ 60, 57, 55, 54, 53, 52, 53, 55, 58, 60, 61, 65, 68, 69, 72, 75, 77, 79,
+ 82, 84, 85, 86, 87, 86, 87, 85, 85, 85, 84, 86, 85, 84, 62, 59, 57, 56,
+ 55, 53, 54, 56, 59, 61, 63, 66, 69, 70, 74, 77, 78, 80, 84, 86, 87, 88,
+ 90, 89, 89, 88, 88, 87, 88, 87, 87, 88, 63, 60, 58, 57, 56, 54, 55, 57,
+ 60, 62, 63, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90, 91, 93, 91, 91,
+ 90, 91, 89, 90, 90, 89, 65, 61, 59, 58, 57, 55, 56, 58, 61, 63, 64, 68,
+ 71, 72, 75, 79, 80, 83, 86, 88, 90, 91, 93, 94, 95, 92, 94, 92, 93, 92,
+ 91, 93, 66, 63, 60, 59, 58, 56, 58, 59, 62, 64, 65, 69, 72, 73, 76, 80,
+ 81, 84, 87, 90, 91, 93, 94, 95, 96, 97, 95, 95, 95, 95, 95, 93, 67, 64,
+ 62, 61, 59, 58, 58, 60, 63, 64, 66, 69, 71, 73, 77, 78, 81, 85, 86, 89,
+ 93, 94, 95, 97, 97, 98, 99, 97, 97, 97, 96, 98, 68, 65, 63, 62, 60, 59,
+ 58, 61, 62, 64, 67, 68, 71, 74, 75, 79, 81, 83, 87, 89, 91, 95, 96, 97,
+ 99, 98, 100, 100, 100, 99, 100, 98, 69, 66, 64, 63, 61, 61, 59, 61, 62,
+ 65, 66, 68, 72, 73, 76, 78, 80, 84, 85, 88, 91, 92, 97, 98, 98, 101,
+ 100, 102, 102, 103, 101, 102, 70, 67, 65, 63, 62, 62, 60, 61, 63, 65,
+ 66, 69, 71, 73, 76, 77, 81, 83, 85, 88, 90, 94, 95, 99, 100, 100, 103,
+ 102, 104, 104, 105, 103, 71, 67, 67, 64, 63, 63, 61, 61, 64, 65, 67, 69,
+ 71, 74, 75, 78, 80, 83, 85, 87, 91, 92, 95, 97, 100, 102, 102, 105, 104,
+ 106, 106, 108, 72, 68, 68, 65, 65, 64, 62, 62, 64, 65, 68, 69, 72, 73,
+ 76, 78, 80, 83, 84, 88, 89, 93, 95, 97, 100, 102, 104, 104, 107, 106,
+ 108, 108, 73, 69, 69, 66, 66, 65, 64, 63, 64, 66, 68, 69, 72, 73, 77,
+ 77, 81, 82, 86, 87, 90, 92, 95, 97, 99, 103, 104, 106, 106, 109, 108,
+ 110, 74, 70, 70, 67, 67, 66, 65, 63, 64, 67, 68, 70, 72, 74, 76, 78, 80,
+ 82, 85, 87, 90, 91, 95, 96, 100, 101, 105, 106, 108, 108, 111, 110, 75,
+ 71, 71, 68, 68, 66, 66, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 84,
+ 88, 89, 93, 93, 98, 98, 102, 103, 108, 108, 110, 110, 113,
+ /* Size 4x8 */
+ 31, 47, 57, 65, 40, 45, 52, 61, 46, 55, 61, 63, 47, 60, 70, 72, 52, 64,
+ 79, 81, 59, 68, 87, 90, 63, 66, 88, 99, 66, 69, 85, 102,
+ /* Size 8x4 */
+ 31, 40, 46, 47, 52, 59, 63, 66, 47, 45, 55, 60, 64, 68, 66, 69, 57, 52,
+ 61, 70, 79, 87, 88, 85, 65, 61, 63, 72, 81, 90, 99, 102,
+ /* Size 8x16 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 30, 38, 46, 46, 52, 58, 63, 65, 33, 41,
+ 47, 46, 51, 56, 60, 63, 39, 46, 48, 47, 51, 55, 58, 61, 49, 48, 53, 54,
+ 57, 60, 61, 61, 48, 46, 53, 56, 60, 64, 65, 65, 50, 46, 54, 61, 66, 70,
+ 71, 69, 52, 47, 54, 63, 71, 75, 75, 74, 55, 49, 56, 65, 74, 79, 79, 78,
+ 60, 53, 58, 68, 79, 85, 85, 82, 63, 55, 60, 70, 82, 89, 91, 87, 66, 58,
+ 62, 72, 84, 91, 95, 91, 68, 60, 64, 71, 81, 94, 97, 96, 70, 62, 65, 73,
+ 81, 89, 98, 101, 72, 65, 65, 72, 82, 92, 100, 103, 74, 67, 65, 71, 79,
+ 89, 98, 105,
+ /* Size 16x8 */
+ 32, 30, 33, 39, 49, 48, 50, 52, 55, 60, 63, 66, 68, 70, 72, 74, 35, 38,
+ 41, 46, 48, 46, 46, 47, 49, 53, 55, 58, 60, 62, 65, 67, 48, 46, 47, 48,
+ 53, 53, 54, 54, 56, 58, 60, 62, 64, 65, 65, 65, 50, 46, 46, 47, 54, 56,
+ 61, 63, 65, 68, 70, 72, 71, 73, 72, 71, 57, 52, 51, 51, 57, 60, 66, 71,
+ 74, 79, 82, 84, 81, 81, 82, 79, 63, 58, 56, 55, 60, 64, 70, 75, 79, 85,
+ 89, 91, 94, 89, 92, 89, 68, 63, 60, 58, 61, 65, 71, 75, 79, 85, 91, 95,
+ 97, 98, 100, 98, 70, 65, 63, 61, 61, 65, 69, 74, 78, 82, 87, 91, 96,
+ 101, 103, 105,
+ /* Size 16x32 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 57, 61, 63, 67, 68, 69, 70, 71, 31, 31,
+ 37, 40, 47, 47, 48, 50, 54, 57, 60, 63, 64, 65, 66, 67, 30, 32, 38, 40,
+ 46, 45, 46, 48, 52, 55, 58, 61, 63, 64, 65, 67, 31, 33, 38, 41, 46, 45,
+ 46, 48, 52, 55, 57, 60, 61, 62, 63, 64, 33, 36, 41, 44, 47, 46, 46, 47,
+ 51, 54, 56, 59, 60, 61, 63, 64, 37, 40, 45, 47, 47, 45, 46, 47, 50, 52,
+ 54, 57, 59, 61, 62, 62, 39, 41, 46, 47, 48, 47, 47, 48, 51, 54, 55, 57,
+ 58, 59, 61, 62, 42, 43, 46, 48, 50, 49, 50, 50, 53, 56, 57, 60, 60, 59,
+ 60, 60, 49, 46, 48, 49, 53, 53, 54, 54, 57, 59, 60, 63, 61, 62, 61, 61,
+ 48, 46, 47, 48, 53, 55, 55, 56, 58, 61, 62, 64, 64, 63, 63, 64, 48, 46,
+ 46, 48, 53, 56, 56, 57, 60, 62, 64, 66, 65, 65, 65, 64, 49, 45, 45, 47,
+ 53, 58, 59, 61, 64, 66, 67, 69, 67, 67, 66, 67, 50, 46, 46, 48, 54, 59,
+ 61, 63, 66, 68, 70, 71, 71, 68, 69, 67, 51, 47, 47, 48, 54, 60, 61, 64,
+ 68, 70, 71, 73, 72, 72, 70, 71, 52, 48, 47, 48, 54, 61, 63, 66, 71, 73,
+ 75, 77, 75, 73, 74, 71, 54, 50, 49, 50, 55, 62, 65, 68, 73, 76, 78, 79,
+ 78, 76, 74, 75, 55, 51, 49, 50, 56, 63, 65, 69, 74, 77, 79, 81, 79, 78,
+ 78, 75, 57, 52, 50, 51, 56, 64, 66, 70, 76, 79, 82, 85, 83, 81, 79, 79,
+ 60, 54, 53, 53, 58, 65, 68, 72, 79, 82, 85, 87, 85, 84, 82, 80, 62, 56,
+ 54, 55, 60, 66, 69, 74, 81, 84, 87, 88, 87, 85, 84, 84, 63, 57, 55, 56,
+ 60, 67, 70, 75, 82, 86, 89, 92, 91, 89, 87, 84, 64, 59, 56, 57, 61, 68,
+ 71, 75, 83, 87, 90, 93, 92, 90, 89, 89, 66, 60, 58, 58, 62, 69, 72, 76,
+ 84, 88, 91, 94, 95, 93, 91, 89, 67, 61, 59, 58, 63, 68, 71, 78, 83, 86,
+ 93, 96, 96, 96, 94, 94, 68, 62, 60, 59, 64, 67, 71, 79, 81, 86, 94, 95,
+ 97, 98, 96, 94, 69, 63, 61, 60, 65, 66, 72, 77, 80, 88, 91, 96, 99, 99,
+ 100, 98, 70, 64, 62, 60, 65, 66, 73, 76, 81, 87, 89, 97, 98, 100, 101,
+ 99, 71, 65, 64, 61, 65, 67, 73, 74, 82, 85, 90, 95, 99, 102, 103, 104,
+ 72, 65, 65, 62, 65, 68, 72, 75, 82, 83, 92, 93, 100, 102, 103, 104, 73,
+ 66, 66, 63, 65, 69, 72, 76, 81, 85, 90, 93, 100, 102, 105, 106, 74, 67,
+ 67, 64, 65, 70, 71, 77, 79, 86, 89, 94, 98, 103, 105, 106, 75, 68, 68,
+ 65, 65, 71, 71, 78, 78, 87, 87, 96, 96, 105, 105, 109,
+ /* Size 32x16 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 31, 31, 32, 33,
+ 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 51, 52, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 65, 66, 67, 68, 35, 37, 38, 38, 41, 45, 46, 46,
+ 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56, 58, 59, 60, 61,
+ 62, 64, 65, 66, 67, 68, 38, 40, 40, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 48, 48, 48, 50, 50, 51, 53, 55, 56, 57, 58, 58, 59, 60, 60, 61, 62, 63,
+ 64, 65, 48, 47, 46, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 56, 56, 58, 60, 60, 61, 62, 63, 64, 65, 65, 65, 65, 65, 65, 65, 49, 47,
+ 45, 45, 46, 45, 47, 49, 53, 55, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 68, 67, 66, 66, 67, 68, 69, 70, 71, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71,
+ 71, 72, 73, 73, 72, 72, 71, 71, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 69, 70, 72, 74, 75, 75, 76, 78, 79, 77, 76, 74,
+ 75, 76, 77, 78, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68,
+ 71, 73, 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78,
+ 61, 57, 55, 55, 54, 52, 54, 56, 59, 61, 62, 66, 68, 70, 73, 76, 77, 79,
+ 82, 84, 86, 87, 88, 86, 86, 88, 87, 85, 83, 85, 86, 87, 63, 60, 58, 57,
+ 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87, 89, 90,
+ 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 67, 63, 61, 60, 59, 57, 57, 60,
+ 63, 64, 66, 69, 71, 73, 77, 79, 81, 85, 87, 88, 92, 93, 94, 96, 95, 96,
+ 97, 95, 93, 93, 94, 96, 68, 64, 63, 61, 60, 59, 58, 60, 61, 64, 65, 67,
+ 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96, 97, 99, 98, 99, 100,
+ 100, 98, 96, 69, 65, 64, 62, 61, 61, 59, 59, 62, 63, 65, 67, 68, 72, 73,
+ 76, 78, 81, 84, 85, 89, 90, 93, 96, 98, 99, 100, 102, 102, 102, 103,
+ 105, 70, 66, 65, 63, 63, 62, 61, 60, 61, 63, 65, 66, 69, 70, 74, 74, 78,
+ 79, 82, 84, 87, 89, 91, 94, 96, 100, 101, 103, 103, 105, 105, 105, 71,
+ 67, 67, 64, 64, 62, 62, 60, 61, 64, 64, 67, 67, 71, 71, 75, 75, 79, 80,
+ 84, 84, 89, 89, 94, 94, 98, 99, 104, 104, 106, 106, 109,
+ /* Size 4x16 */
+ 31, 49, 61, 69, 32, 45, 55, 64, 36, 46, 54, 61, 41, 47, 54, 59, 46, 53,
+ 59, 62, 46, 56, 62, 65, 46, 59, 68, 68, 48, 61, 73, 73, 51, 63, 77, 78,
+ 54, 65, 82, 84, 57, 67, 86, 89, 60, 69, 88, 93, 62, 67, 86, 98, 64, 66,
+ 87, 100, 65, 68, 83, 102, 67, 70, 86, 103,
+ /* Size 16x4 */
+ 31, 32, 36, 41, 46, 46, 46, 48, 51, 54, 57, 60, 62, 64, 65, 67, 49, 45,
+ 46, 47, 53, 56, 59, 61, 63, 65, 67, 69, 67, 66, 68, 70, 61, 55, 54, 54,
+ 59, 62, 68, 73, 77, 82, 86, 88, 86, 87, 83, 86, 69, 64, 61, 59, 62, 65,
+ 68, 73, 78, 84, 89, 93, 98, 100, 102, 103,
+ /* Size 8x32 */
+ 32, 35, 48, 50, 57, 63, 68, 70, 31, 37, 47, 48, 54, 60, 64, 66, 30, 38,
+ 46, 46, 52, 58, 63, 65, 31, 38, 46, 46, 52, 57, 61, 63, 33, 41, 47, 46,
+ 51, 56, 60, 63, 37, 45, 47, 46, 50, 54, 59, 62, 39, 46, 48, 47, 51, 55,
+ 58, 61, 42, 46, 50, 50, 53, 57, 60, 60, 49, 48, 53, 54, 57, 60, 61, 61,
+ 48, 47, 53, 55, 58, 62, 64, 63, 48, 46, 53, 56, 60, 64, 65, 65, 49, 45,
+ 53, 59, 64, 67, 67, 66, 50, 46, 54, 61, 66, 70, 71, 69, 51, 47, 54, 61,
+ 68, 71, 72, 70, 52, 47, 54, 63, 71, 75, 75, 74, 54, 49, 55, 65, 73, 78,
+ 78, 74, 55, 49, 56, 65, 74, 79, 79, 78, 57, 50, 56, 66, 76, 82, 83, 79,
+ 60, 53, 58, 68, 79, 85, 85, 82, 62, 54, 60, 69, 81, 87, 87, 84, 63, 55,
+ 60, 70, 82, 89, 91, 87, 64, 56, 61, 71, 83, 90, 92, 89, 66, 58, 62, 72,
+ 84, 91, 95, 91, 67, 59, 63, 71, 83, 93, 96, 94, 68, 60, 64, 71, 81, 94,
+ 97, 96, 69, 61, 65, 72, 80, 91, 99, 100, 70, 62, 65, 73, 81, 89, 98,
+ 101, 71, 64, 65, 73, 82, 90, 99, 103, 72, 65, 65, 72, 82, 92, 100, 103,
+ 73, 66, 65, 72, 81, 90, 100, 105, 74, 67, 65, 71, 79, 89, 98, 105, 75,
+ 68, 65, 71, 78, 87, 96, 105,
+ /* Size 32x8 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 55, 57,
+ 60, 62, 63, 64, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 49, 50, 53, 54, 55, 56,
+ 58, 59, 60, 61, 62, 64, 65, 66, 67, 68, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 56, 56, 58, 60, 60, 61, 62, 63, 64, 65,
+ 65, 65, 65, 65, 65, 65, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 65, 66, 68, 69, 70, 71, 72, 71, 71, 72, 73, 73, 72, 72,
+ 71, 71, 57, 54, 52, 52, 51, 50, 51, 53, 57, 58, 60, 64, 66, 68, 71, 73,
+ 74, 76, 79, 81, 82, 83, 84, 83, 81, 80, 81, 82, 82, 81, 79, 78, 63, 60,
+ 58, 57, 56, 54, 55, 57, 60, 62, 64, 67, 70, 71, 75, 78, 79, 82, 85, 87,
+ 89, 90, 91, 93, 94, 91, 89, 90, 92, 90, 89, 87, 68, 64, 63, 61, 60, 59,
+ 58, 60, 61, 64, 65, 67, 71, 72, 75, 78, 79, 83, 85, 87, 91, 92, 95, 96,
+ 97, 99, 98, 99, 100, 100, 98, 96, 70, 66, 65, 63, 63, 62, 61, 60, 61,
+ 63, 65, 66, 69, 70, 74, 74, 78, 79, 82, 84, 87, 89, 91, 94, 96, 100,
+ 101, 103, 103, 105, 105, 105 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 38, 63, 86, 38, 56, 78, 97, 63, 78, 113, 130, 86, 97, 130, 169,
+ /* Size 8x8 */
+ 32, 32, 35, 46, 57, 76, 85, 96, 32, 34, 37, 45, 54, 70, 79, 90, 35, 37,
+ 48, 56, 64, 79, 87, 93, 46, 45, 56, 70, 80, 96, 100, 105, 57, 54, 64,
+ 80, 93, 111, 121, 122, 76, 70, 79, 96, 111, 134, 138, 144, 85, 79, 87,
+ 100, 121, 138, 156, 168, 96, 90, 93, 105, 122, 144, 168, 184,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 39, 44, 49, 58, 65, 71, 81, 87, 93, 98, 104, 31, 32,
+ 32, 32, 34, 38, 41, 46, 54, 60, 66, 75, 81, 86, 92, 98, 31, 32, 33, 34,
+ 36, 39, 42, 46, 53, 59, 64, 73, 78, 83, 88, 94, 32, 32, 34, 35, 37, 40,
+ 42, 46, 52, 58, 63, 71, 75, 80, 86, 92, 34, 34, 36, 37, 42, 47, 50, 53,
+ 59, 65, 70, 77, 82, 85, 89, 92, 39, 38, 39, 40, 47, 54, 58, 62, 68, 73,
+ 78, 85, 90, 90, 96, 98, 44, 41, 42, 42, 50, 58, 63, 68, 74, 79, 84, 91,
+ 96, 98, 102, 104, 49, 46, 46, 46, 53, 62, 68, 73, 81, 87, 92, 99, 103,
+ 107, 109, 112, 58, 54, 53, 52, 59, 68, 74, 81, 90, 97, 102, 110, 114,
+ 118, 117, 121, 65, 60, 59, 58, 65, 73, 79, 87, 97, 105, 111, 120, 125,
+ 125, 126, 130, 71, 66, 64, 63, 70, 78, 84, 92, 102, 111, 117, 127, 133,
+ 134, 136, 141, 81, 75, 73, 71, 77, 85, 91, 99, 110, 120, 127, 137, 143,
+ 145, 148, 152, 87, 81, 78, 75, 82, 90, 96, 103, 114, 125, 133, 143, 150,
+ 156, 160, 163, 93, 86, 83, 80, 85, 90, 98, 107, 118, 125, 134, 145, 156,
+ 163, 169, 177, 98, 92, 88, 86, 89, 96, 102, 109, 117, 126, 136, 148,
+ 160, 169, 176, 184, 104, 98, 94, 92, 92, 98, 104, 112, 121, 130, 141,
+ 152, 163, 177, 184, 191,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 34, 36, 39, 41, 44, 48, 49, 54, 58, 59,
+ 65, 69, 71, 80, 81, 83, 87, 90, 93, 95, 98, 101, 104, 107, 31, 32, 32,
+ 32, 32, 32, 32, 34, 34, 35, 38, 39, 42, 46, 47, 51, 55, 57, 62, 66, 68,
+ 76, 77, 78, 83, 85, 88, 90, 93, 96, 99, 101, 31, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 38, 39, 41, 45, 46, 50, 54, 55, 60, 64, 66, 73, 75, 76, 81,
+ 83, 86, 89, 92, 95, 98, 101, 31, 32, 32, 32, 32, 32, 32, 33, 34, 34, 37,
+ 38, 41, 44, 45, 49, 53, 54, 59, 63, 65, 72, 74, 75, 79, 81, 84, 86, 89,
+ 91, 94, 97, 31, 32, 32, 32, 33, 33, 34, 35, 36, 36, 39, 40, 42, 45, 46,
+ 50, 53, 54, 59, 63, 64, 71, 73, 74, 78, 80, 83, 85, 88, 91, 94, 97, 32,
+ 32, 32, 32, 33, 34, 34, 36, 36, 37, 40, 40, 42, 45, 46, 49, 53, 54, 58,
+ 62, 63, 70, 72, 73, 77, 79, 82, 85, 87, 90, 92, 95, 32, 32, 32, 32, 34,
+ 34, 35, 37, 37, 38, 40, 41, 42, 45, 46, 49, 52, 54, 58, 61, 63, 69, 71,
+ 72, 75, 78, 80, 83, 86, 89, 92, 95, 34, 34, 33, 33, 35, 36, 37, 39, 41,
+ 42, 45, 46, 47, 50, 51, 54, 57, 59, 63, 66, 68, 74, 75, 76, 80, 81, 82,
+ 83, 85, 87, 90, 93, 34, 34, 34, 34, 36, 36, 37, 41, 42, 45, 47, 48, 50,
+ 53, 53, 56, 59, 61, 65, 68, 70, 76, 77, 78, 82, 83, 85, 88, 89, 90, 92,
+ 93, 36, 35, 34, 34, 36, 37, 38, 42, 45, 48, 50, 51, 54, 56, 57, 60, 63,
+ 64, 68, 71, 73, 79, 80, 81, 85, 87, 89, 89, 90, 93, 96, 99, 39, 38, 38,
+ 37, 39, 40, 40, 45, 47, 50, 54, 55, 58, 61, 62, 65, 68, 69, 73, 76, 78,
+ 84, 85, 86, 90, 89, 90, 93, 96, 97, 98, 99, 41, 39, 39, 38, 40, 40, 41,
+ 46, 48, 51, 55, 56, 59, 62, 63, 67, 70, 71, 75, 78, 80, 86, 87, 88, 91,
+ 93, 96, 97, 97, 99, 102, 105, 44, 42, 41, 41, 42, 42, 42, 47, 50, 54,
+ 58, 59, 63, 66, 68, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 98, 98, 99,
+ 102, 104, 104, 105, 48, 46, 45, 44, 45, 45, 45, 50, 53, 56, 61, 62, 66,
+ 70, 71, 76, 79, 80, 85, 88, 90, 96, 97, 98, 101, 100, 102, 105, 105,
+ 105, 109, 112, 49, 47, 46, 45, 46, 46, 46, 51, 53, 57, 62, 63, 68, 71,
+ 73, 77, 81, 82, 87, 90, 92, 98, 99, 100, 103, 106, 107, 106, 109, 112,
+ 112, 112, 54, 51, 50, 49, 50, 49, 49, 54, 56, 60, 65, 67, 71, 76, 77,
+ 82, 86, 87, 92, 96, 97, 104, 105, 106, 110, 110, 109, 113, 114, 113,
+ 116, 120, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74, 79, 81,
+ 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 114, 118, 116, 117, 121,
+ 121, 120, 59, 57, 55, 54, 54, 54, 54, 59, 61, 64, 69, 71, 75, 80, 82,
+ 87, 91, 93, 99, 102, 104, 111, 112, 113, 117, 121, 120, 122, 124, 122,
+ 125, 129, 65, 62, 60, 59, 59, 58, 58, 63, 65, 68, 73, 75, 79, 85, 87,
+ 92, 97, 99, 105, 109, 111, 118, 120, 121, 125, 124, 125, 127, 126, 130,
+ 130, 129, 69, 66, 64, 63, 63, 62, 61, 66, 68, 71, 76, 78, 83, 88, 90,
+ 96, 100, 102, 109, 113, 115, 123, 125, 126, 129, 130, 131, 130, 134,
+ 133, 135, 139, 71, 68, 66, 65, 64, 63, 63, 68, 70, 73, 78, 80, 84, 90,
+ 92, 97, 102, 104, 111, 115, 117, 125, 127, 128, 133, 136, 134, 139, 136,
+ 139, 141, 140, 80, 76, 73, 72, 71, 70, 69, 74, 76, 79, 84, 86, 90, 96,
+ 98, 104, 109, 111, 118, 123, 125, 134, 136, 137, 142, 138, 143, 140,
+ 144, 144, 144, 149, 81, 77, 75, 74, 73, 72, 71, 75, 77, 80, 85, 87, 91,
+ 97, 99, 105, 110, 112, 120, 125, 127, 136, 137, 139, 143, 148, 145, 148,
+ 148, 150, 152, 149, 83, 78, 76, 75, 74, 73, 72, 76, 78, 81, 86, 88, 92,
+ 98, 100, 106, 111, 113, 121, 126, 128, 137, 139, 140, 145, 149, 153,
+ 153, 154, 155, 155, 161, 87, 83, 81, 79, 78, 77, 75, 80, 82, 85, 90, 91,
+ 96, 101, 103, 110, 114, 117, 125, 129, 133, 142, 143, 145, 150, 151,
+ 156, 159, 160, 160, 163, 161, 90, 85, 83, 81, 80, 79, 78, 81, 83, 87,
+ 89, 93, 98, 100, 106, 110, 114, 121, 124, 130, 136, 138, 148, 149, 151,
+ 156, 157, 162, 166, 168, 166, 172, 93, 88, 86, 84, 83, 82, 80, 82, 85,
+ 89, 90, 96, 98, 102, 107, 109, 118, 120, 125, 131, 134, 143, 145, 153,
+ 156, 157, 163, 164, 169, 172, 177, 172, 95, 90, 89, 86, 85, 85, 83, 83,
+ 88, 89, 93, 97, 99, 105, 106, 113, 116, 122, 127, 130, 139, 140, 148,
+ 153, 159, 162, 164, 169, 170, 176, 179, 185, 98, 93, 92, 89, 88, 87, 86,
+ 85, 89, 90, 96, 97, 102, 105, 109, 114, 117, 124, 126, 134, 136, 144,
+ 148, 154, 160, 166, 169, 170, 176, 177, 184, 186, 101, 96, 95, 91, 91,
+ 90, 89, 87, 90, 93, 97, 99, 104, 105, 112, 113, 121, 122, 130, 133, 139,
+ 144, 150, 155, 160, 168, 172, 176, 177, 184, 185, 191, 104, 99, 98, 94,
+ 94, 92, 92, 90, 92, 96, 98, 102, 104, 109, 112, 116, 121, 125, 130, 135,
+ 141, 144, 152, 155, 163, 166, 177, 179, 184, 185, 191, 192, 107, 101,
+ 101, 97, 97, 95, 95, 93, 93, 99, 99, 105, 105, 112, 112, 120, 120, 129,
+ 129, 139, 140, 149, 149, 161, 161, 172, 172, 185, 186, 191, 192, 199,
+ /* Size 4x8 */
+ 32, 38, 62, 86, 32, 40, 58, 80, 34, 51, 68, 85, 44, 61, 85, 101, 54, 69,
+ 98, 117, 72, 84, 118, 136, 82, 89, 129, 157, 92, 98, 127, 165,
+ /* Size 8x4 */
+ 32, 32, 34, 44, 54, 72, 82, 92, 38, 40, 51, 61, 69, 84, 89, 98, 62, 58,
+ 68, 85, 98, 118, 129, 127, 86, 80, 85, 101, 117, 136, 157, 165,
+ /* Size 8x16 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 41, 54, 73, 81, 88, 32, 33,
+ 36, 42, 53, 71, 78, 84, 32, 34, 38, 42, 52, 69, 76, 82, 34, 36, 44, 50,
+ 59, 75, 81, 84, 39, 39, 50, 58, 68, 84, 88, 90, 44, 42, 53, 63, 74, 90,
+ 97, 97, 49, 46, 57, 67, 81, 97, 104, 105, 57, 53, 63, 74, 90, 108, 111,
+ 113, 65, 59, 68, 79, 97, 118, 123, 122, 71, 64, 73, 84, 102, 125, 135,
+ 131, 81, 72, 80, 91, 110, 135, 145, 141, 87, 77, 85, 96, 114, 140, 148,
+ 151, 92, 83, 88, 102, 117, 133, 153, 163, 98, 88, 89, 103, 121, 141,
+ 160, 169, 103, 94, 92, 103, 119, 137, 158, 175,
+ /* Size 16x8 */
+ 32, 31, 32, 32, 34, 39, 44, 49, 57, 65, 71, 81, 87, 92, 98, 103, 32, 32,
+ 33, 34, 36, 39, 42, 46, 53, 59, 64, 72, 77, 83, 88, 94, 36, 35, 36, 38,
+ 44, 50, 53, 57, 63, 68, 73, 80, 85, 88, 89, 92, 44, 41, 42, 42, 50, 58,
+ 63, 67, 74, 79, 84, 91, 96, 102, 103, 103, 58, 54, 53, 52, 59, 68, 74,
+ 81, 90, 97, 102, 110, 114, 117, 121, 119, 79, 73, 71, 69, 75, 84, 90,
+ 97, 108, 118, 125, 135, 140, 133, 141, 137, 88, 81, 78, 76, 81, 88, 97,
+ 104, 111, 123, 135, 145, 148, 153, 160, 158, 93, 88, 84, 82, 84, 90, 97,
+ 105, 113, 122, 131, 141, 151, 163, 169, 175,
+ /* Size 16x32 */
+ 32, 31, 32, 32, 36, 39, 44, 53, 58, 65, 79, 81, 88, 90, 93, 96, 31, 32,
+ 32, 32, 35, 38, 42, 51, 55, 62, 75, 77, 83, 86, 88, 91, 31, 32, 32, 32,
+ 35, 38, 41, 50, 54, 60, 73, 75, 81, 84, 88, 91, 31, 32, 32, 33, 34, 37,
+ 41, 49, 53, 59, 72, 74, 79, 82, 84, 87, 32, 32, 33, 34, 36, 39, 42, 50,
+ 53, 59, 71, 72, 78, 81, 84, 87, 32, 32, 34, 34, 37, 40, 42, 49, 53, 58,
+ 70, 71, 77, 80, 83, 85, 32, 33, 34, 35, 38, 40, 42, 49, 52, 58, 69, 70,
+ 76, 78, 82, 86, 34, 34, 35, 37, 42, 45, 48, 54, 57, 63, 73, 75, 79, 79,
+ 81, 83, 34, 34, 36, 37, 44, 47, 50, 56, 59, 65, 75, 77, 81, 83, 84, 84,
+ 36, 34, 37, 38, 48, 51, 54, 60, 63, 68, 78, 80, 85, 85, 86, 89, 39, 37,
+ 39, 40, 50, 54, 58, 65, 68, 73, 84, 85, 88, 89, 90, 89, 40, 38, 40, 41,
+ 51, 55, 59, 67, 70, 75, 85, 87, 91, 92, 92, 95, 44, 41, 42, 43, 53, 58,
+ 63, 71, 74, 79, 90, 91, 97, 94, 97, 95, 47, 44, 45, 46, 56, 61, 66, 75,
+ 79, 85, 95, 97, 99, 101, 98, 102, 49, 46, 46, 47, 57, 62, 67, 77, 81,
+ 86, 97, 99, 104, 102, 105, 102, 53, 49, 50, 50, 60, 65, 71, 82, 86, 92,
+ 103, 105, 109, 108, 106, 110, 57, 53, 53, 53, 63, 68, 74, 86, 90, 97,
+ 108, 110, 111, 112, 113, 110, 59, 54, 54, 54, 64, 69, 75, 87, 91, 98,
+ 111, 112, 119, 117, 115, 118, 65, 60, 59, 58, 68, 73, 79, 92, 97, 105,
+ 118, 119, 123, 123, 122, 119, 69, 63, 62, 62, 71, 76, 83, 96, 100, 109,
+ 122, 124, 127, 125, 125, 128, 71, 65, 64, 63, 73, 78, 84, 97, 102, 111,
+ 125, 127, 135, 134, 131, 129, 79, 72, 71, 70, 79, 84, 90, 104, 109, 118,
+ 133, 135, 137, 136, 136, 137, 81, 74, 72, 71, 80, 85, 91, 105, 110, 120,
+ 135, 137, 145, 143, 141, 138, 82, 75, 73, 72, 81, 86, 92, 106, 111, 121,
+ 136, 139, 147, 148, 147, 149, 87, 79, 77, 76, 85, 90, 96, 110, 114, 125,
+ 140, 143, 148, 154, 151, 149, 90, 82, 80, 78, 87, 89, 99, 108, 113, 129,
+ 135, 146, 153, 157, 160, 159, 92, 84, 83, 81, 88, 90, 102, 106, 117,
+ 128, 133, 150, 153, 158, 163, 160, 95, 87, 85, 83, 88, 92, 103, 105,
+ 120, 125, 137, 148, 155, 164, 168, 173, 98, 89, 88, 85, 89, 95, 103,
+ 108, 121, 124, 141, 144, 160, 164, 169, 174, 100, 92, 91, 88, 90, 98,
+ 103, 111, 120, 127, 139, 146, 161, 165, 175, 179, 103, 94, 94, 90, 92,
+ 101, 103, 114, 119, 131, 137, 150, 158, 170, 175, 180, 106, 97, 97, 93,
+ 93, 104, 104, 118, 118, 135, 135, 154, 155, 175, 176, 187,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 31, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 37, 38, 41, 44, 46, 49, 53, 54, 60, 63, 65,
+ 72, 74, 75, 79, 82, 84, 87, 89, 92, 94, 97, 32, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64, 71, 72, 73, 77,
+ 80, 83, 85, 88, 91, 94, 97, 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 40,
+ 41, 43, 46, 47, 50, 53, 54, 58, 62, 63, 70, 71, 72, 76, 78, 81, 83, 85,
+ 88, 90, 93, 36, 35, 35, 34, 36, 37, 38, 42, 44, 48, 50, 51, 53, 56, 57,
+ 60, 63, 64, 68, 71, 73, 79, 80, 81, 85, 87, 88, 88, 89, 90, 92, 93, 39,
+ 38, 38, 37, 39, 40, 40, 45, 47, 51, 54, 55, 58, 61, 62, 65, 68, 69, 73,
+ 76, 78, 84, 85, 86, 90, 89, 90, 92, 95, 98, 101, 104, 44, 42, 41, 41,
+ 42, 42, 42, 48, 50, 54, 58, 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90,
+ 91, 92, 96, 99, 102, 103, 103, 103, 103, 104, 53, 51, 50, 49, 50, 49,
+ 49, 54, 56, 60, 65, 67, 71, 75, 77, 82, 86, 87, 92, 96, 97, 104, 105,
+ 106, 110, 108, 106, 105, 108, 111, 114, 118, 58, 55, 54, 53, 53, 53, 52,
+ 57, 59, 63, 68, 70, 74, 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111,
+ 114, 113, 117, 120, 121, 120, 119, 118, 65, 62, 60, 59, 59, 58, 58, 63,
+ 65, 68, 73, 75, 79, 85, 86, 92, 97, 98, 105, 109, 111, 118, 120, 121,
+ 125, 129, 128, 125, 124, 127, 131, 135, 79, 75, 73, 72, 71, 70, 69, 73,
+ 75, 78, 84, 85, 90, 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136,
+ 140, 135, 133, 137, 141, 139, 137, 135, 81, 77, 75, 74, 72, 71, 70, 75,
+ 77, 80, 85, 87, 91, 97, 99, 105, 110, 112, 119, 124, 127, 135, 137, 139,
+ 143, 146, 150, 148, 144, 146, 150, 154, 88, 83, 81, 79, 78, 77, 76, 79,
+ 81, 85, 88, 91, 97, 99, 104, 109, 111, 119, 123, 127, 135, 137, 145,
+ 147, 148, 153, 153, 155, 160, 161, 158, 155, 90, 86, 84, 82, 81, 80, 78,
+ 79, 83, 85, 89, 92, 94, 101, 102, 108, 112, 117, 123, 125, 134, 136,
+ 143, 148, 154, 157, 158, 164, 164, 165, 170, 175, 93, 88, 88, 84, 84,
+ 83, 82, 81, 84, 86, 90, 92, 97, 98, 105, 106, 113, 115, 122, 125, 131,
+ 136, 141, 147, 151, 160, 163, 168, 169, 175, 175, 176, 96, 91, 91, 87,
+ 87, 85, 86, 83, 84, 89, 89, 95, 95, 102, 102, 110, 110, 118, 119, 128,
+ 129, 137, 138, 149, 149, 159, 160, 173, 174, 179, 180, 187,
+ /* Size 4x16 */
+ 31, 39, 65, 90, 32, 38, 60, 84, 32, 39, 59, 81, 33, 40, 58, 78, 34, 47,
+ 65, 83, 37, 54, 73, 89, 41, 58, 79, 94, 46, 62, 86, 102, 53, 68, 97,
+ 112, 60, 73, 105, 123, 65, 78, 111, 134, 74, 85, 120, 143, 79, 90, 125,
+ 154, 84, 90, 128, 158, 89, 95, 124, 164, 94, 101, 131, 170,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 37, 41, 46, 53, 60, 65, 74, 79, 84, 89, 94, 39, 38,
+ 39, 40, 47, 54, 58, 62, 68, 73, 78, 85, 90, 90, 95, 101, 65, 60, 59, 58,
+ 65, 73, 79, 86, 97, 105, 111, 120, 125, 128, 124, 131, 90, 84, 81, 78,
+ 83, 89, 94, 102, 112, 123, 134, 143, 154, 158, 164, 170,
+ /* Size 8x32 */
+ 32, 32, 36, 44, 58, 79, 88, 93, 31, 32, 35, 42, 55, 75, 83, 88, 31, 32,
+ 35, 41, 54, 73, 81, 88, 31, 32, 34, 41, 53, 72, 79, 84, 32, 33, 36, 42,
+ 53, 71, 78, 84, 32, 34, 37, 42, 53, 70, 77, 83, 32, 34, 38, 42, 52, 69,
+ 76, 82, 34, 35, 42, 48, 57, 73, 79, 81, 34, 36, 44, 50, 59, 75, 81, 84,
+ 36, 37, 48, 54, 63, 78, 85, 86, 39, 39, 50, 58, 68, 84, 88, 90, 40, 40,
+ 51, 59, 70, 85, 91, 92, 44, 42, 53, 63, 74, 90, 97, 97, 47, 45, 56, 66,
+ 79, 95, 99, 98, 49, 46, 57, 67, 81, 97, 104, 105, 53, 50, 60, 71, 86,
+ 103, 109, 106, 57, 53, 63, 74, 90, 108, 111, 113, 59, 54, 64, 75, 91,
+ 111, 119, 115, 65, 59, 68, 79, 97, 118, 123, 122, 69, 62, 71, 83, 100,
+ 122, 127, 125, 71, 64, 73, 84, 102, 125, 135, 131, 79, 71, 79, 90, 109,
+ 133, 137, 136, 81, 72, 80, 91, 110, 135, 145, 141, 82, 73, 81, 92, 111,
+ 136, 147, 147, 87, 77, 85, 96, 114, 140, 148, 151, 90, 80, 87, 99, 113,
+ 135, 153, 160, 92, 83, 88, 102, 117, 133, 153, 163, 95, 85, 88, 103,
+ 120, 137, 155, 168, 98, 88, 89, 103, 121, 141, 160, 169, 100, 91, 90,
+ 103, 120, 139, 161, 175, 103, 94, 92, 103, 119, 137, 158, 175, 106, 97,
+ 93, 104, 118, 135, 155, 176,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 34, 36, 39, 40, 44, 47, 49, 53, 57, 59,
+ 65, 69, 71, 79, 81, 82, 87, 90, 92, 95, 98, 100, 103, 106, 32, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 39, 40, 42, 45, 46, 50, 53, 54, 59, 62, 64,
+ 71, 72, 73, 77, 80, 83, 85, 88, 91, 94, 97, 36, 35, 35, 34, 36, 37, 38,
+ 42, 44, 48, 50, 51, 53, 56, 57, 60, 63, 64, 68, 71, 73, 79, 80, 81, 85,
+ 87, 88, 88, 89, 90, 92, 93, 44, 42, 41, 41, 42, 42, 42, 48, 50, 54, 58,
+ 59, 63, 66, 67, 71, 74, 75, 79, 83, 84, 90, 91, 92, 96, 99, 102, 103,
+ 103, 103, 103, 104, 58, 55, 54, 53, 53, 53, 52, 57, 59, 63, 68, 70, 74,
+ 79, 81, 86, 90, 91, 97, 100, 102, 109, 110, 111, 114, 113, 117, 120,
+ 121, 120, 119, 118, 79, 75, 73, 72, 71, 70, 69, 73, 75, 78, 84, 85, 90,
+ 95, 97, 103, 108, 111, 118, 122, 125, 133, 135, 136, 140, 135, 133, 137,
+ 141, 139, 137, 135, 88, 83, 81, 79, 78, 77, 76, 79, 81, 85, 88, 91, 97,
+ 99, 104, 109, 111, 119, 123, 127, 135, 137, 145, 147, 148, 153, 153,
+ 155, 160, 161, 158, 155, 93, 88, 88, 84, 84, 83, 82, 81, 84, 86, 90, 92,
+ 97, 98, 105, 106, 113, 115, 122, 125, 131, 136, 141, 147, 151, 160, 163,
+ 168, 169, 175, 175, 176 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 53, 63, 45, 55, 62, 67, 53, 62, 80, 84, 63, 67, 84, 101,
+ /* Size 8x8 */
+ 31, 36, 47, 48, 52, 60, 64, 67, 36, 43, 47, 46, 49, 55, 59, 63, 47, 47,
+ 53, 54, 55, 60, 63, 64, 48, 46, 54, 61, 65, 70, 71, 71, 52, 49, 55, 65,
+ 71, 78, 81, 79, 60, 55, 60, 70, 78, 89, 89, 89, 64, 59, 63, 71, 81, 89,
+ 97, 99, 67, 63, 64, 71, 79, 89, 99, 104,
+ /* Size 16x16 */
+ 32, 30, 33, 36, 44, 48, 49, 51, 54, 57, 60, 64, 67, 68, 70, 72, 30, 31,
+ 35, 39, 44, 46, 46, 47, 50, 53, 55, 59, 61, 64, 66, 68, 33, 35, 39, 43,
+ 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 36, 39, 43, 47, 47, 46,
+ 45, 46, 48, 50, 52, 55, 57, 58, 61, 63, 44, 44, 46, 47, 50, 51, 51, 51,
+ 53, 54, 56, 59, 61, 61, 63, 62, 48, 46, 46, 46, 51, 54, 55, 56, 58, 60,
+ 61, 64, 65, 64, 66, 66, 49, 46, 45, 45, 51, 55, 58, 60, 62, 63, 65, 68,
+ 69, 69, 69, 69, 51, 47, 47, 46, 51, 56, 60, 62, 65, 67, 69, 72, 73, 74,
+ 73, 73, 54, 50, 49, 48, 53, 58, 62, 65, 70, 73, 75, 78, 79, 79, 77, 77,
+ 57, 53, 51, 50, 54, 60, 63, 67, 73, 76, 79, 82, 84, 83, 82, 82, 60, 55,
+ 53, 52, 56, 61, 65, 69, 75, 79, 82, 86, 88, 87, 86, 87, 64, 59, 57, 55,
+ 59, 64, 68, 72, 78, 82, 86, 90, 93, 92, 91, 92, 67, 61, 59, 57, 61, 65,
+ 69, 73, 79, 84, 88, 93, 95, 96, 96, 96, 68, 64, 61, 58, 61, 64, 69, 74,
+ 79, 83, 87, 92, 96, 99, 100, 101, 70, 66, 63, 61, 63, 66, 69, 73, 77,
+ 82, 86, 91, 96, 100, 103, 104, 72, 68, 65, 63, 62, 66, 69, 73, 77, 82,
+ 87, 92, 96, 101, 104, 106,
+ /* Size 32x32 */
+ 32, 31, 30, 30, 33, 35, 36, 41, 44, 49, 48, 48, 49, 50, 51, 52, 54, 55,
+ 57, 59, 60, 63, 64, 65, 67, 68, 68, 69, 70, 71, 72, 73, 31, 31, 31, 31,
+ 34, 36, 38, 42, 44, 47, 47, 47, 47, 48, 48, 50, 51, 52, 54, 56, 57, 60,
+ 61, 61, 63, 64, 65, 66, 67, 67, 68, 69, 30, 31, 31, 31, 35, 37, 39, 42,
+ 44, 47, 46, 46, 46, 47, 47, 48, 50, 51, 53, 54, 55, 58, 59, 60, 61, 63,
+ 64, 65, 66, 67, 68, 69, 30, 31, 31, 32, 35, 37, 40, 42, 44, 46, 45, 45,
+ 45, 46, 46, 47, 49, 50, 52, 53, 54, 57, 58, 58, 60, 61, 62, 63, 63, 64,
+ 65, 66, 33, 34, 35, 35, 39, 41, 43, 45, 46, 47, 46, 46, 45, 46, 47, 47,
+ 49, 49, 51, 53, 53, 56, 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 35, 36,
+ 37, 37, 41, 43, 45, 46, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 53, 55, 56, 56, 58, 59, 60, 61, 62, 63, 64, 64, 36, 38, 39, 40, 43, 45,
+ 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 48, 48, 50, 51, 52, 54, 55, 55,
+ 57, 58, 58, 59, 61, 62, 63, 64, 41, 42, 42, 42, 45, 46, 47, 48, 49, 50,
+ 49, 49, 49, 50, 50, 50, 51, 52, 53, 54, 55, 57, 58, 58, 60, 60, 59, 59,
+ 60, 61, 61, 62, 44, 44, 44, 44, 46, 46, 47, 49, 50, 51, 51, 51, 51, 51,
+ 51, 52, 53, 53, 54, 56, 56, 59, 59, 59, 61, 61, 61, 62, 63, 62, 62, 62,
+ 49, 47, 47, 46, 47, 47, 48, 50, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55,
+ 56, 58, 58, 60, 61, 61, 63, 63, 64, 63, 63, 64, 65, 66, 48, 47, 46, 45,
+ 46, 46, 46, 49, 51, 53, 54, 54, 55, 56, 56, 57, 58, 59, 60, 61, 61, 63,
+ 64, 64, 65, 65, 64, 65, 66, 66, 66, 66, 48, 47, 46, 45, 46, 46, 46, 49,
+ 51, 53, 54, 55, 56, 57, 57, 58, 59, 60, 61, 62, 63, 65, 65, 65, 66, 67,
+ 68, 67, 67, 67, 68, 69, 49, 47, 46, 45, 45, 45, 45, 49, 51, 53, 55, 56,
+ 58, 59, 60, 61, 62, 62, 63, 65, 65, 67, 68, 68, 69, 70, 69, 69, 69, 70,
+ 69, 69, 50, 48, 47, 46, 46, 46, 46, 50, 51, 54, 56, 57, 59, 61, 62, 63,
+ 64, 65, 66, 68, 68, 70, 71, 71, 72, 71, 71, 72, 71, 71, 71, 72, 51, 48,
+ 47, 46, 47, 46, 46, 50, 51, 54, 56, 57, 60, 62, 62, 64, 65, 66, 67, 69,
+ 69, 71, 72, 72, 73, 74, 74, 72, 73, 74, 73, 73, 52, 50, 48, 47, 47, 47,
+ 47, 50, 52, 54, 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75,
+ 77, 76, 75, 76, 76, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55,
+ 58, 59, 62, 64, 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 78,
+ 77, 78, 77, 77, 55, 52, 51, 50, 49, 49, 48, 52, 53, 55, 59, 60, 62, 65,
+ 66, 68, 70, 71, 73, 75, 76, 78, 79, 79, 80, 81, 80, 80, 81, 79, 79, 81,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 56, 60, 61, 63, 66, 67, 70, 73, 73,
+ 76, 78, 79, 82, 82, 83, 84, 83, 83, 83, 82, 83, 82, 81, 59, 56, 54, 53,
+ 53, 52, 51, 54, 56, 58, 61, 62, 65, 68, 69, 72, 74, 75, 78, 80, 81, 84,
+ 85, 85, 86, 86, 86, 84, 85, 84, 84, 85, 60, 57, 55, 54, 53, 53, 52, 55,
+ 56, 58, 61, 63, 65, 68, 69, 72, 75, 76, 79, 81, 82, 85, 86, 86, 88, 88,
+ 87, 88, 86, 87, 87, 85, 63, 60, 58, 57, 56, 55, 54, 57, 59, 60, 63, 65,
+ 67, 70, 71, 75, 77, 78, 82, 84, 85, 89, 89, 90, 92, 89, 91, 89, 90, 89,
+ 88, 89, 64, 61, 59, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75,
+ 78, 79, 82, 85, 86, 89, 90, 91, 93, 94, 92, 92, 91, 91, 92, 90, 65, 61,
+ 60, 58, 57, 56, 55, 58, 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 83, 85,
+ 86, 90, 91, 91, 93, 94, 95, 94, 94, 94, 93, 94, 67, 63, 61, 60, 59, 58,
+ 57, 60, 61, 63, 65, 66, 69, 72, 73, 77, 79, 80, 84, 86, 88, 92, 93, 93,
+ 95, 95, 96, 97, 96, 95, 96, 94, 68, 64, 63, 61, 60, 59, 58, 60, 61, 63,
+ 65, 67, 70, 71, 74, 76, 78, 81, 83, 86, 88, 89, 94, 94, 95, 97, 97, 98,
+ 99, 99, 97, 99, 68, 65, 64, 62, 61, 60, 58, 59, 61, 64, 64, 68, 69, 71,
+ 74, 75, 79, 80, 83, 86, 87, 91, 92, 95, 96, 97, 99, 99, 100, 100, 101,
+ 99, 69, 66, 65, 63, 62, 61, 59, 59, 62, 63, 65, 67, 69, 72, 72, 76, 78,
+ 80, 83, 84, 88, 89, 92, 94, 97, 98, 99, 101, 100, 102, 102, 104, 70, 67,
+ 66, 63, 63, 62, 61, 60, 63, 63, 66, 67, 69, 71, 73, 76, 77, 81, 82, 85,
+ 86, 90, 91, 94, 96, 99, 100, 100, 103, 102, 104, 104, 71, 67, 67, 64,
+ 64, 63, 62, 61, 62, 64, 66, 67, 70, 71, 74, 74, 78, 79, 83, 84, 87, 89,
+ 91, 94, 95, 99, 100, 102, 102, 104, 104, 106, 72, 68, 68, 65, 65, 64,
+ 63, 61, 62, 65, 66, 68, 69, 71, 73, 75, 77, 79, 82, 84, 87, 88, 92, 93,
+ 96, 97, 101, 102, 104, 104, 106, 106, 73, 69, 69, 66, 66, 64, 64, 62,
+ 62, 66, 66, 69, 69, 72, 73, 76, 77, 81, 81, 85, 85, 89, 90, 94, 94, 99,
+ 99, 104, 104, 106, 106, 108,
+ /* Size 4x8 */
+ 31, 47, 54, 64, 38, 46, 50, 60, 46, 53, 57, 62, 46, 56, 66, 71, 50, 59,
+ 74, 79, 57, 64, 82, 88, 61, 65, 85, 97, 65, 67, 82, 99,
+ /* Size 8x4 */
+ 31, 38, 46, 46, 50, 57, 61, 65, 47, 46, 53, 56, 59, 64, 65, 67, 54, 50,
+ 57, 66, 74, 82, 85, 82, 64, 60, 62, 71, 79, 88, 97, 99,
+ /* Size 8x16 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 36, 46, 46, 50, 58, 62, 65, 33, 40,
+ 47, 46, 49, 56, 59, 62, 37, 44, 47, 45, 48, 54, 57, 60, 44, 46, 51, 51,
+ 53, 59, 60, 61, 48, 46, 53, 56, 58, 64, 64, 64, 49, 45, 53, 58, 62, 67,
+ 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 54, 49, 55, 62, 70, 77, 77, 76,
+ 57, 51, 56, 64, 73, 82, 83, 81, 60, 53, 58, 65, 75, 85, 89, 85, 64, 57,
+ 61, 68, 78, 89, 93, 89, 66, 59, 63, 69, 79, 91, 94, 93, 68, 61, 63, 71,
+ 79, 87, 96, 98, 70, 63, 63, 70, 80, 89, 97, 100, 72, 65, 63, 69, 77, 86,
+ 95, 102,
+ /* Size 16x8 */
+ 32, 31, 33, 37, 44, 48, 49, 51, 54, 57, 60, 64, 66, 68, 70, 72, 34, 36,
+ 40, 44, 46, 46, 45, 47, 49, 51, 53, 57, 59, 61, 63, 65, 48, 46, 47, 47,
+ 51, 53, 53, 54, 55, 56, 58, 61, 63, 63, 63, 63, 49, 46, 46, 45, 51, 56,
+ 58, 60, 62, 64, 65, 68, 69, 71, 70, 69, 54, 50, 49, 48, 53, 58, 62, 65,
+ 70, 73, 75, 78, 79, 79, 80, 77, 63, 58, 56, 54, 59, 64, 67, 71, 77, 82,
+ 85, 89, 91, 87, 89, 86, 67, 62, 59, 57, 60, 64, 70, 73, 77, 83, 89, 93,
+ 94, 96, 97, 95, 69, 65, 62, 60, 61, 64, 68, 72, 76, 81, 85, 89, 93, 98,
+ 100, 102,
+ /* Size 16x32 */
+ 32, 31, 34, 37, 48, 48, 49, 52, 54, 57, 63, 64, 67, 68, 69, 69, 31, 31,
+ 35, 38, 47, 47, 47, 50, 51, 54, 60, 61, 63, 64, 65, 66, 31, 32, 36, 39,
+ 46, 46, 46, 48, 50, 53, 58, 59, 62, 63, 65, 66, 30, 32, 36, 40, 46, 45,
+ 45, 48, 49, 52, 57, 58, 60, 61, 62, 63, 33, 36, 40, 43, 47, 46, 46, 47,
+ 49, 51, 56, 57, 59, 60, 62, 63, 35, 38, 42, 45, 47, 46, 45, 47, 48, 50,
+ 55, 56, 58, 60, 61, 61, 37, 40, 44, 47, 47, 46, 45, 47, 48, 50, 54, 55,
+ 57, 58, 60, 61, 42, 43, 45, 47, 50, 50, 49, 50, 51, 53, 57, 58, 59, 58,
+ 59, 59, 44, 44, 46, 47, 51, 51, 51, 52, 53, 54, 59, 59, 60, 61, 61, 60,
+ 49, 46, 47, 48, 53, 53, 53, 54, 55, 57, 60, 61, 63, 62, 62, 63, 48, 46,
+ 46, 47, 53, 54, 56, 57, 58, 60, 64, 64, 64, 64, 64, 63, 48, 45, 46, 46,
+ 53, 55, 56, 58, 59, 61, 65, 65, 66, 66, 65, 66, 49, 45, 45, 46, 53, 56,
+ 58, 61, 62, 64, 67, 68, 70, 67, 68, 66, 50, 46, 46, 46, 54, 56, 59, 63,
+ 65, 66, 70, 71, 70, 71, 68, 70, 51, 47, 47, 47, 54, 57, 60, 64, 65, 68,
+ 71, 72, 73, 71, 72, 70, 52, 48, 47, 47, 54, 57, 61, 66, 68, 71, 75, 75,
+ 76, 75, 73, 73, 54, 49, 49, 48, 55, 58, 62, 68, 70, 73, 77, 78, 77, 77,
+ 76, 74, 54, 50, 49, 49, 55, 59, 62, 68, 70, 74, 78, 79, 81, 79, 77, 78,
+ 57, 52, 51, 50, 56, 60, 64, 70, 73, 76, 82, 82, 83, 82, 81, 78, 59, 54,
+ 52, 52, 58, 61, 65, 72, 74, 78, 84, 85, 85, 83, 82, 82, 60, 54, 53, 52,
+ 58, 62, 65, 72, 75, 79, 85, 86, 89, 87, 85, 82, 63, 57, 56, 55, 60, 64,
+ 67, 75, 77, 82, 89, 90, 90, 88, 87, 86, 64, 58, 57, 55, 61, 64, 68, 75,
+ 78, 82, 89, 90, 93, 91, 89, 87, 64, 59, 57, 56, 61, 65, 68, 75, 78, 83,
+ 90, 91, 94, 93, 92, 91, 66, 60, 59, 57, 63, 66, 69, 77, 79, 84, 91, 93,
+ 94, 95, 93, 91, 67, 61, 60, 58, 63, 65, 70, 75, 78, 85, 88, 93, 96, 97,
+ 97, 95, 68, 62, 61, 59, 63, 64, 71, 74, 79, 84, 87, 94, 96, 97, 98, 96,
+ 69, 63, 62, 60, 63, 65, 71, 72, 80, 82, 88, 93, 96, 99, 100, 101, 70,
+ 64, 63, 60, 63, 66, 70, 73, 80, 81, 89, 90, 97, 99, 100, 101, 71, 65,
+ 64, 61, 63, 67, 70, 74, 78, 82, 88, 90, 97, 99, 102, 103, 72, 65, 65,
+ 62, 63, 68, 69, 75, 77, 83, 86, 92, 95, 100, 102, 103, 73, 66, 66, 63,
+ 63, 69, 69, 76, 76, 84, 84, 93, 93, 101, 101, 105,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 31, 31, 32, 32,
+ 36, 38, 40, 43, 44, 46, 46, 45, 45, 46, 47, 48, 49, 50, 52, 54, 54, 57,
+ 58, 59, 60, 61, 62, 63, 64, 65, 65, 66, 34, 35, 36, 36, 40, 42, 44, 45,
+ 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56, 57, 57, 59, 60,
+ 61, 62, 63, 64, 65, 66, 37, 38, 39, 40, 43, 45, 47, 47, 47, 48, 47, 46,
+ 46, 46, 47, 47, 48, 49, 50, 52, 52, 55, 55, 56, 57, 58, 59, 60, 60, 61,
+ 62, 63, 48, 47, 46, 46, 47, 47, 47, 50, 51, 53, 53, 53, 53, 54, 54, 54,
+ 55, 55, 56, 58, 58, 60, 61, 61, 63, 63, 63, 63, 63, 63, 63, 63, 48, 47,
+ 46, 45, 46, 46, 46, 50, 51, 53, 54, 55, 56, 56, 57, 57, 58, 59, 60, 61,
+ 62, 64, 64, 65, 66, 65, 64, 65, 66, 67, 68, 69, 49, 47, 46, 45, 46, 45,
+ 45, 49, 51, 53, 56, 56, 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68,
+ 69, 70, 71, 71, 70, 70, 69, 69, 52, 50, 48, 48, 47, 47, 47, 50, 52, 54,
+ 57, 58, 61, 63, 64, 66, 68, 68, 70, 72, 72, 75, 75, 75, 77, 75, 74, 72,
+ 73, 74, 75, 76, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65,
+ 65, 68, 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76,
+ 57, 54, 53, 52, 51, 50, 50, 53, 54, 57, 60, 61, 64, 66, 68, 71, 73, 74,
+ 76, 78, 79, 82, 82, 83, 84, 85, 84, 82, 81, 82, 83, 84, 63, 60, 58, 57,
+ 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84, 85, 89,
+ 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 64, 61, 59, 58, 57, 56, 55, 58,
+ 59, 61, 64, 65, 68, 71, 72, 75, 78, 79, 82, 85, 86, 90, 90, 91, 93, 93,
+ 94, 93, 90, 90, 92, 93, 67, 63, 62, 60, 59, 58, 57, 59, 60, 63, 64, 66,
+ 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94, 94, 96, 96, 96, 97, 97,
+ 95, 93, 68, 64, 63, 61, 60, 60, 58, 58, 61, 62, 64, 66, 67, 71, 71, 75,
+ 77, 79, 82, 83, 87, 88, 91, 93, 95, 97, 97, 99, 99, 99, 100, 101, 69,
+ 65, 65, 62, 62, 61, 60, 59, 61, 62, 64, 65, 68, 68, 72, 73, 76, 77, 81,
+ 82, 85, 87, 89, 92, 93, 97, 98, 100, 100, 102, 102, 101, 69, 66, 66, 63,
+ 63, 61, 61, 59, 60, 63, 63, 66, 66, 70, 70, 73, 74, 78, 78, 82, 82, 86,
+ 87, 91, 91, 95, 96, 101, 101, 103, 103, 105,
+ /* Size 4x16 */
+ 31, 48, 57, 68, 32, 46, 53, 63, 36, 46, 51, 60, 40, 46, 50, 58, 44, 51,
+ 54, 61, 46, 54, 60, 64, 45, 56, 64, 67, 47, 57, 68, 71, 49, 58, 73, 77,
+ 52, 60, 76, 82, 54, 62, 79, 87, 58, 64, 82, 91, 60, 66, 84, 95, 62, 64,
+ 84, 97, 64, 66, 81, 99, 65, 68, 83, 100,
+ /* Size 16x4 */
+ 31, 32, 36, 40, 44, 46, 45, 47, 49, 52, 54, 58, 60, 62, 64, 65, 48, 46,
+ 46, 46, 51, 54, 56, 57, 58, 60, 62, 64, 66, 64, 66, 68, 57, 53, 51, 50,
+ 54, 60, 64, 68, 73, 76, 79, 82, 84, 84, 81, 83, 68, 63, 60, 58, 61, 64,
+ 67, 71, 77, 82, 87, 91, 95, 97, 99, 100,
+ /* Size 8x32 */
+ 32, 34, 48, 49, 54, 63, 67, 69, 31, 35, 47, 47, 51, 60, 63, 65, 31, 36,
+ 46, 46, 50, 58, 62, 65, 30, 36, 46, 45, 49, 57, 60, 62, 33, 40, 47, 46,
+ 49, 56, 59, 62, 35, 42, 47, 45, 48, 55, 58, 61, 37, 44, 47, 45, 48, 54,
+ 57, 60, 42, 45, 50, 49, 51, 57, 59, 59, 44, 46, 51, 51, 53, 59, 60, 61,
+ 49, 47, 53, 53, 55, 60, 63, 62, 48, 46, 53, 56, 58, 64, 64, 64, 48, 46,
+ 53, 56, 59, 65, 66, 65, 49, 45, 53, 58, 62, 67, 70, 68, 50, 46, 54, 59,
+ 65, 70, 70, 68, 51, 47, 54, 60, 65, 71, 73, 72, 52, 47, 54, 61, 68, 75,
+ 76, 73, 54, 49, 55, 62, 70, 77, 77, 76, 54, 49, 55, 62, 70, 78, 81, 77,
+ 57, 51, 56, 64, 73, 82, 83, 81, 59, 52, 58, 65, 74, 84, 85, 82, 60, 53,
+ 58, 65, 75, 85, 89, 85, 63, 56, 60, 67, 77, 89, 90, 87, 64, 57, 61, 68,
+ 78, 89, 93, 89, 64, 57, 61, 68, 78, 90, 94, 92, 66, 59, 63, 69, 79, 91,
+ 94, 93, 67, 60, 63, 70, 78, 88, 96, 97, 68, 61, 63, 71, 79, 87, 96, 98,
+ 69, 62, 63, 71, 80, 88, 96, 100, 70, 63, 63, 70, 80, 89, 97, 100, 71,
+ 64, 63, 70, 78, 88, 97, 102, 72, 65, 63, 69, 77, 86, 95, 102, 73, 66,
+ 63, 69, 76, 84, 93, 101,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 33, 35, 37, 42, 44, 49, 48, 48, 49, 50, 51, 52, 54, 54,
+ 57, 59, 60, 63, 64, 64, 66, 67, 68, 69, 70, 71, 72, 73, 34, 35, 36, 36,
+ 40, 42, 44, 45, 46, 47, 46, 46, 45, 46, 47, 47, 49, 49, 51, 52, 53, 56,
+ 57, 57, 59, 60, 61, 62, 63, 64, 65, 66, 48, 47, 46, 46, 47, 47, 47, 50,
+ 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 58, 58, 60, 61, 61, 63, 63,
+ 63, 63, 63, 63, 63, 63, 49, 47, 46, 45, 46, 45, 45, 49, 51, 53, 56, 56,
+ 58, 59, 60, 61, 62, 62, 64, 65, 65, 67, 68, 68, 69, 70, 71, 71, 70, 70,
+ 69, 69, 54, 51, 50, 49, 49, 48, 48, 51, 53, 55, 58, 59, 62, 65, 65, 68,
+ 70, 70, 73, 74, 75, 77, 78, 78, 79, 78, 79, 80, 80, 78, 77, 76, 63, 60,
+ 58, 57, 56, 55, 54, 57, 59, 60, 64, 65, 67, 70, 71, 75, 77, 78, 82, 84,
+ 85, 89, 89, 90, 91, 88, 87, 88, 89, 88, 86, 84, 67, 63, 62, 60, 59, 58,
+ 57, 59, 60, 63, 64, 66, 70, 70, 73, 76, 77, 81, 83, 85, 89, 90, 93, 94,
+ 94, 96, 96, 96, 97, 97, 95, 93, 69, 65, 65, 62, 62, 61, 60, 59, 61, 62,
+ 64, 65, 68, 68, 72, 73, 76, 77, 81, 82, 85, 87, 89, 92, 93, 97, 98, 100,
+ 100, 102, 102, 101 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 37, 58, 81, 37, 54, 72, 91, 58, 72, 102, 121, 81, 91, 121, 156,
+ /* Size 8x8 */
+ 32, 32, 35, 42, 53, 68, 78, 90, 32, 33, 36, 42, 51, 64, 74, 84, 35, 36,
+ 46, 52, 60, 72, 80, 87, 42, 42, 52, 63, 73, 84, 92, 98, 53, 51, 60, 73,
+ 86, 100, 109, 114, 68, 64, 72, 84, 100, 117, 128, 133, 78, 74, 80, 92,
+ 109, 128, 140, 155, 90, 84, 87, 98, 114, 133, 155, 168,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 54, 59, 65, 74, 82, 87, 92, 97, 31, 32,
+ 32, 32, 34, 35, 39, 45, 50, 55, 61, 69, 76, 81, 87, 92, 31, 32, 33, 33,
+ 35, 36, 40, 44, 49, 54, 59, 67, 73, 78, 83, 88, 32, 32, 33, 35, 37, 38,
+ 41, 45, 49, 53, 58, 65, 71, 75, 80, 86, 34, 34, 35, 37, 39, 42, 46, 50,
+ 54, 58, 63, 70, 76, 80, 84, 85, 36, 35, 36, 38, 42, 48, 52, 56, 60, 64,
+ 68, 75, 80, 85, 90, 91, 41, 39, 40, 41, 46, 52, 57, 62, 67, 71, 75, 83,
+ 88, 92, 95, 97, 47, 45, 44, 45, 50, 56, 62, 69, 75, 79, 84, 91, 97, 100,
+ 102, 104, 54, 50, 49, 49, 54, 60, 67, 75, 82, 87, 92, 100, 106, 110,
+ 109, 112, 59, 55, 54, 53, 58, 64, 71, 79, 87, 92, 98, 106, 112, 117,
+ 117, 121, 65, 61, 59, 58, 63, 68, 75, 84, 92, 98, 105, 114, 120, 125,
+ 126, 130, 74, 69, 67, 65, 70, 75, 83, 91, 100, 106, 114, 123, 131, 135,
+ 137, 140, 82, 76, 73, 71, 76, 80, 88, 97, 106, 112, 120, 131, 139, 144,
+ 148, 150, 87, 81, 78, 75, 80, 85, 92, 100, 110, 117, 125, 135, 144, 150,
+ 155, 162, 92, 87, 83, 80, 84, 90, 95, 102, 109, 117, 126, 137, 148, 155,
+ 162, 168, 97, 92, 88, 86, 85, 91, 97, 104, 112, 121, 130, 140, 150, 162,
+ 168, 174,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 54, 56,
+ 59, 64, 65, 71, 74, 80, 82, 83, 87, 90, 92, 95, 97, 100, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 51, 53, 56, 61, 62, 68,
+ 71, 76, 78, 78, 83, 85, 88, 90, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 38, 39, 42, 45, 45, 50, 52, 55, 60, 61, 67, 69, 74, 76, 77,
+ 81, 84, 87, 89, 92, 95, 31, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37,
+ 38, 41, 44, 44, 49, 51, 54, 58, 59, 65, 68, 72, 74, 75, 79, 81, 84, 86,
+ 88, 90, 31, 32, 32, 32, 33, 33, 33, 34, 35, 36, 36, 39, 40, 42, 44, 45,
+ 49, 51, 54, 58, 59, 64, 67, 71, 73, 74, 78, 80, 83, 85, 88, 90, 31, 32,
+ 32, 32, 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 45, 45, 50, 51, 54, 58,
+ 59, 64, 67, 71, 73, 74, 78, 80, 82, 84, 86, 89, 32, 32, 32, 32, 33, 34,
+ 35, 36, 37, 38, 38, 40, 41, 42, 45, 46, 49, 51, 53, 57, 58, 63, 65, 69,
+ 71, 72, 75, 78, 80, 83, 86, 89, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39,
+ 40, 42, 43, 44, 47, 47, 51, 53, 55, 59, 60, 65, 67, 71, 73, 73, 77, 78,
+ 80, 82, 84, 86, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47,
+ 50, 51, 54, 56, 58, 62, 63, 68, 70, 74, 76, 76, 80, 82, 84, 85, 85, 86,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 59, 60,
+ 62, 66, 67, 72, 74, 78, 79, 80, 83, 84, 85, 87, 90, 92, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 60, 61, 64, 67, 68, 73,
+ 75, 79, 80, 81, 85, 87, 90, 91, 91, 92, 39, 38, 38, 37, 39, 39, 40, 42,
+ 45, 49, 50, 54, 55, 58, 60, 61, 65, 66, 69, 72, 73, 78, 80, 84, 86, 86,
+ 90, 91, 91, 92, 95, 97, 41, 40, 39, 38, 40, 40, 41, 43, 46, 50, 52, 55,
+ 57, 60, 62, 63, 67, 69, 71, 75, 75, 80, 83, 86, 88, 89, 92, 93, 95, 97,
+ 97, 98, 44, 42, 42, 41, 42, 42, 42, 44, 47, 52, 54, 58, 60, 63, 66, 67,
+ 71, 73, 75, 79, 79, 84, 86, 90, 92, 92, 96, 98, 98, 98, 101, 104, 47,
+ 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66, 69, 70, 75, 77, 79,
+ 83, 84, 89, 91, 95, 97, 97, 100, 99, 102, 105, 104, 104, 48, 46, 45, 44,
+ 45, 45, 46, 47, 51, 55, 57, 61, 63, 67, 70, 71, 76, 78, 80, 84, 85, 90,
+ 93, 96, 98, 99, 102, 106, 106, 105, 108, 111, 54, 51, 50, 49, 49, 50,
+ 49, 51, 54, 59, 60, 65, 67, 71, 75, 76, 82, 84, 87, 91, 92, 97, 100,
+ 104, 106, 106, 110, 108, 109, 112, 112, 111, 56, 53, 52, 51, 51, 51, 51,
+ 53, 56, 60, 61, 66, 69, 73, 77, 78, 84, 86, 89, 93, 94, 100, 102, 106,
+ 108, 109, 112, 113, 115, 114, 116, 119, 59, 56, 55, 54, 54, 54, 53, 55,
+ 58, 62, 64, 69, 71, 75, 79, 80, 87, 89, 92, 97, 98, 103, 106, 110, 112,
+ 113, 117, 118, 117, 121, 121, 119, 64, 61, 60, 58, 58, 58, 57, 59, 62,
+ 66, 67, 72, 75, 79, 83, 84, 91, 93, 97, 102, 103, 109, 112, 116, 118,
+ 119, 122, 121, 125, 123, 125, 128, 65, 62, 61, 59, 59, 59, 58, 60, 63,
+ 67, 68, 73, 75, 79, 84, 85, 92, 94, 98, 103, 105, 111, 114, 118, 120,
+ 121, 125, 129, 126, 129, 130, 129, 71, 68, 67, 65, 64, 64, 63, 65, 68,
+ 72, 73, 78, 80, 84, 89, 90, 97, 100, 103, 109, 111, 117, 120, 125, 127,
+ 128, 133, 130, 134, 133, 133, 137, 74, 71, 69, 68, 67, 67, 65, 67, 70,
+ 74, 75, 80, 83, 86, 91, 93, 100, 102, 106, 112, 114, 120, 123, 128, 131,
+ 131, 135, 137, 137, 138, 140, 137, 80, 76, 74, 72, 71, 71, 69, 71, 74,
+ 78, 79, 84, 86, 90, 95, 96, 104, 106, 110, 116, 118, 125, 128, 134, 136,
+ 137, 142, 141, 142, 143, 143, 147, 82, 78, 76, 74, 73, 73, 71, 73, 76,
+ 79, 80, 86, 88, 92, 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139,
+ 139, 144, 147, 148, 147, 150, 148, 83, 78, 77, 75, 74, 74, 72, 73, 76,
+ 80, 81, 86, 89, 92, 97, 99, 106, 109, 113, 119, 121, 128, 131, 137, 139,
+ 140, 145, 150, 152, 155, 152, 157, 87, 83, 81, 79, 78, 78, 75, 77, 80,
+ 83, 85, 90, 92, 96, 100, 102, 110, 112, 117, 122, 125, 133, 135, 142,
+ 144, 145, 150, 151, 155, 158, 162, 158, 90, 85, 84, 81, 80, 80, 78, 78,
+ 82, 84, 87, 91, 93, 98, 99, 106, 108, 113, 118, 121, 129, 130, 137, 141,
+ 147, 150, 151, 156, 156, 161, 164, 169, 92, 88, 87, 84, 83, 82, 80, 80,
+ 84, 85, 90, 91, 95, 98, 102, 106, 109, 115, 117, 125, 126, 134, 137,
+ 142, 148, 152, 155, 156, 162, 162, 168, 170, 95, 90, 89, 86, 85, 84, 83,
+ 82, 85, 87, 91, 92, 97, 98, 105, 105, 112, 114, 121, 123, 129, 133, 138,
+ 143, 147, 155, 158, 161, 162, 168, 168, 174, 97, 92, 92, 88, 88, 86, 86,
+ 84, 85, 90, 91, 95, 97, 101, 104, 108, 112, 116, 121, 125, 130, 133,
+ 140, 143, 150, 152, 162, 164, 168, 168, 174, 175, 100, 95, 95, 90, 90,
+ 89, 89, 86, 86, 92, 92, 97, 98, 104, 104, 111, 111, 119, 119, 128, 129,
+ 137, 137, 147, 148, 157, 158, 169, 170, 174, 175, 181,
+ /* Size 4x8 */
+ 32, 35, 59, 83, 32, 36, 57, 78, 34, 47, 65, 82, 41, 53, 78, 97, 51, 61,
+ 92, 111, 65, 73, 108, 129, 75, 81, 117, 148, 86, 92, 119, 154,
+ /* Size 8x4 */
+ 32, 32, 34, 41, 51, 65, 75, 86, 35, 36, 47, 53, 61, 73, 81, 92, 59, 57,
+ 65, 78, 92, 108, 117, 119, 83, 78, 82, 97, 111, 129, 148, 154,
+ /* Size 8x16 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 34, 41, 50, 61, 76, 85, 31, 33,
+ 35, 42, 49, 59, 73, 81, 32, 34, 37, 42, 49, 58, 71, 79, 34, 35, 41, 48,
+ 54, 63, 76, 81, 36, 36, 46, 54, 60, 68, 80, 87, 41, 40, 49, 60, 67, 76,
+ 88, 93, 47, 44, 53, 66, 75, 84, 97, 101, 53, 50, 57, 71, 82, 92, 106,
+ 108, 58, 54, 61, 75, 87, 98, 112, 116, 65, 59, 66, 79, 92, 105, 120,
+ 124, 74, 67, 73, 86, 100, 113, 131, 134, 82, 73, 79, 92, 105, 120, 139,
+ 142, 87, 78, 83, 96, 110, 125, 144, 153, 92, 83, 84, 97, 114, 132, 150,
+ 157, 97, 88, 86, 97, 111, 128, 147, 163,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 34, 36, 41, 47, 53, 58, 65, 74, 82, 87, 92, 97, 31, 32,
+ 33, 34, 35, 36, 40, 44, 50, 54, 59, 67, 73, 78, 83, 88, 35, 34, 35, 37,
+ 41, 46, 49, 53, 57, 61, 66, 73, 79, 83, 84, 86, 44, 41, 42, 42, 48, 54,
+ 60, 66, 71, 75, 79, 86, 92, 96, 97, 97, 53, 50, 49, 49, 54, 60, 67, 75,
+ 82, 87, 92, 100, 105, 110, 114, 111, 65, 61, 59, 58, 63, 68, 76, 84, 92,
+ 98, 105, 113, 120, 125, 132, 128, 82, 76, 73, 71, 76, 80, 88, 97, 106,
+ 112, 120, 131, 139, 144, 150, 147, 90, 85, 81, 79, 81, 87, 93, 101, 108,
+ 116, 124, 134, 142, 153, 157, 163,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 53, 62, 65, 79, 82, 88, 90, 93, 31, 32,
+ 32, 32, 35, 35, 42, 45, 51, 59, 62, 75, 78, 83, 86, 88, 31, 32, 32, 32,
+ 34, 35, 41, 45, 50, 58, 61, 74, 76, 82, 85, 88, 31, 32, 32, 33, 34, 34,
+ 41, 44, 49, 57, 59, 72, 74, 79, 82, 84, 31, 32, 33, 34, 35, 36, 42, 44,
+ 49, 57, 59, 71, 73, 79, 81, 84, 32, 32, 33, 34, 36, 36, 42, 45, 50, 57,
+ 59, 71, 73, 78, 80, 82, 32, 33, 34, 35, 37, 38, 42, 45, 49, 56, 58, 69,
+ 71, 76, 79, 83, 32, 33, 34, 36, 39, 40, 44, 47, 51, 58, 60, 71, 73, 76,
+ 78, 80, 34, 34, 35, 37, 41, 42, 48, 50, 54, 61, 63, 73, 76, 81, 81, 80,
+ 35, 34, 36, 38, 45, 47, 52, 55, 59, 65, 67, 77, 79, 82, 83, 86, 36, 34,
+ 36, 38, 46, 48, 54, 56, 60, 66, 68, 78, 80, 85, 87, 86, 39, 37, 39, 40,
+ 48, 50, 58, 60, 65, 71, 73, 84, 86, 89, 88, 91, 41, 39, 40, 41, 49, 51,
+ 60, 62, 67, 74, 76, 86, 88, 91, 93, 91, 44, 41, 42, 43, 51, 53, 63, 66,
+ 71, 78, 79, 90, 92, 97, 94, 97, 47, 44, 44, 45, 53, 56, 66, 69, 75, 82,
+ 84, 95, 97, 98, 101, 98, 48, 45, 45, 46, 54, 56, 67, 70, 76, 83, 85, 96,
+ 98, 104, 101, 105, 53, 49, 50, 50, 57, 60, 71, 75, 82, 90, 92, 103, 106,
+ 107, 108, 105, 55, 51, 51, 51, 59, 61, 72, 77, 84, 92, 94, 106, 108,
+ 111, 110, 112, 58, 54, 54, 54, 61, 63, 75, 79, 87, 95, 98, 110, 112,
+ 117, 116, 113, 63, 58, 58, 57, 65, 67, 78, 83, 91, 100, 103, 116, 118,
+ 119, 119, 121, 65, 60, 59, 58, 66, 68, 79, 84, 92, 102, 105, 118, 120,
+ 127, 124, 122, 71, 65, 64, 63, 71, 73, 84, 89, 97, 108, 111, 125, 127,
+ 129, 129, 130, 74, 68, 67, 66, 73, 75, 86, 91, 100, 110, 113, 128, 131,
+ 135, 134, 130, 79, 72, 71, 70, 77, 79, 90, 95, 104, 115, 118, 133, 136,
+ 140, 139, 140, 82, 75, 73, 72, 79, 81, 92, 97, 105, 117, 120, 136, 139,
+ 145, 142, 140, 82, 75, 74, 72, 79, 81, 92, 97, 106, 117, 121, 136, 139,
+ 148, 150, 149, 87, 79, 78, 76, 83, 85, 96, 100, 110, 120, 125, 141, 144,
+ 148, 153, 150, 89, 82, 81, 78, 83, 87, 97, 99, 113, 118, 128, 139, 145,
+ 153, 157, 161, 92, 84, 83, 80, 84, 89, 97, 101, 114, 116, 132, 135, 150,
+ 153, 157, 162, 94, 86, 85, 82, 85, 92, 97, 104, 112, 119, 130, 136, 151,
+ 154, 163, 166, 97, 88, 88, 85, 86, 94, 97, 107, 111, 123, 128, 140, 147,
+ 159, 163, 167, 99, 91, 91, 87, 87, 97, 97, 110, 110, 126, 126, 144, 144,
+ 163, 163, 173,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 49, 51, 54, 58, 60, 65,
+ 68, 72, 75, 75, 79, 82, 84, 86, 88, 91, 31, 32, 32, 32, 33, 33, 34, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64, 67, 71, 73, 74,
+ 78, 81, 83, 85, 88, 91, 32, 32, 32, 33, 34, 34, 35, 36, 37, 38, 38, 40,
+ 41, 43, 45, 46, 50, 51, 54, 57, 58, 63, 66, 70, 72, 72, 76, 78, 80, 82,
+ 85, 87, 35, 35, 34, 34, 35, 36, 37, 39, 41, 45, 46, 48, 49, 51, 53, 54,
+ 57, 59, 61, 65, 66, 71, 73, 77, 79, 79, 83, 83, 84, 85, 86, 87, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 60, 61, 63, 67,
+ 68, 73, 75, 79, 81, 81, 85, 87, 89, 92, 94, 97, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90,
+ 92, 92, 96, 97, 97, 97, 97, 97, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 75, 77, 79, 83, 84, 89, 91, 95, 97, 97, 100, 99,
+ 101, 104, 107, 110, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67,
+ 71, 75, 76, 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114,
+ 112, 111, 110, 62, 59, 58, 57, 57, 57, 56, 58, 61, 65, 66, 71, 74, 78,
+ 82, 83, 90, 92, 95, 100, 102, 108, 110, 115, 117, 117, 120, 118, 116,
+ 119, 123, 126, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79,
+ 84, 85, 92, 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132,
+ 130, 128, 126, 79, 75, 74, 72, 71, 71, 69, 71, 73, 77, 78, 84, 86, 90,
+ 95, 96, 103, 106, 110, 116, 118, 125, 128, 133, 136, 136, 141, 139, 135,
+ 136, 140, 144, 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92,
+ 97, 98, 106, 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150,
+ 151, 147, 144, 88, 83, 82, 79, 79, 78, 76, 76, 81, 82, 85, 89, 91, 97,
+ 98, 104, 107, 111, 117, 119, 127, 129, 135, 140, 145, 148, 148, 153,
+ 153, 154, 159, 163, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93,
+ 94, 101, 101, 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153,
+ 157, 157, 163, 163, 163, 93, 88, 88, 84, 84, 82, 83, 80, 80, 86, 86, 91,
+ 91, 97, 98, 105, 105, 112, 113, 121, 122, 130, 130, 140, 140, 149, 150,
+ 161, 162, 166, 167, 173,
+ /* Size 4x16 */
+ 31, 36, 62, 88, 32, 35, 58, 82, 32, 36, 57, 79, 33, 38, 56, 76, 34, 42,
+ 61, 81, 34, 48, 66, 85, 39, 51, 74, 91, 44, 56, 82, 98, 49, 60, 90, 107,
+ 54, 63, 95, 117, 60, 68, 102, 127, 68, 75, 110, 135, 75, 81, 117, 145,
+ 79, 85, 120, 148, 84, 89, 116, 153, 88, 94, 123, 159,
+ /* Size 16x4 */
+ 31, 32, 32, 33, 34, 34, 39, 44, 49, 54, 60, 68, 75, 79, 84, 88, 36, 35,
+ 36, 38, 42, 48, 51, 56, 60, 63, 68, 75, 81, 85, 89, 94, 62, 58, 57, 56,
+ 61, 66, 74, 82, 90, 95, 102, 110, 117, 120, 116, 123, 88, 82, 79, 76,
+ 81, 85, 91, 98, 107, 117, 127, 135, 145, 148, 153, 159,
+ /* Size 8x32 */
+ 32, 31, 35, 44, 53, 65, 82, 90, 31, 32, 35, 42, 51, 62, 78, 86, 31, 32,
+ 34, 41, 50, 61, 76, 85, 31, 32, 34, 41, 49, 59, 74, 82, 31, 33, 35, 42,
+ 49, 59, 73, 81, 32, 33, 36, 42, 50, 59, 73, 80, 32, 34, 37, 42, 49, 58,
+ 71, 79, 32, 34, 39, 44, 51, 60, 73, 78, 34, 35, 41, 48, 54, 63, 76, 81,
+ 35, 36, 45, 52, 59, 67, 79, 83, 36, 36, 46, 54, 60, 68, 80, 87, 39, 39,
+ 48, 58, 65, 73, 86, 88, 41, 40, 49, 60, 67, 76, 88, 93, 44, 42, 51, 63,
+ 71, 79, 92, 94, 47, 44, 53, 66, 75, 84, 97, 101, 48, 45, 54, 67, 76, 85,
+ 98, 101, 53, 50, 57, 71, 82, 92, 106, 108, 55, 51, 59, 72, 84, 94, 108,
+ 110, 58, 54, 61, 75, 87, 98, 112, 116, 63, 58, 65, 78, 91, 103, 118,
+ 119, 65, 59, 66, 79, 92, 105, 120, 124, 71, 64, 71, 84, 97, 111, 127,
+ 129, 74, 67, 73, 86, 100, 113, 131, 134, 79, 71, 77, 90, 104, 118, 136,
+ 139, 82, 73, 79, 92, 105, 120, 139, 142, 82, 74, 79, 92, 106, 121, 139,
+ 150, 87, 78, 83, 96, 110, 125, 144, 153, 89, 81, 83, 97, 113, 128, 145,
+ 157, 92, 83, 84, 97, 114, 132, 150, 157, 94, 85, 85, 97, 112, 130, 151,
+ 163, 97, 88, 86, 97, 111, 128, 147, 163, 99, 91, 87, 97, 110, 126, 144,
+ 163,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 53, 55,
+ 58, 63, 65, 71, 74, 79, 82, 82, 87, 89, 92, 94, 97, 99, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 50, 51, 54, 58, 59, 64,
+ 67, 71, 73, 74, 78, 81, 83, 85, 88, 91, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 57, 59, 61, 65, 66, 71, 73, 77, 79, 79,
+ 83, 83, 84, 85, 86, 87, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 71, 72, 75, 78, 79, 84, 86, 90, 92, 92, 96, 97, 97, 97,
+ 97, 97, 53, 51, 50, 49, 49, 50, 49, 51, 54, 59, 60, 65, 67, 71, 75, 76,
+ 82, 84, 87, 91, 92, 97, 100, 104, 105, 106, 110, 113, 114, 112, 111,
+ 110, 65, 62, 61, 59, 59, 59, 58, 60, 63, 67, 68, 73, 76, 79, 84, 85, 92,
+ 94, 98, 103, 105, 111, 113, 118, 120, 121, 125, 128, 132, 130, 128, 126,
+ 82, 78, 76, 74, 73, 73, 71, 73, 76, 79, 80, 86, 88, 92, 97, 98, 106,
+ 108, 112, 118, 120, 127, 131, 136, 139, 139, 144, 145, 150, 151, 147,
+ 144, 90, 86, 85, 82, 81, 80, 79, 78, 81, 83, 87, 88, 93, 94, 101, 101,
+ 108, 110, 116, 119, 124, 129, 134, 139, 142, 150, 153, 157, 157, 163,
+ 163, 163 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 45, 51, 61, 45, 54, 59, 65, 51, 59, 75, 81, 61, 65, 81, 97,
+ /* Size 8x8 */
+ 31, 34, 46, 47, 50, 57, 61, 65, 34, 39, 47, 45, 48, 53, 57, 61, 46, 47,
+ 52, 52, 54, 58, 61, 62, 47, 45, 52, 58, 62, 65, 68, 68, 50, 48, 54, 62,
+ 68, 73, 77, 76, 57, 53, 58, 65, 73, 82, 86, 86, 61, 57, 61, 68, 77, 86,
+ 91, 95, 65, 61, 62, 68, 76, 86, 95, 100,
+ /* Size 16x16 */
+ 32, 31, 33, 36, 41, 49, 49, 50, 52, 54, 57, 61, 64, 67, 68, 70, 31, 31,
+ 34, 39, 42, 47, 46, 47, 49, 51, 53, 57, 60, 62, 64, 66, 33, 34, 37, 42,
+ 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 36, 39, 42, 47, 47, 48,
+ 46, 46, 47, 48, 50, 53, 55, 57, 59, 61, 41, 42, 44, 47, 48, 50, 49, 50,
+ 50, 52, 53, 56, 58, 60, 61, 60, 49, 47, 47, 48, 50, 53, 53, 54, 54, 55,
+ 56, 59, 61, 63, 64, 64, 49, 46, 46, 46, 49, 53, 55, 57, 59, 60, 61, 64,
+ 66, 67, 67, 67, 50, 47, 46, 46, 50, 54, 57, 61, 63, 64, 66, 69, 70, 72,
+ 71, 71, 52, 49, 47, 47, 50, 54, 59, 63, 66, 68, 70, 73, 75, 77, 75, 75,
+ 54, 51, 49, 48, 52, 55, 60, 64, 68, 71, 73, 76, 79, 80, 79, 79, 57, 53,
+ 51, 50, 53, 56, 61, 66, 70, 73, 76, 80, 82, 84, 83, 84, 61, 57, 55, 53,
+ 56, 59, 64, 69, 73, 76, 80, 84, 87, 89, 88, 88, 64, 60, 57, 55, 58, 61,
+ 66, 70, 75, 79, 82, 87, 91, 93, 93, 93, 67, 62, 59, 57, 60, 63, 67, 72,
+ 77, 80, 84, 89, 93, 95, 96, 97, 68, 64, 61, 59, 61, 64, 67, 71, 75, 79,
+ 83, 88, 93, 96, 99, 100, 70, 66, 63, 61, 60, 64, 67, 71, 75, 79, 84, 88,
+ 93, 97, 100, 102,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 65, 67, 67, 68, 69, 70, 71, 31, 31, 31, 31,
+ 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 50, 50, 52, 54, 54, 57,
+ 58, 60, 61, 61, 63, 64, 65, 65, 66, 67, 31, 31, 31, 31, 34, 35, 39, 40,
+ 42, 46, 47, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53, 56, 57, 59, 60, 60,
+ 62, 63, 64, 65, 66, 67, 30, 31, 31, 32, 34, 35, 40, 41, 42, 45, 46, 45,
+ 45, 45, 46, 46, 47, 48, 49, 51, 52, 54, 55, 57, 58, 58, 60, 61, 62, 62,
+ 63, 64, 33, 34, 34, 34, 37, 38, 42, 43, 44, 46, 47, 46, 46, 45, 46, 46,
+ 47, 48, 49, 51, 51, 53, 55, 56, 57, 57, 59, 60, 61, 62, 63, 64, 33, 34,
+ 35, 35, 38, 39, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51,
+ 51, 53, 54, 56, 57, 57, 59, 60, 60, 61, 62, 62, 36, 38, 39, 40, 42, 43,
+ 47, 47, 47, 47, 48, 46, 46, 45, 46, 46, 47, 47, 48, 49, 50, 52, 53, 54,
+ 55, 55, 57, 58, 59, 60, 61, 62, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48,
+ 49, 48, 47, 47, 47, 47, 48, 49, 49, 51, 51, 53, 54, 55, 56, 56, 58, 58,
+ 58, 59, 60, 60, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49,
+ 50, 50, 50, 51, 52, 53, 53, 55, 56, 57, 58, 58, 60, 61, 61, 61, 60, 60,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 53, 54,
+ 55, 55, 56, 58, 58, 60, 60, 61, 62, 61, 61, 62, 63, 64, 49, 47, 47, 46,
+ 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56, 56, 58,
+ 59, 60, 61, 61, 63, 63, 64, 64, 64, 64, 48, 47, 46, 45, 46, 46, 46, 48,
+ 49, 52, 53, 54, 55, 55, 56, 56, 57, 58, 58, 59, 60, 61, 62, 63, 64, 64,
+ 66, 65, 65, 65, 66, 67, 49, 47, 46, 45, 46, 46, 46, 47, 49, 52, 53, 55,
+ 55, 57, 57, 58, 59, 59, 60, 61, 61, 63, 64, 65, 66, 66, 67, 67, 67, 68,
+ 67, 67, 49, 47, 46, 45, 45, 45, 45, 47, 49, 52, 53, 55, 57, 58, 59, 60,
+ 61, 62, 62, 63, 63, 65, 66, 67, 68, 68, 69, 70, 69, 68, 69, 70, 50, 48,
+ 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59, 61, 61, 63, 64, 64, 66,
+ 66, 68, 69, 70, 70, 71, 72, 70, 71, 72, 71, 70, 50, 48, 47, 46, 46, 46,
+ 46, 47, 50, 53, 54, 56, 58, 60, 61, 61, 63, 64, 65, 66, 67, 68, 69, 71,
+ 71, 71, 73, 74, 73, 72, 73, 74, 52, 50, 49, 47, 47, 47, 47, 48, 50, 53,
+ 54, 57, 59, 61, 63, 63, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 75,
+ 75, 76, 75, 74, 53, 50, 50, 48, 48, 48, 47, 49, 51, 54, 55, 58, 59, 62,
+ 64, 64, 67, 68, 69, 71, 71, 73, 74, 76, 77, 77, 78, 78, 78, 76, 77, 78,
+ 54, 52, 51, 49, 49, 49, 48, 49, 52, 55, 55, 58, 60, 62, 64, 65, 68, 69,
+ 71, 73, 73, 75, 76, 78, 79, 79, 80, 80, 79, 80, 79, 78, 56, 54, 53, 51,
+ 51, 51, 49, 51, 53, 55, 56, 59, 61, 63, 66, 66, 70, 71, 73, 75, 76, 78,
+ 79, 81, 82, 82, 83, 81, 83, 81, 81, 82, 57, 54, 53, 52, 51, 51, 50, 51,
+ 53, 56, 56, 60, 61, 63, 66, 67, 70, 71, 73, 76, 76, 79, 80, 82, 82, 83,
+ 84, 85, 83, 84, 84, 82, 60, 57, 56, 54, 53, 53, 52, 53, 55, 58, 58, 61,
+ 63, 65, 68, 68, 72, 73, 75, 78, 79, 82, 83, 85, 86, 86, 88, 86, 87, 86,
+ 85, 86, 61, 58, 57, 55, 55, 54, 53, 54, 56, 58, 59, 62, 64, 66, 69, 69,
+ 73, 74, 76, 79, 80, 83, 84, 86, 87, 88, 89, 89, 88, 88, 88, 86, 63, 60,
+ 59, 57, 56, 56, 54, 55, 57, 60, 60, 63, 65, 67, 70, 71, 75, 76, 78, 81,
+ 82, 85, 86, 89, 90, 90, 92, 91, 91, 90, 89, 91, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 60, 61, 64, 66, 68, 70, 71, 75, 77, 79, 82, 82, 86, 87, 90,
+ 91, 91, 93, 93, 93, 92, 93, 91, 65, 61, 60, 58, 57, 57, 55, 56, 58, 61,
+ 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 88, 90, 91, 91, 93, 94,
+ 95, 95, 93, 95, 67, 63, 62, 60, 59, 59, 57, 58, 60, 62, 63, 66, 67, 69,
+ 72, 73, 77, 78, 80, 83, 84, 88, 89, 92, 93, 93, 95, 95, 96, 96, 97, 95,
+ 67, 64, 63, 61, 60, 60, 58, 58, 61, 61, 63, 65, 67, 70, 70, 74, 75, 78,
+ 80, 81, 85, 86, 89, 91, 93, 94, 95, 97, 97, 98, 98, 100, 68, 65, 64, 62,
+ 61, 60, 59, 58, 61, 61, 64, 65, 67, 69, 71, 73, 75, 78, 79, 83, 83, 87,
+ 88, 91, 93, 95, 96, 97, 99, 98, 100, 100, 69, 65, 65, 62, 62, 61, 60,
+ 59, 61, 62, 64, 65, 68, 68, 72, 72, 76, 76, 80, 81, 84, 86, 88, 90, 92,
+ 95, 96, 98, 98, 100, 100, 101, 70, 66, 66, 63, 63, 62, 61, 60, 60, 63,
+ 64, 66, 67, 69, 71, 73, 75, 77, 79, 81, 84, 85, 88, 89, 93, 93, 97, 98,
+ 100, 100, 102, 101, 71, 67, 67, 64, 64, 62, 62, 60, 60, 64, 64, 67, 67,
+ 70, 70, 74, 74, 78, 78, 82, 82, 86, 86, 91, 91, 95, 95, 100, 100, 101,
+ 101, 104,
+ /* Size 4x8 */
+ 31, 47, 53, 63, 36, 47, 50, 59, 46, 52, 55, 61, 45, 53, 63, 70, 49, 55,
+ 71, 77, 54, 58, 77, 86, 59, 61, 81, 94, 63, 65, 80, 95,
+ /* Size 8x4 */
+ 31, 36, 46, 45, 49, 54, 59, 63, 47, 47, 52, 53, 55, 58, 61, 65, 53, 50,
+ 55, 63, 71, 77, 81, 80, 63, 59, 61, 70, 77, 86, 94, 95,
+ /* Size 8x16 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 46, 49, 53, 60, 64, 33, 37,
+ 46, 45, 47, 51, 57, 61, 37, 43, 47, 45, 47, 50, 55, 59, 42, 44, 49, 49,
+ 50, 53, 58, 60, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46, 51, 57, 59, 61,
+ 66, 67, 50, 46, 52, 59, 63, 66, 71, 71, 52, 47, 53, 61, 66, 71, 75, 74,
+ 54, 49, 54, 62, 68, 73, 79, 79, 57, 51, 55, 64, 70, 76, 83, 83, 61, 55,
+ 58, 66, 73, 80, 87, 87, 64, 57, 60, 68, 75, 83, 91, 91, 66, 59, 61, 69,
+ 77, 84, 93, 95, 68, 61, 61, 68, 77, 86, 94, 97, 70, 63, 61, 67, 75, 83,
+ 92, 98,
+ /* Size 16x8 */
+ 32, 31, 33, 37, 42, 49, 48, 50, 52, 54, 57, 61, 64, 66, 68, 70, 33, 34,
+ 37, 43, 44, 47, 46, 46, 47, 49, 51, 55, 57, 59, 61, 63, 45, 45, 46, 47,
+ 49, 52, 51, 52, 53, 54, 55, 58, 60, 61, 61, 61, 49, 46, 45, 45, 49, 53,
+ 57, 59, 61, 62, 64, 66, 68, 69, 68, 67, 52, 49, 47, 47, 50, 54, 59, 63,
+ 66, 68, 70, 73, 75, 77, 77, 75, 57, 53, 51, 50, 53, 57, 61, 66, 71, 73,
+ 76, 80, 83, 84, 86, 83, 64, 60, 57, 55, 58, 61, 66, 71, 75, 79, 83, 87,
+ 91, 93, 94, 92, 68, 64, 61, 59, 60, 63, 67, 71, 74, 79, 83, 87, 91, 95,
+ 97, 98,
+ /* Size 16x32 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 52, 56, 57, 63, 64, 67, 68, 68, 31, 31,
+ 34, 38, 45, 47, 47, 48, 50, 53, 54, 60, 61, 63, 64, 65, 31, 32, 34, 39,
+ 45, 46, 46, 47, 49, 52, 53, 59, 60, 62, 64, 65, 30, 32, 35, 40, 44, 46,
+ 45, 46, 48, 51, 52, 57, 58, 60, 61, 62, 33, 35, 37, 42, 46, 47, 45, 46,
+ 47, 50, 51, 56, 57, 60, 61, 62, 33, 36, 38, 43, 46, 47, 46, 46, 47, 50,
+ 51, 56, 57, 59, 60, 60, 37, 40, 43, 47, 47, 47, 45, 46, 47, 49, 50, 54,
+ 55, 57, 59, 61, 39, 41, 43, 47, 48, 48, 47, 47, 48, 50, 51, 55, 56, 57,
+ 58, 59, 42, 43, 44, 47, 49, 50, 49, 50, 50, 53, 53, 57, 58, 60, 60, 59,
+ 47, 46, 46, 48, 51, 52, 53, 53, 53, 55, 56, 60, 61, 61, 61, 62, 49, 46,
+ 47, 48, 52, 53, 53, 54, 54, 56, 57, 60, 61, 63, 63, 62, 48, 46, 46, 47,
+ 51, 53, 56, 56, 57, 59, 60, 64, 64, 65, 64, 65, 48, 45, 46, 46, 51, 53,
+ 57, 57, 59, 61, 61, 65, 66, 66, 67, 65, 49, 45, 45, 46, 51, 53, 58, 59,
+ 61, 63, 64, 67, 68, 70, 67, 68, 50, 46, 46, 46, 52, 54, 59, 61, 63, 65,
+ 66, 70, 71, 70, 71, 68, 50, 46, 46, 46, 52, 54, 59, 61, 64, 66, 67, 71,
+ 71, 73, 71, 72, 52, 48, 47, 47, 53, 54, 61, 63, 66, 70, 71, 75, 75, 75,
+ 74, 72, 53, 49, 48, 48, 53, 55, 61, 64, 67, 71, 72, 76, 77, 77, 75, 76,
+ 54, 50, 49, 49, 54, 55, 62, 65, 68, 72, 73, 78, 79, 80, 79, 76, 56, 51,
+ 51, 50, 55, 56, 63, 66, 70, 74, 76, 81, 82, 81, 80, 80, 57, 52, 51, 50,
+ 55, 56, 64, 66, 70, 75, 76, 82, 83, 85, 83, 80, 60, 54, 54, 52, 57, 58,
+ 65, 68, 72, 77, 79, 85, 86, 86, 85, 84, 61, 56, 55, 53, 58, 59, 66, 69,
+ 73, 79, 80, 86, 87, 89, 87, 84, 63, 57, 56, 55, 59, 60, 67, 70, 75, 80,
+ 82, 89, 90, 91, 89, 89, 64, 58, 57, 56, 60, 61, 68, 71, 75, 81, 83, 90,
+ 91, 93, 91, 89, 64, 59, 58, 56, 60, 61, 68, 71, 75, 81, 83, 90, 91, 94,
+ 94, 93, 66, 60, 59, 57, 61, 63, 69, 72, 77, 82, 84, 92, 93, 94, 95, 93,
+ 67, 61, 60, 58, 61, 63, 69, 70, 78, 80, 85, 90, 93, 96, 97, 97, 68, 62,
+ 61, 59, 61, 64, 68, 71, 77, 79, 86, 88, 94, 96, 97, 98, 69, 63, 62, 59,
+ 61, 65, 68, 72, 76, 80, 85, 88, 94, 95, 99, 99, 70, 63, 63, 60, 61, 66,
+ 67, 73, 75, 81, 83, 89, 92, 97, 98, 99, 70, 64, 64, 61, 61, 67, 67, 74,
+ 74, 82, 82, 90, 90, 98, 98, 102,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 31, 31, 32, 32,
+ 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 48, 49, 50, 51, 52, 54,
+ 56, 57, 58, 59, 60, 61, 62, 63, 63, 64, 33, 34, 34, 35, 37, 38, 43, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 37, 38, 39, 40, 42, 43, 47, 47, 47, 48, 48, 47,
+ 46, 46, 46, 46, 47, 48, 49, 50, 50, 52, 53, 55, 56, 56, 57, 58, 59, 59,
+ 60, 61, 45, 45, 45, 44, 46, 46, 47, 48, 49, 51, 52, 51, 51, 51, 52, 52,
+ 53, 53, 54, 55, 55, 57, 58, 59, 60, 60, 61, 61, 61, 61, 61, 61, 48, 47,
+ 46, 46, 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 54, 55, 55, 56,
+ 56, 58, 59, 60, 61, 61, 63, 63, 64, 65, 66, 67, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67,
+ 68, 68, 69, 69, 68, 68, 67, 67, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 69, 70, 71, 71, 72, 70,
+ 71, 72, 73, 74, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61,
+ 63, 64, 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74,
+ 56, 53, 52, 51, 50, 50, 49, 50, 53, 55, 56, 59, 61, 63, 65, 66, 70, 71,
+ 72, 74, 75, 77, 79, 80, 81, 81, 82, 80, 79, 80, 81, 82, 57, 54, 53, 52,
+ 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76, 76, 79,
+ 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 63, 60, 59, 57, 56, 56, 54, 55,
+ 57, 60, 60, 64, 65, 67, 70, 71, 75, 76, 78, 81, 82, 85, 86, 89, 90, 90,
+ 92, 90, 88, 88, 89, 90, 64, 61, 60, 58, 57, 57, 55, 56, 58, 61, 61, 64,
+ 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90, 91, 91, 93, 93, 94, 94,
+ 92, 90, 67, 63, 62, 60, 60, 59, 57, 57, 60, 61, 63, 65, 66, 70, 70, 73,
+ 75, 77, 80, 81, 85, 86, 89, 91, 93, 94, 94, 96, 96, 95, 97, 98, 68, 64,
+ 64, 61, 61, 60, 59, 58, 60, 61, 63, 64, 67, 67, 71, 71, 74, 75, 79, 80,
+ 83, 85, 87, 89, 91, 94, 95, 97, 97, 99, 98, 98, 68, 65, 65, 62, 62, 60,
+ 61, 59, 59, 62, 62, 65, 65, 68, 68, 72, 72, 76, 76, 80, 80, 84, 84, 89,
+ 89, 93, 93, 97, 98, 99, 99, 102,
+ /* Size 4x16 */
+ 31, 48, 56, 67, 32, 46, 52, 62, 35, 47, 50, 60, 40, 47, 49, 57, 43, 50,
+ 53, 60, 46, 53, 56, 63, 45, 53, 61, 66, 46, 54, 65, 70, 48, 54, 70, 75,
+ 50, 55, 72, 80, 52, 56, 75, 85, 56, 59, 79, 89, 58, 61, 81, 93, 60, 63,
+ 82, 94, 62, 64, 79, 96, 63, 66, 81, 97,
+ /* Size 16x4 */
+ 31, 32, 35, 40, 43, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 63, 48, 46,
+ 47, 47, 50, 53, 53, 54, 54, 55, 56, 59, 61, 63, 64, 66, 56, 52, 50, 49,
+ 53, 56, 61, 65, 70, 72, 75, 79, 81, 82, 79, 81, 67, 62, 60, 57, 60, 63,
+ 66, 70, 75, 80, 85, 89, 93, 94, 96, 97,
+ /* Size 8x32 */
+ 32, 33, 45, 49, 52, 57, 64, 68, 31, 34, 45, 47, 50, 54, 61, 64, 31, 34,
+ 45, 46, 49, 53, 60, 64, 30, 35, 44, 45, 48, 52, 58, 61, 33, 37, 46, 45,
+ 47, 51, 57, 61, 33, 38, 46, 46, 47, 51, 57, 60, 37, 43, 47, 45, 47, 50,
+ 55, 59, 39, 43, 48, 47, 48, 51, 56, 58, 42, 44, 49, 49, 50, 53, 58, 60,
+ 47, 46, 51, 53, 53, 56, 61, 61, 49, 47, 52, 53, 54, 57, 61, 63, 48, 46,
+ 51, 56, 57, 60, 64, 64, 48, 46, 51, 57, 59, 61, 66, 67, 49, 45, 51, 58,
+ 61, 64, 68, 67, 50, 46, 52, 59, 63, 66, 71, 71, 50, 46, 52, 59, 64, 67,
+ 71, 71, 52, 47, 53, 61, 66, 71, 75, 74, 53, 48, 53, 61, 67, 72, 77, 75,
+ 54, 49, 54, 62, 68, 73, 79, 79, 56, 51, 55, 63, 70, 76, 82, 80, 57, 51,
+ 55, 64, 70, 76, 83, 83, 60, 54, 57, 65, 72, 79, 86, 85, 61, 55, 58, 66,
+ 73, 80, 87, 87, 63, 56, 59, 67, 75, 82, 90, 89, 64, 57, 60, 68, 75, 83,
+ 91, 91, 64, 58, 60, 68, 75, 83, 91, 94, 66, 59, 61, 69, 77, 84, 93, 95,
+ 67, 60, 61, 69, 78, 85, 93, 97, 68, 61, 61, 68, 77, 86, 94, 97, 69, 62,
+ 61, 68, 76, 85, 94, 99, 70, 63, 61, 67, 75, 83, 92, 98, 70, 64, 61, 67,
+ 74, 82, 90, 98,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 52, 53,
+ 54, 56, 57, 60, 61, 63, 64, 64, 66, 67, 68, 69, 70, 70, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 47, 48, 49, 51, 51, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 53, 53, 54, 55, 55, 57, 58, 59, 60, 60,
+ 61, 61, 61, 61, 61, 61, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 61, 61, 62, 63, 64, 65, 66, 67, 68, 68, 69, 69, 68, 68,
+ 67, 67, 52, 50, 49, 48, 47, 47, 47, 48, 50, 53, 54, 57, 59, 61, 63, 64,
+ 66, 67, 68, 70, 70, 72, 73, 75, 75, 75, 77, 78, 77, 76, 75, 74, 57, 54,
+ 53, 52, 51, 51, 50, 51, 53, 56, 57, 60, 61, 64, 66, 67, 71, 72, 73, 76,
+ 76, 79, 80, 82, 83, 83, 84, 85, 86, 85, 83, 82, 64, 61, 60, 58, 57, 57,
+ 55, 56, 58, 61, 61, 64, 66, 68, 71, 71, 75, 77, 79, 82, 83, 86, 87, 90,
+ 91, 91, 93, 93, 94, 94, 92, 90, 68, 64, 64, 61, 61, 60, 59, 58, 60, 61,
+ 63, 64, 67, 67, 71, 71, 74, 75, 79, 80, 83, 85, 87, 89, 91, 94, 95, 97,
+ 97, 99, 98, 98 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 53, 75, 34, 49, 64, 81, 53, 64, 91, 112, 75, 81, 112, 140,
+ /* Size 8x8 */
+ 32, 32, 34, 39, 50, 62, 76, 84, 32, 33, 35, 40, 48, 59, 71, 79, 34, 35,
+ 39, 46, 53, 63, 74, 81, 39, 40, 46, 56, 65, 75, 86, 92, 50, 48, 53, 65,
+ 78, 90, 101, 106, 62, 59, 63, 75, 90, 105, 118, 123, 76, 71, 74, 86,
+ 101, 118, 134, 142, 84, 79, 81, 92, 106, 123, 142, 153,
+ /* Size 16x16 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 54, 59, 66, 74, 81, 86, 91, 31, 32,
+ 32, 32, 33, 35, 38, 42, 46, 51, 56, 63, 70, 77, 81, 86, 31, 32, 32, 33,
+ 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 77, 82, 32, 32, 33, 34, 36, 37,
+ 40, 42, 45, 49, 53, 59, 66, 71, 75, 80, 33, 33, 34, 36, 38, 42, 44, 46,
+ 50, 53, 57, 63, 69, 74, 78, 80, 36, 35, 35, 37, 42, 48, 50, 54, 57, 60,
+ 64, 69, 75, 80, 84, 85, 39, 38, 38, 40, 44, 50, 54, 58, 61, 65, 69, 74,
+ 80, 85, 89, 91, 44, 42, 41, 42, 46, 54, 58, 63, 67, 71, 75, 80, 86, 91,
+ 95, 97, 48, 46, 45, 45, 50, 57, 61, 67, 71, 76, 80, 86, 93, 98, 101,
+ 104, 54, 51, 49, 49, 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 112,
+ 59, 56, 54, 53, 57, 64, 69, 75, 80, 87, 92, 99, 106, 112, 116, 120, 66,
+ 63, 60, 59, 63, 69, 74, 80, 86, 93, 99, 107, 115, 121, 125, 129, 74, 70,
+ 67, 66, 69, 75, 80, 86, 93, 100, 106, 115, 123, 130, 135, 138, 81, 77,
+ 73, 71, 74, 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148, 86, 81,
+ 77, 75, 78, 84, 89, 95, 101, 109, 116, 125, 135, 142, 147, 153, 91, 86,
+ 82, 80, 80, 85, 91, 97, 104, 112, 120, 129, 138, 148, 153, 159,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 33, 34, 36, 36, 39, 41, 44, 46, 48, 52,
+ 54, 58, 59, 65, 66, 71, 74, 80, 81, 83, 86, 89, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 35, 35, 38, 39, 42, 44, 46, 50, 51, 56, 56, 62,
+ 63, 68, 71, 76, 77, 78, 82, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 35, 35, 38, 39, 42, 44, 46, 49, 51, 55, 56, 61, 63, 67, 70, 75,
+ 77, 78, 81, 84, 86, 88, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 37, 38, 41, 42, 44, 48, 49, 53, 54, 59, 60, 65, 68, 72, 74, 75, 78, 80,
+ 82, 84, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43,
+ 45, 48, 49, 53, 54, 59, 60, 65, 67, 72, 73, 74, 77, 80, 82, 84, 31, 32,
+ 32, 32, 33, 33, 33, 34, 35, 35, 36, 36, 39, 40, 42, 44, 45, 48, 50, 53,
+ 54, 59, 60, 64, 67, 71, 73, 74, 77, 79, 81, 83, 32, 32, 32, 32, 33, 33,
+ 34, 35, 36, 36, 37, 38, 40, 40, 42, 44, 45, 48, 49, 53, 53, 58, 59, 63,
+ 66, 70, 71, 72, 75, 78, 80, 83, 32, 32, 32, 32, 33, 34, 35, 35, 36, 37,
+ 38, 38, 40, 41, 42, 44, 46, 48, 49, 53, 53, 58, 59, 63, 65, 69, 71, 72,
+ 74, 77, 79, 80, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45,
+ 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 74, 75, 78, 79, 80, 81,
+ 34, 34, 34, 33, 34, 35, 36, 37, 39, 39, 42, 43, 45, 46, 47, 49, 51, 53,
+ 54, 58, 58, 63, 64, 68, 70, 74, 75, 76, 79, 81, 84, 86, 36, 35, 35, 34,
+ 35, 36, 37, 38, 42, 42, 48, 48, 50, 51, 54, 55, 57, 59, 60, 63, 64, 68,
+ 69, 73, 75, 79, 80, 81, 84, 85, 85, 86, 36, 35, 35, 34, 35, 36, 38, 38,
+ 42, 43, 48, 49, 51, 52, 54, 55, 57, 59, 60, 64, 64, 68, 69, 73, 75, 79,
+ 80, 81, 84, 86, 88, 91, 39, 38, 38, 37, 38, 39, 40, 40, 44, 45, 50, 51,
+ 54, 55, 58, 59, 61, 64, 65, 68, 69, 73, 74, 78, 80, 84, 85, 86, 89, 91,
+ 91, 91, 41, 39, 39, 38, 39, 40, 40, 41, 45, 46, 51, 52, 55, 56, 59, 61,
+ 63, 65, 67, 70, 70, 75, 76, 80, 82, 86, 87, 88, 91, 92, 94, 96, 44, 42,
+ 42, 41, 41, 42, 42, 42, 46, 47, 54, 54, 58, 59, 63, 65, 67, 70, 71, 75,
+ 75, 79, 80, 84, 86, 90, 91, 92, 95, 97, 97, 97, 46, 44, 44, 42, 43, 44,
+ 44, 44, 48, 49, 55, 55, 59, 61, 65, 67, 69, 72, 74, 77, 78, 82, 83, 87,
+ 89, 93, 94, 95, 98, 98, 100, 103, 48, 46, 46, 44, 45, 45, 45, 46, 50,
+ 51, 57, 57, 61, 63, 67, 69, 71, 74, 76, 80, 80, 85, 86, 90, 93, 96, 98,
+ 99, 101, 104, 104, 103, 52, 50, 49, 48, 48, 48, 48, 48, 52, 53, 59, 59,
+ 64, 65, 70, 72, 74, 78, 80, 84, 85, 90, 91, 95, 97, 101, 103, 104, 106,
+ 106, 107, 110, 54, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67,
+ 71, 74, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112,
+ 112, 110, 58, 56, 55, 53, 53, 53, 53, 53, 57, 58, 63, 64, 68, 70, 75,
+ 77, 80, 84, 86, 91, 91, 97, 98, 103, 105, 110, 111, 112, 115, 114, 115,
+ 118, 59, 56, 56, 54, 54, 54, 53, 53, 57, 58, 64, 64, 69, 70, 75, 78, 80,
+ 85, 87, 91, 92, 98, 99, 103, 106, 110, 112, 113, 116, 119, 120, 119, 65,
+ 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90, 92,
+ 97, 98, 105, 106, 111, 114, 118, 120, 121, 124, 123, 123, 126, 66, 63,
+ 63, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91, 93, 98,
+ 99, 106, 107, 112, 115, 119, 121, 122, 125, 128, 129, 126, 71, 68, 67,
+ 65, 65, 64, 63, 63, 67, 68, 73, 73, 78, 80, 84, 87, 90, 95, 97, 103,
+ 103, 111, 112, 117, 120, 125, 127, 128, 131, 132, 132, 135, 74, 71, 70,
+ 68, 67, 67, 66, 65, 69, 70, 75, 75, 80, 82, 86, 89, 93, 97, 100, 105,
+ 106, 114, 115, 120, 123, 128, 130, 131, 135, 135, 138, 136, 80, 76, 75,
+ 72, 72, 71, 70, 69, 73, 74, 79, 79, 84, 86, 90, 93, 96, 101, 104, 110,
+ 110, 118, 119, 125, 128, 134, 136, 137, 140, 142, 140, 144, 81, 77, 77,
+ 74, 73, 73, 71, 71, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103, 105, 111,
+ 112, 120, 121, 127, 130, 136, 137, 139, 142, 145, 148, 144, 83, 78, 78,
+ 75, 74, 74, 72, 72, 75, 76, 81, 81, 86, 88, 92, 95, 99, 104, 106, 112,
+ 113, 121, 122, 128, 131, 137, 139, 140, 144, 148, 150, 155, 86, 82, 81,
+ 78, 77, 77, 75, 74, 78, 79, 84, 84, 89, 91, 95, 98, 101, 106, 109, 115,
+ 116, 124, 125, 131, 135, 140, 142, 144, 147, 149, 153, 155, 89, 84, 84,
+ 80, 80, 79, 78, 77, 79, 81, 85, 86, 91, 92, 97, 98, 104, 106, 112, 114,
+ 119, 123, 128, 132, 135, 142, 145, 148, 149, 153, 154, 159, 91, 86, 86,
+ 82, 82, 81, 80, 79, 80, 84, 85, 88, 91, 94, 97, 100, 104, 107, 112, 115,
+ 120, 123, 129, 132, 138, 140, 148, 150, 153, 154, 159, 159, 93, 88, 88,
+ 84, 84, 83, 83, 80, 81, 86, 86, 91, 91, 96, 97, 103, 103, 110, 110, 118,
+ 119, 126, 126, 135, 136, 144, 144, 155, 155, 159, 159, 164,
+ /* Size 4x8 */
+ 32, 35, 51, 77, 32, 36, 50, 72, 34, 42, 54, 75, 38, 51, 67, 87, 48, 59,
+ 80, 103, 60, 68, 92, 119, 72, 79, 104, 135, 81, 86, 112, 144,
+ /* Size 8x4 */
+ 32, 32, 34, 38, 48, 60, 72, 81, 35, 36, 42, 51, 59, 68, 79, 86, 51, 50,
+ 54, 67, 80, 92, 104, 112, 77, 72, 75, 87, 103, 119, 135, 144,
+ /* Size 8x16 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 61, 74, 82, 31, 32,
+ 34, 38, 47, 59, 71, 79, 32, 33, 36, 40, 48, 58, 69, 77, 33, 34, 38, 44,
+ 52, 62, 72, 78, 36, 35, 42, 51, 58, 68, 78, 84, 39, 38, 44, 54, 63, 73,
+ 84, 89, 44, 41, 46, 59, 69, 79, 90, 96, 48, 45, 50, 62, 74, 85, 96, 103,
+ 53, 49, 53, 66, 79, 92, 103, 111, 58, 54, 57, 70, 84, 98, 110, 118, 66,
+ 60, 63, 75, 90, 106, 119, 126, 74, 67, 69, 81, 97, 113, 128, 134, 81,
+ 73, 75, 86, 102, 120, 135, 143, 86, 78, 78, 90, 106, 124, 140, 147, 91,
+ 82, 80, 90, 103, 119, 137, 151,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 33, 36, 39, 44, 48, 53, 58, 66, 74, 81, 86, 91, 31, 32,
+ 32, 33, 34, 35, 38, 41, 45, 49, 54, 60, 67, 73, 78, 82, 33, 33, 34, 36,
+ 38, 42, 44, 46, 50, 53, 57, 63, 69, 75, 78, 80, 40, 39, 38, 40, 44, 51,
+ 54, 59, 62, 66, 70, 75, 81, 86, 90, 90, 51, 49, 47, 48, 52, 58, 63, 69,
+ 74, 79, 84, 90, 97, 102, 106, 103, 65, 61, 59, 58, 62, 68, 73, 79, 85,
+ 92, 98, 106, 113, 120, 124, 119, 79, 74, 71, 69, 72, 78, 84, 90, 96,
+ 103, 110, 119, 128, 135, 140, 137, 87, 82, 79, 77, 78, 84, 89, 96, 103,
+ 111, 118, 126, 134, 143, 147, 151,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 33, 36, 40, 44, 51, 53, 65, 66, 79, 81, 87, 90, 31, 32,
+ 32, 32, 33, 35, 39, 42, 49, 51, 62, 63, 75, 77, 83, 85, 31, 32, 32, 32,
+ 33, 35, 39, 42, 49, 51, 61, 62, 74, 76, 82, 85, 31, 32, 32, 33, 33, 34,
+ 38, 41, 47, 49, 59, 60, 72, 74, 79, 81, 31, 32, 32, 33, 34, 35, 38, 41,
+ 47, 49, 59, 60, 71, 73, 79, 81, 32, 32, 33, 34, 35, 36, 39, 42, 48, 50,
+ 59, 60, 71, 72, 78, 80, 32, 32, 33, 35, 36, 37, 40, 42, 48, 49, 58, 59,
+ 69, 71, 77, 80, 32, 33, 33, 35, 36, 38, 41, 42, 48, 49, 58, 59, 69, 70,
+ 75, 77, 33, 33, 34, 36, 38, 41, 44, 46, 52, 53, 62, 63, 72, 74, 78, 78,
+ 34, 34, 34, 37, 39, 42, 45, 48, 53, 54, 63, 64, 73, 75, 80, 83, 36, 34,
+ 35, 38, 42, 48, 51, 54, 58, 60, 68, 69, 78, 80, 84, 83, 36, 35, 35, 38,
+ 42, 48, 51, 54, 59, 60, 68, 69, 79, 80, 85, 87, 39, 37, 38, 40, 44, 50,
+ 54, 58, 63, 65, 73, 74, 84, 85, 89, 88, 40, 38, 39, 41, 45, 51, 56, 59,
+ 65, 67, 75, 76, 85, 87, 90, 93, 44, 41, 41, 43, 46, 53, 59, 63, 69, 71,
+ 79, 80, 90, 91, 96, 93, 46, 43, 43, 44, 48, 55, 60, 65, 72, 73, 82, 83,
+ 93, 94, 97, 100, 48, 45, 45, 46, 50, 56, 62, 67, 74, 76, 85, 86, 96, 98,
+ 103, 100, 52, 48, 48, 49, 52, 59, 65, 70, 78, 80, 90, 91, 101, 103, 105,
+ 107, 53, 49, 49, 50, 53, 60, 66, 71, 79, 82, 92, 93, 103, 105, 111, 107,
+ 58, 53, 53, 53, 57, 63, 69, 74, 83, 86, 97, 98, 109, 111, 113, 115, 58,
+ 54, 54, 54, 57, 63, 70, 75, 84, 87, 98, 99, 110, 112, 118, 115, 65, 60,
+ 59, 58, 62, 68, 74, 79, 89, 92, 105, 106, 118, 119, 122, 123, 66, 61,
+ 60, 59, 63, 69, 75, 80, 90, 93, 106, 107, 119, 121, 126, 123, 71, 65,
+ 65, 63, 67, 73, 79, 84, 94, 97, 111, 112, 125, 127, 131, 132, 74, 68,
+ 67, 66, 69, 75, 81, 86, 97, 100, 113, 115, 128, 130, 134, 132, 79, 72,
+ 72, 70, 73, 79, 85, 90, 101, 104, 118, 119, 133, 135, 141, 140, 81, 74,
+ 73, 71, 75, 80, 86, 91, 102, 105, 120, 121, 135, 137, 143, 140, 82, 75,
+ 74, 72, 75, 81, 87, 92, 103, 106, 121, 122, 136, 139, 147, 151, 86, 78,
+ 78, 75, 78, 84, 90, 95, 106, 109, 124, 125, 140, 142, 147, 151, 88, 81,
+ 80, 77, 80, 86, 90, 98, 105, 112, 122, 127, 140, 144, 152, 155, 91, 83,
+ 82, 79, 80, 88, 90, 100, 103, 114, 119, 130, 137, 148, 151, 155, 93, 85,
+ 85, 81, 81, 90, 90, 102, 103, 117, 117, 134, 134, 151, 152, 160,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 37, 38, 41, 43, 45, 48, 49, 53, 54, 60,
+ 61, 65, 68, 72, 74, 75, 78, 81, 83, 85, 31, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59, 60, 65, 67, 72,
+ 73, 74, 78, 80, 82, 85, 32, 32, 32, 33, 33, 34, 35, 35, 36, 37, 38, 38,
+ 40, 41, 43, 44, 46, 49, 50, 53, 54, 58, 59, 63, 66, 70, 71, 72, 75, 77,
+ 79, 81, 33, 33, 33, 33, 34, 35, 36, 36, 38, 39, 42, 42, 44, 45, 46, 48,
+ 50, 52, 53, 57, 57, 62, 63, 67, 69, 73, 75, 75, 78, 80, 80, 81, 36, 35,
+ 35, 34, 35, 36, 37, 38, 41, 42, 48, 48, 50, 51, 53, 55, 56, 59, 60, 63,
+ 63, 68, 69, 73, 75, 79, 80, 81, 84, 86, 88, 90, 40, 39, 39, 38, 38, 39,
+ 40, 41, 44, 45, 51, 51, 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79,
+ 81, 85, 86, 87, 90, 90, 90, 90, 44, 42, 42, 41, 41, 42, 42, 42, 46, 48,
+ 54, 54, 58, 59, 63, 65, 67, 70, 71, 74, 75, 79, 80, 84, 86, 90, 91, 92,
+ 95, 98, 100, 102, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63,
+ 65, 69, 72, 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105,
+ 103, 103, 53, 51, 51, 49, 49, 50, 49, 49, 53, 54, 60, 60, 65, 67, 71,
+ 73, 76, 80, 82, 86, 87, 92, 93, 97, 100, 104, 105, 106, 109, 112, 114,
+ 117, 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85,
+ 90, 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117,
+ 66, 63, 62, 60, 60, 60, 59, 59, 63, 64, 69, 69, 74, 76, 80, 83, 86, 91,
+ 93, 98, 99, 106, 107, 112, 115, 119, 121, 122, 125, 127, 130, 134, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 81, 77, 76, 74, 73, 72, 71, 70, 74, 75, 80, 80, 85, 87, 91, 94, 98, 103,
+ 105, 111, 112, 119, 121, 127, 130, 135, 137, 139, 142, 144, 148, 151,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152, 90, 85, 85, 81, 81, 80, 80, 77, 78, 83, 83, 87, 88, 93, 93, 100,
+ 100, 107, 107, 115, 115, 123, 123, 132, 132, 140, 140, 151, 151, 155,
+ 155, 160,
+ /* Size 4x16 */
+ 31, 36, 53, 81, 32, 35, 51, 76, 32, 35, 49, 73, 32, 37, 49, 71, 33, 41,
+ 53, 74, 34, 48, 60, 80, 37, 50, 65, 85, 41, 53, 71, 91, 45, 56, 76, 98,
+ 49, 60, 82, 105, 54, 63, 87, 112, 61, 69, 93, 121, 68, 75, 100, 130, 74,
+ 80, 105, 137, 78, 84, 109, 142, 83, 88, 114, 148,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 33, 34, 37, 41, 45, 49, 54, 61, 68, 74, 78, 83, 36, 35,
+ 35, 37, 41, 48, 50, 53, 56, 60, 63, 69, 75, 80, 84, 88, 53, 51, 49, 49,
+ 53, 60, 65, 71, 76, 82, 87, 93, 100, 105, 109, 114, 81, 76, 73, 71, 74,
+ 80, 85, 91, 98, 105, 112, 121, 130, 137, 142, 148,
+ /* Size 8x32 */
+ 32, 31, 33, 40, 51, 65, 79, 87, 31, 32, 33, 39, 49, 62, 75, 83, 31, 32,
+ 33, 39, 49, 61, 74, 82, 31, 32, 33, 38, 47, 59, 72, 79, 31, 32, 34, 38,
+ 47, 59, 71, 79, 32, 33, 35, 39, 48, 59, 71, 78, 32, 33, 36, 40, 48, 58,
+ 69, 77, 32, 33, 36, 41, 48, 58, 69, 75, 33, 34, 38, 44, 52, 62, 72, 78,
+ 34, 34, 39, 45, 53, 63, 73, 80, 36, 35, 42, 51, 58, 68, 78, 84, 36, 35,
+ 42, 51, 59, 68, 79, 85, 39, 38, 44, 54, 63, 73, 84, 89, 40, 39, 45, 56,
+ 65, 75, 85, 90, 44, 41, 46, 59, 69, 79, 90, 96, 46, 43, 48, 60, 72, 82,
+ 93, 97, 48, 45, 50, 62, 74, 85, 96, 103, 52, 48, 52, 65, 78, 90, 101,
+ 105, 53, 49, 53, 66, 79, 92, 103, 111, 58, 53, 57, 69, 83, 97, 109, 113,
+ 58, 54, 57, 70, 84, 98, 110, 118, 65, 59, 62, 74, 89, 105, 118, 122, 66,
+ 60, 63, 75, 90, 106, 119, 126, 71, 65, 67, 79, 94, 111, 125, 131, 74,
+ 67, 69, 81, 97, 113, 128, 134, 79, 72, 73, 85, 101, 118, 133, 141, 81,
+ 73, 75, 86, 102, 120, 135, 143, 82, 74, 75, 87, 103, 121, 136, 147, 86,
+ 78, 78, 90, 106, 124, 140, 147, 88, 80, 80, 90, 105, 122, 140, 152, 91,
+ 82, 80, 90, 103, 119, 137, 151, 93, 85, 81, 90, 103, 117, 134, 152,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 33, 34, 36, 36, 39, 40, 44, 46, 48, 52,
+ 53, 58, 58, 65, 66, 71, 74, 79, 81, 82, 86, 88, 91, 93, 31, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 35, 35, 38, 39, 41, 43, 45, 48, 49, 53, 54, 59,
+ 60, 65, 67, 72, 73, 74, 78, 80, 82, 85, 33, 33, 33, 33, 34, 35, 36, 36,
+ 38, 39, 42, 42, 44, 45, 46, 48, 50, 52, 53, 57, 57, 62, 63, 67, 69, 73,
+ 75, 75, 78, 80, 80, 81, 40, 39, 39, 38, 38, 39, 40, 41, 44, 45, 51, 51,
+ 54, 56, 59, 60, 62, 65, 66, 69, 70, 74, 75, 79, 81, 85, 86, 87, 90, 90,
+ 90, 90, 51, 49, 49, 47, 47, 48, 48, 48, 52, 53, 58, 59, 63, 65, 69, 72,
+ 74, 78, 79, 83, 84, 89, 90, 94, 97, 101, 102, 103, 106, 105, 103, 103,
+ 65, 62, 61, 59, 59, 59, 58, 58, 62, 63, 68, 68, 73, 75, 79, 82, 85, 90,
+ 92, 97, 98, 105, 106, 111, 113, 118, 120, 121, 124, 122, 119, 117, 79,
+ 75, 74, 72, 71, 71, 69, 69, 72, 73, 78, 79, 84, 85, 90, 93, 96, 101,
+ 103, 109, 110, 118, 119, 125, 128, 133, 135, 136, 140, 140, 137, 134,
+ 87, 83, 82, 79, 79, 78, 77, 75, 78, 80, 84, 85, 89, 90, 96, 97, 103,
+ 105, 111, 113, 118, 122, 126, 131, 134, 141, 143, 147, 147, 152, 151,
+ 152 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 49, 58, 46, 53, 55, 62, 49, 55, 70, 78, 58, 62, 78, 91,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 49, 54, 60, 64, 34, 39, 45, 46, 47, 51, 56, 59, 42, 45,
+ 48, 49, 50, 53, 57, 60, 47, 46, 49, 55, 58, 61, 65, 66, 49, 47, 50, 58,
+ 65, 69, 73, 74, 54, 51, 53, 61, 69, 76, 82, 83, 60, 56, 57, 65, 73, 82,
+ 89, 92, 64, 59, 60, 66, 74, 83, 92, 96,
+ /* Size 16x16 */
+ 32, 31, 31, 35, 40, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 31, 31,
+ 32, 37, 41, 47, 47, 46, 48, 49, 51, 54, 57, 60, 62, 64, 31, 32, 34, 39,
+ 43, 46, 46, 45, 46, 47, 49, 52, 55, 57, 59, 61, 35, 37, 39, 44, 46, 47,
+ 46, 45, 46, 47, 48, 51, 53, 56, 57, 59, 40, 41, 43, 46, 48, 50, 49, 48,
+ 49, 49, 51, 53, 55, 57, 59, 59, 49, 47, 46, 47, 50, 53, 53, 53, 54, 54,
+ 55, 57, 59, 61, 62, 62, 48, 47, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60,
+ 62, 64, 65, 65, 49, 46, 45, 45, 48, 53, 55, 58, 60, 61, 62, 64, 66, 68,
+ 69, 69, 50, 48, 46, 46, 49, 54, 56, 60, 61, 63, 65, 67, 69, 71, 72, 72,
+ 52, 49, 47, 47, 49, 54, 57, 61, 63, 66, 68, 71, 73, 75, 76, 77, 54, 51,
+ 49, 48, 51, 55, 58, 62, 65, 68, 71, 74, 76, 78, 80, 81, 57, 54, 52, 51,
+ 53, 57, 60, 64, 67, 71, 74, 77, 80, 83, 84, 85, 61, 57, 55, 53, 55, 59,
+ 62, 66, 69, 73, 76, 80, 84, 87, 89, 89, 64, 60, 57, 56, 57, 61, 64, 68,
+ 71, 75, 78, 83, 87, 90, 92, 94, 66, 62, 59, 57, 59, 62, 65, 69, 72, 76,
+ 80, 84, 89, 92, 94, 96, 68, 64, 61, 59, 59, 62, 65, 69, 72, 77, 81, 85,
+ 89, 94, 96, 98,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 31, 33, 35, 36, 40, 41, 49, 49, 48, 48, 49, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 65, 66, 67, 68, 69, 31, 31, 31, 31,
+ 32, 34, 37, 38, 41, 42, 47, 47, 47, 47, 47, 47, 48, 49, 50, 52, 52, 54,
+ 55, 57, 58, 60, 61, 61, 63, 64, 64, 65, 31, 31, 31, 31, 32, 35, 37, 39,
+ 41, 42, 47, 47, 47, 46, 46, 47, 48, 49, 49, 51, 51, 54, 54, 56, 57, 59,
+ 60, 61, 62, 63, 64, 65, 30, 31, 31, 32, 33, 35, 38, 40, 42, 42, 46, 46,
+ 45, 45, 45, 45, 46, 47, 47, 49, 49, 52, 52, 54, 55, 57, 58, 58, 60, 61,
+ 61, 62, 31, 32, 32, 33, 34, 37, 39, 41, 43, 43, 46, 46, 46, 45, 45, 46,
+ 46, 47, 47, 49, 49, 51, 52, 54, 55, 57, 57, 58, 59, 60, 61, 62, 33, 34,
+ 35, 35, 37, 39, 41, 43, 44, 45, 47, 47, 46, 46, 45, 46, 46, 47, 47, 49,
+ 49, 51, 51, 53, 54, 56, 57, 57, 58, 59, 60, 61, 35, 37, 37, 38, 39, 41,
+ 44, 46, 46, 46, 47, 47, 46, 46, 45, 46, 46, 47, 47, 48, 48, 50, 51, 52,
+ 53, 55, 56, 56, 57, 58, 59, 61, 36, 38, 39, 40, 41, 43, 46, 47, 47, 47,
+ 48, 47, 46, 46, 45, 46, 46, 46, 47, 48, 48, 50, 50, 52, 53, 54, 55, 55,
+ 56, 57, 58, 58, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 49, 49, 49,
+ 48, 49, 49, 49, 49, 51, 51, 52, 53, 54, 55, 57, 57, 58, 59, 59, 59, 59,
+ 41, 42, 42, 42, 43, 45, 46, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50,
+ 50, 52, 52, 53, 53, 55, 56, 57, 58, 58, 59, 60, 61, 62, 49, 47, 47, 46,
+ 46, 47, 47, 48, 50, 50, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 57, 58, 59, 60, 61, 61, 62, 62, 62, 62, 49, 47, 47, 46, 46, 47, 47, 47,
+ 49, 50, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 56, 57, 57, 59, 59, 61,
+ 61, 62, 63, 63, 64, 65, 48, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 54, 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 62, 63, 64, 64, 65, 66,
+ 65, 65, 48, 47, 46, 45, 45, 46, 46, 46, 49, 49, 53, 53, 54, 55, 56, 57,
+ 57, 58, 58, 59, 60, 61, 61, 63, 63, 65, 65, 65, 66, 66, 67, 68, 49, 47,
+ 46, 45, 45, 45, 45, 45, 48, 49, 53, 54, 55, 56, 58, 59, 60, 61, 61, 62,
+ 62, 63, 64, 65, 66, 67, 68, 68, 69, 70, 69, 68, 50, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 54, 54, 56, 57, 59, 60, 60, 62, 62, 63, 64, 65, 65, 67,
+ 68, 69, 69, 70, 70, 70, 71, 71, 50, 48, 48, 46, 46, 46, 46, 46, 49, 50,
+ 54, 54, 56, 57, 60, 60, 61, 63, 63, 65, 65, 67, 67, 68, 69, 71, 71, 71,
+ 72, 73, 72, 71, 52, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 65, 65, 67, 67, 69, 70, 71, 72, 73, 74, 74, 75, 74, 74, 75,
+ 52, 50, 49, 47, 47, 47, 47, 47, 49, 50, 54, 54, 57, 58, 61, 62, 63, 65,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 77, 75, 54, 52, 51, 49,
+ 49, 49, 48, 48, 51, 52, 55, 55, 58, 59, 62, 63, 65, 67, 68, 70, 70, 73,
+ 73, 75, 76, 78, 78, 78, 79, 78, 78, 79, 54, 52, 51, 49, 49, 49, 48, 48,
+ 51, 52, 55, 56, 58, 60, 62, 64, 65, 67, 68, 70, 71, 73, 74, 75, 76, 78,
+ 78, 79, 80, 81, 81, 79, 57, 54, 54, 52, 51, 51, 50, 50, 52, 53, 56, 57,
+ 60, 61, 63, 65, 67, 69, 70, 73, 73, 76, 77, 79, 80, 82, 82, 83, 84, 83,
+ 82, 83, 57, 55, 54, 52, 52, 51, 51, 50, 53, 53, 57, 57, 60, 61, 64, 65,
+ 67, 70, 71, 73, 74, 77, 77, 79, 80, 82, 83, 83, 84, 85, 85, 83, 60, 57,
+ 56, 54, 54, 53, 52, 52, 54, 55, 58, 59, 61, 63, 65, 67, 68, 71, 72, 75,
+ 75, 79, 79, 82, 83, 85, 86, 86, 87, 87, 86, 87, 61, 58, 57, 55, 55, 54,
+ 53, 53, 55, 56, 59, 59, 62, 63, 66, 68, 69, 72, 73, 76, 76, 80, 80, 83,
+ 84, 86, 87, 88, 89, 89, 89, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57,
+ 60, 61, 63, 65, 67, 69, 71, 73, 75, 78, 78, 82, 82, 85, 86, 89, 89, 90,
+ 91, 92, 90, 91, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65,
+ 68, 69, 71, 74, 75, 78, 78, 82, 83, 86, 87, 89, 90, 91, 92, 93, 94, 91,
+ 65, 61, 61, 58, 58, 57, 56, 55, 58, 58, 61, 62, 64, 65, 68, 70, 71, 74,
+ 75, 78, 79, 83, 83, 86, 88, 90, 91, 91, 93, 94, 94, 96, 66, 63, 62, 60,
+ 59, 58, 57, 56, 59, 59, 62, 63, 65, 66, 69, 70, 72, 75, 76, 79, 80, 84,
+ 84, 87, 89, 91, 92, 93, 94, 94, 96, 96, 67, 64, 63, 61, 60, 59, 58, 57,
+ 59, 60, 62, 63, 66, 66, 70, 70, 73, 74, 77, 78, 81, 83, 85, 87, 89, 92,
+ 93, 94, 94, 96, 96, 97, 68, 64, 64, 61, 61, 60, 59, 58, 59, 61, 62, 64,
+ 65, 67, 69, 71, 72, 74, 77, 78, 81, 82, 85, 86, 89, 90, 94, 94, 96, 96,
+ 98, 97, 69, 65, 65, 62, 62, 61, 61, 58, 59, 62, 62, 65, 65, 68, 68, 71,
+ 71, 75, 75, 79, 79, 83, 83, 87, 87, 91, 91, 96, 96, 97, 97, 99,
+ /* Size 4x8 */
+ 31, 47, 50, 61, 36, 47, 47, 57, 43, 50, 50, 58, 45, 53, 58, 65, 47, 54,
+ 66, 74, 52, 56, 70, 82, 57, 60, 75, 90, 61, 63, 77, 93,
+ /* Size 8x4 */
+ 31, 36, 43, 45, 47, 52, 57, 61, 47, 47, 50, 53, 54, 56, 60, 63, 50, 47,
+ 50, 58, 66, 70, 75, 77, 61, 57, 58, 65, 74, 82, 90, 93,
+ /* Size 8x16 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 59, 63, 31, 35,
+ 43, 46, 47, 51, 57, 60, 35, 39, 46, 46, 47, 50, 55, 58, 41, 43, 48, 49,
+ 49, 52, 57, 59, 49, 47, 50, 53, 54, 57, 60, 62, 48, 46, 49, 54, 57, 60,
+ 64, 65, 49, 45, 48, 56, 61, 64, 67, 69, 50, 46, 49, 57, 63, 67, 71, 73,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 50, 51, 59, 67, 73, 78, 81, 57, 52,
+ 53, 61, 69, 77, 82, 85, 61, 55, 56, 63, 72, 80, 86, 88, 64, 58, 58, 65,
+ 73, 82, 89, 92, 66, 59, 59, 66, 75, 84, 91, 94, 68, 61, 59, 65, 72, 81,
+ 89, 95,
+ /* Size 16x8 */
+ 32, 31, 31, 35, 41, 49, 48, 49, 50, 52, 54, 57, 61, 64, 66, 68, 32, 33,
+ 35, 39, 43, 47, 46, 45, 46, 48, 50, 52, 55, 58, 59, 61, 40, 41, 43, 46,
+ 48, 50, 49, 48, 49, 50, 51, 53, 56, 58, 59, 59, 49, 47, 46, 46, 49, 53,
+ 54, 56, 57, 58, 59, 61, 63, 65, 66, 65, 51, 49, 47, 47, 49, 54, 57, 61,
+ 63, 65, 67, 69, 72, 73, 75, 72, 57, 54, 51, 50, 52, 57, 60, 64, 67, 71,
+ 73, 77, 80, 82, 84, 81, 63, 59, 57, 55, 57, 60, 64, 67, 71, 75, 78, 82,
+ 86, 89, 91, 89, 67, 63, 60, 58, 59, 62, 65, 69, 73, 77, 81, 85, 88, 92,
+ 94, 95,
+ /* Size 16x32 */
+ 32, 31, 32, 37, 40, 48, 49, 49, 51, 52, 57, 58, 63, 64, 67, 67, 31, 31,
+ 33, 38, 41, 47, 47, 47, 49, 50, 54, 55, 60, 61, 63, 64, 31, 31, 33, 38,
+ 41, 47, 47, 47, 49, 49, 54, 54, 59, 60, 63, 64, 30, 32, 33, 40, 42, 46,
+ 45, 45, 47, 48, 52, 52, 57, 58, 60, 61, 31, 33, 35, 41, 43, 46, 46, 45,
+ 47, 48, 51, 52, 57, 57, 60, 61, 33, 36, 37, 43, 44, 47, 46, 46, 47, 47,
+ 51, 52, 56, 57, 59, 60, 35, 38, 39, 45, 46, 47, 46, 45, 47, 47, 50, 51,
+ 55, 56, 58, 60, 37, 40, 41, 47, 47, 47, 46, 45, 46, 47, 50, 50, 54, 55,
+ 57, 58, 41, 42, 43, 47, 48, 49, 49, 48, 49, 50, 52, 53, 57, 57, 59, 58,
+ 42, 43, 43, 47, 48, 50, 49, 49, 50, 50, 53, 54, 57, 58, 60, 61, 49, 46,
+ 47, 48, 50, 53, 53, 53, 54, 54, 57, 57, 60, 61, 62, 61, 49, 46, 47, 48,
+ 50, 53, 53, 54, 54, 55, 57, 57, 61, 61, 63, 64, 48, 46, 46, 47, 49, 53,
+ 54, 56, 57, 57, 60, 60, 64, 64, 65, 64, 48, 45, 46, 46, 49, 53, 55, 56,
+ 58, 58, 61, 61, 65, 65, 66, 67, 49, 45, 45, 46, 48, 53, 56, 58, 61, 61,
+ 64, 64, 67, 68, 69, 67, 49, 46, 46, 46, 49, 53, 57, 59, 62, 62, 65, 66,
+ 69, 69, 70, 70, 50, 46, 46, 46, 49, 54, 57, 59, 63, 64, 67, 67, 71, 71,
+ 73, 71, 51, 47, 47, 47, 49, 54, 58, 61, 64, 66, 69, 70, 73, 74, 74, 74,
+ 52, 48, 48, 47, 50, 54, 58, 61, 65, 66, 71, 71, 75, 75, 77, 74, 54, 50,
+ 49, 48, 51, 55, 59, 62, 67, 68, 73, 73, 77, 78, 78, 78, 54, 50, 50, 49,
+ 51, 55, 59, 62, 67, 68, 73, 74, 78, 78, 81, 78, 57, 52, 52, 50, 52, 56,
+ 60, 64, 69, 70, 76, 77, 82, 82, 83, 82, 57, 52, 52, 51, 53, 57, 61, 64,
+ 69, 71, 77, 77, 82, 83, 85, 82, 60, 54, 54, 52, 55, 58, 62, 65, 71, 72,
+ 79, 79, 85, 86, 87, 86, 61, 56, 55, 53, 56, 59, 63, 66, 72, 73, 80, 81,
+ 86, 87, 88, 86, 63, 57, 57, 55, 57, 60, 64, 67, 73, 75, 82, 82, 89, 90,
+ 92, 90, 64, 58, 58, 55, 58, 61, 65, 68, 73, 75, 82, 83, 89, 90, 92, 90,
+ 64, 59, 58, 56, 58, 61, 65, 68, 74, 75, 83, 83, 90, 91, 94, 95, 66, 60,
+ 59, 57, 59, 62, 66, 69, 75, 76, 84, 85, 91, 92, 94, 95, 67, 61, 60, 58,
+ 59, 63, 66, 70, 74, 77, 82, 85, 91, 93, 96, 96, 68, 62, 61, 58, 59, 64,
+ 65, 71, 72, 78, 81, 86, 89, 94, 95, 96, 68, 62, 62, 59, 59, 65, 65, 71,
+ 71, 79, 79, 87, 87, 95, 95, 98,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 31, 31, 31, 32,
+ 33, 36, 38, 40, 42, 43, 46, 46, 46, 45, 45, 46, 46, 47, 48, 50, 50, 52,
+ 52, 54, 56, 57, 58, 59, 60, 61, 62, 62, 32, 33, 33, 33, 35, 37, 39, 41,
+ 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52, 52, 54, 55, 57,
+ 58, 58, 59, 60, 61, 62, 37, 38, 38, 40, 41, 43, 45, 47, 47, 47, 48, 48,
+ 47, 46, 46, 46, 46, 47, 47, 48, 49, 50, 51, 52, 53, 55, 55, 56, 57, 58,
+ 58, 59, 40, 41, 41, 42, 43, 44, 46, 47, 48, 48, 50, 50, 49, 49, 48, 49,
+ 49, 49, 50, 51, 51, 52, 53, 55, 56, 57, 58, 58, 59, 59, 59, 59, 48, 47,
+ 47, 46, 46, 47, 47, 47, 49, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 55,
+ 55, 56, 57, 58, 59, 60, 61, 61, 62, 63, 64, 65, 49, 47, 47, 45, 46, 46,
+ 46, 46, 49, 49, 53, 53, 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62,
+ 63, 64, 65, 65, 66, 66, 65, 65, 49, 47, 47, 45, 45, 46, 45, 45, 48, 49,
+ 53, 54, 56, 56, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 66, 67, 68, 68,
+ 69, 70, 71, 71, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58,
+ 61, 62, 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71,
+ 52, 50, 49, 48, 48, 47, 47, 47, 50, 50, 54, 55, 57, 58, 61, 62, 64, 66,
+ 66, 68, 68, 70, 71, 72, 73, 75, 75, 75, 76, 77, 78, 79, 57, 54, 54, 52,
+ 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 73, 76,
+ 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 58, 55, 54, 52, 52, 52, 51, 50,
+ 53, 54, 57, 57, 60, 61, 64, 66, 67, 70, 71, 73, 74, 77, 77, 79, 81, 82,
+ 83, 83, 85, 85, 86, 87, 63, 60, 59, 57, 57, 56, 55, 54, 57, 57, 60, 61,
+ 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85, 86, 89, 89, 90, 91, 91,
+ 89, 87, 64, 61, 60, 58, 57, 57, 56, 55, 57, 58, 61, 61, 64, 65, 68, 69,
+ 71, 74, 75, 78, 78, 82, 83, 86, 87, 90, 90, 91, 92, 93, 94, 95, 67, 63,
+ 63, 60, 60, 59, 58, 57, 59, 60, 62, 63, 65, 66, 69, 70, 73, 74, 77, 78,
+ 81, 83, 85, 87, 88, 92, 92, 94, 94, 96, 95, 95, 67, 64, 64, 61, 61, 60,
+ 60, 58, 58, 61, 61, 64, 64, 67, 67, 70, 71, 74, 74, 78, 78, 82, 82, 86,
+ 86, 90, 90, 95, 95, 96, 96, 98,
+ /* Size 4x16 */
+ 31, 48, 52, 64, 31, 47, 49, 60, 33, 46, 48, 57, 38, 47, 47, 56, 42, 49,
+ 50, 57, 46, 53, 54, 61, 46, 53, 57, 64, 45, 53, 61, 68, 46, 54, 64, 71,
+ 48, 54, 66, 75, 50, 55, 68, 78, 52, 57, 71, 83, 56, 59, 73, 87, 58, 61,
+ 75, 90, 60, 62, 76, 92, 62, 64, 78, 94,
+ /* Size 16x4 */
+ 31, 31, 33, 38, 42, 46, 46, 45, 46, 48, 50, 52, 56, 58, 60, 62, 48, 47,
+ 46, 47, 49, 53, 53, 53, 54, 54, 55, 57, 59, 61, 62, 64, 52, 49, 48, 47,
+ 50, 54, 57, 61, 64, 66, 68, 71, 73, 75, 76, 78, 64, 60, 57, 56, 57, 61,
+ 64, 68, 71, 75, 78, 83, 87, 90, 92, 94,
+ /* Size 8x32 */
+ 32, 32, 40, 49, 51, 57, 63, 67, 31, 33, 41, 47, 49, 54, 60, 63, 31, 33,
+ 41, 47, 49, 54, 59, 63, 30, 33, 42, 45, 47, 52, 57, 60, 31, 35, 43, 46,
+ 47, 51, 57, 60, 33, 37, 44, 46, 47, 51, 56, 59, 35, 39, 46, 46, 47, 50,
+ 55, 58, 37, 41, 47, 46, 46, 50, 54, 57, 41, 43, 48, 49, 49, 52, 57, 59,
+ 42, 43, 48, 49, 50, 53, 57, 60, 49, 47, 50, 53, 54, 57, 60, 62, 49, 47,
+ 50, 53, 54, 57, 61, 63, 48, 46, 49, 54, 57, 60, 64, 65, 48, 46, 49, 55,
+ 58, 61, 65, 66, 49, 45, 48, 56, 61, 64, 67, 69, 49, 46, 49, 57, 62, 65,
+ 69, 70, 50, 46, 49, 57, 63, 67, 71, 73, 51, 47, 49, 58, 64, 69, 73, 74,
+ 52, 48, 50, 58, 65, 71, 75, 77, 54, 49, 51, 59, 67, 73, 77, 78, 54, 50,
+ 51, 59, 67, 73, 78, 81, 57, 52, 52, 60, 69, 76, 82, 83, 57, 52, 53, 61,
+ 69, 77, 82, 85, 60, 54, 55, 62, 71, 79, 85, 87, 61, 55, 56, 63, 72, 80,
+ 86, 88, 63, 57, 57, 64, 73, 82, 89, 92, 64, 58, 58, 65, 73, 82, 89, 92,
+ 64, 58, 58, 65, 74, 83, 90, 94, 66, 59, 59, 66, 75, 84, 91, 94, 67, 60,
+ 59, 66, 74, 82, 91, 96, 68, 61, 59, 65, 72, 81, 89, 95, 68, 62, 59, 65,
+ 71, 79, 87, 95,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 31, 33, 35, 37, 41, 42, 49, 49, 48, 48, 49, 49, 50, 51,
+ 52, 54, 54, 57, 57, 60, 61, 63, 64, 64, 66, 67, 68, 68, 32, 33, 33, 33,
+ 35, 37, 39, 41, 43, 43, 47, 47, 46, 46, 45, 46, 46, 47, 48, 49, 50, 52,
+ 52, 54, 55, 57, 58, 58, 59, 60, 61, 62, 40, 41, 41, 42, 43, 44, 46, 47,
+ 48, 48, 50, 50, 49, 49, 48, 49, 49, 49, 50, 51, 51, 52, 53, 55, 56, 57,
+ 58, 58, 59, 59, 59, 59, 49, 47, 47, 45, 46, 46, 46, 46, 49, 49, 53, 53,
+ 54, 55, 56, 57, 57, 58, 58, 59, 59, 60, 61, 62, 63, 64, 65, 65, 66, 66,
+ 65, 65, 51, 49, 49, 47, 47, 47, 47, 46, 49, 50, 54, 54, 57, 58, 61, 62,
+ 63, 64, 65, 67, 67, 69, 69, 71, 72, 73, 73, 74, 75, 74, 72, 71, 57, 54,
+ 54, 52, 51, 51, 50, 50, 52, 53, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73,
+ 73, 76, 77, 79, 80, 82, 82, 83, 84, 82, 81, 79, 63, 60, 59, 57, 57, 56,
+ 55, 54, 57, 57, 60, 61, 64, 65, 67, 69, 71, 73, 75, 77, 78, 82, 82, 85,
+ 86, 89, 89, 90, 91, 91, 89, 87, 67, 63, 63, 60, 60, 59, 58, 57, 59, 60,
+ 62, 63, 65, 66, 69, 70, 73, 74, 77, 78, 81, 83, 85, 87, 88, 92, 92, 94,
+ 94, 96, 95, 95 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 34, 49, 72, 34, 48, 60, 79, 49, 60, 82, 104, 72, 79, 104, 134,
+ /* Size 8x8 */
+ 32, 32, 34, 38, 46, 56, 68, 78, 32, 33, 35, 39, 45, 54, 64, 74, 34, 35,
+ 39, 45, 51, 58, 68, 76, 38, 39, 45, 54, 61, 69, 78, 86, 46, 45, 51, 61,
+ 71, 80, 90, 99, 56, 54, 58, 69, 80, 92, 103, 113, 68, 64, 68, 78, 90,
+ 103, 117, 128, 78, 74, 76, 86, 99, 113, 128, 140,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 36, 39, 44, 48, 54, 59, 65, 71, 80, 83, 31, 32,
+ 32, 32, 32, 34, 35, 38, 42, 46, 51, 56, 62, 68, 76, 78, 31, 32, 32, 32,
+ 32, 33, 34, 37, 41, 44, 49, 54, 59, 65, 72, 75, 31, 32, 32, 33, 34, 35,
+ 36, 39, 42, 45, 50, 54, 59, 64, 71, 74, 32, 32, 32, 34, 35, 37, 38, 40,
+ 42, 46, 49, 53, 58, 63, 69, 72, 34, 34, 33, 35, 37, 39, 42, 45, 47, 51,
+ 54, 58, 63, 68, 74, 76, 36, 35, 34, 36, 38, 42, 48, 50, 54, 57, 60, 64,
+ 68, 73, 79, 81, 39, 38, 37, 39, 40, 45, 50, 54, 58, 61, 65, 69, 73, 78,
+ 84, 86, 44, 42, 41, 42, 42, 47, 54, 58, 63, 67, 71, 75, 79, 84, 90, 92,
+ 48, 46, 44, 45, 46, 51, 57, 61, 67, 71, 76, 80, 85, 90, 96, 99, 54, 51,
+ 49, 50, 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 59, 56, 54,
+ 54, 53, 58, 64, 69, 75, 80, 87, 92, 98, 103, 110, 113, 65, 62, 59, 59,
+ 58, 63, 68, 73, 79, 85, 92, 98, 105, 111, 118, 121, 71, 68, 65, 64, 63,
+ 68, 73, 78, 84, 90, 97, 103, 111, 117, 125, 128, 80, 76, 72, 71, 69, 74,
+ 79, 84, 90, 96, 104, 110, 118, 125, 134, 137, 83, 78, 75, 74, 72, 76,
+ 81, 86, 92, 99, 106, 113, 121, 128, 137, 140,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 54, 54, 59, 59, 65, 65, 71, 71, 80, 80, 83, 83, 87, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56,
+ 56, 62, 62, 68, 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 34, 35, 35, 38, 38, 42, 42, 46, 46, 51, 51, 56, 56, 62, 62, 68,
+ 68, 76, 76, 78, 78, 83, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 37, 37, 41, 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75,
+ 75, 79, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 37, 37, 41,
+ 41, 44, 44, 49, 49, 54, 54, 59, 59, 65, 65, 72, 72, 75, 75, 79, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50,
+ 50, 54, 54, 59, 59, 64, 64, 71, 71, 74, 74, 77, 31, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 42, 45, 45, 50, 50, 54, 54, 59,
+ 59, 64, 64, 71, 71, 74, 74, 77, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37,
+ 37, 38, 38, 40, 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69,
+ 69, 72, 72, 75, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 42, 46, 46, 49, 49, 53, 53, 58, 58, 63, 63, 69, 69, 72, 72, 75,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51,
+ 51, 54, 54, 58, 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 47, 51, 51, 54, 54, 58,
+ 58, 63, 63, 68, 68, 74, 74, 76, 76, 80, 36, 35, 35, 34, 34, 36, 36, 38,
+ 38, 42, 42, 48, 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73,
+ 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 54, 54, 57, 57, 60, 60, 64, 64, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58,
+ 58, 61, 61, 65, 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 39, 38,
+ 38, 37, 37, 39, 39, 40, 40, 45, 45, 50, 50, 54, 54, 58, 58, 61, 61, 65,
+ 65, 69, 69, 73, 73, 78, 78, 84, 84, 86, 86, 90, 44, 42, 42, 41, 41, 42,
+ 42, 42, 42, 47, 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79,
+ 79, 84, 84, 90, 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47,
+ 47, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61,
+ 61, 67, 67, 71, 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102,
+ 48, 46, 46, 44, 44, 45, 45, 46, 46, 51, 51, 57, 57, 61, 61, 67, 67, 71,
+ 71, 76, 76, 80, 80, 85, 85, 90, 90, 96, 96, 99, 99, 102, 54, 51, 51, 49,
+ 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87,
+ 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 54, 51, 51, 49, 49, 50, 50,
+ 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82, 87, 87, 92, 92,
+ 97, 97, 104, 104, 106, 106, 109, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58,
+ 58, 64, 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103,
+ 110, 110, 113, 113, 116, 59, 56, 56, 54, 54, 54, 54, 53, 53, 58, 58, 64,
+ 64, 69, 69, 75, 75, 80, 80, 87, 87, 92, 92, 98, 98, 103, 103, 110, 110,
+ 113, 113, 116, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73,
+ 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121,
+ 121, 124, 65, 62, 62, 59, 59, 59, 59, 58, 58, 63, 63, 68, 68, 73, 73,
+ 79, 79, 85, 85, 92, 92, 98, 98, 105, 105, 111, 111, 118, 118, 121, 121,
+ 124, 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84,
+ 90, 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132,
+ 71, 68, 68, 65, 65, 64, 64, 63, 63, 68, 68, 73, 73, 78, 78, 84, 84, 90,
+ 90, 97, 97, 103, 103, 111, 111, 117, 117, 125, 125, 128, 128, 132, 80,
+ 76, 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96,
+ 104, 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 80, 76,
+ 76, 72, 72, 71, 71, 69, 69, 74, 74, 79, 79, 84, 84, 90, 90, 96, 96, 104,
+ 104, 110, 110, 118, 118, 125, 125, 134, 134, 137, 137, 141, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 83, 78, 78,
+ 75, 75, 74, 74, 72, 72, 76, 76, 81, 81, 86, 86, 92, 92, 99, 99, 106,
+ 106, 113, 113, 121, 121, 128, 128, 137, 137, 140, 140, 144, 87, 83, 83,
+ 79, 79, 77, 77, 75, 75, 80, 80, 84, 84, 90, 90, 96, 96, 102, 102, 109,
+ 109, 116, 116, 124, 124, 132, 132, 141, 141, 144, 144, 149,
+ /* Size 4x8 */
+ 32, 35, 51, 75, 32, 36, 50, 71, 34, 42, 54, 73, 37, 50, 65, 84, 45, 56,
+ 76, 96, 54, 63, 87, 110, 65, 73, 97, 125, 75, 81, 106, 136,
+ /* Size 8x4 */
+ 32, 32, 34, 37, 45, 54, 65, 75, 35, 36, 42, 50, 56, 63, 73, 81, 51, 50,
+ 54, 65, 76, 87, 97, 106, 75, 71, 73, 84, 96, 110, 125, 136,
+ /* Size 8x16 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 33, 34, 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 33, 35, 38,
+ 42, 49, 58, 69, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34, 38, 48, 54, 60,
+ 68, 78, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71, 79, 90,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 58, 54,
+ 54, 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 71, 65, 63,
+ 73, 84, 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 82, 75, 72, 81,
+ 92, 106, 121, 136,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 32, 34, 36, 39, 44, 48, 53, 58, 65, 71, 79, 82, 31, 32,
+ 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 32, 32, 33, 34,
+ 35, 37, 38, 40, 43, 46, 50, 54, 58, 63, 70, 72, 36, 35, 34, 36, 38, 42,
+ 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 44, 42, 41, 42, 42, 48, 54, 58,
+ 63, 67, 71, 75, 79, 84, 90, 92, 53, 51, 49, 50, 49, 54, 60, 65, 71, 76,
+ 82, 87, 92, 97, 104, 106, 65, 62, 59, 59, 58, 63, 68, 73, 79, 85, 92,
+ 98, 105, 111, 118, 121, 79, 75, 72, 71, 69, 73, 78, 84, 90, 96, 103,
+ 110, 118, 125, 133, 136,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 44, 53, 53, 65, 65, 79, 79, 87, 31, 32,
+ 32, 32, 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 32,
+ 32, 35, 35, 42, 42, 51, 51, 62, 62, 75, 75, 82, 31, 32, 32, 33, 33, 34,
+ 34, 41, 41, 49, 49, 59, 59, 72, 72, 78, 31, 32, 32, 33, 33, 34, 34, 41,
+ 41, 49, 49, 59, 59, 72, 72, 78, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50,
+ 50, 59, 59, 71, 71, 77, 32, 32, 32, 34, 34, 36, 36, 42, 42, 50, 50, 59,
+ 59, 71, 71, 77, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69,
+ 69, 75, 32, 33, 33, 35, 35, 38, 38, 42, 42, 49, 49, 58, 58, 69, 69, 75,
+ 34, 34, 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 34, 34,
+ 34, 37, 37, 42, 42, 48, 48, 54, 54, 63, 63, 73, 73, 79, 36, 34, 34, 38,
+ 38, 48, 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 36, 34, 34, 38, 38, 48,
+ 48, 54, 54, 60, 60, 68, 68, 78, 78, 84, 39, 37, 37, 40, 40, 50, 50, 58,
+ 58, 65, 65, 73, 73, 84, 84, 89, 39, 37, 37, 40, 40, 50, 50, 58, 58, 65,
+ 65, 73, 73, 84, 84, 89, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79,
+ 79, 90, 90, 95, 44, 41, 41, 43, 43, 53, 53, 63, 63, 71, 71, 79, 79, 90,
+ 90, 95, 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102,
+ 48, 45, 45, 46, 46, 56, 56, 67, 67, 76, 76, 85, 85, 96, 96, 102, 53, 49,
+ 49, 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 53, 49, 49,
+ 50, 50, 60, 60, 71, 71, 82, 82, 92, 92, 103, 103, 109, 58, 54, 54, 54,
+ 54, 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 58, 54, 54, 54, 54,
+ 63, 63, 75, 75, 87, 87, 98, 98, 110, 110, 116, 65, 60, 60, 58, 58, 68,
+ 68, 79, 79, 92, 92, 105, 105, 118, 118, 124, 65, 60, 60, 58, 58, 68, 68,
+ 79, 79, 92, 92, 105, 105, 118, 118, 124, 71, 65, 65, 63, 63, 73, 73, 84,
+ 84, 97, 97, 111, 111, 125, 125, 132, 71, 65, 65, 63, 63, 73, 73, 84, 84,
+ 97, 97, 111, 111, 125, 125, 132, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 79, 72, 72, 70, 70, 79, 79, 90, 90,
+ 104, 104, 118, 118, 133, 133, 141, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 82, 75, 75, 72, 72, 81, 81, 92, 92,
+ 106, 106, 121, 121, 136, 136, 144, 87, 79, 79, 76, 76, 84, 84, 96, 96,
+ 109, 109, 124, 124, 141, 141, 149,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 31, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54, 54, 60, 60, 65,
+ 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38,
+ 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72,
+ 72, 76, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 43,
+ 43, 46, 46, 50, 50, 54, 54, 58, 58, 63, 63, 70, 70, 72, 72, 76, 36, 35,
+ 35, 34, 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60,
+ 60, 63, 63, 68, 68, 73, 73, 79, 79, 81, 81, 84, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68,
+ 68, 73, 73, 79, 79, 81, 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48,
+ 48, 54, 54, 58, 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90,
+ 90, 92, 92, 96, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63, 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96,
+ 53, 51, 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76,
+ 76, 82, 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 53, 51, 51,
+ 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82, 82,
+ 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59, 59,
+ 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105,
+ 105, 111, 111, 118, 118, 121, 121, 124, 65, 62, 62, 59, 59, 59, 59, 58,
+ 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98, 105, 105,
+ 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 79, 75, 75, 72, 72, 71, 71, 69, 69,
+ 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118, 118,
+ 125, 125, 133, 133, 136, 136, 141, 87, 82, 82, 78, 78, 77, 77, 75, 75,
+ 79, 79, 84, 84, 89, 89, 95, 95, 102, 102, 109, 109, 116, 116, 124, 124,
+ 132, 132, 141, 141, 144, 144, 149,
+ /* Size 4x16 */
+ 31, 36, 53, 79, 32, 35, 51, 75, 32, 34, 49, 72, 32, 36, 50, 71, 33, 38,
+ 49, 69, 34, 42, 54, 73, 34, 48, 60, 78, 37, 50, 65, 84, 41, 53, 71, 90,
+ 45, 56, 76, 96, 49, 60, 82, 103, 54, 63, 87, 110, 60, 68, 92, 118, 65,
+ 73, 97, 125, 72, 79, 104, 133, 75, 81, 106, 136,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 33, 34, 34, 37, 41, 45, 49, 54, 60, 65, 72, 75, 36, 35,
+ 34, 36, 38, 42, 48, 50, 53, 56, 60, 63, 68, 73, 79, 81, 53, 51, 49, 50,
+ 49, 54, 60, 65, 71, 76, 82, 87, 92, 97, 104, 106, 79, 75, 72, 71, 69,
+ 73, 78, 84, 90, 96, 103, 110, 118, 125, 133, 136,
+ /* Size 8x32 */
+ 32, 31, 32, 36, 44, 53, 65, 79, 31, 32, 32, 35, 42, 51, 62, 75, 31, 32,
+ 32, 35, 42, 51, 62, 75, 31, 32, 33, 34, 41, 49, 59, 72, 31, 32, 33, 34,
+ 41, 49, 59, 72, 32, 32, 34, 36, 42, 50, 59, 71, 32, 32, 34, 36, 42, 50,
+ 59, 71, 32, 33, 35, 38, 42, 49, 58, 69, 32, 33, 35, 38, 42, 49, 58, 69,
+ 34, 34, 37, 42, 48, 54, 63, 73, 34, 34, 37, 42, 48, 54, 63, 73, 36, 34,
+ 38, 48, 54, 60, 68, 78, 36, 34, 38, 48, 54, 60, 68, 78, 39, 37, 40, 50,
+ 58, 65, 73, 84, 39, 37, 40, 50, 58, 65, 73, 84, 44, 41, 43, 53, 63, 71,
+ 79, 90, 44, 41, 43, 53, 63, 71, 79, 90, 48, 45, 46, 56, 67, 76, 85, 96,
+ 48, 45, 46, 56, 67, 76, 85, 96, 53, 49, 50, 60, 71, 82, 92, 103, 53, 49,
+ 50, 60, 71, 82, 92, 103, 58, 54, 54, 63, 75, 87, 98, 110, 58, 54, 54,
+ 63, 75, 87, 98, 110, 65, 60, 58, 68, 79, 92, 105, 118, 65, 60, 58, 68,
+ 79, 92, 105, 118, 71, 65, 63, 73, 84, 97, 111, 125, 71, 65, 63, 73, 84,
+ 97, 111, 125, 79, 72, 70, 79, 90, 104, 118, 133, 79, 72, 70, 79, 90,
+ 104, 118, 133, 82, 75, 72, 81, 92, 106, 121, 136, 82, 75, 72, 81, 92,
+ 106, 121, 136, 87, 79, 76, 84, 96, 109, 124, 141,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 44, 48,
+ 48, 53, 53, 58, 58, 65, 65, 71, 71, 79, 79, 82, 82, 87, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 41, 45, 45, 49, 49, 54,
+ 54, 60, 60, 65, 65, 72, 72, 75, 75, 79, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 43, 46, 46, 50, 50, 54, 54, 58, 58, 63,
+ 63, 70, 70, 72, 72, 76, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 68, 68, 73, 73, 79, 79, 81,
+ 81, 84, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58, 58, 63,
+ 63, 67, 67, 71, 71, 75, 75, 79, 79, 84, 84, 90, 90, 92, 92, 96, 53, 51,
+ 51, 49, 49, 50, 50, 49, 49, 54, 54, 60, 60, 65, 65, 71, 71, 76, 76, 82,
+ 82, 87, 87, 92, 92, 97, 97, 104, 104, 106, 106, 109, 65, 62, 62, 59, 59,
+ 59, 59, 58, 58, 63, 63, 68, 68, 73, 73, 79, 79, 85, 85, 92, 92, 98, 98,
+ 105, 105, 111, 111, 118, 118, 121, 121, 124, 79, 75, 75, 72, 72, 71, 71,
+ 69, 69, 73, 73, 78, 78, 84, 84, 90, 90, 96, 96, 103, 103, 110, 110, 118,
+ 118, 125, 125, 133, 133, 136, 136, 141 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 46, 47, 57, 46, 53, 54, 60, 47, 54, 66, 75, 57, 60, 75, 89,
+ /* Size 8x8 */
+ 31, 34, 42, 47, 48, 52, 57, 61, 34, 39, 45, 46, 46, 49, 53, 57, 42, 45,
+ 48, 49, 50, 52, 55, 58, 47, 46, 49, 54, 56, 58, 61, 64, 48, 46, 50, 56,
+ 61, 65, 68, 71, 52, 49, 52, 58, 65, 71, 75, 79, 57, 53, 55, 61, 68, 75,
+ 82, 86, 61, 57, 58, 64, 71, 79, 86, 91,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 49, 48, 49, 50, 52, 54, 57, 60, 63, 65, 31, 31,
+ 31, 34, 38, 42, 47, 47, 47, 48, 50, 52, 54, 57, 60, 61, 30, 31, 32, 35,
+ 40, 42, 46, 45, 45, 46, 47, 49, 52, 54, 57, 58, 33, 34, 35, 39, 43, 45,
+ 47, 46, 45, 46, 47, 49, 51, 53, 56, 57, 36, 38, 40, 43, 47, 47, 48, 46,
+ 45, 46, 47, 48, 50, 52, 54, 55, 41, 42, 42, 45, 47, 48, 50, 49, 49, 50,
+ 50, 52, 53, 55, 57, 58, 49, 47, 46, 47, 48, 50, 53, 53, 53, 54, 54, 55,
+ 56, 58, 60, 61, 48, 47, 45, 46, 46, 49, 53, 54, 55, 56, 57, 58, 60, 61,
+ 63, 64, 49, 47, 45, 45, 45, 49, 53, 55, 58, 60, 61, 62, 63, 65, 67, 68,
+ 50, 48, 46, 46, 46, 50, 54, 56, 60, 61, 63, 65, 67, 68, 71, 71, 52, 50,
+ 47, 47, 47, 50, 54, 57, 61, 63, 66, 68, 70, 72, 75, 75, 54, 52, 49, 49,
+ 48, 52, 55, 58, 62, 65, 68, 71, 73, 75, 78, 79, 57, 54, 52, 51, 50, 53,
+ 56, 60, 63, 67, 70, 73, 76, 79, 82, 83, 60, 57, 54, 53, 52, 55, 58, 61,
+ 65, 68, 72, 75, 79, 82, 85, 86, 63, 60, 57, 56, 54, 57, 60, 63, 67, 71,
+ 75, 78, 82, 85, 89, 90, 65, 61, 58, 57, 55, 58, 61, 64, 68, 71, 75, 79,
+ 83, 86, 90, 91,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 65, 65, 67, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52,
+ 52, 54, 54, 57, 57, 60, 60, 61, 61, 63, 31, 31, 31, 31, 31, 34, 34, 38,
+ 38, 42, 42, 47, 47, 47, 47, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 57,
+ 57, 60, 60, 61, 61, 63, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46,
+ 46, 45, 45, 45, 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58,
+ 58, 60, 30, 31, 31, 32, 32, 35, 35, 40, 40, 42, 42, 46, 46, 45, 45, 45,
+ 45, 46, 46, 47, 47, 49, 49, 52, 52, 54, 54, 57, 57, 58, 58, 60, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47,
+ 47, 49, 49, 51, 51, 53, 53, 56, 56, 57, 57, 59, 33, 34, 34, 35, 35, 39,
+ 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 49, 49, 51,
+ 51, 53, 53, 56, 56, 57, 57, 59, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47,
+ 47, 48, 48, 46, 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54,
+ 54, 55, 55, 57, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 45, 46, 46, 47, 47, 48, 48, 50, 50, 52, 52, 54, 54, 55, 55, 57,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50,
+ 50, 50, 50, 52, 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 41, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 52,
+ 52, 53, 53, 55, 55, 57, 57, 58, 58, 60, 49, 47, 47, 46, 46, 47, 47, 48,
+ 48, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58,
+ 58, 60, 60, 61, 61, 62, 49, 47, 47, 46, 46, 47, 47, 48, 48, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 62, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55,
+ 55, 56, 56, 57, 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 48, 47,
+ 47, 45, 45, 46, 46, 46, 46, 49, 49, 53, 53, 54, 54, 55, 55, 56, 56, 57,
+ 57, 58, 58, 60, 60, 61, 61, 63, 63, 64, 64, 66, 49, 47, 47, 45, 45, 45,
+ 45, 45, 45, 49, 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63,
+ 63, 65, 65, 67, 67, 68, 68, 69, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49,
+ 49, 53, 53, 55, 55, 58, 58, 60, 60, 61, 61, 62, 62, 63, 63, 65, 65, 67,
+ 67, 68, 68, 69, 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56,
+ 56, 60, 60, 61, 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72,
+ 50, 48, 48, 46, 46, 46, 46, 46, 46, 50, 50, 54, 54, 56, 56, 60, 60, 61,
+ 61, 63, 63, 65, 65, 67, 67, 68, 68, 71, 71, 71, 71, 72, 52, 50, 50, 47,
+ 47, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 76, 52, 50, 50, 47, 47, 47, 47, 47,
+ 47, 50, 50, 54, 54, 57, 57, 61, 61, 63, 63, 66, 66, 68, 68, 70, 70, 72,
+ 72, 75, 75, 75, 75, 76, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55,
+ 55, 58, 58, 62, 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79,
+ 79, 80, 54, 52, 52, 49, 49, 49, 49, 48, 48, 52, 52, 55, 55, 58, 58, 62,
+ 62, 65, 65, 68, 68, 71, 71, 73, 73, 75, 75, 78, 78, 79, 79, 80, 57, 54,
+ 54, 52, 52, 51, 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70,
+ 70, 73, 73, 76, 76, 79, 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 56, 56, 60, 60, 63, 63, 67, 67, 70, 70, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55,
+ 55, 58, 58, 61, 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85,
+ 85, 86, 86, 88, 60, 57, 57, 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 61,
+ 61, 65, 65, 68, 68, 72, 72, 75, 75, 79, 79, 82, 82, 85, 85, 86, 86, 88,
+ 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71,
+ 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60, 60, 57,
+ 57, 56, 56, 54, 54, 57, 57, 60, 60, 63, 63, 67, 67, 71, 71, 75, 75, 78,
+ 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 65, 61, 61, 58, 58, 57, 57, 55,
+ 55, 58, 58, 61, 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86,
+ 86, 90, 90, 91, 91, 93, 65, 61, 61, 58, 58, 57, 57, 55, 55, 58, 58, 61,
+ 61, 64, 64, 68, 68, 71, 71, 75, 75, 79, 79, 83, 83, 86, 86, 90, 90, 91,
+ 91, 93, 67, 63, 63, 60, 60, 59, 59, 57, 57, 60, 60, 62, 62, 66, 66, 69,
+ 69, 72, 72, 76, 76, 80, 80, 84, 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 4x8 */
+ 31, 47, 50, 60, 36, 47, 47, 56, 43, 50, 50, 57, 46, 53, 57, 64, 46, 54,
+ 64, 71, 50, 55, 68, 78, 54, 58, 72, 85, 59, 61, 75, 90,
+ /* Size 8x4 */
+ 31, 36, 43, 46, 46, 50, 54, 59, 47, 47, 50, 53, 54, 55, 58, 61, 50, 47,
+ 50, 57, 64, 68, 72, 75, 60, 56, 57, 64, 71, 78, 85, 90,
+ /* Size 8x16 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 30, 32,
+ 40, 46, 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 37, 40, 47, 47,
+ 45, 47, 50, 54, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46, 48, 53, 53, 54,
+ 57, 60, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61, 64, 67,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 54, 50,
+ 49, 55, 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 60, 54, 52, 58,
+ 65, 72, 79, 85, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59, 56, 61, 68, 75,
+ 83, 90,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 37, 42, 49, 48, 49, 50, 52, 54, 57, 60, 63, 64, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 47, 49, 50, 52, 55, 56, 48, 47, 46, 47, 47, 50,
+ 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 49, 47, 45, 46, 45, 49, 53, 56,
+ 58, 59, 61, 62, 64, 65, 67, 68, 52, 50, 48, 47, 47, 50, 54, 57, 61, 64,
+ 66, 68, 70, 72, 75, 75, 57, 54, 52, 51, 50, 53, 57, 60, 64, 67, 71, 73,
+ 76, 79, 82, 83, 63, 60, 57, 56, 54, 57, 60, 64, 67, 71, 75, 78, 82, 85,
+ 89, 90,
+ /* Size 16x32 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 49, 52, 52, 57, 57, 63, 63, 66, 31, 31,
+ 31, 38, 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 31, 31, 31, 38,
+ 38, 47, 47, 47, 47, 50, 50, 54, 54, 60, 60, 63, 30, 32, 32, 40, 40, 46,
+ 46, 45, 45, 48, 48, 52, 52, 57, 57, 60, 30, 32, 32, 40, 40, 46, 46, 45,
+ 45, 48, 48, 52, 52, 57, 57, 60, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47,
+ 47, 51, 51, 56, 56, 59, 33, 36, 36, 43, 43, 47, 47, 46, 46, 47, 47, 51,
+ 51, 56, 56, 59, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54,
+ 54, 57, 37, 40, 40, 47, 47, 47, 47, 45, 45, 47, 47, 50, 50, 54, 54, 57,
+ 42, 43, 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 42, 43,
+ 43, 47, 47, 50, 50, 49, 49, 50, 50, 53, 53, 57, 57, 60, 49, 46, 46, 48,
+ 48, 53, 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 49, 46, 46, 48, 48, 53,
+ 53, 53, 53, 54, 54, 57, 57, 60, 60, 62, 48, 46, 46, 47, 47, 53, 53, 56,
+ 56, 57, 57, 60, 60, 64, 64, 66, 48, 46, 46, 47, 47, 53, 53, 56, 56, 57,
+ 57, 60, 60, 64, 64, 66, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64,
+ 64, 67, 67, 69, 49, 45, 45, 46, 46, 53, 53, 58, 58, 61, 61, 64, 64, 67,
+ 67, 69, 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73,
+ 50, 46, 46, 46, 46, 54, 54, 59, 59, 64, 64, 67, 67, 71, 71, 73, 52, 48,
+ 48, 47, 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 52, 48, 48, 47,
+ 47, 54, 54, 61, 61, 66, 66, 71, 71, 75, 75, 77, 54, 50, 50, 49, 49, 55,
+ 55, 62, 62, 68, 68, 73, 73, 78, 78, 80, 54, 50, 50, 49, 49, 55, 55, 62,
+ 62, 68, 68, 73, 73, 78, 78, 80, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70,
+ 70, 76, 76, 82, 82, 84, 57, 52, 52, 50, 50, 56, 56, 64, 64, 70, 70, 76,
+ 76, 82, 82, 84, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85,
+ 85, 88, 60, 54, 54, 52, 52, 58, 58, 65, 65, 72, 72, 79, 79, 85, 85, 88,
+ 63, 57, 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 63, 57,
+ 57, 55, 55, 60, 60, 67, 67, 75, 75, 82, 82, 89, 89, 92, 64, 59, 59, 56,
+ 56, 61, 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 64, 59, 59, 56, 56, 61,
+ 61, 68, 68, 75, 75, 83, 83, 90, 90, 93, 66, 60, 60, 57, 57, 63, 63, 69,
+ 69, 77, 77, 84, 84, 92, 92, 95,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 31, 31, 31, 32, 32, 36, 36, 40,
+ 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50, 50, 52, 52, 54,
+ 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48,
+ 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56,
+ 56, 57, 37, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 47, 47, 46,
+ 46, 46, 46, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 57, 48, 47,
+ 47, 46, 46, 47, 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 55, 55, 56, 56, 58, 58, 60, 60, 61, 61, 63, 48, 47, 47, 46, 46, 47,
+ 47, 47, 47, 50, 50, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56,
+ 56, 58, 58, 60, 60, 61, 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49,
+ 49, 53, 53, 56, 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67,
+ 67, 68, 68, 69, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58, 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69,
+ 52, 50, 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64,
+ 64, 66, 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 52, 50, 50, 48,
+ 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66, 66, 68,
+ 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51, 51, 50,
+ 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79,
+ 79, 82, 82, 83, 83, 84, 57, 54, 54, 52, 52, 51, 51, 50, 50, 53, 53, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76, 76, 79, 79, 82, 82, 83,
+ 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67,
+ 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 63, 60,
+ 60, 57, 57, 56, 56, 54, 54, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 75,
+ 75, 78, 78, 82, 82, 85, 85, 89, 89, 90, 90, 92, 66, 63, 63, 60, 60, 59,
+ 59, 57, 57, 60, 60, 62, 62, 66, 66, 69, 69, 73, 73, 77, 77, 80, 80, 84,
+ 84, 88, 88, 92, 92, 93, 93, 95,
+ /* Size 4x16 */
+ 31, 48, 52, 63, 31, 47, 50, 60, 32, 46, 48, 57, 36, 47, 47, 56, 40, 47,
+ 47, 54, 43, 50, 50, 57, 46, 53, 54, 60, 46, 53, 57, 64, 45, 53, 61, 67,
+ 46, 54, 64, 71, 48, 54, 66, 75, 50, 55, 68, 78, 52, 56, 70, 82, 54, 58,
+ 72, 85, 57, 60, 75, 89, 59, 61, 75, 90,
+ /* Size 16x4 */
+ 31, 31, 32, 36, 40, 43, 46, 46, 45, 46, 48, 50, 52, 54, 57, 59, 48, 47,
+ 46, 47, 47, 50, 53, 53, 53, 54, 54, 55, 56, 58, 60, 61, 52, 50, 48, 47,
+ 47, 50, 54, 57, 61, 64, 66, 68, 70, 72, 75, 75, 63, 60, 57, 56, 54, 57,
+ 60, 64, 67, 71, 75, 78, 82, 85, 89, 90,
+ /* Size 8x32 */
+ 32, 31, 37, 48, 49, 52, 57, 63, 31, 31, 38, 47, 47, 50, 54, 60, 31, 31,
+ 38, 47, 47, 50, 54, 60, 30, 32, 40, 46, 45, 48, 52, 57, 30, 32, 40, 46,
+ 45, 48, 52, 57, 33, 36, 43, 47, 46, 47, 51, 56, 33, 36, 43, 47, 46, 47,
+ 51, 56, 37, 40, 47, 47, 45, 47, 50, 54, 37, 40, 47, 47, 45, 47, 50, 54,
+ 42, 43, 47, 50, 49, 50, 53, 57, 42, 43, 47, 50, 49, 50, 53, 57, 49, 46,
+ 48, 53, 53, 54, 57, 60, 49, 46, 48, 53, 53, 54, 57, 60, 48, 46, 47, 53,
+ 56, 57, 60, 64, 48, 46, 47, 53, 56, 57, 60, 64, 49, 45, 46, 53, 58, 61,
+ 64, 67, 49, 45, 46, 53, 58, 61, 64, 67, 50, 46, 46, 54, 59, 64, 67, 71,
+ 50, 46, 46, 54, 59, 64, 67, 71, 52, 48, 47, 54, 61, 66, 71, 75, 52, 48,
+ 47, 54, 61, 66, 71, 75, 54, 50, 49, 55, 62, 68, 73, 78, 54, 50, 49, 55,
+ 62, 68, 73, 78, 57, 52, 50, 56, 64, 70, 76, 82, 57, 52, 50, 56, 64, 70,
+ 76, 82, 60, 54, 52, 58, 65, 72, 79, 85, 60, 54, 52, 58, 65, 72, 79, 85,
+ 63, 57, 55, 60, 67, 75, 82, 89, 63, 57, 55, 60, 67, 75, 82, 89, 64, 59,
+ 56, 61, 68, 75, 83, 90, 64, 59, 56, 61, 68, 75, 83, 90, 66, 60, 57, 63,
+ 69, 77, 84, 92,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 49, 50,
+ 50, 52, 52, 54, 54, 57, 57, 60, 60, 63, 63, 64, 64, 66, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 46, 46, 48, 48, 50,
+ 50, 52, 52, 54, 54, 57, 57, 59, 59, 60, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 47, 47, 49, 49, 50, 50, 52,
+ 52, 55, 55, 56, 56, 57, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 58, 58, 60, 60, 61,
+ 61, 63, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56, 56, 58,
+ 58, 59, 59, 61, 61, 62, 62, 64, 64, 65, 65, 67, 67, 68, 68, 69, 52, 50,
+ 50, 48, 48, 47, 47, 47, 47, 50, 50, 54, 54, 57, 57, 61, 61, 64, 64, 66,
+ 66, 68, 68, 70, 70, 72, 72, 75, 75, 75, 75, 77, 57, 54, 54, 52, 52, 51,
+ 51, 50, 50, 53, 53, 57, 57, 60, 60, 64, 64, 67, 67, 71, 71, 73, 73, 76,
+ 76, 79, 79, 82, 82, 83, 83, 84, 63, 60, 60, 57, 57, 56, 56, 54, 54, 57,
+ 57, 60, 60, 64, 64, 67, 67, 71, 71, 75, 75, 78, 78, 82, 82, 85, 85, 89,
+ 89, 90, 90, 92 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 45, 62, 33, 39, 51, 64, 45, 51, 71, 87, 62, 64, 87, 108,
+ /* Size 8x8 */
+ 31, 32, 32, 35, 42, 51, 59, 69, 32, 32, 33, 35, 41, 49, 56, 65, 32, 33,
+ 35, 38, 43, 49, 56, 64, 35, 35, 38, 48, 54, 59, 66, 73, 42, 41, 43, 54,
+ 63, 71, 77, 85, 51, 49, 49, 59, 71, 81, 89, 97, 59, 56, 56, 66, 77, 89,
+ 98, 108, 69, 65, 64, 73, 85, 97, 108, 119,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 34, 35, 38, 41, 45, 48, 54, 59, 65, 71, 80, 31, 32,
+ 32, 32, 32, 34, 35, 37, 40, 43, 46, 51, 56, 62, 68, 76, 31, 32, 32, 32,
+ 32, 33, 34, 36, 38, 41, 44, 49, 54, 59, 65, 72, 31, 32, 32, 33, 34, 35,
+ 36, 38, 40, 42, 45, 50, 54, 59, 64, 71, 32, 32, 32, 34, 35, 37, 38, 39,
+ 41, 43, 46, 49, 53, 58, 63, 69, 34, 34, 33, 35, 37, 39, 42, 44, 46, 48,
+ 51, 54, 58, 63, 68, 74, 35, 35, 34, 36, 38, 42, 46, 48, 50, 53, 55, 59,
+ 62, 67, 72, 78, 38, 37, 36, 38, 39, 44, 48, 51, 54, 57, 59, 63, 67, 71,
+ 76, 82, 41, 40, 38, 40, 41, 46, 50, 54, 57, 60, 63, 67, 71, 75, 80, 86,
+ 45, 43, 41, 42, 43, 48, 53, 57, 60, 65, 68, 72, 76, 81, 85, 91, 48, 46,
+ 44, 45, 46, 51, 55, 59, 63, 68, 71, 76, 80, 85, 90, 96, 54, 51, 49, 50,
+ 49, 54, 59, 63, 67, 72, 76, 82, 87, 92, 97, 104, 59, 56, 54, 54, 53, 58,
+ 62, 67, 71, 76, 80, 87, 92, 98, 103, 110, 65, 62, 59, 59, 58, 63, 67,
+ 71, 75, 81, 85, 92, 98, 105, 111, 118, 71, 68, 65, 64, 63, 68, 72, 76,
+ 80, 85, 90, 97, 103, 111, 117, 125, 80, 76, 72, 71, 69, 74, 78, 82, 86,
+ 91, 96, 104, 110, 118, 125, 134,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 45, 48, 48, 53, 54, 57, 59, 62, 65, 67, 71, 72, 80, 80, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51,
+ 52, 55, 56, 59, 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 34, 34, 35, 35, 37, 38, 40, 42, 43, 46, 46, 51, 51, 55, 56, 59,
+ 62, 64, 68, 69, 76, 76, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 36, 38, 39, 41, 42, 45, 45, 49, 50, 53, 54, 57, 60, 62, 66, 66,
+ 73, 73, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 36, 37,
+ 38, 41, 41, 44, 44, 49, 49, 52, 54, 56, 59, 61, 65, 65, 72, 72, 31, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 37, 38, 39, 41, 42, 45,
+ 45, 49, 49, 52, 54, 56, 59, 61, 64, 65, 72, 72, 31, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 45, 45, 49, 50, 52,
+ 54, 56, 59, 60, 64, 65, 71, 71, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 35, 35, 36, 37, 38, 39, 40, 42, 43, 45, 45, 49, 49, 52, 54, 56, 59, 60,
+ 64, 64, 70, 70, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 37, 38, 38,
+ 39, 40, 41, 42, 43, 46, 46, 49, 49, 52, 53, 55, 58, 59, 63, 63, 69, 69,
+ 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 41, 41, 43,
+ 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34, 34, 33,
+ 33, 34, 35, 35, 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54,
+ 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 34, 34, 34, 33, 33, 34, 35, 35,
+ 37, 37, 39, 39, 42, 42, 44, 45, 46, 47, 48, 51, 51, 54, 54, 57, 58, 60,
+ 63, 64, 68, 68, 74, 74, 35, 35, 35, 34, 34, 35, 36, 36, 38, 38, 42, 42,
+ 46, 47, 48, 49, 50, 52, 53, 55, 55, 58, 59, 61, 62, 64, 67, 68, 72, 72,
+ 78, 78, 36, 35, 35, 34, 34, 35, 36, 37, 38, 38, 42, 42, 47, 48, 50, 50,
+ 52, 54, 54, 57, 57, 59, 60, 62, 64, 66, 68, 69, 73, 73, 79, 79, 38, 37,
+ 37, 36, 36, 37, 38, 38, 39, 40, 44, 44, 48, 50, 51, 52, 54, 56, 57, 59,
+ 59, 62, 63, 65, 67, 69, 71, 72, 76, 76, 82, 82, 39, 38, 38, 38, 37, 38,
+ 39, 39, 40, 41, 45, 45, 49, 50, 52, 54, 55, 58, 58, 61, 61, 64, 65, 67,
+ 69, 71, 73, 74, 78, 78, 84, 84, 41, 40, 40, 39, 38, 39, 40, 40, 41, 41,
+ 46, 46, 50, 52, 54, 55, 57, 60, 60, 63, 63, 67, 67, 70, 71, 73, 75, 77,
+ 80, 81, 86, 86, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 47, 47, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 45, 43, 43, 42, 41, 42, 42, 43, 43, 43, 48, 48, 53, 54, 57, 58, 60, 64,
+ 65, 68, 68, 72, 72, 75, 76, 78, 81, 82, 85, 86, 91, 91, 48, 46, 46, 45,
+ 44, 45, 45, 45, 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75,
+ 76, 79, 80, 83, 85, 87, 90, 91, 96, 96, 48, 46, 46, 45, 44, 45, 45, 45,
+ 46, 46, 51, 51, 55, 57, 59, 61, 63, 67, 68, 71, 71, 75, 76, 79, 80, 83,
+ 85, 87, 90, 91, 96, 96, 53, 51, 51, 49, 49, 49, 49, 49, 49, 49, 54, 54,
+ 58, 59, 62, 64, 67, 71, 72, 75, 75, 81, 81, 85, 86, 89, 91, 93, 97, 97,
+ 103, 103, 54, 52, 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63,
+ 65, 67, 71, 72, 76, 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104,
+ 57, 55, 55, 53, 52, 52, 52, 52, 52, 52, 57, 57, 61, 62, 65, 67, 70, 74,
+ 75, 79, 79, 85, 85, 89, 90, 93, 96, 98, 102, 102, 108, 108, 59, 56, 56,
+ 54, 54, 54, 54, 54, 53, 54, 58, 58, 62, 64, 67, 69, 71, 75, 76, 80, 80,
+ 86, 87, 90, 92, 95, 98, 99, 103, 104, 110, 110, 62, 59, 59, 57, 56, 56,
+ 56, 56, 55, 56, 60, 60, 64, 66, 69, 71, 73, 77, 78, 83, 83, 89, 89, 93,
+ 95, 98, 101, 103, 107, 108, 114, 114, 65, 62, 62, 60, 59, 59, 59, 59,
+ 58, 58, 63, 63, 67, 68, 71, 73, 75, 79, 81, 85, 85, 91, 92, 96, 98, 101,
+ 105, 106, 111, 111, 118, 118, 67, 64, 64, 62, 61, 61, 60, 60, 59, 60,
+ 64, 64, 68, 69, 72, 74, 77, 81, 82, 87, 87, 93, 94, 98, 99, 103, 106,
+ 108, 113, 113, 120, 120, 71, 68, 68, 66, 65, 64, 64, 64, 63, 63, 68, 68,
+ 72, 73, 76, 78, 80, 84, 85, 90, 90, 97, 97, 102, 103, 107, 111, 113,
+ 117, 118, 125, 125, 72, 69, 69, 66, 65, 65, 65, 64, 63, 64, 68, 68, 72,
+ 73, 76, 78, 81, 85, 86, 91, 91, 97, 98, 102, 104, 108, 111, 113, 118,
+ 119, 126, 126, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79,
+ 82, 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134, 80, 76, 76, 73, 72, 72, 71, 70, 69, 70, 74, 74, 78, 79, 82,
+ 84, 86, 90, 91, 96, 96, 103, 104, 108, 110, 114, 118, 120, 125, 126,
+ 134, 134,
+ /* Size 4x8 */
+ 32, 34, 43, 62, 32, 34, 42, 59, 33, 37, 44, 58, 35, 43, 54, 68, 41, 48,
+ 64, 79, 49, 54, 71, 91, 57, 60, 78, 101, 66, 68, 86, 111,
+ /* Size 8x4 */
+ 32, 32, 33, 35, 41, 49, 57, 66, 34, 34, 37, 43, 48, 54, 60, 68, 43, 42,
+ 44, 54, 64, 71, 78, 86, 62, 59, 58, 68, 79, 91, 101, 111,
+ /* Size 8x16 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 59, 69, 31, 32,
+ 33, 34, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50, 57, 65, 32, 33, 35, 38,
+ 42, 49, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59,
+ 65, 73, 38, 36, 40, 49, 56, 63, 69, 77, 41, 39, 41, 51, 60, 67, 74, 81,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 53, 49,
+ 50, 60, 71, 82, 90, 99, 58, 54, 54, 63, 75, 87, 95, 105, 65, 60, 58, 68,
+ 79, 92, 102, 112, 71, 65, 63, 73, 84, 97, 108, 119, 79, 72, 70, 79, 90,
+ 104, 115, 127,
+ /* Size 16x8 */
+ 32, 31, 31, 32, 32, 34, 35, 38, 41, 44, 48, 53, 58, 65, 71, 79, 31, 32,
+ 32, 32, 33, 34, 34, 36, 39, 42, 45, 49, 54, 60, 65, 72, 32, 32, 33, 34,
+ 35, 37, 38, 40, 41, 43, 46, 50, 54, 58, 63, 70, 36, 35, 34, 36, 38, 42,
+ 47, 49, 51, 54, 56, 60, 63, 68, 73, 79, 44, 42, 41, 42, 42, 48, 52, 56,
+ 60, 64, 67, 71, 75, 79, 84, 90, 53, 51, 49, 50, 49, 54, 59, 63, 67, 72,
+ 76, 82, 87, 92, 97, 104, 62, 59, 57, 57, 56, 61, 65, 69, 74, 79, 83, 90,
+ 95, 102, 108, 115, 73, 69, 66, 65, 64, 69, 73, 77, 81, 86, 91, 99, 105,
+ 112, 119, 127,
+ /* Size 16x32 */
+ 32, 31, 31, 32, 32, 34, 36, 38, 44, 44, 53, 53, 62, 65, 73, 79, 31, 32,
+ 32, 32, 32, 34, 35, 37, 42, 43, 51, 51, 60, 62, 70, 75, 31, 32, 32, 32,
+ 32, 34, 35, 37, 42, 43, 51, 51, 59, 62, 69, 75, 31, 32, 32, 32, 32, 33,
+ 35, 36, 41, 42, 50, 50, 58, 60, 67, 73, 31, 32, 32, 32, 33, 33, 34, 36,
+ 41, 41, 49, 49, 57, 59, 66, 72, 31, 32, 32, 33, 33, 34, 35, 37, 41, 42,
+ 49, 49, 57, 59, 66, 71, 32, 32, 32, 33, 34, 35, 36, 38, 42, 43, 50, 50,
+ 57, 59, 65, 71, 32, 32, 32, 34, 34, 35, 37, 38, 42, 43, 49, 49, 56, 59,
+ 65, 70, 32, 32, 33, 34, 35, 37, 38, 39, 42, 43, 49, 49, 56, 58, 64, 69,
+ 32, 33, 33, 34, 35, 37, 39, 40, 43, 44, 50, 50, 56, 58, 64, 69, 34, 34,
+ 34, 36, 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 34, 34, 34, 36,
+ 37, 39, 42, 44, 48, 48, 54, 54, 61, 63, 69, 73, 35, 34, 34, 37, 38, 42,
+ 47, 48, 52, 53, 59, 59, 65, 67, 73, 77, 36, 35, 34, 37, 38, 43, 48, 49,
+ 54, 54, 60, 60, 66, 68, 74, 78, 38, 36, 36, 38, 40, 44, 49, 51, 56, 57,
+ 63, 63, 69, 71, 77, 81, 39, 38, 37, 40, 40, 45, 50, 52, 58, 58, 65, 65,
+ 71, 73, 79, 84, 41, 39, 39, 41, 41, 46, 51, 54, 60, 60, 67, 67, 74, 76,
+ 81, 86, 44, 41, 41, 42, 43, 48, 53, 56, 63, 64, 71, 71, 78, 79, 85, 90,
+ 44, 42, 42, 43, 43, 48, 54, 56, 64, 64, 72, 72, 79, 81, 86, 91, 48, 45,
+ 45, 46, 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 48, 45, 45, 46,
+ 46, 51, 56, 59, 67, 67, 76, 76, 83, 85, 91, 96, 53, 49, 49, 49, 49, 54,
+ 59, 62, 71, 71, 81, 81, 89, 91, 98, 103, 53, 50, 49, 50, 50, 54, 60, 63,
+ 71, 72, 82, 82, 90, 92, 99, 103, 57, 53, 52, 52, 52, 57, 62, 65, 74, 75,
+ 85, 85, 94, 96, 103, 108, 58, 54, 54, 54, 54, 58, 63, 67, 75, 76, 87,
+ 87, 95, 98, 105, 110, 61, 57, 57, 56, 56, 60, 66, 69, 77, 78, 89, 89,
+ 98, 101, 108, 114, 65, 60, 60, 59, 58, 63, 68, 71, 79, 80, 92, 92, 102,
+ 105, 112, 118, 67, 62, 61, 60, 60, 64, 69, 72, 81, 82, 94, 94, 103, 106,
+ 114, 120, 71, 66, 65, 64, 63, 68, 73, 76, 84, 85, 97, 97, 108, 111, 119,
+ 125, 72, 66, 66, 64, 64, 68, 73, 76, 85, 86, 98, 98, 108, 111, 119, 125,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ 79, 73, 72, 71, 70, 74, 79, 82, 90, 91, 104, 104, 115, 118, 127, 133,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 36, 38, 39, 41, 42, 45, 45, 49,
+ 50, 53, 54, 57, 60, 62, 66, 66, 73, 73, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49, 49, 52, 54, 57,
+ 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 36, 36,
+ 37, 37, 38, 40, 41, 42, 43, 46, 46, 49, 50, 52, 54, 56, 59, 60, 64, 64,
+ 71, 71, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40,
+ 41, 43, 43, 46, 46, 49, 50, 52, 54, 56, 58, 60, 63, 64, 70, 70, 34, 34,
+ 34, 33, 33, 34, 35, 35, 37, 37, 39, 39, 42, 43, 44, 45, 46, 48, 48, 51,
+ 51, 54, 54, 57, 58, 60, 63, 64, 68, 68, 74, 74, 36, 35, 35, 35, 34, 35,
+ 36, 37, 38, 39, 42, 42, 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62,
+ 63, 66, 68, 69, 73, 73, 79, 79, 38, 37, 37, 36, 36, 37, 38, 38, 39, 40,
+ 44, 44, 48, 49, 51, 52, 54, 56, 56, 59, 59, 62, 63, 65, 67, 69, 71, 72,
+ 76, 76, 82, 82, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54,
+ 56, 58, 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90,
+ 44, 43, 43, 42, 41, 42, 43, 43, 43, 44, 48, 48, 53, 54, 57, 58, 60, 64,
+ 64, 67, 67, 71, 72, 75, 76, 78, 80, 82, 85, 86, 91, 91, 53, 51, 51, 50,
+ 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81,
+ 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 53, 51, 51, 50, 49, 49, 50,
+ 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76, 76, 81, 82, 85, 87,
+ 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57, 57, 57, 56, 56, 56,
+ 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90, 94, 95, 98, 102,
+ 103, 108, 108, 115, 115, 65, 62, 62, 60, 59, 59, 59, 59, 58, 58, 63, 63,
+ 67, 68, 71, 73, 76, 79, 81, 85, 85, 91, 92, 96, 98, 101, 105, 106, 111,
+ 111, 118, 118, 73, 70, 69, 67, 66, 66, 65, 65, 64, 64, 69, 69, 73, 74,
+ 77, 79, 81, 85, 86, 91, 91, 98, 99, 103, 105, 108, 112, 114, 119, 119,
+ 127, 127, 79, 75, 75, 73, 72, 71, 71, 70, 69, 69, 73, 73, 77, 78, 81,
+ 84, 86, 90, 91, 96, 96, 103, 103, 108, 110, 114, 118, 120, 125, 125,
+ 133, 133,
+ /* Size 4x16 */
+ 31, 34, 44, 65, 32, 34, 43, 62, 32, 33, 41, 59, 32, 35, 43, 59, 32, 37,
+ 43, 58, 34, 39, 48, 63, 34, 42, 53, 67, 36, 44, 57, 71, 39, 46, 60, 76,
+ 42, 48, 64, 81, 45, 51, 67, 85, 50, 54, 72, 92, 54, 58, 76, 98, 60, 63,
+ 80, 105, 66, 68, 85, 111, 73, 74, 91, 118,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 34, 34, 36, 39, 42, 45, 50, 54, 60, 66, 73, 34, 34,
+ 33, 35, 37, 39, 42, 44, 46, 48, 51, 54, 58, 63, 68, 74, 44, 43, 41, 43,
+ 43, 48, 53, 57, 60, 64, 67, 72, 76, 80, 85, 91, 65, 62, 59, 59, 58, 63,
+ 67, 71, 76, 81, 85, 92, 98, 105, 111, 118,
+ /* Size 8x32 */
+ 32, 31, 32, 36, 44, 53, 62, 73, 31, 32, 32, 35, 42, 51, 60, 70, 31, 32,
+ 32, 35, 42, 51, 59, 69, 31, 32, 32, 35, 41, 50, 58, 67, 31, 32, 33, 34,
+ 41, 49, 57, 66, 31, 32, 33, 35, 41, 49, 57, 66, 32, 32, 34, 36, 42, 50,
+ 57, 65, 32, 32, 34, 37, 42, 49, 56, 65, 32, 33, 35, 38, 42, 49, 56, 64,
+ 32, 33, 35, 39, 43, 50, 56, 64, 34, 34, 37, 42, 48, 54, 61, 69, 34, 34,
+ 37, 42, 48, 54, 61, 69, 35, 34, 38, 47, 52, 59, 65, 73, 36, 34, 38, 48,
+ 54, 60, 66, 74, 38, 36, 40, 49, 56, 63, 69, 77, 39, 37, 40, 50, 58, 65,
+ 71, 79, 41, 39, 41, 51, 60, 67, 74, 81, 44, 41, 43, 53, 63, 71, 78, 85,
+ 44, 42, 43, 54, 64, 72, 79, 86, 48, 45, 46, 56, 67, 76, 83, 91, 48, 45,
+ 46, 56, 67, 76, 83, 91, 53, 49, 49, 59, 71, 81, 89, 98, 53, 49, 50, 60,
+ 71, 82, 90, 99, 57, 52, 52, 62, 74, 85, 94, 103, 58, 54, 54, 63, 75, 87,
+ 95, 105, 61, 57, 56, 66, 77, 89, 98, 108, 65, 60, 58, 68, 79, 92, 102,
+ 112, 67, 61, 60, 69, 81, 94, 103, 114, 71, 65, 63, 73, 84, 97, 108, 119,
+ 72, 66, 64, 73, 85, 98, 108, 119, 79, 72, 70, 79, 90, 104, 115, 127, 79,
+ 72, 70, 79, 90, 104, 115, 127,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 38, 39, 41, 44,
+ 44, 48, 48, 53, 53, 57, 58, 61, 65, 67, 71, 72, 79, 79, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 36, 37, 39, 41, 42, 45, 45, 49,
+ 49, 52, 54, 57, 60, 61, 65, 66, 72, 72, 32, 32, 32, 32, 33, 33, 34, 34,
+ 35, 35, 37, 37, 38, 38, 40, 40, 41, 43, 43, 46, 46, 49, 50, 52, 54, 56,
+ 58, 60, 63, 64, 70, 70, 36, 35, 35, 35, 34, 35, 36, 37, 38, 39, 42, 42,
+ 47, 48, 49, 50, 51, 53, 54, 56, 56, 59, 60, 62, 63, 66, 68, 69, 73, 73,
+ 79, 79, 44, 42, 42, 41, 41, 41, 42, 42, 42, 43, 48, 48, 52, 54, 56, 58,
+ 60, 63, 64, 67, 67, 71, 71, 74, 75, 77, 79, 81, 84, 85, 90, 90, 53, 51,
+ 51, 50, 49, 49, 50, 49, 49, 50, 54, 54, 59, 60, 63, 65, 67, 71, 72, 76,
+ 76, 81, 82, 85, 87, 89, 92, 94, 97, 98, 104, 104, 62, 60, 59, 58, 57,
+ 57, 57, 56, 56, 56, 61, 61, 65, 66, 69, 71, 74, 78, 79, 83, 83, 89, 90,
+ 94, 95, 98, 102, 103, 108, 108, 115, 115, 73, 70, 69, 67, 66, 66, 65,
+ 65, 64, 64, 69, 69, 73, 74, 77, 79, 81, 85, 86, 91, 91, 98, 99, 103,
+ 105, 108, 112, 114, 119, 119, 127, 127 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 42, 47, 53, 42, 48, 50, 54, 47, 50, 61, 67, 53, 54, 67, 78,
+ /* Size 8x8 */
+ 31, 32, 38, 48, 47, 50, 53, 57, 32, 35, 42, 47, 45, 47, 50, 54, 38, 42,
+ 47, 48, 45, 47, 49, 52, 48, 47, 48, 53, 53, 54, 56, 58, 47, 45, 45, 53,
+ 58, 61, 63, 65, 50, 47, 47, 54, 61, 66, 69, 72, 53, 50, 49, 56, 63, 69,
+ 73, 77, 57, 54, 52, 58, 65, 72, 77, 82,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 36, 41, 47, 49, 49, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 31, 34, 38, 42, 46, 47, 47, 47, 48, 50, 52, 54, 57, 60, 30, 31, 32, 35,
+ 40, 42, 45, 46, 45, 45, 46, 47, 49, 52, 54, 57, 33, 34, 35, 39, 43, 45,
+ 47, 46, 46, 45, 46, 47, 49, 51, 53, 56, 36, 38, 40, 43, 47, 47, 47, 47,
+ 46, 45, 46, 47, 48, 50, 52, 54, 41, 42, 42, 45, 47, 48, 50, 50, 49, 49,
+ 50, 50, 52, 53, 55, 57, 47, 46, 45, 47, 47, 50, 52, 52, 52, 52, 53, 53,
+ 55, 56, 58, 60, 49, 47, 46, 46, 47, 50, 52, 53, 54, 55, 55, 56, 57, 58,
+ 60, 62, 49, 47, 45, 46, 46, 49, 52, 54, 55, 57, 58, 59, 60, 61, 63, 65,
+ 49, 47, 45, 45, 45, 49, 52, 55, 57, 59, 60, 61, 63, 64, 66, 68, 50, 48,
+ 46, 46, 46, 50, 53, 55, 58, 60, 61, 63, 65, 67, 68, 71, 52, 50, 47, 47,
+ 47, 50, 53, 56, 59, 61, 63, 66, 68, 70, 72, 75, 54, 52, 49, 49, 48, 52,
+ 55, 57, 60, 63, 65, 68, 71, 73, 75, 78, 57, 54, 52, 51, 50, 53, 56, 58,
+ 61, 64, 67, 70, 73, 76, 79, 82, 60, 57, 54, 53, 52, 55, 58, 60, 63, 66,
+ 68, 72, 75, 79, 82, 85, 63, 60, 57, 56, 54, 57, 60, 62, 65, 68, 71, 75,
+ 78, 82, 85, 89,
+ /* Size 32x32 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 36, 37, 41, 41, 47, 49, 49, 48, 49, 49,
+ 49, 50, 50, 52, 52, 54, 54, 56, 57, 58, 60, 60, 63, 63, 31, 31, 31, 31,
+ 31, 32, 34, 35, 38, 38, 42, 42, 46, 48, 47, 47, 47, 47, 47, 48, 48, 50,
+ 50, 51, 52, 53, 54, 55, 57, 57, 60, 60, 31, 31, 31, 31, 31, 33, 34, 35,
+ 38, 39, 42, 42, 46, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 51, 52, 53,
+ 54, 55, 57, 57, 60, 60, 30, 31, 31, 31, 31, 33, 35, 36, 39, 40, 42, 42,
+ 46, 47, 46, 46, 46, 45, 46, 47, 47, 48, 48, 50, 50, 51, 52, 53, 55, 55,
+ 58, 58, 30, 31, 31, 31, 32, 33, 35, 36, 40, 40, 42, 42, 45, 46, 46, 45,
+ 45, 45, 45, 46, 46, 47, 47, 49, 49, 51, 52, 52, 54, 54, 57, 57, 32, 32,
+ 33, 33, 33, 35, 37, 38, 41, 42, 43, 43, 46, 47, 46, 46, 45, 45, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 52, 54, 54, 57, 57, 33, 34, 34, 35, 35, 37,
+ 39, 40, 43, 43, 45, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 49,
+ 49, 50, 51, 52, 53, 54, 56, 56, 34, 35, 35, 36, 36, 38, 40, 41, 44, 44,
+ 45, 45, 47, 47, 47, 46, 46, 45, 45, 46, 46, 47, 47, 48, 49, 50, 51, 51,
+ 53, 53, 55, 55, 36, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 47, 48,
+ 47, 46, 46, 45, 45, 46, 46, 46, 47, 48, 48, 49, 50, 50, 52, 52, 54, 54,
+ 37, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47, 46, 45,
+ 46, 46, 46, 47, 47, 48, 48, 49, 50, 51, 52, 52, 55, 55, 41, 42, 42, 42,
+ 42, 43, 45, 45, 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50,
+ 50, 51, 52, 52, 53, 54, 55, 55, 57, 57, 41, 42, 42, 42, 42, 43, 45, 45,
+ 47, 47, 48, 48, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 51, 52, 52,
+ 53, 54, 55, 55, 57, 57, 47, 46, 46, 46, 45, 46, 47, 47, 47, 48, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 54, 55, 55, 56, 56, 58, 58,
+ 60, 60, 49, 48, 47, 47, 46, 47, 47, 47, 48, 48, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 58, 60, 60, 49, 47,
+ 47, 46, 46, 46, 46, 47, 47, 47, 50, 50, 52, 53, 53, 54, 54, 55, 55, 55,
+ 55, 56, 56, 57, 57, 58, 58, 59, 60, 60, 62, 62, 48, 47, 47, 46, 45, 46,
+ 46, 46, 46, 47, 49, 49, 52, 53, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58,
+ 58, 59, 60, 60, 61, 62, 63, 63, 49, 47, 47, 46, 45, 45, 46, 46, 46, 46,
+ 49, 49, 52, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62,
+ 63, 63, 65, 65, 49, 47, 47, 45, 45, 45, 45, 45, 45, 45, 49, 49, 52, 53,
+ 55, 55, 57, 58, 59, 60, 60, 61, 61, 62, 62, 63, 63, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 45, 45, 45, 46, 49, 49, 52, 53, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 50, 48, 48, 47,
+ 46, 46, 46, 46, 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63,
+ 63, 65, 65, 66, 67, 67, 68, 69, 71, 71, 50, 48, 48, 47, 46, 46, 46, 46,
+ 46, 46, 50, 50, 53, 54, 55, 56, 58, 60, 60, 61, 61, 63, 63, 65, 65, 66,
+ 67, 67, 68, 69, 71, 71, 52, 50, 49, 48, 47, 47, 47, 47, 46, 47, 50, 50,
+ 53, 54, 56, 57, 59, 61, 61, 63, 63, 66, 66, 67, 68, 69, 70, 71, 72, 72,
+ 74, 74, 52, 50, 50, 48, 47, 47, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57,
+ 59, 61, 61, 63, 63, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 54, 51,
+ 51, 50, 49, 49, 49, 48, 48, 48, 51, 51, 54, 55, 57, 58, 60, 62, 62, 65,
+ 65, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 77, 54, 52, 52, 50, 49, 49,
+ 49, 49, 48, 48, 52, 52, 55, 55, 57, 58, 60, 62, 63, 65, 65, 68, 68, 70,
+ 71, 72, 73, 74, 75, 76, 78, 78, 56, 53, 53, 51, 51, 50, 50, 50, 49, 49,
+ 52, 52, 55, 56, 58, 59, 61, 63, 63, 66, 66, 69, 69, 71, 72, 73, 75, 75,
+ 77, 77, 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 56,
+ 58, 60, 61, 63, 64, 67, 67, 70, 70, 72, 73, 75, 76, 77, 79, 79, 82, 82,
+ 58, 55, 55, 53, 52, 52, 52, 51, 50, 51, 54, 54, 56, 57, 59, 60, 62, 64,
+ 65, 67, 67, 71, 71, 73, 74, 75, 77, 78, 80, 80, 83, 83, 60, 57, 57, 55,
+ 54, 54, 53, 53, 52, 52, 55, 55, 58, 58, 60, 61, 63, 65, 66, 68, 68, 72,
+ 72, 74, 75, 77, 79, 80, 82, 82, 85, 85, 60, 57, 57, 55, 54, 54, 54, 53,
+ 52, 52, 55, 55, 58, 58, 60, 62, 63, 65, 66, 69, 69, 72, 73, 75, 76, 77,
+ 79, 80, 82, 82, 85, 85, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57,
+ 60, 60, 62, 63, 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85,
+ 89, 89, 63, 60, 60, 58, 57, 57, 56, 55, 54, 55, 57, 57, 60, 60, 62, 63,
+ 65, 67, 68, 71, 71, 74, 75, 77, 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 4x8 */
+ 31, 42, 47, 54, 33, 44, 45, 51, 40, 47, 46, 50, 47, 50, 54, 57, 45, 49,
+ 59, 64, 48, 50, 61, 70, 51, 52, 63, 75, 55, 55, 66, 79,
+ /* Size 8x4 */
+ 31, 33, 40, 47, 45, 48, 51, 55, 42, 44, 47, 50, 49, 50, 52, 55, 47, 45,
+ 46, 54, 59, 61, 63, 66, 54, 51, 50, 57, 64, 70, 75, 79,
+ /* Size 8x16 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 53, 57, 30, 32,
+ 40, 46, 45, 48, 51, 55, 33, 36, 43, 47, 46, 47, 50, 54, 37, 40, 47, 47,
+ 45, 47, 49, 52, 42, 43, 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53,
+ 55, 58, 48, 46, 47, 53, 55, 56, 58, 61, 48, 45, 46, 53, 57, 59, 61, 63,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 52, 48,
+ 47, 54, 61, 66, 70, 73, 54, 50, 49, 55, 62, 68, 72, 76, 57, 52, 50, 56,
+ 64, 70, 75, 79, 60, 54, 52, 58, 65, 72, 77, 82, 63, 57, 55, 60, 67, 75,
+ 80, 86,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 37, 42, 47, 48, 48, 49, 50, 52, 54, 57, 60, 63, 31, 31,
+ 32, 36, 40, 43, 46, 46, 45, 45, 46, 48, 50, 52, 54, 57, 37, 38, 40, 43,
+ 47, 47, 48, 47, 46, 46, 46, 47, 49, 50, 52, 55, 48, 47, 46, 47, 47, 50,
+ 52, 53, 53, 53, 54, 54, 55, 56, 58, 60, 49, 47, 45, 46, 45, 49, 53, 55,
+ 57, 58, 59, 61, 62, 64, 65, 67, 52, 50, 48, 47, 47, 50, 53, 56, 59, 62,
+ 64, 66, 68, 70, 72, 75, 56, 53, 51, 50, 49, 53, 55, 58, 61, 64, 66, 70,
+ 72, 75, 77, 80, 61, 57, 55, 54, 52, 56, 58, 61, 63, 66, 69, 73, 76, 79,
+ 82, 86,
+ /* Size 16x32 */
+ 32, 31, 31, 35, 37, 42, 48, 48, 49, 49, 52, 52, 56, 57, 61, 63, 31, 31,
+ 31, 36, 38, 42, 47, 47, 47, 47, 50, 50, 54, 54, 58, 60, 31, 31, 31, 36,
+ 38, 42, 47, 47, 47, 47, 50, 50, 53, 54, 57, 60, 30, 32, 32, 37, 39, 42,
+ 46, 46, 46, 46, 48, 48, 52, 52, 56, 58, 30, 32, 32, 37, 40, 42, 46, 46,
+ 45, 45, 48, 48, 51, 52, 55, 57, 32, 33, 34, 39, 41, 44, 46, 46, 45, 45,
+ 48, 48, 51, 51, 54, 57, 33, 35, 36, 40, 43, 45, 47, 46, 46, 46, 47, 47,
+ 50, 51, 54, 56, 34, 37, 37, 42, 44, 45, 47, 47, 45, 46, 47, 47, 50, 51,
+ 53, 55, 37, 40, 40, 45, 47, 47, 47, 47, 45, 46, 47, 47, 49, 50, 52, 54,
+ 37, 40, 40, 45, 47, 47, 48, 47, 46, 46, 47, 47, 49, 50, 53, 55, 42, 43,
+ 43, 46, 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 42, 43, 43, 46,
+ 47, 48, 50, 50, 49, 49, 50, 50, 53, 53, 56, 57, 47, 46, 46, 47, 48, 50,
+ 52, 52, 53, 53, 53, 53, 55, 56, 58, 60, 49, 47, 46, 47, 48, 50, 53, 53,
+ 53, 54, 54, 54, 56, 57, 59, 60, 48, 46, 46, 47, 47, 50, 53, 53, 55, 55,
+ 56, 56, 58, 58, 61, 62, 48, 46, 46, 46, 47, 50, 53, 54, 56, 56, 57, 57,
+ 59, 60, 62, 64, 48, 46, 45, 46, 46, 49, 53, 54, 57, 57, 59, 59, 61, 61,
+ 63, 65, 49, 45, 45, 45, 46, 49, 53, 55, 58, 59, 61, 61, 63, 64, 66, 67,
+ 49, 46, 45, 46, 46, 49, 53, 55, 58, 59, 62, 62, 64, 64, 66, 68, 50, 47,
+ 46, 46, 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 50, 47, 46, 46,
+ 46, 50, 54, 55, 59, 60, 64, 64, 66, 67, 69, 71, 52, 48, 48, 47, 47, 50,
+ 54, 56, 61, 61, 66, 66, 69, 70, 72, 74, 52, 48, 48, 47, 47, 50, 54, 56,
+ 61, 61, 66, 66, 70, 71, 73, 75, 53, 50, 49, 48, 48, 51, 55, 57, 62, 62,
+ 68, 68, 71, 72, 75, 77, 54, 50, 50, 49, 49, 52, 55, 57, 62, 63, 68, 68,
+ 72, 73, 76, 78, 55, 51, 51, 50, 49, 52, 56, 58, 63, 63, 69, 69, 74, 75,
+ 78, 80, 57, 52, 52, 51, 50, 53, 56, 58, 64, 64, 70, 70, 75, 76, 79, 82,
+ 58, 53, 53, 51, 51, 54, 57, 59, 64, 65, 71, 71, 76, 77, 80, 83, 60, 55,
+ 54, 53, 52, 55, 58, 60, 65, 66, 72, 72, 77, 79, 82, 85, 60, 55, 55, 53,
+ 53, 55, 59, 60, 65, 66, 73, 73, 78, 79, 83, 85, 63, 58, 57, 56, 55, 58,
+ 60, 62, 67, 68, 75, 75, 80, 82, 86, 89, 63, 58, 57, 56, 55, 58, 60, 62,
+ 67, 68, 75, 75, 80, 82, 86, 89,
+ /* Size 32x16 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 33, 35, 37, 40, 40, 43, 43, 46, 47, 46, 46, 46, 45, 46, 47, 47, 48,
+ 48, 50, 50, 51, 52, 53, 55, 55, 58, 58, 31, 31, 31, 32, 32, 34, 36, 37,
+ 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48, 48, 49, 50, 51,
+ 52, 53, 54, 55, 57, 57, 35, 36, 36, 37, 37, 39, 40, 42, 45, 45, 46, 46,
+ 47, 47, 47, 46, 46, 45, 46, 46, 46, 47, 47, 48, 49, 50, 51, 51, 53, 53,
+ 56, 56, 37, 38, 38, 39, 40, 41, 43, 44, 47, 47, 47, 47, 48, 48, 47, 47,
+ 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53, 55, 55, 42, 42,
+ 42, 42, 42, 44, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 49, 49, 50,
+ 50, 50, 50, 51, 52, 52, 53, 54, 55, 55, 58, 58, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 48, 50, 50, 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55,
+ 55, 56, 56, 57, 58, 59, 60, 60, 48, 47, 47, 46, 46, 46, 46, 47, 47, 47,
+ 50, 50, 52, 53, 53, 54, 54, 55, 55, 55, 55, 56, 56, 57, 57, 58, 58, 59,
+ 60, 60, 62, 62, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53,
+ 55, 56, 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 49, 49, 53, 54, 55, 56, 57, 59,
+ 59, 60, 60, 61, 61, 62, 63, 63, 64, 65, 66, 66, 68, 68, 52, 50, 50, 48,
+ 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66,
+ 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 52, 50, 50, 48, 48, 48, 47, 47,
+ 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64, 64, 66, 66, 68, 68, 69,
+ 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51, 50, 50, 49, 49, 53, 53,
+ 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71, 72, 74, 75, 76, 77, 78,
+ 80, 80, 57, 54, 54, 52, 52, 51, 51, 51, 50, 50, 53, 53, 56, 57, 58, 60,
+ 61, 64, 64, 67, 67, 70, 71, 72, 73, 75, 76, 77, 79, 79, 82, 82, 61, 58,
+ 57, 56, 55, 54, 54, 53, 52, 53, 56, 56, 58, 59, 61, 62, 63, 66, 66, 69,
+ 69, 72, 73, 75, 76, 78, 79, 80, 82, 83, 86, 86, 63, 60, 60, 58, 57, 57,
+ 56, 55, 54, 55, 57, 57, 60, 60, 62, 64, 65, 67, 68, 71, 71, 74, 75, 77,
+ 78, 80, 82, 83, 85, 85, 89, 89,
+ /* Size 4x16 */
+ 31, 42, 49, 57, 31, 42, 47, 54, 32, 42, 45, 52, 35, 45, 46, 51, 40, 47,
+ 46, 50, 43, 48, 49, 53, 46, 50, 53, 56, 46, 50, 55, 58, 46, 49, 57, 61,
+ 46, 49, 59, 64, 47, 50, 60, 67, 48, 50, 61, 71, 50, 52, 63, 73, 52, 53,
+ 64, 76, 55, 55, 66, 79, 58, 58, 68, 82,
+ /* Size 16x4 */
+ 31, 31, 32, 35, 40, 43, 46, 46, 46, 46, 47, 48, 50, 52, 55, 58, 42, 42,
+ 42, 45, 47, 48, 50, 50, 49, 49, 50, 50, 52, 53, 55, 58, 49, 47, 45, 46,
+ 46, 49, 53, 55, 57, 59, 60, 61, 63, 64, 66, 68, 57, 54, 52, 51, 50, 53,
+ 56, 58, 61, 64, 67, 71, 73, 76, 79, 82,
+ /* Size 8x32 */
+ 32, 31, 37, 48, 49, 52, 56, 61, 31, 31, 38, 47, 47, 50, 54, 58, 31, 31,
+ 38, 47, 47, 50, 53, 57, 30, 32, 39, 46, 46, 48, 52, 56, 30, 32, 40, 46,
+ 45, 48, 51, 55, 32, 34, 41, 46, 45, 48, 51, 54, 33, 36, 43, 47, 46, 47,
+ 50, 54, 34, 37, 44, 47, 45, 47, 50, 53, 37, 40, 47, 47, 45, 47, 49, 52,
+ 37, 40, 47, 48, 46, 47, 49, 53, 42, 43, 47, 50, 49, 50, 53, 56, 42, 43,
+ 47, 50, 49, 50, 53, 56, 47, 46, 48, 52, 53, 53, 55, 58, 49, 46, 48, 53,
+ 53, 54, 56, 59, 48, 46, 47, 53, 55, 56, 58, 61, 48, 46, 47, 53, 56, 57,
+ 59, 62, 48, 45, 46, 53, 57, 59, 61, 63, 49, 45, 46, 53, 58, 61, 63, 66,
+ 49, 45, 46, 53, 58, 62, 64, 66, 50, 46, 46, 54, 59, 64, 66, 69, 50, 46,
+ 46, 54, 59, 64, 66, 69, 52, 48, 47, 54, 61, 66, 69, 72, 52, 48, 47, 54,
+ 61, 66, 70, 73, 53, 49, 48, 55, 62, 68, 71, 75, 54, 50, 49, 55, 62, 68,
+ 72, 76, 55, 51, 49, 56, 63, 69, 74, 78, 57, 52, 50, 56, 64, 70, 75, 79,
+ 58, 53, 51, 57, 64, 71, 76, 80, 60, 54, 52, 58, 65, 72, 77, 82, 60, 55,
+ 53, 59, 65, 73, 78, 83, 63, 57, 55, 60, 67, 75, 80, 86, 63, 57, 55, 60,
+ 67, 75, 80, 86,
+ /* Size 32x8 */
+ 32, 31, 31, 30, 30, 32, 33, 34, 37, 37, 42, 42, 47, 49, 48, 48, 48, 49,
+ 49, 50, 50, 52, 52, 53, 54, 55, 57, 58, 60, 60, 63, 63, 31, 31, 31, 32,
+ 32, 34, 36, 37, 40, 40, 43, 43, 46, 46, 46, 46, 45, 45, 45, 46, 46, 48,
+ 48, 49, 50, 51, 52, 53, 54, 55, 57, 57, 37, 38, 38, 39, 40, 41, 43, 44,
+ 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49,
+ 50, 51, 52, 53, 55, 55, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 56, 56, 57, 58, 59,
+ 60, 60, 49, 47, 47, 46, 45, 45, 46, 45, 45, 46, 49, 49, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 61, 61, 62, 62, 63, 64, 64, 65, 65, 67, 67, 52, 50,
+ 50, 48, 48, 48, 47, 47, 47, 47, 50, 50, 53, 54, 56, 57, 59, 61, 62, 64,
+ 64, 66, 66, 68, 68, 69, 70, 71, 72, 73, 75, 75, 56, 54, 53, 52, 51, 51,
+ 50, 50, 49, 49, 53, 53, 55, 56, 58, 59, 61, 63, 64, 66, 66, 69, 70, 71,
+ 72, 74, 75, 76, 77, 78, 80, 80, 61, 58, 57, 56, 55, 54, 54, 53, 52, 53,
+ 56, 56, 58, 59, 61, 62, 63, 66, 66, 69, 69, 72, 73, 75, 76, 78, 79, 80,
+ 82, 83, 86, 86 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 33, 42, 55, 33, 38, 46, 57, 42, 46, 63, 75, 55, 57, 75, 92,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 38, 46, 52, 63, 32, 32, 32, 34, 37, 44, 49, 59, 32, 32,
+ 35, 37, 40, 45, 49, 58, 34, 34, 37, 42, 47, 52, 56, 65, 38, 37, 40, 47,
+ 54, 60, 65, 73, 46, 44, 45, 52, 60, 69, 75, 84, 52, 49, 49, 56, 65, 75,
+ 82, 92, 63, 59, 58, 65, 73, 84, 92, 105,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 54, 58, 61, 65, 31, 32,
+ 32, 32, 32, 32, 34, 35, 38, 40, 42, 46, 51, 55, 58, 62, 31, 32, 32, 32,
+ 32, 32, 33, 34, 37, 38, 41, 44, 49, 53, 56, 59, 31, 32, 32, 33, 33, 33,
+ 35, 36, 38, 40, 42, 45, 49, 53, 56, 59, 32, 32, 32, 33, 34, 34, 36, 37,
+ 39, 40, 42, 45, 49, 53, 55, 59, 32, 32, 32, 33, 34, 35, 37, 38, 40, 41,
+ 42, 46, 49, 52, 55, 58, 34, 34, 33, 35, 36, 37, 39, 42, 44, 46, 47, 51,
+ 54, 57, 60, 63, 36, 35, 34, 36, 37, 38, 42, 48, 50, 52, 54, 57, 60, 63,
+ 65, 68, 38, 38, 37, 38, 39, 40, 44, 50, 52, 54, 57, 60, 64, 67, 69, 72,
+ 41, 40, 38, 40, 40, 41, 46, 52, 54, 57, 60, 63, 67, 70, 73, 75, 44, 42,
+ 41, 42, 42, 42, 47, 54, 57, 60, 63, 67, 71, 74, 77, 79, 48, 46, 44, 45,
+ 45, 46, 51, 57, 60, 63, 67, 71, 76, 79, 82, 85, 54, 51, 49, 49, 49, 49,
+ 54, 60, 64, 67, 71, 76, 82, 86, 89, 92, 58, 55, 53, 53, 53, 52, 57, 63,
+ 67, 70, 74, 79, 86, 90, 93, 97, 61, 58, 56, 56, 55, 55, 60, 65, 69, 73,
+ 77, 82, 89, 93, 97, 101, 65, 62, 59, 59, 59, 58, 63, 68, 72, 75, 79, 85,
+ 92, 97, 101, 105,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 54, 54, 58, 59, 61, 65, 65, 70, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 46,
+ 47, 49, 52, 52, 56, 57, 59, 63, 63, 67, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 35, 38, 38, 40, 42, 42, 45, 46, 48, 51, 51,
+ 55, 56, 58, 62, 62, 67, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 35, 35, 37, 38, 39, 42, 42, 45, 45, 47, 50, 50, 54, 55, 57, 61,
+ 61, 65, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 37, 37, 38, 41, 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 37, 38, 41,
+ 41, 44, 44, 46, 49, 49, 53, 54, 56, 59, 59, 64, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47,
+ 49, 49, 53, 54, 56, 59, 59, 63, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 35, 35, 36, 36, 36, 38, 39, 40, 42, 42, 45, 45, 47, 50, 50, 53, 54,
+ 56, 59, 59, 63, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36,
+ 37, 37, 39, 39, 40, 42, 42, 45, 45, 47, 49, 49, 53, 54, 55, 59, 59, 63,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40,
+ 41, 42, 42, 45, 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 42, 42, 45,
+ 46, 47, 49, 49, 52, 53, 55, 58, 58, 62, 33, 33, 33, 33, 33, 33, 34, 35,
+ 35, 36, 36, 38, 39, 40, 42, 42, 43, 44, 45, 46, 46, 49, 50, 51, 53, 53,
+ 56, 57, 59, 62, 62, 66, 34, 34, 34, 34, 33, 33, 35, 35, 36, 37, 37, 39,
+ 39, 41, 42, 42, 44, 45, 46, 47, 47, 50, 51, 52, 54, 54, 57, 58, 60, 63,
+ 63, 67, 34, 34, 34, 34, 34, 34, 35, 36, 36, 37, 37, 40, 41, 42, 45, 45,
+ 46, 47, 48, 50, 50, 52, 53, 54, 56, 56, 59, 60, 62, 65, 65, 69, 36, 35,
+ 35, 35, 34, 34, 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54,
+ 54, 56, 57, 58, 60, 60, 63, 64, 65, 68, 68, 72, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 42, 42, 45, 48, 48, 50, 50, 52, 54, 54, 56, 57, 58,
+ 60, 60, 63, 64, 65, 68, 68, 72, 38, 38, 38, 37, 37, 37, 38, 38, 39, 40,
+ 40, 43, 44, 46, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68,
+ 69, 72, 72, 76, 39, 38, 38, 38, 37, 37, 39, 39, 39, 40, 40, 44, 45, 47,
+ 50, 50, 53, 54, 55, 58, 58, 60, 61, 62, 65, 65, 68, 69, 70, 73, 73, 77,
+ 41, 40, 40, 39, 38, 38, 40, 40, 40, 41, 41, 45, 46, 48, 52, 52, 54, 55,
+ 57, 60, 60, 62, 63, 65, 67, 67, 70, 71, 73, 75, 75, 79, 44, 42, 42, 42,
+ 41, 41, 42, 42, 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 42, 41, 41, 42, 42,
+ 42, 42, 42, 46, 47, 50, 54, 54, 57, 58, 60, 63, 63, 66, 67, 68, 71, 71,
+ 74, 75, 77, 79, 79, 83, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 49,
+ 50, 52, 56, 56, 59, 60, 62, 66, 66, 69, 70, 72, 75, 75, 78, 79, 81, 84,
+ 84, 88, 48, 47, 46, 45, 44, 44, 45, 45, 45, 46, 46, 50, 51, 53, 57, 57,
+ 60, 61, 63, 67, 67, 70, 71, 73, 76, 76, 79, 80, 82, 85, 85, 89, 50, 49,
+ 48, 47, 46, 46, 47, 47, 47, 47, 47, 51, 52, 54, 58, 58, 61, 62, 65, 68,
+ 68, 72, 73, 75, 78, 78, 82, 83, 85, 88, 88, 92, 54, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 54, 52, 51, 50, 49, 49, 49, 50, 49, 49,
+ 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87,
+ 89, 92, 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59,
+ 63, 63, 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101,
+ 59, 57, 56, 55, 54, 54, 54, 54, 54, 53, 53, 57, 58, 60, 64, 64, 68, 69,
+ 71, 75, 75, 79, 80, 83, 87, 87, 91, 92, 94, 98, 98, 102, 61, 59, 58, 57,
+ 56, 56, 56, 56, 55, 55, 55, 59, 60, 62, 65, 65, 69, 70, 73, 77, 77, 81,
+ 82, 85, 89, 89, 93, 94, 97, 101, 101, 105, 65, 63, 62, 61, 59, 59, 59,
+ 59, 59, 58, 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92,
+ 92, 97, 98, 101, 105, 105, 109, 65, 63, 62, 61, 59, 59, 59, 59, 59, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 75, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 101, 105, 105, 109, 70, 67, 67, 65, 64, 64, 63, 63, 63, 62, 62, 66, 67,
+ 69, 72, 72, 76, 77, 79, 83, 83, 88, 89, 92, 96, 96, 101, 102, 105, 109,
+ 109, 114,
+ /* Size 4x8 */
+ 32, 32, 42, 56, 32, 33, 41, 53, 32, 35, 42, 52, 34, 37, 50, 59, 38, 40,
+ 58, 68, 44, 45, 66, 78, 50, 50, 71, 86, 61, 58, 79, 97,
+ /* Size 8x4 */
+ 32, 32, 32, 34, 38, 44, 50, 61, 32, 33, 35, 37, 40, 45, 50, 58, 42, 41,
+ 42, 50, 58, 66, 71, 79, 56, 53, 52, 59, 68, 78, 86, 97,
+ /* Size 8x16 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 51, 62, 31, 32,
+ 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42, 49, 59, 32, 32, 34, 36,
+ 39, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 34, 34, 37, 41, 44, 48,
+ 54, 63, 36, 34, 38, 46, 50, 54, 60, 68, 38, 37, 40, 47, 52, 57, 64, 72,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 48, 45,
+ 46, 54, 60, 67, 76, 85, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60,
+ 67, 74, 86, 97, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60, 58, 66, 72, 79,
+ 92, 105,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 32, 32, 34, 36, 38, 41, 44, 48, 53, 57, 61, 65, 31, 32,
+ 32, 32, 32, 33, 34, 34, 37, 39, 41, 45, 49, 53, 56, 60, 32, 32, 33, 34,
+ 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 35, 35, 34, 35, 36, 37,
+ 41, 46, 47, 49, 51, 54, 57, 60, 63, 66, 39, 38, 37, 38, 39, 40, 44, 50,
+ 52, 54, 57, 60, 64, 67, 69, 72, 44, 42, 41, 42, 42, 42, 48, 54, 57, 60,
+ 63, 67, 71, 74, 77, 79, 53, 51, 49, 49, 49, 49, 54, 60, 64, 67, 71, 76,
+ 82, 86, 89, 92, 65, 62, 59, 59, 58, 58, 63, 68, 72, 76, 79, 85, 92, 97,
+ 100, 105,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 39, 44, 44, 51, 53, 58, 65, 65, 31, 32,
+ 32, 32, 32, 32, 35, 35, 38, 42, 42, 49, 52, 56, 63, 63, 31, 32, 32, 32,
+ 32, 32, 35, 35, 38, 42, 42, 49, 51, 55, 62, 62, 31, 32, 32, 32, 32, 32,
+ 34, 35, 37, 41, 41, 48, 50, 54, 61, 61, 31, 32, 32, 32, 33, 33, 34, 34,
+ 37, 41, 41, 47, 49, 53, 59, 59, 31, 32, 32, 32, 33, 33, 34, 34, 37, 41,
+ 41, 47, 49, 53, 59, 59, 31, 32, 32, 33, 34, 34, 35, 36, 38, 42, 42, 48,
+ 49, 53, 59, 59, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 48, 50, 53,
+ 59, 59, 32, 32, 32, 33, 34, 34, 36, 37, 39, 42, 42, 48, 49, 53, 58, 58,
+ 32, 32, 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 32, 32,
+ 33, 34, 35, 35, 37, 38, 40, 42, 42, 48, 49, 52, 58, 58, 33, 33, 33, 35,
+ 36, 36, 40, 41, 43, 46, 46, 52, 53, 56, 62, 62, 34, 34, 34, 35, 37, 37,
+ 41, 42, 44, 48, 48, 53, 54, 57, 63, 63, 34, 34, 34, 35, 37, 37, 43, 44,
+ 46, 50, 50, 55, 56, 59, 65, 65, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54,
+ 54, 58, 60, 63, 68, 68, 36, 35, 34, 36, 38, 38, 46, 48, 50, 54, 54, 58,
+ 60, 63, 68, 68, 38, 37, 37, 38, 40, 40, 47, 50, 52, 57, 57, 62, 64, 67,
+ 72, 72, 39, 38, 37, 39, 40, 40, 48, 50, 53, 58, 58, 63, 65, 68, 73, 73,
+ 41, 39, 39, 40, 41, 41, 49, 51, 54, 60, 60, 66, 67, 70, 76, 76, 44, 41,
+ 41, 42, 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 44, 41, 41, 42,
+ 43, 43, 51, 53, 57, 63, 63, 69, 71, 74, 79, 79, 47, 44, 44, 44, 45, 45,
+ 53, 56, 59, 66, 66, 73, 75, 78, 84, 84, 48, 45, 45, 45, 46, 46, 54, 56,
+ 60, 67, 67, 74, 76, 79, 85, 85, 50, 47, 46, 47, 47, 47, 55, 58, 61, 68,
+ 68, 76, 78, 82, 88, 88, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79,
+ 82, 86, 92, 92, 53, 50, 49, 50, 50, 50, 57, 60, 64, 71, 71, 79, 82, 86,
+ 92, 92, 57, 54, 53, 53, 53, 53, 60, 63, 67, 74, 74, 83, 86, 90, 97, 97,
+ 58, 55, 54, 54, 54, 54, 61, 63, 68, 75, 75, 84, 87, 91, 98, 98, 61, 57,
+ 56, 56, 56, 56, 63, 65, 69, 77, 77, 86, 89, 93, 100, 100, 65, 61, 60,
+ 59, 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 65, 61, 60, 59,
+ 58, 58, 66, 68, 72, 79, 79, 89, 92, 97, 105, 105, 70, 65, 64, 63, 62,
+ 62, 70, 72, 76, 83, 83, 93, 96, 101, 109, 109,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 37, 38, 39, 41, 41, 44,
+ 45, 47, 50, 50, 54, 55, 57, 61, 61, 65, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44, 45, 46, 49, 49,
+ 53, 54, 56, 60, 60, 64, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35,
+ 35, 35, 36, 36, 38, 39, 40, 42, 42, 44, 45, 47, 50, 50, 53, 54, 56, 59,
+ 59, 63, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38,
+ 40, 40, 41, 43, 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 32, 32,
+ 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43,
+ 43, 45, 46, 47, 50, 50, 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34,
+ 35, 36, 36, 37, 37, 40, 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55,
+ 57, 57, 60, 61, 63, 66, 66, 70, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 41, 42, 44, 48, 48, 50, 50, 51, 53, 53, 56, 56, 58, 60, 60, 63, 63,
+ 65, 68, 68, 72, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46,
+ 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76,
+ 44, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58,
+ 60, 63, 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 44, 42, 42, 41,
+ 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63, 63, 66,
+ 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 51, 49, 49, 48, 47, 47, 48, 48,
+ 48, 48, 48, 52, 53, 55, 58, 58, 62, 63, 66, 69, 69, 73, 74, 76, 79, 79,
+ 83, 84, 86, 89, 89, 93, 53, 52, 51, 50, 49, 49, 49, 50, 49, 49, 49, 53,
+ 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78, 82, 82, 86, 87, 89, 92,
+ 92, 96, 58, 56, 55, 54, 53, 53, 53, 53, 53, 52, 52, 56, 57, 59, 63, 63,
+ 67, 68, 70, 74, 74, 78, 79, 82, 86, 86, 90, 91, 93, 97, 97, 101, 65, 63,
+ 62, 61, 59, 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79,
+ 79, 84, 85, 88, 92, 92, 97, 98, 100, 105, 105, 109, 65, 63, 62, 61, 59,
+ 59, 59, 59, 58, 58, 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85,
+ 88, 92, 92, 97, 98, 100, 105, 105, 109,
+ /* Size 4x16 */
+ 31, 32, 44, 58, 32, 32, 42, 55, 32, 33, 41, 53, 32, 34, 42, 53, 32, 34,
+ 42, 53, 32, 35, 42, 52, 34, 37, 48, 57, 35, 38, 54, 63, 37, 40, 57, 67,
+ 39, 41, 60, 70, 41, 43, 63, 74, 45, 46, 67, 79, 50, 50, 71, 86, 54, 53,
+ 74, 90, 57, 56, 77, 93, 61, 58, 79, 97,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 34, 35, 37, 39, 41, 45, 50, 54, 57, 61, 32, 32,
+ 33, 34, 34, 35, 37, 38, 40, 41, 43, 46, 50, 53, 56, 58, 44, 42, 41, 42,
+ 42, 42, 48, 54, 57, 60, 63, 67, 71, 74, 77, 79, 58, 55, 53, 53, 53, 52,
+ 57, 63, 67, 70, 74, 79, 86, 90, 93, 97,
+ /* Size 8x32 */
+ 32, 31, 32, 35, 39, 44, 53, 65, 31, 32, 32, 35, 38, 42, 52, 63, 31, 32,
+ 32, 35, 38, 42, 51, 62, 31, 32, 32, 34, 37, 41, 50, 61, 31, 32, 33, 34,
+ 37, 41, 49, 59, 31, 32, 33, 34, 37, 41, 49, 59, 31, 32, 34, 35, 38, 42,
+ 49, 59, 32, 32, 34, 36, 38, 42, 50, 59, 32, 32, 34, 36, 39, 42, 49, 58,
+ 32, 33, 35, 37, 40, 42, 49, 58, 32, 33, 35, 37, 40, 42, 49, 58, 33, 33,
+ 36, 40, 43, 46, 53, 62, 34, 34, 37, 41, 44, 48, 54, 63, 34, 34, 37, 43,
+ 46, 50, 56, 65, 36, 34, 38, 46, 50, 54, 60, 68, 36, 34, 38, 46, 50, 54,
+ 60, 68, 38, 37, 40, 47, 52, 57, 64, 72, 39, 37, 40, 48, 53, 58, 65, 73,
+ 41, 39, 41, 49, 54, 60, 67, 76, 44, 41, 43, 51, 57, 63, 71, 79, 44, 41,
+ 43, 51, 57, 63, 71, 79, 47, 44, 45, 53, 59, 66, 75, 84, 48, 45, 46, 54,
+ 60, 67, 76, 85, 50, 46, 47, 55, 61, 68, 78, 88, 53, 49, 50, 57, 64, 71,
+ 82, 92, 53, 49, 50, 57, 64, 71, 82, 92, 57, 53, 53, 60, 67, 74, 86, 97,
+ 58, 54, 54, 61, 68, 75, 87, 98, 61, 56, 56, 63, 69, 77, 89, 100, 65, 60,
+ 58, 66, 72, 79, 92, 105, 65, 60, 58, 66, 72, 79, 92, 105, 70, 64, 62,
+ 70, 76, 83, 96, 109,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 39,
+ 41, 44, 44, 47, 48, 50, 53, 53, 57, 58, 61, 65, 65, 70, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 37, 37, 39, 41, 41, 44,
+ 45, 46, 49, 49, 53, 54, 56, 60, 60, 64, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 40, 40, 41, 43, 43, 45, 46, 47, 50, 50,
+ 53, 54, 56, 58, 58, 62, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 47, 48, 49, 51, 51, 53, 54, 55, 57, 57, 60, 61, 63, 66,
+ 66, 70, 39, 38, 38, 37, 37, 37, 38, 38, 39, 40, 40, 43, 44, 46, 50, 50,
+ 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 67, 68, 69, 72, 72, 76, 44, 42,
+ 42, 41, 41, 41, 42, 42, 42, 42, 42, 46, 48, 50, 54, 54, 57, 58, 60, 63,
+ 63, 66, 67, 68, 71, 71, 74, 75, 77, 79, 79, 83, 53, 52, 51, 50, 49, 49,
+ 49, 50, 49, 49, 49, 53, 54, 56, 60, 60, 64, 65, 67, 71, 71, 75, 76, 78,
+ 82, 82, 86, 87, 89, 92, 92, 96, 65, 63, 62, 61, 59, 59, 59, 59, 58, 58,
+ 58, 62, 63, 65, 68, 68, 72, 73, 76, 79, 79, 84, 85, 88, 92, 92, 97, 98,
+ 100, 105, 105, 109 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 41, 46, 51, 41, 48, 48, 51, 46, 48, 58, 62, 51, 51, 62, 71,
+ /* Size 8x8 */
+ 31, 31, 38, 44, 47, 48, 50, 55, 31, 32, 40, 44, 45, 46, 47, 52, 38, 40,
+ 47, 47, 46, 46, 47, 50, 44, 44, 47, 50, 51, 51, 52, 54, 47, 45, 46, 51,
+ 54, 56, 57, 60, 48, 46, 46, 51, 56, 61, 63, 66, 50, 47, 47, 52, 57, 63,
+ 66, 70, 55, 52, 50, 54, 60, 66, 70, 76,
+ /* Size 16x16 */
+ 32, 31, 30, 33, 34, 36, 41, 49, 48, 49, 49, 50, 52, 54, 55, 57, 31, 31,
+ 31, 34, 36, 38, 42, 47, 47, 47, 47, 48, 50, 51, 53, 54, 30, 31, 32, 34,
+ 37, 40, 42, 46, 45, 45, 45, 46, 47, 49, 50, 52, 33, 34, 34, 37, 40, 42,
+ 44, 47, 46, 46, 45, 46, 47, 49, 50, 51, 34, 36, 37, 40, 42, 45, 46, 47,
+ 46, 46, 45, 46, 47, 48, 49, 50, 36, 38, 40, 42, 45, 47, 47, 48, 47, 46,
+ 45, 46, 47, 48, 49, 50, 41, 42, 42, 44, 46, 47, 48, 50, 50, 49, 49, 50,
+ 50, 51, 52, 53, 49, 47, 46, 47, 47, 48, 50, 53, 53, 53, 53, 54, 54, 55,
+ 56, 56, 48, 47, 45, 46, 46, 47, 50, 53, 54, 54, 55, 56, 57, 58, 58, 59,
+ 49, 47, 45, 46, 46, 46, 49, 53, 54, 55, 57, 58, 59, 60, 60, 61, 49, 47,
+ 45, 45, 45, 45, 49, 53, 55, 57, 58, 60, 61, 62, 63, 63, 50, 48, 46, 46,
+ 46, 46, 50, 54, 56, 58, 60, 61, 63, 65, 66, 67, 52, 50, 47, 47, 47, 47,
+ 50, 54, 57, 59, 61, 63, 66, 68, 69, 70, 54, 51, 49, 49, 48, 48, 51, 55,
+ 58, 60, 62, 65, 68, 70, 71, 73, 55, 53, 50, 50, 49, 49, 52, 56, 58, 60,
+ 63, 66, 69, 71, 73, 74, 57, 54, 52, 51, 50, 50, 53, 56, 59, 61, 63, 67,
+ 70, 73, 74, 76,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 48, 48,
+ 49, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 47, 47, 47, 47, 47, 48,
+ 49, 49, 50, 50, 52, 52, 53, 55, 55, 57, 31, 31, 31, 31, 31, 31, 34, 34,
+ 36, 38, 38, 41, 42, 44, 47, 47, 47, 47, 47, 47, 47, 48, 48, 49, 50, 50,
+ 51, 52, 53, 54, 54, 56, 31, 31, 31, 31, 31, 31, 34, 35, 36, 39, 39, 41,
+ 42, 44, 47, 47, 46, 46, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 52, 53,
+ 53, 55, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46,
+ 45, 45, 45, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 30, 31,
+ 31, 31, 32, 32, 34, 35, 37, 40, 40, 42, 42, 44, 46, 46, 45, 45, 45, 45,
+ 45, 46, 46, 47, 47, 47, 49, 49, 50, 52, 52, 54, 33, 33, 34, 34, 34, 34,
+ 37, 38, 40, 42, 42, 44, 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47,
+ 47, 47, 49, 49, 50, 51, 51, 53, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43,
+ 43, 44, 45, 46, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49,
+ 50, 51, 51, 53, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46,
+ 46, 45, 45, 46, 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 36, 38, 38, 39,
+ 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 46,
+ 46, 46, 47, 47, 48, 48, 49, 50, 50, 51, 40, 41, 41, 41, 42, 42, 44, 44,
+ 45, 47, 47, 48, 48, 49, 50, 50, 49, 49, 49, 48, 48, 49, 49, 49, 49, 49,
+ 51, 51, 51, 52, 52, 54, 41, 42, 42, 42, 42, 42, 44, 45, 46, 47, 47, 48,
+ 48, 49, 50, 50, 50, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 53,
+ 53, 55, 44, 44, 44, 44, 44, 44, 45, 46, 46, 47, 47, 49, 49, 50, 51, 51,
+ 51, 51, 51, 51, 51, 51, 51, 51, 52, 52, 53, 53, 54, 54, 54, 56, 49, 48,
+ 47, 47, 46, 46, 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 54, 54, 55, 55, 56, 56, 56, 58, 49, 48, 47, 47, 46, 46,
+ 47, 47, 47, 48, 48, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54,
+ 54, 54, 55, 55, 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47,
+ 47, 49, 50, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58,
+ 58, 59, 59, 60, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51,
+ 53, 53, 54, 54, 55, 55, 55, 56, 56, 57, 57, 57, 58, 58, 59, 60, 60, 61,
+ 49, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 49, 49, 51, 53, 53, 54, 55,
+ 55, 57, 57, 57, 58, 58, 59, 59, 60, 60, 60, 61, 61, 63, 49, 47, 47, 46,
+ 45, 45, 45, 45, 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59,
+ 60, 60, 61, 61, 62, 62, 63, 63, 63, 65, 49, 47, 47, 46, 45, 45, 45, 45,
+ 45, 45, 45, 48, 49, 51, 53, 53, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61,
+ 62, 62, 63, 63, 63, 65, 50, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49,
+ 50, 51, 54, 54, 56, 56, 57, 59, 59, 61, 61, 62, 63, 63, 64, 64, 65, 66,
+ 66, 67, 50, 49, 48, 47, 46, 46, 46, 46, 46, 46, 46, 49, 50, 51, 54, 54,
+ 56, 56, 58, 60, 60, 61, 61, 62, 63, 63, 65, 65, 66, 67, 67, 68, 51, 49,
+ 49, 48, 47, 47, 47, 47, 47, 46, 46, 49, 50, 51, 54, 54, 56, 57, 58, 60,
+ 60, 62, 62, 63, 65, 65, 66, 66, 67, 68, 68, 70, 52, 50, 50, 49, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 52, 50, 50, 49, 47, 47, 47, 47, 47, 47,
+ 47, 49, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 63, 65, 66, 66, 68, 68,
+ 69, 70, 70, 72, 54, 52, 51, 50, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53,
+ 55, 55, 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74,
+ 54, 52, 52, 51, 49, 49, 49, 49, 49, 48, 48, 51, 52, 53, 55, 55, 58, 58,
+ 60, 62, 62, 64, 65, 66, 68, 68, 70, 71, 72, 73, 73, 75, 55, 53, 53, 52,
+ 50, 50, 50, 50, 49, 49, 49, 51, 52, 54, 56, 56, 58, 59, 60, 63, 63, 65,
+ 66, 67, 69, 69, 71, 72, 73, 74, 74, 76, 57, 55, 54, 53, 52, 52, 51, 51,
+ 50, 50, 50, 52, 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70,
+ 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52, 51, 51, 50, 50, 50, 52,
+ 53, 54, 56, 56, 59, 60, 61, 63, 63, 66, 67, 68, 70, 70, 73, 73, 74, 76,
+ 76, 78, 59, 57, 56, 55, 54, 54, 53, 53, 52, 51, 51, 54, 55, 56, 58, 58,
+ 60, 61, 63, 65, 65, 67, 68, 70, 72, 72, 74, 75, 76, 78, 78, 80,
+ /* Size 4x8 */
+ 31, 38, 47, 52, 32, 40, 45, 49, 39, 47, 45, 48, 44, 47, 51, 53, 46, 47,
+ 56, 58, 47, 46, 59, 64, 48, 47, 61, 68, 53, 50, 64, 73,
+ /* Size 8x4 */
+ 31, 32, 39, 44, 46, 47, 48, 53, 38, 40, 47, 47, 47, 46, 47, 50, 47, 45,
+ 45, 51, 56, 59, 61, 64, 52, 49, 48, 53, 58, 64, 68, 73,
+ /* Size 8x16 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 54, 30, 32,
+ 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45, 47, 51, 35, 37, 44, 46,
+ 46, 45, 47, 51, 37, 40, 47, 47, 47, 45, 47, 50, 42, 43, 47, 49, 50, 49,
+ 50, 53, 49, 46, 48, 52, 53, 53, 54, 57, 48, 46, 47, 51, 54, 55, 57, 59,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 50, 46,
+ 46, 52, 56, 59, 64, 67, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54,
+ 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52, 50, 55, 59, 64,
+ 70, 76,
+ /* Size 16x8 */
+ 32, 31, 30, 33, 35, 37, 42, 49, 48, 48, 49, 50, 52, 54, 55, 57, 31, 31,
+ 32, 35, 37, 40, 43, 46, 46, 45, 45, 46, 48, 49, 51, 52, 37, 38, 40, 42,
+ 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 45, 45, 44, 46, 46, 47,
+ 49, 52, 51, 51, 51, 52, 53, 54, 54, 55, 48, 47, 45, 46, 46, 47, 50, 53,
+ 54, 54, 55, 56, 57, 58, 58, 59, 49, 47, 45, 45, 45, 45, 49, 53, 55, 57,
+ 58, 59, 61, 62, 63, 64, 52, 50, 48, 47, 47, 47, 50, 54, 57, 59, 61, 64,
+ 66, 68, 69, 70, 57, 54, 52, 51, 51, 50, 53, 57, 59, 61, 64, 67, 71, 73,
+ 74, 76,
+ /* Size 16x32 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 48, 49, 49, 51, 52, 54, 57, 57, 31, 31,
+ 31, 34, 38, 38, 45, 47, 47, 47, 47, 50, 50, 52, 55, 55, 31, 31, 31, 34,
+ 38, 38, 45, 47, 47, 47, 47, 49, 50, 51, 54, 54, 31, 31, 32, 34, 39, 39,
+ 45, 46, 46, 46, 46, 48, 49, 51, 53, 53, 30, 32, 32, 35, 40, 40, 44, 46,
+ 45, 45, 45, 47, 48, 49, 52, 52, 30, 32, 32, 35, 40, 40, 44, 46, 45, 45,
+ 45, 47, 48, 49, 52, 52, 33, 34, 35, 37, 42, 42, 46, 47, 46, 45, 45, 47,
+ 47, 49, 51, 51, 33, 35, 36, 38, 43, 43, 46, 47, 46, 46, 46, 47, 47, 49,
+ 51, 51, 35, 37, 37, 40, 44, 44, 46, 47, 46, 45, 45, 47, 47, 48, 51, 51,
+ 37, 39, 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 37, 39,
+ 40, 43, 47, 47, 47, 47, 47, 45, 45, 46, 47, 48, 50, 50, 41, 42, 42, 44,
+ 47, 47, 49, 49, 49, 48, 48, 49, 50, 51, 52, 52, 42, 42, 43, 44, 47, 47,
+ 49, 50, 50, 49, 49, 50, 50, 51, 53, 53, 44, 44, 44, 45, 47, 47, 50, 51,
+ 51, 51, 51, 52, 52, 53, 54, 54, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53,
+ 53, 54, 54, 55, 57, 57, 49, 47, 46, 47, 48, 48, 52, 53, 53, 53, 53, 54,
+ 54, 55, 57, 57, 48, 46, 46, 46, 47, 47, 51, 53, 54, 55, 55, 56, 57, 58,
+ 59, 59, 48, 46, 46, 46, 47, 47, 51, 53, 54, 56, 56, 57, 57, 58, 60, 60,
+ 48, 46, 45, 46, 46, 46, 51, 53, 54, 57, 57, 58, 59, 60, 61, 61, 49, 46,
+ 45, 45, 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 49, 46, 45, 45,
+ 46, 46, 51, 53, 55, 58, 58, 61, 61, 62, 64, 64, 50, 47, 46, 46, 46, 46,
+ 52, 54, 56, 59, 59, 62, 63, 64, 66, 66, 50, 47, 46, 46, 46, 46, 52, 54,
+ 56, 59, 59, 63, 64, 65, 67, 67, 51, 48, 47, 47, 47, 47, 52, 54, 56, 60,
+ 60, 64, 65, 66, 68, 68, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65,
+ 66, 68, 71, 71, 52, 48, 48, 47, 47, 47, 53, 54, 57, 61, 61, 65, 66, 68,
+ 71, 71, 54, 50, 49, 49, 48, 48, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73,
+ 54, 51, 50, 49, 49, 49, 54, 55, 58, 62, 62, 67, 68, 70, 73, 73, 55, 51,
+ 51, 50, 49, 49, 54, 56, 58, 63, 63, 68, 69, 71, 74, 74, 57, 53, 52, 51,
+ 50, 50, 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 57, 53, 52, 51, 50, 50,
+ 55, 56, 59, 64, 64, 69, 70, 73, 76, 76, 59, 55, 54, 53, 52, 52, 57, 58,
+ 61, 65, 65, 70, 72, 74, 78, 78,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 31,
+ 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 46, 46, 46, 46, 46, 47,
+ 47, 48, 48, 48, 50, 51, 51, 53, 53, 55, 31, 31, 31, 32, 32, 32, 35, 36,
+ 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48,
+ 49, 50, 51, 52, 52, 54, 33, 34, 34, 34, 35, 35, 37, 38, 40, 43, 43, 44,
+ 44, 45, 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 49, 49, 50, 51,
+ 51, 53, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48,
+ 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 37, 38,
+ 38, 39, 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46,
+ 46, 46, 46, 47, 47, 47, 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44,
+ 46, 46, 46, 47, 47, 49, 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52,
+ 53, 53, 54, 54, 54, 55, 55, 57, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47,
+ 47, 49, 50, 51, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 55, 55,
+ 56, 56, 56, 58, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51,
+ 53, 53, 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61,
+ 49, 47, 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56,
+ 57, 58, 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 49, 47, 47, 46,
+ 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58, 58, 59,
+ 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 51, 50, 49, 48, 47, 47, 47, 47,
+ 47, 46, 46, 49, 50, 52, 54, 54, 56, 57, 58, 61, 61, 62, 63, 64, 65, 65,
+ 67, 67, 68, 69, 69, 70, 52, 50, 50, 49, 48, 48, 47, 47, 47, 47, 47, 50,
+ 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65, 66, 66, 68, 68, 69, 70,
+ 70, 72, 54, 52, 51, 51, 49, 49, 49, 49, 48, 48, 48, 51, 51, 53, 55, 55,
+ 58, 58, 60, 62, 62, 64, 65, 66, 68, 68, 70, 70, 71, 73, 73, 74, 57, 55,
+ 54, 53, 52, 52, 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64,
+ 64, 66, 67, 68, 71, 71, 73, 73, 74, 76, 76, 78, 57, 55, 54, 53, 52, 52,
+ 51, 51, 51, 50, 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68,
+ 71, 71, 73, 73, 74, 76, 76, 78,
+ /* Size 4x16 */
+ 31, 37, 49, 54, 31, 38, 47, 51, 32, 40, 45, 49, 34, 42, 45, 49, 37, 44,
+ 45, 48, 39, 47, 45, 48, 42, 47, 49, 51, 47, 48, 53, 55, 46, 47, 55, 58,
+ 46, 46, 57, 60, 46, 46, 58, 62, 47, 46, 59, 65, 48, 47, 61, 68, 50, 48,
+ 62, 70, 51, 49, 63, 71, 53, 50, 64, 73,
+ /* Size 16x4 */
+ 31, 31, 32, 34, 37, 39, 42, 47, 46, 46, 46, 47, 48, 50, 51, 53, 37, 38,
+ 40, 42, 44, 47, 47, 48, 47, 46, 46, 46, 47, 48, 49, 50, 49, 47, 45, 45,
+ 45, 45, 49, 53, 55, 57, 58, 59, 61, 62, 63, 64, 54, 51, 49, 49, 48, 48,
+ 51, 55, 58, 60, 62, 65, 68, 70, 71, 73,
+ /* Size 8x32 */
+ 32, 31, 37, 45, 48, 49, 52, 57, 31, 31, 38, 45, 47, 47, 50, 55, 31, 31,
+ 38, 45, 47, 47, 50, 54, 31, 32, 39, 45, 46, 46, 49, 53, 30, 32, 40, 44,
+ 45, 45, 48, 52, 30, 32, 40, 44, 45, 45, 48, 52, 33, 35, 42, 46, 46, 45,
+ 47, 51, 33, 36, 43, 46, 46, 46, 47, 51, 35, 37, 44, 46, 46, 45, 47, 51,
+ 37, 40, 47, 47, 47, 45, 47, 50, 37, 40, 47, 47, 47, 45, 47, 50, 41, 42,
+ 47, 49, 49, 48, 50, 52, 42, 43, 47, 49, 50, 49, 50, 53, 44, 44, 47, 50,
+ 51, 51, 52, 54, 49, 46, 48, 52, 53, 53, 54, 57, 49, 46, 48, 52, 53, 53,
+ 54, 57, 48, 46, 47, 51, 54, 55, 57, 59, 48, 46, 47, 51, 54, 56, 57, 60,
+ 48, 45, 46, 51, 54, 57, 59, 61, 49, 45, 46, 51, 55, 58, 61, 64, 49, 45,
+ 46, 51, 55, 58, 61, 64, 50, 46, 46, 52, 56, 59, 63, 66, 50, 46, 46, 52,
+ 56, 59, 64, 67, 51, 47, 47, 52, 56, 60, 65, 68, 52, 48, 47, 53, 57, 61,
+ 66, 71, 52, 48, 47, 53, 57, 61, 66, 71, 54, 49, 48, 54, 58, 62, 68, 73,
+ 54, 50, 49, 54, 58, 62, 68, 73, 55, 51, 49, 54, 58, 63, 69, 74, 57, 52,
+ 50, 55, 59, 64, 70, 76, 57, 52, 50, 55, 59, 64, 70, 76, 59, 54, 52, 57,
+ 61, 65, 72, 78,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 48, 48,
+ 48, 49, 49, 50, 50, 51, 52, 52, 54, 54, 55, 57, 57, 59, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 46, 46, 45, 45, 45, 46,
+ 46, 47, 48, 48, 49, 50, 51, 52, 52, 54, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 50, 52, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 53, 53, 54, 54, 54, 55,
+ 55, 57, 48, 47, 47, 46, 45, 45, 46, 46, 46, 47, 47, 49, 50, 51, 53, 53,
+ 54, 54, 54, 55, 55, 56, 56, 56, 57, 57, 58, 58, 58, 59, 59, 61, 49, 47,
+ 47, 46, 45, 45, 45, 46, 45, 45, 45, 48, 49, 51, 53, 53, 55, 56, 57, 58,
+ 58, 59, 59, 60, 61, 61, 62, 62, 63, 64, 64, 65, 52, 50, 50, 49, 48, 48,
+ 47, 47, 47, 47, 47, 50, 50, 52, 54, 54, 57, 57, 59, 61, 61, 63, 64, 65,
+ 66, 66, 68, 68, 69, 70, 70, 72, 57, 55, 54, 53, 52, 52, 51, 51, 51, 50,
+ 50, 52, 53, 54, 57, 57, 59, 60, 61, 64, 64, 66, 67, 68, 71, 71, 73, 73,
+ 74, 76, 76, 78 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 38, 51, 32, 35, 40, 49, 38, 40, 54, 64, 51, 49, 64, 81,
+ /* Size 8x8 */
+ 31, 32, 32, 34, 35, 41, 47, 53, 32, 32, 32, 33, 34, 40, 44, 50, 32, 32,
+ 34, 35, 37, 41, 45, 51, 34, 33, 35, 39, 42, 47, 51, 55, 35, 34, 37, 42,
+ 48, 53, 57, 61, 41, 40, 41, 47, 53, 60, 65, 70, 47, 44, 45, 51, 57, 65,
+ 71, 77, 53, 50, 51, 55, 61, 70, 77, 85,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 32, 32, 34, 36, 38, 39, 44, 47, 49, 54, 59, 31, 32,
+ 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 56, 31, 32, 32, 32,
+ 32, 32, 33, 33, 34, 36, 37, 41, 44, 46, 50, 54, 31, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 38, 41, 44, 45, 49, 54, 31, 32, 32, 32, 33, 34, 34, 35,
+ 36, 38, 39, 42, 45, 46, 50, 54, 32, 32, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 42, 45, 46, 49, 53, 32, 33, 33, 33, 34, 36, 36, 38, 40, 41, 42, 44,
+ 47, 48, 51, 55, 34, 34, 33, 34, 35, 37, 38, 39, 42, 44, 45, 47, 50, 51,
+ 54, 58, 36, 35, 34, 35, 36, 38, 40, 42, 48, 50, 50, 54, 56, 57, 60, 64,
+ 38, 37, 36, 36, 38, 39, 41, 44, 50, 51, 52, 56, 58, 60, 63, 67, 39, 38,
+ 37, 38, 39, 40, 42, 45, 50, 52, 54, 58, 60, 62, 65, 69, 44, 42, 41, 41,
+ 42, 42, 44, 47, 54, 56, 58, 63, 66, 68, 71, 75, 47, 45, 44, 44, 45, 45,
+ 47, 50, 56, 58, 60, 66, 69, 71, 75, 79, 49, 47, 46, 45, 46, 46, 48, 51,
+ 57, 60, 62, 68, 71, 73, 77, 81, 54, 51, 50, 49, 50, 49, 51, 54, 60, 63,
+ 65, 71, 75, 77, 82, 87, 59, 56, 54, 54, 54, 53, 55, 58, 64, 67, 69, 75,
+ 79, 81, 87, 92,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 54, 55, 59, 59, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 39, 39, 41,
+ 43, 43, 46, 47, 48, 51, 52, 53, 57, 57, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 43, 45, 46,
+ 47, 51, 51, 53, 56, 56, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 41, 42, 42, 45, 46, 47, 51, 51, 52,
+ 56, 56, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34,
+ 34, 34, 36, 37, 37, 40, 41, 41, 44, 45, 46, 49, 50, 51, 54, 54, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 36, 37,
+ 37, 40, 41, 41, 44, 44, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 36, 38, 38, 40, 41, 41,
+ 44, 45, 45, 49, 49, 50, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 34, 35, 35, 35, 36, 36, 38, 39, 39, 41, 42, 42, 44, 45, 46, 49,
+ 50, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 36, 36, 36, 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 50, 51, 54, 54,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 39, 39, 41, 42, 42, 45, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42,
+ 42, 43, 45, 46, 46, 49, 49, 50, 53, 53, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 39, 40, 40, 42, 42, 43, 45, 46,
+ 46, 49, 49, 50, 53, 53, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 44, 45, 47, 47, 48, 51, 51, 52,
+ 55, 55, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41,
+ 42, 42, 44, 45, 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 34, 34,
+ 34, 34, 33, 33, 34, 35, 35, 35, 37, 37, 38, 39, 39, 41, 42, 42, 44, 45,
+ 45, 47, 47, 48, 50, 51, 51, 54, 54, 55, 58, 58, 35, 34, 34, 34, 34, 34,
+ 34, 35, 36, 36, 37, 37, 39, 41, 41, 43, 45, 45, 47, 47, 47, 49, 50, 51,
+ 53, 53, 54, 57, 57, 58, 61, 61, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 40, 42, 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59,
+ 60, 61, 64, 64, 36, 35, 35, 35, 34, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 50, 50, 50, 53, 54, 54, 56, 57, 57, 59, 60, 61, 64, 64,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 47, 50, 50,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 39, 39, 38, 38,
+ 37, 37, 38, 39, 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56,
+ 58, 58, 60, 61, 62, 64, 65, 66, 69, 69, 39, 39, 38, 38, 37, 37, 38, 39,
+ 39, 39, 40, 40, 42, 45, 45, 47, 50, 50, 52, 54, 54, 56, 58, 58, 60, 61,
+ 62, 64, 65, 66, 69, 69, 42, 41, 41, 41, 40, 40, 40, 41, 41, 41, 42, 42,
+ 44, 47, 47, 49, 53, 53, 55, 56, 56, 60, 61, 62, 64, 65, 66, 69, 69, 70,
+ 73, 73, 44, 43, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 47, 47, 50,
+ 54, 54, 56, 58, 58, 61, 63, 64, 66, 67, 68, 71, 71, 72, 75, 75, 44, 43,
+ 43, 42, 41, 41, 41, 42, 42, 42, 43, 43, 45, 48, 48, 51, 54, 54, 56, 58,
+ 58, 62, 64, 64, 66, 67, 68, 71, 72, 73, 76, 76, 47, 46, 45, 45, 44, 44,
+ 44, 44, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 48, 47, 46, 46, 45, 44, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 53, 57, 57, 59, 61, 61, 65, 67, 67, 70, 71, 72, 75,
+ 76, 77, 80, 80, 49, 48, 47, 47, 46, 45, 45, 46, 46, 46, 46, 46, 48, 51,
+ 51, 54, 57, 57, 60, 62, 62, 66, 68, 68, 71, 72, 73, 77, 77, 78, 81, 81,
+ 53, 51, 51, 51, 49, 49, 49, 49, 49, 49, 49, 49, 51, 54, 54, 57, 59, 59,
+ 62, 64, 64, 69, 71, 71, 74, 75, 77, 81, 81, 83, 86, 86, 54, 52, 51, 51,
+ 50, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69,
+ 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 55, 53, 53, 52, 51, 50, 50, 51,
+ 51, 51, 50, 50, 52, 55, 55, 58, 61, 61, 64, 66, 66, 70, 72, 73, 76, 77,
+ 78, 83, 83, 85, 88, 88, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53,
+ 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88,
+ 92, 92, 59, 57, 56, 56, 54, 54, 54, 54, 54, 54, 53, 53, 55, 58, 58, 61,
+ 64, 64, 67, 69, 69, 73, 75, 76, 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 4x8 */
+ 32, 32, 37, 52, 32, 33, 36, 49, 32, 34, 38, 49, 34, 37, 44, 54, 35, 38,
+ 49, 60, 40, 42, 55, 69, 46, 46, 59, 76, 52, 51, 64, 83,
+ /* Size 8x4 */
+ 32, 32, 32, 34, 35, 40, 46, 52, 32, 33, 34, 37, 38, 42, 46, 51, 37, 36,
+ 38, 44, 49, 55, 59, 64, 52, 49, 49, 54, 60, 69, 76, 83,
+ /* Size 8x16 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32,
+ 32, 33, 35, 41, 44, 49, 31, 32, 33, 33, 35, 41, 44, 49, 32, 32, 34, 34,
+ 36, 42, 45, 50, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44,
+ 47, 51, 34, 34, 36, 38, 42, 48, 50, 54, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 44, 41,
+ 42, 45, 53, 63, 66, 71, 47, 44, 45, 47, 56, 66, 69, 75, 49, 46, 47, 48,
+ 57, 67, 71, 77, 53, 49, 50, 51, 60, 71, 75, 82, 58, 54, 54, 55, 63, 75,
+ 79, 87,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 32, 32, 32, 34, 36, 38, 39, 44, 47, 49, 53, 58, 31, 32,
+ 32, 32, 32, 33, 33, 34, 34, 36, 37, 41, 44, 46, 49, 54, 32, 32, 32, 33,
+ 34, 35, 35, 36, 37, 39, 40, 42, 45, 47, 50, 54, 32, 33, 33, 33, 34, 36,
+ 36, 38, 40, 41, 42, 45, 47, 48, 51, 55, 36, 35, 35, 35, 36, 38, 40, 42,
+ 48, 49, 50, 53, 56, 57, 60, 63, 44, 42, 41, 41, 42, 42, 44, 48, 54, 56,
+ 58, 63, 66, 67, 71, 75, 47, 45, 44, 44, 45, 45, 47, 50, 56, 58, 60, 66,
+ 69, 71, 75, 79, 53, 51, 49, 49, 50, 49, 51, 54, 60, 63, 65, 71, 75, 77,
+ 82, 87,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 32, 32, 32, 35, 36, 38, 44, 44, 47, 53, 53, 59, 31, 32,
+ 32, 32, 32, 32, 33, 35, 35, 37, 43, 43, 46, 52, 52, 57, 31, 32, 32, 32,
+ 32, 32, 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32,
+ 33, 35, 35, 37, 42, 42, 45, 51, 51, 56, 31, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 32, 33, 33, 34, 34, 36,
+ 41, 41, 44, 49, 49, 54, 31, 32, 32, 32, 33, 33, 33, 35, 35, 36, 41, 41,
+ 44, 49, 49, 54, 32, 32, 32, 32, 33, 34, 34, 36, 36, 38, 42, 42, 45, 49,
+ 49, 54, 32, 32, 32, 33, 34, 34, 34, 36, 36, 38, 42, 42, 45, 50, 50, 54,
+ 32, 32, 32, 33, 34, 34, 35, 37, 37, 38, 42, 42, 45, 49, 49, 54, 32, 32,
+ 33, 33, 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 32, 33, 33,
+ 35, 35, 36, 38, 38, 39, 42, 42, 45, 49, 49, 53, 32, 33, 33, 33, 35, 36,
+ 36, 39, 40, 41, 44, 44, 47, 51, 51, 55, 34, 34, 34, 34, 36, 37, 38, 42,
+ 42, 44, 48, 48, 50, 54, 54, 58, 34, 34, 34, 34, 36, 37, 38, 42, 42, 44,
+ 48, 48, 50, 54, 54, 58, 35, 34, 34, 34, 37, 37, 39, 44, 45, 46, 50, 50,
+ 53, 57, 57, 61, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60,
+ 60, 64, 36, 35, 34, 35, 37, 38, 40, 47, 48, 49, 54, 54, 56, 60, 60, 64,
+ 38, 37, 36, 37, 39, 40, 41, 48, 49, 51, 56, 56, 58, 63, 63, 67, 39, 38,
+ 37, 38, 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 39, 38, 37, 38,
+ 40, 40, 42, 49, 50, 52, 58, 58, 60, 65, 65, 69, 42, 40, 40, 40, 42, 42,
+ 44, 51, 52, 55, 61, 61, 64, 69, 69, 73, 44, 42, 41, 41, 42, 43, 45, 52,
+ 53, 56, 63, 63, 66, 71, 71, 75, 44, 42, 41, 41, 43, 43, 45, 52, 54, 56,
+ 63, 63, 66, 72, 72, 76, 47, 45, 44, 44, 45, 45, 47, 54, 56, 58, 66, 66,
+ 69, 75, 75, 79, 48, 46, 45, 45, 46, 46, 48, 55, 56, 59, 67, 67, 70, 76,
+ 76, 80, 49, 47, 46, 46, 47, 47, 48, 56, 57, 60, 67, 67, 71, 77, 77, 81,
+ 53, 50, 49, 49, 49, 49, 51, 58, 59, 62, 71, 71, 74, 81, 81, 86, 53, 51,
+ 49, 49, 50, 50, 51, 59, 60, 63, 71, 71, 75, 82, 82, 87, 55, 52, 51, 51,
+ 51, 51, 53, 60, 61, 64, 72, 72, 76, 83, 83, 88, 58, 55, 54, 54, 54, 54,
+ 55, 62, 63, 67, 75, 75, 79, 87, 87, 92, 58, 55, 54, 54, 54, 54, 55, 62,
+ 63, 67, 75, 75, 79, 87, 87, 92,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 37, 38, 38, 40,
+ 42, 42, 45, 46, 47, 50, 51, 52, 55, 55, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40, 41, 41, 44, 45,
+ 46, 49, 49, 51, 54, 54, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 34, 34, 34, 35, 35, 37, 38, 38, 40, 41, 41, 44, 45, 46, 49, 49, 51,
+ 54, 54, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37,
+ 37, 37, 39, 40, 40, 42, 42, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 37, 38, 38, 40, 40,
+ 40, 42, 43, 43, 45, 46, 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 35, 36, 36, 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45,
+ 47, 48, 48, 51, 51, 53, 55, 55, 35, 35, 35, 35, 34, 34, 35, 36, 36, 37,
+ 38, 38, 39, 42, 42, 44, 47, 47, 48, 49, 49, 51, 52, 52, 54, 55, 56, 58,
+ 59, 60, 62, 62, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42,
+ 42, 45, 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63,
+ 38, 37, 37, 37, 36, 36, 36, 38, 38, 38, 39, 39, 41, 44, 44, 46, 49, 49,
+ 51, 52, 52, 55, 56, 56, 58, 59, 60, 62, 63, 64, 67, 67, 44, 43, 42, 42,
+ 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61,
+ 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 44, 43, 42, 42, 41, 41, 41, 42,
+ 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58, 58, 61, 63, 63, 66, 67,
+ 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44, 44, 45, 45, 45, 45, 45,
+ 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66, 69, 70, 71, 74, 75, 76,
+ 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57,
+ 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 53, 52,
+ 51, 51, 49, 49, 49, 49, 50, 49, 49, 49, 51, 54, 54, 57, 60, 60, 63, 65,
+ 65, 69, 71, 72, 75, 76, 77, 81, 82, 83, 87, 87, 59, 57, 56, 56, 54, 54,
+ 54, 54, 54, 54, 53, 53, 55, 58, 58, 61, 64, 64, 67, 69, 69, 73, 75, 76,
+ 79, 80, 81, 86, 87, 88, 92, 92,
+ /* Size 4x16 */
+ 31, 32, 38, 53, 32, 32, 37, 51, 32, 32, 36, 49, 32, 33, 36, 49, 32, 34,
+ 38, 50, 32, 35, 39, 49, 33, 36, 41, 51, 34, 37, 44, 54, 35, 38, 49, 60,
+ 37, 40, 51, 63, 38, 40, 52, 65, 42, 43, 56, 71, 45, 45, 58, 75, 47, 47,
+ 60, 77, 51, 50, 63, 82, 55, 54, 67, 87,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 33, 34, 35, 37, 38, 42, 45, 47, 51, 55, 32, 32,
+ 32, 33, 34, 35, 36, 37, 38, 40, 40, 43, 45, 47, 50, 54, 38, 37, 36, 36,
+ 38, 39, 41, 44, 49, 51, 52, 56, 58, 60, 63, 67, 53, 51, 49, 49, 50, 49,
+ 51, 54, 60, 63, 65, 71, 75, 77, 82, 87,
+ /* Size 8x32 */
+ 32, 31, 32, 32, 36, 44, 47, 53, 31, 32, 32, 33, 35, 43, 46, 52, 31, 32,
+ 32, 33, 35, 42, 45, 51, 31, 32, 32, 33, 35, 42, 45, 51, 31, 32, 32, 33,
+ 35, 41, 44, 49, 31, 32, 32, 33, 34, 41, 44, 49, 31, 32, 33, 33, 35, 41,
+ 44, 49, 32, 32, 33, 34, 36, 42, 45, 49, 32, 32, 34, 34, 36, 42, 45, 50,
+ 32, 32, 34, 35, 37, 42, 45, 49, 32, 33, 35, 36, 38, 42, 45, 49, 32, 33,
+ 35, 36, 38, 42, 45, 49, 32, 33, 35, 36, 40, 44, 47, 51, 34, 34, 36, 38,
+ 42, 48, 50, 54, 34, 34, 36, 38, 42, 48, 50, 54, 35, 34, 37, 39, 45, 50,
+ 53, 57, 36, 34, 37, 40, 48, 54, 56, 60, 36, 34, 37, 40, 48, 54, 56, 60,
+ 38, 36, 39, 41, 49, 56, 58, 63, 39, 37, 40, 42, 50, 58, 60, 65, 39, 37,
+ 40, 42, 50, 58, 60, 65, 42, 40, 42, 44, 52, 61, 64, 69, 44, 41, 42, 45,
+ 53, 63, 66, 71, 44, 41, 43, 45, 54, 63, 66, 72, 47, 44, 45, 47, 56, 66,
+ 69, 75, 48, 45, 46, 48, 56, 67, 70, 76, 49, 46, 47, 48, 57, 67, 71, 77,
+ 53, 49, 49, 51, 59, 71, 74, 81, 53, 49, 50, 51, 60, 71, 75, 82, 55, 51,
+ 51, 53, 61, 72, 76, 83, 58, 54, 54, 55, 63, 75, 79, 87, 58, 54, 54, 55,
+ 63, 75, 79, 87,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 35, 36, 36,
+ 38, 39, 39, 42, 44, 44, 47, 48, 49, 53, 53, 55, 58, 58, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 36, 37, 37, 40,
+ 41, 41, 44, 45, 46, 49, 49, 51, 54, 54, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 35, 35, 35, 36, 36, 37, 37, 37, 39, 40, 40, 42, 42, 43, 45, 46,
+ 47, 49, 50, 51, 54, 54, 32, 33, 33, 33, 33, 33, 33, 34, 34, 35, 36, 36,
+ 36, 38, 38, 39, 40, 40, 41, 42, 42, 44, 45, 45, 47, 48, 48, 51, 51, 53,
+ 55, 55, 36, 35, 35, 35, 35, 34, 35, 36, 36, 37, 38, 38, 40, 42, 42, 45,
+ 48, 48, 49, 50, 50, 52, 53, 54, 56, 56, 57, 59, 60, 61, 63, 63, 44, 43,
+ 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 44, 48, 48, 50, 54, 54, 56, 58,
+ 58, 61, 63, 63, 66, 67, 67, 71, 71, 72, 75, 75, 47, 46, 45, 45, 44, 44,
+ 44, 45, 45, 45, 45, 45, 47, 50, 50, 53, 56, 56, 58, 60, 60, 64, 66, 66,
+ 69, 70, 71, 74, 75, 76, 79, 79, 53, 52, 51, 51, 49, 49, 49, 49, 50, 49,
+ 49, 49, 51, 54, 54, 57, 60, 60, 63, 65, 65, 69, 71, 72, 75, 76, 77, 81,
+ 82, 83, 87, 87 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 38, 47, 49, 38, 47, 46, 46, 47, 46, 54, 57, 49, 46, 57, 66,
+ /* Size 8x8 */
+ 31, 31, 35, 42, 48, 47, 49, 51, 31, 32, 36, 42, 46, 45, 46, 48, 35, 36,
+ 41, 45, 47, 45, 46, 48, 42, 42, 45, 48, 50, 49, 50, 51, 48, 46, 47, 50,
+ 53, 53, 54, 54, 47, 45, 45, 49, 53, 57, 59, 60, 49, 46, 46, 50, 54, 59,
+ 61, 64, 51, 48, 48, 51, 54, 60, 64, 68,
+ /* Size 16x16 */
+ 32, 31, 30, 31, 33, 36, 38, 41, 49, 49, 48, 49, 50, 51, 52, 54, 31, 31,
+ 31, 32, 34, 38, 40, 42, 47, 47, 47, 47, 48, 48, 50, 52, 30, 31, 31, 32,
+ 35, 39, 41, 42, 46, 46, 46, 45, 46, 47, 48, 50, 31, 32, 32, 33, 36, 40,
+ 41, 43, 46, 46, 45, 45, 46, 46, 47, 49, 33, 34, 35, 36, 39, 43, 44, 45,
+ 47, 46, 46, 45, 46, 47, 47, 49, 36, 38, 39, 40, 43, 47, 47, 47, 48, 47,
+ 46, 45, 46, 46, 47, 48, 38, 40, 41, 41, 44, 47, 47, 48, 49, 48, 48, 47,
+ 47, 47, 48, 49, 41, 42, 42, 43, 45, 47, 48, 48, 50, 50, 49, 49, 50, 50,
+ 50, 52, 49, 47, 46, 46, 47, 48, 49, 50, 53, 53, 53, 53, 54, 54, 54, 55,
+ 49, 47, 46, 46, 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 48, 47,
+ 46, 45, 46, 46, 48, 49, 53, 54, 54, 55, 56, 56, 57, 58, 49, 47, 45, 45,
+ 45, 45, 47, 49, 53, 55, 55, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46,
+ 47, 50, 54, 55, 56, 59, 61, 61, 63, 64, 51, 48, 47, 46, 47, 46, 47, 50,
+ 54, 55, 56, 60, 61, 62, 64, 66, 52, 50, 48, 47, 47, 47, 48, 50, 54, 56,
+ 57, 61, 63, 64, 66, 68, 54, 52, 50, 49, 49, 48, 49, 52, 55, 57, 58, 62,
+ 64, 66, 68, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 36, 36, 38, 41, 41, 45, 49, 49,
+ 49, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 31, 34, 34, 35, 38, 38, 39, 42, 42, 45, 48, 48, 47, 47, 47, 47,
+ 47, 47, 49, 49, 49, 50, 50, 51, 53, 53, 31, 31, 31, 31, 31, 31, 32, 34,
+ 34, 35, 38, 38, 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 52, 52, 31, 31, 31, 31, 31, 31, 32, 34, 34, 36, 38, 38,
+ 40, 42, 42, 45, 47, 47, 47, 47, 47, 47, 46, 47, 48, 48, 48, 49, 49, 50,
+ 52, 52, 30, 31, 31, 31, 31, 31, 32, 35, 35, 36, 39, 39, 41, 42, 42, 44,
+ 46, 46, 46, 46, 46, 45, 45, 45, 46, 47, 47, 48, 48, 48, 50, 50, 30, 31,
+ 31, 31, 31, 32, 32, 35, 35, 36, 40, 40, 41, 42, 42, 44, 46, 46, 46, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 31, 31, 32, 32, 32, 32,
+ 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 35, 38, 38, 40,
+ 43, 43, 43, 44, 44, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47,
+ 47, 48, 49, 49, 33, 34, 34, 34, 35, 35, 36, 38, 39, 40, 43, 43, 44, 45,
+ 45, 46, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 48, 49, 49,
+ 34, 35, 35, 36, 36, 36, 37, 40, 40, 41, 44, 44, 45, 45, 45, 46, 47, 47,
+ 47, 46, 46, 45, 45, 45, 46, 46, 46, 47, 47, 48, 49, 49, 36, 38, 38, 38,
+ 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45,
+ 45, 45, 46, 46, 46, 46, 47, 47, 48, 48, 36, 38, 38, 38, 39, 40, 40, 43,
+ 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 47, 47, 48, 48, 38, 39, 40, 40, 41, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 48, 48, 48,
+ 49, 49, 41, 42, 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 49, 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 41, 42,
+ 42, 42, 42, 42, 43, 44, 45, 45, 47, 47, 48, 48, 48, 49, 50, 50, 50, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 50, 51, 52, 52, 45, 45, 45, 45, 44, 44,
+ 44, 46, 46, 46, 47, 47, 48, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 52, 52, 52, 52, 52, 52, 53, 53, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47,
+ 48, 48, 49, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54,
+ 54, 54, 55, 55, 49, 48, 47, 47, 46, 46, 46, 47, 47, 47, 48, 48, 49, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 49, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 48, 47, 47, 47,
+ 46, 45, 45, 46, 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55,
+ 55, 56, 56, 56, 56, 57, 57, 58, 58, 58, 48, 47, 47, 47, 46, 45, 45, 46,
+ 46, 46, 46, 46, 48, 49, 49, 51, 53, 53, 54, 54, 54, 55, 55, 56, 56, 56,
+ 56, 57, 57, 58, 58, 58, 49, 47, 47, 47, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 51, 53, 53, 54, 55, 55, 57, 57, 58, 58, 59, 59, 60, 60, 60,
+ 61, 61, 49, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51,
+ 53, 53, 55, 55, 55, 57, 58, 58, 59, 60, 60, 61, 61, 61, 62, 62, 49, 47,
+ 47, 47, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 58, 58, 59, 59, 60, 60, 61, 61, 62, 63, 63, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 59,
+ 61, 61, 61, 63, 63, 63, 64, 64, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 61, 62, 63,
+ 63, 64, 65, 65, 51, 49, 48, 48, 47, 46, 46, 47, 47, 46, 46, 46, 47, 50,
+ 50, 52, 54, 54, 55, 56, 56, 59, 60, 60, 61, 62, 62, 64, 64, 64, 66, 66,
+ 52, 50, 49, 49, 48, 47, 47, 47, 47, 47, 46, 46, 48, 50, 50, 52, 54, 54,
+ 56, 57, 57, 60, 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 52, 50, 50, 49,
+ 48, 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60,
+ 61, 61, 63, 63, 64, 66, 66, 67, 68, 68, 53, 51, 50, 50, 48, 48, 48, 48,
+ 48, 48, 47, 47, 48, 51, 51, 52, 54, 54, 56, 58, 58, 60, 61, 62, 63, 64,
+ 64, 67, 67, 68, 69, 69, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48,
+ 49, 52, 52, 53, 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69,
+ 71, 71, 54, 53, 52, 52, 50, 49, 49, 49, 49, 49, 48, 48, 49, 52, 52, 53,
+ 55, 55, 57, 58, 58, 61, 62, 63, 64, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 4x8 */
+ 31, 38, 47, 50, 31, 40, 46, 48, 36, 44, 47, 47, 42, 47, 50, 50, 47, 48,
+ 53, 54, 46, 46, 54, 60, 48, 46, 55, 64, 50, 48, 56, 67,
+ /* Size 8x4 */
+ 31, 31, 36, 42, 47, 46, 48, 50, 38, 40, 44, 47, 48, 46, 46, 48, 47, 46,
+ 47, 50, 53, 54, 55, 56, 50, 48, 47, 50, 54, 60, 64, 67,
+ /* Size 8x16 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32,
+ 38, 40, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45, 46, 48, 33, 36, 41, 44,
+ 47, 46, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47,
+ 47, 48, 42, 43, 46, 48, 50, 49, 50, 50, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 49, 45,
+ 45, 47, 53, 58, 59, 61, 50, 46, 46, 48, 54, 59, 61, 63, 51, 47, 47, 48,
+ 54, 60, 61, 64, 52, 48, 47, 48, 54, 61, 63, 66, 54, 50, 49, 50, 55, 62,
+ 65, 68,
+ /* Size 16x8 */
+ 32, 31, 30, 31, 33, 37, 39, 42, 49, 48, 48, 49, 50, 51, 52, 54, 31, 31,
+ 32, 33, 36, 40, 41, 43, 46, 46, 46, 45, 46, 47, 48, 50, 35, 37, 38, 38,
+ 41, 45, 46, 46, 48, 47, 46, 45, 46, 47, 47, 49, 38, 40, 40, 41, 44, 47,
+ 47, 48, 49, 48, 48, 47, 48, 48, 48, 50, 48, 47, 46, 46, 47, 47, 48, 50,
+ 53, 53, 53, 53, 54, 54, 54, 55, 49, 47, 45, 45, 46, 45, 47, 49, 53, 55,
+ 56, 58, 59, 60, 61, 62, 50, 48, 46, 46, 46, 46, 47, 50, 54, 55, 56, 59,
+ 61, 61, 63, 65, 52, 50, 48, 48, 47, 47, 48, 50, 54, 56, 57, 61, 63, 64,
+ 66, 68,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 35, 37, 38, 47, 48, 48, 49, 49, 50, 52, 52, 54, 31, 31,
+ 31, 32, 36, 38, 39, 46, 47, 47, 48, 48, 49, 50, 50, 53, 31, 31, 31, 32,
+ 37, 38, 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 31, 31, 31, 32, 37, 38,
+ 40, 46, 47, 47, 47, 47, 48, 50, 50, 52, 30, 31, 32, 32, 38, 39, 40, 45,
+ 46, 46, 45, 45, 46, 48, 48, 50, 30, 31, 32, 33, 38, 40, 41, 45, 46, 46,
+ 45, 45, 46, 48, 48, 50, 31, 32, 33, 33, 38, 40, 41, 45, 46, 46, 45, 45,
+ 46, 48, 48, 50, 33, 35, 35, 36, 41, 43, 43, 46, 47, 46, 45, 45, 46, 47,
+ 47, 49, 33, 35, 36, 36, 41, 43, 44, 46, 47, 46, 46, 46, 46, 47, 47, 49,
+ 34, 36, 37, 37, 42, 44, 45, 47, 47, 47, 45, 45, 46, 47, 47, 49, 37, 39,
+ 40, 41, 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 37, 39, 40, 41,
+ 45, 47, 47, 47, 47, 47, 45, 45, 46, 47, 47, 48, 39, 40, 41, 42, 46, 47,
+ 47, 48, 48, 48, 47, 47, 47, 48, 48, 50, 42, 42, 43, 43, 46, 47, 48, 50,
+ 50, 50, 49, 49, 50, 50, 50, 52, 42, 42, 43, 43, 46, 47, 48, 50, 50, 50,
+ 49, 49, 50, 50, 50, 52, 45, 45, 44, 45, 47, 47, 48, 51, 51, 51, 51, 51,
+ 52, 52, 52, 54, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54,
+ 54, 55, 49, 47, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 54, 54, 54, 55,
+ 48, 47, 46, 46, 47, 47, 48, 52, 53, 53, 55, 55, 55, 56, 56, 57, 48, 46,
+ 46, 46, 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 48, 46, 46, 46,
+ 46, 47, 48, 52, 53, 54, 56, 56, 56, 57, 57, 59, 49, 46, 45, 45, 46, 46,
+ 47, 52, 53, 54, 57, 57, 58, 60, 60, 61, 49, 46, 45, 45, 45, 46, 47, 52,
+ 53, 55, 58, 58, 59, 61, 61, 62, 49, 46, 45, 45, 46, 46, 47, 52, 53, 55,
+ 58, 58, 60, 61, 61, 63, 50, 47, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59,
+ 61, 63, 63, 65, 50, 48, 46, 46, 46, 46, 48, 53, 54, 55, 59, 59, 61, 64,
+ 64, 65, 51, 48, 47, 47, 47, 47, 48, 53, 54, 55, 60, 60, 61, 64, 64, 66,
+ 52, 49, 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 52, 49,
+ 48, 48, 47, 47, 48, 53, 54, 56, 61, 61, 63, 66, 66, 68, 53, 50, 48, 48,
+ 48, 48, 49, 54, 54, 56, 61, 61, 63, 67, 67, 69, 54, 51, 50, 50, 49, 49,
+ 50, 55, 55, 57, 62, 62, 65, 68, 68, 71, 54, 51, 50, 50, 49, 49, 50, 55,
+ 55, 57, 62, 62, 65, 68, 68, 71,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 31, 31, 32, 35, 35, 36, 39, 39, 40, 42, 42, 45, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 48, 48, 49, 49, 50, 51, 51, 31, 31, 31, 31, 32, 32, 33, 35,
+ 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46,
+ 47, 48, 48, 48, 50, 50, 31, 32, 32, 32, 32, 33, 33, 36, 36, 37, 41, 41,
+ 42, 43, 43, 45, 47, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47, 48, 48, 48,
+ 50, 50, 35, 36, 37, 37, 38, 38, 38, 41, 41, 42, 45, 45, 46, 46, 46, 47,
+ 48, 48, 47, 46, 46, 46, 45, 46, 46, 46, 47, 47, 47, 48, 49, 49, 37, 38,
+ 38, 38, 39, 40, 40, 43, 43, 44, 47, 47, 47, 47, 47, 47, 48, 48, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41,
+ 41, 43, 44, 45, 47, 47, 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47,
+ 48, 48, 48, 48, 48, 49, 50, 50, 47, 46, 46, 46, 45, 45, 45, 46, 46, 47,
+ 47, 47, 48, 50, 50, 51, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53,
+ 53, 54, 55, 55, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 48, 50, 50, 51, 53, 53,
+ 53, 54, 54, 54, 55, 55, 55, 55, 55, 56, 56, 56, 57, 57, 49, 48, 47, 47,
+ 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57,
+ 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 49, 48, 47, 47, 45, 45, 45, 45,
+ 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56, 56, 57, 58, 58, 59, 59,
+ 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46, 46, 46, 46, 46, 46, 46,
+ 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60, 61, 61, 61, 63, 63, 63,
+ 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52,
+ 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 52, 50,
+ 50, 50, 48, 48, 48, 47, 47, 47, 47, 47, 48, 50, 50, 52, 54, 54, 56, 57,
+ 57, 60, 61, 61, 63, 64, 64, 66, 66, 67, 68, 68, 54, 53, 52, 52, 50, 50,
+ 50, 49, 49, 49, 48, 48, 50, 52, 52, 54, 55, 55, 57, 59, 59, 61, 62, 63,
+ 65, 65, 66, 68, 68, 69, 71, 71,
+ /* Size 4x16 */
+ 31, 37, 48, 52, 31, 38, 47, 50, 31, 39, 46, 48, 32, 40, 46, 48, 35, 43,
+ 46, 47, 39, 47, 47, 47, 40, 47, 48, 48, 42, 47, 50, 50, 47, 48, 53, 54,
+ 47, 47, 53, 56, 46, 47, 54, 57, 46, 46, 55, 61, 47, 46, 55, 63, 48, 47,
+ 55, 64, 49, 47, 56, 66, 51, 49, 57, 68,
+ /* Size 16x4 */
+ 31, 31, 31, 32, 35, 39, 40, 42, 47, 47, 46, 46, 47, 48, 49, 51, 37, 38,
+ 39, 40, 43, 47, 47, 47, 48, 47, 47, 46, 46, 47, 47, 49, 48, 47, 46, 46,
+ 46, 47, 48, 50, 53, 53, 54, 55, 55, 55, 56, 57, 52, 50, 48, 48, 47, 47,
+ 48, 50, 54, 56, 57, 61, 63, 64, 66, 68,
+ /* Size 8x32 */
+ 32, 31, 35, 38, 48, 49, 50, 52, 31, 31, 36, 39, 47, 48, 49, 50, 31, 31,
+ 37, 40, 47, 47, 48, 50, 31, 31, 37, 40, 47, 47, 48, 50, 30, 32, 38, 40,
+ 46, 45, 46, 48, 30, 32, 38, 41, 46, 45, 46, 48, 31, 33, 38, 41, 46, 45,
+ 46, 48, 33, 35, 41, 43, 47, 45, 46, 47, 33, 36, 41, 44, 47, 46, 46, 47,
+ 34, 37, 42, 45, 47, 45, 46, 47, 37, 40, 45, 47, 47, 45, 46, 47, 37, 40,
+ 45, 47, 47, 45, 46, 47, 39, 41, 46, 47, 48, 47, 47, 48, 42, 43, 46, 48,
+ 50, 49, 50, 50, 42, 43, 46, 48, 50, 49, 50, 50, 45, 44, 47, 48, 51, 51,
+ 52, 52, 49, 46, 48, 49, 53, 53, 54, 54, 49, 46, 48, 49, 53, 53, 54, 54,
+ 48, 46, 47, 48, 53, 55, 55, 56, 48, 46, 46, 48, 53, 56, 56, 57, 48, 46,
+ 46, 48, 53, 56, 56, 57, 49, 45, 46, 47, 53, 57, 58, 60, 49, 45, 45, 47,
+ 53, 58, 59, 61, 49, 45, 46, 47, 53, 58, 60, 61, 50, 46, 46, 48, 54, 59,
+ 61, 63, 50, 46, 46, 48, 54, 59, 61, 64, 51, 47, 47, 48, 54, 60, 61, 64,
+ 52, 48, 47, 48, 54, 61, 63, 66, 52, 48, 47, 48, 54, 61, 63, 66, 53, 48,
+ 48, 49, 54, 61, 63, 67, 54, 50, 49, 50, 55, 62, 65, 68, 54, 50, 49, 50,
+ 55, 62, 65, 68,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 30, 30, 31, 33, 33, 34, 37, 37, 39, 42, 42, 45, 49, 49,
+ 48, 48, 48, 49, 49, 49, 50, 50, 51, 52, 52, 53, 54, 54, 31, 31, 31, 31,
+ 32, 32, 33, 35, 36, 37, 40, 40, 41, 43, 43, 44, 46, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 47, 48, 48, 48, 50, 50, 35, 36, 37, 37, 38, 38, 38, 41,
+ 41, 42, 45, 45, 46, 46, 46, 47, 48, 48, 47, 46, 46, 46, 45, 46, 46, 46,
+ 47, 47, 47, 48, 49, 49, 38, 39, 40, 40, 40, 41, 41, 43, 44, 45, 47, 47,
+ 47, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 48, 48, 48, 48, 48, 49,
+ 50, 50, 48, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 54, 54, 55, 55, 49, 48,
+ 47, 47, 45, 45, 45, 45, 46, 45, 45, 45, 47, 49, 49, 51, 53, 53, 55, 56,
+ 56, 57, 58, 58, 59, 59, 60, 61, 61, 61, 62, 62, 50, 49, 48, 48, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 52, 54, 54, 55, 56, 56, 58, 59, 60,
+ 61, 61, 61, 63, 63, 63, 65, 65, 52, 50, 50, 50, 48, 48, 48, 47, 47, 47,
+ 47, 47, 48, 50, 50, 52, 54, 54, 56, 57, 57, 60, 61, 61, 63, 64, 64, 66,
+ 66, 67, 68, 68 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 35, 43, 32, 34, 37, 43, 35, 37, 48, 54, 43, 43, 54, 65,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 34, 37, 43, 47, 31, 32, 32, 32, 34, 36, 41, 44, 32, 32,
+ 33, 34, 35, 38, 42, 45, 32, 32, 34, 35, 37, 39, 42, 46, 34, 34, 35, 37,
+ 41, 45, 49, 52, 37, 36, 38, 39, 45, 51, 56, 59, 43, 41, 42, 42, 49, 56,
+ 63, 67, 47, 44, 45, 46, 52, 59, 67, 71,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 35, 38, 39, 42, 45, 45, 31, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 38, 41, 44, 44, 31, 32, 32, 32, 33, 33, 33, 34,
+ 35, 36, 36, 39, 40, 42, 44, 45, 31, 32, 32, 32, 33, 33, 34, 34, 35, 36,
+ 36, 39, 40, 42, 45, 45, 32, 32, 32, 32, 33, 34, 35, 36, 37, 38, 38, 40,
+ 41, 42, 45, 46, 32, 33, 33, 33, 34, 34, 36, 36, 38, 39, 40, 42, 43, 44,
+ 47, 47, 34, 34, 34, 33, 35, 35, 37, 38, 39, 42, 42, 45, 46, 47, 50, 51,
+ 35, 35, 34, 34, 36, 36, 38, 39, 42, 46, 47, 49, 50, 52, 55, 55, 36, 35,
+ 35, 34, 36, 36, 38, 40, 42, 47, 48, 50, 52, 54, 56, 57, 39, 38, 38, 37,
+ 39, 39, 40, 42, 45, 49, 50, 54, 55, 58, 60, 61, 41, 40, 39, 38, 40, 40,
+ 41, 43, 46, 50, 52, 55, 57, 60, 62, 63, 44, 42, 42, 41, 42, 42, 42, 44,
+ 47, 52, 54, 58, 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55,
+ 56, 60, 62, 66, 69, 70, 48, 46, 45, 44, 45, 45, 46, 47, 51, 55, 57, 61,
+ 63, 67, 70, 71,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 45, 47, 48, 48, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 39, 39, 40, 43, 43, 44, 46, 47, 47, 50, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42,
+ 42, 43, 45, 46, 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 40, 42, 42, 43, 45, 46,
+ 46, 49, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 35, 36, 38, 38, 39, 42, 42, 42, 45, 45, 45, 48, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 38, 41, 41, 41, 44, 44, 44, 47, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 38, 41, 41, 41, 44, 44, 44, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 36, 38, 38, 39, 41, 41, 42,
+ 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35,
+ 36, 36, 36, 38, 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 38,
+ 39, 39, 40, 42, 42, 42, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 39, 40, 40, 41, 42,
+ 42, 43, 45, 45, 45, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 48, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 46, 46, 48, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36, 36, 38, 38, 38, 39, 40,
+ 40, 41, 42, 42, 43, 44, 44, 45, 47, 47, 47, 50, 34, 34, 34, 34, 34, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45,
+ 46, 47, 47, 48, 50, 51, 51, 53, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35,
+ 35, 36, 37, 37, 38, 39, 39, 40, 42, 42, 42, 44, 45, 45, 46, 47, 47, 48,
+ 50, 51, 51, 53, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 38, 40, 40, 41, 43, 44, 44, 45, 46, 46, 47, 49, 49, 49, 51, 52, 52, 54,
+ 35, 35, 35, 35, 34, 34, 34, 34, 36, 36, 36, 37, 38, 38, 39, 42, 42, 43,
+ 46, 47, 47, 48, 49, 49, 50, 52, 52, 53, 55, 55, 55, 57, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50,
+ 50, 50, 52, 54, 54, 54, 56, 57, 57, 58, 36, 35, 35, 35, 35, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 40, 42, 42, 44, 47, 48, 48, 50, 50, 50, 52, 54,
+ 54, 54, 56, 57, 57, 58, 38, 37, 37, 37, 36, 36, 36, 36, 37, 38, 38, 39,
+ 39, 39, 41, 44, 44, 45, 48, 50, 50, 51, 52, 52, 54, 56, 56, 57, 58, 59,
+ 59, 61, 39, 39, 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45,
+ 45, 46, 49, 50, 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 39, 39,
+ 38, 38, 38, 37, 37, 38, 39, 39, 39, 40, 40, 40, 42, 45, 45, 46, 49, 50,
+ 50, 52, 54, 54, 55, 58, 58, 58, 60, 61, 61, 63, 41, 40, 40, 40, 39, 38,
+ 38, 39, 40, 40, 40, 41, 41, 41, 43, 46, 46, 47, 50, 52, 52, 54, 55, 55,
+ 57, 60, 60, 60, 62, 63, 63, 66, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64,
+ 66, 67, 67, 69, 44, 43, 42, 42, 42, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 44, 47, 47, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69,
+ 45, 44, 43, 43, 42, 41, 41, 42, 42, 42, 42, 43, 43, 43, 45, 48, 48, 49,
+ 53, 54, 54, 57, 58, 58, 60, 64, 64, 65, 67, 68, 68, 70, 47, 46, 45, 45,
+ 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58,
+ 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 48, 47, 46, 46, 45, 44, 44, 45,
+ 45, 45, 45, 45, 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67,
+ 67, 68, 70, 71, 71, 74, 48, 47, 46, 46, 45, 44, 44, 45, 45, 45, 45, 45,
+ 46, 46, 47, 51, 51, 52, 55, 57, 57, 59, 61, 61, 63, 67, 67, 68, 70, 71,
+ 71, 74, 51, 50, 49, 49, 48, 47, 47, 47, 48, 48, 48, 48, 48, 48, 50, 53,
+ 53, 54, 57, 58, 58, 61, 63, 63, 66, 69, 69, 70, 73, 74, 74, 77,
+ /* Size 4x8 */
+ 31, 32, 35, 43, 32, 33, 34, 41, 32, 34, 36, 42, 32, 35, 38, 42, 34, 37,
+ 43, 49, 37, 40, 49, 56, 42, 43, 53, 63, 46, 46, 56, 67,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 34, 37, 42, 46, 32, 33, 34, 35, 37, 40, 43, 46, 35, 34,
+ 36, 38, 43, 49, 53, 56, 43, 41, 42, 42, 49, 56, 63, 67,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32,
+ 32, 32, 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 33, 34,
+ 35, 36, 42, 44, 32, 32, 33, 34, 36, 36, 42, 45, 32, 33, 34, 35, 37, 38,
+ 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42, 48, 50,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 39, 37,
+ 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51, 60, 62, 44, 41, 42, 43,
+ 51, 53, 63, 66, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45, 45, 46, 54, 56,
+ 67, 70,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 34, 35, 36, 39, 41, 44, 47, 48, 31, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 34, 37, 39, 41, 44, 45, 31, 32, 32, 32,
+ 33, 33, 34, 34, 35, 36, 36, 39, 40, 42, 44, 45, 32, 32, 32, 33, 34, 34,
+ 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 35, 35, 34, 34, 35, 36, 37, 39,
+ 41, 45, 46, 48, 49, 51, 53, 54, 36, 35, 35, 34, 36, 36, 38, 40, 42, 47,
+ 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42, 42, 44, 48, 52, 54, 58,
+ 60, 63, 66, 67, 47, 45, 45, 44, 44, 45, 45, 47, 50, 55, 56, 60, 62, 66,
+ 69, 70,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 35, 36, 36, 40, 44, 44, 47, 53, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 39, 43, 43, 46, 52, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 39, 42, 42, 45, 51, 31, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 39, 41, 41, 45, 50, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 38, 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 38,
+ 41, 41, 44, 49, 31, 32, 32, 32, 32, 33, 33, 33, 34, 35, 35, 38, 41, 41,
+ 44, 49, 31, 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 39, 42, 42, 44, 49,
+ 32, 32, 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32,
+ 32, 32, 33, 34, 34, 34, 36, 36, 36, 39, 42, 42, 45, 50, 32, 32, 32, 32,
+ 33, 35, 35, 35, 37, 37, 37, 40, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35,
+ 35, 36, 37, 38, 38, 41, 42, 42, 45, 49, 32, 32, 33, 33, 34, 35, 35, 36,
+ 37, 38, 38, 41, 42, 42, 45, 49, 32, 33, 33, 33, 34, 36, 36, 36, 39, 40,
+ 40, 42, 44, 44, 47, 51, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45,
+ 48, 48, 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 41, 42, 42, 45, 48, 48,
+ 50, 54, 34, 34, 34, 34, 35, 37, 37, 38, 42, 43, 43, 46, 49, 49, 51, 55,
+ 35, 35, 34, 34, 36, 38, 38, 39, 45, 47, 47, 50, 52, 52, 55, 59, 36, 35,
+ 34, 34, 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 36, 35, 34, 34,
+ 36, 38, 38, 40, 46, 48, 48, 51, 54, 54, 56, 60, 38, 37, 36, 36, 37, 40,
+ 40, 41, 47, 49, 49, 53, 56, 56, 58, 63, 39, 38, 37, 37, 39, 40, 40, 42,
+ 48, 50, 50, 54, 58, 58, 60, 65, 39, 38, 37, 37, 39, 40, 40, 42, 48, 50,
+ 50, 54, 58, 58, 60, 65, 41, 40, 39, 39, 40, 41, 41, 43, 49, 51, 51, 56,
+ 60, 60, 62, 67, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63,
+ 66, 71, 44, 42, 41, 41, 42, 43, 43, 45, 51, 53, 53, 59, 63, 63, 66, 71,
+ 44, 43, 42, 42, 42, 43, 43, 45, 51, 54, 54, 59, 64, 64, 67, 72, 47, 45,
+ 44, 44, 44, 45, 45, 47, 53, 56, 56, 61, 66, 66, 69, 75, 48, 46, 45, 45,
+ 45, 46, 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 48, 46, 45, 45, 45, 46,
+ 46, 48, 54, 56, 56, 62, 67, 67, 70, 76, 51, 49, 47, 47, 48, 48, 48, 50,
+ 56, 58, 58, 64, 69, 69, 73, 79,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 40, 42, 42, 43, 45, 46, 46, 49, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41,
+ 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 36, 37, 37, 39, 41, 41, 42, 44, 45,
+ 45, 47, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 37, 39, 39, 40, 42, 42, 42, 44, 45, 45, 48, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38,
+ 38, 40, 40, 40, 41, 43, 43, 43, 45, 46, 46, 48, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40,
+ 41, 43, 43, 43, 45, 46, 46, 48, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 38, 38, 39, 40, 40, 41, 42, 42, 43, 45, 45, 45,
+ 47, 48, 48, 50, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37,
+ 39, 41, 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56,
+ 36, 35, 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43,
+ 47, 48, 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 36, 35, 35, 35,
+ 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48, 48, 49,
+ 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 40, 39, 39, 39, 39, 38, 38, 38,
+ 39, 39, 39, 40, 41, 41, 42, 45, 45, 46, 50, 51, 51, 53, 54, 54, 56, 59,
+ 59, 59, 61, 62, 62, 64, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42,
+ 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67,
+ 67, 69, 44, 43, 42, 42, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 44, 48,
+ 48, 49, 52, 54, 54, 56, 58, 58, 60, 63, 63, 64, 66, 67, 67, 69, 47, 46,
+ 45, 45, 45, 44, 44, 44, 44, 45, 45, 45, 45, 45, 47, 50, 50, 51, 55, 56,
+ 56, 58, 60, 60, 62, 66, 66, 67, 69, 70, 70, 73, 53, 52, 51, 51, 50, 49,
+ 49, 49, 49, 50, 50, 49, 49, 49, 51, 54, 54, 55, 59, 60, 60, 63, 65, 65,
+ 67, 71, 71, 72, 75, 76, 76, 79,
+ /* Size 4x16 */
+ 31, 32, 36, 44, 32, 32, 35, 42, 32, 32, 35, 41, 32, 33, 34, 41, 32, 34,
+ 36, 42, 32, 34, 36, 42, 32, 35, 38, 42, 33, 36, 40, 44, 34, 37, 42, 48,
+ 35, 38, 47, 52, 35, 38, 48, 54, 38, 40, 50, 58, 40, 41, 51, 60, 42, 43,
+ 53, 63, 45, 45, 56, 66, 46, 46, 56, 67,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 35, 38, 40, 42, 45, 46, 32, 32,
+ 32, 33, 34, 34, 35, 36, 37, 38, 38, 40, 41, 43, 45, 46, 36, 35, 35, 34,
+ 36, 36, 38, 40, 42, 47, 48, 50, 51, 53, 56, 56, 44, 42, 41, 41, 42, 42,
+ 42, 44, 48, 52, 54, 58, 60, 63, 66, 67,
+ /* Size 8x32 */
+ 32, 31, 31, 32, 35, 36, 44, 47, 31, 32, 32, 32, 35, 35, 43, 46, 31, 32,
+ 32, 32, 35, 35, 42, 45, 31, 32, 32, 32, 35, 35, 42, 45, 31, 32, 32, 32,
+ 34, 35, 41, 45, 31, 32, 32, 33, 34, 34, 41, 44, 31, 32, 32, 33, 34, 34,
+ 41, 44, 31, 32, 32, 33, 34, 35, 41, 44, 31, 32, 33, 34, 35, 36, 42, 44,
+ 32, 32, 33, 34, 36, 36, 42, 45, 32, 32, 33, 34, 36, 36, 42, 45, 32, 32,
+ 33, 35, 37, 37, 42, 45, 32, 33, 34, 35, 37, 38, 42, 45, 32, 33, 34, 35,
+ 37, 38, 42, 45, 32, 33, 34, 36, 39, 40, 44, 47, 34, 34, 35, 37, 41, 42,
+ 48, 50, 34, 34, 35, 37, 41, 42, 48, 50, 34, 34, 35, 37, 42, 43, 49, 51,
+ 35, 34, 36, 38, 45, 47, 52, 55, 36, 34, 36, 38, 46, 48, 54, 56, 36, 34,
+ 36, 38, 46, 48, 54, 56, 38, 36, 37, 40, 47, 49, 56, 58, 39, 37, 39, 40,
+ 48, 50, 58, 60, 39, 37, 39, 40, 48, 50, 58, 60, 41, 39, 40, 41, 49, 51,
+ 60, 62, 44, 41, 42, 43, 51, 53, 63, 66, 44, 41, 42, 43, 51, 53, 63, 66,
+ 44, 42, 42, 43, 51, 54, 64, 67, 47, 44, 44, 45, 53, 56, 66, 69, 48, 45,
+ 45, 46, 54, 56, 67, 70, 48, 45, 45, 46, 54, 56, 67, 70, 51, 47, 48, 48,
+ 56, 58, 69, 73,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 34, 34,
+ 35, 36, 36, 38, 39, 39, 41, 44, 44, 44, 47, 48, 48, 51, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 39, 41, 41, 42, 44, 45, 45, 47, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 39, 39, 40, 42,
+ 42, 42, 44, 45, 45, 48, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 37, 37, 38, 38, 38, 40, 40, 40, 41, 43, 43, 43, 45, 46,
+ 46, 48, 35, 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 37, 37, 37, 39, 41,
+ 41, 42, 45, 46, 46, 47, 48, 48, 49, 51, 51, 51, 53, 54, 54, 56, 36, 35,
+ 35, 35, 35, 34, 34, 35, 36, 36, 36, 37, 38, 38, 40, 42, 42, 43, 47, 48,
+ 48, 49, 50, 50, 51, 53, 53, 54, 56, 56, 56, 58, 44, 43, 42, 42, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 42, 44, 48, 48, 49, 52, 54, 54, 56, 58, 58,
+ 60, 63, 63, 64, 66, 67, 67, 69, 47, 46, 45, 45, 45, 44, 44, 44, 44, 45,
+ 45, 45, 45, 45, 47, 50, 50, 51, 55, 56, 56, 58, 60, 60, 62, 66, 66, 67,
+ 69, 70, 70, 73 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 37, 47, 47, 37, 44, 47, 45, 47, 47, 53, 53, 47, 45, 53, 59,
+ /* Size 8x8 */
+ 31, 31, 34, 37, 43, 48, 47, 49, 31, 32, 35, 40, 43, 46, 45, 46, 34, 35,
+ 39, 43, 45, 46, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 43, 43, 45, 47,
+ 49, 50, 50, 50, 48, 46, 46, 47, 50, 53, 55, 55, 47, 45, 45, 45, 50, 55,
+ 58, 60, 49, 46, 46, 46, 50, 55, 60, 61,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 33, 33, 36, 38, 41, 47, 49, 48, 49, 49, 50, 50, 31, 31,
+ 31, 31, 34, 34, 38, 40, 42, 46, 47, 47, 47, 47, 48, 48, 31, 31, 31, 31,
+ 34, 35, 39, 40, 42, 46, 47, 46, 46, 46, 47, 47, 30, 31, 31, 32, 34, 35,
+ 40, 41, 42, 45, 46, 45, 45, 45, 46, 46, 33, 34, 34, 34, 37, 38, 42, 43,
+ 44, 46, 47, 46, 46, 45, 46, 46, 33, 34, 35, 35, 38, 39, 43, 44, 45, 47,
+ 47, 46, 46, 45, 46, 46, 36, 38, 39, 40, 42, 43, 47, 47, 47, 47, 48, 46,
+ 46, 45, 46, 46, 38, 40, 40, 41, 43, 44, 47, 47, 48, 48, 49, 48, 47, 47,
+ 47, 47, 41, 42, 42, 42, 44, 45, 47, 48, 48, 50, 50, 49, 49, 49, 50, 50,
+ 47, 46, 46, 45, 46, 47, 47, 48, 50, 52, 52, 52, 52, 52, 53, 53, 49, 47,
+ 47, 46, 47, 47, 48, 49, 50, 52, 53, 53, 53, 53, 54, 54, 48, 47, 46, 45,
+ 46, 46, 46, 48, 49, 52, 53, 54, 55, 55, 56, 56, 49, 47, 46, 45, 46, 46,
+ 46, 47, 49, 52, 53, 55, 55, 57, 57, 58, 49, 47, 46, 45, 45, 45, 45, 47,
+ 49, 52, 53, 55, 57, 58, 59, 60, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53,
+ 54, 56, 57, 59, 61, 61, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56,
+ 58, 60, 61, 61,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 41, 43,
+ 47, 49, 49, 49, 48, 48, 49, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 42, 43, 47, 48, 48, 48,
+ 47, 47, 47, 47, 47, 48, 49, 49, 49, 50, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 37, 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 37,
+ 38, 38, 40, 42, 42, 43, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 31, 31, 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42,
+ 42, 43, 46, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 48, 30, 31,
+ 31, 31, 31, 32, 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 46, 46, 46, 47, 30, 31, 31, 31, 31, 32,
+ 32, 32, 34, 35, 35, 38, 40, 40, 41, 42, 42, 43, 45, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 46, 46, 46, 47, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36,
+ 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 46, 46, 46, 47, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42,
+ 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45,
+ 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 33, 34, 34, 34,
+ 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 45, 45, 47, 47, 47, 46,
+ 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 35, 36, 37, 37, 37, 38, 38, 38,
+ 41, 41, 41, 44, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46,
+ 46, 46, 36, 37, 38, 38, 39, 40, 40, 40, 42, 43, 43, 46, 47, 47, 47, 47,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 45, 45, 45, 46, 46, 46, 46, 38, 39,
+ 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 49, 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49,
+ 49, 49, 49, 49, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45,
+ 45, 46, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49,
+ 50, 50, 50, 50, 43, 43, 43, 43, 43, 43, 43, 43, 45, 45, 45, 46, 47, 47,
+ 48, 49, 49, 49, 50, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 47, 46, 46, 46, 45, 45, 46, 46, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 53, 53, 53, 53, 49, 48, 47, 47,
+ 47, 46, 46, 46, 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 48, 48, 49, 50, 50, 51, 52, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47,
+ 47, 47, 48, 50, 50, 50, 52, 53, 53, 53, 54, 54, 54, 55, 55, 55, 55, 55,
+ 55, 56, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49,
+ 49, 50, 52, 53, 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 48, 47,
+ 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 48, 49, 49, 50, 52, 53,
+ 53, 54, 54, 54, 55, 55, 55, 56, 56, 56, 56, 57, 49, 47, 47, 47, 46, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 55, 55,
+ 55, 57, 57, 57, 57, 58, 58, 58, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59,
+ 59, 60, 60, 60, 49, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 47, 49, 49, 50, 52, 53, 53, 55, 55, 55, 57, 58, 58, 59, 59, 60, 60, 60,
+ 49, 48, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 47, 49, 49, 50,
+ 52, 53, 53, 55, 56, 56, 57, 59, 59, 59, 60, 60, 60, 61, 50, 49, 48, 48,
+ 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55,
+ 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 50, 49, 48, 48, 47, 46, 46, 46,
+ 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60,
+ 60, 60, 61, 61, 61, 63, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46, 46, 46,
+ 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 58, 60, 60, 60, 61, 61,
+ 61, 63, 51, 50, 49, 49, 48, 47, 47, 47, 47, 47, 47, 47, 46, 46, 48, 50,
+ 50, 51, 53, 54, 54, 56, 57, 57, 58, 60, 60, 61, 62, 63, 63, 64,
+ /* Size 4x8 */
+ 31, 38, 47, 48, 31, 40, 46, 45, 35, 43, 47, 46, 39, 47, 47, 45, 43, 47,
+ 50, 50, 47, 47, 53, 55, 46, 46, 53, 58, 48, 46, 54, 59,
+ /* Size 8x4 */
+ 31, 31, 35, 39, 43, 47, 46, 48, 38, 40, 43, 47, 47, 47, 46, 46, 47, 46,
+ 47, 47, 50, 53, 53, 54, 48, 45, 46, 45, 50, 55, 58, 59,
+ /* Size 8x16 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32,
+ 34, 39, 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 33, 35, 37, 42,
+ 46, 47, 45, 46, 33, 36, 38, 43, 46, 47, 46, 46, 37, 40, 43, 47, 47, 47,
+ 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50, 49, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 48, 46,
+ 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53, 57, 57, 49, 45, 45, 46,
+ 51, 53, 58, 59, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54,
+ 59, 61,
+ /* Size 16x8 */
+ 32, 31, 31, 30, 33, 33, 37, 39, 42, 47, 49, 48, 48, 49, 50, 50, 31, 31,
+ 32, 32, 35, 36, 40, 41, 43, 46, 46, 46, 45, 45, 46, 46, 33, 34, 34, 35,
+ 37, 38, 43, 43, 44, 46, 47, 46, 46, 45, 46, 46, 37, 38, 39, 40, 42, 43,
+ 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 45, 45, 45, 44, 46, 46, 47, 48,
+ 49, 51, 52, 51, 51, 51, 52, 52, 48, 47, 46, 46, 47, 47, 47, 48, 50, 52,
+ 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46, 45, 47, 49, 53, 53, 56,
+ 57, 58, 59, 59, 50, 48, 47, 46, 46, 46, 46, 47, 50, 53, 54, 56, 57, 59,
+ 61, 61,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 45, 48, 48, 49, 49, 49, 50, 52, 31, 31,
+ 31, 31, 33, 38, 38, 39, 45, 47, 47, 48, 48, 48, 49, 51, 31, 31, 31, 31,
+ 34, 38, 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 31, 31, 34, 38,
+ 38, 40, 45, 47, 47, 47, 47, 47, 48, 50, 31, 31, 32, 32, 34, 39, 39, 40,
+ 45, 46, 46, 46, 46, 46, 47, 49, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46,
+ 46, 45, 45, 45, 46, 48, 30, 31, 32, 32, 35, 40, 40, 41, 44, 46, 46, 45,
+ 45, 45, 46, 48, 31, 32, 33, 33, 35, 40, 40, 41, 45, 46, 46, 45, 45, 45,
+ 46, 48, 33, 34, 35, 35, 37, 42, 42, 43, 46, 47, 47, 46, 45, 45, 46, 47,
+ 33, 35, 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 33, 35,
+ 36, 36, 38, 43, 43, 44, 46, 47, 47, 46, 46, 46, 46, 47, 35, 37, 38, 38,
+ 41, 45, 45, 46, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 47, 47, 47, 46, 45, 45, 46, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 46, 47, 39, 40, 41, 41, 43, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 48, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49,
+ 49, 49, 50, 50, 42, 42, 43, 43, 44, 47, 47, 48, 49, 50, 50, 49, 49, 49,
+ 50, 50, 43, 43, 43, 43, 45, 47, 47, 48, 50, 50, 50, 50, 50, 50, 50, 51,
+ 47, 46, 46, 46, 46, 48, 48, 48, 51, 52, 52, 52, 53, 53, 53, 53, 49, 47,
+ 46, 46, 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 49, 47, 46, 46,
+ 47, 48, 48, 49, 52, 53, 53, 53, 53, 53, 54, 54, 48, 47, 46, 46, 46, 47,
+ 47, 48, 52, 53, 53, 54, 55, 55, 55, 56, 48, 47, 46, 46, 46, 47, 47, 48,
+ 51, 53, 53, 54, 56, 56, 56, 57, 48, 47, 46, 46, 46, 47, 47, 48, 51, 53,
+ 53, 54, 56, 56, 56, 57, 48, 47, 45, 45, 46, 46, 46, 47, 51, 53, 53, 55,
+ 57, 57, 57, 59, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58,
+ 59, 61, 49, 46, 45, 45, 45, 46, 46, 47, 51, 53, 53, 56, 58, 58, 59, 61,
+ 49, 47, 45, 45, 45, 46, 46, 47, 52, 53, 53, 56, 58, 58, 60, 62, 50, 48,
+ 46, 46, 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 63, 50, 48, 46, 46,
+ 46, 46, 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 50, 48, 46, 46, 46, 46,
+ 46, 48, 52, 54, 54, 57, 59, 59, 61, 64, 51, 49, 47, 47, 47, 47, 47, 48,
+ 52, 54, 54, 58, 60, 60, 62, 65,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 42, 43, 46, 47, 47, 47,
+ 47, 47, 47, 46, 46, 47, 48, 48, 48, 49, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45,
+ 45, 45, 46, 46, 46, 47, 31, 31, 31, 31, 32, 32, 32, 33, 35, 36, 36, 38,
+ 40, 40, 41, 43, 43, 43, 46, 46, 46, 46, 46, 46, 45, 45, 45, 45, 46, 46,
+ 46, 47, 33, 33, 34, 34, 34, 35, 35, 35, 37, 38, 38, 41, 43, 43, 43, 44,
+ 44, 45, 46, 47, 47, 46, 46, 46, 46, 45, 45, 45, 46, 46, 46, 47, 37, 38,
+ 38, 38, 39, 40, 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 46, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 48, 48, 48, 49, 49, 48, 48, 48, 47, 47, 47, 47,
+ 48, 48, 48, 48, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47,
+ 48, 49, 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52,
+ 48, 47, 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50,
+ 52, 53, 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 48, 47, 47, 47,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45, 45, 45,
+ 46, 46, 46, 46, 46, 46, 47, 49, 49, 50, 52, 53, 53, 54, 54, 54, 55, 56,
+ 56, 56, 57, 57, 57, 58, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45,
+ 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59,
+ 59, 60, 49, 48, 47, 47, 46, 45, 45, 45, 45, 46, 46, 45, 45, 45, 47, 49,
+ 49, 50, 53, 53, 53, 55, 56, 56, 57, 58, 58, 58, 59, 59, 59, 60, 50, 49,
+ 48, 48, 47, 46, 46, 46, 46, 46, 46, 46, 46, 46, 47, 50, 50, 50, 53, 54,
+ 54, 55, 56, 56, 57, 59, 59, 60, 61, 61, 61, 62, 52, 51, 50, 50, 49, 48,
+ 48, 48, 47, 47, 47, 47, 47, 47, 48, 50, 50, 51, 53, 54, 54, 56, 57, 57,
+ 59, 61, 61, 62, 63, 64, 64, 65,
+ /* Size 4x16 */
+ 31, 37, 48, 49, 31, 38, 47, 47, 31, 39, 46, 46, 31, 40, 46, 45, 34, 42,
+ 47, 45, 35, 43, 47, 46, 39, 47, 47, 45, 40, 47, 48, 47, 42, 47, 50, 49,
+ 46, 48, 52, 53, 47, 48, 53, 53, 47, 47, 53, 56, 47, 46, 53, 57, 46, 46,
+ 53, 58, 48, 46, 54, 59, 48, 46, 54, 59,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 34, 35, 39, 40, 42, 46, 47, 47, 47, 46, 48, 48, 37, 38,
+ 39, 40, 42, 43, 47, 47, 47, 48, 48, 47, 46, 46, 46, 46, 48, 47, 46, 46,
+ 47, 47, 47, 48, 50, 52, 53, 53, 53, 53, 54, 54, 49, 47, 46, 45, 45, 46,
+ 45, 47, 49, 53, 53, 56, 57, 58, 59, 59,
+ /* Size 8x32 */
+ 32, 31, 33, 37, 45, 48, 49, 50, 31, 31, 33, 38, 45, 47, 48, 49, 31, 31,
+ 34, 38, 45, 47, 47, 48, 31, 31, 34, 38, 45, 47, 47, 48, 31, 32, 34, 39,
+ 45, 46, 46, 47, 30, 32, 35, 40, 44, 46, 45, 46, 30, 32, 35, 40, 44, 46,
+ 45, 46, 31, 33, 35, 40, 45, 46, 45, 46, 33, 35, 37, 42, 46, 47, 45, 46,
+ 33, 36, 38, 43, 46, 47, 46, 46, 33, 36, 38, 43, 46, 47, 46, 46, 35, 38,
+ 41, 45, 47, 47, 45, 46, 37, 40, 43, 47, 47, 47, 45, 46, 37, 40, 43, 47,
+ 47, 47, 45, 46, 39, 41, 43, 47, 48, 48, 47, 47, 42, 43, 44, 47, 49, 50,
+ 49, 50, 42, 43, 44, 47, 49, 50, 49, 50, 43, 43, 45, 47, 50, 50, 50, 50,
+ 47, 46, 46, 48, 51, 52, 53, 53, 49, 46, 47, 48, 52, 53, 53, 54, 49, 46,
+ 47, 48, 52, 53, 53, 54, 48, 46, 46, 47, 52, 53, 55, 55, 48, 46, 46, 47,
+ 51, 53, 56, 56, 48, 46, 46, 47, 51, 53, 56, 56, 48, 45, 46, 46, 51, 53,
+ 57, 57, 49, 45, 45, 46, 51, 53, 58, 59, 49, 45, 45, 46, 51, 53, 58, 59,
+ 49, 45, 45, 46, 52, 53, 58, 60, 50, 46, 46, 46, 52, 54, 59, 61, 50, 46,
+ 46, 46, 52, 54, 59, 61, 50, 46, 46, 46, 52, 54, 59, 61, 51, 47, 47, 47,
+ 52, 54, 60, 62,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 42, 43,
+ 47, 49, 49, 48, 48, 48, 48, 49, 49, 49, 50, 50, 50, 51, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 43, 43, 46, 46, 46, 46,
+ 46, 46, 45, 45, 45, 45, 46, 46, 46, 47, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 44, 45, 46, 47, 47, 46, 46, 46, 46, 45,
+ 45, 45, 46, 46, 46, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 46, 46, 46, 46, 46, 46,
+ 46, 47, 45, 45, 45, 45, 45, 44, 44, 45, 46, 46, 46, 47, 47, 47, 48, 49,
+ 49, 50, 51, 52, 52, 52, 51, 51, 51, 51, 51, 52, 52, 52, 52, 52, 48, 47,
+ 47, 47, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 52, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 54, 54, 54, 54, 49, 48, 47, 47, 46, 45,
+ 45, 45, 45, 46, 46, 45, 45, 45, 47, 49, 49, 50, 53, 53, 53, 55, 56, 56,
+ 57, 58, 58, 58, 59, 59, 59, 60, 50, 49, 48, 48, 47, 46, 46, 46, 46, 46,
+ 46, 46, 46, 46, 47, 50, 50, 50, 53, 54, 54, 55, 56, 56, 57, 59, 59, 60,
+ 61, 61, 61, 62 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 34, 38, 32, 33, 35, 39, 34, 35, 39, 45, 38, 39, 45, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 32, 33, 34, 37, 41, 31, 32, 32, 32, 33, 34, 36, 39, 32, 32,
+ 32, 33, 34, 35, 37, 40, 32, 32, 33, 34, 35, 36, 38, 41, 33, 33, 34, 35,
+ 37, 39, 41, 44, 34, 34, 35, 36, 39, 43, 46, 49, 37, 36, 37, 38, 41, 46,
+ 51, 54, 41, 39, 40, 41, 44, 49, 54, 58,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 37, 37, 41, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 36, 36, 39, 39, 42, 31, 32, 32, 32, 32, 33, 33, 34, 34, 35, 35, 36,
+ 36, 39, 39, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40,
+ 40, 42, 32, 32, 32, 32, 32, 34, 34, 35, 35, 37, 37, 38, 38, 40, 40, 42,
+ 34, 34, 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 34, 34,
+ 34, 33, 33, 35, 35, 37, 37, 39, 39, 42, 42, 45, 45, 47, 36, 35, 35, 34,
+ 34, 36, 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 36, 35, 35, 34, 34, 36,
+ 36, 38, 38, 42, 42, 48, 48, 50, 50, 54, 39, 38, 38, 37, 37, 39, 39, 40,
+ 40, 45, 45, 50, 50, 54, 54, 58, 39, 38, 38, 37, 37, 39, 39, 40, 40, 45,
+ 45, 50, 50, 54, 54, 58, 44, 42, 42, 41, 41, 42, 42, 42, 42, 47, 47, 54,
+ 54, 58, 58, 63,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 34, 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 38, 38, 38, 39, 41, 41, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37,
+ 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 38, 38, 38, 40, 41, 41, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36,
+ 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38,
+ 39, 39, 39, 40, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 36, 36, 36, 38, 39, 39, 39, 40,
+ 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 37, 37, 38, 40, 40, 40, 41, 42, 42, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37,
+ 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 41, 42, 42, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 41, 42, 42, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41, 42, 42, 42, 44, 45, 45,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34,
+ 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41,
+ 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 34, 34, 34, 34, 34, 34, 33, 33,
+ 33, 34, 35, 35, 35, 36, 37, 37, 37, 38, 39, 39, 39, 41, 42, 42, 42, 44,
+ 45, 45, 45, 46, 47, 47, 35, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 36,
+ 36, 36, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45, 45, 46, 47, 47, 47, 49,
+ 50, 50, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38,
+ 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 54, 54, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 54, 54, 37, 37, 37, 37, 37, 36, 36, 36, 36, 37,
+ 38, 38, 38, 38, 39, 39, 39, 41, 44, 44, 44, 46, 49, 49, 49, 51, 52, 52,
+ 52, 54, 56, 56, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40,
+ 40, 40, 40, 42, 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58,
+ 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42,
+ 45, 45, 45, 47, 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 39, 39, 38, 38,
+ 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, 40, 42, 45, 45, 45, 47,
+ 50, 50, 50, 52, 54, 54, 54, 56, 58, 58, 41, 41, 40, 40, 40, 39, 39, 39,
+ 39, 40, 40, 40, 40, 41, 41, 41, 41, 44, 46, 46, 46, 49, 52, 52, 52, 54,
+ 56, 56, 56, 58, 60, 60, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42,
+ 42, 42, 42, 42, 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60,
+ 63, 63, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42,
+ 42, 45, 47, 47, 47, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 4x8 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 33, 34, 38, 32, 33, 36, 40, 33, 34,
+ 38, 42, 34, 36, 41, 47, 37, 38, 44, 52, 40, 40, 46, 56,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 33, 34, 37, 40, 32, 32, 33, 33, 34, 36, 38, 40, 34, 34,
+ 34, 36, 38, 41, 44, 46, 39, 38, 38, 40, 42, 47, 52, 56,
+ /* Size 8x16 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33,
+ 33, 34, 34, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36,
+ 36, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38, 38, 42,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 36, 34,
+ 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 39, 37, 37, 40,
+ 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 44, 41, 41, 43, 43, 53,
+ 53, 63,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 34, 36, 36, 39, 39, 44, 31, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 31, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 37, 37, 41, 32, 32, 32, 33, 33, 34,
+ 34, 35, 35, 37, 37, 38, 38, 40, 40, 43, 32, 32, 32, 33, 33, 34, 34, 35,
+ 35, 37, 37, 38, 38, 40, 40, 43, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42,
+ 42, 48, 48, 50, 50, 53, 36, 35, 35, 34, 34, 36, 36, 38, 38, 42, 42, 48,
+ 48, 50, 50, 53, 44, 42, 42, 41, 41, 42, 42, 42, 42, 48, 48, 54, 54, 58,
+ 58, 63,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 39, 44, 44, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 34, 35, 35, 35, 39, 43, 43, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 34, 35, 35, 35, 38, 42, 42, 31, 32, 32, 32, 32, 32, 32, 32, 32, 34,
+ 35, 35, 35, 38, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 37, 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37,
+ 41, 41, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 37, 41, 41,
+ 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 38, 41, 41, 32, 32,
+ 32, 32, 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32,
+ 32, 33, 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 39, 42, 42, 32, 32, 32, 32, 32, 33, 34, 34,
+ 34, 36, 37, 37, 37, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37,
+ 38, 38, 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38,
+ 38, 40, 42, 42, 32, 32, 33, 33, 33, 34, 35, 35, 35, 37, 38, 38, 38, 40,
+ 42, 42, 33, 33, 33, 33, 33, 34, 36, 36, 36, 38, 40, 40, 40, 42, 45, 45,
+ 34, 34, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34,
+ 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 34, 34, 34, 34,
+ 34, 35, 37, 37, 37, 39, 42, 42, 42, 45, 48, 48, 35, 34, 34, 34, 34, 36,
+ 37, 37, 37, 41, 45, 45, 45, 47, 50, 50, 36, 35, 34, 34, 34, 36, 38, 38,
+ 38, 43, 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43,
+ 48, 48, 48, 51, 54, 54, 36, 35, 34, 34, 34, 36, 38, 38, 38, 43, 48, 48,
+ 48, 51, 54, 54, 37, 37, 36, 36, 36, 38, 39, 39, 39, 44, 49, 49, 49, 52,
+ 56, 56, 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58,
+ 39, 38, 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 39, 38,
+ 37, 37, 37, 39, 40, 40, 40, 45, 50, 50, 50, 54, 58, 58, 41, 40, 39, 39,
+ 39, 40, 42, 42, 42, 46, 52, 52, 52, 56, 60, 60, 44, 42, 41, 41, 41, 42,
+ 43, 43, 43, 48, 53, 53, 53, 58, 63, 63, 44, 42, 41, 41, 41, 42, 43, 43,
+ 43, 48, 53, 53, 53, 58, 63, 63,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34,
+ 35, 35, 35, 37, 38, 38, 38, 40, 42, 42, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39,
+ 41, 41, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 36, 36, 36, 38, 39, 39, 39, 40, 42, 42, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38,
+ 38, 39, 40, 40, 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40,
+ 40, 42, 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43,
+ 34, 34, 34, 34, 34, 34, 33, 33, 33, 34, 35, 35, 35, 36, 37, 37, 37, 38,
+ 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 48, 48, 36, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49,
+ 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36,
+ 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48, 48, 49, 50, 50, 50, 52,
+ 53, 53, 39, 39, 38, 38, 38, 38, 37, 37, 37, 38, 39, 39, 39, 40, 40, 40,
+ 40, 42, 45, 45, 45, 47, 51, 51, 51, 52, 54, 54, 54, 56, 58, 58, 44, 43,
+ 42, 42, 42, 41, 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48,
+ 48, 50, 54, 54, 54, 56, 58, 58, 58, 60, 63, 63, 44, 43, 42, 42, 42, 41,
+ 41, 41, 41, 41, 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54,
+ 54, 56, 58, 58, 58, 60, 63, 63,
+ /* Size 4x16 */
+ 31, 32, 34, 39, 32, 32, 34, 38, 32, 32, 34, 38, 32, 32, 33, 37, 32, 32,
+ 33, 37, 32, 33, 35, 39, 32, 33, 35, 39, 32, 34, 37, 40, 32, 34, 37, 40,
+ 34, 35, 39, 45, 34, 35, 39, 45, 35, 36, 43, 51, 35, 36, 43, 51, 38, 39,
+ 45, 54, 38, 39, 45, 54, 42, 42, 48, 58,
+ /* Size 16x4 */
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 35, 35, 38, 38, 42, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 39, 39, 42, 34, 34, 34, 33,
+ 33, 35, 35, 37, 37, 39, 39, 43, 43, 45, 45, 48, 39, 38, 38, 37, 37, 39,
+ 39, 40, 40, 45, 45, 51, 51, 54, 54, 58,
+ /* Size 8x32 */
+ 32, 31, 31, 32, 32, 36, 36, 44, 31, 31, 31, 32, 32, 35, 35, 43, 31, 32,
+ 32, 32, 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 42, 31, 32, 32, 32,
+ 32, 35, 35, 42, 31, 32, 32, 32, 32, 35, 35, 41, 31, 32, 32, 33, 33, 34,
+ 34, 41, 31, 32, 32, 33, 33, 34, 34, 41, 31, 32, 32, 33, 33, 34, 34, 41,
+ 31, 32, 32, 33, 33, 35, 35, 41, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32,
+ 32, 34, 34, 36, 36, 42, 32, 32, 32, 34, 34, 36, 36, 42, 32, 32, 32, 34,
+ 34, 37, 37, 42, 32, 33, 33, 35, 35, 38, 38, 42, 32, 33, 33, 35, 35, 38,
+ 38, 42, 32, 33, 33, 35, 35, 38, 38, 42, 33, 33, 33, 36, 36, 40, 40, 45,
+ 34, 34, 34, 37, 37, 42, 42, 48, 34, 34, 34, 37, 37, 42, 42, 48, 34, 34,
+ 34, 37, 37, 42, 42, 48, 35, 34, 34, 37, 37, 45, 45, 50, 36, 34, 34, 38,
+ 38, 48, 48, 54, 36, 34, 34, 38, 38, 48, 48, 54, 36, 34, 34, 38, 38, 48,
+ 48, 54, 37, 36, 36, 39, 39, 49, 49, 56, 39, 37, 37, 40, 40, 50, 50, 58,
+ 39, 37, 37, 40, 40, 50, 50, 58, 39, 37, 37, 40, 40, 50, 50, 58, 41, 39,
+ 39, 42, 42, 52, 52, 60, 44, 41, 41, 43, 43, 53, 53, 63, 44, 41, 41, 43,
+ 43, 53, 53, 63,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 34, 34, 35, 36, 36, 36, 37, 39, 39, 39, 41, 44, 44, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 36, 37, 37, 37, 39, 41, 41, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 37, 37, 39, 41, 41, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42,
+ 43, 43, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 36, 37, 37, 37, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 36, 35,
+ 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42,
+ 42, 45, 48, 48, 48, 49, 50, 50, 50, 52, 53, 53, 36, 35, 35, 35, 35, 35,
+ 34, 34, 34, 35, 36, 36, 36, 37, 38, 38, 38, 40, 42, 42, 42, 45, 48, 48,
+ 48, 49, 50, 50, 50, 52, 53, 53, 44, 43, 42, 42, 42, 41, 41, 41, 41, 41,
+ 42, 42, 42, 42, 42, 42, 42, 45, 48, 48, 48, 50, 54, 54, 54, 56, 58, 58,
+ 58, 60, 63, 63 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 34, 42, 47, 34, 39, 45, 46, 42, 45, 48, 49, 47, 46, 49, 54,
+ /* Size 8x8 */
+ 31, 31, 32, 35, 39, 45, 48, 48, 31, 31, 33, 37, 41, 44, 46, 46, 32, 33,
+ 35, 39, 42, 45, 46, 45, 35, 37, 39, 43, 45, 47, 47, 46, 39, 41, 42, 45,
+ 47, 48, 48, 47, 45, 44, 45, 47, 48, 50, 51, 51, 48, 46, 46, 47, 48, 51,
+ 53, 54, 48, 46, 45, 46, 47, 51, 54, 56,
+ /* Size 16x16 */
+ 32, 31, 31, 30, 30, 33, 33, 36, 36, 41, 41, 49, 49, 48, 48, 49, 31, 31,
+ 31, 31, 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 31, 31, 31, 31,
+ 31, 34, 34, 38, 38, 42, 42, 47, 47, 47, 47, 47, 30, 31, 31, 32, 32, 35,
+ 35, 40, 40, 42, 42, 46, 46, 45, 45, 45, 30, 31, 31, 32, 32, 35, 35, 40,
+ 40, 42, 42, 46, 46, 45, 45, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45,
+ 45, 47, 47, 46, 46, 45, 33, 34, 34, 35, 35, 39, 39, 43, 43, 45, 45, 47,
+ 47, 46, 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46,
+ 46, 45, 36, 38, 38, 40, 40, 43, 43, 47, 47, 47, 47, 48, 48, 46, 46, 45,
+ 41, 42, 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 41, 42,
+ 42, 42, 42, 45, 45, 47, 47, 48, 48, 50, 50, 49, 49, 49, 49, 47, 47, 46,
+ 46, 47, 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 49, 47, 47, 46, 46, 47,
+ 47, 48, 48, 50, 50, 53, 53, 53, 53, 53, 48, 47, 47, 45, 45, 46, 46, 46,
+ 46, 49, 49, 53, 53, 54, 54, 55, 48, 47, 47, 45, 45, 46, 46, 46, 46, 49,
+ 49, 53, 53, 54, 54, 55, 49, 47, 47, 45, 45, 45, 45, 45, 45, 49, 49, 53,
+ 53, 55, 55, 58,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 36, 36, 36, 39,
+ 41, 41, 41, 45, 49, 49, 49, 49, 48, 48, 48, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 35, 37, 37, 37, 39, 42, 42, 42, 45,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34,
+ 34, 36, 38, 38, 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38,
+ 38, 40, 42, 42, 42, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 35, 35, 35, 37, 39, 39, 39, 41, 42, 42,
+ 42, 44, 47, 47, 47, 46, 46, 46, 46, 46, 46, 46, 30, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46,
+ 46, 46, 45, 45, 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33,
+ 35, 35, 35, 37, 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45,
+ 45, 45, 45, 45, 30, 31, 31, 31, 31, 31, 32, 32, 32, 33, 35, 35, 35, 37,
+ 40, 40, 40, 41, 42, 42, 42, 44, 46, 46, 46, 46, 45, 45, 45, 45, 45, 45,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 35, 37, 37, 37, 39, 41, 41, 41, 42,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 46, 45, 45, 45, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46,
+ 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47,
+ 46, 46, 46, 46, 45, 45, 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 39, 39,
+ 39, 41, 43, 43, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 46, 46, 46, 46,
+ 45, 45, 35, 35, 36, 36, 36, 37, 37, 37, 37, 39, 41, 41, 41, 43, 45, 45,
+ 45, 45, 46, 46, 46, 47, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 36, 37,
+ 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 46, 46, 46, 46, 45, 45, 36, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 46, 46,
+ 46, 46, 45, 45, 39, 39, 40, 40, 40, 41, 41, 41, 41, 42, 44, 44, 44, 45,
+ 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 48, 48, 48, 48, 47, 47, 47,
+ 41, 42, 42, 42, 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49,
+ 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 41, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 45, 45, 45, 46, 47, 47, 47, 48, 48, 48, 48, 49, 50, 50, 50, 50,
+ 49, 49, 49, 49, 49, 49, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45, 46, 46,
+ 46, 47, 47, 47, 47, 48, 49, 49, 49, 50, 51, 51, 51, 51, 51, 51, 51, 51,
+ 51, 51, 49, 48, 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48,
+ 47, 47, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 47,
+ 46, 46, 46, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 47, 48, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53,
+ 53, 54, 54, 54, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46,
+ 46, 46, 46, 48, 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55,
+ 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48,
+ 49, 49, 49, 51, 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 48, 48, 47, 47,
+ 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46, 46, 48, 49, 49, 49, 51,
+ 53, 53, 53, 53, 54, 54, 54, 55, 55, 55, 49, 48, 47, 47, 47, 46, 45, 45,
+ 45, 45, 46, 46, 46, 46, 46, 46, 46, 47, 49, 49, 49, 51, 53, 53, 53, 54,
+ 55, 55, 55, 56, 57, 57, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45,
+ 45, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57,
+ 58, 58, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+ 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 55, 55, 55, 57, 58, 58,
+ /* Size 4x8 */
+ 31, 34, 42, 48, 31, 35, 42, 46, 33, 37, 44, 46, 36, 41, 46, 46, 40, 44,
+ 48, 48, 45, 46, 49, 51, 47, 47, 50, 54, 47, 46, 49, 55,
+ /* Size 8x4 */
+ 31, 31, 33, 36, 40, 45, 47, 47, 34, 35, 37, 41, 44, 46, 47, 46, 42, 42,
+ 44, 46, 48, 49, 50, 49, 48, 46, 46, 46, 48, 51, 54, 55,
+ /* Size 8x16 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31,
+ 31, 38, 38, 47, 47, 47, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40,
+ 40, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47,
+ 47, 46, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 49, 46,
+ 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 48, 46, 46, 47,
+ 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 49, 45, 45, 46, 46, 53,
+ 53, 58,
+ /* Size 16x8 */
+ 32, 31, 31, 30, 30, 33, 33, 37, 37, 42, 42, 49, 49, 48, 48, 49, 31, 31,
+ 31, 32, 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 31, 31, 31, 32,
+ 32, 36, 36, 40, 40, 43, 43, 46, 46, 46, 46, 45, 37, 38, 38, 40, 40, 43,
+ 43, 47, 47, 47, 47, 48, 48, 47, 47, 46, 37, 38, 38, 40, 40, 43, 43, 47,
+ 47, 47, 47, 48, 48, 47, 47, 46, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50,
+ 50, 53, 53, 53, 53, 53, 48, 47, 47, 46, 46, 47, 47, 47, 47, 50, 50, 53,
+ 53, 53, 53, 53, 49, 47, 47, 45, 45, 46, 46, 45, 45, 49, 49, 53, 53, 56,
+ 56, 58,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 33, 37, 37, 37, 42, 48, 48, 48, 48, 49, 49, 31, 31,
+ 31, 31, 31, 34, 37, 37, 37, 42, 47, 47, 47, 48, 48, 48, 31, 31, 31, 31,
+ 31, 34, 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34,
+ 38, 38, 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 34, 38, 38,
+ 38, 42, 47, 47, 47, 47, 47, 47, 31, 31, 32, 32, 32, 35, 39, 39, 39, 42,
+ 46, 46, 46, 46, 46, 46, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46,
+ 46, 45, 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45,
+ 45, 45, 30, 31, 32, 32, 32, 35, 40, 40, 40, 42, 46, 46, 46, 45, 45, 45,
+ 32, 33, 34, 34, 34, 37, 41, 41, 41, 44, 46, 46, 46, 46, 45, 45, 33, 34,
+ 36, 36, 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36,
+ 36, 39, 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 33, 34, 36, 36, 36, 39,
+ 43, 43, 43, 45, 47, 47, 47, 46, 46, 46, 35, 36, 38, 38, 38, 41, 45, 45,
+ 45, 46, 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47,
+ 47, 47, 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47,
+ 47, 46, 45, 45, 37, 38, 40, 40, 40, 43, 47, 47, 47, 47, 47, 47, 47, 46,
+ 45, 45, 39, 40, 41, 41, 41, 44, 47, 47, 47, 48, 49, 49, 49, 48, 47, 47,
+ 42, 42, 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42,
+ 43, 43, 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 42, 42, 43, 43,
+ 43, 45, 47, 47, 47, 48, 50, 50, 50, 50, 49, 49, 45, 45, 44, 44, 44, 46,
+ 47, 47, 47, 49, 51, 51, 51, 51, 51, 51, 49, 48, 46, 46, 46, 47, 48, 48,
+ 48, 50, 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50,
+ 53, 53, 53, 53, 53, 53, 49, 48, 46, 46, 46, 47, 48, 48, 48, 50, 53, 53,
+ 53, 53, 53, 53, 48, 47, 46, 46, 46, 47, 47, 47, 47, 50, 53, 53, 53, 54,
+ 54, 54, 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56,
+ 48, 47, 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47,
+ 46, 46, 46, 46, 47, 47, 47, 50, 53, 53, 53, 54, 56, 56, 48, 47, 45, 45,
+ 45, 46, 46, 46, 46, 49, 53, 53, 53, 55, 57, 57, 49, 47, 45, 45, 45, 45,
+ 46, 46, 46, 49, 53, 53, 53, 56, 58, 58, 49, 47, 45, 45, 45, 45, 46, 46,
+ 46, 49, 53, 53, 53, 56, 58, 58,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 34, 34, 36, 38, 38, 38, 40, 42, 42, 42, 45,
+ 48, 48, 48, 47, 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36,
+ 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45,
+ 45, 45, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40,
+ 40, 41, 43, 43, 43, 44, 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 37, 39, 39, 39, 41, 43, 43, 43, 44, 45, 45,
+ 45, 46, 47, 47, 47, 47, 46, 46, 46, 46, 45, 45, 37, 37, 38, 38, 38, 39,
+ 40, 40, 40, 41, 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48,
+ 48, 47, 47, 47, 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41,
+ 43, 43, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47,
+ 47, 46, 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45,
+ 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 44, 45, 45, 45, 46, 47, 47, 47, 48,
+ 48, 48, 48, 49, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 48, 47, 47, 47,
+ 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51,
+ 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53,
+ 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 47, 47,
+ 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53, 53, 53, 53, 53, 53, 53,
+ 53, 53, 48, 48, 47, 47, 47, 46, 45, 45, 45, 46, 46, 46, 46, 46, 46, 46,
+ 46, 48, 50, 50, 50, 51, 53, 53, 53, 54, 54, 54, 54, 55, 56, 56, 49, 48,
+ 47, 47, 47, 46, 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49,
+ 49, 51, 53, 53, 53, 54, 56, 56, 56, 57, 58, 58, 49, 48, 47, 47, 47, 46,
+ 45, 45, 45, 45, 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53,
+ 53, 54, 56, 56, 56, 57, 58, 58,
+ /* Size 4x16 */
+ 31, 33, 42, 48, 31, 34, 42, 47, 31, 34, 42, 47, 31, 35, 42, 45, 31, 35,
+ 42, 45, 34, 39, 45, 46, 34, 39, 45, 46, 38, 43, 47, 46, 38, 43, 47, 46,
+ 42, 45, 48, 50, 42, 45, 48, 50, 48, 47, 50, 53, 48, 47, 50, 53, 47, 46,
+ 50, 54, 47, 46, 50, 54, 47, 45, 49, 56,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 34, 34, 38, 38, 42, 42, 48, 48, 47, 47, 47, 33, 34,
+ 34, 35, 35, 39, 39, 43, 43, 45, 45, 47, 47, 46, 46, 45, 42, 42, 42, 42,
+ 42, 45, 45, 47, 47, 48, 48, 50, 50, 50, 50, 49, 48, 47, 47, 45, 45, 46,
+ 46, 46, 46, 50, 50, 53, 53, 54, 54, 56,
+ /* Size 8x32 */
+ 32, 31, 31, 37, 37, 48, 48, 49, 31, 31, 31, 37, 37, 47, 47, 48, 31, 31,
+ 31, 38, 38, 47, 47, 47, 31, 31, 31, 38, 38, 47, 47, 47, 31, 31, 31, 38,
+ 38, 47, 47, 47, 31, 32, 32, 39, 39, 46, 46, 46, 30, 32, 32, 40, 40, 46,
+ 46, 45, 30, 32, 32, 40, 40, 46, 46, 45, 30, 32, 32, 40, 40, 46, 46, 45,
+ 32, 34, 34, 41, 41, 46, 46, 45, 33, 36, 36, 43, 43, 47, 47, 46, 33, 36,
+ 36, 43, 43, 47, 47, 46, 33, 36, 36, 43, 43, 47, 47, 46, 35, 38, 38, 45,
+ 45, 47, 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 37, 40, 40, 47, 47, 47,
+ 47, 45, 37, 40, 40, 47, 47, 47, 47, 45, 39, 41, 41, 47, 47, 49, 49, 47,
+ 42, 43, 43, 47, 47, 50, 50, 49, 42, 43, 43, 47, 47, 50, 50, 49, 42, 43,
+ 43, 47, 47, 50, 50, 49, 45, 44, 44, 47, 47, 51, 51, 51, 49, 46, 46, 48,
+ 48, 53, 53, 53, 49, 46, 46, 48, 48, 53, 53, 53, 49, 46, 46, 48, 48, 53,
+ 53, 53, 48, 46, 46, 47, 47, 53, 53, 54, 48, 46, 46, 47, 47, 53, 53, 56,
+ 48, 46, 46, 47, 47, 53, 53, 56, 48, 46, 46, 47, 47, 53, 53, 56, 48, 45,
+ 45, 46, 46, 53, 53, 57, 49, 45, 45, 46, 46, 53, 53, 58, 49, 45, 45, 46,
+ 46, 53, 53, 58,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 33, 33, 35, 37, 37, 37, 39,
+ 42, 42, 42, 45, 49, 49, 49, 48, 48, 48, 48, 48, 49, 49, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44,
+ 46, 46, 46, 46, 46, 46, 46, 45, 45, 45, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 34, 36, 36, 36, 38, 40, 40, 40, 41, 43, 43, 43, 44, 46, 46, 46, 46,
+ 46, 46, 46, 45, 45, 45, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43,
+ 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 37, 37, 38, 38, 38, 39, 40, 40, 40, 41, 43, 43, 43, 45, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50,
+ 50, 51, 53, 53, 53, 53, 53, 53, 53, 53, 53, 53, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 49, 50, 50, 50, 51, 53, 53,
+ 53, 53, 53, 53, 53, 53, 53, 53, 49, 48, 47, 47, 47, 46, 45, 45, 45, 45,
+ 46, 46, 46, 45, 45, 45, 45, 47, 49, 49, 49, 51, 53, 53, 53, 54, 56, 56,
+ 56, 57, 58, 58 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 35, 32, 32, 33, 35, 32, 33, 35, 38, 35, 35, 38, 46,
+ /* Size 8x8 */
+ 31, 31, 31, 32, 32, 32, 34, 35, 31, 32, 32, 32, 32, 33, 34, 35, 31, 32,
+ 32, 32, 32, 33, 33, 34, 32, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 34,
+ 35, 35, 36, 38, 32, 33, 33, 34, 35, 36, 38, 40, 34, 34, 33, 35, 36, 38,
+ 39, 42, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 34, 36, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 36, 36, 31, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 35, 35, 36,
+ 36, 36, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 36, 37, 37, 38, 38, 33, 33, 33, 33,
+ 33, 33, 34, 35, 35, 36, 36, 38, 39, 40, 42, 42, 34, 34, 34, 34, 33, 33,
+ 35, 35, 36, 37, 37, 39, 39, 41, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 37, 37, 40, 41, 42, 45, 45, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38,
+ 38, 42, 42, 45, 48, 48, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 42,
+ 42, 45, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 37, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 37, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37,
+ 37, 38, 38, 38, 38, 39, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 39, 40, 40,
+ 40, 41, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35,
+ 35, 36, 36, 36, 36, 37, 38, 39, 39, 39, 40, 41, 42, 42, 42, 42, 34, 34,
+ 34, 34, 34, 34, 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37,
+ 37, 38, 39, 39, 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34,
+ 34, 33, 33, 33, 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39,
+ 39, 39, 41, 42, 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 38, 39, 39, 39, 39, 41, 42,
+ 42, 42, 42, 43, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36,
+ 36, 36, 36, 37, 37, 37, 37, 38, 40, 41, 41, 41, 42, 44, 45, 45, 45, 45,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 37,
+ 38, 38, 38, 39, 41, 42, 42, 42, 44, 46, 47, 47, 47, 48, 36, 35, 35, 35,
+ 35, 35, 35, 34, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40,
+ 42, 42, 42, 42, 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42,
+ 45, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 42, 42, 42, 42, 45, 47, 48, 48,
+ 48, 49, 37, 37, 36, 36, 36, 36, 36, 35, 35, 35, 35, 36, 37, 37, 37, 37,
+ 38, 39, 39, 39, 39, 41, 42, 43, 43, 43, 45, 48, 49, 49, 49, 50,
+ /* Size 4x8 */
+ 31, 31, 32, 35, 32, 32, 32, 35, 32, 32, 33, 34, 32, 32, 34, 36, 32, 33,
+ 35, 38, 33, 33, 36, 40, 34, 34, 37, 42, 35, 34, 38, 48,
+ /* Size 8x4 */
+ 31, 32, 32, 32, 32, 33, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34, 32, 32,
+ 33, 34, 35, 36, 37, 38, 35, 35, 34, 36, 38, 40, 42, 48,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32,
+ 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 33, 34, 34,
+ 35, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 37,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 33, 33,
+ 33, 35, 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35,
+ 37, 37, 43, 44, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38,
+ 46, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 34, 34, 36, 36, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 34, 34,
+ 34, 35, 35, 36, 37, 37, 38, 38, 32, 32, 32, 32, 33, 33, 34, 34, 34, 35,
+ 35, 36, 37, 37, 38, 38, 35, 35, 35, 34, 34, 34, 35, 36, 36, 37, 37, 40,
+ 41, 43, 46, 46, 36, 35, 35, 35, 34, 34, 36, 36, 37, 38, 38, 41, 42, 44,
+ 48, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35,
+ 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 35, 35, 35, 35, 31, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 34, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 34, 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34,
+ 34, 35, 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35,
+ 36, 36, 36, 36, 32, 32, 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 37,
+ 37, 37, 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 37, 38, 38, 38,
+ 32, 32, 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32,
+ 32, 33, 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 32, 32, 33,
+ 33, 33, 34, 35, 35, 35, 35, 36, 37, 38, 38, 38, 32, 33, 33, 33, 33, 33,
+ 34, 35, 36, 36, 36, 37, 39, 40, 40, 40, 33, 33, 33, 33, 33, 33, 35, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37,
+ 37, 39, 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 34, 34, 34, 34, 34, 34, 35, 36, 37, 37, 37, 39, 41, 42,
+ 42, 42, 34, 34, 34, 34, 34, 34, 35, 37, 37, 37, 37, 40, 43, 44, 44, 44,
+ 35, 35, 34, 34, 34, 34, 36, 37, 38, 38, 38, 41, 45, 47, 47, 47, 36, 35,
+ 35, 34, 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34,
+ 34, 34, 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 36, 35, 35, 34, 34, 34,
+ 36, 37, 38, 38, 38, 42, 46, 48, 48, 48, 37, 36, 36, 36, 36, 36, 37, 38,
+ 39, 39, 39, 42, 46, 49, 49, 49,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 36, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 36, 36, 36, 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37,
+ 37, 37, 37, 38, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 36,
+ 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 38, 39, 39, 39,
+ 40, 41, 42, 42, 42, 42, 35, 35, 35, 35, 35, 35, 34, 34, 34, 34, 34, 35,
+ 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41, 41, 41, 43, 45, 46, 46,
+ 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36,
+ 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35,
+ 35, 35, 35, 35, 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 47, 48, 48, 48, 49, 36, 35, 35, 35, 35, 35,
+ 35, 35, 34, 34, 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42,
+ 42, 42, 44, 47, 48, 48, 48, 49,
+ /* Size 4x16 */
+ 31, 31, 32, 36, 31, 32, 32, 35, 32, 32, 32, 35, 32, 32, 32, 35, 32, 32,
+ 33, 34, 32, 32, 33, 34, 32, 32, 34, 36, 32, 32, 34, 36, 32, 32, 34, 37,
+ 32, 33, 35, 38, 32, 33, 35, 38, 33, 33, 36, 41, 34, 34, 37, 42, 34, 34,
+ 37, 44, 35, 34, 38, 48, 35, 34, 38, 48,
+ /* Size 16x4 */
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 32, 32, 32, 32,
+ 33, 33, 34, 34, 34, 35, 35, 36, 37, 37, 38, 38, 36, 35, 35, 35, 34, 34,
+ 36, 36, 37, 38, 38, 41, 42, 44, 48, 48,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 32, 32, 35, 36, 31, 31, 31, 32, 32, 32, 35, 35, 31, 32,
+ 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 32, 35, 35, 31, 32, 32, 32, 32, 32,
+ 34, 35, 31, 32, 32, 32, 32, 32, 34, 35, 31, 32, 32, 32, 33, 33, 34, 34,
+ 31, 32, 32, 32, 33, 33, 34, 34, 31, 32, 32, 32, 33, 33, 34, 34, 31, 32,
+ 32, 33, 33, 33, 35, 35, 31, 32, 32, 33, 34, 34, 35, 36, 32, 32, 32, 33,
+ 34, 34, 36, 36, 32, 32, 32, 33, 34, 34, 36, 36, 32, 32, 32, 33, 34, 34,
+ 36, 36, 32, 32, 32, 33, 34, 34, 36, 37, 32, 32, 33, 33, 35, 35, 37, 38,
+ 32, 32, 33, 34, 35, 35, 37, 38, 32, 32, 33, 34, 35, 35, 37, 38, 32, 32,
+ 33, 34, 35, 35, 37, 38, 32, 33, 33, 34, 36, 36, 39, 40, 33, 33, 33, 35,
+ 36, 36, 40, 41, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37,
+ 41, 42, 34, 34, 34, 35, 37, 37, 41, 42, 34, 34, 34, 35, 37, 37, 43, 44,
+ 35, 34, 34, 36, 38, 38, 45, 47, 36, 35, 34, 36, 38, 38, 46, 48, 36, 35,
+ 34, 36, 38, 38, 46, 48, 36, 35, 34, 36, 38, 38, 46, 48, 37, 36, 36, 37,
+ 39, 39, 46, 49,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 34, 34, 35, 36, 36, 36, 37, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 34, 34, 34, 35, 35, 35, 36, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 37, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 35, 35, 35, 35, 35, 35,
+ 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 39, 40, 41,
+ 41, 41, 43, 45, 46, 46, 46, 46, 36, 35, 35, 35, 35, 35, 35, 35, 34, 34,
+ 34, 35, 36, 36, 36, 36, 37, 38, 38, 38, 38, 40, 41, 42, 42, 42, 44, 47,
+ 48, 48, 48, 49 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 32, 38, 46, 32, 34, 41, 46, 38, 41, 47, 47, 46, 46, 47, 52,
+ /* Size 8x8 */
+ 31, 31, 30, 34, 36, 39, 42, 48, 31, 31, 31, 34, 37, 40, 42, 47, 30, 31,
+ 32, 35, 39, 41, 42, 46, 34, 34, 35, 39, 42, 44, 45, 47, 36, 37, 39, 42,
+ 46, 47, 47, 47, 39, 40, 41, 44, 47, 47, 48, 49, 42, 42, 42, 45, 47, 48,
+ 48, 50, 48, 47, 46, 47, 47, 49, 50, 53,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 34, 36, 36, 40, 41, 44, 49, 49, 31, 31,
+ 31, 31, 31, 31, 33, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 35, 36, 39, 39, 41, 42, 44, 47, 47, 30, 31, 31, 31, 32, 32, 34, 35,
+ 37, 40, 40, 42, 42, 44, 46, 46, 30, 31, 31, 31, 32, 32, 34, 35, 37, 40,
+ 40, 42, 42, 44, 46, 46, 33, 33, 34, 34, 34, 34, 37, 38, 40, 42, 42, 44,
+ 44, 45, 47, 47, 33, 34, 34, 35, 35, 35, 38, 39, 40, 43, 43, 44, 45, 46,
+ 47, 47, 34, 36, 36, 36, 37, 37, 40, 40, 42, 45, 45, 45, 46, 46, 47, 47,
+ 36, 38, 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 36, 38,
+ 38, 39, 40, 40, 42, 43, 45, 47, 47, 47, 47, 47, 48, 48, 40, 41, 41, 41,
+ 42, 42, 44, 44, 45, 47, 47, 48, 48, 49, 50, 50, 41, 42, 42, 42, 42, 42,
+ 44, 45, 46, 47, 47, 48, 48, 49, 50, 50, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 47, 47, 49, 49, 50, 51, 51, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48,
+ 48, 50, 50, 51, 53, 53, 49, 48, 47, 47, 46, 46, 47, 47, 47, 48, 48, 50,
+ 50, 51, 53, 53,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 34, 36,
+ 36, 36, 36, 38, 40, 41, 41, 41, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 35, 36, 37, 37, 37, 39,
+ 41, 42, 42, 42, 44, 47, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 34, 34, 34, 36, 37, 38, 38, 38, 39, 41, 42, 42, 42,
+ 44, 46, 48, 48, 48, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 34, 34, 34, 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47,
+ 47, 47, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34,
+ 36, 37, 38, 38, 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38,
+ 38, 40, 41, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 33, 34, 35, 35, 35, 36, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 47, 47, 47, 47, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 41, 42, 42, 42, 42, 44, 46,
+ 46, 46, 46, 46, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35,
+ 35, 35, 37, 39, 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39,
+ 40, 40, 40, 41, 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 39, 40, 40, 40, 41,
+ 42, 42, 42, 42, 44, 45, 46, 46, 46, 46, 31, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 34, 36, 37, 37, 37, 38, 40, 41, 41, 41, 42, 43, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 36,
+ 37, 38, 38, 38, 40, 41, 42, 42, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 46, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39,
+ 40, 42, 43, 43, 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43,
+ 43, 44, 44, 45, 45, 45, 46, 47, 47, 47, 47, 47, 33, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 43, 43, 43, 44, 44, 45,
+ 45, 45, 46, 47, 47, 47, 47, 47, 34, 35, 36, 36, 36, 36, 36, 37, 37, 37,
+ 37, 38, 40, 40, 40, 40, 42, 44, 45, 45, 45, 45, 45, 46, 46, 46, 46, 47,
+ 47, 47, 47, 47, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39, 39, 40, 41, 42,
+ 42, 42, 44, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 36, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 47, 36, 37, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 41, 42, 43, 43, 43, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48, 48, 47, 38, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42,
+ 43, 44, 44, 44, 45, 47, 47, 47, 47, 47, 48, 48, 48, 48, 48, 48, 49, 49,
+ 49, 48, 40, 41, 41, 41, 41, 41, 41, 42, 42, 42, 42, 43, 44, 44, 44, 44,
+ 45, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 49, 50, 50, 50, 49, 41, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47,
+ 47, 48, 48, 48, 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48,
+ 48, 48, 49, 50, 50, 50, 50, 50, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 43, 44, 45, 45, 45, 46, 47, 47, 47, 47, 48, 48, 48, 48, 48, 49, 50,
+ 50, 50, 50, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 45, 46,
+ 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 51, 51, 51, 51,
+ 47, 47, 46, 46, 46, 46, 46, 46, 45, 45, 45, 46, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 52, 52, 52, 52, 49, 48, 48, 47,
+ 47, 47, 47, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49,
+ 50, 50, 50, 50, 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46,
+ 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50,
+ 51, 52, 53, 53, 53, 53, 49, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46,
+ 47, 47, 47, 47, 47, 47, 48, 48, 48, 49, 50, 50, 50, 50, 51, 52, 53, 53,
+ 53, 53, 49, 48, 47, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 4x8 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 32, 40, 46, 34, 36, 43, 47, 37, 39,
+ 46, 47, 39, 41, 47, 48, 42, 43, 47, 50, 48, 46, 48, 53,
+ /* Size 8x4 */
+ 31, 31, 31, 34, 37, 39, 42, 48, 31, 31, 32, 36, 39, 41, 43, 46, 37, 38,
+ 40, 43, 46, 47, 47, 48, 48, 47, 46, 47, 47, 48, 50, 53,
+ /* Size 8x16 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39, 45, 46, 30, 32, 32, 35,
+ 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 33, 34, 35, 37, 42, 42,
+ 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 35, 37, 37, 40, 44, 44, 46, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 41, 42,
+ 42, 44, 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45,
+ 47, 47, 50, 51, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48,
+ 52, 53,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 30, 30, 33, 33, 35, 37, 37, 41, 42, 44, 49, 49, 31, 31,
+ 31, 31, 32, 32, 34, 35, 37, 39, 39, 42, 42, 44, 47, 47, 31, 31, 31, 32,
+ 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 33, 34, 34, 34, 35, 35,
+ 37, 38, 40, 43, 43, 44, 44, 45, 47, 47, 37, 38, 38, 39, 40, 40, 42, 43,
+ 44, 47, 47, 47, 47, 47, 48, 48, 37, 38, 38, 39, 40, 40, 42, 43, 44, 47,
+ 47, 47, 47, 47, 48, 48, 45, 45, 45, 45, 44, 44, 46, 46, 46, 47, 47, 49,
+ 49, 50, 52, 52, 48, 47, 47, 46, 46, 46, 47, 47, 47, 47, 47, 49, 50, 51,
+ 53, 53,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 40, 45, 48, 48, 48, 31, 31,
+ 31, 31, 31, 31, 33, 36, 37, 37, 37, 41, 45, 48, 48, 48, 31, 31, 31, 31,
+ 31, 31, 34, 36, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31,
+ 34, 37, 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37,
+ 38, 38, 38, 41, 45, 47, 47, 47, 31, 31, 31, 31, 31, 31, 34, 37, 38, 38,
+ 38, 41, 45, 47, 47, 47, 31, 31, 31, 32, 32, 32, 34, 37, 39, 39, 39, 41,
+ 45, 46, 46, 46, 30, 31, 31, 32, 32, 32, 34, 38, 39, 39, 39, 42, 44, 46,
+ 46, 46, 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46,
+ 30, 31, 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 30, 31,
+ 32, 32, 32, 32, 35, 38, 40, 40, 40, 42, 44, 46, 46, 46, 31, 32, 33, 33,
+ 33, 33, 36, 39, 41, 41, 41, 43, 45, 46, 46, 46, 33, 34, 34, 35, 35, 35,
+ 37, 40, 42, 42, 42, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41,
+ 43, 43, 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43,
+ 43, 44, 46, 47, 47, 47, 33, 34, 35, 36, 36, 36, 38, 41, 43, 43, 43, 44,
+ 46, 47, 47, 47, 35, 36, 37, 37, 37, 37, 40, 43, 44, 44, 44, 45, 46, 47,
+ 47, 47, 36, 37, 38, 39, 39, 39, 42, 44, 46, 46, 46, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38,
+ 39, 40, 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 37, 38, 39, 40,
+ 40, 40, 43, 45, 47, 47, 47, 47, 47, 47, 47, 47, 39, 39, 40, 41, 41, 41,
+ 43, 46, 47, 47, 47, 48, 48, 48, 48, 48, 41, 41, 42, 42, 42, 42, 44, 46,
+ 47, 47, 47, 48, 49, 49, 49, 49, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 48, 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48,
+ 49, 50, 50, 50, 42, 42, 42, 43, 43, 43, 44, 46, 47, 47, 47, 48, 49, 50,
+ 50, 50, 44, 44, 44, 44, 44, 44, 45, 47, 47, 47, 47, 49, 50, 51, 51, 51,
+ 47, 46, 46, 46, 46, 46, 46, 47, 48, 48, 48, 49, 51, 52, 52, 52, 49, 48,
+ 47, 46, 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46,
+ 46, 46, 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46,
+ 47, 48, 48, 48, 48, 50, 52, 53, 53, 53, 49, 48, 47, 46, 46, 46, 47, 47,
+ 47, 47, 47, 49, 52, 53, 53, 53,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 36, 37, 38, 38, 38, 39,
+ 41, 42, 42, 42, 44, 46, 48, 48, 48, 48, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40, 42, 42, 42, 42,
+ 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46,
+ 46, 46, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36,
+ 37, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40,
+ 40, 41, 42, 43, 43, 43, 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 36, 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44,
+ 44, 44, 45, 46, 47, 47, 47, 47, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38,
+ 38, 39, 40, 41, 41, 41, 43, 44, 45, 45, 45, 46, 46, 46, 46, 46, 47, 47,
+ 48, 48, 48, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47,
+ 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46,
+ 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 40, 41, 41, 41, 41, 41, 41, 42,
+ 42, 42, 42, 43, 44, 44, 44, 44, 45, 47, 47, 47, 47, 48, 48, 48, 48, 48,
+ 49, 49, 50, 50, 50, 49, 45, 45, 45, 45, 45, 45, 45, 44, 44, 44, 44, 45,
+ 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49, 49, 49, 50, 51, 52, 52,
+ 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47,
+ 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48,
+ 47, 47, 47, 47, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47,
+ 47, 48, 49, 50, 50, 50, 51, 52, 53, 53, 53, 53, 48, 48, 47, 47, 47, 47,
+ 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50,
+ 50, 50, 51, 52, 53, 53, 53, 53,
+ /* Size 4x16 */
+ 31, 31, 37, 48, 31, 31, 38, 47, 31, 31, 38, 47, 31, 32, 39, 46, 31, 32,
+ 40, 46, 31, 32, 40, 46, 34, 35, 42, 47, 34, 36, 43, 47, 36, 37, 44, 47,
+ 38, 40, 47, 47, 38, 40, 47, 47, 41, 42, 47, 49, 42, 43, 47, 50, 44, 44,
+ 47, 51, 48, 46, 48, 53, 48, 46, 48, 53,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 36, 38, 38, 41, 42, 44, 48, 48, 31, 31,
+ 31, 32, 32, 32, 35, 36, 37, 40, 40, 42, 43, 44, 46, 46, 37, 38, 38, 39,
+ 40, 40, 42, 43, 44, 47, 47, 47, 47, 47, 48, 48, 48, 47, 47, 46, 46, 46,
+ 47, 47, 47, 47, 47, 49, 50, 51, 53, 53,
+ /* Size 8x32 */
+ 32, 31, 31, 33, 37, 37, 45, 48, 31, 31, 31, 33, 37, 37, 45, 48, 31, 31,
+ 31, 34, 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 31, 34,
+ 38, 38, 45, 47, 31, 31, 31, 34, 38, 38, 45, 47, 31, 31, 32, 34, 39, 39,
+ 45, 46, 30, 31, 32, 34, 39, 39, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46,
+ 30, 32, 32, 35, 40, 40, 44, 46, 30, 32, 32, 35, 40, 40, 44, 46, 31, 33,
+ 33, 36, 41, 41, 45, 46, 33, 34, 35, 37, 42, 42, 46, 47, 33, 35, 36, 38,
+ 43, 43, 46, 47, 33, 35, 36, 38, 43, 43, 46, 47, 33, 35, 36, 38, 43, 43,
+ 46, 47, 35, 37, 37, 40, 44, 44, 46, 47, 36, 38, 39, 42, 46, 46, 47, 47,
+ 37, 39, 40, 43, 47, 47, 47, 47, 37, 39, 40, 43, 47, 47, 47, 47, 37, 39,
+ 40, 43, 47, 47, 47, 47, 39, 40, 41, 43, 47, 47, 48, 48, 41, 42, 42, 44,
+ 47, 47, 49, 49, 42, 42, 43, 44, 47, 47, 49, 50, 42, 42, 43, 44, 47, 47,
+ 49, 50, 42, 42, 43, 44, 47, 47, 49, 50, 44, 44, 44, 45, 47, 47, 50, 51,
+ 47, 46, 46, 46, 48, 48, 51, 52, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47,
+ 46, 47, 48, 48, 52, 53, 49, 47, 46, 47, 48, 48, 52, 53, 49, 47, 46, 47,
+ 47, 47, 52, 53,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 35, 36,
+ 37, 37, 37, 39, 41, 42, 42, 42, 44, 47, 49, 49, 49, 49, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 37, 38, 39, 39, 39, 40,
+ 42, 42, 42, 42, 44, 46, 47, 47, 47, 47, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 37, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 46, 46, 46, 46, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 40, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 46, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 37, 37,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 41, 42, 43, 43, 43, 44, 46, 47, 47,
+ 47, 47, 47, 47, 47, 47, 47, 48, 48, 48, 48, 47, 45, 45, 45, 45, 45, 45,
+ 45, 44, 44, 44, 44, 45, 46, 46, 46, 46, 46, 47, 47, 47, 47, 48, 49, 49,
+ 49, 49, 50, 51, 52, 52, 52, 52, 48, 48, 47, 47, 47, 47, 46, 46, 46, 46,
+ 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 48, 49, 50, 50, 50, 51, 52,
+ 53, 53, 53, 53 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 33, 34, 32, 33, 34, 35,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32,
+ 33, 33, 34, 35, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32, 34, 34,
+ 35, 36, 33, 33, 33, 33, 35, 35, 36, 38,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 35, 35, 35, 36, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 34, 35, 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 35, 36, 36, 36, 38, 34, 34, 34, 34, 34, 33, 33, 34, 35, 35, 35, 36,
+ 37, 37, 38, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34,
+ 34, 34, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 36, 36, 36, 36, 36, 37, 38, 38, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 37, 38, 38, 38, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38,
+ 39, 39, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 4x8 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32,
+ 33, 34, 32, 32, 34, 34, 32, 33, 34, 35, 33, 33, 35, 36,
+ /* Size 8x4 */
+ 31, 31, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 33, 34, 34, 35, 32, 32, 32, 33, 34, 34, 35, 36,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 33, 31, 32,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33,
+ 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 35, 35, 35, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33,
+ 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 34, 34, 34, 34, 35, 37,
+ 37, 38,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 34, 34, 34, 35, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 34, 35, 35, 35, 36, 37, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35,
+ 35, 35, 36, 37, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 36, 36,
+ 36, 38,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 34, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 34, 34,
+ 34, 35, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34, 34, 34, 34, 35, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 34, 34, 34, 34, 35, 35, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 35, 35, 35, 35, 36, 32, 32, 32, 32, 33, 33, 33, 33, 33, 34,
+ 35, 35, 35, 35, 36, 36, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35,
+ 35, 35, 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35,
+ 36, 37, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37,
+ 32, 32, 32, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 36, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 36, 38, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 35, 36, 36, 36, 36, 37, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 38, 39, 34, 34, 34, 34, 34, 34, 34, 34,
+ 35, 36, 37, 37, 37, 37, 38, 39,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 38, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 33, 33, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 36,
+ 37, 37, 37, 37, 38, 38, 39, 39,
+ /* Size 4x16 */
+ 31, 31, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 33, 33, 32, 32, 33, 34,
+ 32, 32, 33, 34, 32, 32, 33, 34, 32, 32, 34, 35, 32, 33, 34, 35, 32, 33,
+ 34, 35, 33, 33, 35, 36, 34, 34, 36, 37,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 35, 36, 32, 32, 32, 32, 32, 33,
+ 33, 33, 34, 34, 34, 35, 35, 35, 36, 37,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32,
+ 32, 33, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 32, 32, 33,
+ 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32,
+ 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32,
+ 32, 33, 33, 33, 31, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 33, 33,
+ 33, 34, 31, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34,
+ 32, 32, 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 34, 32, 32,
+ 32, 32, 33, 34, 34, 34, 32, 32, 32, 32, 33, 34, 34, 35, 32, 32, 32, 32,
+ 33, 35, 35, 35, 32, 32, 33, 33, 33, 35, 35, 36, 32, 32, 33, 33, 34, 35,
+ 35, 36, 32, 32, 33, 33, 34, 35, 35, 36, 32, 32, 33, 33, 34, 35, 35, 36,
+ 32, 32, 33, 33, 34, 35, 35, 36, 32, 33, 33, 33, 34, 36, 36, 36, 33, 33,
+ 33, 33, 34, 36, 36, 37, 34, 34, 34, 34, 35, 37, 37, 38, 34, 34, 34, 34,
+ 35, 37, 37, 38,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 35, 35, 35, 35, 35, 35, 36, 36, 37, 37, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 35, 36, 36, 37, 37, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 36, 36, 36, 36, 36,
+ 36, 37, 38, 38 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 34, 38, 31, 32, 35, 40, 34, 35, 39, 43, 38, 40, 43, 47,
+ /* Size 8x8 */
+ 31, 31, 31, 30, 34, 35, 37, 40, 31, 31, 31, 31, 34, 35, 38, 41, 31, 31,
+ 31, 31, 35, 36, 39, 41, 30, 31, 31, 32, 35, 36, 40, 42, 34, 34, 35, 35,
+ 39, 40, 43, 44, 35, 35, 36, 36, 40, 41, 44, 45, 37, 38, 39, 40, 43, 44,
+ 47, 47, 40, 41, 41, 42, 44, 45, 47, 48,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 36, 36, 38, 41, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 37, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 35, 35, 37, 39, 39, 40, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35,
+ 35, 38, 40, 40, 41, 42, 30, 31, 31, 31, 31, 32, 32, 32, 34, 35, 35, 38,
+ 40, 40, 41, 42, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40,
+ 41, 43, 33, 33, 34, 34, 34, 34, 34, 35, 37, 38, 38, 41, 42, 42, 43, 44,
+ 33, 34, 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 33, 34,
+ 34, 34, 35, 35, 35, 36, 38, 39, 39, 41, 43, 43, 44, 45, 35, 36, 37, 37,
+ 37, 38, 38, 38, 41, 41, 41, 44, 46, 46, 46, 46, 36, 37, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 46, 47, 47, 47, 47, 36, 37, 38, 38, 39, 40, 40, 40,
+ 42, 43, 43, 46, 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44,
+ 44, 46, 47, 47, 47, 48, 41, 42, 42, 42, 42, 42, 42, 43, 44, 45, 45, 46,
+ 47, 47, 48, 48,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 38, 40, 41, 41, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35, 36, 37, 37, 37,
+ 37, 38, 39, 40, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 34, 34, 34, 34, 34, 35, 36, 38, 38, 38, 38, 38, 40, 41,
+ 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33,
+ 34, 34, 34, 34, 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34,
+ 34, 35, 37, 38, 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 35, 37, 38,
+ 38, 38, 38, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 38, 39,
+ 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 42, 42,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35,
+ 35, 35, 35, 36, 37, 39, 39, 39, 39, 40, 40, 41, 42, 42, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36,
+ 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40,
+ 40, 40, 41, 42, 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42,
+ 42, 42, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 33,
+ 34, 35, 35, 35, 35, 36, 38, 39, 40, 40, 40, 40, 41, 42, 42, 42, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 40, 40, 40, 40, 41, 41, 42, 43, 43, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 38, 39, 41,
+ 41, 41, 41, 42, 42, 43, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 34, 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 42, 42, 43,
+ 43, 44, 44, 44, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35,
+ 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45,
+ 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39,
+ 39, 39, 39, 40, 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34,
+ 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40,
+ 41, 43, 43, 43, 43, 43, 44, 44, 45, 45, 33, 34, 34, 34, 34, 34, 34, 34,
+ 35, 35, 35, 35, 35, 35, 36, 37, 38, 39, 39, 39, 39, 40, 41, 43, 43, 43,
+ 43, 43, 44, 44, 45, 45, 34, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36, 36,
+ 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 44, 44, 44, 44, 44, 45, 45,
+ 45, 45, 35, 36, 36, 36, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 39,
+ 41, 41, 41, 41, 41, 42, 44, 45, 46, 46, 46, 46, 46, 46, 46, 46, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39,
+ 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47,
+ 47, 47, 47, 47, 36, 37, 37, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40,
+ 40, 41, 42, 43, 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47,
+ 37, 37, 38, 38, 39, 39, 39, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43,
+ 43, 43, 43, 44, 46, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40,
+ 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45,
+ 46, 47, 47, 47, 47, 47, 47, 48, 48, 48, 40, 40, 40, 41, 41, 41, 41, 41,
+ 41, 41, 42, 42, 42, 42, 42, 43, 44, 44, 44, 44, 44, 45, 46, 47, 47, 47,
+ 47, 47, 48, 48, 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42,
+ 42, 42, 43, 43, 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48,
+ 48, 48, 41, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 43, 43,
+ 44, 45, 45, 45, 45, 45, 46, 47, 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 4x8 */
+ 31, 31, 35, 37, 31, 31, 36, 38, 31, 32, 37, 39, 31, 32, 37, 40, 34, 36,
+ 40, 43, 35, 37, 42, 44, 38, 40, 45, 47, 41, 42, 45, 47,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 34, 35, 38, 41, 31, 31, 32, 32, 36, 37, 40, 42, 35, 36,
+ 37, 37, 40, 42, 45, 45, 37, 38, 39, 40, 43, 44, 47, 47,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 38, 38, 39, 31, 31,
+ 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32,
+ 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40,
+ 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 33, 34, 35, 35, 37, 42, 42, 43,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 35, 37,
+ 38, 38, 41, 45, 45, 46, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40,
+ 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 42, 42, 43, 43, 44, 47,
+ 47, 48,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 30, 30, 31, 33, 33, 33, 35, 37, 37, 39, 42, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 35, 35, 37, 39, 39, 40, 42, 31, 31, 31, 31,
+ 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 31, 31, 31, 31, 32, 32,
+ 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 33, 33, 34, 34, 34, 35, 35, 35,
+ 37, 38, 38, 41, 43, 43, 43, 44, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43,
+ 43, 45, 47, 47, 47, 47, 37, 38, 38, 38, 39, 40, 40, 40, 42, 43, 43, 45,
+ 47, 47, 47, 47, 38, 39, 40, 40, 40, 41, 41, 41, 43, 44, 44, 46, 47, 47,
+ 47, 48,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 38, 42, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 35, 37, 37, 37, 37, 39, 42, 31, 31, 31, 31,
+ 31, 31, 31, 32, 33, 35, 38, 38, 38, 38, 39, 42, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32,
+ 34, 36, 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36,
+ 38, 38, 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38,
+ 38, 38, 40, 42, 31, 31, 31, 31, 31, 31, 31, 32, 34, 36, 38, 38, 38, 38,
+ 40, 42, 31, 31, 31, 31, 32, 32, 32, 32, 34, 36, 39, 39, 39, 39, 40, 42,
+ 30, 31, 31, 32, 32, 32, 32, 32, 34, 37, 39, 39, 39, 39, 40, 42, 30, 31,
+ 31, 32, 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32,
+ 32, 32, 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32,
+ 32, 33, 35, 37, 40, 40, 40, 40, 41, 42, 30, 31, 31, 32, 32, 32, 32, 33,
+ 35, 37, 40, 40, 40, 40, 41, 42, 31, 31, 32, 32, 33, 33, 33, 33, 35, 38,
+ 40, 40, 40, 40, 41, 43, 32, 32, 33, 33, 34, 34, 34, 34, 36, 39, 41, 41,
+ 41, 41, 42, 44, 33, 33, 34, 35, 35, 35, 35, 35, 37, 40, 42, 42, 42, 42,
+ 43, 44, 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45,
+ 33, 34, 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34,
+ 35, 35, 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 33, 34, 35, 35,
+ 36, 36, 36, 36, 38, 40, 43, 43, 43, 43, 44, 45, 34, 35, 36, 37, 37, 37,
+ 37, 37, 39, 42, 44, 44, 44, 44, 45, 45, 35, 36, 37, 38, 38, 38, 38, 39,
+ 41, 43, 45, 45, 45, 45, 46, 46, 36, 37, 38, 39, 39, 39, 39, 40, 42, 44,
+ 47, 47, 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47,
+ 47, 47, 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47,
+ 47, 47, 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47,
+ 37, 38, 39, 40, 40, 40, 40, 41, 43, 45, 47, 47, 47, 47, 47, 47, 39, 39,
+ 40, 41, 41, 41, 41, 42, 43, 45, 47, 47, 47, 47, 47, 48, 40, 41, 41, 42,
+ 42, 42, 42, 42, 44, 45, 47, 47, 47, 47, 47, 48, 42, 42, 42, 43, 43, 43,
+ 43, 43, 44, 46, 47, 47, 47, 47, 48, 48, 42, 42, 42, 43, 43, 43, 43, 43,
+ 44, 46, 47, 47, 47, 47, 48, 48,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 34, 35,
+ 36, 37, 38, 38, 38, 38, 39, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36, 37, 38, 39, 39,
+ 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 35, 35, 35, 35, 35, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34,
+ 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36,
+ 36, 37, 38, 39, 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39,
+ 40, 40, 40, 40, 41, 42, 43, 43, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 34, 35, 36, 36, 36, 36, 37, 39, 40, 41, 41, 41, 41,
+ 42, 42, 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 36, 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44,
+ 35, 35, 35, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 38, 39, 40, 40,
+ 40, 40, 40, 42, 43, 44, 45, 45, 45, 45, 45, 45, 46, 46, 37, 37, 38, 38,
+ 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44,
+ 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38,
+ 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47,
+ 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40,
+ 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47,
+ 47, 47, 37, 37, 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41,
+ 42, 43, 43, 43, 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 38, 39,
+ 39, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 42, 43, 44, 44, 44,
+ 44, 45, 46, 47, 47, 47, 47, 47, 47, 47, 48, 48, 42, 42, 42, 42, 42, 42,
+ 42, 42, 42, 42, 42, 42, 42, 42, 43, 44, 44, 45, 45, 45, 45, 45, 46, 47,
+ 47, 47, 47, 47, 48, 48, 48, 48,
+ /* Size 4x16 */
+ 31, 31, 35, 37, 31, 31, 35, 38, 31, 31, 36, 38, 31, 31, 36, 38, 31, 32,
+ 36, 39, 31, 32, 37, 40, 31, 32, 37, 40, 31, 33, 38, 40, 33, 35, 40, 42,
+ 34, 36, 40, 43, 34, 36, 40, 43, 36, 38, 43, 45, 38, 40, 45, 47, 38, 40,
+ 45, 47, 39, 41, 45, 47, 42, 43, 46, 47,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 34, 36, 38, 38, 39, 42, 31, 31,
+ 31, 31, 32, 32, 32, 33, 35, 36, 36, 38, 40, 40, 41, 43, 35, 35, 36, 36,
+ 36, 37, 37, 38, 40, 40, 40, 43, 45, 45, 45, 46, 37, 38, 38, 38, 39, 40,
+ 40, 40, 42, 43, 43, 45, 47, 47, 47, 47,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 33, 37, 37, 38, 31, 31, 31, 31, 33, 37, 37, 39, 31, 31,
+ 31, 31, 33, 38, 38, 39, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31,
+ 34, 38, 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 31, 31, 34, 38,
+ 38, 40, 31, 31, 31, 31, 34, 38, 38, 40, 31, 31, 32, 32, 34, 39, 39, 40,
+ 30, 31, 32, 32, 34, 39, 39, 40, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31,
+ 32, 32, 35, 40, 40, 41, 30, 31, 32, 32, 35, 40, 40, 41, 30, 31, 32, 32,
+ 35, 40, 40, 41, 31, 32, 33, 33, 35, 40, 40, 41, 32, 33, 34, 34, 36, 41,
+ 41, 42, 33, 34, 35, 35, 37, 42, 42, 43, 33, 35, 36, 36, 38, 43, 43, 44,
+ 33, 35, 36, 36, 38, 43, 43, 44, 33, 35, 36, 36, 38, 43, 43, 44, 33, 35,
+ 36, 36, 38, 43, 43, 44, 34, 36, 37, 37, 39, 44, 44, 45, 35, 37, 38, 38,
+ 41, 45, 45, 46, 36, 38, 39, 39, 42, 47, 47, 47, 37, 39, 40, 40, 43, 47,
+ 47, 47, 37, 39, 40, 40, 43, 47, 47, 47, 37, 39, 40, 40, 43, 47, 47, 47,
+ 37, 39, 40, 40, 43, 47, 47, 47, 39, 40, 41, 41, 43, 47, 47, 47, 40, 41,
+ 42, 42, 44, 47, 47, 47, 42, 42, 43, 43, 44, 47, 47, 48, 42, 42, 43, 43,
+ 44, 47, 47, 48,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 32, 33, 33,
+ 33, 33, 33, 34, 35, 36, 37, 37, 37, 37, 39, 40, 42, 42, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 35, 35, 35, 36,
+ 37, 38, 39, 39, 39, 39, 40, 41, 42, 42, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40,
+ 40, 40, 41, 42, 43, 43, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 33, 34, 35, 36, 36, 36, 36, 37, 38, 39, 40, 40, 40, 40, 41, 42,
+ 43, 43, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 36,
+ 37, 38, 38, 38, 38, 39, 41, 42, 43, 43, 43, 43, 43, 44, 44, 44, 37, 37,
+ 38, 38, 38, 38, 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43,
+ 43, 44, 45, 47, 47, 47, 47, 47, 47, 47, 47, 47, 37, 37, 38, 38, 38, 38,
+ 38, 38, 39, 39, 40, 40, 40, 40, 40, 41, 42, 43, 43, 43, 43, 44, 45, 47,
+ 47, 47, 47, 47, 47, 47, 47, 47, 38, 39, 39, 40, 40, 40, 40, 40, 40, 40,
+ 41, 41, 41, 41, 41, 42, 43, 44, 44, 44, 44, 45, 46, 47, 47, 47, 47, 47,
+ 47, 47, 48, 48 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 4x8 */
+ 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32,
+ 33, 34,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 34, 34,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 33, 33, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33,
+ 34, 34, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 34, 34, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 33, 34, 34,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 34, 34, 34, 34, 34,
+ /* Size 4x16 */
+ 31, 31, 31, 32, 31, 31, 31, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32,
+ 32, 33, 32, 32, 32, 33, 32, 32, 32, 33,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 33, 33, 33,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32,
+ 31, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 32, 31, 32,
+ 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32, 32, 33, 31, 32, 32, 32,
+ 32, 32, 33, 33, 31, 32, 32, 32, 32, 32, 33, 33, 31, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34,
+ 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32,
+ 32, 32, 32, 32, 33, 34, 32, 32, 32, 32, 32, 32, 33, 34, 32, 32, 32, 32,
+ 32, 32, 33, 34,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 32, 35, 34, 35, 35, 39,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 30, 31, 33, 33, 31, 31, 31, 31, 31, 32, 34, 34, 31, 31,
+ 31, 31, 31, 32, 34, 34, 31, 31, 31, 31, 31, 32, 35, 35, 30, 31, 31, 31,
+ 32, 32, 35, 35, 31, 32, 32, 32, 32, 33, 36, 36, 33, 34, 34, 35, 35, 36,
+ 39, 39, 33, 34, 34, 35, 35, 36, 39, 39,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 32, 33, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 35, 35, 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35,
+ 30, 30, 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 30, 30,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 34, 36, 37, 37, 37, 33, 33, 33, 34, 34, 34,
+ 34, 34, 34, 34, 34, 36, 37, 38, 38, 38, 33, 34, 34, 34, 34, 34, 35, 35,
+ 35, 35, 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35,
+ 35, 37, 38, 39, 39, 39, 33, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 37,
+ 38, 39, 39, 39,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 31,
+ 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 32, 32, 33, 34,
+ 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34,
+ 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 34, 34, 34, 34, 34, 35, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33,
+ 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34,
+ 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 33, 34, 35,
+ 35, 35, 35, 35, 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34,
+ 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35,
+ 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 30, 30, 30, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 36, 36, 36, 36, 36,
+ 36, 37, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 32, 32,
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 35, 36, 37, 37, 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 33, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39,
+ 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35,
+ 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40,
+ 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35,
+ 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36,
+ 37, 37, 38, 39, 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39,
+ 39, 39, 39, 39, 39, 40, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 35,
+ 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 39, 39, 39, 39, 39,
+ 39, 40, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40,
+ /* Size 4x8 */
+ 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32,
+ 32, 36, 31, 33, 33, 37, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 34, 34, 31, 31, 31, 32, 32, 33, 36, 36, 31, 31,
+ 31, 32, 32, 33, 36, 36, 34, 35, 35, 36, 36, 37, 40, 40,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31,
+ 31, 31, 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31,
+ 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32,
+ 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 31, 32,
+ 33, 33, 33, 33, 36, 39, 33, 34, 34, 35, 35, 35, 37, 40, 33, 34, 35, 36,
+ 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36,
+ 38, 41,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 31, 33, 33, 33, 33, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 34, 34, 34, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 33, 34, 35, 35, 35, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 33, 35, 36, 36, 36, 33, 33, 34, 34, 34, 34, 34, 34, 35, 35, 35, 36,
+ 37, 38, 38, 38, 35, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 40, 41,
+ 41, 41,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 35, 37, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 36, 37, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 33, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 34, 35, 36, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33,
+ 34, 35, 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35,
+ 37, 38, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 33, 34, 35, 37, 38, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 33, 34, 36, 37, 39, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 33, 34, 36, 37, 39, 30, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 33, 34, 36, 38, 39, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33,
+ 35, 36, 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36,
+ 38, 40, 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40,
+ 30, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 30, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 33, 35, 36, 38, 40, 31, 31, 31, 32, 32, 33,
+ 33, 33, 33, 33, 33, 34, 35, 37, 38, 40, 31, 32, 32, 33, 33, 33, 33, 33,
+ 33, 33, 33, 35, 36, 37, 39, 41, 32, 32, 33, 33, 34, 34, 34, 34, 34, 34,
+ 34, 35, 37, 38, 40, 41, 33, 33, 34, 34, 34, 35, 35, 35, 35, 35, 35, 36,
+ 37, 39, 40, 42, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40,
+ 41, 43, 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43,
+ 33, 34, 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34,
+ 34, 35, 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35,
+ 35, 36, 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 33, 34, 34, 35, 35, 36,
+ 36, 36, 36, 36, 36, 37, 38, 40, 41, 43, 34, 34, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 38, 39, 40, 42, 44,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 33, 34, 34, 34, 34, 34, 34, 34, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34,
+ 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 33, 34, 35, 35, 35, 35, 35,
+ 35, 35, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 33, 34, 34, 35, 35, 35, 35, 35, 35, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34,
+ 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36,
+ 36, 36, 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33,
+ 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 32, 32, 32, 32, 32, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 35, 35, 36, 37,
+ 37, 37, 37, 37, 37, 38, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34,
+ 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37, 37, 38, 38, 38, 38, 38,
+ 38, 39, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 35, 36, 36, 36, 36,
+ 36, 36, 36, 36, 36, 37, 37, 38, 39, 40, 40, 40, 40, 40, 40, 40, 35, 35,
+ 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 37, 38, 38, 38, 38, 38, 38,
+ 38, 38, 39, 40, 40, 41, 41, 41, 41, 41, 41, 42, 37, 37, 37, 38, 38, 38,
+ 38, 38, 38, 38, 38, 38, 39, 39, 39, 40, 40, 40, 40, 40, 40, 40, 41, 41,
+ 42, 43, 43, 43, 43, 43, 43, 44,
+ /* Size 4x16 */
+ 31, 31, 31, 34, 31, 31, 31, 34, 31, 31, 31, 35, 31, 31, 31, 35, 31, 31,
+ 31, 35, 31, 31, 31, 35, 31, 32, 32, 36, 31, 32, 32, 36, 31, 32, 32, 36,
+ 31, 32, 32, 36, 31, 32, 32, 36, 32, 33, 33, 37, 33, 35, 35, 39, 34, 36,
+ 36, 40, 34, 36, 36, 40, 34, 36, 36, 40,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 33, 34, 34, 34, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 33, 35, 36, 36, 36, 34, 34, 35, 35, 35, 35,
+ 36, 36, 36, 36, 36, 37, 39, 40, 40, 40,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 33, 35, 31, 31, 31, 31, 31, 31, 33, 35, 31, 31,
+ 31, 31, 31, 31, 33, 36, 31, 31, 31, 31, 31, 31, 33, 36, 31, 31, 31, 31,
+ 31, 31, 34, 36, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31,
+ 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37,
+ 31, 31, 31, 31, 31, 31, 34, 37, 31, 31, 31, 31, 31, 31, 34, 37, 31, 31,
+ 31, 31, 31, 31, 34, 37, 31, 31, 31, 32, 32, 32, 34, 37, 31, 31, 31, 32,
+ 32, 32, 34, 37, 30, 31, 31, 32, 32, 32, 34, 38, 30, 31, 32, 32, 32, 32,
+ 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38,
+ 30, 31, 32, 32, 32, 32, 35, 38, 30, 31, 32, 32, 32, 32, 35, 38, 30, 31,
+ 32, 32, 32, 32, 35, 38, 31, 31, 32, 33, 33, 33, 35, 38, 31, 32, 33, 33,
+ 33, 33, 36, 39, 32, 33, 34, 34, 34, 34, 37, 40, 33, 34, 34, 35, 35, 35,
+ 37, 40, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41,
+ 33, 34, 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 33, 34,
+ 35, 36, 36, 36, 38, 41, 33, 34, 35, 36, 36, 36, 38, 41, 34, 35, 36, 36,
+ 36, 36, 39, 42,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 32, 33, 33, 33, 33, 33, 33, 33, 34, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 33, 34, 34, 34, 34, 34, 34, 34, 35, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 33, 34, 34, 35,
+ 35, 35, 35, 35, 35, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36,
+ 36, 36, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 33, 33, 34, 35, 36, 36, 36, 36, 36, 36, 36, 33, 33, 33, 33, 34, 34,
+ 34, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 36, 37,
+ 37, 38, 38, 38, 38, 38, 38, 39, 35, 35, 36, 36, 36, 37, 37, 37, 37, 37,
+ 37, 37, 37, 37, 38, 38, 38, 38, 38, 38, 38, 38, 39, 40, 40, 41, 41, 41,
+ 41, 41, 41, 42 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31, 32,
+ 32, 32, 31, 32, 32, 32, 31, 32, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 31, 31, 31, 32, 32, 32, 32, 32, 31, 31, 31, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 8x8 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 16x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32,
+ /* Size 32x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 4x8 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 8x4 */
+ 31, 31, 31, 31, 31, 31, 31, 30, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 32, 32, 31, 31, 31, 31, 31, 31, 32, 32,
+ /* Size 8x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31,
+ 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 30, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 31, 31, 31, 31, 31, 31,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 31, 31,
+ 32, 32, 31, 31, 32, 32, 30, 31, 32, 32,
+ /* Size 16x4 */
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32,
+ 32, 32, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31, 31, 31, 31, 32, 32, 32,
+ 30, 31, 31, 31, 31, 32, 32, 32, 30, 31, 31, 31, 31, 32, 32, 32, 30, 31,
+ 31, 31, 32, 32, 32, 32, 30, 31, 31, 31, 32, 32, 32, 32, 30, 31, 31, 31,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+};
+
+static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS][2][QM_TOTAL_SIZE] = {
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 24, 14, 11, 24, 15, 11, 9, 14, 11, 7, 7, 11, 9, 7, 5,
+ /* Size 8x8 */
+ 32, 32, 27, 20, 15, 12, 11, 9, 32, 29, 26, 21, 16, 13, 12, 10, 27, 26,
+ 19, 16, 13, 11, 10, 10, 20, 21, 16, 12, 11, 9, 9, 8, 15, 16, 13, 11, 9,
+ 8, 7, 7, 12, 13, 11, 9, 8, 7, 6, 6, 11, 12, 10, 9, 7, 6, 6, 5, 9, 10,
+ 10, 8, 7, 6, 5, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 30, 28, 23, 21, 17, 16, 13, 12, 11, 11, 10, 9, 9, 33, 32,
+ 32, 31, 30, 25, 23, 19, 17, 14, 14, 12, 11, 11, 10, 9, 33, 32, 31, 29,
+ 28, 24, 23, 19, 17, 14, 14, 13, 12, 11, 10, 10, 30, 31, 29, 26, 24, 22,
+ 20, 18, 16, 14, 13, 13, 12, 11, 11, 10, 28, 30, 28, 24, 21, 19, 18, 16,
+ 15, 13, 13, 12, 11, 11, 10, 10, 23, 25, 24, 22, 19, 16, 15, 14, 13, 11,
+ 11, 11, 10, 10, 9, 9, 21, 23, 23, 20, 18, 15, 14, 13, 12, 11, 10, 10, 9,
+ 9, 9, 9, 17, 19, 19, 18, 16, 14, 13, 11, 10, 9, 9, 9, 9, 8, 8, 8, 16,
+ 17, 17, 16, 15, 13, 12, 10, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13,
+ 11, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 7,
+ 7, 7, 7, 7, 6, 6, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 7, 6, 6, 6, 6, 6,
+ 11, 11, 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 10, 11, 11, 11,
+ 11, 10, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 8, 7,
+ 6, 6, 5, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 5, 5, 4,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 30, 29, 28, 26, 23, 22, 21, 19, 17, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 32, 30, 30, 29, 27, 24, 23, 22, 20, 18, 17, 17, 15, 13, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32, 32, 31, 30, 30, 28,
+ 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12, 12, 11, 11, 11, 10,
+ 10, 9, 9, 9, 33, 32, 32, 32, 31, 31, 30, 29, 29, 27, 25, 24, 23, 21, 19,
+ 18, 17, 16, 14, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 33,
+ 32, 32, 31, 31, 30, 29, 28, 28, 26, 24, 23, 23, 20, 19, 18, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 21, 19, 19, 18, 16, 15, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24,
+ 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 9, 29, 30, 30, 29, 28, 28, 25, 24, 23, 22, 20, 20, 19,
+ 18, 17, 16, 16, 15, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 26, 27, 28,
+ 27, 26, 26, 23, 22, 20, 19, 18, 17, 17, 16, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 23, 24, 25, 25, 24, 24, 22,
+ 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 22, 23, 24, 24, 23, 23, 21, 20, 19, 17, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 21, 22, 23, 23, 23, 22, 20, 19, 18, 17, 15, 15, 14, 13, 13, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 19, 20, 21, 21, 20,
+ 21, 19, 18, 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13,
+ 13, 12, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 17,
+ 17, 18, 18, 18, 19, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 16, 17, 17, 17, 17, 18, 16, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 14, 15, 16, 16, 16, 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15,
+ 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 13, 13, 12, 11, 11, 11, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 13, 14, 14,
+ 14, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 6, 6, 12, 12, 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12,
+ 12, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ 5, 5, 5, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10,
+ 10, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5, 5, 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 4, 9, 9, 9, 10, 10, 10, 10,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
+ 4, 4, 8, 9, 9, 9, 9, 9, 9, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 5, 5, 5, 5, 5, 5, 4, 4, 4,
+ /* Size 4x8 */
+ 32, 24, 14, 11, 31, 24, 15, 12, 28, 18, 12, 11, 21, 14, 10, 9, 16, 12,
+ 8, 8, 13, 11, 7, 7, 11, 10, 7, 6, 10, 9, 7, 5,
+ /* Size 8x4 */
+ 32, 31, 28, 21, 16, 13, 11, 10, 24, 24, 18, 14, 12, 11, 10, 9, 14, 15,
+ 12, 10, 8, 7, 7, 7, 11, 12, 11, 9, 8, 7, 6, 5,
+ /* Size 8x16 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 31, 30, 21, 17, 13, 12, 11, 32, 30,
+ 28, 20, 17, 13, 12, 12, 30, 28, 24, 19, 16, 13, 13, 12, 28, 27, 21, 17,
+ 15, 12, 12, 11, 23, 24, 19, 14, 13, 11, 11, 11, 21, 22, 18, 13, 12, 10,
+ 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 16, 18, 15, 11, 10, 8, 8, 8, 13,
+ 15, 13, 10, 9, 7, 8, 8, 12, 14, 13, 10, 8, 7, 7, 7, 11, 13, 12, 10, 8,
+ 7, 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 6, 9, 10,
+ 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x8 */
+ 32, 33, 32, 30, 28, 23, 21, 18, 16, 13, 12, 11, 11, 10, 9, 9, 32, 31,
+ 30, 28, 27, 24, 22, 19, 18, 15, 14, 13, 12, 11, 10, 10, 28, 30, 28, 24,
+ 21, 19, 18, 16, 15, 13, 13, 12, 11, 10, 10, 10, 19, 21, 20, 19, 17, 14,
+ 13, 12, 11, 10, 10, 10, 10, 9, 9, 9, 16, 17, 17, 16, 15, 13, 12, 10, 10,
+ 9, 8, 8, 8, 8, 7, 8, 12, 13, 13, 13, 12, 11, 10, 9, 8, 7, 7, 7, 7, 7, 6,
+ 7, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 6, 6, 6, 5, 5,
+ /* Size 16x32 */
+ 32, 33, 32, 30, 28, 23, 19, 17, 16, 13, 12, 11, 11, 11, 10, 10, 33, 32,
+ 32, 30, 29, 24, 20, 18, 17, 14, 12, 12, 12, 11, 11, 11, 33, 32, 31, 31,
+ 30, 25, 21, 19, 17, 14, 13, 12, 12, 11, 11, 11, 33, 32, 31, 30, 29, 25,
+ 21, 19, 17, 14, 13, 13, 12, 12, 11, 11, 32, 32, 30, 29, 28, 24, 20, 19,
+ 17, 14, 13, 13, 12, 12, 12, 11, 32, 31, 29, 28, 27, 24, 21, 19, 18, 15,
+ 14, 13, 12, 12, 12, 11, 30, 30, 28, 26, 24, 21, 19, 18, 16, 14, 13, 13,
+ 13, 12, 12, 11, 29, 30, 28, 25, 23, 20, 18, 17, 16, 13, 12, 12, 12, 12,
+ 12, 11, 28, 30, 27, 24, 21, 19, 17, 16, 15, 13, 12, 12, 12, 12, 11, 11,
+ 26, 28, 26, 23, 20, 18, 16, 15, 14, 12, 12, 12, 11, 11, 11, 11, 23, 25,
+ 24, 21, 19, 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 11, 22, 24, 23, 21,
+ 19, 16, 14, 13, 12, 11, 10, 10, 10, 10, 10, 10, 21, 23, 22, 20, 18, 15,
+ 13, 13, 12, 11, 10, 10, 10, 10, 10, 10, 19, 21, 20, 19, 17, 14, 12, 12,
+ 11, 10, 9, 10, 10, 9, 10, 9, 18, 19, 19, 18, 16, 14, 12, 11, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 18, 17, 16, 13, 12, 11, 10, 9, 9, 9, 9, 9, 9, 9,
+ 16, 17, 18, 16, 15, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 15,
+ 14, 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 10, 9,
+ 9, 8, 7, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 7, 7, 12, 14, 14, 13, 13, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13,
+ 13, 12, 11, 9, 9, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 10, 10, 9,
+ 8, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 6, 6, 6,
+ 6, 6, 11, 12, 12, 12, 11, 10, 10, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 12,
+ 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9,
+ 8, 7, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 5,
+ 5, 5, 9, 10, 10, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 10, 10, 10,
+ 10, 9, 9, 8, 7, 7, 6, 6, 6, 5, 5, 5, 9, 9, 10, 10, 10, 9, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5, 8, 9, 9, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 33, 32, 32, 32, 32,
+ 31, 30, 30, 30, 28, 25, 24, 23, 21, 19, 18, 17, 16, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 9, 9, 32, 32, 31, 31, 30, 29, 28, 28, 27,
+ 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 30, 30, 31, 30, 29, 28, 26, 25, 24, 23, 21, 21, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 28, 29, 30, 29, 28, 27, 24, 23, 21, 20, 19, 19, 18, 17, 16, 16, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 23, 24, 25,
+ 25, 24, 24, 21, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 19, 18,
+ 17, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9,
+ 9, 9, 9, 9, 9, 17, 18, 19, 19, 19, 19, 18, 17, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 9, 8, 8, 8, 8, 16, 17, 17,
+ 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8,
+ 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 13, 14, 14, 14, 14, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 12, 12, 13, 13, 13, 14, 13, 12, 12, 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 11, 12, 12, 13, 13, 13, 13, 12, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12, 12, 12, 12,
+ 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 5, 5, 10, 11, 11, 11, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9,
+ 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 10, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5,
+ 5, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 23, 13, 11, 32, 25, 14, 11, 32, 24, 14, 12, 30, 21, 14, 12, 30, 19,
+ 13, 12, 25, 16, 11, 11, 23, 15, 11, 10, 19, 14, 9, 9, 17, 13, 9, 8, 14,
+ 11, 8, 8, 14, 11, 8, 7, 12, 10, 7, 6, 12, 10, 7, 6, 11, 10, 7, 6, 10, 9,
+ 7, 5, 9, 9, 7, 5,
+ /* Size 16x4 */
+ 33, 32, 32, 30, 30, 25, 23, 19, 17, 14, 14, 12, 12, 11, 10, 9, 23, 25,
+ 24, 21, 19, 16, 15, 14, 13, 11, 11, 10, 10, 10, 9, 9, 13, 14, 14, 14,
+ 13, 11, 11, 9, 9, 8, 8, 7, 7, 7, 7, 7, 11, 11, 12, 12, 12, 11, 10, 9, 8,
+ 8, 7, 6, 6, 6, 5, 5,
+ /* Size 8x32 */
+ 32, 32, 28, 19, 16, 12, 11, 10, 33, 32, 29, 20, 17, 12, 12, 11, 33, 31,
+ 30, 21, 17, 13, 12, 11, 33, 31, 29, 21, 17, 13, 12, 11, 32, 30, 28, 20,
+ 17, 13, 12, 12, 32, 29, 27, 21, 18, 14, 12, 12, 30, 28, 24, 19, 16, 13,
+ 13, 12, 29, 28, 23, 18, 16, 12, 12, 12, 28, 27, 21, 17, 15, 12, 12, 11,
+ 26, 26, 20, 16, 14, 12, 11, 11, 23, 24, 19, 14, 13, 11, 11, 11, 22, 23,
+ 19, 14, 12, 10, 10, 10, 21, 22, 18, 13, 12, 10, 10, 10, 19, 20, 17, 12,
+ 11, 9, 10, 10, 18, 19, 16, 12, 10, 9, 9, 9, 17, 18, 16, 12, 10, 9, 9, 9,
+ 16, 18, 15, 11, 10, 8, 8, 8, 14, 16, 14, 11, 9, 8, 8, 8, 13, 15, 13, 10,
+ 9, 7, 8, 8, 13, 14, 13, 10, 9, 7, 7, 7, 12, 14, 13, 10, 8, 7, 7, 7, 12,
+ 13, 12, 9, 8, 7, 7, 7, 11, 13, 12, 10, 8, 7, 6, 6, 11, 12, 11, 10, 8, 7,
+ 6, 6, 11, 12, 11, 10, 8, 7, 6, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11, 10,
+ 9, 8, 7, 6, 6, 10, 11, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9, 7, 6, 6, 5, 9,
+ 10, 10, 9, 7, 6, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 8, 9, 10, 9, 8, 7, 6,
+ 5,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 30, 29, 28, 26, 23, 22, 21, 19, 18, 17, 16, 14,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 32, 32, 31, 31, 30,
+ 29, 28, 28, 27, 26, 24, 23, 22, 20, 19, 18, 18, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 28, 29, 30, 29, 28, 27, 24, 23, 21,
+ 20, 19, 19, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 19, 20, 21, 21, 20, 21, 19, 18, 17, 16, 14, 14, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 16,
+ 17, 17, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 8, 8, 12, 12, 13, 13, 13, 14, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7,
+ 11, 12, 12, 12, 12, 12, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5,
+ 5, 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 29, 22, 18, 16, 22, 17, 15, 14, 18, 15, 11, 11, 16, 14, 11, 9,
+ /* Size 8x8 */
+ 33, 27, 22, 20, 18, 16, 15, 14, 27, 22, 22, 22, 20, 18, 17, 15, 22, 22,
+ 19, 18, 17, 16, 15, 15, 20, 22, 18, 16, 14, 13, 14, 14, 18, 20, 17, 14,
+ 12, 12, 12, 12, 16, 18, 16, 13, 12, 11, 11, 11, 15, 17, 15, 14, 12, 11,
+ 10, 10, 14, 15, 15, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 25, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 34, 32,
+ 29, 24, 22, 23, 22, 21, 20, 18, 18, 17, 16, 15, 15, 14, 31, 29, 26, 23,
+ 22, 23, 22, 21, 20, 18, 18, 17, 17, 16, 16, 15, 25, 24, 23, 21, 20, 21,
+ 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 21, 22, 22, 20, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 23, 21, 19, 18, 17, 17, 16, 15,
+ 15, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17, 17, 16, 15, 14, 14, 14,
+ 14, 14, 14, 14, 19, 21, 21, 20, 19, 17, 16, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 18, 18, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 18,
+ 18, 18, 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 10, 10, 15, 16, 17, 17, 16, 15,
+ 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13,
+ 12, 12, 11, 10, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 9, 9, 13, 14, 15, 15, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 9, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 32, 31, 28, 25, 23, 21, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 33, 31,
+ 30, 27, 24, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 34, 33, 32, 31, 29, 26, 24, 23,
+ 22, 23, 23, 23, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 15, 14, 14, 14, 32, 31, 31, 29, 28, 25, 24, 23, 22, 22, 23, 22,
+ 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 22, 22, 22, 23, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 28, 27,
+ 26, 25, 24, 22, 22, 22, 21, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 25, 24, 24, 24, 23, 22,
+ 21, 21, 20, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 16, 16, 16, 15, 15, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16,
+ 16, 16, 16, 16, 21, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+ 21, 22, 23, 22, 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 21, 22, 23, 23,
+ 23, 23, 21, 20, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 22, 23, 22, 22, 22, 21, 20,
+ 19, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 14, 14, 14, 14, 14, 14, 20, 21, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 20, 22, 22, 22, 22, 20, 20, 19, 18, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 13, 14, 14, 13, 14, 14, 13, 14, 14, 13, 13, 19, 20,
+ 21, 21, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 20, 20, 20, 21,
+ 20, 19, 18, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 12, 13,
+ 13, 13, 13, 13, 13, 13, 13, 12, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 19, 19, 19, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 18, 18, 18, 19, 18, 17,
+ 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 10, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 15, 16,
+ 16, 17, 17, 17, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 16, 16, 15, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 10, 14,
+ 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 14, 14, 14, 15, 15, 15, 16, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 9, 13, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 13, 14, 14,
+ 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 22, 17, 16, 26, 23, 19, 17, 22, 18, 16, 16, 21, 17, 14, 14, 19, 16,
+ 12, 12, 17, 15, 11, 11, 16, 15, 11, 10, 15, 14, 12, 10,
+ /* Size 8x4 */
+ 33, 26, 22, 21, 19, 17, 16, 15, 22, 23, 18, 17, 16, 15, 15, 14, 17, 19,
+ 16, 14, 12, 11, 11, 12, 16, 17, 16, 14, 12, 11, 10, 10,
+ /* Size 8x16 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 34, 26, 22, 21, 20, 17, 16, 16, 31, 24,
+ 22, 22, 20, 17, 17, 16, 24, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 17, 16, 15, 16, 16, 20, 22, 19, 16, 15, 14,
+ 14, 15, 19, 21, 19, 15, 14, 13, 13, 14, 18, 20, 18, 15, 13, 12, 13, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 18, 17, 14, 12, 11, 11, 12, 15, 17,
+ 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14,
+ 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 13, 15, 15, 14, 12, 11,
+ 10, 9,
+ /* Size 16x8 */
+ 32, 34, 31, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 14, 14, 13, 28, 26,
+ 24, 22, 21, 22, 22, 21, 20, 19, 18, 17, 17, 16, 15, 15, 21, 22, 22, 20,
+ 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 16, 15, 20, 21, 22, 20, 19, 17,
+ 16, 15, 15, 14, 14, 14, 14, 14, 14, 14, 18, 20, 20, 19, 18, 16, 15, 14,
+ 13, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 17, 17, 15, 14, 13, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 14, 13, 13, 12, 11, 10,
+ 10, 10, 10, 10, 14, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9,
+ /* Size 16x32 */
+ 32, 33, 28, 24, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 14, 14, 33, 33,
+ 27, 24, 22, 22, 20, 20, 19, 17, 16, 16, 16, 16, 15, 15, 34, 32, 26, 24,
+ 22, 23, 21, 20, 20, 18, 17, 17, 16, 16, 16, 15, 32, 30, 25, 23, 22, 23,
+ 21, 21, 20, 18, 17, 17, 17, 16, 16, 16, 31, 28, 24, 23, 22, 22, 22, 21,
+ 20, 18, 17, 17, 17, 17, 16, 16, 28, 26, 22, 22, 22, 23, 22, 21, 20, 19,
+ 18, 18, 17, 17, 17, 16, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 17, 18,
+ 17, 17, 17, 16, 23, 23, 22, 21, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 21, 20, 19, 19, 19, 19, 18, 17, 17, 16, 17, 16, 17, 17,
+ 21, 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 16, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 15, 16, 16, 16, 16, 21, 22, 22, 21,
+ 19, 17, 17, 16, 16, 15, 14, 15, 15, 15, 15, 15, 20, 22, 22, 20, 19, 17,
+ 16, 16, 15, 14, 14, 14, 14, 15, 15, 15, 20, 21, 22, 20, 19, 17, 16, 15,
+ 14, 14, 13, 14, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 15, 14, 14, 13,
+ 13, 13, 13, 14, 14, 14, 19, 20, 21, 20, 18, 16, 15, 14, 14, 13, 12, 13,
+ 13, 13, 13, 13, 18, 20, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 13, 13,
+ 13, 13, 17, 19, 20, 19, 18, 16, 14, 14, 13, 12, 12, 12, 12, 12, 13, 13,
+ 16, 18, 19, 18, 17, 15, 14, 13, 12, 12, 11, 12, 12, 12, 12, 13, 16, 18,
+ 19, 18, 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 16, 17, 18, 18,
+ 17, 15, 14, 13, 12, 11, 11, 11, 11, 11, 12, 12, 15, 17, 18, 17, 16, 15,
+ 13, 13, 12, 11, 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 14, 14, 13,
+ 12, 11, 11, 11, 10, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 10, 11, 11, 15, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11,
+ 10, 10, 10, 11, 14, 16, 16, 17, 15, 15, 14, 13, 12, 11, 11, 10, 10, 10,
+ 10, 10, 14, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10,
+ 14, 16, 16, 16, 16, 15, 14, 13, 12, 12, 11, 10, 10, 10, 10, 10, 14, 15,
+ 15, 16, 16, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16,
+ 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 16, 15, 14,
+ 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 13, 15, 15, 15, 15, 14, 14, 13,
+ 13, 11, 11, 10, 10, 9, 9, 9,
+ /* Size 32x16 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 33, 33, 32, 30,
+ 28, 26, 24, 23, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 28, 27, 26, 25, 24, 22, 22, 22,
+ 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 15, 15, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21,
+ 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 15, 21, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 16, 16, 16, 16, 15, 15, 21, 22,
+ 23, 23, 22, 23, 21, 20, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 15, 14, 15, 15, 15, 15, 15, 15, 14, 14, 14, 20, 20, 21, 21, 22, 22,
+ 20, 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 19, 20, 20, 21, 21, 21, 20, 19, 19, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+ 16, 17, 18, 18, 18, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 12, 12, 11, 12, 12, 12, 12, 12, 11, 16, 16, 17, 17,
+ 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 11, 10,
+ 10, 10, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 10, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 9, 9, 9,
+ /* Size 4x16 */
+ 33, 21, 16, 15, 32, 23, 18, 16, 28, 22, 18, 17, 24, 21, 18, 17, 22, 19,
+ 17, 16, 23, 18, 15, 16, 22, 17, 14, 15, 20, 17, 13, 14, 20, 16, 12, 13,
+ 18, 15, 12, 12, 17, 15, 11, 11, 17, 14, 11, 11, 16, 15, 12, 10, 16, 15,
+ 12, 10, 15, 15, 12, 10, 15, 14, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 24, 22, 23, 22, 20, 20, 18, 17, 17, 16, 16, 15, 15, 21, 23,
+ 22, 21, 19, 18, 17, 17, 16, 15, 15, 14, 15, 15, 15, 14, 16, 18, 18, 18,
+ 17, 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 12, 15, 16, 17, 17, 16, 16,
+ 15, 14, 13, 12, 11, 11, 10, 10, 10, 10,
+ /* Size 8x32 */
+ 32, 28, 21, 20, 18, 16, 15, 14, 33, 27, 22, 20, 19, 16, 16, 15, 34, 26,
+ 22, 21, 20, 17, 16, 16, 32, 25, 22, 21, 20, 17, 17, 16, 31, 24, 22, 22,
+ 20, 17, 17, 16, 28, 22, 22, 22, 20, 18, 17, 17, 24, 22, 20, 20, 19, 17,
+ 17, 17, 23, 22, 20, 20, 19, 17, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 18, 17, 16, 16, 16, 21, 22, 19, 17, 16, 15, 16, 16, 21, 22,
+ 19, 17, 16, 14, 15, 15, 20, 22, 19, 16, 15, 14, 14, 15, 20, 22, 19, 16,
+ 14, 13, 14, 14, 19, 21, 19, 15, 14, 13, 13, 14, 19, 21, 18, 15, 14, 12,
+ 13, 13, 18, 20, 18, 15, 13, 12, 13, 13, 17, 20, 18, 14, 13, 12, 12, 13,
+ 16, 19, 17, 14, 12, 11, 12, 12, 16, 19, 17, 14, 12, 11, 12, 12, 16, 18,
+ 17, 14, 12, 11, 11, 12, 15, 18, 16, 13, 12, 11, 11, 11, 15, 17, 16, 14,
+ 12, 11, 10, 11, 15, 17, 16, 14, 12, 11, 10, 11, 15, 17, 16, 14, 12, 11,
+ 10, 10, 14, 16, 15, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 12, 11, 10, 10, 14, 15,
+ 16, 14, 12, 11, 10, 9, 13, 15, 15, 14, 12, 11, 10, 9, 13, 15, 15, 14,
+ 13, 11, 10, 9,
+ /* Size 32x8 */
+ 32, 33, 34, 32, 31, 28, 24, 23, 21, 21, 21, 21, 20, 20, 19, 19, 18, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 28, 27, 26, 25,
+ 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15,
+ 16, 16, 16, 16, 15, 15, 20, 20, 21, 21, 22, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 16, 16,
+ 17, 17, 17, 18, 17, 17, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 14, 15, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10,
+ 10, 9, 9, 9 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 25, 15, 11, 25, 16, 12, 10, 15, 12, 8, 7, 11, 10, 7, 6,
+ /* Size 8x8 */
+ 32, 32, 28, 22, 17, 13, 11, 10, 32, 29, 26, 22, 18, 14, 12, 11, 28, 26,
+ 20, 17, 14, 12, 11, 10, 22, 22, 17, 14, 12, 10, 10, 9, 17, 18, 14, 12,
+ 10, 8, 8, 8, 13, 14, 12, 10, 8, 7, 7, 7, 11, 12, 11, 10, 8, 7, 6, 6, 10,
+ 11, 10, 9, 8, 7, 6, 5,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 33, 32,
+ 32, 31, 30, 28, 23, 20, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32, 31, 30,
+ 28, 26, 23, 20, 18, 16, 14, 13, 12, 12, 11, 10, 32, 31, 30, 28, 26, 24,
+ 22, 20, 18, 16, 14, 13, 13, 12, 11, 10, 28, 30, 28, 26, 21, 20, 18, 17,
+ 16, 14, 13, 12, 12, 11, 11, 10, 26, 28, 26, 24, 20, 19, 17, 16, 15, 13,
+ 12, 12, 11, 11, 10, 10, 22, 23, 23, 22, 18, 17, 15, 14, 13, 12, 11, 10,
+ 10, 10, 9, 9, 19, 20, 20, 20, 17, 16, 14, 12, 12, 11, 10, 9, 9, 9, 9, 8,
+ 17, 18, 18, 18, 16, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16,
+ 16, 14, 13, 12, 11, 10, 9, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 10, 9, 9, 8, 7, 7,
+ 7, 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10,
+ 11, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6, 5, 10, 10, 11, 11, 11,
+ 10, 9, 9, 8, 8, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 10, 9, 8, 8, 7, 7,
+ 6, 6, 5, 5, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 33, 32, 32, 32, 32,
+ 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 18, 17, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 32, 31, 31, 30,
+ 28, 28, 25, 23, 22, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 10, 9, 33, 32, 32, 32, 32, 31, 31, 30, 29, 28, 27, 25, 23,
+ 23, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 10, 33, 32, 32, 32, 31, 30, 30, 29, 28, 27, 26, 24, 23, 22, 20, 19, 18,
+ 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 32, 32, 32,
+ 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 21, 19, 19, 18, 16, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28,
+ 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23,
+ 22, 20, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 10, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 27,
+ 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 26, 27, 28, 27, 26,
+ 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 10, 10, 10, 10, 10, 23, 24, 25, 25, 24, 24, 23, 22, 19,
+ 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9,
+ 21, 22, 22, 23, 22, 22, 21, 20, 18, 17, 17, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 20, 21, 20,
+ 21, 20, 19, 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 8, 8, 8, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15,
+ 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 17, 18, 18, 18, 18, 19, 18, 17, 16, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 14, 15, 16, 16, 16, 16, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 13, 14, 15, 15,
+ 15, 16, 15, 14, 13, 13, 13, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14, 14, 14, 15, 14, 14, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13,
+ 13, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12, 12, 13, 13, 13, 14, 13, 13, 12, 12,
+ 12, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 7, 6, 6, 6, 6, 6,
+ 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 10, 11, 11, 12, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ 5, 5, 5, 10, 11, 11, 11, 11, 11, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8,
+ 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 10, 10, 10, 11, 11, 11, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ 5, 5, 5, 9, 10, 10, 10, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 9, 10, 10, 10, 10, 10, 10,
+ 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5,
+ 5, 5, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 24, 15, 12, 31, 24, 16, 12, 28, 18, 13, 12, 22, 15, 11, 10, 17, 13,
+ 9, 8, 14, 11, 8, 7, 12, 11, 8, 6, 10, 10, 8, 6,
+ /* Size 8x4 */
+ 32, 31, 28, 22, 17, 14, 12, 10, 24, 24, 18, 15, 13, 11, 11, 10, 15, 16,
+ 13, 11, 9, 8, 8, 8, 12, 12, 12, 10, 8, 7, 6, 6,
+ /* Size 8x16 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 32, 30,
+ 28, 23, 17, 14, 13, 12, 32, 29, 26, 22, 17, 14, 13, 12, 28, 28, 21, 18,
+ 15, 13, 12, 12, 26, 26, 20, 17, 14, 12, 11, 11, 22, 23, 18, 15, 12, 11,
+ 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 17, 18, 16, 13, 10, 9, 9, 9, 14,
+ 16, 14, 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8,
+ 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 10, 12, 11, 9, 8, 7, 6, 6, 10, 11,
+ 11, 9, 8, 7, 6, 6, 9, 10, 10, 9, 8, 7, 6, 5,
+ /* Size 16x8 */
+ 32, 33, 32, 32, 28, 26, 22, 19, 17, 14, 13, 12, 11, 10, 10, 9, 32, 32,
+ 30, 29, 28, 26, 23, 20, 18, 16, 15, 13, 12, 12, 11, 10, 28, 29, 28, 26,
+ 21, 20, 18, 17, 16, 14, 13, 12, 12, 11, 11, 10, 22, 23, 23, 22, 18, 17,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 17, 15, 14, 12, 11, 10,
+ 9, 9, 8, 8, 8, 8, 8, 13, 14, 14, 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7,
+ 7, 7, 11, 12, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 6, 11, 11, 12,
+ 12, 12, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 16x32 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 16, 14, 13, 12, 11, 11, 11, 10, 33, 32,
+ 32, 31, 29, 24, 23, 20, 17, 15, 14, 12, 12, 12, 11, 11, 33, 32, 32, 31,
+ 29, 25, 23, 21, 17, 15, 14, 13, 12, 12, 11, 11, 33, 32, 31, 31, 29, 25,
+ 23, 21, 17, 16, 14, 13, 12, 12, 12, 11, 32, 32, 30, 30, 28, 24, 23, 20,
+ 17, 16, 14, 13, 13, 12, 12, 11, 32, 31, 29, 28, 27, 24, 23, 21, 18, 16,
+ 15, 13, 13, 12, 12, 12, 32, 31, 29, 28, 26, 23, 22, 20, 17, 16, 14, 13,
+ 13, 13, 12, 12, 30, 30, 28, 27, 24, 21, 20, 19, 16, 15, 14, 13, 12, 13,
+ 12, 12, 28, 30, 28, 26, 21, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12,
+ 27, 28, 26, 25, 21, 18, 18, 16, 14, 13, 13, 12, 12, 12, 11, 11, 26, 28,
+ 26, 24, 20, 18, 17, 16, 14, 13, 12, 11, 11, 11, 11, 11, 23, 25, 24, 23,
+ 19, 16, 16, 14, 13, 12, 11, 11, 11, 11, 11, 10, 22, 23, 23, 22, 18, 16,
+ 15, 14, 12, 11, 11, 10, 10, 10, 10, 10, 21, 22, 22, 21, 18, 15, 14, 13,
+ 12, 11, 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 14, 14, 12, 11, 10,
+ 10, 9, 9, 10, 9, 10, 18, 19, 19, 19, 16, 14, 13, 12, 10, 10, 9, 9, 9, 9,
+ 9, 9, 17, 18, 18, 18, 16, 13, 13, 12, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17,
+ 17, 17, 15, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 12,
+ 12, 11, 9, 9, 8, 8, 8, 8, 8, 8, 13, 15, 15, 15, 13, 12, 11, 10, 9, 8, 8,
+ 8, 8, 8, 8, 8, 13, 14, 15, 14, 13, 11, 11, 10, 9, 8, 8, 7, 7, 7, 7, 8,
+ 12, 14, 14, 14, 13, 11, 11, 10, 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13,
+ 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 10, 9, 8,
+ 8, 7, 7, 7, 7, 7, 6, 11, 12, 12, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6,
+ 6, 11, 12, 12, 12, 11, 11, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6, 10, 12, 12,
+ 12, 11, 11, 9, 9, 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 9, 9,
+ 8, 8, 7, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+ 6, 6, 10, 10, 11, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5, 5, 9, 10, 10,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 6, 5, 5, 9, 10, 10, 10, 10, 9, 9, 8, 8,
+ 7, 7, 6, 6, 5, 5, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 33, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 32, 32, 32, 31, 30, 29, 29, 28,
+ 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 10, 10, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 28, 29, 29, 29, 28, 27, 26, 24, 21, 21, 20, 19, 18, 18, 17, 16,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 23, 24,
+ 25, 25, 24, 24, 23, 21, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9, 9, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16,
+ 14, 14, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8,
+ 8, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 7, 7, 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 11, 11,
+ 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13,
+ 13, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7,
+ 7, 7, 7, 6, 6, 7, 7, 6, 6, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11,
+ 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 5, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5,
+ 5, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 5,
+ /* Size 4x16 */
+ 33, 23, 14, 11, 32, 25, 15, 12, 32, 24, 16, 12, 31, 23, 16, 13, 30, 19,
+ 14, 12, 28, 18, 13, 11, 23, 16, 11, 10, 21, 14, 10, 10, 18, 13, 10, 9,
+ 16, 12, 9, 8, 14, 11, 8, 7, 13, 11, 8, 7, 12, 11, 8, 6, 12, 11, 8, 6,
+ 11, 10, 8, 6, 10, 9, 7, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 28, 23, 21, 18, 16, 14, 13, 12, 12, 11, 10, 23, 25,
+ 24, 23, 19, 18, 16, 14, 13, 12, 11, 11, 11, 11, 10, 9, 14, 15, 16, 16,
+ 14, 13, 11, 10, 10, 9, 8, 8, 8, 8, 8, 7, 11, 12, 12, 13, 12, 11, 10, 10,
+ 9, 8, 7, 7, 6, 6, 6, 6,
+ /* Size 8x32 */
+ 32, 32, 28, 22, 16, 13, 11, 11, 33, 32, 29, 23, 17, 14, 12, 11, 33, 32,
+ 29, 23, 17, 14, 12, 11, 33, 31, 29, 23, 17, 14, 12, 12, 32, 30, 28, 23,
+ 17, 14, 13, 12, 32, 29, 27, 23, 18, 15, 13, 12, 32, 29, 26, 22, 17, 14,
+ 13, 12, 30, 28, 24, 20, 16, 14, 12, 12, 28, 28, 21, 18, 15, 13, 12, 12,
+ 27, 26, 21, 18, 14, 13, 12, 11, 26, 26, 20, 17, 14, 12, 11, 11, 23, 24,
+ 19, 16, 13, 11, 11, 11, 22, 23, 18, 15, 12, 11, 10, 10, 21, 22, 18, 14,
+ 12, 11, 10, 10, 19, 20, 17, 14, 11, 10, 9, 9, 18, 19, 16, 13, 10, 9, 9,
+ 9, 17, 18, 16, 13, 10, 9, 9, 9, 16, 17, 15, 12, 10, 9, 8, 8, 14, 16, 14,
+ 12, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7,
+ 12, 14, 13, 11, 8, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 7, 12, 13, 12, 10,
+ 8, 7, 7, 7, 11, 12, 12, 10, 8, 7, 7, 6, 11, 12, 11, 10, 9, 7, 6, 6, 10,
+ 12, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7, 6, 6, 10, 11, 11, 9, 8, 7,
+ 6, 6, 10, 11, 11, 9, 8, 7, 6, 5, 9, 10, 10, 9, 8, 7, 6, 5, 9, 10, 10, 9,
+ 8, 7, 6, 5,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 17, 16,
+ 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 11, 10, 10, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 18, 17, 16, 15, 14, 14, 13, 12, 12, 11, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15,
+ 14, 14, 13, 13, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 11, 12, 12, 12, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10,
+ 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6,
+ 6, 6, 6, 6, 5, 5, 5 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 31, 23, 18, 16, 23, 18, 16, 15, 18, 16, 12, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 27, 22, 21, 19, 17, 16, 15, 27, 22, 22, 22, 20, 19, 17, 16, 22, 22,
+ 19, 19, 18, 16, 16, 16, 21, 22, 19, 17, 15, 14, 14, 14, 19, 20, 18, 15,
+ 13, 12, 12, 12, 17, 19, 16, 14, 12, 11, 11, 11, 16, 17, 16, 14, 12, 11,
+ 10, 10, 15, 16, 16, 14, 12, 11, 10, 9,
+ /* Size 16x16 */
+ 32, 34, 31, 27, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 34, 33,
+ 29, 25, 22, 22, 22, 21, 20, 19, 18, 17, 16, 16, 15, 15, 31, 29, 26, 23,
+ 22, 22, 22, 22, 20, 19, 18, 18, 17, 17, 16, 15, 27, 25, 23, 22, 21, 21,
+ 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 21, 22, 22, 21, 19, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 21, 22, 22, 21, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 16, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 21, 19, 18, 16, 16, 15, 14, 14, 13, 14, 13,
+ 13, 13, 19, 20, 20, 20, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13,
+ 17, 19, 19, 19, 18, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18,
+ 18, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 12, 11, 16, 17, 18, 18,
+ 17, 16, 14, 13, 13, 12, 11, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 15,
+ 14, 14, 13, 12, 11, 11, 10, 10, 10, 10, 15, 16, 17, 17, 16, 16, 14, 13,
+ 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 16, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 14, 15, 15, 16, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10, 9, 9,
+ /* Size 32x32 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 33, 32,
+ 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 17, 17, 17,
+ 16, 16, 16, 16, 15, 15, 15, 15, 15, 14, 34, 33, 33, 32, 29, 26, 25, 24,
+ 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 33, 32, 32, 31, 28, 26, 25, 24, 22, 22, 23, 23,
+ 22, 22, 22, 21, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 31, 30, 29, 28, 26, 24, 23, 23, 22, 22, 22, 23, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 27, 26, 25, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20,
+ 21, 21, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 17, 17, 16, 16, 16, 16, 16, 16,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22, 22, 23,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 16, 16, 15, 15, 15, 15, 15, 21, 22, 23, 23, 23, 23, 22, 21,
+ 19, 19, 19, 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20,
+ 21, 22, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14,
+ 14, 14, 13, 13, 14, 13, 13, 14, 13, 13, 13, 14, 19, 20, 20, 21, 21, 21,
+ 21, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 19, 19, 20, 20, 20, 21, 20, 20, 18, 18,
+ 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 17, 18, 19, 19, 19, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 17, 18, 18,
+ 19, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18,
+ 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 12, 11, 11, 12, 16, 17, 17, 18, 18, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 16, 16, 17, 17, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 17, 17, 17, 18, 18, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 10, 10, 11, 11, 11, 11, 10, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 17, 17, 17, 17, 17, 16,
+ 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 14, 15, 15, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 14, 15, 15, 16,
+ 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 10, 10, 9, 9, 14, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 9, 9, 14, 15, 15, 15, 15, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 14, 14, 14, 15, 15, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ /* Size 4x8 */
+ 33, 22, 18, 16, 26, 23, 20, 17, 22, 19, 17, 16, 22, 17, 15, 14, 20, 16,
+ 13, 13, 17, 15, 12, 11, 16, 16, 12, 10, 16, 15, 12, 10,
+ /* Size 8x4 */
+ 33, 26, 22, 22, 20, 17, 16, 16, 22, 23, 19, 17, 16, 15, 16, 15, 18, 20,
+ 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 11, 10, 10,
+ /* Size 8x16 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 34, 27, 22, 22, 20, 18, 16, 16, 31, 25,
+ 22, 22, 20, 18, 17, 16, 26, 22, 21, 22, 20, 19, 18, 17, 21, 21, 19, 19,
+ 18, 17, 17, 17, 21, 22, 19, 18, 17, 16, 16, 16, 20, 22, 19, 17, 16, 15,
+ 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 18, 16, 14, 13, 13, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 16, 19, 17, 15, 12, 12, 11, 12, 16, 18,
+ 17, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 15, 16, 14, 13, 12,
+ 10, 10,
+ /* Size 16x8 */
+ 32, 34, 31, 26, 21, 21, 20, 20, 19, 17, 16, 16, 15, 15, 14, 14, 29, 27,
+ 25, 22, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 16, 15, 21, 22, 22, 21,
+ 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 19, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 18, 20, 20, 20, 18, 17, 16, 14,
+ 14, 13, 12, 12, 13, 13, 12, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 11, 11, 12, 11, 12, 15, 16, 17, 18, 17, 16, 14, 14, 13, 12, 11, 11,
+ 11, 10, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 18, 17, 16, 15, 15, 15, 15, 14, 33, 33,
+ 28, 26, 22, 22, 21, 20, 19, 18, 17, 16, 16, 16, 16, 15, 34, 32, 27, 26,
+ 22, 23, 22, 21, 20, 19, 18, 17, 16, 16, 16, 15, 33, 31, 27, 25, 22, 23,
+ 22, 21, 20, 19, 18, 17, 17, 17, 16, 16, 31, 28, 25, 23, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 28, 26, 23, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 17, 17, 17, 17, 26, 25, 22, 22, 21, 22, 22, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 24, 24, 22, 21, 20, 21, 20, 20, 19, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 21, 21, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 21, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22,
+ 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 16, 15, 20, 22, 22, 21, 19, 17,
+ 17, 16, 16, 15, 15, 14, 14, 15, 15, 15, 20, 22, 22, 21, 19, 17, 17, 16,
+ 15, 15, 14, 14, 14, 14, 15, 14, 20, 21, 22, 21, 19, 17, 16, 16, 14, 14,
+ 14, 13, 14, 14, 14, 14, 19, 20, 21, 20, 19, 17, 16, 15, 14, 13, 13, 13,
+ 13, 13, 14, 14, 19, 20, 21, 20, 18, 16, 16, 15, 14, 13, 13, 13, 13, 13,
+ 13, 14, 18, 20, 20, 20, 18, 16, 16, 15, 13, 13, 12, 12, 12, 13, 13, 13,
+ 17, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 13, 17, 18,
+ 19, 19, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 16, 18, 19, 18,
+ 17, 15, 15, 14, 12, 12, 12, 11, 11, 12, 12, 12, 16, 17, 18, 18, 17, 15,
+ 14, 14, 12, 12, 11, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 15, 14, 13,
+ 12, 12, 11, 11, 11, 11, 11, 12, 15, 17, 17, 18, 16, 15, 14, 13, 12, 12,
+ 11, 11, 11, 11, 11, 11, 15, 17, 17, 17, 16, 15, 14, 13, 13, 12, 11, 11,
+ 11, 10, 11, 11, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 11, 11, 10, 10,
+ 10, 10, 15, 16, 17, 17, 16, 16, 14, 13, 13, 12, 12, 11, 10, 10, 10, 10,
+ 14, 16, 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16,
+ 16, 17, 16, 15, 14, 14, 12, 12, 11, 11, 10, 10, 10, 10, 14, 16, 16, 16,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 15,
+ 14, 13, 13, 12, 12, 11, 10, 10, 10, 10, 14, 15, 15, 16, 16, 14, 14, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9,
+ /* Size 32x16 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 33, 33, 32, 31,
+ 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 29, 28, 27, 27, 25, 23, 22, 22,
+ 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 15, 15, 27, 26, 26, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16,
+ 16, 16, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 23, 23, 22, 23, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 16, 16, 15, 15, 15, 15, 14, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 14,
+ 14, 13, 13, 13, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13,
+ 17, 18, 19, 19, 19, 20, 19, 18, 17, 17, 17, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 17, 17, 17, 18, 18, 17,
+ 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 10, 15, 16,
+ 16, 16, 16, 17, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 17,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 4x16 */
+ 33, 21, 17, 15, 32, 23, 19, 16, 28, 22, 19, 17, 25, 22, 19, 17, 22, 19,
+ 17, 17, 22, 18, 17, 16, 22, 17, 15, 15, 21, 17, 14, 14, 20, 16, 13, 13,
+ 19, 16, 12, 12, 18, 15, 12, 12, 17, 15, 12, 11, 17, 15, 12, 10, 16, 16,
+ 12, 10, 16, 15, 12, 10, 15, 15, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 25, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 15, 21, 23,
+ 22, 22, 19, 18, 17, 17, 16, 16, 15, 15, 15, 16, 15, 15, 17, 19, 19, 19,
+ 17, 17, 15, 14, 13, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 16,
+ 15, 14, 13, 12, 12, 11, 10, 10, 10, 10,
+ /* Size 8x32 */
+ 32, 29, 21, 20, 18, 16, 15, 15, 33, 28, 22, 21, 19, 17, 16, 16, 34, 27,
+ 22, 22, 20, 18, 16, 16, 33, 27, 22, 22, 20, 18, 17, 16, 31, 25, 22, 22,
+ 20, 18, 17, 16, 28, 23, 22, 22, 20, 19, 17, 17, 26, 22, 21, 22, 20, 19,
+ 18, 17, 24, 22, 20, 20, 19, 18, 17, 17, 21, 21, 19, 19, 18, 17, 17, 17,
+ 21, 22, 19, 19, 18, 17, 16, 16, 21, 22, 19, 18, 17, 16, 16, 16, 21, 23,
+ 19, 17, 16, 15, 15, 16, 20, 22, 19, 17, 16, 15, 14, 15, 20, 22, 19, 17,
+ 15, 14, 14, 15, 20, 22, 19, 16, 14, 14, 14, 14, 19, 21, 19, 16, 14, 13,
+ 13, 14, 19, 21, 18, 16, 14, 13, 13, 13, 18, 20, 18, 16, 13, 12, 12, 13,
+ 17, 19, 18, 15, 13, 12, 12, 12, 17, 19, 17, 15, 13, 12, 12, 12, 16, 19,
+ 17, 15, 12, 12, 11, 12, 16, 18, 17, 14, 12, 11, 11, 12, 16, 18, 17, 14,
+ 12, 11, 11, 11, 15, 17, 16, 14, 12, 11, 11, 11, 15, 17, 16, 14, 13, 11,
+ 11, 11, 15, 17, 16, 14, 13, 11, 10, 10, 15, 17, 16, 14, 13, 12, 10, 10,
+ 14, 16, 16, 14, 12, 11, 10, 10, 14, 16, 16, 14, 12, 11, 10, 10, 14, 16,
+ 16, 14, 13, 11, 10, 10, 14, 15, 16, 14, 13, 12, 10, 10, 14, 15, 16, 14,
+ 13, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 15, 15, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 18, 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 12, 12, 12, 12, 13, 13, 13, 12, 12, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17,
+ 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 11, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 27, 16, 12, 27, 18, 13, 11, 16, 13, 9, 8, 12, 11, 8, 6,
+ /* Size 8x8 */
+ 32, 32, 29, 22, 18, 13, 12, 11, 32, 30, 28, 23, 19, 15, 13, 11, 29, 28,
+ 21, 18, 16, 13, 12, 11, 22, 23, 18, 15, 13, 11, 10, 10, 18, 19, 16, 13,
+ 11, 9, 8, 8, 13, 15, 13, 11, 9, 8, 7, 7, 12, 13, 12, 10, 8, 7, 7, 6, 11,
+ 11, 11, 10, 8, 7, 6, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 33, 32,
+ 32, 32, 30, 27, 25, 22, 19, 17, 16, 14, 13, 12, 11, 10, 33, 32, 31, 30,
+ 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 32, 32, 30, 29, 28, 26,
+ 24, 22, 20, 18, 16, 14, 14, 13, 12, 11, 30, 30, 28, 28, 24, 22, 20, 19,
+ 17, 16, 15, 13, 12, 12, 12, 11, 26, 27, 26, 26, 22, 19, 18, 17, 15, 14,
+ 13, 12, 11, 11, 11, 10, 23, 25, 24, 24, 20, 18, 16, 15, 14, 13, 12, 11,
+ 11, 10, 10, 10, 21, 22, 22, 22, 19, 17, 15, 14, 13, 12, 11, 10, 10, 10,
+ 9, 9, 18, 19, 19, 20, 17, 15, 14, 13, 11, 11, 10, 9, 9, 9, 9, 8, 16, 17,
+ 17, 18, 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 14, 16, 16, 16, 15,
+ 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 7, 13, 14, 14, 14, 13, 12, 11, 10, 9,
+ 9, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 12, 11, 11, 10, 9, 8, 8, 7, 7, 7,
+ 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 10, 11,
+ 12, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 6, 6, 6, 10, 10, 11, 11, 11, 10,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 5,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 30, 28, 26, 25, 23, 21, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 32, 30, 30, 29, 27, 26, 24, 22, 22, 20, 19, 18, 17, 16, 15, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 27, 26, 25, 23, 22, 20, 19, 19, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 11, 11, 10, 10, 33, 32, 32, 32, 32, 32, 32, 31, 30, 30, 28, 27,
+ 25, 23, 23, 21, 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 33, 32, 32, 32, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20,
+ 19, 19, 17, 16, 16, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32,
+ 32, 32, 31, 30, 30, 28, 28, 28, 26, 26, 24, 23, 22, 21, 19, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 30, 30,
+ 29, 28, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 30, 30, 31, 31, 29, 28, 28, 26, 25, 24,
+ 23, 22, 22, 20, 20, 19, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 11, 11, 30, 30, 30, 30, 28, 28, 28, 25, 24, 23, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11,
+ 28, 29, 30, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 26, 27, 27, 28,
+ 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 12, 11, 11, 11, 11, 10, 10, 25, 26, 26, 27, 26, 26, 25, 22,
+ 21, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24, 24, 22, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 21, 22, 23, 23, 23, 23, 23, 20, 19, 18, 17, 17, 16, 15, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 21, 22,
+ 22, 23, 22, 22, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 19, 20, 20, 21, 20, 21, 21,
+ 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 9, 9, 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14,
+ 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 9, 17, 18,
+ 19, 19, 19, 19, 19, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 9, 8, 8, 8, 8, 8, 16, 17, 17, 17, 17, 18, 18, 16, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 15, 16, 16, 16, 16, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 14, 15, 16, 16, 16,
+ 16, 16, 15, 15, 14, 13, 13, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 7, 8, 7, 7, 7, 13, 13, 14, 14, 14, 15, 15, 14, 13, 13, 12, 12, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 13, 13, 14,
+ 14, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 13, 13, 13, 12,
+ 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 12,
+ 12, 13, 13, 13, 13, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12,
+ 12, 12, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 11, 12, 12, 12, 12, 12, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9,
+ 8, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 6, 10, 11, 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11,
+ 11, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 6, 5, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5, 5, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6,
+ 6, 6, 6, 6, 5, 5, 5,
+ /* Size 4x8 */
+ 32, 27, 17, 12, 32, 26, 18, 13, 30, 20, 15, 12, 23, 17, 12, 10, 19, 15,
+ 10, 9, 14, 12, 9, 8, 12, 12, 8, 7, 11, 10, 8, 6,
+ /* Size 8x4 */
+ 32, 32, 30, 23, 19, 14, 12, 11, 27, 26, 20, 17, 15, 12, 12, 10, 17, 18,
+ 15, 12, 10, 9, 8, 8, 12, 13, 12, 10, 9, 8, 7, 6,
+ /* Size 8x16 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 25, 19, 14, 13, 12, 32, 31,
+ 28, 24, 19, 14, 13, 12, 32, 30, 27, 24, 20, 15, 13, 12, 30, 28, 23, 20,
+ 17, 14, 13, 12, 26, 26, 20, 18, 15, 12, 12, 11, 23, 24, 19, 16, 14, 11,
+ 11, 11, 21, 22, 18, 15, 13, 11, 10, 10, 18, 19, 16, 14, 11, 9, 9, 9, 16,
+ 17, 15, 13, 11, 9, 8, 8, 14, 16, 14, 12, 10, 8, 8, 8, 13, 14, 13, 11, 9,
+ 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 12, 12, 10, 9, 8, 7, 6, 10, 12,
+ 12, 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ /* Size 16x8 */
+ 32, 33, 32, 32, 30, 26, 23, 21, 18, 16, 14, 13, 12, 11, 10, 10, 32, 32,
+ 31, 30, 28, 26, 24, 22, 19, 17, 16, 14, 13, 12, 12, 11, 28, 29, 28, 27,
+ 23, 20, 19, 18, 16, 15, 14, 13, 12, 12, 12, 11, 23, 25, 24, 24, 20, 18,
+ 16, 15, 14, 13, 12, 11, 11, 10, 10, 10, 18, 19, 19, 20, 17, 15, 14, 13,
+ 11, 11, 10, 9, 9, 9, 8, 9, 13, 14, 14, 15, 14, 12, 11, 11, 9, 9, 8, 8,
+ 7, 8, 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 6, 6, 11,
+ 12, 12, 12, 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 16x32 */
+ 32, 33, 32, 32, 28, 26, 23, 19, 18, 16, 13, 13, 12, 11, 11, 11, 33, 32,
+ 32, 32, 29, 27, 24, 20, 19, 17, 14, 13, 12, 12, 12, 11, 33, 32, 32, 32,
+ 29, 27, 25, 20, 19, 17, 14, 14, 13, 12, 12, 11, 33, 32, 32, 31, 30, 28,
+ 25, 21, 19, 17, 14, 14, 13, 12, 12, 12, 32, 32, 31, 30, 28, 26, 24, 20,
+ 19, 17, 14, 14, 13, 13, 12, 12, 32, 32, 30, 30, 28, 26, 24, 21, 19, 18,
+ 15, 14, 13, 13, 12, 12, 32, 31, 30, 29, 27, 26, 24, 21, 20, 18, 15, 15,
+ 13, 13, 12, 12, 30, 30, 29, 28, 24, 23, 21, 19, 18, 16, 14, 14, 13, 13,
+ 13, 12, 30, 30, 28, 28, 23, 22, 20, 18, 17, 16, 14, 13, 13, 12, 12, 12,
+ 28, 30, 28, 27, 21, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 12, 26, 28,
+ 26, 26, 20, 19, 18, 16, 15, 14, 12, 12, 12, 12, 11, 12, 26, 27, 26, 25,
+ 20, 19, 17, 15, 15, 14, 12, 12, 11, 11, 11, 11, 23, 25, 24, 24, 19, 18,
+ 16, 14, 14, 13, 11, 11, 11, 11, 11, 11, 22, 23, 23, 22, 18, 17, 16, 14,
+ 13, 12, 11, 11, 10, 10, 10, 10, 21, 22, 22, 22, 18, 17, 15, 13, 13, 12,
+ 11, 10, 10, 10, 10, 10, 19, 21, 20, 20, 17, 16, 14, 12, 12, 11, 10, 10,
+ 9, 9, 10, 9, 18, 19, 19, 19, 16, 15, 14, 12, 11, 11, 9, 9, 9, 9, 9, 9,
+ 17, 19, 19, 19, 16, 15, 14, 12, 11, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 18, 15, 14, 13, 11, 11, 10, 9, 9, 8, 8, 8, 9, 15, 16, 17, 17, 14, 13,
+ 12, 11, 10, 9, 8, 8, 8, 8, 8, 8, 14, 16, 16, 16, 14, 13, 12, 11, 10, 9,
+ 8, 8, 8, 8, 8, 8, 13, 14, 14, 15, 13, 12, 11, 10, 9, 9, 8, 8, 7, 8, 8,
+ 7, 13, 14, 14, 14, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 7, 12, 14, 14,
+ 14, 13, 12, 11, 10, 9, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 13, 12, 11, 11,
+ 9, 9, 8, 7, 7, 7, 7, 7, 7, 11, 12, 13, 13, 12, 12, 10, 9, 9, 8, 8, 7, 7,
+ 7, 6, 6, 11, 12, 12, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 6, 6, 6, 11, 12,
+ 12, 12, 12, 11, 10, 10, 9, 8, 7, 7, 7, 6, 6, 6, 10, 12, 12, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6, 6, 10, 11, 11, 12, 11, 10, 10, 9, 9, 8, 7,
+ 7, 6, 6, 6, 6, 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 7, 7, 6, 6, 6, 6,
+ 10, 11, 11, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6, 5,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 33, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 27, 25, 23, 22, 21, 19, 19, 17, 16, 16, 14,
+ 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 32, 32, 32, 32, 31, 30, 30, 29,
+ 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 26, 25,
+ 24, 22, 22, 20, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 28, 29, 29, 30, 28, 28, 27, 24, 23, 21, 20, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 26, 27,
+ 27, 28, 26, 26, 26, 23, 22, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 13,
+ 13, 12, 12, 12, 11, 12, 11, 11, 11, 10, 10, 10, 23, 24, 25, 25, 24, 24,
+ 24, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 10, 10, 19, 20, 20, 21, 20, 21, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 10, 10, 9,
+ 9, 9, 9, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 16, 17, 17, 17, 17,
+ 18, 18, 16, 16, 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 8, 8, 7, 7, 7, 7, 8, 13, 13,
+ 14, 14, 14, 14, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8, 8,
+ 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12,
+ 12, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13,
+ 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 6, 6, 6, 6,
+ 6, 6, 6, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11, 10, 10, 9,
+ 9, 9, 9, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 5,
+ /* Size 4x16 */
+ 33, 26, 16, 11, 32, 27, 17, 12, 32, 26, 17, 13, 31, 26, 18, 13, 30, 22,
+ 16, 12, 28, 19, 14, 12, 25, 18, 13, 11, 22, 17, 12, 10, 19, 15, 11, 9,
+ 17, 14, 10, 8, 16, 13, 9, 8, 14, 12, 9, 7, 13, 11, 8, 7, 12, 11, 8, 6,
+ 12, 11, 8, 6, 11, 10, 8, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 28, 25, 22, 19, 17, 16, 14, 13, 12, 12, 11, 26, 27,
+ 26, 26, 22, 19, 18, 17, 15, 14, 13, 12, 11, 11, 11, 10, 16, 17, 17, 18,
+ 16, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 11, 12, 13, 13, 12, 12, 11,
+ 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 8x32 */
+ 32, 32, 28, 23, 18, 13, 12, 11, 33, 32, 29, 24, 19, 14, 12, 12, 33, 32,
+ 29, 25, 19, 14, 13, 12, 33, 32, 30, 25, 19, 14, 13, 12, 32, 31, 28, 24,
+ 19, 14, 13, 12, 32, 30, 28, 24, 19, 15, 13, 12, 32, 30, 27, 24, 20, 15,
+ 13, 12, 30, 29, 24, 21, 18, 14, 13, 13, 30, 28, 23, 20, 17, 14, 13, 12,
+ 28, 28, 21, 19, 16, 13, 12, 12, 26, 26, 20, 18, 15, 12, 12, 11, 26, 26,
+ 20, 17, 15, 12, 11, 11, 23, 24, 19, 16, 14, 11, 11, 11, 22, 23, 18, 16,
+ 13, 11, 10, 10, 21, 22, 18, 15, 13, 11, 10, 10, 19, 20, 17, 14, 12, 10,
+ 9, 10, 18, 19, 16, 14, 11, 9, 9, 9, 17, 19, 16, 14, 11, 9, 9, 9, 16, 17,
+ 15, 13, 11, 9, 8, 8, 15, 17, 14, 12, 10, 8, 8, 8, 14, 16, 14, 12, 10, 8,
+ 8, 8, 13, 14, 13, 11, 9, 8, 7, 8, 13, 14, 13, 11, 9, 8, 7, 7, 12, 14,
+ 13, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 7, 7, 7, 11, 13, 12, 10, 9, 8, 7,
+ 6, 11, 12, 12, 10, 9, 8, 7, 6, 11, 12, 12, 10, 9, 7, 7, 6, 10, 12, 12,
+ 10, 8, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6, 10, 11, 11, 10, 9, 7, 6, 6,
+ 10, 11, 11, 10, 9, 8, 7, 6,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 30, 28, 26, 26, 23, 22, 21, 19, 18, 17,
+ 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 32, 32, 32, 32,
+ 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 19, 17, 17, 16, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 28, 29, 29, 30, 28, 28, 27, 24,
+ 23, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24, 24, 21, 20, 19, 18, 17,
+ 16, 16, 15, 14, 14, 14, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ 10, 10, 18, 19, 19, 19, 19, 19, 20, 18, 17, 16, 15, 15, 14, 13, 13, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 9, 9, 8, 9, 9, 9, 13, 14, 14, 14, 14,
+ 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7,
+ 8, 8, 7, 7, 7, 7, 8, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 7, 11, 12, 12,
+ 12, 12, 12, 12, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 6, 6, 6, 6, 6, 6, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 19, 16, 23, 19, 17, 15, 19, 17, 13, 12, 16, 15, 12, 10,
+ /* Size 8x8 */
+ 33, 28, 22, 21, 20, 17, 16, 15, 28, 24, 22, 22, 21, 19, 17, 16, 22, 22,
+ 19, 19, 19, 17, 16, 16, 21, 22, 19, 17, 16, 15, 14, 14, 20, 21, 19, 16,
+ 14, 13, 13, 13, 17, 19, 17, 15, 13, 12, 12, 12, 16, 17, 16, 14, 13, 12,
+ 11, 10, 15, 16, 16, 14, 13, 12, 10, 10,
+ /* Size 16x16 */
+ 32, 34, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 15, 15, 15, 14, 34, 33,
+ 29, 26, 23, 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 15, 31, 29, 26, 24,
+ 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 28, 26, 24, 22, 22, 22,
+ 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 23, 23, 22, 22, 20, 20, 20, 20,
+ 19, 19, 18, 17, 17, 17, 16, 17, 21, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 16, 16, 16, 21, 22, 23, 23, 20, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 15, 15, 20, 22, 22, 22, 20, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 14, 14, 19, 20, 21, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13,
+ 18, 19, 20, 20, 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 19,
+ 19, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 12, 12, 16, 17, 18, 19,
+ 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 11, 11, 15, 16, 17, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 10, 10, 15, 16, 16, 17, 16, 16, 15, 14, 13, 12,
+ 12, 11, 11, 10, 10, 10, 14, 15, 16, 16, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 10, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 34, 34, 31, 29, 28, 25, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 14, 33, 33, 33, 33,
+ 30, 28, 27, 24, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 34, 33, 33, 33, 29, 28, 26, 24,
+ 23, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 34, 33, 33, 32, 29, 28, 26, 24, 23, 22, 23, 23,
+ 23, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 31, 30, 29, 29, 26, 25, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 29, 28,
+ 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 28, 27, 26, 26, 24, 23,
+ 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 25, 24, 24, 24, 23, 22, 22, 21, 21, 20,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 17, 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 17, 17, 17,
+ 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23,
+ 22, 22, 22, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 21, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 21, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 21,
+ 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22,
+ 22, 20, 20, 19, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 14, 13, 13, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 20, 19, 19, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 17, 18, 19, 19,
+ 19, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 12, 11, 12, 11, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 10, 10, 11, 10, 15, 16, 16, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10,
+ 15, 16, 16, 16, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 15, 15, 16, 16,
+ 16, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 17, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 10, 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 16, 16, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ /* Size 4x8 */
+ 33, 22, 19, 16, 27, 22, 20, 17, 22, 19, 18, 17, 22, 18, 16, 14, 20, 17,
+ 14, 13, 18, 16, 12, 12, 17, 16, 12, 11, 16, 15, 12, 10,
+ /* Size 8x4 */
+ 33, 27, 22, 22, 20, 18, 17, 16, 22, 22, 19, 18, 17, 16, 16, 15, 19, 20,
+ 18, 16, 14, 12, 12, 12, 16, 17, 17, 14, 13, 12, 11, 10,
+ /* Size 8x16 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 28, 22, 22, 20, 18, 17, 16, 31, 26,
+ 22, 22, 21, 18, 17, 17, 28, 23, 22, 23, 21, 19, 18, 17, 23, 22, 20, 20,
+ 19, 17, 17, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 23, 19, 18, 17, 15,
+ 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 19, 18, 16, 14, 12, 12, 12, 16, 18,
+ 17, 15, 13, 12, 11, 12, 16, 17, 16, 15, 13, 11, 11, 11, 15, 17, 16, 14,
+ 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 19, 18, 17, 16, 16, 15, 15, 14, 30, 28,
+ 26, 23, 22, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22, 22, 22,
+ 20, 19, 19, 19, 19, 18, 18, 17, 16, 16, 16, 16, 21, 22, 22, 23, 20, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 15, 15, 19, 20, 21, 21, 19, 18, 17, 16,
+ 15, 14, 14, 13, 13, 13, 13, 13, 16, 18, 18, 19, 17, 16, 15, 14, 13, 12,
+ 12, 12, 11, 12, 12, 12, 15, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 10,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 30, 28, 21, 21, 21, 20, 19, 18, 16, 16, 15, 15, 15, 15, 33, 33,
+ 29, 27, 22, 22, 22, 20, 20, 19, 17, 17, 16, 16, 16, 16, 33, 32, 28, 26,
+ 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 16, 34, 32, 28, 26, 22, 23,
+ 23, 21, 21, 20, 18, 18, 17, 17, 17, 16, 31, 28, 26, 24, 22, 22, 22, 22,
+ 21, 20, 18, 18, 17, 17, 17, 16, 29, 27, 24, 23, 22, 22, 23, 22, 21, 20,
+ 19, 18, 18, 17, 17, 17, 28, 26, 23, 22, 22, 22, 23, 22, 21, 20, 19, 19,
+ 18, 18, 17, 17, 24, 24, 23, 22, 20, 20, 21, 20, 20, 19, 18, 18, 17, 18,
+ 17, 17, 23, 23, 22, 22, 20, 20, 20, 20, 19, 19, 17, 17, 17, 17, 17, 17,
+ 21, 22, 22, 21, 19, 19, 19, 19, 19, 18, 17, 17, 16, 17, 17, 16, 21, 22,
+ 22, 22, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 16, 16, 21, 23, 22, 22,
+ 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 16, 21, 23, 23, 22, 19, 18,
+ 18, 17, 17, 16, 15, 15, 15, 15, 15, 16, 20, 22, 22, 22, 19, 18, 17, 16,
+ 16, 16, 15, 14, 15, 14, 15, 15, 20, 22, 22, 22, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 14, 14, 15, 20, 21, 22, 22, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 14, 14, 14, 19, 21, 21, 21, 19, 18, 17, 15, 15, 14, 13, 13, 13, 13,
+ 13, 14, 19, 20, 21, 21, 19, 17, 17, 15, 15, 14, 13, 13, 13, 13, 13, 13,
+ 18, 20, 20, 20, 18, 17, 16, 15, 14, 13, 12, 12, 12, 12, 13, 13, 17, 19,
+ 20, 20, 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 17, 19, 19, 20,
+ 18, 17, 16, 14, 14, 13, 12, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 12, 12, 12, 16, 18, 18, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 11, 12, 12, 16, 17, 18, 18, 17, 16, 15, 14, 13, 12,
+ 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 16, 16, 15, 13, 13, 12, 11, 11,
+ 11, 11, 11, 11, 15, 17, 17, 18, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11,
+ 11, 11, 15, 17, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11, 10, 11,
+ 15, 16, 17, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 10, 10, 10, 15, 16,
+ 16, 17, 16, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17,
+ 16, 15, 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 17, 16, 15,
+ 15, 14, 13, 12, 12, 11, 11, 10, 10, 10, 14, 16, 16, 16, 16, 15, 15, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 33, 33, 32, 32,
+ 28, 27, 26, 24, 23, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 30, 29, 28, 28, 26, 24, 23, 23,
+ 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17,
+ 17, 17, 16, 16, 16, 16, 28, 27, 26, 26, 24, 23, 22, 22, 22, 21, 22, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 17, 16, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 21, 22, 22, 23, 22, 23,
+ 23, 21, 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 15, 15, 15, 15, 20, 20, 21, 21, 22, 22, 22, 20, 20, 19,
+ 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14, 14, 14,
+ 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 18, 19, 19, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 13, 12, 12, 12, 16, 17, 18, 18,
+ 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 21, 18, 15, 32, 22, 19, 16, 28, 22, 20, 17, 26, 22, 20, 18, 23, 20,
+ 19, 17, 22, 19, 17, 16, 23, 18, 16, 15, 22, 18, 15, 14, 21, 18, 14, 13,
+ 20, 17, 13, 12, 19, 17, 13, 12, 18, 16, 12, 11, 17, 16, 12, 11, 17, 16,
+ 12, 11, 16, 16, 13, 10, 16, 15, 12, 10,
+ /* Size 16x4 */
+ 33, 32, 28, 26, 23, 22, 23, 22, 21, 20, 19, 18, 17, 17, 16, 16, 21, 22,
+ 22, 22, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 18, 19, 20, 20,
+ 19, 17, 16, 15, 14, 13, 13, 12, 12, 12, 13, 12, 15, 16, 17, 18, 17, 16,
+ 15, 14, 13, 12, 12, 11, 11, 11, 10, 10,
+ /* Size 8x32 */
+ 32, 30, 21, 21, 19, 16, 15, 15, 33, 29, 22, 22, 20, 17, 16, 16, 33, 28,
+ 22, 22, 20, 18, 17, 16, 34, 28, 22, 23, 21, 18, 17, 17, 31, 26, 22, 22,
+ 21, 18, 17, 17, 29, 24, 22, 23, 21, 19, 18, 17, 28, 23, 22, 23, 21, 19,
+ 18, 17, 24, 23, 20, 21, 20, 18, 17, 17, 23, 22, 20, 20, 19, 17, 17, 17,
+ 21, 22, 19, 19, 19, 17, 16, 17, 21, 22, 19, 18, 18, 16, 16, 16, 21, 22,
+ 19, 18, 17, 16, 16, 16, 21, 23, 19, 18, 17, 15, 15, 15, 20, 22, 19, 17,
+ 16, 15, 15, 15, 20, 22, 19, 17, 16, 14, 14, 14, 20, 22, 19, 17, 15, 14,
+ 13, 14, 19, 21, 19, 17, 15, 13, 13, 13, 19, 21, 19, 17, 15, 13, 13, 13,
+ 18, 20, 18, 16, 14, 12, 12, 13, 17, 20, 18, 16, 14, 12, 12, 12, 17, 19,
+ 18, 16, 14, 12, 12, 12, 16, 18, 17, 15, 13, 12, 11, 12, 16, 18, 17, 15,
+ 13, 12, 11, 12, 16, 18, 17, 15, 13, 11, 11, 11, 16, 17, 16, 15, 13, 11,
+ 11, 11, 15, 17, 16, 15, 13, 12, 11, 11, 15, 17, 16, 14, 13, 12, 11, 10,
+ 15, 17, 16, 14, 13, 12, 11, 10, 15, 16, 16, 15, 13, 12, 11, 10, 14, 16,
+ 16, 15, 13, 12, 11, 10, 14, 16, 16, 15, 13, 12, 11, 10, 14, 16, 16, 15,
+ 13, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 31, 29, 28, 24, 23, 21, 21, 21, 21, 20, 20, 20, 19, 19,
+ 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 30, 29, 28, 28,
+ 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 23, 23, 21, 20, 19, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 14, 15, 15,
+ 15, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 16, 17,
+ 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 12, 12, 11, 11, 12, 12, 12, 12, 12, 12, 12, 15, 16, 17, 17, 17, 18,
+ 18, 17, 17, 16, 16, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 28, 18, 13, 28, 19, 14, 11, 18, 14, 10, 8, 13, 11, 8, 7,
+ /* Size 8x8 */
+ 32, 32, 29, 24, 19, 15, 13, 11, 32, 31, 28, 24, 20, 16, 14, 12, 29, 28,
+ 22, 20, 17, 14, 13, 12, 24, 24, 20, 16, 14, 12, 11, 10, 19, 20, 17, 14,
+ 12, 10, 9, 9, 15, 16, 14, 12, 10, 9, 8, 8, 13, 14, 13, 11, 9, 8, 7, 7,
+ 11, 12, 12, 10, 9, 8, 7, 6,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 17, 16, 14, 12, 12, 11, 11, 33, 32,
+ 32, 32, 30, 29, 26, 23, 20, 19, 17, 15, 13, 13, 12, 11, 33, 32, 31, 31,
+ 29, 28, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 32, 32, 31, 29, 28, 27,
+ 25, 23, 21, 19, 18, 16, 14, 14, 13, 12, 30, 30, 29, 28, 26, 24, 22, 20,
+ 19, 18, 16, 15, 13, 13, 12, 12, 28, 29, 28, 27, 24, 21, 20, 18, 17, 16,
+ 15, 14, 13, 12, 11, 11, 25, 26, 26, 25, 22, 20, 18, 17, 15, 14, 14, 12,
+ 12, 11, 11, 11, 22, 23, 23, 23, 20, 18, 17, 15, 14, 13, 12, 11, 11, 10,
+ 10, 10, 19, 20, 21, 21, 19, 17, 15, 14, 12, 12, 11, 10, 10, 9, 9, 9, 17,
+ 19, 19, 19, 18, 16, 14, 13, 12, 11, 10, 10, 9, 9, 9, 8, 16, 17, 17, 18,
+ 16, 15, 14, 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 7, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9,
+ 8, 7, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ 11, 12, 12, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6, 6, 11, 11, 12, 12,
+ 12, 11, 11, 10, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 18,
+ 17, 16, 16, 14, 14, 13, 12, 12, 12, 11, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 19, 18, 17, 17, 15,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 27, 26, 24, 23, 23, 20, 20, 19, 17, 17, 15, 15, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 33, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 27, 25, 23, 23, 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12,
+ 12, 11, 33, 32, 32, 32, 31, 31, 31, 30, 29, 28, 28, 26, 26, 24, 23, 23,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 33, 32,
+ 32, 32, 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 18, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26,
+ 26, 24, 24, 23, 22, 22, 20, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 13, 12, 12, 12, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 12, 12, 12,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 17, 17,
+ 17, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 26, 27, 27, 28, 26, 26, 26, 24,
+ 23, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 11, 11, 11, 25, 26, 26, 27, 26, 26, 25, 24, 22, 20, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 23, 24, 24, 25, 24, 24, 24, 23, 22, 20, 19, 18, 17, 16, 16, 15,
+ 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 22, 23,
+ 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16, 15, 15, 14, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 21, 22, 23, 23, 23, 23,
+ 22, 22, 20, 19, 18, 17, 16, 15, 15, 14, 13, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 10, 10, 10, 9, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9,
+ 9, 9, 9, 18, 19, 20, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 13, 13,
+ 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 17, 18, 19, 19,
+ 19, 19, 19, 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 9, 9, 8, 8, 9, 16, 17, 17, 18, 18, 18, 18, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8,
+ 8, 8, 16, 17, 17, 17, 17, 17, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16,
+ 16, 16, 16, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 8, 8, 8, 8, 7, 14, 14, 15, 15, 15, 15, 16, 15, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 13,
+ 13, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14,
+ 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7,
+ 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13,
+ 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 6, 6, 6, 11, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12,
+ 12, 12, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7,
+ 7, 7, 7, 7, 6, 6, 6, 6, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 10, 11, 11, 11, 11, 12, 12, 12, 12, 11,
+ 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 29, 17, 12, 32, 28, 18, 13, 30, 22, 16, 12, 25, 19, 13, 11, 20, 17,
+ 11, 9, 16, 14, 9, 8, 14, 13, 9, 7, 12, 11, 9, 7,
+ /* Size 8x4 */
+ 32, 32, 30, 25, 20, 16, 14, 12, 29, 28, 22, 19, 17, 14, 13, 11, 17, 18,
+ 16, 13, 11, 9, 9, 9, 12, 13, 12, 11, 9, 8, 7, 7,
+ /* Size 8x16 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 30, 25, 20, 17, 13, 12, 33, 31,
+ 29, 24, 21, 17, 14, 13, 32, 30, 28, 24, 21, 18, 14, 13, 30, 29, 25, 21,
+ 19, 16, 13, 13, 28, 28, 22, 19, 17, 15, 13, 12, 25, 26, 21, 17, 15, 13,
+ 12, 11, 22, 23, 19, 16, 14, 12, 11, 10, 19, 20, 18, 14, 12, 11, 10, 9,
+ 18, 19, 17, 14, 12, 10, 9, 9, 16, 17, 16, 13, 11, 10, 9, 8, 14, 15, 14,
+ 12, 10, 9, 8, 8, 12, 14, 13, 11, 10, 9, 7, 7, 12, 13, 12, 11, 9, 8, 7,
+ 7, 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 30, 28, 25, 22, 19, 18, 16, 14, 12, 12, 11, 11, 33, 32,
+ 31, 30, 29, 28, 26, 23, 20, 19, 17, 15, 14, 13, 12, 12, 29, 30, 29, 28,
+ 25, 22, 21, 19, 18, 17, 16, 14, 13, 12, 12, 12, 23, 25, 24, 24, 21, 19,
+ 17, 16, 14, 14, 13, 12, 11, 11, 11, 11, 19, 20, 21, 21, 19, 17, 15, 14,
+ 12, 12, 11, 10, 10, 9, 9, 9, 16, 17, 17, 18, 16, 15, 13, 12, 11, 10, 10,
+ 9, 9, 8, 8, 8, 12, 13, 14, 14, 13, 13, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7,
+ 11, 12, 13, 13, 13, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 19, 17, 16, 13, 12, 12, 11, 11, 33, 32,
+ 32, 32, 29, 29, 24, 23, 20, 17, 17, 14, 13, 12, 12, 12, 33, 32, 32, 32,
+ 30, 29, 25, 23, 20, 18, 17, 14, 13, 12, 12, 12, 33, 32, 32, 31, 30, 30,
+ 25, 23, 21, 18, 17, 14, 14, 13, 12, 12, 33, 32, 31, 30, 29, 28, 24, 23,
+ 21, 18, 17, 14, 14, 13, 13, 12, 32, 32, 31, 30, 28, 28, 24, 23, 20, 18,
+ 17, 14, 14, 13, 13, 12, 32, 31, 30, 29, 28, 27, 24, 23, 21, 18, 18, 15,
+ 14, 13, 13, 12, 32, 31, 30, 28, 26, 26, 23, 22, 20, 18, 17, 14, 14, 13,
+ 13, 13, 30, 30, 29, 28, 25, 24, 21, 20, 19, 17, 16, 14, 13, 13, 13, 13,
+ 29, 30, 28, 27, 23, 22, 20, 19, 17, 16, 15, 13, 13, 12, 12, 12, 28, 30,
+ 28, 27, 22, 21, 19, 18, 17, 16, 15, 13, 13, 12, 12, 12, 26, 28, 26, 26,
+ 21, 20, 18, 17, 16, 14, 14, 12, 12, 12, 12, 11, 25, 26, 26, 25, 21, 20,
+ 17, 17, 15, 14, 13, 12, 12, 11, 11, 11, 23, 25, 24, 24, 20, 19, 16, 16,
+ 14, 13, 13, 11, 11, 11, 11, 11, 22, 23, 23, 23, 19, 18, 16, 15, 14, 12,
+ 12, 11, 11, 10, 10, 10, 21, 23, 23, 22, 19, 18, 15, 15, 13, 12, 12, 11,
+ 10, 10, 10, 10, 19, 21, 20, 20, 18, 17, 14, 14, 12, 11, 11, 10, 10, 10,
+ 9, 10, 19, 20, 20, 20, 17, 17, 14, 13, 12, 11, 11, 10, 9, 9, 9, 9, 18,
+ 19, 19, 19, 17, 16, 14, 13, 12, 11, 10, 9, 9, 9, 9, 9, 16, 18, 18, 18,
+ 16, 15, 13, 12, 11, 10, 10, 9, 9, 9, 9, 8, 16, 17, 17, 18, 16, 15, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 8, 14, 16, 16, 16, 14, 14, 12, 12, 11, 9, 9,
+ 8, 8, 8, 8, 8, 14, 15, 15, 16, 14, 14, 12, 11, 10, 9, 9, 8, 8, 8, 8, 8,
+ 13, 14, 14, 15, 13, 13, 11, 11, 10, 9, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14,
+ 13, 13, 11, 11, 10, 9, 9, 8, 7, 7, 7, 7, 12, 14, 14, 14, 13, 13, 11, 11,
+ 10, 9, 8, 8, 7, 7, 7, 7, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7,
+ 7, 7, 7, 12, 12, 13, 13, 12, 12, 11, 10, 9, 9, 8, 7, 7, 7, 7, 6, 11, 12,
+ 12, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 6, 11, 12, 12, 12, 12, 11,
+ 11, 10, 9, 9, 8, 8, 7, 7, 6, 6, 11, 12, 12, 12, 12, 11, 11, 10, 9, 8, 8,
+ 7, 7, 6, 6, 6, 10, 11, 11, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 6, 6, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 21, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 12, 12, 12, 12, 11, 33, 32, 32, 32, 31, 31, 30, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16, 15, 14, 14, 14,
+ 13, 13, 12, 12, 12, 11, 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 20, 20, 19, 18, 18, 16, 16, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 29, 29, 30, 30, 29, 28, 28, 26, 25, 23, 22, 21, 21, 20, 19, 19,
+ 18, 17, 17, 16, 16, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 17,
+ 17, 18, 18, 18, 18, 18, 18, 17, 16, 16, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 16, 17, 17, 17, 17, 17, 18, 17,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 8, 13, 14, 14, 14, 14, 14, 15, 14, 14, 13, 13, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 8, 8, 7, 7, 12, 13, 13, 14,
+ 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 6, 6, 11, 12,
+ 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6,
+ /* Size 4x16 */
+ 33, 28, 17, 12, 32, 29, 18, 12, 32, 28, 18, 13, 31, 27, 18, 13, 30, 24,
+ 17, 13, 30, 21, 16, 12, 26, 20, 14, 11, 23, 18, 12, 10, 21, 17, 11, 10,
+ 19, 16, 11, 9, 17, 15, 10, 8, 15, 14, 9, 8, 14, 13, 9, 7, 13, 12, 9, 7,
+ 12, 12, 9, 7, 12, 11, 8, 6,
+ /* Size 16x4 */
+ 33, 32, 32, 31, 30, 30, 26, 23, 21, 19, 17, 15, 14, 13, 12, 12, 28, 29,
+ 28, 27, 24, 21, 20, 18, 17, 16, 15, 14, 13, 12, 12, 11, 17, 18, 18, 18,
+ 17, 16, 14, 12, 11, 11, 10, 9, 9, 9, 9, 8, 12, 12, 13, 13, 13, 12, 11,
+ 10, 10, 9, 8, 8, 7, 7, 7, 6,
+ /* Size 8x32 */
+ 32, 33, 29, 23, 19, 16, 12, 11, 33, 32, 29, 24, 20, 17, 13, 12, 33, 32,
+ 30, 25, 20, 17, 13, 12, 33, 32, 30, 25, 21, 17, 14, 12, 33, 31, 29, 24,
+ 21, 17, 14, 13, 32, 31, 28, 24, 20, 17, 14, 13, 32, 30, 28, 24, 21, 18,
+ 14, 13, 32, 30, 26, 23, 20, 17, 14, 13, 30, 29, 25, 21, 19, 16, 13, 13,
+ 29, 28, 23, 20, 17, 15, 13, 12, 28, 28, 22, 19, 17, 15, 13, 12, 26, 26,
+ 21, 18, 16, 14, 12, 12, 25, 26, 21, 17, 15, 13, 12, 11, 23, 24, 20, 16,
+ 14, 13, 11, 11, 22, 23, 19, 16, 14, 12, 11, 10, 21, 23, 19, 15, 13, 12,
+ 10, 10, 19, 20, 18, 14, 12, 11, 10, 9, 19, 20, 17, 14, 12, 11, 9, 9, 18,
+ 19, 17, 14, 12, 10, 9, 9, 16, 18, 16, 13, 11, 10, 9, 9, 16, 17, 16, 13,
+ 11, 10, 9, 8, 14, 16, 14, 12, 11, 9, 8, 8, 14, 15, 14, 12, 10, 9, 8, 8,
+ 13, 14, 13, 11, 10, 9, 8, 7, 12, 14, 13, 11, 10, 9, 7, 7, 12, 14, 13,
+ 11, 10, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7, 12, 13, 12, 11, 9, 8, 7, 7,
+ 11, 12, 12, 11, 9, 8, 7, 7, 11, 12, 12, 11, 9, 8, 7, 6, 11, 12, 12, 11,
+ 9, 8, 7, 6, 10, 11, 12, 11, 9, 8, 7, 6,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 19, 19,
+ 18, 16, 16, 14, 14, 13, 12, 12, 12, 12, 11, 11, 11, 10, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 20, 20, 19, 18, 17, 16,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 12, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 19, 20, 20, 21, 21, 20, 21, 20, 19, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 16, 17, 17,
+ 17, 17, 17, 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 12, 13, 13, 14, 14, 14, 14, 14, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7,
+ 11, 12, 12, 12, 13, 13, 13, 13, 13, 12, 12, 12, 11, 11, 10, 10, 9, 9, 9,
+ 9, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 6, 6 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 23, 20, 17, 23, 19, 17, 16, 20, 17, 14, 13, 17, 16, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 22, 22, 20, 18, 17, 16, 30, 26, 22, 23, 21, 19, 18, 17, 22, 22,
+ 20, 20, 19, 18, 17, 17, 22, 23, 20, 18, 17, 16, 15, 15, 20, 21, 19, 17,
+ 15, 14, 13, 13, 18, 19, 18, 16, 14, 12, 12, 12, 17, 18, 17, 15, 13, 12,
+ 11, 11, 16, 17, 17, 15, 13, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 31, 28, 25, 21, 21, 20, 20, 19, 18, 17, 16, 15, 15, 15, 33, 33,
+ 30, 26, 24, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 16, 31, 30, 28, 24,
+ 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 28, 26, 24, 22, 22, 21,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 25, 24, 23, 22, 21, 20, 21, 20,
+ 20, 20, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 20, 19, 19, 19, 19, 19,
+ 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 20, 22, 22, 22, 20, 19, 18, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 20, 21, 22, 22, 20, 19, 17, 16, 16, 15, 15, 14, 14, 13, 14, 14,
+ 19, 20, 21, 21, 20, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 20, 20, 19, 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 17, 18, 19, 19,
+ 18, 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16, 15, 14,
+ 13, 13, 12, 12, 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16, 15, 14, 14, 13, 12, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 15, 14, 33, 33, 33, 33,
+ 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 33, 33, 33, 33, 30, 29, 26, 26,
+ 24, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 34, 33, 33, 32, 30, 29, 26, 25, 24, 23, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 16, 16, 31, 30, 30, 30, 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 31, 30,
+ 29, 29, 27, 26, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 22, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21,
+ 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18, 18, 18,
+ 18, 17, 17, 17, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 22, 22, 22, 22, 21, 20, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 15, 21, 22, 22, 23, 23, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 15, 15, 20, 21,
+ 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 15, 14, 14, 14, 15, 20, 21, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 20, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 14,
+ 14, 13, 14, 14, 19, 20, 20, 21, 21, 21, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 19, 20, 20, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 13, 12, 13, 13, 12, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17,
+ 17, 18, 18, 18, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 12, 11, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17,
+ 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 15, 16, 16, 17, 17, 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17,
+ 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 10, 10, 10, 10, 15, 16, 16, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 10, 15, 16, 16, 16, 16, 17, 17, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 10, 14, 15, 15, 16, 16, 17, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10,
+ /* Size 4x8 */
+ 33, 22, 19, 16, 28, 22, 20, 17, 22, 20, 19, 17, 23, 19, 16, 15, 21, 19,
+ 14, 13, 19, 18, 13, 12, 17, 17, 13, 11, 16, 16, 13, 11,
+ /* Size 8x4 */
+ 33, 28, 22, 23, 21, 19, 17, 16, 22, 22, 20, 19, 19, 18, 17, 16, 19, 20,
+ 19, 16, 14, 13, 13, 13, 16, 17, 17, 15, 13, 12, 11, 11,
+ /* Size 8x16 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 21, 19, 17, 16, 31, 28,
+ 22, 23, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20, 19, 17, 24, 23, 21, 21,
+ 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22, 20, 18, 17, 17,
+ 16, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 19, 17, 16, 14, 14, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 12, 17, 19,
+ 18, 16, 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15,
+ 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 16, 17, 15, 14, 12,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 31, 28, 24, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 15, 31, 30,
+ 28, 24, 23, 22, 22, 22, 22, 21, 20, 19, 18, 17, 17, 16, 23, 23, 22, 22,
+ 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 21, 22, 23, 23, 21, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 20, 21, 22, 22, 20, 19, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 14, 18, 19, 20, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 16, 17, 18, 19, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 11, 11, 15, 16, 17, 17, 17, 16, 15, 14, 14, 13, 12, 12, 11, 11,
+ 11, 10,
+ /* Size 16x32 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 15, 33, 33,
+ 30, 27, 23, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 16, 33, 32, 30, 26,
+ 23, 22, 22, 22, 21, 20, 19, 17, 17, 17, 16, 16, 34, 32, 29, 26, 23, 22,
+ 23, 22, 21, 20, 20, 18, 18, 17, 17, 17, 31, 29, 28, 24, 22, 22, 23, 22,
+ 22, 20, 20, 18, 18, 17, 17, 17, 31, 28, 27, 24, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 17, 17, 28, 26, 24, 22, 22, 22, 23, 22, 22, 21, 20, 19,
+ 19, 18, 17, 17, 26, 25, 24, 22, 21, 21, 22, 22, 21, 20, 20, 19, 18, 18,
+ 18, 17, 24, 24, 23, 22, 21, 20, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 18, 17, 17, 17, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 17, 21, 22, 22, 22,
+ 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 16, 21, 23, 22, 22, 20, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 16, 21, 23, 23, 22, 20, 19, 18, 17,
+ 17, 16, 16, 15, 15, 15, 15, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16,
+ 16, 15, 14, 15, 14, 15, 20, 22, 22, 22, 20, 19, 17, 17, 16, 16, 15, 14,
+ 14, 14, 14, 14, 20, 21, 22, 22, 19, 19, 17, 16, 16, 15, 14, 14, 14, 14,
+ 14, 14, 19, 21, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 14, 13,
+ 19, 20, 21, 21, 19, 19, 17, 16, 15, 14, 14, 13, 13, 13, 13, 13, 18, 20,
+ 20, 20, 19, 18, 16, 16, 15, 14, 13, 13, 12, 13, 13, 13, 18, 20, 20, 20,
+ 19, 18, 16, 16, 15, 14, 13, 12, 12, 12, 12, 13, 17, 19, 19, 20, 18, 18,
+ 16, 15, 14, 13, 13, 12, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 15,
+ 14, 13, 13, 12, 12, 12, 12, 12, 16, 18, 18, 19, 17, 17, 15, 15, 14, 13,
+ 12, 12, 11, 11, 12, 12, 16, 18, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11,
+ 11, 11, 11, 12, 16, 17, 18, 18, 17, 17, 15, 14, 14, 13, 12, 11, 11, 11,
+ 11, 11, 16, 17, 17, 18, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 11,
+ 15, 17, 17, 18, 17, 16, 15, 15, 13, 13, 12, 11, 11, 11, 11, 11, 15, 17,
+ 17, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10, 15, 16, 17, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 16,
+ 15, 14, 14, 13, 12, 12, 11, 11, 10, 10, 15, 16, 16, 17, 17, 15, 15, 14,
+ 14, 12, 12, 11, 11, 10, 10, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 33, 33, 32, 32,
+ 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 31, 30, 30, 29, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 28, 27, 26, 26, 24, 24, 22, 22, 22, 21, 21, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17,
+ 17, 17, 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 14, 14, 14, 15,
+ 14, 14, 14, 14, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14,
+ 18, 19, 20, 20, 20, 20, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 12, 13, 13, 13, 13, 12, 18, 19, 19, 20,
+ 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11,
+ 11, 11, 12, 12, 12, 11, 16, 17, 17, 18, 18, 18, 19, 18, 18, 17, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 15, 16, 17, 17, 17, 17, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 10, 15, 16,
+ 16, 17, 17, 17, 17, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 11, 11, 10, 10, 10, 15, 16, 16, 17, 17, 17,
+ 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10,
+ /* Size 4x16 */
+ 33, 21, 18, 15, 32, 22, 20, 17, 29, 22, 20, 17, 26, 22, 21, 18, 24, 20,
+ 19, 17, 22, 19, 18, 16, 23, 19, 17, 16, 22, 19, 16, 15, 21, 19, 15, 14,
+ 20, 19, 14, 13, 20, 18, 14, 12, 18, 17, 13, 12, 18, 17, 13, 11, 17, 16,
+ 12, 11, 17, 16, 13, 11, 16, 16, 13, 11,
+ /* Size 16x4 */
+ 33, 32, 29, 26, 24, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 16, 21, 22,
+ 22, 22, 20, 19, 19, 19, 19, 19, 18, 17, 17, 16, 16, 16, 18, 20, 20, 21,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 13, 13, 15, 17, 17, 18, 17, 16,
+ 16, 15, 14, 13, 12, 12, 11, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 31, 23, 21, 20, 18, 16, 15, 33, 30, 23, 22, 20, 19, 17, 16, 33, 30,
+ 23, 22, 21, 19, 17, 16, 34, 29, 23, 23, 21, 20, 18, 17, 31, 28, 22, 23,
+ 22, 20, 18, 17, 31, 27, 22, 22, 22, 20, 18, 17, 28, 24, 22, 23, 22, 20,
+ 19, 17, 26, 24, 21, 22, 21, 20, 18, 18, 24, 23, 21, 21, 20, 19, 18, 17,
+ 22, 22, 20, 19, 19, 18, 17, 17, 21, 22, 20, 19, 19, 18, 17, 16, 21, 22,
+ 20, 18, 18, 17, 16, 16, 21, 22, 20, 18, 17, 17, 16, 15, 21, 23, 20, 18,
+ 17, 16, 15, 15, 20, 22, 20, 17, 16, 16, 14, 14, 20, 22, 20, 17, 16, 15,
+ 14, 14, 20, 22, 19, 17, 16, 14, 14, 14, 19, 21, 19, 17, 15, 14, 13, 14,
+ 19, 21, 19, 17, 15, 14, 13, 13, 18, 20, 19, 16, 15, 13, 12, 13, 18, 20,
+ 19, 16, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 17, 19, 18, 16,
+ 14, 13, 12, 12, 16, 18, 17, 15, 14, 12, 11, 12, 16, 18, 17, 15, 14, 12,
+ 11, 11, 16, 18, 17, 15, 14, 12, 11, 11, 16, 17, 17, 15, 13, 12, 11, 11,
+ 15, 17, 17, 15, 13, 12, 11, 11, 15, 17, 17, 15, 13, 12, 11, 11, 15, 17,
+ 17, 15, 13, 12, 11, 10, 15, 16, 17, 15, 14, 12, 11, 10, 15, 16, 17, 15,
+ 14, 12, 11, 10,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 16, 16, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17, 17, 17, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 20, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 19, 18, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11,
+ 11, 10, 10, 10 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 19, 14, 30, 21, 16, 13, 19, 16, 11, 9, 14, 13, 9, 7,
+ /* Size 8x8 */
+ 32, 32, 30, 26, 20, 17, 13, 12, 32, 31, 29, 26, 21, 17, 14, 13, 30, 29,
+ 26, 22, 19, 16, 14, 13, 26, 26, 22, 18, 16, 14, 12, 11, 20, 21, 19, 16,
+ 13, 11, 10, 10, 17, 17, 16, 14, 11, 10, 9, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 27, 24, 22, 20, 18, 16, 15, 13, 13, 12, 33, 32, 32, 31,
+ 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 32, 32, 31, 30, 28, 28,
+ 26, 24, 23, 21, 19, 17, 16, 14, 14, 13, 31, 31, 30, 28, 27, 24, 23, 22,
+ 20, 19, 18, 16, 15, 14, 13, 13, 28, 29, 29, 28, 24, 21, 20, 19, 18, 17,
+ 16, 15, 14, 13, 12, 12, 26, 27, 27, 26, 23, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 12, 11, 23, 24, 25, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11,
+ 11, 11, 21, 22, 23, 23, 20, 18, 17, 15, 14, 13, 13, 12, 11, 10, 10, 10,
+ 19, 20, 21, 21, 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 17, 18,
+ 19, 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 9, 16, 16, 17, 17, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 8, 8, 8, 14, 15, 15, 16, 15, 14, 13, 12,
+ 11, 10, 10, 9, 8, 8, 8, 7, 13, 13, 14, 14, 14, 13, 12, 11, 10, 10, 9, 8,
+ 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7, 11,
+ 12, 12, 13, 13, 12, 11, 11, 10, 9, 9, 8, 7, 7, 7, 6,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 31, 30, 28, 28, 26, 25, 23, 22, 21, 20,
+ 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 20, 20, 18, 18, 17,
+ 16, 15, 14, 13, 13, 13, 12, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 29, 29, 27, 26, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 15, 14,
+ 13, 13, 13, 12, 12, 12, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 28, 27, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24,
+ 23, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32,
+ 32, 32, 31, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 23, 23, 21, 20, 19,
+ 19, 17, 17, 16, 15, 14, 14, 14, 13, 13, 13, 12, 32, 32, 32, 32, 31, 31,
+ 30, 29, 28, 28, 28, 27, 26, 26, 24, 23, 23, 21, 21, 19, 19, 18, 17, 16,
+ 16, 15, 14, 14, 14, 13, 13, 12, 32, 32, 32, 32, 31, 30, 29, 29, 28, 28,
+ 27, 27, 26, 25, 24, 23, 22, 21, 21, 19, 19, 18, 17, 16, 16, 15, 14, 14,
+ 14, 13, 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23,
+ 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 30, 30, 30, 31, 30, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 21, 20, 19,
+ 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 28, 29, 29, 30,
+ 29, 28, 28, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 28, 29, 29, 30, 29, 28, 27, 27,
+ 24, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 11, 26, 27, 27, 28, 27, 26, 26, 26, 23, 23, 20, 20,
+ 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 25, 26, 26, 27, 26, 26, 26, 25, 23, 22, 20, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 15, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 23, 24,
+ 24, 25, 25, 24, 24, 24, 22, 22, 19, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 11, 11, 22, 23, 23, 24, 24, 23,
+ 23, 23, 21, 21, 19, 19, 17, 17, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 10, 10, 10, 10, 21, 22, 22, 23, 23, 23, 23, 22, 20, 20,
+ 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10,
+ 10, 10, 10, 10, 20, 20, 21, 21, 21, 21, 21, 21, 20, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 10, 10, 9,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 9, 9, 9, 9, 17, 18, 18, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 9, 9, 16, 17, 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12,
+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 16, 16, 16, 17,
+ 17, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 8, 14, 14, 15, 15, 15, 15, 16, 16, 15, 15, 14, 14, 13, 12, 12, 12,
+ 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 8, 13, 13, 14, 14, 14,
+ 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 13, 13,
+ 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 12,
+ 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 12, 12, 13, 13, 13, 13, 14, 14,
+ 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7,
+ 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 12, 12, 11, 11, 11, 10,
+ 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 7,
+ 7, 7, 7, 7, 7, 6, 6, 11, 12, 12, 12, 12, 12, 12, 13, 13, 12, 12, 11, 11,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6,
+ /* Size 4x8 */
+ 32, 29, 20, 13, 32, 28, 20, 14, 30, 24, 19, 14, 27, 20, 15, 12, 21, 17,
+ 13, 10, 17, 15, 11, 9, 14, 13, 10, 8, 13, 12, 9, 7,
+ /* Size 8x4 */
+ 32, 32, 30, 27, 21, 17, 14, 13, 29, 28, 24, 20, 17, 15, 13, 12, 20, 20,
+ 19, 15, 13, 11, 10, 9, 13, 14, 14, 12, 10, 9, 8, 7,
+ /* Size 8x16 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 30, 27, 22, 17, 14, 13, 32, 31, 28, 26, 21, 18, 15, 13, 31, 30, 27, 23,
+ 20, 17, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 26, 27, 23, 19, 16, 14,
+ 12, 12, 23, 25, 22, 17, 15, 13, 11, 11, 21, 23, 20, 17, 14, 12, 11, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 10, 9, 9, 16, 17, 16,
+ 14, 11, 10, 9, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14, 14, 12, 10, 9, 8,
+ 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12, 13, 11, 10, 9, 7, 7,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 11, 33, 32,
+ 32, 31, 30, 29, 27, 25, 23, 21, 19, 17, 15, 14, 13, 12, 31, 31, 30, 28,
+ 27, 24, 23, 22, 20, 19, 18, 16, 15, 14, 13, 13, 26, 26, 27, 26, 23, 20,
+ 19, 17, 17, 16, 15, 14, 13, 12, 11, 11, 20, 21, 22, 21, 20, 18, 16, 15,
+ 14, 13, 12, 11, 11, 10, 10, 10, 16, 17, 17, 18, 17, 15, 14, 13, 12, 11,
+ 10, 10, 9, 9, 8, 9, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8,
+ 7, 7, 12, 12, 13, 13, 13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 31, 28, 26, 23, 20, 19, 16, 16, 13, 13, 12, 11, 33, 32,
+ 32, 32, 31, 29, 26, 24, 21, 20, 17, 16, 14, 13, 12, 12, 33, 32, 32, 32,
+ 31, 29, 26, 24, 21, 20, 17, 17, 14, 13, 12, 12, 33, 32, 32, 31, 31, 30,
+ 27, 25, 22, 21, 17, 17, 14, 14, 13, 13, 33, 32, 32, 31, 30, 29, 27, 25,
+ 22, 21, 17, 17, 14, 14, 13, 13, 32, 32, 31, 30, 29, 28, 26, 24, 21, 20,
+ 17, 17, 14, 14, 13, 13, 32, 32, 31, 29, 28, 28, 26, 24, 21, 21, 18, 17,
+ 15, 14, 13, 13, 32, 31, 31, 29, 28, 27, 25, 24, 21, 21, 18, 17, 15, 15,
+ 14, 13, 31, 31, 30, 28, 27, 25, 23, 22, 20, 19, 17, 16, 14, 14, 13, 13,
+ 30, 30, 30, 28, 26, 24, 23, 21, 19, 19, 16, 16, 14, 14, 13, 12, 28, 30,
+ 29, 27, 24, 21, 20, 19, 18, 17, 15, 15, 13, 13, 12, 12, 28, 29, 29, 27,
+ 24, 21, 20, 19, 17, 17, 15, 15, 13, 13, 12, 12, 26, 28, 27, 26, 23, 20,
+ 19, 18, 16, 16, 14, 14, 12, 12, 12, 12, 26, 27, 26, 25, 23, 20, 18, 17,
+ 16, 15, 14, 13, 12, 12, 11, 11, 23, 25, 25, 24, 22, 19, 17, 16, 15, 14,
+ 13, 13, 11, 11, 11, 11, 22, 24, 24, 23, 21, 19, 17, 16, 14, 14, 12, 12,
+ 11, 11, 11, 10, 21, 23, 23, 22, 20, 18, 17, 15, 14, 13, 12, 12, 11, 10,
+ 10, 10, 20, 21, 21, 21, 20, 17, 16, 15, 13, 13, 11, 11, 10, 10, 10, 10,
+ 19, 21, 21, 20, 19, 17, 16, 14, 13, 12, 11, 11, 10, 10, 9, 10, 18, 19,
+ 19, 19, 18, 16, 15, 14, 12, 12, 11, 10, 9, 9, 9, 9, 18, 19, 19, 19, 18,
+ 16, 15, 14, 12, 12, 10, 10, 9, 9, 9, 9, 16, 17, 17, 18, 17, 15, 14, 13,
+ 12, 11, 10, 10, 9, 9, 8, 8, 16, 17, 17, 17, 16, 15, 14, 13, 11, 11, 10,
+ 10, 9, 8, 8, 8, 14, 16, 16, 16, 15, 14, 13, 12, 11, 11, 9, 9, 8, 8, 8,
+ 8, 14, 15, 15, 16, 15, 14, 13, 12, 11, 10, 9, 9, 8, 8, 8, 8, 13, 14, 14,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 8, 7, 7, 13, 14, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 12, 14, 14, 14, 14, 13, 12, 11, 10,
+ 10, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 13, 12, 11, 11, 10, 9, 8, 8, 7, 7,
+ 7, 7, 12, 13, 13, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 12, 11, 10, 10, 9, 9, 8, 7, 7, 7, 7, 11, 12, 12, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 7, 6,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 28, 27, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 33, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17, 17, 16, 15, 14,
+ 14, 14, 13, 13, 12, 12, 32, 32, 32, 31, 31, 30, 29, 29, 28, 28, 27, 27,
+ 26, 25, 24, 23, 22, 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13,
+ 13, 13, 31, 31, 31, 31, 30, 29, 28, 28, 27, 26, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 18, 18, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 28, 29,
+ 29, 30, 29, 28, 28, 27, 25, 24, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 26, 26, 26, 27, 27, 26,
+ 26, 25, 23, 23, 20, 20, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 22, 21,
+ 19, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10,
+ 19, 20, 20, 21, 21, 20, 21, 21, 19, 19, 17, 17, 16, 15, 14, 14, 13, 13,
+ 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 16, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11, 10, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 9, 9, 16, 16, 17, 17, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8, 8,
+ 8, 13, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 7, 7, 7, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8, 8, 8, 8,
+ 7, 7, 7, 7, 7, 7, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 11, 12,
+ 12, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 6,
+ /* Size 4x16 */
+ 33, 28, 19, 13, 32, 29, 20, 13, 32, 29, 21, 14, 32, 28, 21, 14, 31, 25,
+ 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11, 23, 18, 13, 10,
+ 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 8, 15, 14, 10, 8, 14, 13, 10,
+ 7, 13, 12, 9, 7, 12, 12, 9, 7,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 31, 30, 28, 25, 23, 21, 19, 17, 15, 14, 13, 12, 28, 29,
+ 29, 28, 25, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 12, 19, 20, 21, 21,
+ 19, 17, 16, 14, 13, 12, 12, 11, 10, 10, 9, 9, 13, 13, 14, 14, 14, 13,
+ 12, 11, 10, 10, 9, 8, 8, 7, 7, 7,
+ /* Size 8x32 */
+ 32, 33, 31, 26, 20, 16, 13, 12, 33, 32, 31, 26, 21, 17, 14, 12, 33, 32,
+ 31, 26, 21, 17, 14, 12, 33, 32, 31, 27, 22, 17, 14, 13, 33, 32, 30, 27,
+ 22, 17, 14, 13, 32, 31, 29, 26, 21, 17, 14, 13, 32, 31, 28, 26, 21, 18,
+ 15, 13, 32, 31, 28, 25, 21, 18, 15, 14, 31, 30, 27, 23, 20, 17, 14, 13,
+ 30, 30, 26, 23, 19, 16, 14, 13, 28, 29, 24, 20, 18, 15, 13, 12, 28, 29,
+ 24, 20, 17, 15, 13, 12, 26, 27, 23, 19, 16, 14, 12, 12, 26, 26, 23, 18,
+ 16, 14, 12, 11, 23, 25, 22, 17, 15, 13, 11, 11, 22, 24, 21, 17, 14, 12,
+ 11, 11, 21, 23, 20, 17, 14, 12, 11, 10, 20, 21, 20, 16, 13, 11, 10, 10,
+ 19, 21, 19, 16, 13, 11, 10, 9, 18, 19, 18, 15, 12, 11, 9, 9, 18, 19, 18,
+ 15, 12, 10, 9, 9, 16, 17, 17, 14, 12, 10, 9, 8, 16, 17, 16, 14, 11, 10,
+ 9, 8, 14, 16, 15, 13, 11, 9, 8, 8, 14, 15, 15, 13, 11, 9, 8, 8, 13, 14,
+ 14, 12, 10, 9, 8, 7, 13, 14, 14, 12, 10, 9, 8, 7, 12, 14, 14, 12, 10, 8,
+ 8, 7, 12, 13, 13, 11, 10, 8, 7, 7, 12, 13, 13, 11, 10, 8, 7, 7, 11, 12,
+ 13, 11, 10, 9, 7, 7, 11, 12, 13, 11, 10, 9, 8, 7,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 31, 30, 28, 28, 26, 26, 23, 22, 21, 20,
+ 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 11, 11, 33, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 29, 29, 27, 26, 25, 24, 23, 21, 21, 19, 19, 17,
+ 17, 16, 15, 14, 14, 14, 13, 13, 12, 12, 31, 31, 31, 31, 30, 29, 28, 28,
+ 27, 26, 24, 24, 23, 23, 22, 21, 20, 20, 19, 18, 18, 17, 16, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 26, 26, 26, 27, 27, 26, 26, 25, 23, 23, 20, 20,
+ 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 11, 20, 21, 21, 22, 22, 21, 21, 21, 20, 19, 18, 17, 16, 16, 15, 14,
+ 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 10, 10, 10, 16, 17,
+ 17, 17, 17, 17, 18, 18, 17, 16, 15, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 9, 9, 13, 14, 14, 14, 14, 14, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8,
+ 7, 7, 7, 8, 12, 12, 12, 13, 13, 13, 13, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 21, 18, 22, 19, 19, 17, 21, 19, 15, 13, 18, 17, 13, 11,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 19, 17, 16, 30, 26, 23, 22, 22, 20, 18, 17, 24, 23,
+ 21, 21, 20, 19, 18, 17, 22, 22, 21, 19, 18, 17, 16, 16, 21, 22, 20, 18,
+ 16, 15, 14, 14, 19, 20, 19, 17, 15, 13, 12, 12, 17, 18, 18, 16, 14, 12,
+ 12, 11, 16, 17, 17, 16, 14, 12, 11, 11,
+ /* Size 16x16 */
+ 32, 33, 33, 29, 26, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 33, 33,
+ 32, 28, 25, 22, 22, 22, 21, 21, 20, 19, 18, 17, 17, 16, 33, 32, 30, 26,
+ 24, 22, 22, 23, 22, 22, 21, 20, 19, 18, 17, 17, 29, 28, 26, 23, 22, 22,
+ 22, 23, 22, 22, 21, 20, 19, 18, 18, 17, 26, 25, 24, 22, 21, 20, 21, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 21, 22, 22, 22, 20, 19, 19, 19, 19, 19,
+ 19, 18, 17, 17, 17, 17, 21, 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 16, 21, 22, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 20, 21, 22, 22, 21, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 21, 22, 22, 21, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 19, 20,
+ 21, 21, 20, 19, 18, 17, 16, 15, 14, 14, 13, 13, 13, 13, 18, 19, 20, 20,
+ 19, 18, 17, 16, 15, 14, 14, 13, 13, 12, 12, 12, 17, 18, 19, 19, 19, 17,
+ 17, 16, 15, 14, 13, 13, 12, 12, 12, 12, 16, 17, 18, 18, 18, 17, 16, 15,
+ 14, 14, 13, 12, 12, 11, 11, 11, 16, 17, 17, 18, 17, 17, 16, 15, 14, 13,
+ 13, 12, 12, 11, 11, 11, 15, 16, 17, 17, 17, 17, 16, 15, 14, 13, 13, 12,
+ 12, 11, 11, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 26, 25, 21, 21, 21, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 33,
+ 32, 30, 28, 27, 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 33, 33, 33, 33, 32, 29, 28, 26,
+ 25, 24, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 34, 33, 33, 32, 31, 29, 27, 26, 24, 24, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 33, 32, 32, 31, 30, 28, 26, 25, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 31, 30,
+ 29, 29, 28, 26, 25, 24, 23, 23, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 29, 28, 28, 27, 26, 25,
+ 23, 22, 22, 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 18, 17, 17, 28, 27, 26, 26, 25, 24, 22, 22, 22, 22,
+ 21, 22, 22, 22, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 21, 21, 21,
+ 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17,
+ 25, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22,
+ 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 16, 16, 21, 22, 22, 23, 23, 22, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 15, 15, 21, 22,
+ 22, 23, 23, 23, 23, 23, 21, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 15, 15, 20, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 14, 19, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 13, 13, 13, 13, 13, 13, 13, 13, 19, 20, 20, 21, 21, 21, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13,
+ 13, 13, 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12,
+ 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 12, 17, 18,
+ 18, 19, 19, 19, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18,
+ 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11,
+ 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 16, 16, 17, 17,
+ 17, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ 10, 11, 15, 16, 16, 17, 17, 17, 17, 18, 17, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 4x8 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 23, 19, 18, 16, 22, 19,
+ 16, 14, 20, 18, 15, 12, 18, 17, 14, 11, 17, 16, 13, 11,
+ /* Size 8x4 */
+ 33, 28, 24, 23, 22, 20, 18, 17, 22, 22, 20, 19, 19, 18, 17, 16, 20, 22,
+ 20, 18, 16, 15, 14, 13, 17, 18, 18, 16, 14, 12, 11, 11,
+ /* Size 8x16 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 29,
+ 24, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21,
+ 21, 20, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22, 21, 19, 18, 17,
+ 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 20, 20, 17, 15, 14, 13, 13, 18, 20,
+ 19, 17, 15, 13, 12, 12, 17, 19, 18, 16, 14, 13, 12, 12, 16, 18, 18, 16,
+ 14, 12, 12, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13,
+ 12, 11,
+ /* Size 16x8 */
+ 32, 33, 33, 29, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 15, 32, 31,
+ 29, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 17, 26, 25, 24, 22,
+ 21, 20, 21, 21, 21, 20, 20, 19, 18, 18, 17, 17, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 20, 21, 22, 22, 21, 19, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 14, 18, 19, 20, 20, 20, 18, 17, 16, 15, 14,
+ 14, 13, 13, 12, 12, 13, 16, 17, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 12, 12, 11, 12, 15, 16, 17, 18, 17, 17, 16, 15, 14, 13, 13, 12, 12, 11,
+ 11, 11,
+ /* Size 16x32 */
+ 32, 33, 32, 28, 26, 21, 21, 21, 20, 20, 18, 18, 16, 16, 15, 15, 33, 33,
+ 31, 27, 25, 22, 22, 22, 21, 20, 19, 19, 17, 17, 16, 16, 33, 33, 31, 27,
+ 25, 22, 22, 22, 21, 21, 19, 19, 17, 17, 16, 16, 34, 32, 31, 26, 24, 22,
+ 23, 23, 22, 21, 20, 20, 18, 18, 17, 17, 33, 31, 29, 25, 24, 22, 22, 23,
+ 22, 21, 20, 20, 18, 18, 17, 17, 31, 28, 28, 24, 23, 22, 22, 22, 22, 22,
+ 20, 20, 18, 18, 17, 17, 29, 27, 26, 23, 22, 22, 22, 23, 22, 22, 20, 20,
+ 19, 18, 18, 17, 28, 26, 25, 22, 22, 22, 22, 23, 22, 22, 20, 20, 19, 19,
+ 18, 18, 25, 24, 24, 22, 21, 21, 21, 21, 21, 20, 20, 19, 18, 18, 17, 18,
+ 24, 24, 24, 22, 21, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 21, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 22, 21,
+ 20, 19, 19, 19, 19, 19, 18, 18, 17, 17, 16, 16, 21, 22, 22, 22, 21, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 21, 23, 22, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 16, 16, 16, 15, 21, 23, 23, 22, 21, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 15, 15, 21, 22, 22, 22, 21, 19, 18, 17, 17, 17, 16, 16,
+ 15, 15, 15, 15, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14,
+ 14, 14, 20, 22, 22, 22, 21, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14,
+ 20, 21, 21, 22, 20, 19, 18, 17, 16, 16, 14, 14, 14, 14, 13, 14, 19, 20,
+ 21, 21, 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 19, 20, 20, 21,
+ 20, 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 13, 18, 20, 20, 20, 20, 18,
+ 17, 16, 15, 15, 13, 13, 12, 12, 12, 12, 18, 20, 20, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 12, 12, 12, 12, 17, 19, 19, 20, 19, 18, 17, 16, 14, 14,
+ 13, 13, 12, 12, 12, 12, 17, 18, 19, 19, 18, 17, 16, 16, 14, 14, 13, 13,
+ 12, 12, 12, 12, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11,
+ 11, 11, 16, 18, 18, 19, 18, 17, 16, 15, 14, 14, 12, 12, 12, 11, 11, 11,
+ 16, 17, 18, 18, 18, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11, 11, 16, 17,
+ 17, 18, 17, 17, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18,
+ 17, 16, 16, 15, 14, 13, 12, 12, 11, 11, 11, 11, 15, 17, 17, 18, 17, 16,
+ 16, 14, 14, 13, 13, 12, 12, 11, 11, 11, 15, 17, 17, 17, 17, 16, 16, 14,
+ 14, 13, 13, 12, 12, 11, 11, 10,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 33, 33, 33, 32,
+ 31, 28, 27, 26, 24, 24, 22, 22, 22, 23, 23, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 18, 18, 18, 17, 17, 17, 17, 17, 32, 31, 31, 31, 29, 28, 26, 25,
+ 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 17, 28, 27, 27, 26, 25, 24, 23, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 26, 25, 25, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 22, 23, 23, 21, 21,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 20, 20, 21, 21, 21, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 18, 19, 19, 20,
+ 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 19, 19, 18, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 18, 18, 17, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16,
+ 16, 17, 17, 17, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 15, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 11, 10,
+ /* Size 4x16 */
+ 33, 21, 20, 16, 33, 22, 21, 17, 31, 22, 21, 18, 27, 22, 22, 18, 24, 21,
+ 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15, 22, 19, 16, 14,
+ 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 14, 12, 18, 17, 14, 12, 18, 17,
+ 14, 11, 17, 17, 13, 11, 17, 16, 13, 11,
+ /* Size 16x4 */
+ 33, 33, 31, 27, 24, 22, 22, 23, 22, 21, 20, 20, 18, 18, 17, 17, 21, 22,
+ 22, 22, 21, 19, 19, 19, 19, 19, 19, 18, 17, 17, 17, 16, 20, 21, 21, 22,
+ 20, 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 13, 16, 17, 18, 18, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 32, 26, 21, 20, 18, 16, 15, 33, 31, 25, 22, 21, 19, 17, 16, 33, 31,
+ 25, 22, 21, 19, 17, 16, 34, 31, 24, 23, 22, 20, 18, 17, 33, 29, 24, 22,
+ 22, 20, 18, 17, 31, 28, 23, 22, 22, 20, 18, 17, 29, 26, 22, 22, 22, 20,
+ 19, 18, 28, 25, 22, 22, 22, 20, 19, 18, 25, 24, 21, 21, 21, 20, 18, 17,
+ 24, 24, 21, 21, 20, 19, 18, 17, 21, 22, 20, 19, 19, 18, 17, 17, 21, 22,
+ 20, 19, 19, 18, 17, 16, 21, 22, 21, 19, 18, 17, 16, 16, 21, 22, 21, 19,
+ 18, 17, 16, 16, 21, 23, 21, 18, 17, 16, 15, 15, 21, 22, 21, 18, 17, 16,
+ 15, 15, 20, 22, 21, 18, 16, 15, 14, 14, 20, 22, 21, 18, 16, 15, 14, 14,
+ 20, 21, 20, 18, 16, 14, 14, 13, 19, 21, 20, 17, 15, 14, 13, 13, 19, 20,
+ 20, 17, 15, 14, 13, 13, 18, 20, 20, 17, 15, 13, 12, 12, 18, 20, 19, 17,
+ 15, 13, 12, 12, 17, 19, 19, 17, 14, 13, 12, 12, 17, 19, 18, 16, 14, 13,
+ 12, 12, 16, 18, 18, 16, 14, 12, 12, 11, 16, 18, 18, 16, 14, 12, 12, 11,
+ 16, 18, 18, 16, 14, 12, 11, 11, 16, 17, 17, 16, 14, 12, 11, 11, 15, 17,
+ 17, 16, 14, 12, 11, 11, 15, 17, 17, 16, 14, 13, 12, 11, 15, 17, 17, 16,
+ 14, 13, 12, 11,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 33, 31, 29, 28, 25, 24, 21, 21, 21, 21, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 32, 31, 31, 31,
+ 29, 28, 26, 25, 24, 24, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 26, 25, 25, 24, 24, 23, 22, 22,
+ 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 18, 17, 17, 17, 17, 21, 22, 22, 23, 22, 22, 22, 22, 21, 21, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 14, 14, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14,
+ 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 13, 13, 16, 17, 17, 18, 18, 18,
+ 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 11, 11, 11, 12, 12, 15, 16, 16, 17, 17, 17, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 11, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 30, 21, 14, 30, 21, 17, 13, 21, 17, 12, 10, 14, 13, 10, 8,
+ /* Size 8x8 */
+ 32, 32, 30, 27, 22, 18, 15, 13, 32, 31, 29, 26, 23, 19, 16, 14, 30, 29,
+ 26, 23, 20, 18, 15, 13, 27, 26, 23, 19, 17, 15, 13, 12, 22, 23, 20, 17,
+ 14, 13, 11, 10, 18, 19, 18, 15, 13, 11, 10, 9, 15, 16, 15, 13, 11, 10,
+ 9, 8, 13, 14, 13, 12, 10, 9, 8, 7,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 28, 26, 23, 21, 19, 17, 16, 14, 13, 12, 33, 32,
+ 32, 32, 32, 30, 29, 27, 24, 22, 20, 18, 17, 15, 13, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 33, 32, 32, 31, 30, 29,
+ 28, 26, 24, 23, 20, 19, 17, 16, 14, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 24, 22, 21, 19, 18, 16, 15, 14, 30, 30, 31, 29, 28, 26, 24, 23, 22, 20,
+ 19, 18, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 20, 19, 18, 17, 16,
+ 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 19, 18, 17, 16, 15, 14, 13,
+ 12, 12, 23, 24, 25, 24, 24, 22, 19, 18, 16, 15, 14, 14, 13, 12, 11, 11,
+ 21, 22, 23, 23, 22, 20, 18, 17, 15, 14, 13, 13, 12, 11, 11, 10, 19, 20,
+ 21, 20, 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 17, 18, 19, 19,
+ 19, 18, 16, 15, 14, 13, 12, 11, 10, 10, 9, 9, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 12, 11, 10, 10, 9, 9, 8, 14, 15, 16, 16, 16, 15, 14, 13, 12,
+ 11, 11, 10, 9, 9, 8, 8, 13, 13, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9,
+ 9, 8, 8, 7, 12, 13, 14, 14, 14, 13, 13, 12, 11, 10, 10, 9, 8, 8, 7, 7,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18,
+ 18, 17, 17, 15, 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 30, 29, 29, 27, 27, 24, 24, 22, 22, 20, 20, 18, 18, 17, 17, 15,
+ 15, 13, 13, 13, 13, 12, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14,
+ 14, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 28, 28, 25,
+ 25, 23, 23, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20,
+ 20, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 20, 20, 19, 19, 17,
+ 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28,
+ 28, 27, 27, 26, 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15,
+ 15, 14, 14, 14, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 24, 22, 22, 21, 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20,
+ 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 13, 28, 29, 29, 30, 30, 28, 28, 27,
+ 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 26, 27,
+ 27, 28, 28, 26, 26, 26, 26, 23, 23, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 23, 24, 24, 25, 25, 24,
+ 24, 24, 24, 22, 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22,
+ 22, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10,
+ 21, 22, 22, 23, 23, 23, 23, 22, 22, 20, 20, 18, 18, 17, 17, 15, 15, 14,
+ 14, 13, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 10, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21, 21, 20, 20, 21,
+ 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 10, 10, 10, 10, 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 9, 17, 18, 18, 19, 19, 19, 19, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 16, 17, 17, 17,
+ 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9,
+ 8, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12,
+ 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 14, 15, 15, 16,
+ 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14,
+ 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7,
+ 7, 7, 13, 13, 13, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 12, 13, 13, 14, 14,
+ 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 7, 12, 13, 13, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7, 12,
+ 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 4x8 */
+ 32, 29, 20, 14, 32, 28, 20, 14, 30, 24, 19, 14, 28, 20, 16, 12, 23, 18,
+ 13, 11, 19, 16, 12, 9, 16, 14, 11, 8, 14, 13, 10, 8,
+ /* Size 8x4 */
+ 32, 32, 30, 28, 23, 19, 16, 14, 29, 28, 24, 20, 18, 16, 14, 13, 20, 20,
+ 19, 16, 13, 12, 11, 10, 14, 14, 14, 12, 11, 9, 8, 8,
+ /* Size 8x16 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 31, 30, 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 31, 29, 27,
+ 24, 21, 18, 15, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30, 27, 21, 19, 17,
+ 15, 13, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14, 13, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 18, 19,
+ 19, 16, 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 14, 16, 16, 14,
+ 12, 11, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13, 11, 10, 8,
+ 8,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 32, 30, 28, 26, 23, 21, 19, 18, 16, 14, 13, 12, 33, 32,
+ 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 24, 22, 20, 19, 18, 16, 15, 14, 28, 29, 30, 28, 27, 24,
+ 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 23, 24, 25, 24, 24, 21, 19, 18,
+ 16, 15, 14, 14, 13, 12, 11, 11, 19, 20, 21, 20, 21, 19, 17, 16, 14, 13,
+ 12, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 16, 15, 14, 13, 12, 11, 10,
+ 10, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 23, 19, 19, 16, 16, 13, 13, 12, 33, 32,
+ 32, 32, 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 32,
+ 32, 29, 29, 24, 24, 20, 20, 17, 17, 14, 14, 12, 33, 32, 32, 31, 31, 30,
+ 30, 25, 25, 21, 21, 17, 17, 14, 14, 13, 33, 32, 32, 31, 31, 30, 30, 25,
+ 25, 21, 21, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20,
+ 20, 17, 17, 14, 14, 13, 32, 32, 32, 30, 30, 28, 28, 24, 24, 20, 20, 17,
+ 17, 14, 14, 13, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15,
+ 15, 14, 32, 31, 31, 29, 29, 27, 27, 24, 24, 21, 21, 18, 18, 15, 15, 14,
+ 30, 30, 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 30, 30,
+ 30, 28, 28, 24, 24, 21, 21, 19, 19, 16, 16, 14, 14, 13, 28, 30, 30, 27,
+ 27, 21, 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 28, 30, 30, 27, 27, 21,
+ 21, 19, 19, 17, 17, 15, 15, 13, 13, 12, 26, 28, 28, 26, 26, 20, 20, 18,
+ 18, 16, 16, 14, 14, 12, 12, 12, 26, 28, 28, 26, 26, 20, 20, 18, 18, 16,
+ 16, 14, 14, 12, 12, 12, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13,
+ 13, 11, 11, 11, 23, 25, 25, 24, 24, 19, 19, 16, 16, 14, 14, 13, 13, 11,
+ 11, 11, 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10,
+ 21, 23, 23, 22, 22, 18, 18, 15, 15, 13, 13, 12, 12, 11, 11, 10, 19, 21,
+ 21, 20, 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 19, 21, 21, 20,
+ 20, 17, 17, 14, 14, 12, 12, 11, 11, 10, 10, 9, 18, 19, 19, 19, 19, 16,
+ 16, 14, 14, 12, 12, 10, 10, 9, 9, 9, 18, 19, 19, 19, 19, 16, 16, 14, 14,
+ 12, 12, 10, 10, 9, 9, 9, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10,
+ 10, 9, 9, 8, 16, 17, 17, 18, 18, 15, 15, 13, 13, 11, 11, 10, 10, 9, 9,
+ 8, 14, 16, 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 14, 16,
+ 16, 16, 16, 14, 14, 12, 12, 11, 11, 9, 9, 8, 8, 8, 13, 14, 14, 15, 15,
+ 13, 13, 11, 11, 10, 10, 9, 9, 8, 8, 7, 13, 14, 14, 15, 15, 13, 13, 11,
+ 11, 10, 10, 9, 9, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10,
+ 8, 8, 8, 8, 7, 12, 14, 14, 14, 14, 13, 13, 11, 11, 10, 10, 8, 8, 8, 8,
+ 7, 12, 13, 13, 13, 13, 12, 12, 11, 11, 9, 9, 8, 8, 7, 7, 7,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 33, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19, 19, 17, 17, 16,
+ 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27,
+ 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14,
+ 14, 13, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 20, 20, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 13, 28, 29,
+ 29, 30, 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21,
+ 21, 19, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11,
+ 11, 11, 11, 11, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11,
+ 19, 20, 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13,
+ 13, 12, 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 19, 20, 20, 21,
+ 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17, 17, 18,
+ 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9,
+ 9, 9, 9, 8, 8, 8, 16, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15,
+ 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 8, 8, 8, 13,
+ 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11,
+ 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7, 13, 14, 14, 14, 14, 14, 14, 15,
+ 15, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8,
+ 8, 8, 8, 7, 12, 12, 12, 13, 13, 13, 13, 14, 14, 13, 13, 12, 12, 12, 12,
+ 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 7, 7, 7, 7, 7,
+ /* Size 4x16 */
+ 33, 28, 19, 13, 32, 29, 20, 14, 32, 30, 21, 14, 32, 28, 20, 14, 31, 27,
+ 21, 15, 30, 24, 19, 14, 30, 21, 17, 13, 28, 20, 16, 12, 25, 19, 14, 11,
+ 23, 18, 13, 11, 21, 17, 12, 10, 19, 16, 12, 9, 17, 15, 11, 9, 16, 14,
+ 11, 8, 14, 13, 10, 8, 14, 13, 10, 8,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 31, 30, 30, 28, 25, 23, 21, 19, 17, 16, 14, 14, 28, 29,
+ 30, 28, 27, 24, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 19, 20, 21, 20,
+ 21, 19, 17, 16, 14, 13, 12, 12, 11, 11, 10, 10, 13, 14, 14, 14, 15, 14,
+ 13, 12, 11, 11, 10, 9, 9, 8, 8, 8,
+ /* Size 8x32 */
+ 32, 33, 32, 28, 23, 19, 16, 13, 33, 32, 32, 29, 24, 20, 17, 14, 33, 32,
+ 32, 29, 24, 20, 17, 14, 33, 32, 31, 30, 25, 21, 17, 14, 33, 32, 31, 30,
+ 25, 21, 17, 14, 32, 32, 30, 28, 24, 20, 17, 14, 32, 32, 30, 28, 24, 20,
+ 17, 14, 32, 31, 29, 27, 24, 21, 18, 15, 32, 31, 29, 27, 24, 21, 18, 15,
+ 30, 30, 28, 24, 21, 19, 16, 14, 30, 30, 28, 24, 21, 19, 16, 14, 28, 30,
+ 27, 21, 19, 17, 15, 13, 28, 30, 27, 21, 19, 17, 15, 13, 26, 28, 26, 20,
+ 18, 16, 14, 12, 26, 28, 26, 20, 18, 16, 14, 12, 23, 25, 24, 19, 16, 14,
+ 13, 11, 23, 25, 24, 19, 16, 14, 13, 11, 21, 23, 22, 18, 15, 13, 12, 11,
+ 21, 23, 22, 18, 15, 13, 12, 11, 19, 21, 20, 17, 14, 12, 11, 10, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 10, 9, 18, 19, 19, 16,
+ 14, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9, 16, 17, 18, 15, 13, 11,
+ 10, 9, 14, 16, 16, 14, 12, 11, 9, 8, 14, 16, 16, 14, 12, 11, 9, 8, 13,
+ 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10, 9, 8, 12, 14, 14, 13,
+ 11, 10, 8, 8, 12, 14, 14, 13, 11, 10, 8, 8, 12, 13, 13, 12, 11, 9, 8, 7,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 23, 21,
+ 21, 19, 19, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 25, 23, 23, 21, 21, 19,
+ 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 24, 22, 22, 20, 20, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 13, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 13, 12, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 11, 11, 11, 11, 11, 19, 20,
+ 20, 21, 21, 20, 20, 21, 21, 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 10, 10, 10, 10, 9, 16, 17, 17, 17, 17, 17,
+ 17, 18, 18, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 14, 14, 13,
+ 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 7 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 22, 22, 18, 22, 19, 19, 17, 22, 19, 16, 14, 18, 17, 14, 12,
+ /* Size 8x8 */
+ 33, 30, 24, 22, 21, 20, 18, 17, 30, 26, 23, 22, 22, 21, 19, 18, 24, 23,
+ 21, 21, 20, 20, 19, 18, 22, 22, 21, 19, 18, 18, 17, 16, 21, 22, 20, 18,
+ 17, 16, 15, 14, 20, 21, 20, 18, 16, 14, 14, 13, 18, 19, 19, 17, 15, 14,
+ 12, 12, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 21, 20, 20, 19, 18, 17, 17, 34, 33, 32, 29,
+ 26, 24, 22, 23, 23, 22, 22, 21, 20, 19, 18, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 23, 22, 22, 21, 20, 19, 18, 18, 28, 27, 26, 24, 22, 22, 21, 22,
+ 23, 22, 22, 21, 20, 20, 19, 19, 25, 24, 24, 23, 22, 21, 20, 21, 21, 20,
+ 20, 20, 19, 19, 18, 18, 21, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19,
+ 18, 18, 17, 17, 21, 22, 23, 22, 22, 21, 19, 19, 19, 18, 18, 18, 17, 17,
+ 16, 16, 21, 22, 23, 23, 23, 21, 19, 19, 18, 17, 17, 17, 16, 16, 15, 15,
+ 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 20,
+ 22, 22, 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 19, 20, 21, 21,
+ 21, 20, 19, 18, 17, 16, 15, 14, 14, 14, 13, 13, 18, 19, 20, 20, 20, 19,
+ 18, 17, 16, 15, 15, 14, 13, 13, 12, 12, 17, 18, 19, 19, 20, 19, 18, 17,
+ 16, 15, 14, 14, 13, 12, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14,
+ 14, 13, 12, 12, 12, 11, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13,
+ 12, 12, 11, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 17, 16, 33, 33, 33, 33, 33, 30, 30, 27,
+ 27, 24, 24, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22,
+ 22, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18,
+ 18, 17, 34, 33, 33, 32, 32, 29, 29, 26, 26, 24, 24, 22, 22, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 18, 18, 17, 31, 30, 30, 29, 29, 26,
+ 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 18, 18, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22,
+ 22, 21, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 18,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 25, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20,
+ 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 17, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22,
+ 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16, 21, 22, 22, 23, 23, 23,
+ 23, 23, 23, 21, 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 15, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21,
+ 21, 19, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18,
+ 18, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14,
+ 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 22,
+ 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 22, 22, 22, 22, 22,
+ 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14,
+ 14, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 19, 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 18, 19,
+ 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 12,
+ 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18,
+ 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19,
+ 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12,
+ 12, 11, 11, 11, 11, 11, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 11,
+ 11, 11, 15, 16, 16, 17, 17, 17, 17, 18, 18, 17, 17, 17, 17, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 4x8 */
+ 33, 22, 20, 17, 28, 22, 22, 18, 24, 20, 20, 18, 22, 19, 18, 16, 22, 19,
+ 16, 14, 20, 19, 15, 13, 19, 18, 14, 12, 17, 17, 14, 11,
+ /* Size 8x4 */
+ 33, 28, 24, 22, 22, 20, 19, 17, 22, 22, 20, 19, 19, 19, 18, 17, 20, 22,
+ 20, 18, 16, 15, 14, 14, 17, 18, 18, 16, 14, 13, 12, 11,
+ /* Size 8x16 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 34, 32,
+ 26, 22, 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 28, 26, 22, 22,
+ 23, 22, 20, 19, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22, 21, 19, 19, 19,
+ 18, 17, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17, 16, 15,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 19, 20,
+ 21, 19, 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 17, 19, 20, 18,
+ 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17, 18, 17, 15, 14,
+ 12, 11,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 28, 24, 21, 21, 21, 20, 20, 19, 18, 17, 16, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 21, 20, 20, 19, 18, 21, 22, 22, 22, 22, 20,
+ 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 23, 22, 23, 21, 19, 18,
+ 18, 17, 17, 17, 16, 16, 15, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 18, 19, 20, 20, 20, 19, 18, 17, 16, 15, 14, 14,
+ 13, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 16, 15, 14, 14, 13, 12, 12,
+ 12, 11,
+ /* Size 16x32 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 21, 20, 20, 18, 18, 16, 16, 16, 33, 33,
+ 33, 27, 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 33, 33, 33, 27,
+ 27, 22, 22, 22, 22, 20, 20, 19, 19, 17, 17, 16, 34, 32, 32, 26, 26, 22,
+ 22, 23, 23, 21, 21, 20, 20, 18, 18, 17, 34, 32, 32, 26, 26, 22, 22, 23,
+ 23, 21, 21, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22,
+ 22, 20, 20, 18, 18, 17, 31, 28, 28, 24, 24, 22, 22, 22, 22, 22, 22, 20,
+ 20, 18, 18, 17, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19,
+ 19, 18, 28, 26, 26, 22, 22, 22, 22, 23, 23, 22, 22, 20, 20, 19, 19, 18,
+ 24, 24, 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 24, 24,
+ 24, 22, 22, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 17, 21, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 21, 21, 19,
+ 19, 19, 19, 19, 19, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 19, 19, 18,
+ 18, 18, 18, 17, 17, 16, 16, 16, 21, 22, 22, 22, 22, 19, 19, 18, 18, 18,
+ 18, 17, 17, 16, 16, 16, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16,
+ 16, 15, 15, 15, 21, 23, 23, 22, 22, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 15, 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 20, 22, 22, 22, 22, 19, 19, 17, 17, 16, 16, 15, 15, 14, 14, 14, 20, 21,
+ 21, 22, 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 20, 21, 21, 22,
+ 22, 19, 19, 17, 17, 16, 16, 14, 14, 14, 14, 13, 19, 20, 20, 21, 21, 19,
+ 19, 17, 17, 15, 15, 14, 14, 13, 13, 13, 19, 20, 20, 21, 21, 19, 19, 17,
+ 17, 15, 15, 14, 14, 13, 13, 13, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 12, 18, 20, 20, 20, 20, 18, 18, 16, 16, 15, 15, 13,
+ 13, 12, 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12,
+ 12, 12, 17, 19, 19, 20, 20, 18, 18, 16, 16, 14, 14, 13, 13, 12, 12, 12,
+ 16, 18, 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 18,
+ 18, 19, 19, 17, 17, 15, 15, 14, 14, 12, 12, 12, 12, 11, 16, 17, 17, 18,
+ 18, 17, 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 17,
+ 17, 15, 15, 14, 14, 12, 12, 11, 11, 11, 16, 17, 17, 18, 18, 16, 16, 15,
+ 15, 13, 13, 12, 12, 11, 11, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 33, 33, 33, 32, 32, 28, 28, 26,
+ 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18,
+ 18, 18, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21,
+ 21, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 15, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15,
+ 20, 20, 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15,
+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 12, 12, 12, 12, 12, 18, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 17,
+ 17, 18, 18, 18, 18, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14,
+ 14, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 16, 17, 17, 17,
+ 17, 18, 18, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 13, 12,
+ 12, 12, 12, 11, 11, 11, 11, 11,
+ /* Size 4x16 */
+ 33, 21, 20, 16, 33, 22, 20, 17, 32, 22, 21, 18, 28, 22, 22, 18, 26, 22,
+ 22, 19, 24, 20, 20, 18, 22, 19, 19, 17, 22, 19, 18, 16, 23, 19, 17, 15,
+ 22, 19, 16, 14, 21, 19, 16, 14, 20, 19, 15, 13, 20, 18, 15, 12, 19, 18,
+ 14, 12, 18, 17, 14, 12, 17, 17, 14, 11,
+ /* Size 16x4 */
+ 33, 33, 32, 28, 26, 24, 22, 22, 23, 22, 21, 20, 20, 19, 18, 17, 21, 22,
+ 22, 22, 22, 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 17, 20, 20, 21, 22,
+ 22, 20, 19, 18, 17, 16, 16, 15, 15, 14, 14, 14, 16, 17, 18, 18, 19, 18,
+ 17, 16, 15, 14, 14, 13, 12, 12, 12, 11,
+ /* Size 8x32 */
+ 32, 33, 28, 21, 21, 20, 18, 16, 33, 33, 27, 22, 22, 20, 19, 17, 33, 33,
+ 27, 22, 22, 20, 19, 17, 34, 32, 26, 22, 23, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 18, 31, 28, 24, 22, 22, 22, 20, 18, 31, 28, 24, 22, 22, 22,
+ 20, 18, 28, 26, 22, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 20, 19,
+ 24, 24, 22, 20, 21, 20, 19, 18, 24, 24, 22, 20, 21, 20, 19, 18, 21, 22,
+ 21, 19, 19, 19, 18, 17, 21, 22, 21, 19, 19, 19, 18, 17, 21, 22, 22, 19,
+ 18, 18, 17, 16, 21, 22, 22, 19, 18, 18, 17, 16, 21, 23, 22, 19, 18, 17,
+ 16, 15, 21, 23, 22, 19, 18, 17, 16, 15, 20, 22, 22, 19, 17, 16, 15, 14,
+ 20, 22, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19, 17, 16, 14, 14, 20, 21,
+ 22, 19, 17, 16, 14, 14, 19, 20, 21, 19, 17, 15, 14, 13, 19, 20, 21, 19,
+ 17, 15, 14, 13, 18, 20, 20, 18, 16, 15, 13, 12, 18, 20, 20, 18, 16, 15,
+ 13, 12, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19, 20, 18, 16, 14, 13, 12,
+ 16, 18, 19, 17, 15, 14, 12, 12, 16, 18, 19, 17, 15, 14, 12, 12, 16, 17,
+ 18, 17, 15, 14, 12, 11, 16, 17, 18, 17, 15, 14, 12, 11, 16, 17, 18, 16,
+ 15, 13, 12, 11,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 16, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 19, 19, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17,
+ 17, 16, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 15, 20, 20,
+ 20, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 18, 19, 19, 20, 20, 20,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 18,
+ 18, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12,
+ 12, 11, 11, 11 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 23, 17, 31, 26, 20, 16, 23, 20, 14, 12, 17, 16, 12, 9,
+ /* Size 8x8 */
+ 33, 32, 32, 29, 24, 20, 17, 15, 32, 32, 31, 29, 25, 21, 18, 16, 32, 31,
+ 29, 27, 24, 21, 18, 16, 29, 29, 27, 21, 19, 17, 16, 14, 24, 25, 24, 19,
+ 16, 14, 13, 12, 20, 21, 21, 17, 14, 13, 12, 11, 17, 18, 18, 16, 13, 12,
+ 10, 9, 15, 16, 16, 14, 12, 11, 9, 9,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 30, 29, 27, 25, 23, 21, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 26, 24, 22, 20, 18, 17, 15, 13, 33, 32, 32, 32,
+ 32, 31, 30, 28, 27, 25, 23, 21, 19, 17, 16, 14, 33, 32, 32, 31, 30, 29,
+ 28, 27, 26, 24, 23, 20, 19, 17, 16, 14, 32, 32, 32, 30, 29, 28, 27, 26,
+ 25, 24, 22, 21, 19, 18, 16, 15, 30, 30, 31, 29, 28, 26, 24, 23, 22, 21,
+ 20, 19, 18, 16, 15, 14, 29, 29, 30, 28, 27, 24, 22, 21, 20, 19, 19, 17,
+ 17, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 19, 18, 17, 16, 15, 14,
+ 13, 12, 25, 26, 27, 26, 25, 22, 20, 19, 18, 17, 16, 15, 14, 14, 13, 12,
+ 23, 24, 25, 24, 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 21, 22,
+ 23, 23, 22, 20, 19, 17, 16, 15, 14, 13, 13, 12, 11, 11, 19, 20, 21, 20,
+ 21, 19, 17, 16, 15, 14, 13, 12, 12, 11, 11, 10, 17, 18, 19, 19, 19, 18,
+ 17, 15, 14, 13, 13, 12, 11, 10, 10, 9, 16, 17, 17, 17, 18, 16, 15, 14,
+ 14, 13, 12, 11, 10, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12,
+ 11, 11, 10, 9, 9, 8, 13, 13, 14, 14, 15, 14, 13, 12, 12, 11, 11, 10, 9,
+ 9, 8, 8,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 17, 17, 16, 15, 14, 14, 13, 13, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20,
+ 20, 19, 18, 17, 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 30, 30, 29, 29, 28, 27, 26, 24, 24, 22, 22, 20, 20, 19, 18, 17,
+ 17, 16, 15, 15, 13, 13, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 28, 27, 26, 25, 24, 23, 23, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 14, 14, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 28, 28,
+ 27, 25, 25, 23, 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 28, 27, 26, 25, 24, 23,
+ 23, 21, 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 20, 20,
+ 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 21, 21, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 22, 22, 21, 21, 20, 19, 19, 18, 17, 16, 16, 15, 15,
+ 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 25, 25, 24,
+ 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30, 30, 31,
+ 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 30, 30, 30, 31, 31, 30, 29, 29,
+ 28, 28, 26, 26, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 29, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24,
+ 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 17, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 28, 29, 29, 30, 30, 29, 28, 28, 27, 27, 24, 24, 22, 21, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 27, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26, 23, 23, 21, 20, 20, 20, 19, 18, 18, 17,
+ 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12, 26, 27, 27, 27, 28, 27,
+ 26, 26, 26, 25, 23, 23, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 25, 26, 26, 26, 27, 26, 26, 26, 25, 25,
+ 22, 22, 20, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 22, 22, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 24, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 21, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14,
+ 13, 13, 13, 12, 12, 12, 11, 11, 11, 11, 21, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 20, 20, 19, 18, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 11, 11, 11, 11, 19, 20, 20, 21, 21, 21, 21, 21, 21, 21, 19, 19,
+ 18, 17, 17, 16, 15, 14, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11,
+ 10, 10, 19, 20, 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16,
+ 15, 14, 14, 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 18, 18, 17, 17, 16, 15, 15, 14, 14, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 17, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11,
+ 11, 11, 10, 10, 10, 10, 9, 9, 17, 17, 17, 18, 18, 18, 18, 18, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 12, 11, 11, 10, 10, 10,
+ 10, 9, 9, 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14,
+ 14, 14, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 15, 16,
+ 16, 17, 17, 17, 17, 17, 17, 17, 16, 16, 15, 15, 14, 14, 13, 13, 12, 12,
+ 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9, 14, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10,
+ 10, 9, 9, 9, 9, 8, 8, 14, 15, 15, 16, 16, 16, 16, 16, 16, 16, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11,
+ 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8, 8, 8, 13, 13, 13, 14, 14, 14,
+ 14, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 10, 10, 9,
+ 9, 9, 9, 9, 8, 8, 8, 8,
+ /* Size 4x8 */
+ 32, 30, 24, 17, 32, 30, 24, 17, 31, 28, 23, 18, 29, 24, 19, 15, 25, 21,
+ 16, 13, 21, 19, 14, 11, 18, 17, 13, 10, 16, 15, 12, 9,
+ /* Size 8x4 */
+ 32, 32, 31, 29, 25, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 24, 24,
+ 23, 19, 16, 14, 13, 12, 17, 17, 18, 15, 13, 11, 10, 9,
+ /* Size 8x16 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 31, 30, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20, 18, 16, 32, 31, 29, 27,
+ 24, 21, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17,
+ 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 25, 26, 25, 20, 17, 15, 14, 13,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 19, 21,
+ 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12, 11, 10, 16, 17, 18, 15,
+ 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 13, 14, 15, 13, 11, 10, 9,
+ 8,
+ /* Size 16x8 */
+ 32, 33, 33, 32, 32, 30, 29, 27, 25, 23, 21, 19, 18, 16, 14, 13, 33, 32,
+ 32, 32, 31, 30, 30, 28, 26, 24, 23, 21, 19, 17, 16, 14, 32, 32, 31, 30,
+ 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 16, 15, 28, 29, 30, 28, 27, 24,
+ 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 23, 24, 25, 24, 24, 21, 20, 18,
+ 17, 16, 15, 14, 14, 13, 12, 11, 19, 20, 21, 20, 21, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 17, 17, 18, 18, 18, 17, 16, 15, 14, 13, 12, 11,
+ 11, 10, 9, 9, 14, 15, 16, 16, 16, 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ 8,
+ /* Size 16x32 */
+ 32, 33, 33, 32, 32, 30, 28, 27, 23, 23, 19, 19, 17, 16, 14, 13, 33, 32,
+ 32, 32, 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32,
+ 32, 30, 29, 28, 24, 24, 20, 20, 17, 17, 15, 14, 33, 32, 32, 32, 32, 31,
+ 29, 28, 25, 24, 20, 20, 18, 17, 15, 14, 33, 32, 32, 32, 31, 31, 30, 28,
+ 25, 25, 21, 21, 18, 17, 16, 14, 33, 32, 32, 31, 31, 30, 29, 28, 25, 24,
+ 21, 21, 18, 17, 16, 14, 32, 32, 32, 31, 30, 29, 28, 27, 24, 24, 20, 20,
+ 18, 17, 16, 14, 32, 32, 32, 30, 30, 29, 28, 27, 24, 24, 21, 21, 18, 17,
+ 16, 15, 32, 32, 31, 30, 29, 28, 27, 26, 24, 24, 21, 21, 18, 18, 16, 15,
+ 32, 31, 31, 30, 29, 28, 26, 26, 24, 23, 20, 20, 18, 18, 16, 15, 30, 30,
+ 30, 28, 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 30, 30, 30, 28,
+ 28, 26, 24, 23, 21, 21, 19, 19, 17, 16, 15, 14, 29, 30, 30, 28, 27, 24,
+ 22, 21, 20, 19, 17, 17, 16, 15, 14, 13, 28, 29, 30, 28, 27, 24, 21, 21,
+ 19, 19, 17, 17, 16, 15, 14, 13, 27, 28, 28, 27, 26, 23, 21, 20, 18, 18,
+ 16, 16, 15, 14, 13, 13, 26, 27, 28, 26, 26, 23, 20, 20, 18, 18, 16, 16,
+ 14, 14, 13, 12, 25, 26, 26, 25, 25, 22, 20, 19, 17, 17, 15, 15, 14, 13,
+ 13, 12, 23, 25, 25, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11,
+ 23, 24, 24, 24, 24, 21, 19, 18, 16, 16, 14, 14, 13, 13, 12, 11, 21, 23,
+ 23, 22, 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 21, 23, 23, 22,
+ 22, 20, 18, 17, 15, 15, 13, 13, 12, 12, 11, 11, 19, 21, 21, 21, 21, 19,
+ 17, 17, 14, 14, 13, 13, 12, 11, 10, 10, 19, 20, 21, 20, 20, 19, 17, 16,
+ 14, 14, 12, 12, 11, 11, 10, 10, 18, 19, 20, 20, 20, 18, 17, 16, 14, 14,
+ 12, 12, 11, 11, 10, 9, 18, 19, 19, 19, 19, 18, 16, 15, 14, 13, 12, 12,
+ 11, 10, 10, 9, 17, 18, 18, 18, 18, 17, 16, 15, 13, 13, 12, 12, 10, 10,
+ 9, 9, 16, 17, 17, 17, 18, 16, 15, 14, 13, 13, 11, 11, 10, 10, 9, 9, 15,
+ 17, 17, 17, 17, 16, 15, 14, 13, 12, 11, 11, 10, 10, 9, 9, 14, 16, 16,
+ 16, 16, 15, 14, 13, 12, 12, 11, 11, 9, 9, 9, 8, 14, 16, 16, 16, 16, 15,
+ 14, 13, 12, 12, 10, 10, 9, 9, 9, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11,
+ 11, 10, 10, 9, 9, 8, 8, 13, 14, 14, 14, 15, 14, 13, 12, 11, 11, 10, 10,
+ 9, 9, 8, 8,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 21,
+ 20, 19, 19, 18, 17, 17, 16, 16, 14, 14, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21, 21, 20, 19, 18,
+ 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28,
+ 28, 28, 27, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 17, 17, 16, 16,
+ 14, 14, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 17, 16, 16, 15, 15, 30, 30,
+ 30, 31, 31, 30, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 28, 29, 29, 29, 30, 29,
+ 28, 28, 27, 26, 24, 24, 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17,
+ 16, 16, 15, 15, 14, 14, 13, 13, 27, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ 23, 23, 21, 21, 20, 20, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 12, 12, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19,
+ 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11,
+ 23, 24, 24, 24, 25, 24, 24, 24, 24, 23, 21, 21, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 19, 20, 20, 20,
+ 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 19, 20, 20, 20, 21, 21, 20, 21,
+ 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 17, 17,
+ 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ 9, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 16, 16, 15, 15, 14, 14, 13,
+ 13, 13, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15,
+ 16, 16, 16, 16, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10,
+ 10, 10, 10, 9, 9, 9, 9, 9, 8, 8, 13, 14, 14, 14, 14, 14, 14, 15, 15, 15,
+ 14, 14, 13, 13, 13, 12, 12, 11, 11, 11, 11, 10, 10, 9, 9, 9, 9, 9, 8, 8,
+ 8, 8,
+ /* Size 4x16 */
+ 33, 30, 23, 16, 32, 30, 24, 17, 32, 31, 25, 17, 32, 29, 24, 17, 32, 28,
+ 24, 18, 30, 26, 21, 16, 30, 24, 19, 15, 28, 23, 18, 14, 26, 22, 17, 13,
+ 24, 21, 16, 13, 23, 20, 15, 12, 20, 19, 14, 11, 19, 18, 13, 10, 17, 16,
+ 13, 10, 16, 15, 12, 9, 14, 14, 11, 9,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 30, 30, 28, 26, 24, 23, 20, 19, 17, 16, 14, 30, 30,
+ 31, 29, 28, 26, 24, 23, 22, 21, 20, 19, 18, 16, 15, 14, 23, 24, 25, 24,
+ 24, 21, 19, 18, 17, 16, 15, 14, 13, 13, 12, 11, 16, 17, 17, 17, 18, 16,
+ 15, 14, 13, 13, 12, 11, 10, 10, 9, 9,
+ /* Size 8x32 */
+ 32, 33, 32, 28, 23, 19, 17, 14, 33, 32, 32, 29, 24, 20, 17, 15, 33, 32,
+ 32, 29, 24, 20, 17, 15, 33, 32, 32, 29, 25, 20, 18, 15, 33, 32, 31, 30,
+ 25, 21, 18, 16, 33, 32, 31, 29, 25, 21, 18, 16, 32, 32, 30, 28, 24, 20,
+ 18, 16, 32, 32, 30, 28, 24, 21, 18, 16, 32, 31, 29, 27, 24, 21, 18, 16,
+ 32, 31, 29, 26, 24, 20, 18, 16, 30, 30, 28, 24, 21, 19, 17, 15, 30, 30,
+ 28, 24, 21, 19, 17, 15, 29, 30, 27, 22, 20, 17, 16, 14, 28, 30, 27, 21,
+ 19, 17, 16, 14, 27, 28, 26, 21, 18, 16, 15, 13, 26, 28, 26, 20, 18, 16,
+ 14, 13, 25, 26, 25, 20, 17, 15, 14, 13, 23, 25, 24, 19, 16, 14, 13, 12,
+ 23, 24, 24, 19, 16, 14, 13, 12, 21, 23, 22, 18, 15, 13, 12, 11, 21, 23,
+ 22, 18, 15, 13, 12, 11, 19, 21, 21, 17, 14, 13, 12, 10, 19, 21, 20, 17,
+ 14, 12, 11, 10, 18, 20, 20, 17, 14, 12, 11, 10, 18, 19, 19, 16, 14, 12,
+ 11, 10, 17, 18, 18, 16, 13, 12, 10, 9, 16, 17, 18, 15, 13, 11, 10, 9,
+ 15, 17, 17, 15, 13, 11, 10, 9, 14, 16, 16, 14, 12, 11, 9, 9, 14, 16, 16,
+ 14, 12, 10, 9, 9, 13, 14, 15, 13, 11, 10, 9, 8, 13, 14, 15, 13, 11, 10,
+ 9, 8,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 27, 26, 25, 23,
+ 23, 21, 21, 19, 19, 18, 18, 17, 16, 15, 14, 14, 13, 13, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 24, 23, 23, 21,
+ 21, 20, 19, 18, 17, 17, 16, 16, 14, 14, 32, 32, 32, 32, 31, 31, 30, 30,
+ 29, 29, 28, 28, 27, 27, 26, 26, 25, 24, 24, 22, 22, 21, 20, 20, 19, 18,
+ 18, 17, 16, 16, 15, 15, 28, 29, 29, 29, 30, 29, 28, 28, 27, 26, 24, 24,
+ 22, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14,
+ 13, 13, 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 21, 21, 20, 19, 18, 18,
+ 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 11, 11, 19, 20,
+ 20, 20, 21, 21, 20, 21, 21, 20, 19, 19, 17, 17, 16, 16, 15, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 17, 18, 18, 18,
+ 18, 18, 18, 18, 17, 17, 16, 16, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 9, 9, 9, 9, 14, 15, 15, 15, 16, 16, 16, 16, 16, 16, 15,
+ 15, 14, 14, 13, 13, 13, 12, 12, 11, 11, 10, 10, 10, 10, 9, 9, 9, 9, 9,
+ 8, 8 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 24, 22, 19, 24, 21, 20, 19, 22, 20, 17, 15, 19, 19, 15, 13,
+ /* Size 8x8 */
+ 33, 32, 27, 21, 22, 20, 19, 18, 32, 29, 24, 22, 23, 22, 20, 19, 27, 24,
+ 22, 21, 23, 22, 21, 20, 21, 22, 21, 19, 19, 19, 18, 18, 22, 23, 23, 19,
+ 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 15, 14, 19, 20, 21, 18, 16, 15,
+ 14, 13, 18, 19, 20, 18, 16, 14, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 28, 25, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 33, 30, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 18, 17, 34, 33, 32, 29,
+ 26, 24, 23, 22, 23, 23, 22, 22, 21, 20, 19, 18, 31, 30, 29, 26, 24, 23,
+ 22, 22, 22, 23, 22, 22, 21, 20, 19, 18, 28, 27, 26, 24, 22, 22, 22, 22,
+ 22, 23, 22, 22, 21, 20, 20, 19, 25, 24, 24, 23, 22, 21, 20, 20, 21, 21,
+ 20, 20, 20, 19, 19, 18, 22, 22, 23, 22, 22, 20, 20, 20, 20, 20, 19, 19,
+ 19, 18, 18, 17, 21, 22, 22, 22, 22, 20, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16,
+ 21, 22, 23, 23, 23, 21, 20, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 21,
+ 22, 22, 22, 20, 19, 19, 18, 17, 17, 16, 16, 15, 15, 14, 20, 20, 22, 22,
+ 22, 20, 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 19, 20, 21, 21, 21, 20,
+ 19, 18, 17, 16, 16, 15, 14, 14, 14, 13, 18, 19, 20, 20, 20, 19, 18, 18,
+ 17, 16, 15, 15, 14, 13, 13, 12, 17, 18, 19, 19, 20, 19, 18, 17, 16, 16,
+ 15, 14, 14, 13, 12, 12, 16, 17, 18, 18, 19, 18, 17, 17, 16, 15, 14, 14,
+ 13, 12, 12, 12,
+ /* Size 32x32 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 25, 25, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16, 33, 33, 33, 33,
+ 33, 32, 30, 29, 27, 27, 24, 24, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 33, 33, 33, 33, 33, 31, 30, 29,
+ 27, 26, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 34, 33, 33, 33, 33, 31, 29, 28, 26, 26, 24, 24,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 18, 18, 34, 33, 33, 33, 32, 31, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23,
+ 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 32, 32,
+ 31, 31, 31, 29, 28, 27, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 18, 18, 31, 30, 30, 29, 29, 28,
+ 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 18, 18, 30, 29, 29, 28, 28, 27, 26, 25, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 19, 19, 19, 19, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 21,
+ 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 28, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 25, 24, 24, 24,
+ 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 25, 24, 24, 24, 24, 24, 23, 23,
+ 22, 22, 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 18, 18, 22, 22, 22, 22, 23, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18,
+ 17, 17, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 21, 22, 22, 22, 23, 22,
+ 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 16, 16, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17,
+ 16, 16, 16, 16, 21, 22, 22, 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 20, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 15, 15, 15, 15, 14, 14, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14,
+ 14, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 19, 20,
+ 20, 20, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 13, 13, 19, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21,
+ 20, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14,
+ 13, 13, 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13, 13, 13, 13, 12, 12,
+ 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18, 18, 19,
+ 19, 19, 19, 19, 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 19, 19, 19, 19, 19,
+ 20, 20, 19, 19, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13,
+ 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12,
+ 12, 12, 16, 17, 17, 18, 18, 18, 18, 19, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 4x8 */
+ 33, 24, 22, 19, 31, 23, 23, 20, 26, 22, 22, 20, 22, 20, 19, 18, 23, 21,
+ 17, 16, 21, 20, 17, 15, 20, 20, 16, 14, 19, 19, 16, 13,
+ /* Size 8x4 */
+ 33, 31, 26, 22, 23, 21, 20, 19, 24, 23, 22, 20, 21, 20, 20, 19, 22, 23,
+ 22, 19, 17, 17, 16, 16, 19, 20, 20, 18, 16, 15, 14, 13,
+ /* Size 8x16 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 34, 32,
+ 26, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22, 20, 19, 28, 26, 22, 22,
+ 23, 22, 21, 20, 24, 24, 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 19, 19, 18, 18, 17, 21, 23, 22, 19, 18, 17, 17, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 21,
+ 22, 19, 17, 16, 15, 14, 19, 20, 21, 19, 17, 15, 14, 13, 18, 20, 20, 18,
+ 16, 15, 14, 13, 17, 19, 20, 18, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14,
+ 13, 12,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 28, 24, 22, 21, 21, 21, 20, 20, 19, 18, 17, 16, 33, 33,
+ 32, 28, 26, 24, 22, 22, 23, 23, 22, 21, 20, 20, 19, 18, 28, 27, 26, 24,
+ 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 20, 19, 21, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 23, 22, 23, 21, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 15, 20, 20, 21, 22, 22, 20, 19, 18, 17, 17,
+ 16, 16, 15, 15, 14, 14, 18, 19, 20, 20, 21, 19, 19, 18, 17, 16, 16, 15,
+ 14, 14, 13, 13, 17, 18, 19, 19, 20, 18, 18, 17, 16, 16, 15, 14, 13, 13,
+ 12, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 29, 28, 24, 21, 21, 21, 21, 20, 20, 18, 18, 17, 16, 33, 33,
+ 33, 28, 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 33, 33, 33, 28,
+ 27, 24, 22, 22, 22, 22, 20, 20, 19, 19, 18, 17, 34, 32, 32, 28, 26, 24,
+ 22, 22, 22, 22, 21, 21, 20, 20, 18, 18, 34, 32, 32, 28, 26, 24, 22, 22,
+ 23, 23, 21, 21, 20, 20, 19, 18, 32, 31, 30, 26, 25, 23, 22, 22, 23, 23,
+ 21, 21, 20, 20, 19, 18, 31, 29, 28, 26, 24, 23, 22, 22, 22, 22, 22, 22,
+ 20, 20, 19, 18, 30, 28, 28, 24, 23, 23, 22, 22, 23, 22, 22, 22, 20, 20,
+ 19, 19, 28, 26, 26, 23, 22, 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 19,
+ 28, 26, 26, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 24, 24,
+ 24, 22, 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 24, 24, 24, 22,
+ 22, 21, 20, 20, 21, 21, 20, 20, 19, 19, 18, 18, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 18, 18, 17, 21, 22, 22, 22, 21, 20, 19, 19,
+ 19, 19, 19, 19, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 18, 18, 18, 18, 17, 17, 21, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18,
+ 17, 17, 17, 16, 21, 22, 23, 22, 22, 21, 19, 19, 18, 18, 17, 17, 17, 17,
+ 16, 16, 21, 23, 23, 23, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15,
+ 21, 22, 23, 22, 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 20, 22,
+ 22, 22, 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 22, 22, 22,
+ 22, 20, 19, 19, 17, 17, 16, 16, 16, 15, 15, 14, 20, 21, 21, 22, 22, 20,
+ 19, 18, 17, 17, 16, 16, 15, 15, 14, 14, 20, 21, 21, 22, 22, 20, 19, 18,
+ 17, 17, 16, 16, 15, 14, 14, 14, 19, 20, 21, 21, 21, 20, 19, 18, 17, 17,
+ 15, 15, 14, 14, 14, 13, 19, 20, 20, 21, 21, 20, 19, 18, 17, 16, 15, 15,
+ 14, 14, 13, 13, 19, 20, 20, 20, 21, 20, 18, 18, 16, 16, 15, 15, 14, 14,
+ 13, 13, 18, 20, 20, 20, 20, 19, 18, 18, 16, 16, 15, 15, 14, 13, 13, 12,
+ 18, 19, 19, 20, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 13, 12, 17, 19,
+ 19, 19, 20, 19, 18, 17, 16, 16, 14, 14, 13, 13, 12, 12, 17, 19, 19, 19,
+ 19, 19, 17, 17, 16, 16, 14, 14, 13, 13, 12, 12, 16, 18, 18, 18, 19, 18,
+ 17, 17, 15, 15, 14, 14, 13, 12, 12, 12, 16, 18, 18, 18, 19, 18, 17, 17,
+ 15, 15, 14, 14, 13, 12, 12, 12,
+ /* Size 32x16 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 31, 29, 28, 26, 26, 24, 24, 22, 22, 22, 22, 22, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 33, 33, 33, 32, 32, 30, 28, 28,
+ 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20,
+ 20, 19, 19, 19, 18, 18, 29, 28, 28, 28, 28, 26, 26, 24, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19,
+ 18, 18, 28, 27, 27, 26, 26, 25, 24, 23, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 24, 24,
+ 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 21, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19,
+ 19, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 21, 21, 19, 19, 19, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 15, 15, 20, 20, 20, 21,
+ 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 20, 20, 20, 21, 21, 21, 22, 22,
+ 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20, 20, 20, 21, 21, 19, 19,
+ 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13,
+ 13, 13, 18, 19, 19, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 17, 18,
+ 18, 18, 19, 19, 19, 19, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 16, 17, 17, 18, 18, 18,
+ 18, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 14, 13,
+ 13, 13, 12, 12, 12, 12, 12, 12,
+ /* Size 4x16 */
+ 33, 24, 21, 18, 33, 24, 22, 19, 32, 24, 23, 20, 29, 23, 22, 20, 26, 22,
+ 22, 20, 24, 21, 21, 19, 22, 20, 19, 18, 22, 20, 19, 18, 22, 21, 18, 17,
+ 22, 21, 17, 16, 22, 20, 17, 15, 21, 20, 17, 14, 20, 20, 16, 14, 20, 19,
+ 16, 13, 19, 19, 16, 13, 18, 18, 15, 12,
+ /* Size 16x4 */
+ 33, 33, 32, 29, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 18, 24, 24,
+ 24, 23, 22, 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 18, 21, 22, 23, 22,
+ 22, 21, 19, 19, 18, 17, 17, 17, 16, 16, 16, 15, 18, 19, 20, 20, 20, 19,
+ 18, 18, 17, 16, 15, 14, 14, 13, 13, 12,
+ /* Size 8x32 */
+ 32, 33, 28, 21, 21, 20, 18, 17, 33, 33, 27, 22, 22, 20, 19, 18, 33, 33,
+ 27, 22, 22, 20, 19, 18, 34, 32, 26, 22, 22, 21, 20, 18, 34, 32, 26, 22,
+ 23, 21, 20, 19, 32, 30, 25, 22, 23, 21, 20, 19, 31, 28, 24, 22, 22, 22,
+ 20, 19, 30, 28, 23, 22, 23, 22, 20, 19, 28, 26, 22, 22, 23, 22, 21, 20,
+ 28, 26, 22, 21, 22, 22, 21, 19, 24, 24, 22, 20, 21, 20, 19, 18, 24, 24,
+ 22, 20, 21, 20, 19, 18, 22, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 19,
+ 19, 19, 18, 17, 21, 22, 22, 19, 19, 18, 18, 17, 21, 22, 22, 19, 18, 18,
+ 17, 17, 21, 23, 22, 19, 18, 17, 17, 16, 21, 23, 22, 19, 18, 17, 16, 16,
+ 21, 23, 22, 19, 18, 17, 16, 16, 20, 22, 22, 19, 17, 16, 16, 15, 20, 22,
+ 22, 19, 17, 16, 16, 15, 20, 21, 22, 19, 17, 16, 15, 14, 20, 21, 22, 19,
+ 17, 16, 15, 14, 19, 21, 21, 19, 17, 15, 14, 14, 19, 20, 21, 19, 17, 15,
+ 14, 13, 19, 20, 21, 18, 16, 15, 14, 13, 18, 20, 20, 18, 16, 15, 14, 13,
+ 18, 19, 20, 18, 16, 14, 13, 13, 17, 19, 20, 18, 16, 14, 13, 12, 17, 19,
+ 19, 17, 16, 14, 13, 12, 16, 18, 19, 17, 15, 14, 13, 12, 16, 18, 19, 17,
+ 15, 14, 13, 12,
+ /* Size 32x8 */
+ 32, 33, 33, 34, 34, 32, 31, 30, 28, 28, 24, 24, 22, 21, 21, 21, 21, 21,
+ 21, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16, 33, 33, 33, 32,
+ 32, 30, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 23, 23, 22, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 28, 27, 27, 26, 26, 25, 24, 23,
+ 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 22, 23, 23, 22, 21, 21, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 20, 20,
+ 20, 21, 21, 21, 22, 22, 22, 22, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 18, 19, 19, 20, 20, 20,
+ 20, 20, 21, 21, 19, 19, 19, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 14,
+ 14, 14, 14, 13, 13, 13, 13, 13, 17, 18, 18, 18, 19, 19, 19, 19, 20, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13,
+ 12, 12, 12, 12 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 31, 24, 19, 31, 27, 22, 18, 24, 22, 16, 14, 19, 18, 14, 11,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 27, 22, 20, 16, 32, 32, 32, 30, 28, 23, 21, 17, 32, 32,
+ 29, 28, 26, 23, 21, 18, 30, 30, 28, 24, 22, 20, 18, 16, 27, 28, 26, 22,
+ 19, 17, 16, 14, 22, 23, 23, 20, 17, 15, 14, 12, 20, 21, 21, 18, 16, 14,
+ 12, 11, 16, 17, 18, 16, 14, 12, 11, 10,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 32, 30, 29, 27, 26, 24, 22, 20, 19, 18, 17, 33, 32, 32, 32,
+ 32, 32, 31, 30, 28, 27, 25, 23, 21, 19, 18, 17, 33, 32, 32, 31, 31, 31,
+ 29, 28, 27, 26, 24, 23, 21, 19, 18, 17, 32, 32, 32, 31, 30, 30, 28, 28,
+ 26, 26, 24, 23, 21, 19, 19, 17, 32, 32, 32, 31, 30, 29, 28, 27, 26, 25,
+ 24, 22, 21, 20, 19, 18, 30, 30, 31, 29, 28, 28, 26, 24, 23, 22, 22, 20,
+ 19, 18, 17, 16, 28, 29, 30, 28, 28, 27, 24, 21, 20, 20, 19, 18, 17, 16,
+ 16, 15, 27, 27, 28, 27, 26, 26, 23, 20, 20, 19, 18, 17, 16, 15, 15, 14,
+ 25, 26, 27, 26, 26, 25, 22, 20, 19, 18, 17, 16, 15, 15, 14, 14, 23, 24,
+ 25, 24, 24, 24, 22, 19, 18, 17, 16, 15, 14, 14, 13, 13, 21, 22, 23, 23,
+ 23, 22, 20, 18, 17, 16, 15, 14, 13, 13, 12, 12, 19, 20, 21, 21, 21, 21,
+ 19, 17, 16, 15, 14, 13, 12, 12, 12, 11, 18, 19, 19, 19, 19, 20, 18, 16,
+ 15, 15, 14, 13, 12, 11, 11, 11, 17, 18, 18, 18, 19, 19, 17, 16, 15, 14,
+ 13, 12, 12, 11, 11, 10, 16, 17, 17, 17, 17, 18, 16, 15, 14, 14, 13, 12,
+ 11, 11, 10, 10,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 17, 17, 16, 16, 15, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 22,
+ 22, 21, 20, 20, 18, 18, 17, 16, 16, 15, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 29, 27, 27, 26, 24, 24, 23, 22, 21, 20, 20,
+ 19, 18, 18, 17, 17, 15, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 29, 29, 28, 27, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 28, 28, 27, 25, 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 28, 27, 25,
+ 25, 23, 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22,
+ 21, 21, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 29, 29, 28, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19,
+ 18, 17, 17, 16, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 16,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26,
+ 25, 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23,
+ 22, 22, 21, 21, 20, 19, 19, 18, 18, 17, 31, 31, 31, 31, 31, 31, 30, 29,
+ 29, 28, 28, 27, 26, 26, 24, 24, 24, 23, 23, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 30, 30, 30, 30, 31, 31, 29, 29, 28, 28, 28, 26,
+ 26, 25, 24, 24, 23, 23, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24, 23, 23,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 28, 29,
+ 29, 29, 30, 30, 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19,
+ 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 14, 27, 27, 27, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 23, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15,
+ 15, 14, 14, 13, 26, 27, 27, 27, 28, 28, 26, 26, 26, 26, 26, 23, 23, 22,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 25, 26, 26, 26, 27, 27, 26, 26, 26, 25, 25, 23, 22, 21, 20, 20, 19, 19,
+ 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 23, 24, 24, 24,
+ 25, 25, 24, 24, 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 24, 25, 25, 24, 24,
+ 24, 24, 24, 22, 22, 20, 19, 19, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14,
+ 14, 14, 13, 13, 13, 12, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 21,
+ 20, 20, 18, 18, 17, 17, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 13, 12,
+ 12, 12, 21, 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 20, 20, 19, 18, 18,
+ 17, 17, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 12, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 16, 15,
+ 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21,
+ 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12,
+ 12, 11, 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10,
+ 17, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15,
+ 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 17, 17, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 17, 17, 17, 16, 16, 15, 15, 14, 13, 13, 13,
+ 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 16, 16, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 17, 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11,
+ 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17, 17, 17, 17, 18, 18, 17,
+ 16, 16, 15, 15, 14, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 10, 10, 10,
+ 10, 9, 15, 15, 15, 16, 16, 16, 16, 16, 16, 17, 17, 16, 15, 15, 14, 14,
+ 13, 13, 13, 12, 12, 12, 12, 11, 11, 11, 10, 10, 10, 9, 9, 9,
+ /* Size 4x8 */
+ 32, 32, 24, 18, 32, 31, 25, 19, 32, 29, 24, 20, 30, 28, 20, 17, 27, 26,
+ 18, 15, 23, 23, 16, 13, 20, 20, 14, 12, 17, 18, 13, 11,
+ /* Size 8x4 */
+ 32, 32, 32, 30, 27, 23, 20, 17, 32, 31, 29, 28, 26, 23, 20, 18, 24, 25,
+ 24, 20, 18, 16, 14, 13, 18, 19, 20, 17, 15, 13, 12, 11,
+ /* Size 8x16 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 17, 33, 32,
+ 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24, 21, 17, 32, 32, 30, 28,
+ 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 30, 30, 28, 25, 23, 21,
+ 19, 16, 28, 30, 27, 22, 20, 19, 17, 15, 27, 28, 26, 22, 20, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 21, 23,
+ 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17,
+ 15, 14, 12, 11, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17, 18, 16, 14, 13,
+ 11, 10,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 32, 32, 30, 28, 27, 25, 23, 21, 19, 18, 17, 16, 33, 32,
+ 32, 32, 32, 31, 30, 30, 28, 26, 25, 23, 21, 19, 18, 17, 32, 32, 31, 30,
+ 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 29, 29, 30, 29, 28, 28,
+ 25, 22, 22, 21, 20, 19, 18, 17, 16, 16, 26, 27, 28, 27, 26, 26, 23, 20,
+ 20, 19, 18, 17, 16, 15, 15, 14, 23, 24, 25, 24, 24, 24, 21, 19, 18, 17,
+ 16, 15, 14, 14, 13, 13, 19, 20, 21, 21, 21, 21, 19, 17, 16, 15, 14, 13,
+ 12, 12, 12, 11, 16, 17, 17, 17, 18, 18, 16, 15, 14, 13, 13, 12, 11, 11,
+ 10, 10,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 26, 23, 23, 20, 19, 18, 16, 16, 33, 32,
+ 32, 32, 32, 32, 29, 29, 27, 24, 24, 21, 20, 18, 16, 16, 33, 32, 32, 32,
+ 32, 32, 29, 29, 27, 24, 24, 21, 20, 19, 17, 17, 33, 32, 32, 32, 32, 32,
+ 30, 29, 28, 25, 25, 21, 20, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30,
+ 28, 25, 25, 22, 21, 19, 17, 17, 33, 32, 32, 32, 31, 31, 30, 30, 28, 25,
+ 25, 22, 21, 19, 17, 17, 33, 32, 32, 31, 30, 30, 29, 28, 27, 24, 24, 21,
+ 21, 19, 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 21, 20, 19,
+ 17, 17, 32, 32, 32, 31, 30, 30, 28, 28, 26, 24, 24, 21, 21, 19, 18, 18,
+ 32, 32, 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 32, 32,
+ 31, 30, 29, 29, 28, 27, 26, 24, 24, 21, 21, 20, 18, 18, 31, 31, 31, 29,
+ 28, 28, 26, 25, 24, 22, 22, 20, 19, 18, 17, 17, 30, 30, 30, 29, 28, 28,
+ 25, 24, 23, 21, 21, 19, 19, 18, 16, 16, 30, 30, 30, 29, 28, 28, 24, 23,
+ 22, 20, 20, 19, 18, 17, 16, 16, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19,
+ 19, 18, 17, 16, 15, 15, 28, 29, 30, 28, 27, 27, 22, 21, 20, 19, 19, 18,
+ 17, 16, 15, 15, 27, 28, 28, 27, 26, 26, 22, 20, 20, 18, 18, 17, 16, 15,
+ 14, 14, 26, 27, 28, 26, 26, 26, 21, 20, 19, 18, 18, 16, 16, 15, 14, 14,
+ 25, 26, 26, 26, 25, 25, 21, 20, 19, 17, 17, 16, 15, 15, 13, 13, 23, 25,
+ 25, 24, 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 23, 25, 25, 24,
+ 24, 24, 20, 19, 18, 16, 16, 15, 14, 14, 13, 13, 22, 23, 23, 23, 23, 23,
+ 19, 18, 17, 16, 16, 14, 14, 13, 12, 12, 21, 23, 23, 23, 22, 22, 19, 18,
+ 17, 15, 15, 14, 13, 13, 12, 12, 20, 22, 22, 22, 22, 22, 19, 18, 17, 15,
+ 15, 13, 13, 12, 12, 12, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13,
+ 12, 12, 11, 11, 19, 20, 21, 20, 20, 20, 18, 17, 16, 14, 14, 13, 12, 12,
+ 11, 11, 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 11, 11,
+ 18, 19, 19, 19, 19, 19, 17, 16, 15, 14, 14, 12, 12, 11, 10, 10, 17, 18,
+ 18, 18, 18, 18, 16, 16, 15, 13, 13, 12, 12, 11, 10, 10, 16, 17, 17, 17,
+ 18, 18, 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 16, 17, 17, 17, 18, 18,
+ 16, 15, 14, 13, 13, 12, 11, 11, 10, 10, 15, 16, 16, 16, 17, 17, 15, 14,
+ 13, 12, 12, 11, 11, 10, 9, 9,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 27, 26, 25, 25, 23,
+ 23, 22, 20, 20, 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23, 23, 22, 21, 21,
+ 19, 19, 18, 17, 17, 16, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 27, 26, 26, 24, 24, 23, 23, 22, 20, 20, 19, 19, 18, 17,
+ 17, 16, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27,
+ 26, 26, 25, 24, 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 32, 32,
+ 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24,
+ 24, 23, 22, 22, 20, 20, 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30,
+ 29, 28, 28, 28, 28, 26, 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 16, 16, 16, 15, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 25, 24, 23, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 16, 16,
+ 16, 15, 15, 14, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22,
+ 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 23, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18,
+ 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 23, 24, 24, 25,
+ 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 20, 21, 21, 21, 22, 22, 21, 21,
+ 21, 21, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12, 12, 11, 19, 20, 20, 20, 21, 21, 21, 20, 21, 21, 21, 19,
+ 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13, 12, 12, 12, 12, 12, 11,
+ 11, 11, 18, 18, 19, 19, 19, 19, 19, 19, 19, 20, 20, 18, 18, 17, 16, 16,
+ 15, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 11, 11, 11, 10, 16, 16,
+ 17, 17, 17, 17, 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 11, 11, 11, 10, 10, 10, 10, 9, 16, 16, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12,
+ 11, 11, 11, 10, 10, 10, 10, 9,
+ /* Size 4x16 */
+ 33, 32, 23, 18, 32, 32, 24, 19, 32, 31, 25, 19, 32, 30, 24, 19, 32, 30,
+ 24, 19, 32, 29, 24, 20, 30, 28, 21, 18, 29, 27, 19, 16, 28, 26, 18, 15,
+ 26, 25, 17, 15, 25, 24, 16, 14, 23, 22, 15, 13, 20, 20, 14, 12, 19, 19,
+ 14, 11, 18, 18, 13, 11, 17, 18, 13, 11,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 30, 29, 28, 26, 25, 23, 20, 19, 18, 17, 32, 32,
+ 31, 30, 30, 29, 28, 27, 26, 25, 24, 22, 20, 19, 18, 18, 23, 24, 25, 24,
+ 24, 24, 21, 19, 18, 17, 16, 15, 14, 14, 13, 13, 18, 19, 19, 19, 19, 20,
+ 18, 16, 15, 15, 14, 13, 12, 11, 11, 11,
+ /* Size 8x32 */
+ 32, 33, 32, 29, 26, 23, 19, 16, 33, 32, 32, 29, 27, 24, 20, 16, 33, 32,
+ 32, 29, 27, 24, 20, 17, 33, 32, 32, 30, 28, 25, 20, 17, 33, 32, 31, 30,
+ 28, 25, 21, 17, 33, 32, 31, 30, 28, 25, 21, 17, 33, 32, 30, 29, 27, 24,
+ 21, 17, 32, 32, 30, 28, 27, 24, 20, 17, 32, 32, 30, 28, 26, 24, 21, 18,
+ 32, 31, 29, 28, 26, 24, 21, 18, 32, 31, 29, 28, 26, 24, 21, 18, 31, 31,
+ 28, 26, 24, 22, 19, 17, 30, 30, 28, 25, 23, 21, 19, 16, 30, 30, 28, 24,
+ 22, 20, 18, 16, 28, 30, 27, 22, 20, 19, 17, 15, 28, 30, 27, 22, 20, 19,
+ 17, 15, 27, 28, 26, 22, 20, 18, 16, 14, 26, 28, 26, 21, 19, 18, 16, 14,
+ 25, 26, 25, 21, 19, 17, 15, 13, 23, 25, 24, 20, 18, 16, 14, 13, 23, 25,
+ 24, 20, 18, 16, 14, 13, 22, 23, 23, 19, 17, 16, 14, 12, 21, 23, 22, 19,
+ 17, 15, 13, 12, 20, 22, 22, 19, 17, 15, 13, 12, 19, 21, 20, 18, 16, 14,
+ 12, 11, 19, 21, 20, 18, 16, 14, 12, 11, 18, 19, 19, 17, 15, 14, 12, 11,
+ 18, 19, 19, 17, 15, 14, 12, 10, 17, 18, 18, 16, 15, 13, 12, 10, 16, 17,
+ 18, 16, 14, 13, 11, 10, 16, 17, 18, 16, 14, 13, 11, 10, 15, 16, 17, 15,
+ 13, 12, 11, 9,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 26,
+ 25, 23, 23, 22, 21, 20, 19, 19, 18, 18, 17, 16, 16, 15, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 26, 25, 25, 23,
+ 23, 22, 21, 21, 19, 19, 18, 17, 17, 16, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 26, 26, 25, 24, 24, 23, 22, 22, 20, 20,
+ 19, 19, 18, 18, 18, 17, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 18, 18, 17, 17, 16, 16,
+ 16, 15, 26, 27, 27, 28, 28, 28, 27, 27, 26, 26, 26, 24, 23, 22, 20, 20,
+ 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 14, 14, 13, 23, 24,
+ 24, 25, 25, 25, 24, 24, 24, 24, 24, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 13, 13, 13, 12, 19, 20, 20, 20, 21, 21,
+ 21, 20, 21, 21, 21, 19, 19, 18, 17, 17, 16, 16, 15, 14, 14, 14, 13, 13,
+ 12, 12, 12, 12, 12, 11, 11, 11, 16, 16, 17, 17, 17, 17, 17, 17, 18, 18,
+ 18, 17, 16, 16, 15, 15, 14, 14, 13, 13, 13, 12, 12, 12, 11, 11, 11, 10,
+ 10, 10, 10, 9 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 25, 22, 20, 25, 21, 21, 20, 22, 21, 18, 17, 20, 20, 17, 14,
+ /* Size 8x8 */
+ 33, 33, 27, 23, 22, 21, 20, 19, 33, 32, 26, 23, 23, 22, 22, 20, 27, 26,
+ 22, 22, 22, 22, 22, 20, 23, 23, 22, 20, 20, 20, 20, 19, 22, 23, 22, 20,
+ 19, 18, 18, 17, 21, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20, 18, 16,
+ 16, 15, 19, 20, 20, 19, 17, 16, 15, 13,
+ /* Size 16x16 */
+ 32, 33, 34, 31, 30, 28, 25, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 33, 30, 28, 27, 24, 22, 22, 22, 22, 21, 20, 20, 19, 19, 34, 33, 32, 30,
+ 28, 26, 24, 22, 23, 23, 23, 22, 22, 21, 20, 20, 31, 30, 30, 28, 26, 24,
+ 23, 22, 22, 22, 23, 22, 22, 21, 20, 20, 30, 28, 28, 26, 24, 23, 22, 22,
+ 22, 22, 23, 22, 22, 21, 21, 20, 28, 27, 26, 24, 23, 22, 22, 21, 22, 22,
+ 23, 22, 22, 21, 21, 20, 25, 24, 24, 23, 22, 22, 21, 20, 20, 21, 21, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 21, 22, 23, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18, 18, 17,
+ 21, 22, 23, 22, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22,
+ 22, 22, 20, 19, 18, 18, 17, 17, 16, 16, 16, 15, 20, 20, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 16, 16, 15, 15, 15, 19, 20, 21, 21, 21, 21, 20, 19,
+ 18, 17, 17, 16, 15, 15, 14, 14, 19, 19, 20, 20, 21, 21, 20, 18, 18, 17,
+ 16, 16, 15, 14, 14, 14, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15,
+ 15, 14, 14, 13,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 22, 22, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 30,
+ 28, 27, 27, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 18, 33, 33, 33, 33, 33, 33, 30, 29, 28, 26, 26, 25,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 19, 19, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22,
+ 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 34, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 31, 31, 30, 30, 30, 30,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22,
+ 22, 22, 21, 21, 20, 20, 20, 19, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21,
+ 20, 20, 20, 19, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 26, 25, 25, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 21, 21, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 19, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 21,
+ 21, 21, 20, 20, 20, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 19,
+ 19, 19, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18,
+ 18, 17, 17, 17, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 21, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 16, 16, 16, 16, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16,
+ 16, 15, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15, 20, 20, 20, 21, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15,
+ 15, 15, 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19, 18, 18,
+ 17, 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 14, 19, 19, 19, 20,
+ 20, 20, 20, 20, 21, 21, 21, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 16, 15, 15, 15, 14, 14, 14, 14, 14, 13, 18, 19, 19, 19, 20, 20, 20, 20,
+ 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15,
+ 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 13,
+ 13, 13, 17, 18, 18, 19, 19, 19, 19, 19, 20, 20, 20, 19, 19, 18, 18, 18,
+ 17, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 14, 13, 13, 13, 13,
+ /* Size 4x8 */
+ 33, 27, 22, 20, 32, 26, 23, 21, 26, 22, 23, 21, 23, 22, 20, 19, 22, 22,
+ 18, 18, 22, 22, 17, 16, 21, 22, 17, 15, 19, 20, 16, 14,
+ /* Size 8x4 */
+ 33, 32, 26, 23, 22, 22, 21, 19, 27, 26, 22, 22, 22, 22, 22, 20, 22, 23,
+ 23, 20, 18, 17, 17, 16, 20, 21, 21, 19, 18, 16, 15, 14,
+ /* Size 8x16 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 34, 32,
+ 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23, 22, 20, 29, 28, 23, 22,
+ 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 24, 24, 22, 21, 20, 21,
+ 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 22, 20, 19, 19, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 20, 22,
+ 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19,
+ 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20, 20, 19, 17, 16,
+ 15, 13,
+ /* Size 16x8 */
+ 32, 33, 34, 31, 29, 28, 24, 21, 21, 21, 21, 20, 20, 19, 19, 18, 33, 33,
+ 32, 29, 28, 26, 24, 22, 22, 23, 23, 22, 21, 21, 20, 20, 28, 27, 26, 24,
+ 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 23, 23, 23, 22, 22, 22,
+ 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22, 23, 22, 22, 22, 20, 19,
+ 19, 19, 19, 18, 18, 18, 18, 17, 21, 22, 23, 23, 23, 23, 21, 19, 19, 18,
+ 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 20, 19, 18, 17, 17, 16,
+ 16, 15, 15, 15, 18, 19, 20, 20, 20, 20, 19, 18, 17, 17, 16, 15, 14, 14,
+ 14, 13,
+ /* Size 16x32 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 21, 21, 21, 20, 20, 19, 18, 18, 33, 33,
+ 33, 30, 27, 27, 23, 22, 22, 22, 22, 20, 20, 20, 19, 19, 33, 33, 33, 30,
+ 27, 27, 23, 22, 22, 22, 22, 21, 20, 20, 19, 19, 33, 33, 32, 30, 26, 26,
+ 23, 22, 22, 22, 22, 21, 21, 20, 19, 19, 34, 32, 32, 29, 26, 26, 23, 22,
+ 23, 23, 23, 22, 21, 21, 20, 20, 34, 32, 32, 29, 26, 26, 23, 22, 23, 23,
+ 23, 22, 21, 21, 20, 20, 31, 30, 29, 28, 24, 24, 22, 22, 22, 23, 23, 22,
+ 22, 21, 20, 20, 31, 29, 28, 27, 24, 24, 22, 22, 22, 22, 22, 22, 22, 21,
+ 20, 20, 29, 28, 28, 26, 23, 23, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20,
+ 28, 26, 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 28, 26,
+ 26, 24, 22, 22, 22, 22, 22, 23, 23, 22, 22, 21, 20, 20, 25, 24, 24, 23,
+ 22, 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 20, 21, 21, 20, 20, 20, 19, 19, 23, 23, 23, 23, 22, 22, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 19, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19,
+ 19, 19, 19, 19, 18, 18, 21, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19,
+ 19, 19, 18, 18, 21, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 21, 22, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 17, 21, 22,
+ 23, 23, 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 21, 22, 23, 23,
+ 22, 22, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 20, 22, 22, 22, 22, 22,
+ 20, 19, 18, 17, 17, 17, 16, 16, 16, 16, 20, 22, 22, 22, 22, 22, 20, 19,
+ 18, 17, 17, 16, 16, 16, 15, 15, 20, 21, 22, 22, 22, 22, 20, 19, 18, 17,
+ 17, 16, 16, 16, 15, 15, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16,
+ 16, 15, 14, 14, 20, 21, 21, 22, 22, 22, 19, 19, 18, 17, 17, 16, 16, 15,
+ 14, 14, 19, 20, 21, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14,
+ 19, 20, 20, 21, 21, 21, 19, 19, 18, 17, 17, 15, 15, 15, 14, 14, 19, 20,
+ 20, 20, 21, 21, 19, 18, 18, 16, 16, 15, 15, 14, 14, 14, 18, 19, 20, 20,
+ 20, 20, 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 18, 19, 20, 20, 20, 20,
+ 19, 18, 17, 16, 16, 15, 15, 14, 13, 13, 17, 19, 19, 19, 20, 20, 18, 18,
+ 17, 16, 16, 15, 14, 14, 13, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 33,
+ 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 33, 33, 33, 32, 32, 32, 29, 28,
+ 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 19, 31, 30, 30, 30, 29, 29, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 20, 20,
+ 20, 19, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17,
+ 21, 22, 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 21, 22, 22, 22,
+ 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 15, 15, 15, 15, 15, 15, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 15,
+ 15, 14, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19, 19, 19,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 18, 19,
+ 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16,
+ 16, 16, 15, 15, 14, 14, 14, 14, 14, 13, 13, 13, 18, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15,
+ 14, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 4x16 */
+ 33, 28, 21, 19, 33, 27, 22, 20, 32, 26, 23, 21, 30, 24, 23, 21, 28, 23,
+ 23, 21, 26, 22, 23, 21, 24, 22, 21, 20, 22, 21, 19, 19, 22, 22, 19, 18,
+ 22, 22, 18, 17, 22, 22, 18, 17, 22, 22, 17, 16, 21, 22, 17, 15, 20, 21,
+ 17, 15, 20, 21, 16, 14, 19, 20, 16, 14,
+ /* Size 16x4 */
+ 33, 33, 32, 30, 28, 26, 24, 22, 22, 22, 22, 22, 21, 20, 20, 19, 28, 27,
+ 26, 24, 23, 22, 22, 21, 22, 22, 22, 22, 22, 21, 21, 20, 21, 22, 23, 23,
+ 23, 23, 21, 19, 19, 18, 18, 17, 17, 17, 16, 16, 19, 20, 21, 21, 21, 21,
+ 20, 19, 18, 17, 17, 16, 15, 15, 14, 14,
+ /* Size 8x32 */
+ 32, 33, 28, 23, 21, 21, 20, 18, 33, 33, 27, 23, 22, 22, 20, 19, 33, 33,
+ 27, 23, 22, 22, 20, 19, 33, 32, 26, 23, 22, 22, 21, 19, 34, 32, 26, 23,
+ 23, 23, 21, 20, 34, 32, 26, 23, 23, 23, 21, 20, 31, 29, 24, 22, 22, 23,
+ 22, 20, 31, 28, 24, 22, 22, 22, 22, 20, 29, 28, 23, 22, 22, 23, 22, 20,
+ 28, 26, 22, 22, 22, 23, 22, 20, 28, 26, 22, 22, 22, 23, 22, 20, 25, 24,
+ 22, 21, 21, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 19, 23, 23, 22, 20,
+ 20, 20, 20, 19, 21, 22, 21, 20, 19, 19, 19, 18, 21, 22, 21, 20, 19, 19,
+ 19, 18, 21, 22, 22, 20, 19, 19, 18, 17, 21, 22, 22, 20, 19, 18, 18, 17,
+ 21, 23, 22, 20, 19, 18, 17, 17, 21, 23, 22, 20, 19, 18, 17, 16, 21, 23,
+ 22, 20, 19, 18, 17, 16, 20, 22, 22, 20, 18, 17, 16, 16, 20, 22, 22, 20,
+ 18, 17, 16, 15, 20, 22, 22, 20, 18, 17, 16, 15, 20, 21, 22, 19, 18, 17,
+ 16, 14, 20, 21, 22, 19, 18, 17, 16, 14, 19, 21, 21, 19, 18, 17, 15, 14,
+ 19, 20, 21, 19, 18, 17, 15, 14, 19, 20, 21, 19, 18, 16, 15, 14, 18, 20,
+ 20, 19, 17, 16, 15, 13, 18, 20, 20, 19, 17, 16, 15, 13, 17, 19, 20, 18,
+ 17, 16, 14, 13,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 18, 18, 17, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 22, 23, 23, 23, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 20, 20, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16,
+ 16, 16, 15, 15, 15, 15, 15, 14, 18, 19, 19, 19, 20, 20, 20, 20, 20, 20,
+ 20, 20, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14,
+ 14, 13, 13, 13 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 27, 20, 32, 29, 26, 21, 27, 26, 19, 16, 20, 21, 16, 13,
+ /* Size 8x8 */
+ 33, 32, 32, 30, 29, 25, 22, 19, 32, 32, 32, 31, 30, 26, 23, 20, 32, 32,
+ 30, 29, 28, 25, 23, 20, 30, 31, 29, 26, 24, 22, 20, 19, 29, 30, 28, 24,
+ 21, 19, 18, 17, 25, 26, 25, 22, 19, 17, 16, 15, 22, 23, 23, 20, 18, 16,
+ 14, 13, 19, 20, 20, 19, 17, 15, 13, 12,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 31, 30, 28, 28, 25, 23, 22, 20, 19, 33, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 27, 25, 23, 23, 21, 19, 33, 32, 32, 32, 31, 30, 30, 29,
+ 28, 27, 26, 24, 23, 22, 20, 19, 32, 32, 32, 31, 30, 29, 28, 28, 27, 26,
+ 26, 24, 23, 22, 21, 19, 32, 31, 31, 31, 30, 28, 28, 27, 26, 25, 24, 23,
+ 22, 21, 20, 19, 30, 30, 31, 30, 29, 28, 27, 26, 24, 23, 23, 22, 20, 20,
+ 19, 18, 28, 29, 30, 29, 28, 27, 26, 24, 21, 20, 20, 19, 18, 18, 17, 16,
+ 27, 28, 28, 28, 27, 26, 25, 23, 20, 20, 20, 18, 18, 17, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 23, 20, 20, 19, 18, 17, 17, 16, 15, 23, 24, 25, 25,
+ 24, 24, 23, 22, 19, 18, 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23,
+ 22, 20, 18, 18, 17, 16, 15, 14, 14, 13, 21, 22, 22, 23, 22, 22, 21, 20,
+ 18, 17, 17, 15, 14, 14, 13, 13, 19, 20, 20, 21, 20, 21, 20, 19, 17, 16,
+ 16, 14, 14, 13, 12, 12, 17, 18, 19, 19, 19, 19, 19, 18, 16, 15, 15, 14,
+ 13, 13, 12, 11,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 17, 17, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 26, 26, 25,
+ 24, 24, 22, 22, 21, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22,
+ 22, 20, 20, 19, 18, 18, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 25, 24, 24, 23, 22, 22, 20, 20, 20,
+ 18, 18, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30,
+ 30, 30, 28, 28, 28, 26, 25, 25, 23, 23, 22, 21, 20, 20, 19, 19, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 28, 28,
+ 28, 26, 25, 25, 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25,
+ 23, 23, 23, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21,
+ 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 20, 20, 19, 19,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 24, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24,
+ 24, 24, 23, 22, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 21, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20,
+ 19, 19, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 30, 30,
+ 30, 30, 31, 31, 30, 29, 29, 29, 28, 28, 27, 26, 26, 25, 24, 24, 23, 23,
+ 23, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 29, 30, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 26, 25, 25, 24, 23, 23, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 18, 18, 18, 17, 17, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 28, 29, 29, 29, 30, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 20, 20,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 26, 26, 27, 27,
+ 28, 28, 27, 26, 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 26, 26, 27, 27, 28, 28, 27, 26,
+ 26, 26, 26, 26, 24, 23, 23, 22, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17,
+ 17, 16, 16, 16, 15, 15, 24, 25, 25, 25, 26, 26, 26, 25, 25, 25, 24, 24,
+ 23, 22, 22, 21, 19, 19, 19, 18, 18, 17, 17, 17, 16, 16, 16, 15, 15, 15,
+ 14, 14, 23, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 22, 22, 20,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13, 13, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 14, 14, 14,
+ 13, 13, 13, 13, 21, 21, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 21, 20,
+ 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 14, 14, 14, 13, 13, 13, 13, 13,
+ 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 19, 19, 18, 17, 17,
+ 17, 16, 16, 15, 14, 14, 14, 14, 13, 13, 13, 12, 12, 12, 19, 20, 20, 20,
+ 20, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15,
+ 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 19, 19, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 13, 13,
+ 13, 12, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12,
+ 11, 11, 17, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 17,
+ 16, 16, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 4x8 */
+ 32, 32, 28, 20, 32, 31, 28, 21, 32, 30, 27, 21, 30, 28, 23, 19, 29, 27,
+ 21, 17, 26, 24, 19, 15, 22, 22, 17, 13, 20, 20, 16, 12,
+ /* Size 8x4 */
+ 32, 32, 32, 30, 29, 26, 22, 20, 32, 31, 30, 28, 27, 24, 22, 20, 28, 28,
+ 27, 23, 21, 19, 17, 16, 20, 21, 21, 19, 17, 15, 13, 12,
+ /* Size 8x16 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32,
+ 32, 31, 29, 25, 23, 21, 33, 32, 31, 31, 29, 25, 23, 21, 32, 32, 30, 30,
+ 28, 24, 23, 20, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23,
+ 22, 20, 30, 30, 28, 27, 24, 21, 20, 19, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 23, 25,
+ 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16, 15, 14, 21, 22, 22, 21,
+ 18, 15, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 18, 19, 19, 19, 16, 14,
+ 13, 12,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 32, 32, 32, 30, 28, 27, 26, 23, 22, 21, 19, 18, 33, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 23, 22, 21, 19, 32, 32, 32, 31,
+ 30, 29, 29, 28, 28, 26, 26, 24, 23, 22, 20, 19, 32, 31, 31, 31, 30, 28,
+ 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 28, 29, 29, 29, 28, 27, 26, 24,
+ 21, 21, 20, 19, 18, 18, 17, 16, 23, 24, 25, 25, 24, 24, 23, 21, 19, 18,
+ 18, 16, 16, 15, 14, 14, 22, 23, 23, 23, 23, 23, 22, 20, 18, 18, 17, 16,
+ 15, 14, 14, 13, 19, 20, 21, 21, 20, 21, 20, 19, 17, 16, 16, 14, 14, 13,
+ 12, 12,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 32, 32, 32, 29, 28, 27, 23, 23, 22, 19, 19, 17, 33, 32,
+ 32, 32, 32, 32, 31, 29, 29, 28, 24, 24, 22, 20, 20, 18, 33, 32, 32, 32,
+ 32, 32, 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32,
+ 31, 29, 29, 28, 24, 24, 23, 20, 20, 18, 33, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 32, 31, 31, 30, 30, 28,
+ 25, 25, 23, 21, 21, 19, 33, 32, 32, 32, 31, 31, 31, 29, 29, 28, 25, 25,
+ 23, 21, 21, 19, 32, 32, 32, 32, 31, 30, 30, 28, 28, 27, 24, 24, 23, 21,
+ 21, 19, 32, 32, 32, 31, 30, 30, 30, 28, 28, 27, 24, 24, 23, 20, 20, 19,
+ 32, 32, 32, 31, 30, 30, 29, 28, 28, 27, 24, 24, 23, 21, 21, 19, 32, 32,
+ 31, 31, 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 32, 31, 31,
+ 29, 29, 28, 27, 27, 26, 24, 24, 23, 21, 21, 19, 32, 31, 31, 31, 29, 28,
+ 28, 26, 26, 25, 23, 23, 22, 20, 20, 19, 30, 30, 30, 30, 28, 28, 27, 24,
+ 24, 23, 21, 21, 20, 19, 19, 18, 30, 30, 30, 30, 28, 28, 27, 24, 24, 23,
+ 21, 21, 20, 19, 19, 18, 29, 30, 30, 30, 28, 28, 26, 23, 23, 22, 20, 20,
+ 19, 18, 18, 17, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17,
+ 17, 16, 28, 29, 30, 29, 28, 27, 26, 22, 21, 21, 19, 19, 18, 17, 17, 16,
+ 27, 28, 28, 28, 26, 26, 25, 21, 21, 20, 18, 18, 18, 16, 16, 15, 26, 27,
+ 28, 27, 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 26, 27, 28, 27,
+ 26, 26, 24, 21, 20, 20, 18, 18, 17, 16, 16, 15, 24, 26, 26, 26, 24, 24,
+ 23, 20, 20, 19, 17, 17, 16, 15, 15, 14, 23, 24, 25, 25, 24, 24, 23, 20,
+ 19, 18, 16, 16, 16, 14, 14, 14, 23, 24, 25, 25, 24, 24, 23, 20, 19, 18,
+ 16, 16, 16, 14, 14, 13, 22, 23, 23, 23, 23, 23, 22, 19, 18, 18, 16, 16,
+ 15, 14, 14, 13, 21, 22, 23, 23, 22, 22, 21, 19, 18, 17, 15, 15, 15, 13,
+ 13, 13, 21, 22, 22, 22, 22, 22, 21, 18, 18, 17, 15, 15, 14, 13, 13, 13,
+ 19, 20, 21, 21, 21, 21, 20, 18, 17, 17, 14, 14, 14, 13, 13, 12, 19, 20,
+ 21, 21, 20, 20, 20, 17, 17, 16, 14, 14, 14, 12, 12, 12, 19, 20, 20, 20,
+ 20, 20, 19, 17, 17, 16, 14, 14, 13, 12, 12, 12, 18, 19, 19, 19, 19, 19,
+ 19, 17, 16, 15, 14, 14, 13, 12, 12, 11, 18, 19, 19, 19, 19, 19, 19, 17,
+ 16, 15, 14, 14, 13, 12, 12, 11,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 28, 27, 27, 26,
+ 24, 24, 23, 22, 22, 20, 20, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 23, 23,
+ 22, 21, 21, 20, 19, 19, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 23, 23, 22, 21, 21, 20,
+ 19, 19, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 26, 26, 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 28, 27, 27, 26, 26,
+ 26, 24, 24, 24, 23, 22, 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 29, 28, 28, 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28,
+ 27, 27, 26, 24, 24, 23, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18,
+ 17, 17, 17, 17, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24,
+ 24, 23, 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 27, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 25, 23, 23, 22, 21, 21,
+ 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15, 15, 23, 24, 24, 24,
+ 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17,
+ 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 23, 24, 24, 24, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15,
+ 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16, 15, 15, 14, 14, 14, 13,
+ 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18,
+ 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 19, 20,
+ 20, 20, 21, 21, 21, 21, 20, 21, 21, 21, 20, 19, 19, 18, 17, 17, 16, 16,
+ 16, 15, 14, 14, 14, 13, 13, 13, 12, 12, 12, 12, 17, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 17, 16, 16, 15, 15, 15, 14, 14, 13,
+ 13, 13, 13, 12, 12, 12, 11, 11,
+ /* Size 4x16 */
+ 33, 32, 27, 19, 32, 32, 28, 20, 32, 32, 28, 21, 32, 31, 28, 21, 32, 30,
+ 27, 20, 32, 29, 26, 21, 31, 28, 25, 20, 30, 28, 23, 19, 29, 27, 21, 17,
+ 28, 26, 20, 16, 27, 26, 20, 16, 24, 24, 18, 14, 23, 23, 18, 14, 22, 22,
+ 17, 13, 20, 20, 16, 12, 19, 19, 15, 12,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 31, 30, 29, 28, 27, 24, 23, 22, 20, 19, 32, 32,
+ 32, 31, 30, 29, 28, 28, 27, 26, 26, 24, 23, 22, 20, 19, 27, 28, 28, 28,
+ 27, 26, 25, 23, 21, 20, 20, 18, 18, 17, 16, 15, 19, 20, 21, 21, 20, 21,
+ 20, 19, 17, 16, 16, 14, 14, 13, 12, 12,
+ /* Size 8x32 */
+ 32, 33, 32, 32, 28, 23, 22, 19, 33, 32, 32, 31, 29, 24, 22, 20, 33, 32,
+ 32, 31, 29, 24, 23, 20, 33, 32, 32, 31, 29, 24, 23, 20, 33, 32, 32, 31,
+ 29, 25, 23, 21, 33, 32, 32, 31, 30, 25, 23, 21, 33, 32, 31, 31, 29, 25,
+ 23, 21, 32, 32, 31, 30, 28, 24, 23, 21, 32, 32, 30, 30, 28, 24, 23, 20,
+ 32, 32, 30, 29, 28, 24, 23, 21, 32, 31, 29, 28, 27, 24, 23, 21, 32, 31,
+ 29, 28, 27, 24, 23, 21, 32, 31, 29, 28, 26, 23, 22, 20, 30, 30, 28, 27,
+ 24, 21, 20, 19, 30, 30, 28, 27, 24, 21, 20, 19, 29, 30, 28, 26, 23, 20,
+ 19, 18, 28, 30, 28, 26, 21, 19, 18, 17, 28, 30, 28, 26, 21, 19, 18, 17,
+ 27, 28, 26, 25, 21, 18, 18, 16, 26, 28, 26, 24, 20, 18, 17, 16, 26, 28,
+ 26, 24, 20, 18, 17, 16, 24, 26, 24, 23, 20, 17, 16, 15, 23, 25, 24, 23,
+ 19, 16, 16, 14, 23, 25, 24, 23, 19, 16, 16, 14, 22, 23, 23, 22, 18, 16,
+ 15, 14, 21, 23, 22, 21, 18, 15, 15, 13, 21, 22, 22, 21, 18, 15, 14, 13,
+ 19, 21, 21, 20, 17, 14, 14, 13, 19, 21, 20, 20, 17, 14, 14, 12, 19, 20,
+ 20, 19, 17, 14, 13, 12, 18, 19, 19, 19, 16, 14, 13, 12, 18, 19, 19, 19,
+ 16, 14, 13, 12,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 29, 28, 28,
+ 27, 26, 26, 24, 23, 23, 22, 21, 21, 19, 19, 19, 18, 18, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 28, 28, 28, 26,
+ 25, 25, 23, 23, 22, 21, 21, 20, 19, 19, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 22,
+ 22, 21, 20, 20, 19, 19, 32, 31, 31, 31, 31, 31, 31, 30, 30, 29, 28, 28,
+ 28, 27, 27, 26, 26, 26, 25, 24, 24, 23, 23, 23, 22, 21, 21, 20, 20, 19,
+ 19, 19, 28, 29, 29, 29, 29, 30, 29, 28, 28, 28, 27, 27, 26, 24, 24, 23,
+ 21, 21, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 23, 24,
+ 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 21, 21, 20, 19, 19, 18, 18,
+ 18, 17, 16, 16, 16, 15, 15, 14, 14, 14, 14, 14, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 20, 20, 19, 18, 18, 18, 17, 17, 16, 16, 16,
+ 15, 15, 14, 14, 14, 13, 13, 13, 19, 20, 20, 20, 21, 21, 21, 21, 20, 21,
+ 21, 21, 20, 19, 19, 18, 17, 17, 16, 16, 16, 15, 14, 14, 14, 13, 13, 13,
+ 12, 12, 12, 12 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 27, 22, 21, 27, 22, 22, 22, 22, 22, 19, 18, 21, 22, 18, 16,
+ /* Size 8x8 */
+ 33, 33, 29, 24, 21, 22, 21, 20, 33, 32, 28, 24, 22, 23, 22, 21, 29, 28,
+ 25, 23, 22, 23, 22, 21, 24, 24, 23, 21, 20, 21, 20, 20, 21, 22, 22, 20,
+ 19, 19, 19, 19, 22, 23, 23, 21, 19, 18, 17, 17, 21, 22, 22, 20, 19, 17,
+ 17, 16, 20, 21, 21, 20, 19, 17, 16, 15,
+ /* Size 16x16 */
+ 32, 33, 34, 33, 31, 28, 27, 25, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 30, 27, 26, 24, 22, 22, 22, 22, 21, 21, 20, 20, 34, 33, 33, 32,
+ 29, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 33, 32, 32, 31, 28, 26,
+ 25, 24, 22, 22, 23, 23, 22, 22, 22, 21, 31, 30, 29, 28, 26, 24, 23, 23,
+ 22, 22, 22, 23, 22, 22, 22, 21, 28, 27, 26, 26, 24, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 22, 21, 27, 26, 25, 25, 23, 22, 22, 21, 21, 21, 21, 22,
+ 22, 22, 21, 21, 25, 24, 24, 24, 23, 22, 21, 21, 20, 20, 21, 21, 20, 20,
+ 20, 20, 21, 22, 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 23, 22, 22, 21, 21, 19, 19, 19, 19, 18, 18, 18, 18, 21, 22, 23, 23,
+ 23, 23, 22, 21, 19, 19, 19, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22,
+ 22, 20, 19, 19, 18, 17, 17, 17, 16, 16, 20, 21, 22, 22, 22, 22, 22, 20,
+ 19, 19, 18, 17, 17, 17, 16, 16, 20, 20, 21, 22, 22, 22, 21, 20, 19, 18,
+ 18, 17, 16, 16, 16, 15, 19, 20, 20, 21, 21, 21, 21, 20, 19, 18, 18, 17,
+ 16, 16, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 27, 25, 25, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 33, 30, 30, 29, 27, 27, 26, 24, 24, 23, 21, 21, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 33, 33, 33, 33, 33, 33, 32, 30,
+ 30, 29, 27, 27, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 33, 33, 33, 33, 33, 33, 32, 30, 30, 28, 27, 27,
+ 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20,
+ 20, 20, 34, 33, 33, 33, 33, 33, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23,
+ 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 34, 33,
+ 33, 33, 33, 32, 32, 29, 29, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 33, 33, 32, 32, 32, 32,
+ 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 29, 27, 27, 26,
+ 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 31, 30, 30, 30, 29, 29, 28, 27, 26, 26, 24, 24, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 30, 29, 29, 28, 28, 28, 28, 26, 26, 25, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27, 27, 27,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 27, 26, 26, 26, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 27, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 25, 24,
+ 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 21, 20, 20, 20, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22,
+ 22, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 17, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 16, 16, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16,
+ 16, 16, 16, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 20, 19, 19, 19, 18, 18, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20, 20, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17,
+ 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21, 21, 21,
+ 21, 21, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16,
+ 16, 15, 15, 15, 15, 15, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
+ 21, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15,
+ 14, 14, 19, 19, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 19,
+ 19, 19, 18, 18, 18, 17, 17, 16, 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 4x8 */
+ 33, 27, 22, 20, 33, 26, 22, 21, 28, 23, 22, 22, 24, 22, 20, 20, 22, 21,
+ 19, 19, 22, 22, 19, 17, 21, 22, 19, 16, 20, 21, 18, 15,
+ /* Size 8x4 */
+ 33, 33, 28, 24, 22, 22, 21, 20, 27, 26, 23, 22, 21, 22, 22, 21, 22, 22,
+ 22, 20, 19, 19, 19, 18, 20, 21, 22, 20, 19, 17, 16, 15,
+ /* Size 8x16 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32,
+ 27, 26, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23, 22, 21, 31, 28, 25, 23,
+ 22, 22, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22,
+ 22, 21, 24, 24, 22, 21, 20, 21, 20, 20, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 23,
+ 23, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21,
+ 19, 17, 17, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 20, 21, 20, 19, 17,
+ 16, 15,
+ /* Size 16x8 */
+ 32, 33, 34, 33, 31, 28, 26, 24, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 32, 31, 28, 26, 25, 24, 22, 22, 22, 23, 22, 22, 21, 20, 29, 28, 27, 27,
+ 25, 23, 22, 22, 21, 22, 22, 23, 22, 22, 22, 21, 27, 26, 26, 25, 23, 22,
+ 22, 21, 21, 21, 21, 22, 21, 21, 21, 20, 21, 22, 22, 22, 22, 22, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 23, 23, 22, 23, 22, 21, 19, 19,
+ 18, 18, 17, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 20, 19, 19, 18, 17,
+ 17, 17, 16, 16, 20, 20, 21, 21, 22, 22, 21, 20, 19, 18, 18, 17, 16, 16,
+ 16, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 29, 28, 27, 22, 21, 21, 21, 21, 20, 20, 20, 19, 33, 33,
+ 33, 32, 28, 27, 26, 22, 22, 22, 21, 21, 21, 20, 20, 19, 33, 33, 33, 32,
+ 28, 27, 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 33, 33, 33, 32, 28, 27,
+ 26, 22, 22, 22, 22, 22, 21, 20, 20, 20, 34, 33, 32, 32, 27, 26, 26, 23,
+ 22, 22, 23, 23, 22, 21, 21, 20, 34, 33, 32, 31, 27, 26, 25, 23, 22, 22,
+ 23, 23, 22, 21, 21, 20, 33, 32, 31, 31, 27, 26, 25, 23, 22, 22, 23, 23,
+ 22, 21, 21, 20, 31, 29, 29, 28, 25, 24, 24, 22, 22, 22, 23, 23, 22, 22,
+ 22, 21, 31, 29, 28, 28, 25, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 30, 28, 28, 28, 24, 23, 23, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26,
+ 26, 25, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 28, 26, 26, 25,
+ 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 21, 26, 26, 25, 24, 22, 22,
+ 22, 21, 21, 21, 22, 22, 22, 21, 21, 20, 24, 24, 24, 24, 22, 22, 21, 20,
+ 20, 20, 21, 21, 20, 20, 20, 20, 24, 24, 24, 24, 22, 22, 21, 20, 20, 20,
+ 21, 21, 20, 20, 20, 20, 23, 23, 23, 23, 22, 22, 21, 20, 20, 20, 20, 20,
+ 20, 20, 20, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 21, 20, 19, 19, 19, 19, 19, 18, 18, 18, 21, 22,
+ 22, 22, 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 18, 18, 18, 18, 18, 17, 21, 22, 23, 23, 22, 22,
+ 22, 20, 19, 19, 18, 18, 18, 17, 17, 17, 21, 22, 23, 23, 23, 22, 22, 20,
+ 19, 19, 18, 18, 17, 17, 17, 17, 21, 22, 23, 23, 22, 22, 22, 20, 19, 19,
+ 18, 18, 17, 17, 17, 16, 20, 22, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17,
+ 17, 16, 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16,
+ 16, 16, 20, 21, 22, 22, 22, 22, 21, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 20, 21, 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 20, 21,
+ 21, 21, 22, 22, 21, 19, 19, 18, 17, 17, 16, 16, 16, 15, 19, 20, 21, 21,
+ 21, 21, 21, 19, 19, 18, 17, 17, 16, 15, 15, 15, 19, 20, 20, 20, 21, 21,
+ 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 19, 20, 20, 20, 21, 21, 20, 19,
+ 19, 18, 17, 17, 16, 15, 15, 14,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 33, 33, 32, 29, 29, 28, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 33, 33, 33, 33, 32, 32, 31, 29,
+ 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 21, 21, 21, 20, 20, 33, 32, 32, 32, 32, 31, 31, 28, 28, 28, 25, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 21, 21, 21,
+ 20, 20, 29, 28, 28, 28, 27, 27, 27, 25, 25, 24, 23, 23, 22, 22, 22, 22,
+ 21, 21, 22, 22, 22, 22, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 28, 27,
+ 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25,
+ 25, 24, 23, 23, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 21, 21, 21, 21, 21, 21, 20, 20, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 21, 22, 22,
+ 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 21, 21, 22, 22, 23, 23, 23, 23,
+ 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16,
+ 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 20, 20,
+ 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 18, 18,
+ 18, 17, 17, 17, 16, 16, 16, 16, 16, 15, 15, 15, 19, 19, 20, 20, 20, 20,
+ 20, 21, 21, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 17, 17, 17, 17, 16,
+ 16, 16, 16, 15, 15, 15, 14, 14,
+ /* Size 4x16 */
+ 33, 28, 21, 20, 33, 27, 22, 20, 33, 26, 22, 21, 32, 26, 22, 21, 29, 24,
+ 22, 22, 26, 22, 22, 22, 26, 22, 21, 21, 24, 22, 20, 20, 22, 21, 19, 19,
+ 22, 22, 19, 18, 22, 22, 19, 18, 22, 22, 19, 17, 22, 22, 19, 16, 21, 22,
+ 19, 16, 21, 22, 18, 16, 20, 21, 18, 15,
+ /* Size 16x4 */
+ 33, 33, 33, 32, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 20, 28, 27,
+ 26, 26, 24, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 18, 20, 20, 21, 21, 22, 22,
+ 21, 20, 19, 18, 18, 17, 16, 16, 16, 15,
+ /* Size 8x32 */
+ 32, 33, 29, 27, 21, 21, 20, 20, 33, 33, 28, 26, 22, 21, 21, 20, 33, 33,
+ 28, 26, 22, 22, 21, 20, 33, 33, 28, 26, 22, 22, 21, 20, 34, 32, 27, 26,
+ 22, 23, 22, 21, 34, 32, 27, 25, 22, 23, 22, 21, 33, 31, 27, 25, 22, 23,
+ 22, 21, 31, 29, 25, 24, 22, 23, 22, 22, 31, 28, 25, 23, 22, 22, 22, 22,
+ 30, 28, 24, 23, 22, 23, 22, 22, 28, 26, 23, 22, 22, 23, 22, 22, 28, 26,
+ 23, 22, 22, 23, 22, 22, 26, 25, 22, 22, 21, 22, 22, 21, 24, 24, 22, 21,
+ 20, 21, 20, 20, 24, 24, 22, 21, 20, 21, 20, 20, 23, 23, 22, 21, 20, 20,
+ 20, 20, 21, 22, 21, 21, 19, 19, 19, 19, 21, 22, 21, 21, 19, 19, 19, 19,
+ 21, 22, 22, 21, 19, 19, 19, 18, 21, 22, 22, 21, 19, 18, 18, 18, 21, 22,
+ 22, 21, 19, 18, 18, 18, 21, 23, 22, 22, 19, 18, 18, 17, 21, 23, 23, 22,
+ 19, 18, 17, 17, 21, 23, 22, 22, 19, 18, 17, 17, 20, 22, 22, 21, 19, 17,
+ 17, 16, 20, 22, 22, 21, 19, 17, 17, 16, 20, 22, 22, 21, 19, 17, 17, 16,
+ 20, 21, 22, 21, 19, 17, 16, 16, 20, 21, 22, 21, 19, 17, 16, 16, 19, 21,
+ 21, 21, 19, 17, 16, 15, 19, 20, 21, 20, 19, 17, 16, 15, 19, 20, 21, 20,
+ 19, 17, 16, 15,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 34, 34, 33, 31, 31, 30, 28, 28, 26, 24, 24, 23, 21, 21,
+ 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 33, 33, 33, 33,
+ 32, 32, 31, 29, 28, 28, 26, 26, 25, 24, 24, 23, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 29, 28, 28, 28, 27, 27, 27, 25,
+ 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 27, 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21,
+ 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 23, 23, 23, 23, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17,
+ 17, 17, 17, 16, 16, 16, 16, 16, 20, 20, 20, 20, 21, 21, 21, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16,
+ 16, 15, 15, 15 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 29, 24, 32, 30, 28, 24, 29, 28, 21, 19, 24, 24, 19, 16,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 30, 28, 24, 22, 33, 32, 32, 32, 30, 28, 25, 23, 32, 32,
+ 31, 30, 29, 27, 24, 23, 32, 32, 30, 29, 28, 26, 24, 22, 30, 30, 29, 28,
+ 25, 23, 21, 20, 28, 28, 27, 26, 23, 20, 18, 17, 24, 25, 24, 24, 21, 18,
+ 16, 15, 22, 23, 23, 22, 20, 17, 15, 14,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 29, 27, 26, 24, 23, 23, 33, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 27, 25, 23, 23, 33, 32, 32, 32, 31, 31, 31, 30,
+ 29, 28, 28, 26, 26, 24, 23, 23, 33, 32, 32, 32, 31, 31, 30, 30, 29, 28,
+ 28, 26, 26, 24, 23, 23, 32, 32, 32, 32, 31, 30, 29, 28, 28, 27, 27, 26,
+ 25, 24, 23, 22, 32, 31, 31, 31, 30, 30, 28, 28, 27, 26, 26, 24, 24, 23,
+ 22, 22, 30, 30, 30, 31, 29, 29, 28, 27, 26, 24, 24, 23, 22, 22, 20, 20,
+ 29, 29, 30, 30, 28, 28, 27, 26, 24, 22, 22, 21, 20, 20, 19, 19, 28, 29,
+ 29, 30, 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 26, 27, 27, 28,
+ 26, 26, 26, 24, 23, 21, 20, 19, 19, 18, 17, 17, 25, 26, 26, 27, 26, 26,
+ 25, 24, 22, 20, 20, 19, 18, 17, 17, 16, 23, 24, 24, 25, 24, 24, 24, 23,
+ 22, 20, 19, 18, 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19,
+ 18, 17, 17, 16, 15, 15, 21, 22, 23, 23, 23, 23, 22, 22, 20, 19, 18, 17,
+ 16, 15, 15, 14,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 26, 26, 26, 24, 24, 23, 22, 22, 22, 20, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24,
+ 24, 24, 23, 22, 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 22,
+ 22, 21, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 29, 28, 27, 27, 26, 24, 24, 24, 23, 23, 23, 21, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 27, 25, 25, 25, 23, 23, 23, 22, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 27, 25, 25, 25, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 28, 27, 27, 26, 25, 25, 24,
+ 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27,
+ 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 28, 28, 26, 26, 26, 25, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28, 28, 27, 27, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 20, 30, 30, 30, 30, 30, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23,
+ 22, 22, 22, 21, 20, 20, 20, 19, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29,
+ 29, 28, 28, 28, 27, 26, 26, 26, 24, 24, 24, 23, 23, 23, 22, 22, 22, 21,
+ 20, 20, 20, 19, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 27, 26, 26, 25, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19,
+ 29, 29, 29, 29, 30, 30, 30, 30, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 19, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29, 29, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 26, 24, 24, 23, 22, 21, 21, 20, 20, 20, 20, 19,
+ 19, 19, 18, 18, 18, 18, 27, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 26,
+ 26, 26, 25, 23, 23, 23, 21, 20, 20, 20, 20, 20, 19, 18, 18, 18, 18, 17,
+ 17, 17, 26, 26, 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23,
+ 23, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 26, 26,
+ 27, 27, 27, 28, 28, 27, 26, 26, 26, 26, 26, 26, 24, 23, 23, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 25, 26, 26, 26, 26, 27,
+ 27, 26, 26, 26, 26, 25, 25, 25, 24, 22, 22, 22, 20, 20, 20, 19, 19, 19,
+ 18, 17, 17, 17, 17, 16, 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16,
+ 16, 15, 15, 15, 23, 24, 24, 24, 24, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15,
+ 23, 23, 24, 24, 24, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21,
+ 19, 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 15, 22, 22, 23, 23,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18,
+ 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 21, 22, 22, 22, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15,
+ 15, 15, 15, 14, 14, 14, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 20, 20, 20, 19, 18, 18, 17, 17, 17, 16, 15, 15, 15, 15, 14,
+ 14, 14, 20, 20, 21, 21, 21, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20, 19,
+ 19, 19, 18, 18, 18, 17, 16, 16, 16, 15, 15, 15, 14, 14, 14, 13,
+ /* Size 4x8 */
+ 33, 32, 29, 24, 32, 31, 30, 25, 32, 30, 28, 24, 32, 29, 27, 24, 30, 28,
+ 24, 21, 28, 26, 21, 18, 24, 24, 19, 16, 22, 22, 18, 15,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 30, 28, 24, 22, 32, 31, 30, 29, 28, 26, 24, 22, 29, 30,
+ 28, 27, 24, 21, 19, 18, 24, 25, 24, 24, 21, 18, 16, 15,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32,
+ 32, 32, 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 31, 30,
+ 29, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27,
+ 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 26, 28,
+ 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20, 17, 17, 23, 25, 24, 24,
+ 20, 19, 16, 16, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23, 23, 22, 19, 18,
+ 15, 15,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 30, 29, 28, 26, 25, 23, 22, 21, 33, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 30, 28, 26, 25, 23, 23, 33, 32, 32, 32,
+ 31, 31, 30, 30, 29, 28, 28, 26, 26, 24, 23, 23, 32, 32, 32, 31, 30, 30,
+ 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 29, 29, 30, 30, 29, 28, 28, 26,
+ 25, 23, 22, 21, 21, 20, 19, 19, 28, 29, 29, 30, 28, 28, 27, 26, 24, 22,
+ 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24, 24, 23, 21, 20, 19, 18,
+ 17, 16, 16, 15, 22, 23, 23, 23, 23, 23, 23, 22, 20, 19, 18, 17, 17, 16,
+ 15, 15,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 29, 28, 28, 26, 23, 23, 22, 19, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 22, 20, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 26, 24, 24, 23, 20, 33, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 26, 25, 25, 23, 20, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 27, 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 27,
+ 25, 25, 23, 21, 33, 32, 32, 32, 32, 31, 31, 31, 30, 29, 29, 27, 25, 25,
+ 23, 21, 33, 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 26, 24, 24, 23, 21,
+ 32, 32, 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32,
+ 32, 32, 31, 30, 30, 30, 28, 28, 28, 26, 24, 24, 23, 20, 32, 32, 32, 32,
+ 31, 29, 29, 29, 28, 28, 28, 26, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29,
+ 29, 28, 28, 27, 27, 25, 24, 24, 23, 21, 32, 32, 31, 31, 30, 29, 29, 28,
+ 28, 27, 27, 25, 24, 24, 23, 21, 32, 31, 31, 31, 30, 28, 28, 28, 26, 26,
+ 26, 24, 23, 23, 22, 20, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23,
+ 21, 21, 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 25, 24, 24, 23, 21, 21,
+ 20, 19, 30, 30, 30, 30, 29, 28, 28, 27, 24, 24, 24, 22, 21, 21, 20, 19,
+ 29, 29, 30, 30, 28, 27, 27, 26, 23, 22, 22, 20, 20, 20, 19, 17, 28, 29,
+ 30, 30, 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 28, 29, 30, 30,
+ 28, 27, 27, 26, 22, 21, 21, 20, 19, 19, 18, 17, 27, 28, 28, 28, 28, 26,
+ 26, 25, 22, 21, 21, 19, 18, 18, 18, 16, 26, 27, 28, 28, 26, 26, 26, 24,
+ 21, 20, 20, 19, 18, 18, 17, 16, 26, 27, 28, 28, 26, 26, 26, 24, 21, 20,
+ 20, 19, 18, 18, 17, 16, 25, 26, 26, 26, 26, 25, 25, 24, 21, 20, 20, 18,
+ 17, 17, 17, 15, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16,
+ 16, 14, 23, 24, 25, 25, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 16, 14,
+ 23, 24, 24, 24, 24, 24, 24, 23, 20, 19, 19, 17, 16, 16, 15, 14, 22, 23,
+ 23, 23, 23, 23, 23, 22, 19, 18, 18, 17, 16, 16, 15, 14, 21, 22, 23, 23,
+ 23, 22, 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 21, 22, 23, 23, 23, 22,
+ 22, 21, 19, 18, 18, 17, 15, 15, 15, 13, 20, 21, 22, 22, 21, 21, 21, 20,
+ 18, 18, 18, 16, 15, 15, 14, 13,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 26, 24, 24, 24, 23, 22, 22, 21, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25,
+ 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 28, 28, 28, 26, 25, 25, 24, 23, 23,
+ 23, 22, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 26, 26, 26, 24, 24, 24, 23, 23, 23, 21, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 21, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 25, 24, 24, 24, 23, 22, 22, 21, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 22, 21, 21, 20, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28,
+ 26, 25, 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18,
+ 28, 29, 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24,
+ 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 28, 29, 29, 29,
+ 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21, 21, 21,
+ 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 26, 26, 26, 26, 26, 27, 27, 27,
+ 26, 26, 26, 26, 25, 25, 24, 23, 23, 22, 20, 20, 20, 19, 19, 19, 18, 17,
+ 17, 17, 17, 17, 17, 16, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24,
+ 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15,
+ 15, 15, 23, 24, 24, 24, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 23, 21,
+ 21, 21, 20, 19, 19, 18, 18, 18, 17, 16, 16, 16, 16, 15, 15, 15, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 20, 20, 20, 19, 18,
+ 18, 18, 17, 17, 17, 16, 16, 15, 15, 15, 15, 14, 19, 20, 20, 20, 20, 21,
+ 21, 21, 21, 20, 20, 21, 21, 21, 20, 19, 19, 19, 17, 17, 17, 16, 16, 16,
+ 15, 14, 14, 14, 14, 13, 13, 13,
+ /* Size 4x16 */
+ 33, 32, 28, 23, 32, 32, 29, 24, 32, 32, 29, 25, 32, 31, 30, 25, 32, 30,
+ 28, 24, 32, 30, 28, 24, 32, 29, 27, 24, 31, 28, 26, 23, 30, 28, 24, 21,
+ 29, 27, 22, 20, 29, 27, 21, 19, 27, 26, 20, 18, 26, 25, 20, 17, 24, 24,
+ 19, 16, 23, 23, 18, 16, 22, 22, 18, 15,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 29, 27, 26, 24, 23, 22, 32, 32,
+ 32, 31, 30, 30, 29, 28, 28, 27, 27, 26, 25, 24, 23, 22, 28, 29, 29, 30,
+ 28, 28, 27, 26, 24, 22, 21, 20, 20, 19, 18, 18, 23, 24, 25, 25, 24, 24,
+ 24, 23, 21, 20, 19, 18, 17, 16, 16, 15,
+ /* Size 8x32 */
+ 32, 33, 33, 32, 29, 28, 23, 22, 33, 32, 32, 32, 29, 29, 24, 22, 33, 32,
+ 32, 32, 29, 29, 24, 23, 33, 32, 32, 32, 29, 29, 24, 23, 33, 32, 32, 32,
+ 30, 29, 25, 23, 33, 32, 32, 31, 30, 30, 25, 23, 33, 32, 32, 31, 30, 30,
+ 25, 23, 33, 32, 32, 31, 30, 29, 25, 23, 33, 32, 31, 30, 29, 28, 24, 23,
+ 32, 32, 31, 30, 28, 28, 24, 23, 32, 32, 31, 30, 28, 28, 24, 23, 32, 32,
+ 31, 29, 28, 28, 24, 23, 32, 31, 30, 29, 28, 27, 24, 23, 32, 31, 30, 29,
+ 28, 27, 24, 23, 32, 31, 30, 28, 26, 26, 23, 22, 30, 30, 29, 28, 25, 24,
+ 21, 20, 30, 30, 29, 28, 25, 24, 21, 20, 30, 30, 29, 28, 24, 24, 21, 20,
+ 29, 30, 28, 27, 23, 22, 20, 19, 28, 30, 28, 27, 22, 21, 19, 18, 28, 30,
+ 28, 27, 22, 21, 19, 18, 27, 28, 28, 26, 22, 21, 18, 18, 26, 28, 26, 26,
+ 21, 20, 18, 17, 26, 28, 26, 26, 21, 20, 18, 17, 25, 26, 26, 25, 21, 20,
+ 17, 17, 23, 25, 24, 24, 20, 19, 16, 16, 23, 25, 24, 24, 20, 19, 16, 16,
+ 23, 24, 24, 24, 20, 19, 16, 15, 22, 23, 23, 23, 19, 18, 16, 15, 21, 23,
+ 23, 22, 19, 18, 15, 15, 21, 23, 23, 22, 19, 18, 15, 15, 20, 22, 21, 21,
+ 18, 18, 15, 14,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 30, 30,
+ 29, 28, 28, 27, 26, 26, 25, 23, 23, 23, 22, 21, 21, 20, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 26, 25, 25, 24, 23, 23, 23, 22, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 26, 26, 26, 24,
+ 24, 24, 23, 23, 23, 21, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 21, 29, 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 26, 25,
+ 25, 24, 23, 22, 22, 22, 21, 21, 21, 20, 20, 20, 19, 19, 19, 18, 28, 29,
+ 29, 29, 29, 30, 30, 29, 28, 28, 28, 28, 27, 27, 26, 24, 24, 24, 22, 21,
+ 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 18, 23, 24, 24, 24, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 18, 18, 18,
+ 17, 16, 16, 16, 16, 15, 15, 15, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 20, 20, 20, 19, 18, 18, 18, 17, 17, 17, 16, 16, 15,
+ 15, 15, 15, 14 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 28, 22, 22, 28, 23, 22, 23, 22, 22, 19, 19, 22, 23, 19, 17,
+ /* Size 8x8 */
+ 33, 33, 30, 28, 24, 21, 22, 21, 33, 32, 29, 26, 24, 22, 23, 22, 30, 29,
+ 26, 24, 23, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 24, 24, 23, 22,
+ 21, 20, 20, 20, 21, 22, 22, 22, 20, 19, 19, 19, 22, 23, 23, 23, 20, 19,
+ 18, 17, 21, 22, 22, 22, 20, 19, 17, 17,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 31, 31, 28, 27, 25, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 30, 30, 27, 26, 24, 22, 22, 22, 22, 22, 21, 21, 33, 33, 33, 33,
+ 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 32, 30, 29,
+ 26, 25, 24, 23, 22, 23, 23, 23, 22, 22, 31, 30, 30, 30, 28, 27, 24, 24,
+ 23, 22, 22, 22, 22, 23, 22, 22, 31, 30, 29, 29, 27, 26, 24, 23, 23, 22,
+ 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 22,
+ 22, 23, 22, 22, 27, 26, 26, 25, 24, 23, 22, 22, 21, 21, 21, 21, 22, 22,
+ 22, 22, 25, 24, 24, 24, 23, 23, 22, 21, 21, 20, 20, 21, 21, 21, 20, 20,
+ 22, 22, 22, 23, 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 19, 19, 21, 22,
+ 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23,
+ 22, 22, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 21, 22, 22, 23, 22, 22,
+ 22, 22, 21, 20, 19, 19, 19, 18, 18, 18, 21, 22, 22, 23, 23, 23, 23, 22,
+ 21, 20, 19, 19, 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19,
+ 19, 18, 18, 17, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18,
+ 18, 17, 17, 17,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 25, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 24, 24, 22, 21, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 28, 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 28,
+ 27, 27, 26, 24, 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 33, 33, 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24,
+ 24, 24, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 34, 33,
+ 33, 33, 33, 32, 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 34, 33, 33, 33, 33, 32,
+ 32, 32, 30, 29, 29, 27, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 22, 22, 22, 22, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28,
+ 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24,
+ 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 31, 30, 30, 30,
+ 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 29, 28, 28, 28, 28, 27, 27, 27,
+ 25, 25, 25, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 27, 26, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23,
+ 23, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 18, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22,
+ 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19,
+ 19, 18, 18, 18, 18, 18, 18, 18, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 21, 21, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 21, 21, 20,
+ 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19,
+ 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17,
+ 17, 17, 17, 17, 17, 16, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17, 17, 17,
+ 17, 16, 20, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20,
+ 20, 20, 19, 19, 19, 18, 18, 18, 18, 17, 17, 17, 17, 16, 16, 16,
+ /* Size 4x8 */
+ 33, 27, 22, 21, 33, 26, 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 24, 22,
+ 20, 20, 22, 22, 19, 19, 22, 22, 19, 18, 21, 22, 19, 17,
+ /* Size 8x4 */
+ 33, 33, 29, 26, 24, 22, 22, 21, 27, 26, 24, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 20, 19, 19, 19, 21, 23, 22, 23, 20, 19, 18, 17,
+ /* Size 8x16 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32,
+ 30, 26, 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24,
+ 22, 22, 23, 22, 31, 28, 27, 24, 22, 22, 22, 22, 28, 26, 24, 22, 22, 22,
+ 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20, 21, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19, 18, 18, 21, 23, 23, 22,
+ 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19,
+ 17, 17,
+ /* Size 16x8 */
+ 32, 33, 33, 34, 31, 31, 28, 26, 24, 22, 21, 21, 21, 21, 20, 20, 33, 33,
+ 32, 32, 29, 28, 26, 25, 24, 22, 22, 22, 23, 23, 22, 22, 31, 30, 30, 29,
+ 28, 27, 24, 24, 23, 22, 22, 22, 22, 23, 22, 22, 28, 27, 26, 26, 24, 24,
+ 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 23, 22, 21, 19, 19, 18,
+ 18, 18, 17, 17, 20, 21, 22, 22, 22, 22, 22, 22, 20, 19, 19, 18, 18, 17,
+ 17, 17,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 23, 21, 21, 21, 21, 21, 20, 20, 33, 33,
+ 33, 33, 31, 27, 27, 26, 23, 22, 22, 21, 21, 21, 21, 20, 33, 33, 33, 33,
+ 30, 27, 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 33, 33, 30, 27,
+ 27, 26, 23, 22, 22, 22, 22, 22, 21, 20, 33, 33, 32, 32, 30, 26, 26, 26,
+ 23, 22, 22, 22, 22, 22, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22,
+ 22, 23, 23, 23, 22, 21, 34, 33, 32, 32, 29, 26, 26, 25, 23, 22, 22, 23,
+ 23, 23, 22, 21, 33, 32, 31, 31, 29, 26, 26, 25, 23, 22, 22, 23, 23, 23,
+ 22, 21, 31, 30, 29, 29, 28, 24, 24, 24, 22, 22, 22, 22, 23, 23, 22, 22,
+ 31, 29, 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 31, 29,
+ 28, 28, 27, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27,
+ 25, 23, 23, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 21, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21,
+ 21, 21, 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21,
+ 20, 20, 24, 24, 24, 24, 23, 22, 22, 21, 20, 20, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 21, 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 21, 22,
+ 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 19, 19, 19, 19, 18, 21, 22, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 19, 18, 18, 18, 18, 21, 22, 22, 22, 22, 22, 22, 21, 20, 19,
+ 19, 19, 18, 18, 18, 18, 21, 22, 23, 23, 22, 22, 22, 22, 20, 19, 19, 19,
+ 18, 18, 18, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18,
+ 17, 17, 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17,
+ 21, 22, 23, 23, 23, 22, 22, 22, 20, 19, 19, 18, 18, 18, 17, 17, 20, 21,
+ 22, 22, 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22,
+ 22, 21, 20, 19, 19, 18, 17, 17, 17, 16, 20, 21, 22, 22, 22, 22, 22, 21,
+ 20, 19, 19, 18, 17, 17, 17, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 33, 33, 33, 33, 32, 32, 32, 31, 29, 28, 28, 27,
+ 26, 26, 25, 24, 24, 24, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 22, 22,
+ 22, 22, 31, 31, 30, 30, 30, 29, 29, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 28, 27,
+ 27, 27, 26, 26, 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
+ 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20,
+ 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23,
+ 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17,
+ 17, 17, 21, 21, 22, 22, 22, 23, 23, 23, 23, 22, 22, 23, 23, 23, 22, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 20, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 20, 19, 19,
+ 19, 19, 18, 18, 18, 17, 17, 17, 17, 17, 17, 17, 20, 20, 20, 20, 21, 21,
+ 21, 21, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 17, 17, 17, 17, 16, 16, 16, 16,
+ /* Size 4x16 */
+ 33, 28, 21, 21, 33, 27, 22, 22, 33, 26, 22, 22, 33, 26, 22, 23, 30, 24,
+ 22, 23, 29, 24, 22, 22, 26, 22, 22, 23, 26, 22, 21, 22, 24, 22, 20, 21,
+ 22, 21, 20, 19, 22, 21, 19, 19, 22, 22, 19, 18, 22, 22, 19, 18, 22, 22,
+ 19, 18, 21, 22, 19, 17, 21, 22, 19, 17,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 30, 29, 26, 26, 24, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 26, 26, 24, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 22, 21, 22, 22, 22,
+ 22, 22, 22, 21, 20, 20, 19, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22,
+ 23, 22, 21, 19, 19, 18, 18, 18, 17, 17,
+ /* Size 8x32 */
+ 32, 33, 31, 28, 23, 21, 21, 20, 33, 33, 31, 27, 23, 22, 21, 21, 33, 33,
+ 30, 27, 23, 22, 22, 21, 33, 33, 30, 27, 23, 22, 22, 21, 33, 32, 30, 26,
+ 23, 22, 22, 22, 34, 32, 29, 26, 23, 22, 23, 22, 34, 32, 29, 26, 23, 22,
+ 23, 22, 33, 31, 29, 26, 23, 22, 23, 22, 31, 29, 28, 24, 22, 22, 23, 22,
+ 31, 28, 27, 24, 22, 22, 22, 22, 31, 28, 27, 24, 22, 22, 22, 22, 29, 27,
+ 25, 23, 22, 22, 23, 22, 28, 26, 24, 22, 22, 22, 23, 22, 28, 26, 24, 22,
+ 22, 22, 23, 22, 26, 25, 24, 22, 21, 21, 22, 22, 24, 24, 23, 22, 21, 20,
+ 21, 20, 24, 24, 23, 22, 21, 20, 21, 20, 24, 24, 23, 22, 20, 20, 20, 20,
+ 22, 22, 22, 21, 20, 20, 19, 19, 21, 22, 22, 21, 20, 19, 19, 19, 21, 22,
+ 22, 21, 20, 19, 19, 19, 21, 22, 22, 22, 20, 19, 19, 19, 21, 22, 22, 22,
+ 20, 19, 18, 18, 21, 22, 22, 22, 20, 19, 18, 18, 21, 23, 22, 22, 20, 19,
+ 18, 18, 21, 23, 23, 22, 20, 19, 18, 17, 21, 23, 23, 22, 20, 19, 18, 17,
+ 21, 23, 23, 22, 20, 19, 18, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22,
+ 22, 22, 20, 19, 17, 17, 20, 22, 22, 22, 20, 19, 17, 17, 20, 22, 22, 22,
+ 20, 19, 17, 17,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 24, 24,
+ 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 24, 24, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 22, 22, 22, 22, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 23,
+ 23, 23, 23, 22, 22, 23, 23, 23, 22, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18, 17, 17, 17, 17, 20, 21, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 20, 20, 20, 19, 19, 19, 19, 18, 18, 18, 17, 17, 17,
+ 17, 17, 17, 17 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 30, 27, 32, 31, 29, 26, 30, 29, 26, 23, 27, 26, 23, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 32, 31, 30, 28, 25, 33, 32, 32, 32, 31, 30, 28, 26, 32, 32,
+ 32, 31, 30, 29, 28, 26, 32, 32, 31, 30, 29, 28, 27, 25, 31, 31, 30, 29,
+ 28, 26, 25, 23, 30, 30, 29, 28, 26, 24, 22, 21, 28, 28, 28, 27, 25, 22,
+ 20, 19, 25, 26, 26, 25, 23, 21, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 28, 28, 25, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 26, 26, 24, 33, 32, 32, 32, 32, 31, 31, 30, 30, 29, 29, 28,
+ 28, 26, 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26,
+ 26, 24, 32, 32, 32, 32, 32, 30, 30, 29, 29, 28, 28, 27, 27, 26, 26, 24,
+ 30, 30, 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 30, 30,
+ 30, 31, 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 22, 28, 29, 29, 30,
+ 30, 28, 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28,
+ 28, 27, 27, 24, 24, 21, 21, 20, 20, 19, 26, 27, 27, 28, 28, 26, 26, 26,
+ 26, 23, 23, 20, 20, 19, 19, 18, 26, 27, 27, 28, 28, 26, 26, 26, 26, 23,
+ 23, 20, 20, 19, 19, 18, 23, 24, 24, 25, 25, 24, 24, 24, 24, 22, 22, 19,
+ 19, 18, 18, 16,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28,
+ 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 27, 27, 27, 26, 25, 25, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 26, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 26,
+ 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28,
+ 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 25, 24, 24, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 25, 24, 24, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30,
+ 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25,
+ 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 30, 30, 30, 30, 30, 30, 31, 31,
+ 31, 30, 29, 29, 29, 28, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 23, 23, 23, 22, 22, 22, 29, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 20, 20, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 25, 23, 23, 23, 22, 21, 21, 21, 20, 20, 20,
+ 20, 19, 18, 18, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26,
+ 26, 26, 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18,
+ 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24,
+ 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 26, 26, 27, 27,
+ 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 24, 23, 23, 23, 22,
+ 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 26, 26, 25, 25, 25, 25, 23, 22, 22, 22, 21, 20, 20, 20, 19,
+ 18, 18, 18, 18, 17, 17, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17,
+ 16, 16, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 4x8 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 31, 30, 27, 32, 31, 28, 26, 31, 30,
+ 27, 24, 30, 28, 25, 22, 28, 27, 23, 20, 26, 26, 22, 18,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 31, 30, 28, 26, 32, 32, 31, 31, 30, 28, 27, 26, 30, 30,
+ 30, 28, 27, 25, 23, 22, 26, 27, 27, 26, 24, 22, 20, 18,
+ /* Size 8x16 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31,
+ 31, 30, 30, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28,
+ 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27, 27, 24,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 28, 30,
+ 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 26, 28, 28, 26,
+ 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 23, 25, 25, 24, 24, 19,
+ 19, 16,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 30, 28, 28, 26, 26, 23, 33, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 33, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 28, 28, 25, 32, 32, 32, 31, 31, 30,
+ 30, 29, 29, 28, 28, 27, 27, 26, 26, 24, 32, 32, 32, 31, 31, 30, 30, 29,
+ 29, 28, 28, 27, 27, 26, 26, 24, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24,
+ 24, 21, 21, 20, 20, 19, 28, 29, 29, 30, 30, 28, 28, 27, 27, 24, 24, 21,
+ 21, 20, 20, 19, 23, 24, 24, 25, 25, 24, 24, 24, 24, 21, 21, 19, 19, 18,
+ 18, 16,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 26, 23, 23, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 30, 29, 29, 29, 26, 24, 24, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 30, 29, 29, 29, 27, 24, 24, 33, 32, 32, 32, 32, 32, 32, 32, 32, 30,
+ 29, 29, 29, 27, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 28, 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28,
+ 25, 25, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 28, 25, 25,
+ 33, 32, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 27, 25, 25, 32, 32,
+ 32, 32, 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32,
+ 32, 31, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 32, 32, 32, 32, 32, 31, 30, 30,
+ 30, 28, 28, 28, 28, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28,
+ 27, 27, 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27,
+ 27, 26, 24, 24, 32, 32, 31, 31, 31, 30, 29, 29, 29, 28, 27, 27, 27, 26,
+ 24, 24, 31, 31, 31, 31, 31, 30, 28, 28, 28, 27, 26, 26, 26, 24, 23, 23,
+ 30, 30, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30,
+ 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 30, 30, 30, 30,
+ 30, 29, 28, 28, 28, 26, 24, 24, 24, 23, 21, 21, 29, 30, 30, 30, 30, 28,
+ 28, 28, 28, 25, 23, 23, 23, 22, 20, 20, 28, 29, 30, 30, 30, 28, 27, 27,
+ 27, 24, 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24,
+ 21, 21, 21, 20, 19, 19, 28, 29, 30, 30, 30, 28, 27, 27, 27, 24, 21, 21,
+ 21, 20, 19, 19, 28, 28, 28, 28, 28, 27, 26, 26, 26, 23, 21, 21, 21, 20,
+ 18, 18, 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18,
+ 26, 27, 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 26, 27,
+ 28, 28, 28, 26, 26, 26, 26, 23, 20, 20, 20, 19, 18, 18, 25, 26, 26, 26,
+ 26, 26, 24, 24, 24, 22, 20, 20, 20, 18, 17, 17, 23, 24, 25, 25, 25, 24,
+ 24, 24, 24, 21, 19, 19, 19, 18, 16, 16, 23, 24, 25, 25, 25, 24, 24, 24,
+ 24, 21, 19, 19, 19, 18, 16, 16,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 27, 27, 27, 26, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26,
+ 25, 25, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 27, 26, 26, 26, 26, 24, 24, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27,
+ 27, 26, 26, 26, 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26,
+ 26, 24, 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24,
+ 30, 30, 30, 30, 30, 30, 31, 31, 31, 30, 29, 29, 29, 28, 28, 28, 28, 27,
+ 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 21, 21, 28, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21,
+ 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28,
+ 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21, 21, 21, 20, 20, 20, 20,
+ 19, 19, 26, 26, 27, 27, 27, 27, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26,
+ 26, 24, 23, 23, 23, 22, 20, 20, 20, 20, 19, 19, 19, 18, 18, 18, 23, 24,
+ 24, 24, 24, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21,
+ 21, 20, 19, 19, 19, 18, 18, 18, 18, 17, 16, 16, 23, 24, 24, 24, 24, 25,
+ 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19,
+ 19, 18, 18, 18, 18, 17, 16, 16,
+ /* Size 4x16 */
+ 33, 32, 30, 26, 32, 32, 30, 27, 32, 32, 30, 27, 32, 32, 31, 28, 32, 32,
+ 31, 28, 32, 31, 29, 26, 32, 31, 29, 26, 32, 30, 28, 26, 32, 30, 28, 26,
+ 30, 29, 26, 23, 30, 29, 26, 23, 29, 28, 24, 20, 29, 28, 24, 20, 27, 26,
+ 23, 19, 27, 26, 23, 19, 24, 24, 21, 18,
+ /* Size 16x4 */
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 30, 30, 29, 29, 27, 27, 24, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 29, 29, 28, 28, 26, 26, 24, 30, 30, 30, 31,
+ 31, 29, 29, 28, 28, 26, 26, 24, 24, 23, 23, 21, 26, 27, 27, 28, 28, 26,
+ 26, 26, 26, 23, 23, 20, 20, 19, 19, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 32, 32, 28, 28, 23, 33, 33, 33, 32, 32, 29, 29, 24, 33, 32,
+ 32, 32, 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 24, 33, 32, 32, 32,
+ 32, 29, 29, 24, 33, 32, 32, 32, 32, 29, 29, 25, 33, 32, 32, 31, 31, 30,
+ 30, 25, 33, 32, 32, 31, 31, 30, 30, 25, 33, 32, 32, 31, 31, 30, 30, 25,
+ 33, 32, 32, 31, 31, 29, 29, 25, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32,
+ 32, 30, 30, 28, 28, 24, 32, 32, 32, 30, 30, 28, 28, 24, 32, 32, 32, 30,
+ 30, 28, 28, 24, 32, 31, 31, 29, 29, 27, 27, 24, 32, 31, 31, 29, 29, 27,
+ 27, 24, 32, 31, 31, 29, 29, 27, 27, 24, 31, 31, 31, 28, 28, 26, 26, 23,
+ 30, 30, 30, 28, 28, 24, 24, 21, 30, 30, 30, 28, 28, 24, 24, 21, 30, 30,
+ 30, 28, 28, 24, 24, 21, 29, 30, 30, 28, 28, 23, 23, 20, 28, 30, 30, 27,
+ 27, 21, 21, 19, 28, 30, 30, 27, 27, 21, 21, 19, 28, 30, 30, 27, 27, 21,
+ 21, 19, 28, 28, 28, 26, 26, 21, 21, 18, 26, 28, 28, 26, 26, 20, 20, 18,
+ 26, 28, 28, 26, 26, 20, 20, 18, 26, 28, 28, 26, 26, 20, 20, 18, 25, 26,
+ 26, 24, 24, 20, 20, 17, 23, 25, 25, 24, 24, 19, 19, 16, 23, 25, 25, 24,
+ 24, 19, 19, 16,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 30, 30, 29, 28, 28, 28, 28, 26, 26, 26, 25, 23, 23, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 30, 28, 28, 28, 28, 26, 25, 25, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 28, 28, 26, 25, 25, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24,
+ 24, 24, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 28, 29,
+ 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24,
+ 24, 23, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19, 28, 29, 29, 29, 29, 29,
+ 30, 30, 30, 29, 28, 28, 28, 28, 27, 27, 27, 26, 24, 24, 24, 23, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23, 21, 21, 21, 20, 19, 19, 19, 18, 18, 18,
+ 18, 17, 16, 16 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 30, 24, 22, 30, 26, 23, 22, 24, 23, 21, 21, 22, 22, 21, 19,
+ /* Size 8x8 */
+ 33, 33, 32, 29, 26, 23, 21, 21, 33, 33, 31, 28, 25, 23, 22, 22, 32, 31,
+ 29, 26, 24, 23, 22, 23, 29, 28, 26, 24, 23, 22, 22, 22, 26, 25, 24, 23,
+ 22, 21, 21, 22, 23, 23, 23, 22, 21, 20, 20, 20, 21, 22, 22, 22, 21, 20,
+ 19, 19, 21, 22, 23, 22, 22, 20, 19, 18,
+ /* Size 16x16 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 25, 25, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 33, 33, 33, 33,
+ 33, 30, 30, 27, 27, 24, 24, 22, 22, 22, 22, 22, 34, 33, 33, 32, 32, 29,
+ 29, 26, 26, 24, 24, 22, 22, 23, 23, 23, 34, 33, 33, 32, 32, 29, 29, 26,
+ 26, 24, 24, 22, 22, 23, 23, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23,
+ 23, 22, 22, 22, 22, 23, 31, 30, 30, 29, 29, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22,
+ 22, 23, 28, 27, 27, 26, 26, 24, 24, 22, 22, 22, 22, 21, 21, 22, 22, 23,
+ 25, 24, 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 25, 24,
+ 24, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22,
+ 22, 21, 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 22, 22, 21,
+ 21, 19, 19, 19, 19, 19, 21, 22, 22, 23, 23, 23, 23, 23, 23, 21, 21, 19,
+ 19, 19, 19, 18,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 25, 25, 25, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 29, 28, 28, 28, 26, 24, 24, 24, 23,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30,
+ 30, 28, 27, 27, 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27,
+ 27, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 23, 23, 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31,
+ 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23,
+ 23, 23, 23, 23, 34, 33, 33, 33, 33, 33, 32, 32, 32, 31, 29, 29, 29, 28,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 29, 28, 28, 28, 26, 25, 25, 25, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 26, 26,
+ 26, 25, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 29, 29, 28, 28, 28, 28, 28, 28, 28, 26, 25, 25, 25, 24, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28,
+ 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 23, 23, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24, 23, 23, 23, 23,
+ 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 21, 21, 21, 21, 21, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19,
+ 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
+ 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19, 19, 19,
+ 19, 19, 19, 18, 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18,
+ 18, 18, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18,
+ /* Size 4x8 */
+ 33, 30, 24, 21, 33, 29, 24, 22, 31, 28, 23, 22, 28, 25, 22, 22, 26, 23,
+ 21, 21, 23, 22, 21, 20, 22, 22, 20, 19, 22, 22, 21, 19,
+ /* Size 8x4 */
+ 33, 33, 31, 28, 26, 23, 22, 22, 30, 29, 28, 25, 23, 22, 22, 22, 24, 24,
+ 23, 22, 21, 21, 20, 21, 21, 22, 22, 22, 21, 20, 19, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33,
+ 33, 27, 27, 22, 22, 22, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26,
+ 26, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22,
+ 22, 22, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 21, 22,
+ 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 22,
+ 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19,
+ 19, 18,
+ /* Size 16x8 */
+ 32, 33, 33, 34, 34, 31, 31, 28, 28, 24, 24, 21, 21, 21, 21, 21, 33, 33,
+ 33, 32, 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 33, 33, 33, 32,
+ 32, 28, 28, 26, 26, 24, 24, 22, 22, 22, 22, 23, 28, 27, 27, 26, 26, 24,
+ 24, 22, 22, 22, 22, 21, 21, 22, 22, 22, 28, 27, 27, 26, 26, 24, 24, 22,
+ 22, 22, 22, 21, 21, 22, 22, 22, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20,
+ 20, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 20, 19,
+ 19, 19, 19, 19, 21, 22, 22, 23, 23, 22, 22, 23, 23, 21, 21, 19, 19, 18,
+ 18, 18,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 31, 28, 28, 28, 24, 21, 21, 21, 21, 21, 21, 33, 33,
+ 33, 33, 33, 30, 28, 28, 28, 24, 22, 22, 22, 21, 21, 21, 33, 33, 33, 33,
+ 33, 30, 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30,
+ 27, 27, 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 30, 27, 27,
+ 27, 24, 22, 22, 22, 22, 22, 22, 33, 33, 32, 32, 32, 29, 26, 26, 26, 24,
+ 22, 22, 22, 22, 22, 22, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22,
+ 22, 23, 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23,
+ 23, 23, 34, 33, 32, 32, 32, 29, 26, 26, 26, 24, 22, 22, 22, 23, 23, 23,
+ 32, 31, 30, 30, 30, 28, 25, 25, 25, 23, 22, 22, 22, 22, 23, 23, 31, 30,
+ 28, 28, 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 31, 30, 28, 28, 28, 26,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 29, 28, 27, 27, 27, 25, 23, 23,
+ 23, 22, 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22,
+ 22, 22, 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22,
+ 22, 22, 23, 23, 28, 27, 26, 26, 26, 24, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 26, 26, 25, 25, 25, 23, 22, 22, 22, 21, 21, 21, 21, 21, 22, 22,
+ 24, 24, 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 24, 24, 24, 24,
+ 24, 23, 22, 22, 22, 21, 20, 20, 20, 20, 21, 21, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20,
+ 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19,
+ 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 20, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23,
+ 23, 22, 22, 22, 22, 21, 19, 19, 19, 19, 18, 18, 21, 22, 23, 23, 23, 23,
+ 22, 22, 22, 21, 19, 19, 19, 18, 18, 18, 21, 22, 23, 23, 23, 23, 22, 22,
+ 22, 21, 19, 19, 19, 18, 18, 18,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 30, 30, 28, 27, 27, 27, 26, 24, 24, 24, 23,
+ 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28,
+ 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26,
+ 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 26, 26, 26, 25, 24, 24, 24, 23, 23, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 28, 28, 27, 27, 27, 26,
+ 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 21,
+ 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19,
+ 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 20, 20, 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 21, 21,
+ 22, 22, 22, 22, 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21,
+ 21, 20, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 21, 21, 22, 22, 22, 22,
+ 23, 23, 23, 23, 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19,
+ 19, 19, 18, 18, 18, 18, 18, 18,
+ /* Size 4x16 */
+ 33, 31, 24, 21, 33, 30, 24, 22, 33, 30, 24, 22, 33, 29, 24, 23, 33, 29,
+ 24, 23, 30, 26, 23, 22, 30, 26, 23, 22, 27, 24, 22, 22, 27, 24, 22, 22,
+ 24, 23, 21, 20, 24, 23, 21, 20, 21, 22, 20, 19, 21, 22, 20, 19, 22, 22,
+ 20, 19, 22, 22, 20, 19, 22, 23, 21, 18,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 30, 30, 27, 27, 24, 24, 21, 21, 22, 22, 22, 31, 30,
+ 30, 29, 29, 26, 26, 24, 24, 23, 23, 22, 22, 22, 22, 23, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 20, 21, 21, 22, 22, 23, 23, 22,
+ 22, 22, 22, 20, 20, 19, 19, 19, 19, 18,
+ /* Size 8x32 */
+ 32, 33, 33, 28, 28, 21, 21, 21, 33, 33, 33, 28, 28, 22, 22, 21, 33, 33,
+ 33, 27, 27, 22, 22, 22, 33, 33, 33, 27, 27, 22, 22, 22, 33, 33, 33, 27,
+ 27, 22, 22, 22, 33, 32, 32, 26, 26, 22, 22, 22, 34, 32, 32, 26, 26, 22,
+ 22, 23, 34, 32, 32, 26, 26, 22, 22, 23, 34, 32, 32, 26, 26, 22, 22, 23,
+ 32, 30, 30, 25, 25, 22, 22, 23, 31, 28, 28, 24, 24, 22, 22, 22, 31, 28,
+ 28, 24, 24, 22, 22, 22, 31, 28, 28, 24, 24, 22, 22, 22, 29, 27, 27, 23,
+ 23, 22, 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 28, 26, 26, 22, 22, 22,
+ 22, 23, 28, 26, 26, 22, 22, 22, 22, 23, 26, 25, 25, 22, 22, 21, 21, 22,
+ 24, 24, 24, 22, 22, 20, 20, 21, 24, 24, 24, 22, 22, 20, 20, 21, 24, 24,
+ 24, 22, 22, 20, 20, 21, 23, 23, 23, 22, 22, 20, 20, 20, 21, 22, 22, 21,
+ 21, 19, 19, 19, 21, 22, 22, 21, 21, 19, 19, 19, 21, 22, 22, 21, 21, 19,
+ 19, 19, 21, 22, 22, 22, 22, 19, 19, 19, 21, 22, 22, 22, 22, 19, 19, 18,
+ 21, 22, 22, 22, 22, 19, 19, 18, 21, 22, 22, 22, 22, 19, 19, 18, 21, 23,
+ 23, 22, 22, 19, 19, 18, 21, 23, 23, 22, 22, 19, 19, 18, 21, 23, 23, 22,
+ 22, 19, 19, 18,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 31, 31, 29, 28, 28, 28, 26,
+ 24, 24, 24, 23, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 30, 28, 28, 28, 27, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22,
+ 22, 22, 22, 23, 23, 23, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 26, 26, 26, 26, 25, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 20, 20, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20, 20, 20, 19, 19,
+ 19, 19, 19, 19, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23,
+ 22, 22, 22, 23, 23, 23, 23, 22, 21, 21, 21, 20, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 18 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 29, 32, 32, 31, 29, 32, 31, 29, 27, 29, 29, 27, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 32, 32, 32, 30, 29, 33, 32, 32, 32, 32, 31, 30, 29, 33, 32,
+ 32, 32, 32, 31, 31, 30, 32, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 30,
+ 29, 29, 28, 27, 32, 31, 31, 30, 29, 28, 27, 26, 30, 30, 31, 29, 28, 27,
+ 26, 24, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 30, 28, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 28, 28, 33, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 29, 29, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 28, 28, 28, 27, 27, 31, 31, 31, 31,
+ 31, 31, 30, 29, 29, 28, 28, 27, 26, 26, 24, 24, 30, 30, 30, 30, 31, 31,
+ 29, 29, 28, 28, 28, 26, 26, 25, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 26, 25, 24, 23, 23, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27,
+ 27, 24, 24, 23, 21, 21, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 24,
+ 24, 23, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 26,
+ 26, 25, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 30, 30,
+ 30, 30, 30, 30, 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30,
+ 30, 31, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26,
+ 26, 26, 25, 24, 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 24, 24, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 23, 23, 23, 23, 23,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 28,
+ 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 21, 28, 29, 29, 29,
+ 29, 29, 29, 30, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 24, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24,
+ 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 24, 24, 24, 24, 23, 22, 21, 21,
+ 21, 21, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 27, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 21, 21, 21, 21, 20,
+ /* Size 4x8 */
+ 33, 33, 32, 29, 32, 32, 32, 29, 32, 32, 31, 30, 32, 32, 30, 28, 32, 31,
+ 29, 27, 31, 31, 28, 26, 30, 30, 28, 24, 29, 30, 27, 21,
+ /* Size 8x4 */
+ 33, 32, 32, 32, 32, 31, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30, 32, 32,
+ 31, 30, 29, 28, 28, 27, 29, 29, 30, 28, 27, 26, 24, 21,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32,
+ 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 31, 30, 30,
+ 29, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 31, 31,
+ 31, 29, 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29,
+ 28, 28, 24, 23, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27,
+ 22, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 30, 30, 28, 28, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 30, 30,
+ 30, 29, 29, 28, 28, 28, 27, 27, 32, 32, 32, 32, 31, 31, 30, 30, 30, 29,
+ 29, 28, 28, 28, 27, 27, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 22, 22, 28, 29, 29, 29, 30, 30, 28, 28, 28, 27, 27, 25, 24, 23,
+ 21, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29,
+ 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 29, 29, 29, 29, 33, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 30, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 30, 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30,
+ 30, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28,
+ 28, 28, 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 28, 27, 27, 27,
+ 32, 32, 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32,
+ 32, 31, 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 32, 32, 31,
+ 31, 31, 30, 29, 29, 29, 29, 28, 28, 27, 27, 27, 32, 31, 31, 31, 31, 31,
+ 30, 29, 28, 28, 28, 28, 26, 26, 26, 26, 31, 31, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28,
+ 28, 26, 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 25, 24,
+ 24, 24, 30, 30, 30, 30, 30, 30, 29, 28, 28, 28, 28, 26, 24, 23, 23, 23,
+ 29, 29, 30, 30, 30, 30, 28, 28, 27, 27, 27, 25, 23, 22, 22, 22, 28, 29,
+ 29, 30, 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30,
+ 30, 30, 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 29, 29, 30, 30, 30,
+ 28, 28, 27, 27, 27, 24, 22, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 27,
+ 26, 26, 26, 24, 22, 21, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 29, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 28,
+ 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 29, 29, 29, 29, 29, 29, 30, 30, 30, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 24, 23, 22, 22,
+ 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29,
+ 29, 29, 29, 29, 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 28, 29, 29, 29, 29, 29,
+ 29, 29, 30, 30, 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24,
+ 24, 24, 23, 22, 21, 21, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 32, 28, 33, 32, 32, 29, 32, 32, 32, 29, 32, 32, 32, 29, 32, 32,
+ 31, 30, 32, 32, 31, 30, 32, 32, 30, 28, 32, 32, 30, 28, 32, 32, 30, 28,
+ 32, 31, 29, 27, 32, 31, 29, 27, 31, 31, 28, 25, 30, 30, 28, 24, 30, 30,
+ 28, 23, 29, 30, 27, 21, 29, 30, 27, 21,
+ /* Size 16x4 */
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 32, 32, 32, 32,
+ 31, 31, 30, 30, 30, 29, 29, 28, 28, 28, 27, 27, 28, 29, 29, 29, 30, 30,
+ 28, 28, 28, 27, 27, 25, 24, 23, 21, 21,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 32, 32, 29, 28, 33, 33, 33, 32, 32, 32, 29, 29, 33, 32,
+ 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 32, 29, 29, 33, 32, 32, 32, 32, 32,
+ 30, 29, 33, 32, 32, 32, 32, 32, 30, 29, 33, 32, 32, 32, 31, 31, 30, 30,
+ 33, 32, 32, 32, 31, 31, 30, 30, 33, 32, 32, 32, 31, 31, 30, 30, 33, 32,
+ 32, 31, 31, 31, 29, 29, 33, 32, 32, 31, 30, 30, 29, 28, 32, 32, 32, 31,
+ 30, 30, 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 32, 31, 30, 30,
+ 28, 28, 32, 32, 32, 31, 30, 30, 28, 28, 32, 32, 31, 31, 29, 29, 28, 27,
+ 32, 32, 31, 30, 29, 29, 28, 27, 32, 32, 31, 30, 29, 29, 28, 27, 32, 32,
+ 31, 30, 29, 29, 28, 27, 32, 31, 31, 30, 28, 28, 26, 26, 31, 31, 31, 29,
+ 28, 28, 26, 25, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28,
+ 25, 24, 30, 30, 30, 29, 28, 28, 25, 24, 30, 30, 30, 29, 28, 28, 24, 23,
+ 29, 30, 30, 28, 27, 27, 23, 22, 28, 29, 30, 28, 27, 27, 22, 21, 28, 29,
+ 30, 28, 27, 27, 22, 21, 28, 29, 30, 28, 27, 27, 22, 21, 28, 28, 28, 28,
+ 26, 26, 22, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 30, 30, 30, 29, 29, 29, 28, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 29, 29, 29, 29, 29, 29,
+ 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 26, 26, 25,
+ 25, 25, 24, 23, 22, 22, 22, 22, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30,
+ 30, 29, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22,
+ 21, 21, 21, 21 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 32, 27, 22, 32, 30, 25, 22, 27, 25, 22, 22, 22, 22, 22, 20,
+ /* Size 8x8 */
+ 33, 33, 34, 30, 28, 26, 24, 21, 33, 33, 33, 30, 28, 26, 24, 22, 34, 33,
+ 32, 29, 26, 25, 24, 22, 30, 30, 29, 26, 24, 23, 23, 22, 28, 28, 26, 24,
+ 22, 22, 22, 22, 26, 26, 25, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 21,
+ 21, 20, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 30, 28, 28, 26, 25, 23, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 29, 28, 26, 26, 25, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29,
+ 28, 26, 26, 24, 24, 23, 22, 22, 34, 33, 33, 33, 32, 32, 30, 29, 28, 26,
+ 26, 24, 24, 23, 22, 22, 31, 31, 30, 30, 30, 30, 28, 27, 26, 24, 24, 23,
+ 23, 23, 22, 22, 31, 30, 30, 29, 29, 29, 27, 26, 26, 24, 24, 23, 23, 22,
+ 22, 22, 30, 28, 28, 28, 28, 28, 26, 26, 24, 23, 23, 23, 22, 22, 22, 22,
+ 28, 27, 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 28, 27,
+ 27, 26, 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 26, 25, 25, 25,
+ 24, 24, 23, 23, 23, 22, 22, 21, 21, 21, 20, 20, 25, 24, 24, 24, 24, 24,
+ 23, 23, 22, 22, 22, 21, 21, 21, 20, 20, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 19, 19,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 30, 28,
+ 28, 28, 28, 27, 26, 25, 25, 25, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 29, 28, 28, 28, 28, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24,
+ 23, 22, 21, 21, 21, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 30, 30, 30, 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30,
+ 28, 28, 27, 27, 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27,
+ 27, 26, 25, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29,
+ 29, 29, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 33, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 30, 28, 28, 28, 28, 27, 26, 25, 25, 25, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 28,
+ 28, 27, 27, 27, 26, 25, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24,
+ 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 31, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 27, 26, 26, 26, 26, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26, 26, 26, 25, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 22, 27, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 24,
+ 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 26, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 23, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 21, 25, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 21, 20, 20, 20, 20, 20, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 21, 20,
+ 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ 20, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 19, 19, 19, 19, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 19, 19,
+ 19, 19, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 4x8 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 32, 26, 22, 30, 28, 24, 22, 28, 26,
+ 22, 22, 26, 25, 22, 21, 24, 24, 22, 20, 21, 22, 21, 19,
+ /* Size 8x4 */
+ 33, 33, 33, 30, 28, 26, 24, 21, 33, 33, 32, 28, 26, 25, 24, 22, 28, 27,
+ 26, 24, 22, 22, 22, 21, 21, 22, 22, 22, 22, 21, 20, 19,
+ /* Size 8x16 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29,
+ 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 31, 30, 29, 28, 24, 24,
+ 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 29, 28, 28, 26, 23, 23, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 25, 24,
+ 24, 23, 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23,
+ 22, 22, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21,
+ 20, 19,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 34, 34, 31, 31, 29, 28, 28, 25, 24, 23, 21, 21, 33, 33,
+ 33, 33, 32, 32, 30, 29, 28, 26, 26, 24, 24, 23, 22, 22, 33, 33, 33, 32,
+ 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 31, 30, 30, 30, 29, 29,
+ 28, 27, 26, 24, 24, 23, 23, 23, 22, 22, 28, 27, 27, 26, 26, 26, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21, 28, 27, 27, 26, 26, 26, 24, 24, 23, 22,
+ 22, 22, 22, 22, 21, 21, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 20, 20,
+ 19, 19,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 26, 23, 21, 21, 21, 33, 33,
+ 33, 33, 33, 33, 31, 28, 28, 28, 28, 25, 23, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33,
+ 30, 28, 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28,
+ 27, 27, 27, 25, 23, 22, 22, 22, 33, 33, 33, 33, 33, 33, 30, 28, 27, 27,
+ 27, 25, 23, 22, 22, 22, 33, 33, 33, 32, 32, 32, 30, 28, 26, 26, 26, 25,
+ 23, 22, 22, 22, 34, 33, 33, 32, 32, 32, 30, 27, 26, 26, 26, 24, 23, 22,
+ 22, 22, 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22,
+ 34, 33, 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 34, 33,
+ 32, 32, 32, 32, 29, 27, 26, 26, 26, 24, 23, 22, 22, 22, 33, 32, 31, 31,
+ 31, 31, 28, 26, 25, 25, 25, 24, 23, 22, 22, 22, 31, 30, 30, 29, 29, 29,
+ 28, 26, 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25,
+ 24, 24, 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24,
+ 24, 23, 22, 22, 22, 22, 31, 30, 29, 28, 28, 28, 27, 25, 24, 24, 24, 23,
+ 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 26, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 28, 28, 27, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27,
+ 26, 26, 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 27, 26, 26,
+ 26, 26, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 26, 26, 26, 25, 25, 25,
+ 24, 22, 22, 22, 22, 21, 21, 21, 21, 21, 25, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 21, 21, 21, 21, 21, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21,
+ 21, 20, 20, 20, 24, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 20, 20, 20, 20,
+ 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 21, 21,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22,
+ 22, 22, 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 21, 21, 21, 21, 20, 20, 19, 19, 19, 21, 21, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 21, 20, 19, 19, 19,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 28, 28, 27, 27, 27, 26,
+ 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28,
+ 28, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 28, 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23,
+ 23, 23, 23, 22, 22, 22, 22, 22, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27,
+ 27, 26, 26, 25, 25, 25, 24, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 21, 21, 21, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22,
+ 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 26, 25, 25, 25, 25, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 21, 21, 21, 21, 21,
+ 21, 21, 20, 20, 20, 21, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 21, 20, 20, 20, 20,
+ 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 21, 21, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20,
+ 20, 20, 20, 20, 19, 19, 19, 19,
+ /* Size 4x16 */
+ 33, 33, 28, 21, 33, 33, 27, 22, 33, 33, 27, 22, 33, 32, 26, 22, 33, 32,
+ 26, 22, 33, 32, 26, 22, 30, 29, 24, 22, 30, 28, 24, 22, 28, 28, 23, 22,
+ 27, 26, 22, 22, 27, 26, 22, 22, 25, 24, 22, 21, 24, 24, 22, 20, 23, 23,
+ 22, 20, 21, 22, 21, 19, 21, 22, 21, 19,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 28, 27, 27, 25, 24, 23, 21, 21, 33, 33,
+ 33, 32, 32, 32, 29, 28, 28, 26, 26, 24, 24, 23, 22, 22, 28, 27, 27, 26,
+ 26, 26, 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 21, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 21, 20, 20, 19, 19,
+ /* Size 8x32 */
+ 32, 33, 33, 31, 28, 28, 23, 21, 33, 33, 33, 31, 28, 28, 23, 21, 33, 33,
+ 33, 30, 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 33, 30,
+ 27, 27, 23, 22, 33, 33, 33, 30, 27, 27, 23, 22, 33, 33, 32, 30, 26, 26,
+ 23, 22, 34, 33, 32, 30, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22,
+ 34, 32, 32, 29, 26, 26, 23, 22, 34, 32, 32, 29, 26, 26, 23, 22, 33, 31,
+ 31, 28, 25, 25, 23, 22, 31, 30, 29, 28, 24, 24, 22, 22, 31, 29, 28, 27,
+ 24, 24, 22, 22, 31, 29, 28, 27, 24, 24, 22, 22, 31, 29, 28, 27, 24, 24,
+ 22, 22, 29, 28, 28, 26, 23, 23, 22, 22, 28, 27, 26, 24, 22, 22, 22, 22,
+ 28, 26, 26, 24, 22, 22, 22, 22, 28, 26, 26, 24, 22, 22, 22, 22, 28, 26,
+ 26, 24, 22, 22, 22, 22, 26, 26, 25, 24, 22, 22, 21, 21, 25, 24, 24, 23,
+ 22, 22, 21, 21, 24, 24, 24, 23, 22, 22, 21, 20, 24, 24, 24, 23, 22, 22,
+ 21, 20, 24, 24, 24, 23, 22, 22, 21, 20, 23, 23, 23, 23, 22, 22, 20, 20,
+ 22, 22, 22, 22, 21, 21, 20, 20, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22,
+ 22, 22, 21, 21, 20, 19, 21, 22, 22, 22, 21, 21, 20, 19, 21, 22, 22, 22,
+ 22, 22, 20, 19,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 29, 28,
+ 28, 28, 28, 26, 25, 24, 24, 24, 23, 22, 21, 21, 21, 21, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 28, 27, 26, 26, 26, 26,
+ 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 28, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 26, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 28, 28,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 21,
+ 21, 21, 20, 20, 20, 20, 20, 20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 20, 20, 20, 20, 20,
+ 19, 19, 19, 19 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 31, 30, 32, 31, 30, 29,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32,
+ 31, 31, 30, 29, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32, 30, 30,
+ 29, 28, 31, 31, 31, 31, 29, 29, 28, 27,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 29, 29, 29, 28, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 30, 29, 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 29, 28, 28, 28, 27, 30, 30, 30, 30, 30, 31, 31, 30, 29, 29, 29, 28,
+ 28, 28, 27, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30,
+ 30, 30, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 27, 27, 27, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27,
+ 26, 26, 30, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 32, 32,
+ 31, 30, 32, 32, 30, 30, 32, 31, 30, 29, 31, 31, 29, 28,
+ /* Size 8x4 */
+ 33, 33, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32,
+ 32, 32, 31, 30, 30, 29, 32, 32, 32, 31, 30, 30, 29, 28,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 31, 33, 32,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31,
+ 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 29, 29, 29, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31,
+ 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28,
+ 28, 27,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 30, 30, 30, 29, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 30, 29, 29, 29, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 29,
+ 29, 29, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 29, 28, 28,
+ 28, 27,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 30, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 30, 30,
+ 30, 29, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 30, 30, 29, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 30, 30, 30, 30, 29, 29, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 29, 29, 29, 29, 28, 32, 32, 32, 32, 31, 31, 31, 31, 31, 30,
+ 29, 29, 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29,
+ 29, 29, 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29,
+ 28, 28, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28,
+ 32, 32, 32, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 27, 26, 30, 30, 30, 30, 30, 30, 30, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 30, 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 27, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 31, 31, 31, 31, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 28, 28, 28, 27, 27, 26, 26,
+ /* Size 4x16 */
+ 33, 33, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32, 31, 31, 32, 32, 31, 30,
+ 32, 32, 31, 30, 32, 32, 31, 30, 32, 32, 30, 29, 32, 31, 30, 29, 32, 31,
+ 30, 29, 31, 31, 29, 28, 30, 30, 28, 28,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 30, 30, 30, 29, 28, 32, 32, 32, 32, 32, 31,
+ 31, 31, 30, 30, 30, 29, 29, 29, 28, 28,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32,
+ 32, 31, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 32, 32, 31,
+ 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32,
+ 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32,
+ 32, 31, 31, 31, 33, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 31, 31,
+ 31, 30, 33, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30,
+ 32, 32, 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 30, 32, 32,
+ 32, 32, 31, 30, 30, 30, 32, 32, 32, 32, 31, 30, 30, 29, 32, 32, 32, 32,
+ 31, 29, 29, 29, 32, 32, 31, 31, 31, 29, 29, 28, 32, 32, 31, 31, 30, 29,
+ 29, 28, 32, 32, 31, 31, 30, 29, 29, 28, 32, 32, 31, 31, 30, 29, 29, 28,
+ 32, 32, 31, 31, 30, 29, 29, 28, 32, 31, 31, 31, 30, 28, 28, 28, 31, 31,
+ 31, 31, 30, 28, 28, 28, 30, 30, 30, 30, 29, 28, 28, 27, 30, 30, 30, 30,
+ 29, 28, 28, 27,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 30, 30, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30,
+ 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 29, 28, 28, 28, 28, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 27, 27 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 30, 27, 33, 32, 29, 26, 30, 29, 26, 24, 27, 26, 24, 22,
+ /* Size 8x8 */
+ 33, 33, 33, 34, 30, 29, 28, 26, 33, 33, 33, 33, 30, 29, 27, 25, 33, 33,
+ 33, 33, 29, 28, 26, 25, 34, 33, 33, 32, 29, 28, 26, 24, 30, 30, 29, 29,
+ 26, 26, 24, 23, 29, 29, 28, 28, 26, 25, 23, 23, 28, 27, 26, 26, 24, 23,
+ 22, 22, 26, 25, 25, 24, 23, 23, 22, 21,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 27, 25, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 28, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 29, 29, 28, 26, 26, 26, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29,
+ 29, 27, 26, 26, 25, 24, 34, 33, 33, 33, 33, 32, 32, 32, 30, 29, 29, 27,
+ 26, 26, 25, 24, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26,
+ 25, 24, 31, 31, 30, 30, 30, 30, 30, 29, 28, 27, 27, 25, 24, 24, 24, 23,
+ 31, 30, 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 31, 30,
+ 30, 30, 29, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 29, 28, 28, 28,
+ 28, 27, 27, 27, 25, 25, 25, 23, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 22, 22, 22, 22, 22, 28, 28, 27, 27, 26, 26, 26, 26,
+ 24, 24, 24, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23,
+ 23, 22, 22, 22, 22, 21, 25, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 22,
+ 22, 22, 21, 21,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25, 25, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29, 28, 28, 28, 28,
+ 28, 27, 26, 26, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 27, 26, 25,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31,
+ 30, 30, 30, 30, 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30,
+ 30, 29, 28, 27, 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 29, 28, 27,
+ 27, 27, 27, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 27, 26,
+ 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29,
+ 29, 29, 29, 28, 28, 26, 26, 26, 26, 26, 26, 25, 24, 24, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 31,
+ 30, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 25, 25, 24, 24, 24, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 25,
+ 25, 25, 25, 24, 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 30, 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24,
+ 24, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23,
+ 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30,
+ 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26,
+ 25, 24, 24, 24, 24, 24, 23, 23, 23, 23, 31, 30, 30, 30, 30, 30, 30, 30,
+ 29, 29, 29, 29, 29, 29, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 23, 23, 23, 23, 30, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 23, 23, 23, 23, 23, 23,
+ 23, 23, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 26,
+ 25, 25, 25, 25, 25, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26,
+ 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 22, 22, 28, 28, 28, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26,
+ 26, 25, 24, 24, 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 28, 28, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24,
+ 24, 24, 24, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26,
+ 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23,
+ 22, 22, 22, 22, 22, 22, 22, 21, 21, 21, 26, 26, 26, 25, 25, 25, 25, 25,
+ 25, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22,
+ 22, 22, 21, 21, 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21,
+ 21, 21, 25, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 4x8 */
+ 33, 33, 29, 28, 33, 33, 28, 27, 33, 32, 28, 26, 33, 32, 28, 26, 30, 28,
+ 26, 24, 29, 28, 24, 23, 27, 26, 23, 22, 25, 24, 23, 22,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 30, 29, 27, 25, 33, 33, 32, 32, 28, 28, 26, 24, 29, 28,
+ 28, 28, 26, 24, 23, 23, 28, 27, 26, 26, 24, 23, 22, 22,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 27, 27, 26, 33, 33,
+ 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32,
+ 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26,
+ 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 31, 30, 29, 29, 28, 24, 24, 24,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 29, 28,
+ 27, 27, 25, 23, 23, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26,
+ 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 24, 24, 24, 24, 23, 22,
+ 22, 21,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 34, 34, 33, 31, 31, 31, 29, 28, 28, 26, 24, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 29, 29, 28, 26, 26, 26, 24, 33, 33, 33, 33,
+ 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 33, 33, 33, 33, 32, 32,
+ 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 31, 31, 30, 30, 30, 29, 29, 29,
+ 28, 27, 27, 25, 24, 24, 24, 23, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24,
+ 24, 23, 22, 22, 22, 22, 28, 27, 27, 27, 26, 26, 26, 26, 24, 24, 24, 23,
+ 22, 22, 22, 22, 27, 26, 26, 26, 26, 25, 25, 25, 24, 23, 23, 22, 22, 22,
+ 22, 21,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 27, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 31, 29, 28, 28, 28, 28, 26, 24, 33, 33, 33, 33,
+ 33, 33, 33, 32, 31, 29, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32,
+ 30, 28, 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28,
+ 27, 27, 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27,
+ 27, 27, 26, 24, 33, 33, 33, 33, 33, 33, 33, 32, 30, 28, 27, 27, 27, 27,
+ 26, 24, 33, 33, 33, 33, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24,
+ 34, 33, 33, 32, 32, 32, 32, 32, 30, 28, 26, 26, 26, 26, 26, 24, 34, 33,
+ 33, 32, 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32,
+ 32, 32, 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32,
+ 32, 31, 29, 28, 26, 26, 26, 26, 25, 24, 34, 33, 33, 32, 32, 32, 32, 31,
+ 29, 28, 26, 26, 26, 26, 25, 24, 33, 33, 32, 32, 31, 31, 31, 31, 29, 27,
+ 26, 26, 26, 26, 25, 24, 32, 32, 31, 31, 30, 30, 30, 30, 28, 26, 25, 25,
+ 25, 25, 24, 23, 31, 31, 30, 29, 29, 29, 29, 29, 28, 26, 24, 24, 24, 24,
+ 24, 23, 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23,
+ 31, 30, 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30,
+ 29, 29, 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 31, 30, 29, 29,
+ 28, 28, 28, 28, 27, 26, 24, 24, 24, 24, 23, 23, 30, 29, 28, 28, 28, 28,
+ 28, 28, 26, 24, 23, 23, 23, 23, 23, 23, 29, 28, 28, 27, 27, 27, 27, 26,
+ 25, 24, 23, 23, 23, 23, 22, 22, 28, 28, 27, 26, 26, 26, 26, 26, 24, 23,
+ 22, 22, 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22,
+ 22, 22, 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22,
+ 22, 22, 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 23, 22, 22, 22, 22, 22, 22, 26, 26,
+ 26, 25, 25, 25, 25, 24, 24, 23, 22, 22, 22, 22, 22, 21, 26, 25, 25, 24,
+ 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 23, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24, 24, 24,
+ 23, 22, 22, 22, 22, 22, 21, 21,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 30, 29,
+ 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28, 28, 27, 26, 26,
+ 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 29, 29, 29, 29, 29, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30,
+ 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 25, 24, 24, 24, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 25, 25, 25, 25,
+ 24, 24, 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23,
+ 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22, 28, 28, 27, 27,
+ 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23,
+ 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27,
+ 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22,
+ 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26,
+ 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22,
+ 22, 22, 28, 28, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25,
+ 24, 24, 24, 24, 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 27, 26,
+ 26, 26, 26, 26, 26, 26, 26, 26, 25, 25, 25, 25, 25, 24, 24, 23, 23, 23,
+ 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 21, 21, 24, 24, 24, 24, 24, 24,
+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 23, 23, 23, 23, 23, 23, 23, 22, 22,
+ 22, 22, 22, 22, 21, 21, 21, 21,
+ /* Size 4x16 */
+ 33, 33, 29, 28, 33, 33, 29, 27, 33, 33, 28, 27, 33, 33, 28, 27, 33, 32,
+ 28, 26, 33, 32, 28, 26, 33, 32, 28, 26, 33, 31, 27, 26, 31, 29, 26, 24,
+ 30, 28, 26, 24, 30, 28, 26, 24, 28, 27, 24, 23, 27, 26, 23, 22, 27, 26,
+ 23, 22, 26, 25, 23, 22, 24, 24, 22, 22,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 30, 28, 27, 27, 26, 24, 33, 33,
+ 33, 33, 32, 32, 32, 31, 29, 28, 28, 27, 26, 26, 25, 24, 29, 29, 28, 28,
+ 28, 28, 28, 27, 26, 26, 26, 24, 23, 23, 23, 22, 28, 27, 27, 27, 26, 26,
+ 26, 26, 24, 24, 24, 23, 22, 22, 22, 22,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 31, 28, 28, 27, 33, 33, 33, 33, 31, 28, 28, 26, 33, 33,
+ 33, 33, 31, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33,
+ 30, 27, 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 33, 33, 30, 27,
+ 27, 26, 33, 33, 33, 33, 30, 27, 27, 26, 33, 33, 32, 32, 30, 26, 26, 26,
+ 34, 33, 32, 32, 30, 26, 26, 26, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33,
+ 32, 32, 29, 26, 26, 25, 34, 33, 32, 32, 29, 26, 26, 25, 34, 33, 32, 32,
+ 29, 26, 26, 25, 33, 32, 31, 31, 29, 26, 26, 25, 32, 31, 30, 30, 28, 25,
+ 25, 24, 31, 30, 29, 29, 28, 24, 24, 24, 31, 29, 28, 28, 27, 24, 24, 23,
+ 31, 29, 28, 28, 27, 24, 24, 23, 31, 29, 28, 28, 27, 24, 24, 23, 31, 29,
+ 28, 28, 27, 24, 24, 23, 30, 28, 28, 28, 26, 23, 23, 23, 29, 28, 27, 27,
+ 25, 23, 23, 22, 28, 27, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22,
+ 22, 22, 28, 26, 26, 26, 24, 22, 22, 22, 28, 26, 26, 26, 24, 22, 22, 22,
+ 28, 26, 26, 26, 24, 22, 22, 22, 26, 26, 25, 25, 24, 22, 22, 22, 26, 25,
+ 24, 24, 23, 22, 22, 22, 24, 24, 24, 24, 23, 22, 22, 21, 24, 24, 24, 24,
+ 23, 22, 22, 21,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 32, 31, 31,
+ 31, 31, 31, 30, 29, 28, 28, 28, 28, 28, 26, 26, 24, 24, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 29, 29, 29, 28,
+ 28, 27, 26, 26, 26, 26, 26, 25, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 25, 24, 24, 24, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 31, 30, 29, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 25, 24,
+ 24, 24, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 28, 27, 27, 27, 27, 26, 25, 24, 24, 24, 24, 24, 24, 23, 23, 23, 28, 28,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24,
+ 24, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 22, 28, 28, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 26, 26, 26, 26, 25, 24, 24, 24, 24, 24, 23, 23, 22,
+ 22, 22, 22, 22, 22, 22, 22, 22, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26,
+ 25, 25, 25, 25, 25, 24, 24, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22,
+ 22, 22, 21, 21 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 31,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31,
+ 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ /* Size 4x8 */
+ 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 32, 32, 32, 32,
+ 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32,
+ 31, 30,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 30, 30,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 31, 31, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31,
+ 30, 30, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 30, 30, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 31, 30, 30,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31,
+ 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+ 30, 30, 30, 30, 30, 30, 30, 30,
+ /* Size 4x16 */
+ 33, 33, 33, 32, 33, 33, 33, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 31, 32, 32,
+ 32, 31, 32, 32, 32, 31, 32, 32, 32, 31,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 31, 31, 31,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32,
+ 33, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 32, 33, 32,
+ 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32, 32, 31, 33, 32, 32, 32,
+ 32, 32, 31, 31, 33, 32, 32, 32, 32, 32, 31, 31, 33, 32, 32, 32, 32, 32,
+ 31, 31, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30,
+ 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32,
+ 32, 32, 32, 32, 31, 30, 32, 32, 32, 32, 32, 32, 31, 30, 32, 32, 32, 32,
+ 32, 32, 31, 30,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 32, 29, 30, 29, 29, 26,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 34, 33, 31, 31, 33, 33, 33, 33, 33, 32, 30, 30, 33, 33,
+ 33, 33, 33, 32, 30, 30, 33, 33, 33, 33, 33, 32, 29, 29, 34, 33, 33, 33,
+ 32, 32, 29, 29, 33, 32, 32, 32, 32, 31, 28, 28, 31, 30, 30, 29, 29, 28,
+ 26, 26, 31, 30, 30, 29, 29, 28, 26, 26,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 32, 31, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 29, 29, 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29,
+ 34, 34, 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 34, 34,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 32, 32, 32,
+ 32, 32, 31, 31, 31, 31, 31, 30, 28, 28, 28, 28, 31, 31, 31, 30, 30, 30,
+ 30, 30, 30, 30, 30, 28, 28, 27, 27, 27, 31, 30, 30, 30, 30, 30, 29, 29,
+ 29, 29, 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29,
+ 29, 28, 27, 26, 26, 26, 31, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 28,
+ 27, 26, 26, 26,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 33,
+ 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 32, 32, 31, 30,
+ 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30,
+ 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 30, 30, 30, 30, 30, 29, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31,
+ 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30,
+ 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 31, 30, 29,
+ 29, 29, 29, 29, 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30,
+ 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29,
+ 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 34, 34, 34, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 31, 31,
+ 31, 31, 31, 31, 31, 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 32, 32,
+ 32, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 29, 28, 28, 28, 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 31, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26,
+ 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29,
+ 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26,
+ 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29,
+ 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28,
+ 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26,
+ 26, 26, 26, 26, 26, 26, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30, 30, 29,
+ 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 26, 26, 26, 26, 26,
+ 26, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26,
+ /* Size 4x8 */
+ 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32,
+ 32, 28, 33, 31, 31, 28, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 30, 30, 33, 33, 33, 32, 32, 31, 28, 28, 33, 33,
+ 33, 32, 32, 31, 28, 28, 30, 29, 29, 28, 28, 28, 26, 26,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32,
+ 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 33, 32,
+ 31, 31, 31, 31, 28, 26, 31, 30, 30, 29, 29, 29, 28, 26, 31, 30, 29, 28,
+ 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28,
+ 27, 25,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 33, 31, 31, 31, 31, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 30, 30, 30, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 31, 30, 29, 29, 29, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 31, 29, 28, 28, 28, 31, 31, 30, 30, 30, 30, 30, 30, 29, 29, 29, 28,
+ 28, 27, 27, 27, 29, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 26, 26, 25,
+ 25, 25,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 29, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 31, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31,
+ 30, 29, 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29,
+ 28, 27, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 31, 30, 29, 28, 27, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 31, 30, 28, 28, 26, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 31, 30, 28, 28, 26, 34, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 31, 30, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31,
+ 29, 28, 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28,
+ 27, 26, 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26,
+ 34, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 34, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 31, 29, 28, 27, 26, 33, 33, 33, 32, 32, 31,
+ 31, 31, 31, 31, 31, 30, 29, 28, 27, 26, 33, 32, 32, 31, 31, 31, 31, 31,
+ 31, 31, 31, 29, 28, 28, 26, 25, 32, 32, 31, 31, 30, 30, 30, 30, 30, 30,
+ 30, 29, 28, 27, 26, 25, 31, 31, 30, 30, 30, 29, 29, 29, 29, 29, 29, 28,
+ 28, 26, 26, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26,
+ 25, 24, 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24,
+ 31, 30, 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30,
+ 30, 29, 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29,
+ 29, 28, 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 31, 30, 30, 29, 29, 28,
+ 28, 28, 28, 28, 28, 28, 27, 26, 25, 24, 30, 30, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 27, 26, 26, 24, 23,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 31, 30, 30, 30, 30, 30, 30, 30, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30,
+ 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 31, 30, 29, 29, 29, 29, 29,
+ 29, 29, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 31, 30, 30, 29, 29, 29, 29, 29, 29, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30,
+ 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28,
+ 28, 28, 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 31,
+ 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 32, 32, 32, 32, 32, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 30, 29, 29, 28, 28,
+ 28, 28, 28, 28, 28, 27, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30,
+ 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 27, 27, 27, 27, 27,
+ 27, 26, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28,
+ 28, 28, 28, 28, 28, 28, 28, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29,
+ 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 27, 27, 27, 27, 27, 27,
+ 27, 27, 26, 26, 26, 25, 25, 25, 25, 25, 25, 24, 28, 28, 28, 27, 27, 27,
+ 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 25, 25,
+ 24, 24, 24, 24, 24, 24, 24, 23,
+ /* Size 4x16 */
+ 33, 33, 33, 30, 33, 33, 33, 30, 33, 33, 33, 29, 33, 33, 33, 29, 33, 33,
+ 33, 29, 33, 33, 33, 29, 33, 32, 32, 28, 33, 32, 32, 28, 33, 32, 32, 28,
+ 33, 32, 32, 28, 33, 32, 32, 28, 32, 31, 31, 28, 31, 29, 29, 26, 30, 28,
+ 28, 26, 30, 28, 28, 26, 30, 28, 28, 26,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 31, 30, 30, 30, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 31, 29, 28, 28, 28, 30, 30, 29, 29, 29, 29,
+ 28, 28, 28, 28, 28, 28, 26, 26, 26, 26,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 31, 29, 33, 33, 33, 33, 33, 33, 31, 29, 33, 33,
+ 33, 33, 33, 33, 31, 28, 33, 33, 33, 33, 33, 33, 31, 28, 33, 33, 33, 33,
+ 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33,
+ 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28,
+ 33, 33, 33, 33, 33, 33, 30, 28, 33, 33, 33, 33, 33, 33, 30, 28, 33, 33,
+ 33, 33, 33, 33, 30, 28, 33, 33, 33, 32, 32, 32, 30, 28, 33, 33, 33, 32,
+ 32, 32, 30, 28, 34, 33, 33, 32, 32, 32, 30, 27, 34, 33, 32, 32, 32, 32,
+ 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27,
+ 34, 33, 32, 32, 32, 32, 29, 27, 34, 33, 32, 32, 32, 32, 29, 27, 34, 33,
+ 32, 32, 32, 32, 29, 27, 33, 33, 32, 31, 31, 31, 29, 27, 33, 32, 31, 31,
+ 31, 31, 28, 26, 32, 31, 30, 30, 30, 30, 28, 26, 31, 30, 30, 29, 29, 29,
+ 28, 26, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25,
+ 31, 30, 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 31, 30,
+ 29, 28, 28, 28, 27, 25, 31, 30, 29, 28, 28, 28, 27, 25, 30, 29, 28, 28,
+ 28, 28, 26, 24,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 34, 34, 34, 33, 33, 32, 31, 31, 31, 31, 31, 31, 31, 30, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 31, 30, 30, 30, 30, 30, 30, 30, 29, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 31, 30, 30, 29,
+ 29, 29, 29, 29, 29, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28,
+ 28, 28, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 31, 31, 30, 29, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 30, 30,
+ 30, 30, 30, 30, 30, 30, 30, 30, 30, 29, 29, 29, 29, 29, 29, 29, 28, 28,
+ 28, 27, 27, 27, 27, 27, 27, 26, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28,
+ 28, 28, 28, 28, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 25, 25, 25,
+ 25, 25, 25, 24 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32, 32, 32, 33, 32,
+ 32, 32, 33, 32, 32, 32, 33, 32, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 33, 33, 33, 32, 32, 32, 32, 32, 33, 33, 33, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32,
+ 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32,
+ 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 32, 32, 32, 32, 32, 32, 33, 33, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 8x8 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ /* Size 16x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32,
+ /* Size 32x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34,
+ 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 34, 34, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 4x8 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 8x4 */
+ 33, 33, 33, 33, 33, 33, 33, 34, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 32, 32, 33, 33, 33, 33, 33, 33, 32, 32,
+ /* Size 8x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33,
+ 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32,
+ 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 34, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33,
+ 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33,
+ 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 34, 34, 33, 33, 33, 33, 33, 33,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 33, 33,
+ 32, 32, 33, 33, 32, 32, 34, 33, 32, 32,
+ /* Size 16x4 */
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32,
+ 32, 32, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33, 33, 33, 33, 32, 32, 32,
+ 34, 33, 33, 33, 33, 32, 32, 32, 34, 33, 33, 33, 33, 32, 32, 32, 34, 33,
+ 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 32, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
+ 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+ {
+ { /* Luma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ { /* Chroma */
+ /* Size 4x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32,
+ /* Size 16x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 32x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 4x16 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 16x4 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ /* Size 8x32 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32,
+ /* Size 32x8 */
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32,
+ 32, 32, 32, 32 },
+ },
+};
diff --git a/third_party/aom/av1/common/quant_common.h b/third_party/aom/av1/common/quant_common.h
new file mode 100644
index 0000000000..d1f52a660b
--- /dev/null
+++ b/third_party/aom/av1/common/quant_common.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_QUANT_COMMON_H_
+#define AOM_AV1_COMMON_QUANT_COMMON_H_
+
+#include "aom/aom_codec.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/entropy.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MINQ 0
+#define MAXQ 255
+#define QINDEX_RANGE (MAXQ - MINQ + 1)
+#define QINDEX_BITS 8
+// Number of QM sets stored is NUM_QM_LEVELS, i.e. 1 << QM_LEVEL_BITS (16).
+#define QM_LEVEL_BITS 4
+#define NUM_QM_LEVELS (1 << QM_LEVEL_BITS)
+/* Range of QMs is between the first and last value, with an offset applied to
+ * inter blocks. */
+#define DEFAULT_QM_Y 10
+#define DEFAULT_QM_U 11
+#define DEFAULT_QM_V 12
+#define DEFAULT_QM_FIRST 5
+#define DEFAULT_QM_LAST 9
+
+struct AV1Common;
+
+int16_t av1_dc_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_Q3(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_dc_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+int16_t av1_ac_quant_QTX(int qindex, int delta, aom_bit_depth_t bit_depth);
+
+int av1_get_qindex(const struct segmentation *seg, int segment_id,
+ int base_qindex);
+// Maps qindex (MINQ..MAXQ) evenly onto the QM levels first..last, inclusive.
+static INLINE int aom_get_qmlevel(int qindex, int first, int last) {
+  const int num_levels = last + 1 - first;
+  return first + (qindex * num_levels) / QINDEX_RANGE;
+}
+void av1_qm_init(struct AV1Common *cm);
+const qm_val_t *av1_iqmatrix(struct AV1Common *cm, int qindex, int comp,
+ TX_SIZE tx_size);
+const qm_val_t *av1_qmatrix(struct AV1Common *cm, int qindex, int comp,
+ TX_SIZE tx_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_QUANT_COMMON_H_
diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c
new file mode 100644
index 0000000000..3203efce4b
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.c
@@ -0,0 +1,1162 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+
+#define USE_PRECOMPUTED_WEDGE_MASK 1
+#define USE_PRECOMPUTED_WEDGE_SIGN 1
+
+// Decides whether a warped prediction should be built for this block.
+// Returns 1 when warping applies and 0 otherwise. When final_warp_params is
+// non-NULL it is reset to default_warp_params (once the scale check passes)
+// and, on a positive decision, overwritten with the chosen model: the block's
+// local model takes priority over the global one.
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+                   const WarpTypesAllowed *const warp_types,
+                   const WarpedMotionParams *const gm_params,
+                   int build_for_obmc, int x_scale, int y_scale,
+                   WarpedMotionParams *final_warp_params) {
+  const int unscaled =
+      (x_scale == SCALE_SUBPEL_SHIFTS) && (y_scale == SCALE_SUBPEL_SHIFTS);
+  // Warping is never used together with reference scaling.
+  if (!unscaled) return 0;
+
+  if (final_warp_params != NULL) *final_warp_params = default_warp_params;
+
+  // OBMC neighbour predictions are never warped.
+  if (build_for_obmc) return 0;
+
+  const WarpedMotionParams *selected = NULL;
+  if (warp_types->local_warp_allowed && !mbmi->wm_params.invalid) {
+    selected = &mbmi->wm_params;
+  } else if (warp_types->global_warp_allowed && !gm_params->invalid) {
+    selected = gm_params;
+  }
+  if (selected == NULL) return 0;
+
+  if (final_warp_params != NULL)
+    memcpy(final_warp_params, selected, sizeof(*final_warp_params));
+  return 1;
+}
+
+// Produces the w x h inter prediction of one reference for one plane.
+// Dispatches to the warp filter when av1_allow_warp() approves it (blocks of
+// at least 8x8 only, and only while integer MVs are not forced for the frame);
+// otherwise to the high-bitdepth or regular convolution-based predictor,
+// depending on the current buffer's YV12_FLAG_HIGHBITDEPTH flag.
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, const SubpelParams *subpel_params,
+                              const struct scale_factors *sf, int w, int h,
+                              ConvolveParams *conv_params,
+                              InterpFilters interp_filters,
+                              const WarpTypesAllowed *warp_types, int p_col,
+                              int p_row, int plane, int ref,
+                              const MB_MODE_INFO *mi, int build_for_obmc,
+                              const MACROBLOCKD *xd, int can_use_previous) {
+  // Make sure the selected motion mode is valid for this configuration
+  assert_motion_mode_valid(mi->motion_mode, xd->global_motion, xd, mi,
+                           can_use_previous);
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  const int is_intrabc = mi->use_intrabc;
+  const int hbd_flag = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  WarpedMotionParams final_warp_params;
+  int do_warp = 0;
+  if (w >= 8 && h >= 8) {
+    do_warp = av1_allow_warp(
+        mi, warp_types, &xd->global_motion[mi->ref_frame[ref]], build_for_obmc,
+        subpel_params->xs, subpel_params->ys, &final_warp_params);
+  }
+  assert(IMPLIES(is_intrabc, !do_warp));
+
+  if (do_warp && xd->cur_frame_force_integer_mv == 0) {
+    // The warp filter reads from the start of the reference plane (buf0)
+    // rather than from the block-offset src pointer.
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const struct buf_2d *const pre_buf = &pd->pre[ref];
+    av1_warp_plane(&final_warp_params, hbd_flag, xd->bd, pre_buf->buf0,
+                   pre_buf->width, pre_buf->height, pre_buf->stride, dst,
+                   p_col, p_row, w, h, dst_stride, pd->subsampling_x,
+                   pd->subsampling_y, conv_params);
+    return;
+  }
+
+  if (hbd_flag) {
+    highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf,
+                           w, h, conv_params, interp_filters, is_intrabc,
+                           xd->bd);
+    return;
+  }
+
+  inter_predictor(src, src_stride, dst, dst_stride, subpel_params, sf, w, h,
+                  conv_params, interp_filters, is_intrabc);
+}
+
+#if USE_PRECOMPUTED_WEDGE_MASK
+// One-dimensional master weight profiles, MASK_MASTER_SIZE entries each,
+// ramping from 0 up to 64. init_wedge_master_masks() replicates them row by
+// row to build the two-dimensional master masks: the oblique pair is copied
+// with a per-row shift (even rows use the "even" profile, odd rows the "odd"
+// one) into WEDGE_OBLIQUE63, the vertical profile is copied unshifted into
+// every row of WEDGE_VERTICAL.
+static const uint8_t wedge_master_oblique_odd[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 18,
+ 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_oblique_even[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 4, 11, 27,
+ 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+static const uint8_t wedge_master_vertical[MASK_MASTER_SIZE] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 7, 21,
+ 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+// Copies `width` bytes from src to dst displaced by `shift` positions.
+// A non-negative shift moves the data towards higher indices and pads the
+// first `shift` bytes with src[0]; a negative shift moves it towards lower
+// indices and pads the last -shift bytes with src[width - 1].
+static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
+  if (shift < 0) {
+    const int left = -shift;
+    memcpy(dst, src + left, width - left);
+    memset(dst + width - left, src[width - 1], left);
+    return;
+  }
+  memcpy(dst + shift, src, width - shift);
+  memset(dst, src[0], shift);
+}
+#endif // USE_PRECOMPUTED_WEDGE_MASK
+
+// Sign-flip flag per [block size][wedge index]. get_wedge_mask_inplace() XORs
+// the flag with the requested sign to choose between the master mask and its
+// complement. Rows marked "not used" line up with the block sizes whose
+// wedge_params_lookup entry has no codebook. When the table is not
+// precomputed it is filled at start-up by init_wedge_signs().
+#if USE_PRECOMPUTED_WEDGE_SIGN
+/* clang-format off */
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
+ { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, // not used
+};
+/* clang-format on */
+#else
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_signflip_lookup[BLOCK_SIZES_ALL][MAX_WEDGE_TYPES]);
+#endif // USE_PRECOMPUTED_WEDGE_SIGN
+
+// Master wedge masks, indexed [negative][direction]. Index [0] holds the
+// masters and [1] their complements; both are filled by
+// init_wedge_master_masks().
+DECLARE_ALIGNED(
+ 16, static uint8_t,
+ wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+// Backing store for the per-block-size contiguous masks.
+// 4 * MAX_WEDGE_SQUARE is an easy to compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(16, static uint8_t,
+ wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
+
+// Per block size and sign: the mask pointers handed out through
+// wedge_params_lookup[].masks. init_wedge_masks() zeroes this table and then
+// points the entries into wedge_mask_buf.
+static wedge_masks_type wedge_masks[BLOCK_SIZES_ALL][2];
+
+// Wedge codebooks, 16 entries each. Every entry names a wedge direction and
+// two offsets; get_wedge_mask_inplace() scales x_offset / y_offset by the
+// block width / height and shifts right by 3, i.e. the offsets are expressed
+// in eighths of the block dimension.
+// NOTE(review): the initialiser order is presumably
+// { direction, x_offset, y_offset } - confirm against wedge_code_type.
+//
+// wedge_params_lookup assigns the "hgtw" book to blocks taller than wide
+// (8X16, 16X32, 8X32), "hltw" to blocks wider than tall (16X8, 32X16, 32X8)
+// and "heqw" to square blocks (8X8, 16X16, 32X32).
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
+ { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+ { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
+ { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
+ { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
+ { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
+ { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
+ { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
+ { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
+ { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
+};
+
+// Wedge parameters per block size. The code in this file reads the members
+// `bits` (number of wedge-index bits; 0 means the block size has no wedge
+// masks and is skipped by init_wedge_masks()), `codebook`, `signflip` and
+// `masks`.
+// NOTE(review): the initialiser order is presumably
+// { bits, codebook, signflip, masks } - confirm against wedge_params_type.
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL] = {
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
+ wedge_masks[BLOCK_8X8] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
+ wedge_masks[BLOCK_8X16] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
+ wedge_masks[BLOCK_16X8] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
+ wedge_masks[BLOCK_16X16] },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
+ wedge_masks[BLOCK_16X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
+ wedge_masks[BLOCK_32X16] },
+ { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
+ wedge_masks[BLOCK_32X32] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+ { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
+ wedge_masks[BLOCK_8X32] },
+ { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
+ wedge_masks[BLOCK_32X8] },
+ { 0, NULL, NULL, NULL },
+ { 0, NULL, NULL, NULL },
+};
+
+// Returns a pointer into the master mask (row stride MASK_MASTER_STRIDE) for
+// the given wedge of the given block size, without copying anything.
+//   wedge_index: index into the block size's codebook; must lie in
+//                [0, 1 << get_wedge_bits_lookup(sb_type)).
+//   neg:         requested sign; XORed with the codebook entry's sign-flip
+//                flag to choose between the master mask and its complement.
+//   sb_type:     block size whose wedge parameters are used.
+// The codebook offsets are in eighths of the block dimensions; the returned
+// pointer is displaced from the centre of the master mask by those offsets.
+static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg,
+                                             BLOCK_SIZE sb_type) {
+  // Validate the index before it is used to address the codebook and the
+  // sign-flip table, so a bad index trips the assert instead of first
+  // reading out of bounds.
+  assert(wedge_index >= 0 &&
+         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+
+  const wedge_params_type *const params = &wedge_params_lookup[sb_type];
+  const wedge_code_type *const code = params->codebook + wedge_index;
+  const uint8_t wsignflip = params->signflip[wedge_index];
+
+  const int bh = block_size_high[sb_type];
+  const int bw = block_size_wide[sb_type];
+  const int woff = (code->x_offset * bw) >> 3;
+  const int hoff = (code->y_offset * bh) >> 3;
+
+  return wedge_mask_obl[neg ^ wsignflip][code->direction] +
+         MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
+         MASK_MASTER_SIZE / 2 - woff;
+}
+
+// Returns the blend mask for a masked compound prediction: the contiguous
+// wedge mask for COMPOUND_WEDGE, or the stored segmentation mask for
+// COMPOUND_DIFFWTD. Any other type asserts and yields NULL.
+const uint8_t *av1_get_compound_type_mask(
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type) {
+  assert(is_masked_compound_type(comp_data->type));
+
+  if (comp_data->type == COMPOUND_WEDGE) {
+    return av1_get_contiguous_soft_mask(comp_data->wedge_index,
+                                        comp_data->wedge_sign, sb_type);
+  }
+  if (comp_data->type == COMPOUND_DIFFWTD) {
+    return comp_data->seg_mask;
+  }
+
+  assert(0);
+  return NULL;
+}
+
+// Builds a difference-weighted mask from two intermediate (CONV_BUF_TYPE)
+// predictions. For every pixel the absolute difference is rounded down to
+// pixel precision, divided by DIFF_FACTOR, added to mask_base and clamped to
+// [0, AOM_BLEND_A64_MAX_ALPHA]; which_inverse selects the complemented
+// weight. The mask is written densely with a row stride of w.
+static void diffwtd_mask_d16(uint8_t *mask, int which_inverse, int mask_base,
+                             const CONV_BUF_TYPE *src0, int src0_stride,
+                             const CONV_BUF_TYPE *src1, int src1_stride, int h,
+                             int w, ConvolveParams *conv_params, int bd) {
+  // Number of fractional bits still carried by the intermediate buffers.
+  const int round =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+
+  for (int row = 0; row < h; ++row) {
+    const int off0 = row * src0_stride;
+    const int off1 = row * src1_stride;
+    const int off_mask = row * w;
+    for (int col = 0; col < w; ++col) {
+      int diff = abs(src0[off0 + col] - src1[off1 + col]);
+      diff = ROUND_POWER_OF_TWO(diff, round);
+      const int m =
+          clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+      if (which_inverse) {
+        mask[off_mask + col] = AOM_BLEND_A64_MAX_ALPHA - m;
+      } else {
+        mask[off_mask + col] = m;
+      }
+    }
+  }
+}
+
+// C reference implementation: builds the difference-weighted compound mask
+// from two intermediate-precision predictions. DIFFWTD_38 and DIFFWTD_38_INV
+// both use a base weight of 38; the _INV variant stores the complemented
+// weights. Any other mask type asserts and leaves the mask untouched.
+void av1_build_compound_diffwtd_mask_d16_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+    ConvolveParams *conv_params, int bd) {
+  if (mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV) {
+    const int which_inverse = (mask_type == DIFFWTD_38_INV);
+    diffwtd_mask_d16(mask, which_inverse, 38, src0, src0_stride, src1,
+                     src1_stride, h, w, conv_params, bd);
+  } else {
+    assert(0);
+  }
+}
+
+// Builds a difference-weighted mask from two 8-bit predictions. For every
+// pixel the absolute difference is divided by DIFF_FACTOR, added to
+// mask_base and clamped to [0, AOM_BLEND_A64_MAX_ALPHA]; which_inverse
+// selects the complemented weight. The mask is written densely with a row
+// stride of w.
+static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base,
+                         const uint8_t *src0, int src0_stride,
+                         const uint8_t *src1, int src1_stride, int h, int w) {
+  for (int row = 0; row < h; ++row) {
+    const int off0 = row * src0_stride;
+    const int off1 = row * src1_stride;
+    const int off_mask = row * w;
+    for (int col = 0; col < w; ++col) {
+      const int a = (int)src0[off0 + col];
+      const int b = (int)src1[off1 + col];
+      const int diff = abs(a - b);
+      const int m =
+          clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
+      if (which_inverse) {
+        mask[off_mask + col] = AOM_BLEND_A64_MAX_ALPHA - m;
+      } else {
+        mask[off_mask + col] = m;
+      }
+    }
+  }
+}
+
+// C reference implementation: builds the difference-weighted compound mask
+// from two 8-bit predictions. DIFFWTD_38 and DIFFWTD_38_INV both use a base
+// weight of 38; the _INV variant stores the complemented weights. Any other
+// mask type asserts and leaves the mask untouched.
+void av1_build_compound_diffwtd_mask_c(uint8_t *mask,
+                                       DIFFWTD_MASK_TYPE mask_type,
+                                       const uint8_t *src0, int src0_stride,
+                                       const uint8_t *src1, int src1_stride,
+                                       int h, int w) {
+  if (mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV) {
+    const int which_inverse = (mask_type == DIFFWTD_38_INV);
+    diffwtd_mask(mask, which_inverse, 38, src0, src0_stride, src1, src1_stride,
+                 h, w);
+  } else {
+    assert(0);
+  }
+}
+
+// Builds a difference-weighted mask from two 16-bit (high bit-depth)
+// predictions. Per pixel: the absolute difference is scaled down by
+// (bd - 8) bits, divided by DIFF_FACTOR, added to mask_base and limited to
+// at most AOM_BLEND_A64_MAX_ALPHA; which_inverse stores the complemented
+// weight instead. The mask is written densely (rows advance by w).
+//
+// The loop is written out four times - bd == 8 vs. bd > 8, inverse vs.
+// non-inverse - so that no per-pixel branch on those conditions remains and
+// the bd == 8 case skips the shift entirely.
+// NOTE(review): negative_to_zero() is presumably a clamp of negative values
+// to zero - confirm against its definition.
+static AOM_FORCE_INLINE void diffwtd_mask_highbd(
+ uint8_t *mask, int which_inverse, int mask_base, const uint16_t *src0,
+ int src0_stride, const uint16_t *src1, int src1_stride, int h, int w,
+ const unsigned int bd) {
+ assert(bd >= 8);
+ if (bd == 8) {
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {
+ // Bring the difference back to 8-bit scale before weighting it.
+ const unsigned int bd_shift = bd - 8;
+ if (which_inverse) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = AOM_BLEND_A64_MAX_ALPHA - m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int diff =
+ (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
+ unsigned int m = negative_to_zero(mask_base + diff);
+ m = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
+ mask[j] = m;
+ }
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+}
+
+// C reference implementation: builds the difference-weighted compound mask
+// from two high bit-depth predictions. src0/src1 arrive as uint8_t pointers
+// and are converted with CONVERT_TO_SHORTPTR before use. DIFFWTD_38 and
+// DIFFWTD_38_INV both use a base weight of 38; the _INV variant stores the
+// complemented weights. Any other mask type asserts and leaves the mask
+// untouched.
+void av1_build_compound_diffwtd_mask_highbd_c(
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+    int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+    int bd) {
+  if (mask_type == DIFFWTD_38) {
+    diffwtd_mask_highbd(mask, 0, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+                        CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+  } else if (mask_type == DIFFWTD_38_INV) {
+    diffwtd_mask_highbd(mask, 1, 38, CONVERT_TO_SHORTPTR(src0), src0_stride,
+                        CONVERT_TO_SHORTPTR(src1), src1_stride, h, w, bd);
+  } else {
+    assert(0);
+  }
+}
+
+// Fills wedge_mask_obl[][][] with the MASK_MASTER_SIZE x MASK_MASTER_SIZE
+// master masks for every wedge direction.
+//
+// Step 1 builds the two prototypes at index [0]: WEDGE_OBLIQUE63 and
+// WEDGE_VERTICAL - either from the precomputed 1-D profiles or, when
+// USE_PRECOMPUTED_WEDGE_MASK is 0, from a tanh() ramp.
+// Step 2 derives every other direction and all complements (index [1]) from
+// those two prototypes by transposition, mirroring and subtraction from
+// (1 << WEDGE_WEIGHT_BITS).
+//
+// Declared with (void): an empty parameter list in a C definition does not
+// form a prototype.
+static void init_wedge_master_masks(void) {
+  int i, j;
+  const int w = MASK_MASTER_SIZE;
+  const int h = MASK_MASTER_SIZE;
+  const int stride = MASK_MASTER_STRIDE;
+// Note: index [0] stores the masters, and [1] its complement.
+#if USE_PRECOMPUTED_WEDGE_MASK
+  // Generate prototype by shifting the masters. Rows are handled in pairs:
+  // the even row uses the "even" profile, the odd row the "odd" profile, and
+  // the shift drops by one between the two rows of each pair.
+  int shift = h / 4;
+  for (i = 0; i < h; i += 2) {
+    shift_copy(wedge_master_oblique_even,
+               &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride], shift,
+               MASK_MASTER_SIZE);
+    shift--;
+    shift_copy(wedge_master_oblique_odd,
+               &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride], shift,
+               MASK_MASTER_SIZE);
+    // Every row of the vertical master is the same unshifted profile.
+    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
+           wedge_master_vertical,
+           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+    memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
+           wedge_master_vertical,
+           MASK_MASTER_SIZE * sizeof(wedge_master_vertical[0]));
+  }
+#else
+  static const double smoother_param = 2.85;
+  const int a[2] = { 2, 1 };
+  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; ++j) {
+      int x = (2 * j + 1 - w);
+      int y = (2 * i + 1 - h);
+      double d = (a[0] * x + a[1] * y) / asqrt;
+      const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
+      wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
+      const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
+      wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
+    }
+  }
+#endif  // USE_PRECOMPUTED_WEDGE_MASK
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
+      // OBLIQUE27 is the transpose of OBLIQUE63.
+      wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
+      // OBLIQUE117 / OBLIQUE153 hold the complemented weight at the
+      // column-mirrored (and, for 153, also transposed) position.
+      wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+          wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - msk;
+      // Index [1]: complements of the four oblique masters.
+      wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
+          wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - msk;
+      wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+          wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
+      // HORIZONTAL is the transpose of VERTICAL; [1] holds the complements.
+      const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
+      wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
+      wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
+          wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
+              (1 << WEDGE_WEIGHT_BITS) - mskx;
+    }
+  }
+}
+
+#if !USE_PRECOMPUTED_WEDGE_SIGN
+// Only compiled when the sign-flip table is not precomputed.
+// If the signs for the wedges for various blocksizes are
+// inconsistent flip the sign flag. Do it only once for every
+// wedge codebook.
+// For each block size that has wedge bits and each wedge index, the flag is
+// derived from the rounded average of the master mask along the block's top
+// row and left column (bw + bh - 1 samples).
+static void init_wedge_signs() {
+ BLOCK_SIZE sb_type;
+ memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+ for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
+ const int bw = block_size_wide[sb_type];
+ const int bh = block_size_high[sb_type];
+ const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+ const int wbits = wedge_params.bits;
+ const int wtypes = 1 << wbits;
+ int i, w;
+ if (wbits) {
+ for (w = 0; w < wtypes; ++w) {
+ // Get the mask master, i.e. index [0]
+ const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+ int avg = 0;
+ // Sum the top row, then the left column below its first sample.
+ for (i = 0; i < bw; ++i) avg += mask[i];
+ for (i = 1; i < bh; ++i) avg += mask[i * MASK_MASTER_STRIDE];
+ avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
+ // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
+ // If default sign is 1:
+ // If sign requested is 0, we need to flip the sign and return
+ // the complement i.e. index [1] instead. If sign requested is 1
+ // we need to flip the sign and return index [0] instead.
+ // If default sign is 0:
+ // If sign requested is 0, we need to return index [0] the master
+ // if sign requested is 1, we need to return the complement index [1]
+ // instead.
+ wedge_params.signflip[w] = (avg < 32);
+ }
+ }
+ }
+}
+#endif // !USE_PRECOMPUTED_WEDGE_SIGN
+
+// Materialises a contiguous bw x bh copy of every wedge mask for every block
+// size that has wedge bits. The copies are packed back to back into
+// wedge_mask_buf and their addresses are stored in
+// wedge_params_lookup[bsize].masks[sign][wedge]. For each wedge the sign-0
+// mask is stored first, immediately followed by the sign-1 mask.
+//
+// Declared with (void): an empty parameter list in a C definition does not
+// form a prototype.
+static void init_wedge_masks(void) {
+  uint8_t *dst = wedge_mask_buf;
+  BLOCK_SIZE bsize;
+  memset(wedge_masks, 0, sizeof(wedge_masks));
+  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
+    const int wbits = wedge_params->bits;
+    // Block sizes without wedge bits have no masks to build.
+    if (wbits == 0) continue;
+
+    const int bw = block_size_wide[bsize];
+    const int bh = block_size_high[bsize];
+    const int wtypes = 1 << wbits;
+    for (int w = 0; w < wtypes; ++w) {
+      for (int sign = 0; sign < 2; ++sign) {
+        // Copy the bw x bh window out of the strided master mask.
+        const uint8_t *mask = get_wedge_mask_inplace(w, sign, bsize);
+        aom_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw, NULL, 0, NULL, 0,
+                          bw, bh);
+        wedge_params->masks[sign][w] = dst;
+        dst += bw * bh;
+      }
+    }
+    // The packed masks must never run past the backing buffer.
+    assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
+  }
+}
+
+// Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
+//
+// Builds all wedge mask tables: first the master masks, then (only when they
+// are not precomputed) the sign-flip flags, and finally the contiguous
+// per-block-size masks, which depend on both.
+//
+// Declared with (void): an empty parameter list in a C definition does not
+// form a prototype.
+void av1_init_wedge_masks(void) {
+  init_wedge_master_masks();
+#if !USE_PRECOMPUTED_WEDGE_SIGN
+  init_wedge_signs();
+#endif  // !USE_PRECOMPUTED_WEDGE_SIGN
+  init_wedge_masks();
+}
+
+// Blends two intermediate-precision predictions (src0, src1) into dst using
+// the compound mask selected by comp_data. The mask is laid out with the
+// luma block width as its stride; dispatches to the high or low bit-depth
+// blend depending on the current buffer's YV12_FLAG_HIGHBITDEPTH flag.
+static void build_masked_compound_no_round(
+    uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+    const CONV_BUF_TYPE *src1, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w, ConvolveParams *conv_params, MACROBLOCKD *xd) {
+  // Subsampling is not passed in; it is inferred by comparing w and h against
+  // the block dimensions. May be refactored to take the factors directly.
+  const int subw = (2 << mi_size_wide_log2[sb_type]) == w;
+  const int subh = (2 << mi_size_high_log2[sb_type]) == h;
+  const uint8_t *const mask = av1_get_compound_type_mask(comp_data, sb_type);
+  const int mask_stride = block_size_wide[sb_type];
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    aom_highbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, w, h, subw,
+                                  subh, conv_params, xd->bd);
+  } else {
+    aom_lowbd_blend_a64_d16_mask(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, w, h, subw,
+                                 subh, conv_params);
+  }
+}
+
+// Builds the prediction for one reference of a masked compound block and
+// blends it with the prediction already stored in conv_params->dst.
+//
+// The new prediction is rendered into a stack buffer, the DIFFWTD
+// segmentation mask is (re)computed on plane 0 when that compound type is in
+// use, and the two predictions are then blended into dst.
+//
+// Note: conv_params->dst and conv_params->dst_stride are redirected to the
+// temporary buffer and are not restored before returning.
+void av1_make_masked_inter_predictor(
+    const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
+    const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+    int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
+    const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
+    MACROBLOCKD *xd, int can_use_previous) {
+  MB_MODE_INFO *const mi = xd->mi[0];
+  mi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *const comp_data = &mi->interinter_comp;
+
+  // Temporary buffer for this reference's prediction, sized at two bytes per
+  // pixel; it is later blended with the prediction from the other reference.
+  DECLARE_ALIGNED(32, uint8_t, tmp_buf[2 * MAX_SB_SQUARE]);
+  uint8_t *const tmp_dst = get_buf_by_bd(xd, tmp_buf);
+  CONV_BUF_TYPE *const tmp_buf16 = (CONV_BUF_TYPE *)tmp_buf;
+  const int tmp_buf_stride = MAX_SB_SIZE;
+
+  // Remember where the earlier prediction lives, then redirect the
+  // intermediate output to the temporary buffer.
+  CONV_BUF_TYPE *const org_dst = conv_params->dst;
+  const int org_dst_stride = conv_params->dst_stride;
+  conv_params->dst = tmp_buf16;
+  conv_params->dst_stride = tmp_buf_stride;
+  assert(conv_params->do_average == 0);
+
+  // This will generate a prediction in tmp_buf for the second reference
+  av1_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE, subpel_params,
+                           sf, w, h, conv_params, interp_filters, warp_types,
+                           p_col, p_row, plane, ref, mi, 0, xd,
+                           can_use_previous);
+
+  if (plane == 0 && comp_data->type == COMPOUND_DIFFWTD) {
+    av1_build_compound_diffwtd_mask_d16(
+        comp_data->seg_mask, comp_data->mask_type, org_dst, org_dst_stride,
+        tmp_buf16, tmp_buf_stride, h, w, conv_params, xd->bd);
+  }
+
+  build_masked_compound_no_round(dst, dst_stride, org_dst, org_dst_stride,
+                                 tmp_buf16, tmp_buf_stride, comp_data,
+                                 mi->sb_type, h, w, conv_params, xd);
+}
+
+// Chooses the distance-based compound weights for a block.
+// For non-compound blocks, or when mbmi->compound_idx is set, only
+// *use_jnt_comp_avg is written (0) and the offsets are left untouched.
+// Otherwise *use_jnt_comp_avg is set to 1 and *fwd_offset / *bck_offset are
+// looked up in quant_dist_lookup_table[order_idx] according to the clamped
+// frame distances of the two references from the current frame.
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+                                int order_idx, int *fwd_offset, int *bck_offset,
+                                int *use_jnt_comp_avg, int is_compound) {
+  assert(fwd_offset != NULL && bck_offset != NULL);
+  if (!is_compound || mbmi->compound_idx) {
+    *use_jnt_comp_avg = 0;
+    return;
+  }
+  *use_jnt_comp_avg = 1;
+
+  const int bck_idx = cm->frame_refs[mbmi->ref_frame[0] - LAST_FRAME].idx;
+  const int fwd_idx = cm->frame_refs[mbmi->ref_frame[1] - LAST_FRAME].idx;
+  const int cur_frame_index = cm->cur_frame->cur_frame_offset;
+
+  // A reference without a valid buffer index counts as frame offset 0.
+  int bck_frame_index = 0;
+  int fwd_frame_index = 0;
+  if (bck_idx >= 0)
+    bck_frame_index = cm->buffer_pool->frame_bufs[bck_idx].cur_frame_offset;
+  if (fwd_idx >= 0)
+    fwd_frame_index = cm->buffer_pool->frame_bufs[fwd_idx].cur_frame_offset;
+
+  const int d0 =
+      clamp(abs(get_relative_dist(cm, fwd_frame_index, cur_frame_index)), 0,
+            MAX_FRAME_DISTANCE);
+  const int d1 =
+      clamp(abs(get_relative_dist(cm, cur_frame_index, bck_frame_index)), 0,
+            MAX_FRAME_DISTANCE);
+  const int order = d0 <= d1;
+
+  // Row 3 of the table is used when either distance is zero, and also when
+  // none of the three weight pairs satisfies the test below.
+  int row = 3;
+  if (d0 != 0 && d1 != 0) {
+    for (row = 0; row < 3; ++row) {
+      const int d0_c0 = d0 * quant_dist_weight[row][order];
+      const int d1_c1 = d1 * quant_dist_weight[row][!order];
+      const int found = order ? (d0_c0 > d1_c1) : (d0_c0 < d1_c1);
+      if (found) break;
+    }
+  }
+
+  *fwd_offset = quant_dist_lookup_table[order_idx][row][order];
+  *bck_offset = quant_dist_lookup_table[order_idx][row][1 - order];
+}
+
+// Sets up the dst buffer of planes [plane_start, plane_end) from the matching
+// planes of src via setup_pred_plane(), passing mi_row / mi_col through and
+// using no scale factors.
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const int plane_start, const int plane_end) {
+  // The upper bound is clamped to MAX_MB_PLANE (instead of using plane_end
+  // directly) to quiet static analysis warnings.
+  const int last_plane = AOMMIN(plane_end, MAX_MB_PLANE);
+  for (int plane = plane_start; plane < last_plane; ++plane) {
+    struct macroblockd_plane *const pd = &planes[plane];
+    // Plane 0 uses entry 0 of the crop/stride arrays, all others entry 1.
+    const int uv = plane > 0;
+    setup_pred_plane(&pd->dst, bsize, src->buffers[plane],
+                     src->crop_widths[uv], src->crop_heights[uv],
+                     src->strides[uv], mi_row, mi_col, NULL, pd->subsampling_x,
+                     pd->subsampling_y);
+  }
+}
+
+// Sets up the reference ("pre") buffer with index idx of the first
+// num_planes planes of xd from src via setup_pred_plane(), using the scale
+// factors sf. Does nothing when src is NULL.
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *sf,
+                          const int num_planes) {
+  if (src == NULL) return;
+
+  // The plane count is clamped to MAX_MB_PLANE (instead of using num_planes
+  // directly) to quiet static analysis warnings.
+  const int plane_count = AOMMIN(num_planes, MAX_MB_PLANE);
+  for (int plane = 0; plane < plane_count; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    // Plane 0 uses entry 0 of the crop/stride arrays, all others entry 1.
+    const int uv = plane > 0;
+    setup_pred_plane(&pd->pre[idx], xd->mi[0]->sb_type, src->buffers[plane],
+                     src->crop_widths[uv], src->crop_heights[uv],
+                     src->strides[uv], mi_row, mi_col, sf, pd->subsampling_x,
+                     pd->subsampling_y);
+  }
+}
+
+// obmc_mask_N[overlap_position]
+// Blend weights for an overlap region of N pixels, with values up to 64.
+// av1_get_obmc_mask() returns the table matching a given overlap length;
+// build_obmc_inter_pred_above() / build_obmc_inter_pred_left() pass it as
+// the mask of the vertical / horizontal a64 blend.
+static const uint8_t obmc_mask_1[1] = { 64 };
+
+static const uint8_t obmc_mask_2[2] = { 45, 64 };
+
+static const uint8_t obmc_mask_4[4] = { 39, 50, 59, 64 };
+
+static const uint8_t obmc_mask_8[8] = { 36, 42, 48, 53, 57, 61, 64, 64 };
+
+static const uint8_t obmc_mask_16[16] = { 34, 37, 40, 43, 46, 49, 52, 54,
+ 56, 58, 60, 61, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_32[32] = { 33, 35, 36, 38, 40, 41, 43, 44,
+ 45, 47, 48, 50, 51, 52, 53, 55,
+ 56, 57, 58, 59, 60, 60, 61, 62,
+ 64, 64, 64, 64, 64, 64, 64, 64 };
+
+static const uint8_t obmc_mask_64[64] = {
+ 33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+ 45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+ 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+ 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+
+// Returns the OBMC blend-weight table for an overlap of `length` pixels.
+// Only the powers of two from 1 to 64 are supported; any other length
+// asserts and yields NULL.
+const uint8_t *av1_get_obmc_mask(int length) {
+  if (length == 1) return obmc_mask_1;
+  if (length == 2) return obmc_mask_2;
+  if (length == 4) return obmc_mask_4;
+  if (length == 8) return obmc_mask_8;
+  if (length == 16) return obmc_mask_16;
+  if (length == 32) return obmc_mask_32;
+  if (length == 64) return obmc_mask_64;
+
+  assert(0);
+  return NULL;
+}
+
+// Neighbour-visitor callback that only counts calls: fun_ctxt is treated as
+// a pointer to an int counter and incremented by one. All other arguments
+// are ignored.
+static INLINE void increment_int_ptr(MACROBLOCKD *xd, int rel_mi_rc,
+                                     uint8_t mi_hw, MB_MODE_INFO *mi,
+                                     void *fun_ctxt, const int num_planes) {
+  int *const counter = (int *)fun_ctxt;
+  (void)xd;
+  (void)rel_mi_rc;
+  (void)mi_hw;
+  (void)mi;
+  (void)num_planes;
+  *counter += 1;
+}
+
+// Counts the overlappable neighbours of the current block and stores the
+// totals in mbmi->overlappable_neighbors: [0] for the row above, [1] for the
+// column to the left. Both stay 0 when the block size does not allow motion
+// variation.
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                      int mi_row, int mi_col) {
+  MB_MODE_INFO *const cur = xd->mi[0];
+
+  cur->overlappable_neighbors[0] = 0;
+  cur->overlappable_neighbors[1] = 0;
+
+  if (is_motion_variation_allowed_bsize(cur->sb_type)) {
+    // INT_MAX: no cap on the number of neighbours visited.
+    foreach_overlappable_nb_above(cm, xd, mi_col, INT_MAX, increment_int_ptr,
+                                  &cur->overlappable_neighbors[0]);
+    foreach_overlappable_nb_left(cm, xd, mi_row, INT_MAX, increment_int_ptr,
+                                 &cur->overlappable_neighbors[1]);
+  }
+}
+
+// HW does not support < 4x4 prediction. To limit the bandwidth requirement,
+// when the block size of the current plane is smaller than 8x8 only the left
+// neighbor(s) are blended (blending with the above side is skipped).
+#define DISABLE_CHROMA_U8X8_OBMC 0  // 0: one-sided obmc; 1: disable
+
+// Returns 1 when the OBMC blend in direction `dir` must be skipped for this
+// plane, 0 otherwise. With one-sided OBMC only dir == 0 is skipped for
+// sub-8x8 plane blocks; with OBMC disabled both directions are skipped.
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+                               const struct macroblockd_plane *pd, int dir) {
+  assert(is_motion_variation_allowed_bsize(bsize));
+
+  const BLOCK_SIZE bsize_plane =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const int is_sub8x8 = bsize_plane == BLOCK_4X4 || bsize_plane == BLOCK_8X4 ||
+                        bsize_plane == BLOCK_4X8;
+  if (!is_sub8x8) return 0;
+
+#if DISABLE_CHROMA_U8X8_OBMC
+  (void)dir;
+  return 1;
+#else
+  return dir == 0;
+#endif
+}
+
+// Rewrites a neighbour's mode info so that its OBMC prediction is built as
+// a plain single-reference prediction: the compound type is reset to
+// COMPOUND_AVERAGE and the second reference is cleared.
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->ref_frame[1] = NONE_FRAME;
+}
+
+// Context handed to the OBMC blend callbacks through their void *fun_ctxt
+// argument. Both arrays are indexed by plane.
+struct obmc_inter_pred_ctxt {
+ uint8_t **adjacent;  // per-plane neighbour prediction buffers
+ int *adjacent_stride;  // per-plane strides of those buffers
+};
+
+// Callback for one above neighbour: blends the neighbour-based prediction
+// held in the context into the top rows of the current prediction
+// (xd->plane[].dst), plane by plane, using a vertical OBMC mask.
+// The overlap height is half the block height, with the block height capped
+// at that of BLOCK_64X64.
+static INLINE void build_obmc_inter_pred_above(MACROBLOCKD *xd, int rel_mi_col,
+                                               uint8_t above_mi_width,
+                                               MB_MODE_INFO *above_mi,
+                                               void *fun_ctxt,
+                                               const int num_planes) {
+  (void)above_mi;
+  const struct obmc_inter_pred_ctxt *const ctxt =
+      (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int use_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int overlap =
+      AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
+    // Region covered by this neighbour, in plane pixels.
+    const int bw = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    const int bh = overlap >> pd->subsampling_y;
+    const int col = (rel_mi_col * MI_SIZE) >> pd->subsampling_x;
+
+    const int dst_stride = pd->dst.stride;
+    const int tmp_stride = ctxt->adjacent_stride[p];
+    uint8_t *const dst = pd->dst.buf + col;
+    const uint8_t *const tmp = ctxt->adjacent[p] + col;
+    const uint8_t *const mask = av1_get_obmc_mask(bh);
+
+    // Blend in place: dst is both the first source and the destination.
+    if (use_hbd) {
+      aom_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp,
+                                 tmp_stride, mask, bw, bh, xd->bd);
+    } else {
+      aom_blend_a64_vmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                          mask, bw, bh);
+    }
+  }
+}
+
+// Callback for one left neighbour: blends the neighbour-based prediction
+// held in the context into the left columns of the current prediction
+// (xd->plane[].dst), plane by plane, using a horizontal OBMC mask.
+// The overlap width is half the block width, with the block width capped at
+// that of BLOCK_64X64.
+static INLINE void build_obmc_inter_pred_left(MACROBLOCKD *xd, int rel_mi_row,
+                                              uint8_t left_mi_height,
+                                              MB_MODE_INFO *left_mi,
+                                              void *fun_ctxt,
+                                              const int num_planes) {
+  (void)left_mi;
+  const struct obmc_inter_pred_ctxt *const ctxt =
+      (struct obmc_inter_pred_ctxt *)fun_ctxt;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int overlap =
+      AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+  const int use_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
+    // Region covered by this neighbour, in plane pixels.
+    const int bw = overlap >> pd->subsampling_x;
+    const int bh = (left_mi_height * MI_SIZE) >> pd->subsampling_y;
+    const int row = (rel_mi_row * MI_SIZE) >> pd->subsampling_y;
+
+    const int dst_stride = pd->dst.stride;
+    const int tmp_stride = ctxt->adjacent_stride[p];
+    uint8_t *const dst = pd->dst.buf + row * dst_stride;
+    const uint8_t *const tmp = ctxt->adjacent[p] + row * tmp_stride;
+    const uint8_t *const mask = av1_get_obmc_mask(bw);
+
+    // Blend in place: dst is both the first source and the destination.
+    if (use_hbd) {
+      aom_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp,
+                                 tmp_stride, mask, bw, bh, xd->bd);
+    } else {
+      aom_blend_a64_hmask(dst, dst_stride, dst, dst_stride, tmp, tmp_stride,
+                          mask, bw, bh);
+    }
+  }
+}
+
+// This function combines motion compensated predictions that are generated by
+// top/left neighboring blocks' inter predictors with the regular inter
+// prediction. We assume the original prediction (bmc) is stored in
+// xd->plane[].dst.buf
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]) {
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+ // handle above row
+ struct obmc_inter_pred_ctxt ctxt_above = { above, above_stride };
+ foreach_overlappable_nb_above(cm, xd, mi_col,
+ max_neighbor_obmc[mi_size_wide_log2[bsize]],
+ build_obmc_inter_pred_above, &ctxt_above);
+
+ // handle left column
+ struct obmc_inter_pred_ctxt ctxt_left = { left, left_stride };
+ foreach_overlappable_nb_left(cm, xd, mi_row,
+ max_neighbor_obmc[mi_size_high_log2[bsize]],
+ build_obmc_inter_pred_left, &ctxt_left);
+}
+
+void av1_setup_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE a_bsize = AOMMAX(BLOCK_8X8, above_mbmi->sb_type);
+ const int above_mi_col = ctxt->mi_col + rel_mi_col;
+
+ av1_modify_neighbor_predictor_for_obmc(above_mbmi);
+
+ for (int j = 0; j < num_planes; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, a_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+ ctxt->tmp_height[j], ctxt->tmp_stride[j], 0, rel_mi_col,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const int num_refs = 1 + has_second_ref(above_mbmi);
+
+ for (int ref = 0; ref < num_refs; ++ref) {
+ const MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+
+ const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, ctxt->mi_row, above_mi_col,
+ &ref_buf->sf, num_planes);
+ }
+
+ xd->mb_to_left_edge = 8 * MI_SIZE * (-above_mi_col);
+ xd->mb_to_right_edge = ctxt->mb_to_far_edge +
+ (xd->n4_w - rel_mi_col - above_mi_width) * MI_SIZE * 8;
+}
+
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes) {
+ const BLOCK_SIZE l_bsize = AOMMAX(BLOCK_8X8, left_mbmi->sb_type);
+ const int left_mi_row = ctxt->mi_row + rel_mi_row;
+
+ av1_modify_neighbor_predictor_for_obmc(left_mbmi);
+
+ for (int j = 0; j < num_planes; ++j) {
+ struct macroblockd_plane *const pd = &xd->plane[j];
+ setup_pred_plane(&pd->dst, l_bsize, ctxt->tmp_buf[j], ctxt->tmp_width[j],
+ ctxt->tmp_height[j], ctxt->tmp_stride[j], rel_mi_row, 0,
+ NULL, pd->subsampling_x, pd->subsampling_y);
+ }
+
+ const int num_refs = 1 + has_second_ref(left_mbmi);
+
+ for (int ref = 0; ref < num_refs; ++ref) {
+ const MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+
+ const RefBuffer *const ref_buf = &ctxt->cm->frame_refs[frame - LAST_FRAME];
+
+ xd->block_refs[ref] = ref_buf;
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ av1_setup_pre_planes(xd, ref, ref_buf->buf, left_mi_row, ctxt->mi_col,
+ &ref_buf->sf, num_planes);
+ }
+
+ xd->mb_to_top_edge = 8 * MI_SIZE * (-left_mi_row);
+ xd->mb_to_bottom_edge =
+ ctxt->mb_to_far_edge +
+ (xd->n4_h - rel_mi_row - left_mi_height) * MI_SIZE * 8;
+}
+
+/* clang-format off */
+static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
+ 60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
+ 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
+ 16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10, 9, 9, 9, 8,
+ 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 4, 4,
+ 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+};
+static const uint8_t ii_size_scales[BLOCK_SIZES_ALL] = {  // per-block-size index step into ii_weights1d
+ 32, 16, 16, 16, 8, 8, 8, 4,
+ 4, 4, 2, 2, 2, 1, 1, 1,
+ 8, 8, 4, 4, 2, 2
+};
+/* clang-format on */
+
+static void build_smooth_interintra_mask(uint8_t *mask, int stride,
+ BLOCK_SIZE plane_bsize,
+ INTERINTRA_MODE mode) {
+ int i, j;
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+ const int size_scale = ii_size_scales[plane_bsize];  // step through ii_weights1d per row/column
+
+ switch (mode) {
+ case II_V_PRED:  // weight depends on the row only
+ for (i = 0; i < bh; ++i) {
+ memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+
+ case II_H_PRED:  // weight depends on the column only
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_SMOOTH_PRED:  // weight depends on min(row, column)
+ for (i = 0; i < bh; ++i) {
+ for (j = 0; j < bw; ++j)
+ mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
+ mask += stride;
+ }
+ break;
+
+ case II_DC_PRED:
+ default:  // every other mode gets a uniform weight of 32
+ for (i = 0; i < bh; ++i) {
+ memset(mask, 32, bw * sizeof(mask[0]));
+ mask += stride;
+ }
+ break;
+ }
+}
+
+static void combine_interintra(INTERINTRA_MODE mode, int use_wedge_interintra,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred, int compstride,
+ const uint8_t *interpred, int interstride,
+ const uint8_t *intrapred, int intrastride) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subw = 2 * mi_size_wide[bsize] == bw;
+ const int subh = 2 * mi_size_high[bsize] == bh;
+ aom_blend_a64_mask(comppred, compstride, intrapred, intrastride,
+ interpred, interstride, mask, block_size_wide[bsize],
+ bw, bh, subw, subh);
+ }
+ return;
+ }
+
+ uint8_t mask[MAX_SB_SQUARE];
+ build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+ aom_blend_a64_mask(comppred, compstride, intrapred, intrastride, interpred,
+ interstride, mask, bw, bw, bh, 0, 0);
+}
+
+static void combine_interintra_highbd(
+ INTERINTRA_MODE mode, int use_wedge_interintra, int wedge_index,
+ int wedge_sign, BLOCK_SIZE bsize, BLOCK_SIZE plane_bsize,
+ uint8_t *comppred8, int compstride, const uint8_t *interpred8,
+ int interstride, const uint8_t *intrapred8, int intrastride, int bd) {
+ const int bw = block_size_wide[plane_bsize];
+ const int bh = block_size_high[plane_bsize];
+
+ if (use_wedge_interintra) {
+ if (is_interintra_wedge_used(bsize)) {
+ const uint8_t *mask =
+ av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ const int subh = 2 * mi_size_high[bsize] == bh;
+ const int subw = 2 * mi_size_wide[bsize] == bw;
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask,
+ block_size_wide[bsize], bw, bh, subw, subh, bd);
+ }
+ return;
+ }
+
+ uint8_t mask[MAX_SB_SQUARE];
+ build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
+ aom_highbd_blend_a64_mask(comppred8, compstride, intrapred8, intrastride,
+ interpred8, interstride, mask, bw, bw, bh, 0, 0,
+ bd);
+}
+
+void av1_build_intra_predictors_for_interintra(const AV1_COMMON *cm,
+ MACROBLOCKD *xd,
+ BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx, uint8_t *dst,
+ int dst_stride) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int ssx = xd->plane[plane].subsampling_x;
+ const int ssy = xd->plane[plane].subsampling_y;
+ BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ PREDICTION_MODE mode = interintra_to_intra_mode[xd->mi[0]->interintra_mode];
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_Y] == 0);
+ assert(xd->mi[0]->angle_delta[PLANE_TYPE_UV] == 0);
+ assert(xd->mi[0]->filter_intra_mode_info.use_filter_intra == 0);
+ assert(xd->mi[0]->use_intrabc == 0);
+
+ av1_predict_intra_block(cm, xd, pd->width, pd->height,
+ max_txsize_rect_lookup[plane_bsize], mode, 0, 0,
+ FILTER_INTRA_MODES, ctx->plane[plane],
+ ctx->stride[plane], dst, dst_stride, 0, 0, plane);
+}
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
+ const int ssx = xd->plane[plane].subsampling_x;
+ const int ssy = xd->plane[plane].subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, ssx, ssy);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ combine_interintra_highbd(
+ xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+ xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+ bsize, plane_bsize, xd->plane[plane].dst.buf,
+ xd->plane[plane].dst.stride, inter_pred, inter_stride, intra_pred,
+ intra_stride, xd->bd);
+ return;
+ }
+ combine_interintra(
+ xd->mi[0]->interintra_mode, xd->mi[0]->use_wedge_interintra,
+ xd->mi[0]->interintra_wedge_index, xd->mi[0]->interintra_wedge_sign,
+ bsize, plane_bsize, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+ inter_pred, inter_stride, intra_pred, intra_stride);
+}
+
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(
+ cm, xd, bsize, plane, ctx, CONVERT_TO_BYTEPTR(intrapredictor),
+ MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride,
+ CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, plane, ctx,
+ intrapredictor, MAX_SB_SIZE);
+ av1_combine_interintra(xd, bsize, plane, pred, stride, intrapredictor,
+ MAX_SB_SIZE);
+ }
+}
+
+void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *upred, uint8_t *vpred,
+ int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+ av1_build_interintra_predictors_sbp(cm, xd, upred, ustride, ctx, 1, bsize);
+ av1_build_interintra_predictors_sbp(cm, xd, vpred, vstride, ctx, 2, bsize);
+}
diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h
new file mode 100644
index 0000000000..db86c777e3
--- /dev/null
+++ b/third_party/aom/av1/common/reconinter.h
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTER_H_
+#define AOM_AV1_COMMON_RECONINTER_H_
+
+#include "av1/common/filter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+#include "aom/aom_integer.h"
+
+// Work out how many pixels off the edge of a reference frame we're allowed
+// to go when forming an inter prediction.
+// The outermost row/col of each reference frame is extended by
+// (AOM_BORDER_IN_PIXELS >> subsampling) pixels, but we need to keep
+// at least AOM_INTERP_EXTEND pixels within that to account for filtering.
+//
+// We have to break this up into two macros to keep both clang-format and
+// tools/lint-hunks.py happy.
+#define AOM_LEFT_TOP_MARGIN_PX(subsampling) \
+ ((AOM_BORDER_IN_PIXELS >> (subsampling)) - AOM_INTERP_EXTEND)
+#define AOM_LEFT_TOP_MARGIN_SCALED(subsampling) \
+ (AOM_LEFT_TOP_MARGIN_PX(subsampling) << SCALE_SUBPEL_BITS)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Set to (1 << 5) if the 32-ary codebooks are used for any block size
+#define MAX_WEDGE_TYPES (1 << 4)
+
+#define MAX_WEDGE_SIZE_LOG2 5 // 32x32
+#define MAX_WEDGE_SIZE (1 << MAX_WEDGE_SIZE_LOG2)
+#define MAX_WEDGE_SQUARE (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE -1
+
+// Angles are with respect to horizontal anti-clockwise
+typedef enum {
+ WEDGE_HORIZONTAL = 0,
+ WEDGE_VERTICAL = 1,
+ WEDGE_OBLIQUE27 = 2,
+ WEDGE_OBLIQUE63 = 3,
+ WEDGE_OBLIQUE117 = 4,
+ WEDGE_OBLIQUE153 = 5,
+ WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+ WedgeDirectionType direction;
+ int x_offset;
+ int y_offset;
+} wedge_code_type;
+
+typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
+typedef struct {
+ int bits;
+ const wedge_code_type *codebook;
+ uint8_t *signflip;
+ wedge_masks_type *masks;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES_ALL];
+
+typedef struct SubpelParams {
+ int xs;
+ int ys;
+ int subpel_x;
+ int subpel_y;
+} SubpelParams;
+
+struct build_prediction_ctxt {
+ const AV1_COMMON *cm;
+ int mi_row;
+ int mi_col;
+ uint8_t **tmp_buf;
+ int *tmp_width;
+ int *tmp_height;
+ int *tmp_stride;
+ int mb_to_far_edge;
+};
+
+static INLINE int has_scale(int xs, int ys) {
+ return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
+}
+
+static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
+ sp->subpel_x >>= SCALE_EXTRA_BITS;
+ sp->subpel_y >>= SCALE_EXTRA_BITS;
+ sp->xs >>= SCALE_EXTRA_BITS;
+ sp->ys >>= SCALE_EXTRA_BITS;
+ assert(sp->subpel_x < SUBPEL_SHIFTS);
+ assert(sp->subpel_y < SUBPEL_SHIFTS);
+ assert(sp->xs <= SUBPEL_SHIFTS);
+ assert(sp->ys <= SUBPEL_SHIFTS);
+}
+
+static INLINE void inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc) {
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+ assert(sf);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, subpel_params->subpel_x,
+ subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc);
+ } else {
+ SubpelParams sp = *subpel_params;
+ revert_scale_extra_bits(&sp);
+ av1_convolve_2d_facade(src, src_stride, dst, dst_stride, w, h,
+ interp_filters, sp.subpel_x, sp.xs, sp.subpel_y,
+ sp.ys, 0, conv_params, sf, is_intrabc);
+ }
+}
+
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ int is_intrabc, int bd) {
+ assert(conv_params->do_average == 0 || conv_params->do_average == 1);
+ assert(sf);
+ const int is_scaled = has_scale(subpel_params->xs, subpel_params->ys);
+ assert(IMPLIES(is_intrabc, !is_scaled));
+ if (is_scaled) {
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters,
+ subpel_params->subpel_x, subpel_params->xs, subpel_params->subpel_y,
+ subpel_params->ys, 1, conv_params, sf, is_intrabc, bd);
+ } else {
+ SubpelParams sp = *subpel_params;
+ revert_scale_extra_bits(&sp);
+ av1_highbd_convolve_2d_facade(
+ src, src_stride, dst, dst_stride, w, h, interp_filters, sp.subpel_x,
+ sp.xs, sp.subpel_y, sp.ys, 0, conv_params, sf, is_intrabc, bd);
+ }
+}
+
+void av1_modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi);
+int av1_skip_u4x4_pred_in_obmc(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd, int dir);
+
+static INLINE int is_interinter_compound_used(COMPOUND_TYPE type,
+ BLOCK_SIZE sb_type) {
+ const int comp_allowed = is_comp_ref_allowed(sb_type);
+ switch (type) {
+ case COMPOUND_AVERAGE:
+ case COMPOUND_DIFFWTD: return comp_allowed;
+ case COMPOUND_WEDGE:
+ return comp_allowed && wedge_params_lookup[sb_type].bits > 0;
+ default: assert(0); return 0;
+ }
+}
+
+static INLINE int is_any_masked_compound_used(BLOCK_SIZE sb_type) {
+ COMPOUND_TYPE comp_type;
+ int i;
+ if (!is_comp_ref_allowed(sb_type)) return 0;
+ for (i = 0; i < COMPOUND_TYPES; i++) {
+ comp_type = (COMPOUND_TYPE)i;
+ if (is_masked_compound_type(comp_type) &&
+ is_interinter_compound_used(comp_type, sb_type))
+ return 1;
+ }
+ return 0;
+}
+
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+ const int wbits = wedge_params_lookup[sb_type].bits;
+ return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].bits;
+}
+
+void av1_make_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const SubpelParams *subpel_params,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ const MB_MODE_INFO *mi, int build_for_obmc,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+void av1_make_masked_inter_predictor(
+ const uint8_t *pre, int pre_stride, uint8_t *dst, int dst_stride,
+ const SubpelParams *subpel_params, const struct scale_factors *sf, int w,
+ int h, ConvolveParams *conv_params, InterpFilters interp_filters, int plane,
+ const WarpTypesAllowed *warp_types, int p_col, int p_row, int ref,
+ MACROBLOCKD *xd, int can_use_previous);
+
+// TODO(jkoleszar): yet another mv clamping function :-(
+static INLINE MV clamp_mv_to_umv_border_sb(const MACROBLOCKD *xd,
+ const MV *src_mv, int bw, int bh,
+ int ss_x, int ss_y) {
+ assert(ss_x <= 1);  // checked first: (1 - ss_x) is used as a shift count below
+ assert(ss_y <= 1);  // checked first: (1 - ss_y) is used as a shift count below
+ // If the MV points so far into the UMV border that no visible pixels
+ // are used for reconstruction, the subpel part of the MV can be
+ // discarded and the MV limited to 16 pixels with equivalent results.
+ const int spel_left = (AOM_INTERP_EXTEND + bw) << SUBPEL_BITS;
+ const int spel_right = spel_left - SUBPEL_SHIFTS;
+ const int spel_top = (AOM_INTERP_EXTEND + bh) << SUBPEL_BITS;
+ const int spel_bottom = spel_top - SUBPEL_SHIFTS;
+ MV clamped_mv = { (int16_t)(src_mv->row * (1 << (1 - ss_y))),
+ (int16_t)(src_mv->col * (1 << (1 - ss_x))) };
+
+ clamp_mv(&clamped_mv, xd->mb_to_left_edge * (1 << (1 - ss_x)) - spel_left,
+ xd->mb_to_right_edge * (1 << (1 - ss_x)) + spel_right,
+ xd->mb_to_top_edge * (1 << (1 - ss_y)) - spel_top,
+ xd->mb_to_bottom_edge * (1 << (1 - ss_y)) + spel_bottom);
+
+ return clamped_mv;
+}
+
+static INLINE int scaled_buffer_offset(int x_offset, int y_offset, int stride,
+ const struct scale_factors *sf) {
+ const int x =
+ sf ? sf->scale_value_x(x_offset, sf) >> SCALE_EXTRA_BITS : x_offset;
+ const int y =
+ sf ? sf->scale_value_y(y_offset, sf) >> SCALE_EXTRA_BITS : y_offset;
+ return y * stride + x;
+}
+
+static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize,
+ uint8_t *src, int width, int height,
+ int stride, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ int subsampling_x, int subsampling_y) {
+ // Offset the buffer pointer
+ if (subsampling_y && (mi_row & 0x01) && (mi_size_high[bsize] == 1))
+ mi_row -= 1;
+ if (subsampling_x && (mi_col & 0x01) && (mi_size_wide[bsize] == 1))
+ mi_col -= 1;
+
+ const int x = (MI_SIZE * mi_col) >> subsampling_x;
+ const int y = (MI_SIZE * mi_row) >> subsampling_y;
+ dst->buf = src + scaled_buffer_offset(x, y, stride, scale);
+ dst->buf0 = src;
+ dst->width = width;
+ dst->height = height;
+ dst->stride = stride;
+}
+
+void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const int plane_start, const int plane_end);
+
+void av1_setup_pre_planes(MACROBLOCKD *xd, int idx,
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *sf, const int num_planes);
+
+static INLINE void set_default_interp_filters(
+ MB_MODE_INFO *const mbmi, InterpFilter frame_interp_filter) {
+ mbmi->interp_filters =
+ av1_broadcast_interp_filter(av1_unswitchable_filter(frame_interp_filter));
+}
+
+static INLINE int av1_is_interp_needed(const MACROBLOCKD *const xd) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ if (mbmi->skip_mode) return 0;
+ if (mbmi->motion_mode == WARPED_CAUSAL) return 0;
+ if (is_nontrans_global_motion(xd, xd->mi[0])) return 0;
+ return 1;
+}
+
+void av1_setup_build_prediction_by_above_pred(
+ MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+ MB_MODE_INFO *above_mbmi, struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+void av1_setup_build_prediction_by_left_pred(MACROBLOCKD *xd, int rel_mi_row,
+ uint8_t left_mi_height,
+ MB_MODE_INFO *left_mbmi,
+ struct build_prediction_ctxt *ctxt,
+ const int num_planes);
+void av1_build_obmc_inter_prediction(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *above[MAX_MB_PLANE],
+ int above_stride[MAX_MB_PLANE],
+ uint8_t *left[MAX_MB_PLANE],
+ int left_stride[MAX_MB_PLANE]);
+
+const uint8_t *av1_get_obmc_mask(int length);
+void av1_count_overlappable_neighbors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+#define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1)
+#define MASK_MASTER_STRIDE (MASK_MASTER_SIZE)
+
+void av1_init_wedge_masks(void);
+
+static INLINE const uint8_t *av1_get_contiguous_soft_mask(int wedge_index,
+ int wedge_sign,
+ BLOCK_SIZE sb_type) {
+ return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
+const uint8_t *av1_get_compound_type_mask(
+ const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type);
+
+// build interintra_predictors for one plane
+void av1_build_interintra_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *pred, int stride,
+ BUFFER_SET *ctx, int plane,
+ BLOCK_SIZE bsize);
+
+void av1_build_interintra_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ uint8_t *upred, uint8_t *vpred,
+ int ustride, int vstride,
+ BUFFER_SET *ctx, BLOCK_SIZE bsize);
+
+void av1_build_intra_predictors_for_interintra(
+ const AV1_COMMON *cm, MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ BUFFER_SET *ctx, uint8_t *intra_pred, int intra_stride);
+
+void av1_combine_interintra(MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane,
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
+
+void av1_jnt_comp_weight_assign(const AV1_COMMON *cm, const MB_MODE_INFO *mbmi,
+ int order_idx, int *fwd_offset, int *bck_offset,
+ int *use_jnt_comp_avg, int is_compound);
+int av1_allow_warp(const MB_MODE_INFO *const mbmi,
+ const WarpTypesAllowed *const warp_types,
+ const WarpedMotionParams *const gm_params,
+ int build_for_obmc, int x_scale, int y_scale,
+ WarpedMotionParams *final_warp_params);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RECONINTER_H_
diff --git a/third_party/aom/av1/common/reconintra.c b/third_party/aom/av1/common/reconintra.c
new file mode 100644
index 0000000000..71a52e73e5
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.c
@@ -0,0 +1,1640 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/cfl.h"
+
+enum {
+ NEED_LEFT = 1 << 1,
+ NEED_ABOVE = 1 << 2,
+ NEED_ABOVERIGHT = 1 << 3,
+ NEED_ABOVELEFT = 1 << 4,
+ NEED_BOTTOMLEFT = 1 << 5,
+};
+
+#define INTRA_EDGE_FILT 3
+#define INTRA_EDGE_TAPS 5
+#define MAX_UPSAMPLE_SZ 16
+
+static const uint8_t extend_modes[INTRA_MODES] = {
+ NEED_ABOVE | NEED_LEFT, // DC
+ NEED_ABOVE, // V
+ NEED_LEFT, // H
+ NEED_ABOVE | NEED_ABOVERIGHT, // D45
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D135
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D113
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // D157
+ NEED_LEFT | NEED_BOTTOMLEFT, // D203
+ NEED_ABOVE | NEED_ABOVERIGHT, // D67
+ NEED_LEFT | NEED_ABOVE, // SMOOTH
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_V
+ NEED_LEFT | NEED_ABOVE, // SMOOTH_H
+ NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT, // PAETH
+};
+
+// Tables to store if the top-right reference pixels are available. The flags
+// are represented with bits, packed into 8-bit integers. E.g., for the 32x32
+// blocks in a 128x128 superblock, the index of the "o" block is 10 (in raster
+// order), so its flag is stored at the 3rd bit of the 2nd entry in the table,
+// i.e. (table[10 / 8] >> (10 % 8)) & 1.
+// . . . .
+// . . . .
+// . . o .
+// . . . .
+static uint8_t has_tr_4x4[128] = {
+ 255, 255, 255, 255, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 255, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 255, 127, 255, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+ 127, 127, 127, 127, 85, 85, 85, 85, 119, 119, 119, 119, 85, 85, 85, 85,
+};
+static uint8_t has_tr_4x8[64] = {
+ 255, 255, 255, 255, 119, 119, 119, 119, 127, 127, 127, 127, 119,
+ 119, 119, 119, 255, 127, 255, 127, 119, 119, 119, 119, 127, 127,
+ 127, 127, 119, 119, 119, 119, 255, 255, 255, 127, 119, 119, 119,
+ 119, 127, 127, 127, 127, 119, 119, 119, 119, 255, 127, 255, 127,
+ 119, 119, 119, 119, 127, 127, 127, 127, 119, 119, 119, 119,
+};
+static uint8_t has_tr_8x4[64] = {
+ 255, 255, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 255, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+ 127, 127, 0, 0, 85, 85, 0, 0, 119, 119, 0, 0, 85, 85, 0, 0,
+};
+static uint8_t has_tr_8x8[32] = {
+ 255, 255, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+ 255, 127, 85, 85, 119, 119, 85, 85, 127, 127, 85, 85, 119, 119, 85, 85,
+};
+static uint8_t has_tr_8x16[16] = {
+ 255, 255, 119, 119, 127, 127, 119, 119,
+ 255, 127, 119, 119, 127, 127, 119, 119,
+};
+static uint8_t has_tr_16x8[16] = {
+ 255, 0, 85, 0, 119, 0, 85, 0, 127, 0, 85, 0, 119, 0, 85, 0,
+};
+static uint8_t has_tr_16x16[8] = {
+ 255, 85, 119, 85, 127, 85, 119, 85,
+};
+static uint8_t has_tr_16x32[4] = { 255, 119, 127, 119 };
+static uint8_t has_tr_32x16[4] = { 15, 5, 7, 5 };
+static uint8_t has_tr_32x32[2] = { 95, 87 };
+static uint8_t has_tr_32x64[1] = { 127 };
+static uint8_t has_tr_64x32[1] = { 19 };
+static uint8_t has_tr_64x64[1] = { 7 };
+static uint8_t has_tr_64x128[1] = { 3 };
+static uint8_t has_tr_128x64[1] = { 1 };
+static uint8_t has_tr_128x128[1] = { 1 };
+static uint8_t has_tr_4x16[32] = {
+ 255, 255, 255, 255, 127, 127, 127, 127, 255, 127, 255,
+ 127, 127, 127, 127, 127, 255, 255, 255, 127, 127, 127,
+ 127, 127, 255, 127, 255, 127, 127, 127, 127, 127,
+};
+static uint8_t has_tr_16x4[32] = {
+ 255, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+ 127, 0, 0, 0, 85, 0, 0, 0, 119, 0, 0, 0, 85, 0, 0, 0,
+};
+static uint8_t has_tr_8x32[8] = {
+ 255, 255, 127, 127, 255, 127, 127, 127,
+};
+static uint8_t has_tr_32x8[8] = {
+ 15, 0, 5, 0, 7, 0, 5, 0,
+};
+static uint8_t has_tr_16x64[2] = { 255, 127 };
+static uint8_t has_tr_64x16[2] = { 3, 1 };
+
+static const uint8_t *const has_tr_tables[BLOCK_SIZES_ALL] = {
+ // 4X4
+ has_tr_4x4,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, has_tr_8x4, has_tr_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, has_tr_16x8, has_tr_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, has_tr_32x16, has_tr_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, has_tr_64x32, has_tr_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, has_tr_128x64, has_tr_128x128,
+ // 4x16, 16x4, 8x32
+ has_tr_4x16, has_tr_16x4, has_tr_8x32,
+ // 32x8, 16x64, 64x16
+ has_tr_32x8, has_tr_16x64, has_tr_64x16
+};
+
+static uint8_t has_tr_vert_8x8[32] = {
+ 255, 255, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+ 255, 127, 0, 0, 119, 119, 0, 0, 127, 127, 0, 0, 119, 119, 0, 0,
+};
+static uint8_t has_tr_vert_16x16[8] = {
+ 255, 0, 119, 0, 127, 0, 119, 0,
+};
+static uint8_t has_tr_vert_32x32[2] = { 15, 7 };
+static uint8_t has_tr_vert_64x64[1] = { 3 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table
+static const uint8_t *const has_tr_vert_tables[BLOCK_SIZES] = {
+ // 4X4
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_tr_4x8, NULL, has_tr_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_tr_8x16, NULL, has_tr_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_tr_16x32, NULL, has_tr_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_tr_32x64, NULL, has_tr_vert_64x64,
+ // 64x128, 128x64, 128x128
+ has_tr_64x128, NULL, has_tr_128x128
+};
+
+static const uint8_t *get_has_tr_table(PARTITION_TYPE partition,
+ BLOCK_SIZE bsize) {
+ const uint8_t *ret = NULL;
+ // If this is a mixed vertical partition, look up bsize in has_tr_vert_tables.
+ if (partition == PARTITION_VERT_A || partition == PARTITION_VERT_B) {
+ assert(bsize < BLOCK_SIZES);  // has_tr_vert_tables has only BLOCK_SIZES entries
+ ret = has_tr_vert_tables[bsize];
+ } else {
+ ret = has_tr_tables[bsize];
+ }
+ assert(ret);  // has_tr_vert_tables holds NULL for sizes that have no table
+ return ret;
+}
+
+// Returns 1 when the pixels above and to the right of the transform block at
+// (row_off, col_off) inside the block at (mi_row, mi_col) can be used as
+// reference, 0 otherwise.
+//
+// row_off / col_off and the derived widths are compared in the units of
+// tx_size_wide_unit[] / mi_size_wide[], shifted by ss_x / ss_y for the plane.
+static int has_top_right(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int top_available, int right_available,
+                         PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+                         int col_off, int ss_x, int ss_y) {
+  if (!(top_available && right_available)) return 0;
+
+  const int tx_w_units = tx_size_wide_unit[txsz];
+  const int blk_w_units = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int plane_w_units = AOMMAX(blk_w_units >> ss_x, 1);
+  // Column just past the right edge of this transform block.
+  const int right_edge = col_off + tx_w_units;
+
+  if (row_off > 0) {
+    // Not on the block's top row: only the space to the right matters.
+    if (block_size_wide[bsize] <= block_size_wide[BLOCK_64X64]) {
+      return right_edge < plane_w_units;
+    }
+
+    // Blocks wider than 64 are handled per 64-wide half.
+    const int w64_units = mi_size_wide[BLOCK_64X64] >> ss_x;
+    const int h64_units = mi_size_high[BLOCK_64X64] >> ss_y;
+
+    // Special case: For 128x128 blocks, the transform unit whose top-right
+    // corner is at the center of the block does in fact have pixels
+    // available at its top-right corner.
+    if (row_off == h64_units && right_edge == w64_units) return 1;
+
+    return (col_off % w64_units) + tx_w_units < w64_units;
+  }
+
+  // Top row of the block. If the top-right pixels still lie above this block
+  // they belong to the block above, which is already available.
+  if (right_edge < plane_w_units) return 1;
+
+  const int bw_log2 = mi_size_wide_log2[bsize];
+  const int bh_log2 = mi_size_high_log2[bsize];
+  const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+  const int sb_mask = sb_mi_size - 1;
+  const int row_in_sb = (mi_row & sb_mask) >> bh_log2;
+  const int col_in_sb = (mi_col & sb_mask) >> bw_log2;
+
+  // Top row of the superblock: the pixels are in the top and/or top-right
+  // superblocks, both of which are already available.
+  if (row_in_sb == 0) return 1;
+
+  // Rightmost column of the superblock (and not its top row): the pixels
+  // fall in the right superblock, which is not available yet.
+  if (((col_in_sb + 1) << bw_log2) >= sb_mi_size) return 0;
+
+  // General case: look up whether the top-right block is coded before the
+  // current one. The table holds one bit per block position.
+  const int blk_index =
+      (row_in_sb << (MAX_MIB_SIZE_LOG2 - bw_log2)) + col_in_sb;
+  const uint8_t *const table = get_has_tr_table(partition, bsize);
+  return (table[blk_index / 8] >> (blk_index % 8)) & 1;
+}
+
+// Similar to the has_tr_* tables, but store if the bottom-left reference
+// pixels are available.
+//
+// Each table is a bitmap: has_bottom_left() reads bit (idx % 8) of byte
+// (idx / 8). The tables are read-only lookup data, so they are declared
+// const; they are only ever accessed through `const uint8_t *`.
+static const uint8_t has_bl_4x4[128] = {
+  84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85,
+  85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1, 0, 84, 85, 85, 85, 16, 17,
+  17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84,
+  85, 85, 85, 0, 0, 0, 0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85,
+  0, 1, 1, 1, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 1,
+  0, 84, 85, 85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 1, 1, 1, 84, 85,
+  85, 85, 16, 17, 17, 17, 84, 85, 85, 85, 0, 0, 0, 0,
+};
+static const uint8_t has_bl_4x8[64] = {
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 1, 0,
+  16, 17, 17, 17, 0, 1, 1, 1, 16, 17, 17, 17, 0, 0, 0, 0,
+};
+static const uint8_t has_bl_8x4[64] = {
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 1,
+  254, 255, 84, 85, 254, 255, 16, 17, 254, 255, 84, 85, 254, 255, 0, 0,
+};
+static const uint8_t has_bl_8x8[32] = {
+  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+  84, 85, 16, 17, 84, 85, 0, 1, 84, 85, 16, 17, 84, 85, 0, 0,
+};
+static const uint8_t has_bl_8x16[16] = {
+  16, 17, 0, 1, 16, 17, 0, 0, 16, 17, 0, 1, 16, 17, 0, 0,
+};
+static const uint8_t has_bl_16x8[16] = {
+  254, 84, 254, 16, 254, 84, 254, 0, 254, 84, 254, 16, 254, 84, 254, 0,
+};
+static const uint8_t has_bl_16x16[8] = {
+  84, 16, 84, 0, 84, 16, 84, 0,
+};
+static const uint8_t has_bl_16x32[4] = { 16, 0, 16, 0 };
+static const uint8_t has_bl_32x16[4] = { 78, 14, 78, 14 };
+static const uint8_t has_bl_32x32[2] = { 4, 4 };
+static const uint8_t has_bl_32x64[1] = { 0 };
+static const uint8_t has_bl_64x32[1] = { 34 };
+static const uint8_t has_bl_64x64[1] = { 0 };
+static const uint8_t has_bl_64x128[1] = { 0 };
+static const uint8_t has_bl_128x64[1] = { 0 };
+static const uint8_t has_bl_128x128[1] = { 0 };
+static const uint8_t has_bl_4x16[32] = {
+  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+  0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
+};
+static const uint8_t has_bl_16x4[32] = {
+  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+  254, 254, 254, 84, 254, 254, 254, 16, 254, 254, 254, 84, 254, 254, 254, 0,
+};
+static const uint8_t has_bl_8x32[8] = {
+  0, 1, 0, 0, 0, 1, 0, 0,
+};
+static const uint8_t has_bl_32x8[8] = {
+  238, 78, 238, 14, 238, 78, 238, 14,
+};
+static const uint8_t has_bl_16x64[2] = { 0, 0 };
+static const uint8_t has_bl_64x16[2] = { 42, 42 };
+
+static const uint8_t *const has_bl_tables[BLOCK_SIZES_ALL] = {  // indexed by bsize in get_has_bl_table()
+ // 4X4
+ has_bl_4x4,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, has_bl_8x4, has_bl_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, has_bl_16x8, has_bl_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, has_bl_32x16, has_bl_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, has_bl_64x32, has_bl_64x64,
+ // 64x128, 128x64, 128x128
+ has_bl_64x128, has_bl_128x64, has_bl_128x128,
+ // 4x16, 16x4, 8x32
+ has_bl_4x16, has_bl_16x4, has_bl_8x32,
+ // 32x8, 16x64, 64x16
+ has_bl_32x8, has_bl_16x64, has_bl_64x16
+};
+
+// Bottom-left availability bitmaps used for PARTITION_VERT_A and
+// PARTITION_VERT_B (see has_bl_vert_tables). Read-only lookup data, so the
+// arrays are declared const.
+static const uint8_t has_bl_vert_8x8[32] = {
+  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+  254, 255, 16, 17, 254, 255, 0, 1, 254, 255, 16, 17, 254, 255, 0, 0,
+};
+static const uint8_t has_bl_vert_16x16[8] = {
+  254, 16, 254, 0, 254, 16, 254, 0,
+};
+static const uint8_t has_bl_vert_32x32[2] = { 14, 14 };
+static const uint8_t has_bl_vert_64x64[1] = { 2 };
+
+// The _vert_* tables are like the ordinary tables above, but describe the
+// order we visit square blocks when doing a PARTITION_VERT_A or
+// PARTITION_VERT_B. This is the same order as normal except for on the last
+// split where we go vertically (TL, BL, TR, BR). We treat the rectangular block
+// as a pair of squares, which means that these tables work correctly for both
+// mixed vertical partition types.
+//
+// There are tables for each of the square sizes. Vertical rectangles (like
+// BLOCK_16X32) use their respective "non-vert" table.
+static const uint8_t *const has_bl_vert_tables[BLOCK_SIZES] = {  // indexed by bsize in get_has_bl_table()
+ // 4X4 (a NULL entry means "no table"; get_has_bl_table() asserts on it)
+ NULL,
+ // 4X8, 8X4, 8X8
+ has_bl_4x8, NULL, has_bl_vert_8x8,
+ // 8X16, 16X8, 16X16
+ has_bl_8x16, NULL, has_bl_vert_16x16,
+ // 16X32, 32X16, 32X32
+ has_bl_16x32, NULL, has_bl_vert_32x32,
+ // 32X64, 64X32, 64X64
+ has_bl_32x64, NULL, has_bl_vert_64x64,
+ // 64x128, 128x64, 128x128 (128x128 reuses the ordinary table)
+ has_bl_64x128, NULL, has_bl_128x128
+};
+
+// Selects the bottom-left availability bitmap for a block of size `bsize`
+// coded under `partition`. PARTITION_VERT_A and PARTITION_VERT_B use
+// has_bl_vert_tables; every other partition uses has_bl_tables. The chosen
+// entry is asserted to be non-NULL before it is returned.
+static const uint8_t *get_has_bl_table(PARTITION_TYPE partition,
+                                       BLOCK_SIZE bsize) {
+  const int is_mixed_vert =
+      (partition == PARTITION_VERT_A) || (partition == PARTITION_VERT_B);
+  const uint8_t *table;
+
+  if (!is_mixed_vert) {
+    table = has_bl_tables[bsize];
+  } else {
+    // has_bl_vert_tables only has BLOCK_SIZES entries.
+    assert(bsize < BLOCK_SIZES);
+    table = has_bl_vert_tables[bsize];
+  }
+
+  assert(table);
+  return table;
+}
+
+// Returns 1 when the pixels below and to the left of the transform block at
+// (row_off, col_off) inside the block at (mi_row, mi_col) can be used as
+// reference, 0 otherwise.
+//
+// row_off / col_off and the derived heights are compared in the units of
+// tx_size_high_unit[] / mi_size_high[], shifted by ss_x / ss_y for the plane.
+static int has_bottom_left(const AV1_COMMON *cm, BLOCK_SIZE bsize, int mi_row,
+                           int mi_col, int bottom_available, int left_available,
+                           PARTITION_TYPE partition, TX_SIZE txsz, int row_off,
+                           int col_off, int ss_x, int ss_y) {
+  if (!(bottom_available && left_available)) return 0;
+
+  const int tx_h_units = tx_size_high_unit[txsz];
+
+  if (col_off > 0) {
+    // Special case for 128x* blocks, when col_off is half the block width.
+    // This is needed because 128x* superblocks are divided into 64x* blocks
+    // in raster order.
+    if (block_size_wide[bsize] > block_size_wide[BLOCK_64X64]) {
+      const int w64_units = mi_size_wide[BLOCK_64X64] >> ss_x;
+      if (col_off % w64_units == 0) {
+        // Left edge of the top-right or bottom-right 64x* block: available
+        // only if all bottom-left pixels are in the left 64x* block (which
+        // is already coded).
+        const int h64_units = mi_size_high[BLOCK_64X64] >> ss_y;
+        const int limit_units =
+            AOMMIN(mi_size_high[bsize] >> ss_y, h64_units);
+        return (row_off % h64_units) + tx_h_units < limit_units;
+      }
+    }
+    // Bottom-left pixels are in the bottom-left block, which is not
+    // available.
+    return 0;
+  }
+
+  // Leftmost column of the block from here on.
+  const int blk_h_units = block_size_high[bsize] >> tx_size_high_log2[0];
+  const int plane_h_units = AOMMAX(blk_h_units >> ss_y, 1);
+
+  // All bottom-left pixels are in the left block, which is already available.
+  if (row_off + tx_h_units < plane_h_units) return 1;
+
+  const int bw_log2 = mi_size_wide_log2[bsize];
+  const int bh_log2 = mi_size_high_log2[bsize];
+  const int sb_mi_size = mi_size_high[cm->seq_params.sb_size];
+  const int sb_mask = sb_mi_size - 1;
+  const int row_in_sb = (mi_row & sb_mask) >> bh_log2;
+  const int col_in_sb = (mi_col & sb_mask) >> bw_log2;
+
+  // Leftmost column of the superblock: the pixels may be in the left and/or
+  // bottom-left superblocks, but only the left one is available, so check
+  // that everything required falls inside it.
+  if (col_in_sb == 0) {
+    // NOTE(review): the original uses tx_size_wide_log2[0] (not the _high_
+    // table) for this row computation; kept as-is to preserve behavior.
+    const int blk_start_row_off =
+        (row_in_sb << (bh_log2 + MI_SIZE_LOG2 - tx_size_wide_log2[0])) >> ss_y;
+    const int sb_h_units = sb_mi_size >> ss_y;
+    return blk_start_row_off + row_off + tx_h_units < sb_h_units;
+  }
+
+  // Bottom row of the superblock (and not its leftmost column): the pixels
+  // fall in the bottom superblock, which is not available yet.
+  if (((row_in_sb + 1) << bh_log2) >= sb_mi_size) return 0;
+
+  // General case: look up whether the bottom-left block is coded before the
+  // current one. The table holds one bit per block position.
+  const int blk_index =
+      (row_in_sb << (MAX_MIB_SIZE_LOG2 - bw_log2)) + col_in_sb;
+  const uint8_t *const table = get_has_bl_table(partition, bsize);
+  return (table[blk_index / 8] >> (blk_index % 8)) & 1;
+}
+
+typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,  // 8-bit predictor: dst/stride output, above/left edge inputs
+ const uint8_t *above, const uint8_t *left);
+
+static intra_pred_fn pred[INTRA_MODES][TX_SIZES_ALL];  // [mode][tx_size]; filled by init_intra_predictors_internal()
+static intra_pred_fn dc_pred[2][2][TX_SIZES_ALL];  // [0][0]=dc_128, [0][1]=dc_top, [1][0]=dc_left, [1][1]=dc
+
+typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride,  // 16-bit sample variant; extra `bd` argument
+ const uint16_t *above, const uint16_t *left,
+ int bd);
+static intra_high_pred_fn pred_high[INTRA_MODES][TX_SIZES_ALL];  // [mode][tx_size]; highbd counterpart of pred
+static intra_high_pred_fn dc_pred_high[2][2][TX_SIZES_ALL];  // same index layout as dc_pred, highbd_dc_* functions
+
+// Fills the pred / dc_pred / pred_high / dc_pred_high dispatch tables with
+// the aom_*_predictor_WxH functions for every transform size.
+//
+// Only V, H, PAETH and the three SMOOTH modes are registered in pred[] and
+// pred_high[]; the DC variants go into dc_pred[][] / dc_pred_high[][] as
+// [0][0]=dc_128, [0][1]=dc_top, [1][0]=dc_left, [1][1]=dc.
+static void init_intra_predictors_internal(void) {
+  assert(NELEMENTS(mode_to_angle_map) == INTRA_MODES);
+
+#define INIT_RECTANGULAR(p, type)             \
+  p[TX_4X8] = aom_##type##_predictor_4x8;     \
+  p[TX_8X4] = aom_##type##_predictor_8x4;     \
+  p[TX_8X16] = aom_##type##_predictor_8x16;   \
+  p[TX_16X8] = aom_##type##_predictor_16x8;   \
+  p[TX_16X32] = aom_##type##_predictor_16x32; \
+  p[TX_32X16] = aom_##type##_predictor_32x16; \
+  p[TX_32X64] = aom_##type##_predictor_32x64; \
+  p[TX_64X32] = aom_##type##_predictor_64x32; \
+  p[TX_4X16] = aom_##type##_predictor_4x16;   \
+  p[TX_16X4] = aom_##type##_predictor_16x4;   \
+  p[TX_8X32] = aom_##type##_predictor_8x32;   \
+  p[TX_32X8] = aom_##type##_predictor_32x8;   \
+  p[TX_16X64] = aom_##type##_predictor_16x64; \
+  p[TX_64X16] = aom_##type##_predictor_64x16;
+
+#define INIT_NO_4X4(p, type)                  \
+  p[TX_8X8] = aom_##type##_predictor_8x8;     \
+  p[TX_16X16] = aom_##type##_predictor_16x16; \
+  p[TX_32X32] = aom_##type##_predictor_32x32; \
+  p[TX_64X64] = aom_##type##_predictor_64x64; \
+  INIT_RECTANGULAR(p, type)
+
+#define INIT_ALL_SIZES(p, type)           \
+  p[TX_4X4] = aom_##type##_predictor_4x4; \
+  INIT_NO_4X4(p, type)
+
+  // 8-bit predictors.
+  INIT_ALL_SIZES(pred[V_PRED], v);
+  INIT_ALL_SIZES(pred[H_PRED], h);
+  INIT_ALL_SIZES(pred[PAETH_PRED], paeth);
+  INIT_ALL_SIZES(pred[SMOOTH_PRED], smooth);
+  INIT_ALL_SIZES(pred[SMOOTH_V_PRED], smooth_v);
+  INIT_ALL_SIZES(pred[SMOOTH_H_PRED], smooth_h);
+  INIT_ALL_SIZES(dc_pred[0][0], dc_128);
+  INIT_ALL_SIZES(dc_pred[0][1], dc_top);
+  INIT_ALL_SIZES(dc_pred[1][0], dc_left);
+  INIT_ALL_SIZES(dc_pred[1][1], dc);
+
+  // High bit-depth predictors.
+  INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
+  INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
+  INIT_ALL_SIZES(pred_high[PAETH_PRED], highbd_paeth);
+  INIT_ALL_SIZES(pred_high[SMOOTH_PRED], highbd_smooth);
+  INIT_ALL_SIZES(pred_high[SMOOTH_V_PRED], highbd_smooth_v);
+  INIT_ALL_SIZES(pred_high[SMOOTH_H_PRED], highbd_smooth_h);
+  INIT_ALL_SIZES(dc_pred_high[0][0], highbd_dc_128);
+  INIT_ALL_SIZES(dc_pred_high[0][1], highbd_dc_top);
+  INIT_ALL_SIZES(dc_pred_high[1][0], highbd_dc_left);
+  INIT_ALL_SIZES(dc_pred_high[1][1], highbd_dc);
+
+  // Drop the helper macros defined above so they do not leak into the rest
+  // of the file (the previous `#undef intra_pred_allsizes` named a macro
+  // that is never defined here).
+#undef INIT_ALL_SIZES
+#undef INIT_NO_4X4
+#undef INIT_RECTANGULAR
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+//
+// Predicts a bw x bh block from the `above` edge only. Row r samples the edge
+// at position (r + 1) * dx (1/64 units, or 1/32 when upsample_above is set)
+// and blends the two neighboring edge samples with a 5-bit weight. Positions
+// at or beyond max_base_x are filled with above[max_base_x].
+void av1_dr_prediction_z1_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_above, int dx, int dy) {
+  (void)left;
+  (void)dy;
+  assert(dy == 1);
+  assert(dx > 0);
+
+  const int max_base_x = ((bw + bh) - 1) << upsample_above;
+  const int frac_bits = 6 - upsample_above;
+  const int base_inc = 1 << upsample_above;
+
+  for (int row = 0, x = dx; row < bh; ++row, x += dx, dst += stride) {
+    int base = x >> frac_bits;
+
+    if (base >= max_base_x) {
+      // This row and every later one lie entirely past the edge.
+      for (int rest = row; rest < bh; ++rest, dst += stride) {
+        memset(dst, above[max_base_x], bw * sizeof(dst[0]));
+      }
+      return;
+    }
+
+    const int shift = ((x << upsample_above) & 0x3F) >> 1;
+    for (int col = 0; col < bw; ++col, base += base_inc) {
+      if (base >= max_base_x) {
+        dst[col] = above[max_base_x];
+        continue;
+      }
+      const int val = above[base] * (32 - shift) + above[base + 1] * shift;
+      dst[col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+//
+// Each output pixel is projected onto the `above` edge; when the projected
+// position falls left of min_base_x the pixel is projected onto the `left`
+// edge instead. Either way the two neighboring edge samples are blended with
+// a 5-bit weight.
+void av1_dr_prediction_z2_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_above, int upsample_left, int dx,
+                            int dy) {
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int min_base_x = -(1 << upsample_above);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+  const int base_inc_x = 1 << upsample_above;
+
+  for (int row = 0, x = -dx; row < bh; ++row, x -= dx, dst += stride) {
+    // The horizontal weight depends only on the row.
+    const int shift_x = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+    int base_x = x >> frac_bits_x;
+    int y = (row << 6) - dy;
+
+    for (int col = 0; col < bw; ++col, base_x += base_inc_x, y -= dy) {
+      int val;
+      if (base_x >= min_base_x) {
+        val = above[base_x] * (32 - shift_x) + above[base_x + 1] * shift_x;
+      } else {
+        const int base_y = y >> frac_bits_y;
+        assert(base_y >= -(1 << upsample_left));
+        const int shift_y = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+        val = left[base_y] * (32 - shift_y) + left[base_y + 1] * shift_y;
+      }
+      dst[col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+  }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+//
+// Predicts a bw x bh block from the `left` edge only, one column at a time.
+// Column c samples the edge at position (c + 1) * dy; positions at or beyond
+// max_base_y are filled with left[max_base_y].
+void av1_dr_prediction_z3_c(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                            const uint8_t *above, const uint8_t *left,
+                            int upsample_left, int dx, int dy) {
+  (void)above;
+  (void)dx;
+
+  assert(dx == 1);
+  assert(dy > 0);
+
+  const int max_base_y = (bw + bh - 1) << upsample_left;
+  const int frac_bits = 6 - upsample_left;
+  const int base_inc = 1 << upsample_left;
+
+  for (int col = 0, y = dy; col < bw; ++col, y += dy) {
+    const int shift = ((y << upsample_left) & 0x3F) >> 1;
+    int base = y >> frac_bits;
+    int row = 0;
+
+    // Interpolated part of the column.
+    for (; row < bh && base < max_base_y; ++row, base += base_inc) {
+      const int val = left[base] * (32 - shift) + left[base + 1] * shift;
+      dst[row * stride + col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+    // Remainder of the column lies past the edge.
+    for (; row < bh; ++row) dst[row * stride + col] = left[max_base_y];
+  }
+}
+
+// Dispatches an 8-bit directional prediction by angle: exactly 90 and 180
+// use the V_PRED / H_PRED predictors, the open ranges (0,90), (90,180) and
+// (180,270) use the zone 1 / 2 / 3 functions. Any other angle is asserted
+// against and otherwise leaves dst untouched.
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                         const uint8_t *above, const uint8_t *left,
+                         int upsample_above, int upsample_left, int angle) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  assert(angle > 0 && angle < 270);
+
+  if (angle == 90) {
+    pred[V_PRED][tx_size](dst, stride, above, left);
+  } else if (angle == 180) {
+    pred[H_PRED][tx_size](dst, stride, above, left);
+  } else if (angle > 0 && angle < 90) {
+    av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above, dx,
+                         dy);
+  } else if (angle > 90 && angle < 180) {
+    av1_dr_prediction_z2(dst, stride, bw, bh, above, left, upsample_above,
+                         upsample_left, dx, dy);
+  } else if (angle > 180 && angle < 270) {
+    av1_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left, dx,
+                         dy);
+  }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+//
+// 16-bit sample version of the zone 1 predictor: uses the `above` edge only,
+// and fills positions at or beyond max_base_x with above[max_base_x].
+void av1_highbd_dr_prediction_z1_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int dx, int dy, int bd) {
+  (void)left;
+  (void)dy;
+  (void)bd;
+  assert(dy == 1);
+  assert(dx > 0);
+
+  const int max_base_x = ((bw + bh) - 1) << upsample_above;
+  const int frac_bits = 6 - upsample_above;
+  const int base_inc = 1 << upsample_above;
+
+  for (int row = 0, x = dx; row < bh; ++row, x += dx, dst += stride) {
+    int base = x >> frac_bits;
+
+    if (base >= max_base_x) {
+      // This row and every later one lie entirely past the edge.
+      for (int rest = row; rest < bh; ++rest, dst += stride) {
+        aom_memset16(dst, above[max_base_x], bw);
+      }
+      return;
+    }
+
+    const int shift = ((x << upsample_above) & 0x3F) >> 1;
+    for (int col = 0; col < bw; ++col, base += base_inc) {
+      if (base >= max_base_x) {
+        dst[col] = above[max_base_x];
+        continue;
+      }
+      const int val = above[base] * (32 - shift) + above[base + 1] * shift;
+      dst[col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+//
+// 16-bit sample version. For every pixel the position on the `above` edge is
+// computed first; if it falls left of min_base_x the position on the `left`
+// edge is used instead.
+void av1_highbd_dr_prediction_z2_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_above,
+                                   int upsample_left, int dx, int dy, int bd) {
+  (void)bd;
+  assert(dx > 0);
+  assert(dy > 0);
+
+  const int min_base_x = -(1 << upsample_above);
+  const int frac_bits_x = 6 - upsample_above;
+  const int frac_bits_y = 6 - upsample_left;
+
+  for (int row = 0; row < bh; ++row, dst += stride) {
+    const int rows_from_top = row + 1;
+
+    for (int col = 0; col < bw; ++col) {
+      const int x = (col << 6) - rows_from_top * dx;
+      const int base_x = x >> frac_bits_x;
+      int val;
+
+      if (base_x >= min_base_x) {
+        const int shift = ((x * (1 << upsample_above)) & 0x3F) >> 1;
+        val = above[base_x] * (32 - shift) + above[base_x + 1] * shift;
+      } else {
+        const int y = (row << 6) - (col + 1) * dy;
+        const int base_y = y >> frac_bits_y;
+        const int shift = ((y * (1 << upsample_left)) & 0x3F) >> 1;
+        val = left[base_y] * (32 - shift) + left[base_y + 1] * shift;
+      }
+      dst[col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+  }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+//
+// 16-bit sample version: uses the `left` edge only, one column at a time,
+// and fills positions at or beyond max_base_y with left[max_base_y].
+void av1_highbd_dr_prediction_z3_c(uint16_t *dst, ptrdiff_t stride, int bw,
+                                   int bh, const uint16_t *above,
+                                   const uint16_t *left, int upsample_left,
+                                   int dx, int dy, int bd) {
+  (void)above;
+  (void)dx;
+  (void)bd;
+  assert(dx == 1);
+  assert(dy > 0);
+
+  const int max_base_y = (bw + bh - 1) << upsample_left;
+  const int frac_bits = 6 - upsample_left;
+  const int base_inc = 1 << upsample_left;
+
+  for (int col = 0, y = dy; col < bw; ++col, y += dy) {
+    const int shift = ((y << upsample_left) & 0x3F) >> 1;
+    int base = y >> frac_bits;
+    int row = 0;
+
+    // Interpolated part of the column.
+    for (; row < bh && base < max_base_y; ++row, base += base_inc) {
+      const int val = left[base] * (32 - shift) + left[base + 1] * shift;
+      dst[row * stride + col] = ROUND_POWER_OF_TWO(val, 5);
+    }
+    // Remainder of the column lies past the edge.
+    for (; row < bh; ++row) dst[row * stride + col] = left[max_base_y];
+  }
+}
+
+// Dispatches a 16-bit directional prediction by angle: exactly 90 and 180
+// use the V_PRED / H_PRED high-bitdepth predictors, the open ranges (0,90),
+// (90,180) and (180,270) use the zone 1 / 2 / 3 functions. Any other angle
+// is asserted against and otherwise leaves dst untouched.
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride,
+                                TX_SIZE tx_size, const uint16_t *above,
+                                const uint16_t *left, int upsample_above,
+                                int upsample_left, int angle, int bd) {
+  const int dx = av1_get_dx(angle);
+  const int dy = av1_get_dy(angle);
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+  assert(angle > 0 && angle < 270);
+
+  if (angle == 90) {
+    pred_high[V_PRED][tx_size](dst, stride, above, left, bd);
+  } else if (angle == 180) {
+    pred_high[H_PRED][tx_size](dst, stride, above, left, bd);
+  } else if (angle > 0 && angle < 90) {
+    av1_highbd_dr_prediction_z1(dst, stride, bw, bh, above, left,
+                                upsample_above, dx, dy, bd);
+  } else if (angle > 90 && angle < 180) {
+    av1_highbd_dr_prediction_z2(dst, stride, bw, bh, above, left,
+                                upsample_above, upsample_left, dx, dy, bd);
+  } else if (angle > 180 && angle < 270) {
+    av1_highbd_dr_prediction_z3(dst, stride, bw, bh, above, left, upsample_left,
+                                dx, dy, bd);
+  }
+}
+
+DECLARE_ALIGNED(16, const int8_t,  // filter-intra weights, laid out as [mode][output pixel k][tap]
+ av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {  // the C predictors in this file read taps [0]..[6] only
+ {  // mode 0
+ { -6, 10, 0, 0, 0, 12, 0, 0 },
+ { -5, 2, 10, 0, 0, 9, 0, 0 },
+ { -3, 1, 1, 10, 0, 7, 0, 0 },
+ { -3, 1, 1, 2, 10, 5, 0, 0 },
+ { -4, 6, 0, 0, 0, 2, 12, 0 },
+ { -3, 2, 6, 0, 0, 2, 9, 0 },
+ { -3, 2, 2, 6, 0, 2, 7, 0 },
+ { -3, 1, 2, 2, 6, 3, 5, 0 },
+ },
+ {  // mode 1
+ { -10, 16, 0, 0, 0, 10, 0, 0 },
+ { -6, 0, 16, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 16, 0, 4, 0, 0 },
+ { -2, 0, 0, 0, 16, 2, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 10, 0 },
+ { -6, 0, 16, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 16, 0, 0, 4, 0 },
+ { -2, 0, 0, 0, 16, 0, 2, 0 },
+ },
+ {  // mode 2
+ { -8, 8, 0, 0, 0, 16, 0, 0 },
+ { -8, 0, 8, 0, 0, 16, 0, 0 },
+ { -8, 0, 0, 8, 0, 16, 0, 0 },
+ { -8, 0, 0, 0, 8, 16, 0, 0 },
+ { -4, 4, 0, 0, 0, 0, 16, 0 },
+ { -4, 0, 4, 0, 0, 0, 16, 0 },
+ { -4, 0, 0, 4, 0, 0, 16, 0 },
+ { -4, 0, 0, 0, 4, 0, 16, 0 },
+ },
+ {  // mode 3
+ { -2, 8, 0, 0, 0, 10, 0, 0 },
+ { -1, 3, 8, 0, 0, 6, 0, 0 },
+ { -1, 2, 3, 8, 0, 4, 0, 0 },
+ { 0, 1, 2, 3, 8, 2, 0, 0 },
+ { -1, 4, 0, 0, 0, 3, 10, 0 },
+ { -1, 3, 4, 0, 0, 4, 6, 0 },
+ { -1, 2, 3, 4, 0, 4, 4, 0 },
+ { -1, 2, 2, 3, 4, 3, 3, 0 },
+ },
+ {  // mode 4
+ { -12, 14, 0, 0, 0, 14, 0, 0 },
+ { -10, 0, 14, 0, 0, 12, 0, 0 },
+ { -9, 0, 0, 14, 0, 11, 0, 0 },
+ { -8, 0, 0, 0, 14, 10, 0, 0 },
+ { -10, 12, 0, 0, 0, 0, 14, 0 },
+ { -9, 1, 12, 0, 0, 0, 12, 0 },
+ { -8, 0, 0, 12, 0, 1, 11, 0 },
+ { -7, 0, 0, 1, 12, 1, 9, 0 },
+ },
+};
+
+// Filter-intra prediction for 8-bit samples.
+//
+// The block is built in a local (bh + 1) x (bw + 1) buffer whose first row
+// is above[-1 .. bw - 1] and whose first column is left[]. It is then
+// filled in 4x2 patches: each of the 8 pixels in a patch is a weighted sum
+// (av1_filter_intra_taps[mode][k]) of 7 neighbors — 5 from the row above
+// the patch and 2 from the column to its left — rounded and clipped. The
+// result (without the border) is copied to dst.
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+                                  TX_SIZE tx_size, const uint8_t *above,
+                                  const uint8_t *left, int mode) {
+  uint8_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (int row = 0; row <= bh; ++row)
+    memset(buffer[row], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  // Seed the border: top row (including the above-left corner) and left
+  // column.
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+  for (int row = 0; row < bh; ++row) buffer[row + 1][0] = left[row];
+
+  for (int row = 1; row <= bh; row += 2) {
+    for (int col = 1; col <= bw; col += 4) {
+      // Snapshot the 7 neighbors before any pixel of this patch is written.
+      const uint8_t *const prev_row = &buffer[row - 1][col - 1];
+      const int nbr[7] = { prev_row[0],
+                           prev_row[1],
+                           prev_row[2],
+                           prev_row[3],
+                           prev_row[4],
+                           buffer[row][col - 1],
+                           buffer[row + 1][col - 1] };
+
+      for (int k = 0; k < 8; ++k) {
+        const int8_t *const taps = av1_filter_intra_taps[mode][k];
+        int sum = 0;
+        for (int t = 0; t < 7; ++t) sum += taps[t] * nbr[t];
+        // k indexes the patch in raster order: 2 rows of 4 pixels.
+        buffer[row + (k >> 2)][col + (k & 0x03)] = clip_pixel(
+            ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_SCALE_BITS));
+      }
+    }
+  }
+
+  for (int row = 0; row < bh; ++row, dst += stride) {
+    memcpy(dst, &buffer[row + 1][1], bw * sizeof(uint8_t));
+  }
+}
+
+// Filter-intra prediction for 16-bit samples.
+//
+// Same procedure as the 8-bit predictor: a local (bh + 1) x (bw + 1) buffer
+// is seeded with above[-1 .. bw - 1] and left[], filled in 4x2 patches from
+// 7 neighbors weighted by av1_filter_intra_taps[mode][k], and the results
+// are clipped with clip_pixel_highbd(..., bd) before being copied to dst.
+static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
+                                          TX_SIZE tx_size,
+                                          const uint16_t *above,
+                                          const uint16_t *left, int mode,
+                                          int bd) {
+  uint16_t buffer[33][33];
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (int row = 0; row <= bh; ++row)
+    memset(buffer[row], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  // Seed the border: top row (including the above-left corner) and left
+  // column.
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
+  for (int row = 0; row < bh; ++row) buffer[row + 1][0] = left[row];
+
+  for (int row = 1; row <= bh; row += 2) {
+    for (int col = 1; col <= bw; col += 4) {
+      // Snapshot the 7 neighbors before any pixel of this patch is written.
+      const uint16_t *const prev_row = &buffer[row - 1][col - 1];
+      const int nbr[7] = { prev_row[0],
+                           prev_row[1],
+                           prev_row[2],
+                           prev_row[3],
+                           prev_row[4],
+                           buffer[row][col - 1],
+                           buffer[row + 1][col - 1] };
+
+      for (int k = 0; k < 8; ++k) {
+        const int8_t *const taps = av1_filter_intra_taps[mode][k];
+        int sum = 0;
+        for (int t = 0; t < 7; ++t) sum += taps[t] * nbr[t];
+        // k indexes the patch in raster order: 2 rows of 4 pixels.
+        buffer[row + (k >> 2)][col + (k & 0x03)] = clip_pixel_highbd(
+            ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_SCALE_BITS), bd);
+      }
+    }
+  }
+
+  for (int row = 0; row < bh; ++row, dst += stride) {
+    memcpy(dst, &buffer[row + 1][1], bw * sizeof(dst[0]));
+  }
+}
+
+// Returns 1 if the block described by `mbmi` uses one of the three SMOOTH
+// intra modes for the given plane (luma mode for plane 0, chroma mode
+// otherwise), 0 if not.
+static int is_smooth(const MB_MODE_INFO *mbmi, int plane) {
+  if (plane != 0) {
+    // uv_mode is not set for inter blocks, so need to explicitly
+    // detect that case.
+    if (is_inter_block(mbmi)) return 0;
+
+    switch (mbmi->uv_mode) {
+      case UV_SMOOTH_PRED:
+      case UV_SMOOTH_V_PRED:
+      case UV_SMOOTH_H_PRED: return 1;
+      default: return 0;
+    }
+  }
+
+  switch (mbmi->mode) {
+    case SMOOTH_PRED:
+    case SMOOTH_V_PRED:
+    case SMOOTH_H_PRED: return 1;
+    default: return 0;
+  }
+}
+
+// Returns 1 if the above or the left neighbor of the current block uses a
+// SMOOTH mode in this plane, 0 otherwise. Plane 0 looks at above_mbmi /
+// left_mbmi, other planes at chroma_above_mbmi / chroma_left_mbmi; a missing
+// (NULL) neighbor counts as not smooth.
+static int get_filt_type(const MACROBLOCKD *xd, int plane) {
+  const int is_luma = (plane == 0);
+  const MB_MODE_INFO *const above_mi =
+      is_luma ? xd->above_mbmi : xd->chroma_above_mbmi;
+  const MB_MODE_INFO *const left_mi =
+      is_luma ? xd->left_mbmi : xd->chroma_left_mbmi;
+
+  const int above_smooth = above_mi ? is_smooth(above_mi, plane) : 0;
+  const int left_smooth = left_mi ? is_smooth(left_mi, plane) : 0;
+
+  return (above_smooth || left_smooth) ? 1 : 0;
+}
+
+// Maps block size (bs0 + bs1), the absolute angle delta |delta| and the
+// filter type to an edge filter strength in 0..3. Larger blocks and larger
+// deltas select stronger filters; type != 0 uses its own threshold set.
+static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) {
+  const int d = abs(delta);
+  const int blk_wh = bs0 + bs1;
+
+  if (type == 0) {
+    if (blk_wh <= 8) return (d >= 56) ? 1 : 0;
+    // The 9..12 and 13..16 ranges share the same threshold.
+    if (blk_wh <= 16) return (d >= 40) ? 1 : 0;
+    if (blk_wh <= 24) return (d >= 32) ? 3 : (d >= 16) ? 2 : (d >= 8) ? 1 : 0;
+    if (blk_wh <= 32) return (d >= 32) ? 3 : (d >= 4) ? 2 : (d >= 1) ? 1 : 0;
+    return (d >= 1) ? 3 : 0;
+  }
+
+  if (blk_wh <= 8) return (d >= 64) ? 2 : (d >= 40) ? 1 : 0;
+  if (blk_wh <= 16) return (d >= 48) ? 2 : (d >= 20) ? 1 : 0;
+  if (blk_wh <= 24) return (d >= 4) ? 3 : 0;
+  return (d >= 1) ? 3 : 0;
+}
+
+// Smooths the `sz` edge samples in `p` in place with one of three 5-tap
+// kernels selected by `strength` (1..INTRA_EDGE_FILT); strength 0 is a
+// no-op. Taps that would fall outside [0, sz - 1] reuse the nearest end
+// sample. p[0] is read but never rewritten (the loop starts at index 1).
+//
+// The unfiltered samples are first copied into a 129-entry stack buffer, so
+// `sz` must not exceed that; this and the strength range are checked with
+// asserts.
+void av1_filter_intra_edge_c(uint8_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
+    { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
+  };
+  const int filt = strength - 1;
+  uint8_t edge[129];
+
+  // Guard the kernel row index and the fixed-size scratch copy below.
+  assert(strength > 0 && strength <= INTRA_EDGE_FILT);
+  assert(sz >= 0 && sz <= (int)(sizeof(edge) / sizeof(edge[0])));
+
+  memcpy(edge, p, sz * sizeof(*p));
+  for (int i = 1; i < sz; i++) {
+    int s = 0;
+    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+      // Clamp the tap position to the valid sample range.
+      int k = i - 2 + j;
+      k = (k < 0) ? 0 : k;
+      k = (k > sz - 1) ? sz - 1 : k;
+      s += edge[k] * kernel[filt][j];
+    }
+    // Each kernel sums to 16: round and normalize.
+    s = (s + 8) >> 4;
+    p[i] = s;
+  }
+}
+
+// Filters the shared corner sample with a 3-tap {5, 6, 5} kernel over
+// p_left[0], p_above[-1] and p_above[0], and stores the rounded result in
+// both p_above[-1] and p_left[-1].
+static void filter_intra_edge_corner(uint8_t *p_above, uint8_t *p_left) {
+  enum { kSideWeight = 5, kCornerWeight = 6 };
+
+  const int sum = kSideWeight * p_left[0] + kCornerWeight * p_above[-1] +
+                  kSideWeight * p_above[0];
+  const int corner = (sum + 8) >> 4;
+
+  p_above[-1] = corner;
+  p_left[-1] = corner;
+}
+
+// 16-bit sample version of the edge filter: smooths the `sz` samples in `p`
+// in place with one of three 5-tap kernels selected by `strength`
+// (1..INTRA_EDGE_FILT); strength 0 is a no-op. Taps that would fall outside
+// [0, sz - 1] reuse the nearest end sample. p[0] is read but never
+// rewritten (the loop starts at index 1).
+//
+// The unfiltered samples are first copied into a 129-entry stack buffer, so
+// `sz` must not exceed that; this and the strength range are checked with
+// asserts.
+void av1_filter_intra_edge_high_c(uint16_t *p, int sz, int strength) {
+  if (!strength) return;
+
+  const int kernel[INTRA_EDGE_FILT][INTRA_EDGE_TAPS] = {
+    { 0, 4, 8, 4, 0 }, { 0, 5, 6, 5, 0 }, { 2, 4, 4, 4, 2 }
+  };
+  const int filt = strength - 1;
+  uint16_t edge[129];
+
+  // Guard the kernel row index and the fixed-size scratch copy below.
+  assert(strength > 0 && strength <= INTRA_EDGE_FILT);
+  assert(sz >= 0 && sz <= (int)(sizeof(edge) / sizeof(edge[0])));
+
+  memcpy(edge, p, sz * sizeof(*p));
+  for (int i = 1; i < sz; i++) {
+    int s = 0;
+    for (int j = 0; j < INTRA_EDGE_TAPS; j++) {
+      // Clamp the tap position to the valid sample range.
+      int k = i - 2 + j;
+      k = (k < 0) ? 0 : k;
+      k = (k > sz - 1) ? sz - 1 : k;
+      s += edge[k] * kernel[filt][j];
+    }
+    // Each kernel sums to 16: round and normalize.
+    s = (s + 8) >> 4;
+    p[i] = s;
+  }
+}
+
+// 16-bit sample version of the corner filter: applies a 3-tap {5, 6, 5}
+// kernel over p_left[0], p_above[-1] and p_above[0], and stores the rounded
+// result in both p_above[-1] and p_left[-1].
+static void filter_intra_edge_corner_high(uint16_t *p_above, uint16_t *p_left) {
+  enum { kSideWeight = 5, kCornerWeight = 6 };
+
+  const int sum = kSideWeight * p_left[0] + kCornerWeight * p_above[-1] +
+                  kSideWeight * p_above[0];
+  const int corner = (sum + 8) >> 4;
+
+  p_above[-1] = corner;
+  p_left[-1] = corner;
+}
+
+// Doubles the resolution of an edge in place. On entry p[-1 .. sz - 1] holds
+// the edge; on exit p[-2 .. 2 * sz - 2] holds the upsampled edge: the
+// original samples land on even offsets (p[2 * i]) and the new half-sample
+// values, interpolated with a (-1, 9, 9, -1) / 16 filter and clipped, on the
+// odd offsets (p[2 * i - 1]). p[-2] receives a copy of the old p[-1].
+void av1_upsample_intra_edge_c(uint8_t *p, int sz) {
+  // interpolate half-sample positions
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  // Working copy of p[-1 .. sz - 1] with the first sample duplicated in
+  // front and the last sample duplicated at the end.
+  uint8_t in[MAX_UPSAMPLE_SZ + 3];
+  in[0] = p[-1];
+  in[1] = p[-1];
+  for (int n = 0; n < sz; n++) in[n + 2] = p[n];
+  in[sz + 2] = p[sz - 1];
+
+  // interpolate half-sample edge positions
+  p[-2] = in[0];
+  for (int n = 0; n < sz; n++) {
+    const uint8_t *const win = &in[n];
+    const int acc = -win[0] + (9 * win[1]) + (9 * win[2]) - win[3];
+    p[2 * n - 1] = clip_pixel((acc + 8) >> 4);
+    p[2 * n] = win[2];
+  }
+}
+
+// 16-bit sample version of the edge upsampler. On entry p[-1 .. sz - 1]
+// holds the edge; on exit p[-2 .. 2 * sz - 2] holds the upsampled edge: the
+// original samples land on even offsets (p[2 * i]) and the new half-sample
+// values, interpolated with a (-1, 9, 9, -1) / 16 filter and clipped with
+// clip_pixel_highbd(..., bd), on the odd offsets (p[2 * i - 1]). p[-2]
+// receives a copy of the old p[-1].
+void av1_upsample_intra_edge_high_c(uint16_t *p, int sz, int bd) {
+  // interpolate half-sample positions
+  assert(sz <= MAX_UPSAMPLE_SZ);
+
+  // Working copy of p[-1 .. sz - 1] with the first sample duplicated in
+  // front and the last sample duplicated at the end.
+  uint16_t in[MAX_UPSAMPLE_SZ + 3];
+  in[0] = p[-1];
+  in[1] = p[-1];
+  for (int n = 0; n < sz; n++) in[n + 2] = p[n];
+  in[sz + 2] = p[sz - 1];
+
+  // interpolate half-sample edge positions
+  p[-2] = in[0];
+  for (int n = 0; n < sz; n++) {
+    const uint16_t *const win = &in[n];
+    const int acc = -win[0] + (9 * win[1]) + (9 * win[2]) - win[3];
+    p[2 * n - 1] = clip_pixel_highbd((acc + 8) >> 4, bd);
+    p[2 * n] = win[2];
+  }
+}
+
+static void build_intra_predictors_high(
+ const MACROBLOCKD *xd, const uint8_t *ref8, int ref_stride, uint8_t *dst8,
+ int dst_stride, PREDICTION_MODE mode, int angle_delta,
+ FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size,
+ int disable_edge_filter, int n_top_px, int n_topright_px, int n_left_px,
+ int n_bottomleft_px, int plane) {
+ int i;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ DECLARE_ALIGNED(16, uint16_t, left_data[MAX_TX_SIZE * 2 + 32]);
+ DECLARE_ALIGNED(16, uint16_t, above_data[MAX_TX_SIZE * 2 + 32]);
+ uint16_t *const above_row = above_data + 16;
+ uint16_t *const left_col = left_data + 16;
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ const uint16_t *above_ref = ref - ref_stride;
+ const uint16_t *left_ref = ref - 1;
+ int p_angle = 0;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;
+ int base = 128 << (xd->bd - 8);
+
+ // The default values if ref pixels are not available:
+ // base-1 base-1 base-1 .. base-1 base-1 base-1 base-1 base-1 base-1
+ // base+1 A B .. Y Z
+ // base+1 C D .. W X
+ // base+1 E F .. U V
+ // base+1 G H .. S T T T T T
+
+ if (is_dr_mode) {
+ p_angle = mode_to_angle_map[mode] + angle_delta;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : base + 1;
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : base - 1;
+ }
+ for (i = 0; i < txhpx; ++i) {
+ aom_memset16(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT
+ if (need_left) {
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ if (use_filter_intra) need_bottom = 0;
+ if (is_dr_mode) need_bottom = p_angle > 180;
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ if (i < num_left_pixels_needed)
+ aom_memset16(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else {
+ if (n_top_px > 0) {
+ aom_memset16(left_col, above_ref[0], num_left_pixels_needed);
+ } else {
+ aom_memset16(left_col, base + 1, num_left_pixels_needed);
+ }
+ }
+ }
+
+ // NEED_ABOVE
+ if (need_above) {
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ if (use_filter_intra) need_right = 0;
+ if (is_dr_mode) need_right = p_angle < 90;
+ const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0]));
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx,
+ n_topright_px * sizeof(above_ref[0]));
+ i += n_topright_px;
+ }
+ if (i < num_top_pixels_needed)
+ aom_memset16(&above_row[i], above_row[i - 1],
+ num_top_pixels_needed - i);
+ } else {
+ if (n_left_px > 0) {
+ aom_memset16(above_row, left_ref[0], num_top_pixels_needed);
+ } else {
+ aom_memset16(above_row, base - 1, num_top_pixels_needed);
+ }
+ }
+ }
+
+ if (need_above_left) {
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = base;
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {
+ highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode, xd->bd);
+ return;
+ }
+
+ if (is_dr_mode) {
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ const int filt_type = get_filt_type(xd, plane);
+ if (p_angle != 90 && p_angle != 180) {
+ const int ab_le = need_above_left ? 1 : 0;
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {
+ filter_intra_edge_corner_high(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength =
+ intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_filter_intra_edge_high(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, filt_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_filter_intra_edge_high(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above =
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_upsample_intra_edge_high(above_row, n_px, xd->bd);
+ }
+ upsample_left =
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_upsample_intra_edge_high(left_col, n_px, xd->bd);
+ }
+ }
+ highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ upsample_above, upsample_left, p_angle, xd->bd);
+ return;
+ }
+
+ // predict
+ if (mode == DC_PRED) {
+ dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](
+ dst, dst_stride, above_row, left_col, xd->bd);
+ } else {
+ pred_high[mode][tx_size](dst, dst_stride, above_row, left_col, xd->bd);
+ }
+}
+
+static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
+ int ref_stride, uint8_t *dst, int dst_stride,
+ PREDICTION_MODE mode, int angle_delta,
+ FILTER_INTRA_MODE filter_intra_mode,
+ TX_SIZE tx_size, int disable_edge_filter,
+ int n_top_px, int n_topright_px,
+ int n_left_px, int n_bottomleft_px,
+ int plane) {  // 8-bit path: predicts one tx_size block into dst from the pixels around ref; the n_*_px counts give how many edge pixels may be read
+ int i;
+ const uint8_t *above_ref = ref - ref_stride;  // row directly above the block
+ const uint8_t *left_ref = ref - 1;  // column directly left of the block
+ DECLARE_ALIGNED(16, uint8_t, left_data[MAX_TX_SIZE * 2 + 32]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[MAX_TX_SIZE * 2 + 32]);
+ uint8_t *const above_row = above_data + 16;  // 16 bytes of headroom keep above_row[-1] (top-left sample) inside above_data
+ uint8_t *const left_col = left_data + 16;  // same headroom for left_col[-1]
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ int need_left = extend_modes[mode] & NEED_LEFT;  // initial edge requirements come from the per-mode extend_modes[] flags
+ int need_above = extend_modes[mode] & NEED_ABOVE;
+ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT;
+ int p_angle = 0;
+ const int is_dr_mode = av1_is_directional_mode(mode);
+ const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES;  // FILTER_INTRA_MODES is the "filter intra not used" value
+
+ // The default values if ref pixels are not available:
+ // 127 127 127 .. 127 127 127 127 127 127
+ // 129 A B .. Y Z
+ // 129 C D .. W X
+ // 129 E F .. U V
+ // 129 G H .. S T T T T T
+ // ..
+
+ if (is_dr_mode) {  // directional modes: the prediction angle, not extend_modes[], decides which edges are read
+ p_angle = mode_to_angle_map[mode] + angle_delta;
+ if (p_angle <= 90)
+ need_above = 1, need_left = 0, need_above_left = 1;
+ else if (p_angle < 180)
+ need_above = 1, need_left = 1, need_above_left = 1;
+ else
+ need_above = 0, need_left = 1, need_above_left = 1;
+ }
+ if (use_filter_intra) need_left = need_above = need_above_left = 1;  // filter intra always reads left, above and top-left
+
+ assert(n_top_px >= 0);
+ assert(n_topright_px >= 0);
+ assert(n_left_px >= 0);
+ assert(n_bottomleft_px >= 0);
+
+ if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {  // the mode skips one edge and the other edge has no pixels: fill dst with a single value and return
+ int val;
+ if (need_left) {
+ val = (n_top_px > 0) ? above_ref[0] : 129;  // first above pixel if present, else the left-column default
+ } else {
+ val = (n_left_px > 0) ? left_ref[0] : 127;  // first left pixel if present, else the above-row default
+ }
+ for (i = 0; i < txhpx; ++i) {
+ memset(dst, val, txwpx);
+ dst += dst_stride;
+ }
+ return;
+ }
+
+ // NEED_LEFT: fill left_col[0 .. num_left_pixels_needed) from the left (and, when needed, bottom-left) reference column
+ if (need_left) {
+ int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+ if (use_filter_intra) need_bottom = 0;
+ if (is_dr_mode) need_bottom = p_angle > 180;  // directional modes read bottom-left only for angles beyond 180
+ const int num_left_pixels_needed = txhpx + (need_bottom ? txwpx : 0);
+ i = 0;
+ if (n_left_px > 0) {
+ for (; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride];  // walk down the reference column, one row per sample
+ if (need_bottom && n_bottomleft_px > 0) {
+ assert(i == txhpx);
+ for (; i < txhpx + n_bottomleft_px; i++)
+ left_col[i] = left_ref[i * ref_stride];
+ }
+ if (i < num_left_pixels_needed)  // pad the remainder by replicating the last sample copied
+ memset(&left_col[i], left_col[i - 1], num_left_pixels_needed - i);
+ } else {
+ if (n_top_px > 0) {
+ memset(left_col, above_ref[0], num_left_pixels_needed);  // no left pixels: substitute the first above pixel
+ } else {
+ memset(left_col, 129, num_left_pixels_needed);  // neither edge available: left-column default
+ }
+ }
+ }
+
+ // NEED_ABOVE: fill above_row[0 .. num_top_pixels_needed) from the above (and, when needed, above-right) reference row
+ if (need_above) {
+ int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+ if (use_filter_intra) need_right = 0;
+ if (is_dr_mode) need_right = p_angle < 90;  // directional modes read above-right only for angles below 90
+ const int num_top_pixels_needed = txwpx + (need_right ? txhpx : 0);
+ if (n_top_px > 0) {
+ memcpy(above_row, above_ref, n_top_px);
+ i = n_top_px;
+ if (need_right && n_topright_px > 0) {
+ assert(n_top_px == txwpx);
+ memcpy(above_row + txwpx, above_ref + txwpx, n_topright_px);
+ i += n_topright_px;
+ }
+ if (i < num_top_pixels_needed)  // pad the remainder by replicating the last sample copied
+ memset(&above_row[i], above_row[i - 1], num_top_pixels_needed - i);
+ } else {
+ if (n_left_px > 0) {
+ memset(above_row, left_ref[0], num_top_pixels_needed);  // no above pixels: substitute the first left pixel
+ } else {
+ memset(above_row, 127, num_top_pixels_needed);  // neither edge available: above-row default
+ }
+ }
+ }
+
+ if (need_above_left) {  // top-left corner sample, stored at index -1 of both edge buffers
+ if (n_top_px > 0 && n_left_px > 0) {
+ above_row[-1] = above_ref[-1];
+ } else if (n_top_px > 0) {
+ above_row[-1] = above_ref[0];
+ } else if (n_left_px > 0) {
+ above_row[-1] = left_ref[0];
+ } else {
+ above_row[-1] = 128;  // neither edge available
+ }
+ left_col[-1] = above_row[-1];
+ }
+
+ if (use_filter_intra) {  // filter intra takes over here; the directional and regular paths below are not reached
+ av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode);
+ return;
+ }
+
+ if (is_dr_mode) {
+ int upsample_above = 0;
+ int upsample_left = 0;
+ if (!disable_edge_filter) {  // when disabled, both edge smoothing and upsampling are skipped and the upsample flags stay 0
+ const int need_right = p_angle < 90;
+ const int need_bottom = p_angle > 180;
+ const int filt_type = get_filt_type(xd, plane);
+ if (p_angle != 90 && p_angle != 180) {  // angles of exactly 90 and 180 skip edge smoothing
+ const int ab_le = need_above_left ? 1 : 0;  // 1 extends the filtered span to start at the top-left sample (index -1)
+ if (need_above && need_left && (txwpx + txhpx >= 24)) {  // corner filter only when both edges are used and width + height >= 24
+ filter_intra_edge_corner(above_row, left_col);
+ }
+ if (need_above && n_top_px > 0) {
+ const int strength =
+ intra_edge_filter_strength(txwpx, txhpx, p_angle - 90, filt_type);
+ const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0);
+ av1_filter_intra_edge(above_row - ab_le, n_px, strength);
+ }
+ if (need_left && n_left_px > 0) {
+ const int strength = intra_edge_filter_strength(
+ txhpx, txwpx, p_angle - 180, filt_type);
+ const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0);
+ av1_filter_intra_edge(left_col - ab_le, n_px, strength);
+ }
+ }
+ upsample_above =
+ av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, filt_type);
+ if (need_above && upsample_above) {
+ const int n_px = txwpx + (need_right ? txhpx : 0);
+ av1_upsample_intra_edge(above_row, n_px);
+ }
+ upsample_left =
+ av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, filt_type);
+ if (need_left && upsample_left) {
+ const int n_px = txhpx + (need_bottom ? txwpx : 0);
+ av1_upsample_intra_edge(left_col, n_px);
+ }
+ }
+ dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above,
+ upsample_left, p_angle);
+ return;
+ }
+
+ // predict: DC_PRED picks a variant by which edges are available; every other mode indexes pred[] by mode and tx_size
+ if (mode == DC_PRED) {
+ dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row,
+ left_col);
+ } else {
+ pred[mode][tx_size](dst, dst_stride, above_row, left_col);
+ }
+}
+
+void av1_predict_intra_block(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, int wpx, int hpx,
+ TX_SIZE tx_size, PREDICTION_MODE mode, int angle_delta, int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode, const uint8_t *ref, int ref_stride,
+ uint8_t *dst, int dst_stride, int col_off, int row_off, int plane) {  // predicts one tx_size block into dst: palette lookup when use_palette is set, otherwise the edge-based 8-bit or high-bitdepth predictor
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int txwpx = tx_size_wide[tx_size];
+ const int txhpx = tx_size_high[tx_size];
+ const int x = col_off << tx_size_wide_log2[0];  // col_off is in units of 1 << tx_size_wide_log2[0] pixels
+ const int y = row_off << tx_size_high_log2[0];  // row_off is in units of 1 << tx_size_high_log2[0] pixels
+
+ if (use_palette) {  // palette path: each output pixel is a palette colour selected by the colour index map
+ int r, c;
+ const uint8_t *const map = xd->plane[plane != 0].color_index_map +
+ xd->color_index_map_offset[plane != 0];  // index 0 for plane 0, index 1 for every other plane
+ const uint16_t *const palette =
+ mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE;  // this plane's colours start at plane * PALETTE_MAX_SIZE
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);  // high bit depth: dst is accessed as 16-bit samples
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst16[r * dst_stride + c] = palette[map[(r + y) * wpx + c + x]];  // the map is addressed with row stride wpx
+ }
+ }
+ } else {
+ for (r = 0; r < txhpx; ++r) {
+ for (c = 0; c < txwpx; ++c) {
+ dst[r * dst_stride + c] =
+ (uint8_t)palette[map[(r + y) * wpx + c + x]];
+ }
+ }
+ }
+ return;
+ }
+
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide_unit[tx_size];
+ const int txh = tx_size_high_unit[tx_size];
+ const int have_top = row_off || (pd->subsampling_y ? xd->chroma_up_available
+ : xd->up_available);  // rows below the first one in the block always have a row above
+ const int have_left =
+ col_off ||
+ (pd->subsampling_x ? xd->chroma_left_available : xd->left_available);
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);  // NOTE(review): the >> 3 suggests mb_to_*_edge is stored in 1/8-pixel units - confirm
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ const int xr_chr_offset = 0;  // always 0 here; only appears as a named term in the xr formula
+ const int yd_chr_offset = 0;  // always 0 here; only appears as a named term in the yd formula
+
+ // Distance between the right edge of this prediction block to
+ // the frame right edge
+ const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+ (wpx - x - txwpx) - xr_chr_offset;
+ // Distance between the bottom edge of this prediction block to
+ // the frame bottom edge
+ const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+ (hpx - y - txhpx) - yd_chr_offset;
+ const int right_available =
+ mi_col + ((col_off + txw) << pd->subsampling_x) < xd->tile.mi_col_end;  // the column right of this block is still inside the tile
+ const int bottom_available =
+ (yd > 0) &&
+ (mi_row + ((row_off + txh) << pd->subsampling_y) < xd->tile.mi_row_end);  // rows remain below the block (yd > 0) and the next row is inside the tile
+
+ const PARTITION_TYPE partition = mbmi->partition;
+
+ // force 4x4 chroma component block size.
+ bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ const int have_top_right = has_top_right(
+ cm, bsize, mi_row, mi_col, have_top, right_available, partition, tx_size,
+ row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+ const int have_bottom_left = has_bottom_left(
+ cm, bsize, mi_row, mi_col, bottom_available, have_left, partition,
+ tx_size, row_off, col_off, pd->subsampling_x, pd->subsampling_y);
+
+ const int disable_edge_filter = !cm->seq_params.enable_intra_edge_filter;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {  // high bit depth frames use the 16-bit builder with the same edge pixel counts
+ build_intra_predictors_high(
+ xd, ref, ref_stride, dst, dst_stride, mode, angle_delta,
+ filter_intra_mode, tx_size, disable_edge_filter,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,
+ have_top_right ? AOMMIN(txwpx, xr) : 0,
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);
+ return;
+ }
+
+ build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode,
+ angle_delta, filter_intra_mode, tx_size,
+ disable_edge_filter,
+ have_top ? AOMMIN(txwpx, xr + txwpx) : 0,  // n_top_px: capped by what lies left of the frame's right edge
+ have_top_right ? AOMMIN(txwpx, xr) : 0,  // n_topright_px
+ have_left ? AOMMIN(txhpx, yd + txhpx) : 0,  // n_left_px: capped by what lies above the frame's bottom edge
+ have_bottom_left ? AOMMIN(txhpx, yd) : 0, plane);  // n_bottomleft_px, then plane
+}
+
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size) {  // reads the prediction settings from xd->mi[0] and predicts the block in place in the plane's dst buffer, with an extra step for CfL chroma
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];  // blk_row and blk_col are both scaled to pixels by 1 << tx_size_wide_log2[0]
+ const PREDICTION_MODE mode =
+ (plane == AOM_PLANE_Y) ? mbmi->mode : get_uv_mode(mbmi->uv_mode);
+ const int use_palette = mbmi->palette_mode_info.palette_size[plane != 0] > 0;  // palette_size[0] for plane 0, palette_size[1] for every other plane
+ const FILTER_INTRA_MODE filter_intra_mode =
+ (plane == AOM_PLANE_Y && mbmi->filter_intra_mode_info.use_filter_intra)
+ ? mbmi->filter_intra_mode_info.filter_intra_mode
+ : FILTER_INTRA_MODES;  // FILTER_INTRA_MODES tells the predictor that filter intra is not used
+ const int angle_delta = mbmi->angle_delta[plane != AOM_PLANE_Y] * ANGLE_STEP;
+
+ if (plane != AOM_PLANE_Y && mbmi->uv_mode == UV_CFL_PRED) {  // CfL chroma: produce (or reload) the base prediction, then run the CfL predictor on it
+#if CONFIG_DEBUG
+ assert(is_cfl_allowed(xd));
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+ (void)plane_bsize;
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ if (!xd->lossless[mbmi->segment_id]) {
+ assert(blk_col == 0);
+ assert(blk_row == 0);
+ assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+ assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+ }
+#endif
+ CFL_CTX *const cfl = &xd->cfl;
+ CFL_PRED_TYPE pred_plane = get_cfl_pred_type(plane);
+ if (cfl->dc_pred_is_cached[pred_plane] == 0) {  // nothing cached for this plane yet: compute the prediction
+ av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+ angle_delta, use_palette, filter_intra_mode, dst,
+ dst_stride, dst, dst_stride, blk_col, blk_row,
+ plane);  // dst is passed as both ref and dst, so neighbours are read from the buffer being written
+ if (cfl->use_dc_pred_cache) {
+ cfl_store_dc_pred(xd, dst, pred_plane, tx_size_wide[tx_size]);
+ cfl->dc_pred_is_cached[pred_plane] = 1;
+ }
+ } else {
+ cfl_load_dc_pred(xd, dst, dst_stride, tx_size, pred_plane);  // reuse the cached prediction instead of recomputing it
+ }
+ cfl_predict_block(xd, dst, dst_stride, tx_size, plane);
+ return;
+ }
+ av1_predict_intra_block(cm, xd, pd->width, pd->height, tx_size, mode,
+ angle_delta, use_palette, filter_intra_mode, dst,
+ dst_stride, dst, dst_stride, blk_col, blk_row, plane);  // dst is passed as both ref and dst here as well
+}
+
+void av1_init_intra_predictors(void) {  // public entry point for setting up the intra predictor tables
+ aom_once(init_intra_predictors_internal);  // NOTE(review): aom_once presumably runs the callback only on the first call - confirm
+}
diff --git a/third_party/aom/av1/common/reconintra.h b/third_party/aom/av1/common/reconintra.h
new file mode 100644
index 0000000000..07853aba01
--- /dev/null
+++ b/third_party/aom/av1/common/reconintra.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RECONINTRA_H_
+#define AOM_AV1_COMMON_RECONINTRA_H_
+
+#include <stdlib.h>
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_init_intra_predictors(void);  // set up the intra predictor tables
+void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, int blk_col, int blk_row,
+ TX_SIZE tx_size);  // predict one transform block in place, using the mode info in xd->mi[0]
+void av1_predict_intra_block(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ int bw, int bh, TX_SIZE tx_size,  // bw/bh are named wpx/hpx in the definition
+ PREDICTION_MODE mode, int angle_delta,
+ int use_palette,
+ FILTER_INTRA_MODE filter_intra_mode,
+ const uint8_t *ref, int ref_stride, uint8_t *dst,
+ int dst_stride, int aoff, int loff, int plane);  // aoff/loff are named col_off/row_off in the definition
+
+// Mapping of interintra to intra mode for use in the intra component (one entry per INTERINTRA_MODE)
+static const PREDICTION_MODE interintra_to_intra_mode[INTERINTRA_MODES] = {
+ DC_PRED, V_PRED, H_PRED, SMOOTH_PRED
+};
+
+// Mapping of intra mode to the interintra mode. Only 11 initializers are listed; if INTRA_MODES is larger, the remaining entries are zero-initialized.
+static const INTERINTRA_MODE intra_to_interintra_mode[INTRA_MODES] = {
+ II_DC_PRED, II_V_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_V_PRED,
+ II_H_PRED, II_H_PRED, II_V_PRED, II_SMOOTH_PRED, II_SMOOTH_PRED
+};
+
+#define FILTER_INTRA_SCALE_BITS 4
+
+static INLINE int av1_is_directional_mode(PREDICTION_MODE mode) {  // returns 1 for directional prediction modes, else 0
+ return mode >= V_PRED && mode <= D67_PRED;  // relies on the directional modes being the contiguous enum range V_PRED..D67_PRED
+}
+
+static INLINE int av1_use_angle_delta(BLOCK_SIZE bsize) {  // returns 1 when angle deltas apply to this block size
+ return bsize >= BLOCK_8X8;  // compares enum values, so it depends on the ordering of BLOCK_SIZE
+}
+
+static INLINE int av1_allow_intrabc(const AV1_COMMON *const cm) {  // intra block copy needs all three: an intra-only frame, screen content tools, and the allow_intrabc flag
+ return frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
+ cm->allow_intrabc;
+}
+
+static INLINE int av1_filter_intra_allowed_bsize(const AV1_COMMON *const cm,
+ BLOCK_SIZE bs) {  // returns 1 when filter intra may be used for block size bs
+ if (!cm->seq_params.enable_filter_intra || bs == BLOCK_INVALID) return 0;  // off for the sequence, or no valid block size to index the tables with
+
+ return block_size_wide[bs] <= 32 && block_size_high[bs] <= 32;  // both dimensions must be at most 32 pixels
+}
+
+static INLINE int av1_filter_intra_allowed(const AV1_COMMON *const cm,
+ const MB_MODE_INFO *mbmi) {  // returns 1 when filter intra may be used for the block described by mbmi
+ return mbmi->mode == DC_PRED &&  // only blocks whose mode is DC_PRED
+ mbmi->palette_mode_info.palette_size[0] == 0 &&  // and whose palette_size[0] is 0
+ av1_filter_intra_allowed_bsize(cm, mbmi->sb_type);
+}
+
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+
+// Get the shift (up-scaled by 256) in X w.r.t a unit change in Y; t in the formulas below is presumably tan(angle) - TODO confirm.
+// If angle > 0 && angle < 90, dx = -((int)(256 / t));
+// If angle > 90 && angle < 180, dx = (int)(256 / t);
+// If angle > 180 && angle < 270, dx = 1;
+static INLINE int av1_get_dx(int angle) {
+ if (angle > 0 && angle < 90) {
+ return dr_intra_derivative[angle];  // NOTE(review): the table entry is returned unchanged; the minus sign in the comment above is not applied here - confirm against the table and callers
+ } else if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[180 - angle];  // angle mirrored about 90 before the table lookup
+ } else {  // every remaining angle, including exactly 90 and 180
+ // In this case, we are not really going to use dx. We may return any value.
+ return 1;
+ }
+}
+
+// Get the shift (up-scaled by 256) in Y w.r.t a unit change in X; t in the formulas below is presumably tan(angle) - TODO confirm.
+// If angle > 0 && angle < 90, dy = 1;
+// If angle > 90 && angle < 180, dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dy = -((int)(256 * t));
+static INLINE int av1_get_dy(int angle) {
+ if (angle > 90 && angle < 180) {
+ return dr_intra_derivative[angle - 90];  // angle measured from 90 for the table lookup
+ } else if (angle > 180 && angle < 270) {
+ return dr_intra_derivative[270 - angle];  // NOTE(review): the table entry is returned unchanged; the minus sign in the comment above is not applied here - confirm against the table and callers
+ } else {  // every remaining angle, including exactly 90 and 180
+ // In this case, we are not really going to use dy. We may return any value.
+ return 1;
+ }
+}
+
+static INLINE int av1_use_intra_edge_upsample(int bs0, int bs1, int delta,
+ int type) {  // returns 1 when an intra edge should be upsampled; bs0/bs1 are the two block dimensions, delta an angle offset
+ const int d = abs(delta);
+ const int blk_wh = bs0 + bs1;
+ if (d <= 0 || d >= 40) return 0;  // no upsampling when |delta| is 0 or at least 40
+ return type ? (blk_wh <= 8) : (blk_wh <= 16);  // a non-zero type lowers the bs0 + bs1 limit from 16 to 8
+}
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_COMMON_RECONINTRA_H_
diff --git a/third_party/aom/av1/common/resize.c b/third_party/aom/av1/common/resize.c
new file mode 100644
index 0000000000..d61a20aa2d
--- /dev/null
+++ b/third_party/aom/av1/common/resize.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/aom_scale.h"
+#include "av1/common/common.h"
+#include "av1/common/resize.h"
+
+#include "config/aom_scale_rtcd.h"
+
+// Filters for interpolation (0.5-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters500[(1 << RS_SUBPEL_BITS)] = {
+ { -3, 0, 35, 64, 35, 0, -3, 0 }, { -3, 0, 34, 64, 36, 0, -3, 0 },
+ { -3, -1, 34, 64, 36, 1, -3, 0 }, { -3, -1, 33, 64, 37, 1, -3, 0 },
+ { -3, -1, 32, 64, 38, 1, -3, 0 }, { -3, -1, 31, 64, 39, 1, -3, 0 },
+ { -3, -1, 31, 63, 39, 2, -3, 0 }, { -2, -2, 30, 63, 40, 2, -3, 0 },
+ { -2, -2, 29, 63, 41, 2, -3, 0 }, { -2, -2, 29, 63, 41, 3, -4, 0 },
+ { -2, -2, 28, 63, 42, 3, -4, 0 }, { -2, -2, 27, 63, 43, 3, -4, 0 },
+ { -2, -3, 27, 63, 43, 4, -4, 0 }, { -2, -3, 26, 62, 44, 5, -4, 0 },
+ { -2, -3, 25, 62, 45, 5, -4, 0 }, { -2, -3, 25, 62, 45, 5, -4, 0 },
+ { -2, -3, 24, 62, 46, 5, -4, 0 }, { -2, -3, 23, 61, 47, 6, -4, 0 },
+ { -2, -3, 23, 61, 47, 6, -4, 0 }, { -2, -3, 22, 61, 48, 7, -4, -1 },
+ { -2, -3, 21, 60, 49, 7, -4, 0 }, { -1, -4, 20, 60, 49, 8, -4, 0 },
+ { -1, -4, 20, 60, 50, 8, -4, -1 }, { -1, -4, 19, 59, 51, 9, -4, -1 },
+ { -1, -4, 19, 59, 51, 9, -4, -1 }, { -1, -4, 18, 58, 52, 10, -4, -1 },
+ { -1, -4, 17, 58, 52, 11, -4, -1 }, { -1, -4, 16, 58, 53, 11, -4, -1 },
+ { -1, -4, 16, 57, 53, 12, -4, -1 }, { -1, -4, 15, 57, 54, 12, -4, -1 },
+ { -1, -4, 15, 56, 54, 13, -4, -1 }, { -1, -4, 14, 56, 55, 13, -4, -1 },
+ { -1, -4, 14, 55, 55, 14, -4, -1 }, { -1, -4, 13, 55, 56, 14, -4, -1 },
+ { -1, -4, 13, 54, 56, 15, -4, -1 }, { -1, -4, 12, 54, 57, 15, -4, -1 },
+ { -1, -4, 12, 53, 57, 16, -4, -1 }, { -1, -4, 11, 53, 58, 16, -4, -1 },
+ { -1, -4, 11, 52, 58, 17, -4, -1 }, { -1, -4, 10, 52, 58, 18, -4, -1 },
+ { -1, -4, 9, 51, 59, 19, -4, -1 }, { -1, -4, 9, 51, 59, 19, -4, -1 },
+ { -1, -4, 8, 50, 60, 20, -4, -1 }, { 0, -4, 8, 49, 60, 20, -4, -1 },
+ { 0, -4, 7, 49, 60, 21, -3, -2 }, { -1, -4, 7, 48, 61, 22, -3, -2 },
+ { 0, -4, 6, 47, 61, 23, -3, -2 }, { 0, -4, 6, 47, 61, 23, -3, -2 },
+ { 0, -4, 5, 46, 62, 24, -3, -2 }, { 0, -4, 5, 45, 62, 25, -3, -2 },
+ { 0, -4, 5, 45, 62, 25, -3, -2 }, { 0, -4, 5, 44, 62, 26, -3, -2 },
+ { 0, -4, 4, 43, 63, 27, -3, -2 }, { 0, -4, 3, 43, 63, 27, -2, -2 },
+ { 0, -4, 3, 42, 63, 28, -2, -2 }, { 0, -4, 3, 41, 63, 29, -2, -2 },
+ { 0, -3, 2, 41, 63, 29, -2, -2 }, { 0, -3, 2, 40, 63, 30, -2, -2 },
+ { 0, -3, 2, 39, 63, 31, -1, -3 }, { 0, -3, 1, 39, 64, 31, -1, -3 },
+ { 0, -3, 1, 38, 64, 32, -1, -3 }, { 0, -3, 1, 37, 64, 33, -1, -3 },
+ { 0, -3, 1, 36, 64, 34, -1, -3 }, { 0, -3, 0, 36, 64, 34, 0, -3 },
+};
+
+// Filters for interpolation (0.625-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters625[(1 << RS_SUBPEL_BITS)] = {
+ { -1, -8, 33, 80, 33, -8, -1, 0 }, { -1, -8, 31, 80, 34, -8, -1, 1 },
+ { -1, -8, 30, 80, 35, -8, -1, 1 }, { -1, -8, 29, 80, 36, -7, -2, 1 },
+ { -1, -8, 28, 80, 37, -7, -2, 1 }, { -1, -8, 27, 80, 38, -7, -2, 1 },
+ { 0, -8, 26, 79, 39, -7, -2, 1 }, { 0, -8, 25, 79, 40, -7, -2, 1 },
+ { 0, -8, 24, 79, 41, -7, -2, 1 }, { 0, -8, 23, 78, 42, -6, -2, 1 },
+ { 0, -8, 22, 78, 43, -6, -2, 1 }, { 0, -8, 21, 78, 44, -6, -2, 1 },
+ { 0, -8, 20, 78, 45, -5, -3, 1 }, { 0, -8, 19, 77, 47, -5, -3, 1 },
+ { 0, -8, 18, 77, 48, -5, -3, 1 }, { 0, -8, 17, 77, 49, -5, -3, 1 },
+ { 0, -8, 16, 76, 50, -4, -3, 1 }, { 0, -8, 15, 76, 51, -4, -3, 1 },
+ { 0, -8, 15, 75, 52, -3, -4, 1 }, { 0, -7, 14, 74, 53, -3, -4, 1 },
+ { 0, -7, 13, 74, 54, -3, -4, 1 }, { 0, -7, 12, 73, 55, -2, -4, 1 },
+ { 0, -7, 11, 73, 56, -2, -4, 1 }, { 0, -7, 10, 72, 57, -1, -4, 1 },
+ { 1, -7, 10, 71, 58, -1, -5, 1 }, { 0, -7, 9, 71, 59, 0, -5, 1 },
+ { 1, -7, 8, 70, 60, 0, -5, 1 }, { 1, -7, 7, 69, 61, 1, -5, 1 },
+ { 1, -6, 6, 68, 62, 1, -5, 1 }, { 0, -6, 6, 68, 62, 2, -5, 1 },
+ { 1, -6, 5, 67, 63, 2, -5, 1 }, { 1, -6, 5, 66, 64, 3, -6, 1 },
+ { 1, -6, 4, 65, 65, 4, -6, 1 }, { 1, -6, 3, 64, 66, 5, -6, 1 },
+ { 1, -5, 2, 63, 67, 5, -6, 1 }, { 1, -5, 2, 62, 68, 6, -6, 0 },
+ { 1, -5, 1, 62, 68, 6, -6, 1 }, { 1, -5, 1, 61, 69, 7, -7, 1 },
+ { 1, -5, 0, 60, 70, 8, -7, 1 }, { 1, -5, 0, 59, 71, 9, -7, 0 },
+ { 1, -5, -1, 58, 71, 10, -7, 1 }, { 1, -4, -1, 57, 72, 10, -7, 0 },
+ { 1, -4, -2, 56, 73, 11, -7, 0 }, { 1, -4, -2, 55, 73, 12, -7, 0 },
+ { 1, -4, -3, 54, 74, 13, -7, 0 }, { 1, -4, -3, 53, 74, 14, -7, 0 },
+ { 1, -4, -3, 52, 75, 15, -8, 0 }, { 1, -3, -4, 51, 76, 15, -8, 0 },
+ { 1, -3, -4, 50, 76, 16, -8, 0 }, { 1, -3, -5, 49, 77, 17, -8, 0 },
+ { 1, -3, -5, 48, 77, 18, -8, 0 }, { 1, -3, -5, 47, 77, 19, -8, 0 },
+ { 1, -3, -5, 45, 78, 20, -8, 0 }, { 1, -2, -6, 44, 78, 21, -8, 0 },
+ { 1, -2, -6, 43, 78, 22, -8, 0 }, { 1, -2, -6, 42, 78, 23, -8, 0 },
+ { 1, -2, -7, 41, 79, 24, -8, 0 }, { 1, -2, -7, 40, 79, 25, -8, 0 },
+ { 1, -2, -7, 39, 79, 26, -8, 0 }, { 1, -2, -7, 38, 80, 27, -8, -1 },
+ { 1, -2, -7, 37, 80, 28, -8, -1 }, { 1, -2, -7, 36, 80, 29, -8, -1 },
+ { 1, -1, -8, 35, 80, 30, -8, -1 }, { 1, -1, -8, 34, 80, 31, -8, -1 },
+};
+
+// Filters for interpolation (0.75-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters750[(1 << RS_SUBPEL_BITS)] = {
+ { 2, -11, 25, 96, 25, -11, 2, 0 }, { 2, -11, 24, 96, 26, -11, 2, 0 },
+ { 2, -11, 22, 96, 28, -11, 2, 0 }, { 2, -10, 21, 96, 29, -12, 2, 0 },
+ { 2, -10, 19, 96, 31, -12, 2, 0 }, { 2, -10, 18, 95, 32, -11, 2, 0 },
+ { 2, -10, 17, 95, 34, -12, 2, 0 }, { 2, -9, 15, 95, 35, -12, 2, 0 },
+ { 2, -9, 14, 94, 37, -12, 2, 0 }, { 2, -9, 13, 94, 38, -12, 2, 0 },
+ { 2, -8, 12, 93, 40, -12, 1, 0 }, { 2, -8, 11, 93, 41, -12, 1, 0 },
+ { 2, -8, 9, 92, 43, -12, 1, 1 }, { 2, -8, 8, 92, 44, -12, 1, 1 },
+ { 2, -7, 7, 91, 46, -12, 1, 0 }, { 2, -7, 6, 90, 47, -12, 1, 1 },
+ { 2, -7, 5, 90, 49, -12, 1, 0 }, { 2, -6, 4, 89, 50, -12, 1, 0 },
+ { 2, -6, 3, 88, 52, -12, 0, 1 }, { 2, -6, 2, 87, 54, -12, 0, 1 },
+ { 2, -5, 1, 86, 55, -12, 0, 1 }, { 2, -5, 0, 85, 57, -12, 0, 1 },
+ { 2, -5, -1, 84, 58, -11, 0, 1 }, { 2, -5, -2, 83, 60, -11, 0, 1 },
+ { 2, -4, -2, 82, 61, -11, -1, 1 }, { 1, -4, -3, 81, 63, -10, -1, 1 },
+ { 2, -4, -4, 80, 64, -10, -1, 1 }, { 1, -4, -4, 79, 66, -10, -1, 1 },
+ { 1, -3, -5, 77, 67, -9, -1, 1 }, { 1, -3, -6, 76, 69, -9, -1, 1 },
+ { 1, -3, -6, 75, 70, -8, -2, 1 }, { 1, -2, -7, 74, 71, -8, -2, 1 },
+ { 1, -2, -7, 72, 72, -7, -2, 1 }, { 1, -2, -8, 71, 74, -7, -2, 1 },
+ { 1, -2, -8, 70, 75, -6, -3, 1 }, { 1, -1, -9, 69, 76, -6, -3, 1 },
+ { 1, -1, -9, 67, 77, -5, -3, 1 }, { 1, -1, -10, 66, 79, -4, -4, 1 },
+ { 1, -1, -10, 64, 80, -4, -4, 2 }, { 1, -1, -10, 63, 81, -3, -4, 1 },
+ { 1, -1, -11, 61, 82, -2, -4, 2 }, { 1, 0, -11, 60, 83, -2, -5, 2 },
+ { 1, 0, -11, 58, 84, -1, -5, 2 }, { 1, 0, -12, 57, 85, 0, -5, 2 },
+ { 1, 0, -12, 55, 86, 1, -5, 2 }, { 1, 0, -12, 54, 87, 2, -6, 2 },
+ { 1, 0, -12, 52, 88, 3, -6, 2 }, { 0, 1, -12, 50, 89, 4, -6, 2 },
+ { 0, 1, -12, 49, 90, 5, -7, 2 }, { 1, 1, -12, 47, 90, 6, -7, 2 },
+ { 0, 1, -12, 46, 91, 7, -7, 2 }, { 1, 1, -12, 44, 92, 8, -8, 2 },
+ { 1, 1, -12, 43, 92, 9, -8, 2 }, { 0, 1, -12, 41, 93, 11, -8, 2 },
+ { 0, 1, -12, 40, 93, 12, -8, 2 }, { 0, 2, -12, 38, 94, 13, -9, 2 },
+ { 0, 2, -12, 37, 94, 14, -9, 2 }, { 0, 2, -12, 35, 95, 15, -9, 2 },
+ { 0, 2, -12, 34, 95, 17, -10, 2 }, { 0, 2, -11, 32, 95, 18, -10, 2 },
+ { 0, 2, -12, 31, 96, 19, -10, 2 }, { 0, 2, -12, 29, 96, 21, -10, 2 },
+ { 0, 2, -11, 28, 96, 22, -11, 2 }, { 0, 2, -11, 26, 96, 24, -11, 2 },
+};
+
+// Filters for interpolation (0.875-band) - note this also filters integer pels.
+static const InterpKernel filteredinterp_filters875[(1 << RS_SUBPEL_BITS)] = {
+ { 3, -8, 13, 112, 13, -8, 3, 0 }, { 2, -7, 12, 112, 15, -8, 3, -1 },
+ { 3, -7, 10, 112, 17, -9, 3, -1 }, { 2, -6, 8, 112, 19, -9, 3, -1 },
+ { 2, -6, 7, 112, 21, -10, 3, -1 }, { 2, -5, 6, 111, 22, -10, 3, -1 },
+ { 2, -5, 4, 111, 24, -10, 3, -1 }, { 2, -4, 3, 110, 26, -11, 3, -1 },
+ { 2, -4, 1, 110, 28, -11, 3, -1 }, { 2, -4, 0, 109, 30, -12, 4, -1 },
+ { 1, -3, -1, 108, 32, -12, 4, -1 }, { 1, -3, -2, 108, 34, -13, 4, -1 },
+ { 1, -2, -4, 107, 36, -13, 4, -1 }, { 1, -2, -5, 106, 38, -13, 4, -1 },
+ { 1, -1, -6, 105, 40, -14, 4, -1 }, { 1, -1, -7, 104, 42, -14, 4, -1 },
+ { 1, -1, -7, 103, 44, -15, 4, -1 }, { 1, 0, -8, 101, 46, -15, 4, -1 },
+ { 1, 0, -9, 100, 48, -15, 4, -1 }, { 1, 0, -10, 99, 50, -15, 4, -1 },
+ { 1, 1, -11, 97, 53, -16, 4, -1 }, { 0, 1, -11, 96, 55, -16, 4, -1 },
+ { 0, 1, -12, 95, 57, -16, 4, -1 }, { 0, 2, -13, 93, 59, -16, 4, -1 },
+ { 0, 2, -13, 91, 61, -16, 4, -1 }, { 0, 2, -14, 90, 63, -16, 4, -1 },
+ { 0, 2, -14, 88, 65, -16, 4, -1 }, { 0, 2, -15, 86, 67, -16, 4, 0 },
+ { 0, 3, -15, 84, 69, -17, 4, 0 }, { 0, 3, -16, 83, 71, -17, 4, 0 },
+ { 0, 3, -16, 81, 73, -16, 3, 0 }, { 0, 3, -16, 79, 75, -16, 3, 0 },
+ { 0, 3, -16, 77, 77, -16, 3, 0 }, { 0, 3, -16, 75, 79, -16, 3, 0 },
+ { 0, 3, -16, 73, 81, -16, 3, 0 }, { 0, 4, -17, 71, 83, -16, 3, 0 },
+ { 0, 4, -17, 69, 84, -15, 3, 0 }, { 0, 4, -16, 67, 86, -15, 2, 0 },
+ { -1, 4, -16, 65, 88, -14, 2, 0 }, { -1, 4, -16, 63, 90, -14, 2, 0 },
+ { -1, 4, -16, 61, 91, -13, 2, 0 }, { -1, 4, -16, 59, 93, -13, 2, 0 },
+ { -1, 4, -16, 57, 95, -12, 1, 0 }, { -1, 4, -16, 55, 96, -11, 1, 0 },
+ { -1, 4, -16, 53, 97, -11, 1, 1 }, { -1, 4, -15, 50, 99, -10, 0, 1 },
+ { -1, 4, -15, 48, 100, -9, 0, 1 }, { -1, 4, -15, 46, 101, -8, 0, 1 },
+ { -1, 4, -15, 44, 103, -7, -1, 1 }, { -1, 4, -14, 42, 104, -7, -1, 1 },
+ { -1, 4, -14, 40, 105, -6, -1, 1 }, { -1, 4, -13, 38, 106, -5, -2, 1 },
+ { -1, 4, -13, 36, 107, -4, -2, 1 }, { -1, 4, -13, 34, 108, -2, -3, 1 },
+ { -1, 4, -12, 32, 108, -1, -3, 1 }, { -1, 4, -12, 30, 109, 0, -4, 2 },
+ { -1, 3, -11, 28, 110, 1, -4, 2 }, { -1, 3, -11, 26, 110, 3, -4, 2 },
+ { -1, 3, -10, 24, 111, 4, -5, 2 }, { -1, 3, -10, 22, 111, 6, -5, 2 },
+ { -1, 3, -10, 21, 112, 7, -6, 2 }, { -1, 3, -9, 19, 112, 8, -6, 2 },
+ { -1, 3, -9, 17, 112, 10, -7, 3 }, { -1, 3, -8, 15, 112, 12, -7, 2 },
+};
+
+// Filter bank used by the normative upscaler in this file (its flattened
+// address is handed to av1_convolve_horiz_rs / av1_highbd_convolve_horiz_rs)
+// and, through the filteredinterp_filters1000 alias, by the non-normative
+// interpolator when no downscaling is requested.
+// Layout: one row of UPSCALE_NORMATIVE_TAPS coefficients per sub-pixel phase,
+// (1 << RS_SUBPEL_BITS) rows in total. Row 0 is a pass-through kernel (a
+// single tap of 128). Only the 8-tap configuration is provided; any other
+// UPSCALE_NORMATIVE_TAPS value is rejected at compile time.
+const int16_t av1_resize_filter_normative[(
+ 1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
+#if UPSCALE_NORMATIVE_TAPS == 8
+ { 0, 0, 0, 128, 0, 0, 0, 0 }, { 0, 0, -1, 128, 2, -1, 0, 0 },
+ { 0, 1, -3, 127, 4, -2, 1, 0 }, { 0, 1, -4, 127, 6, -3, 1, 0 },
+ { 0, 2, -6, 126, 8, -3, 1, 0 }, { 0, 2, -7, 125, 11, -4, 1, 0 },
+ { -1, 2, -8, 125, 13, -5, 2, 0 }, { -1, 3, -9, 124, 15, -6, 2, 0 },
+ { -1, 3, -10, 123, 18, -6, 2, -1 }, { -1, 3, -11, 122, 20, -7, 3, -1 },
+ { -1, 4, -12, 121, 22, -8, 3, -1 }, { -1, 4, -13, 120, 25, -9, 3, -1 },
+ { -1, 4, -14, 118, 28, -9, 3, -1 }, { -1, 4, -15, 117, 30, -10, 4, -1 },
+ { -1, 5, -16, 116, 32, -11, 4, -1 }, { -1, 5, -16, 114, 35, -12, 4, -1 },
+ { -1, 5, -17, 112, 38, -12, 4, -1 }, { -1, 5, -18, 111, 40, -13, 5, -1 },
+ { -1, 5, -18, 109, 43, -14, 5, -1 }, { -1, 6, -19, 107, 45, -14, 5, -1 },
+ { -1, 6, -19, 105, 48, -15, 5, -1 }, { -1, 6, -19, 103, 51, -16, 5, -1 },
+ { -1, 6, -20, 101, 53, -16, 6, -1 }, { -1, 6, -20, 99, 56, -17, 6, -1 },
+ { -1, 6, -20, 97, 58, -17, 6, -1 }, { -1, 6, -20, 95, 61, -18, 6, -1 },
+ { -2, 7, -20, 93, 64, -18, 6, -2 }, { -2, 7, -20, 91, 66, -19, 6, -1 },
+ { -2, 7, -20, 88, 69, -19, 6, -1 }, { -2, 7, -20, 86, 71, -19, 6, -1 },
+ { -2, 7, -20, 84, 74, -20, 7, -2 }, { -2, 7, -20, 81, 76, -20, 7, -1 },
+ { -2, 7, -20, 79, 79, -20, 7, -2 }, { -1, 7, -20, 76, 81, -20, 7, -2 },
+ { -2, 7, -20, 74, 84, -20, 7, -2 }, { -1, 6, -19, 71, 86, -20, 7, -2 },
+ { -1, 6, -19, 69, 88, -20, 7, -2 }, { -1, 6, -19, 66, 91, -20, 7, -2 },
+ { -2, 6, -18, 64, 93, -20, 7, -2 }, { -1, 6, -18, 61, 95, -20, 6, -1 },
+ { -1, 6, -17, 58, 97, -20, 6, -1 }, { -1, 6, -17, 56, 99, -20, 6, -1 },
+ { -1, 6, -16, 53, 101, -20, 6, -1 }, { -1, 5, -16, 51, 103, -19, 6, -1 },
+ { -1, 5, -15, 48, 105, -19, 6, -1 }, { -1, 5, -14, 45, 107, -19, 6, -1 },
+ { -1, 5, -14, 43, 109, -18, 5, -1 }, { -1, 5, -13, 40, 111, -18, 5, -1 },
+ { -1, 4, -12, 38, 112, -17, 5, -1 }, { -1, 4, -12, 35, 114, -16, 5, -1 },
+ { -1, 4, -11, 32, 116, -16, 5, -1 }, { -1, 4, -10, 30, 117, -15, 4, -1 },
+ { -1, 3, -9, 28, 118, -14, 4, -1 }, { -1, 3, -9, 25, 120, -13, 4, -1 },
+ { -1, 3, -8, 22, 121, -12, 4, -1 }, { -1, 3, -7, 20, 122, -11, 3, -1 },
+ { -1, 2, -6, 18, 123, -10, 3, -1 }, { 0, 2, -6, 15, 124, -9, 3, -1 },
+ { 0, 2, -5, 13, 125, -8, 2, -1 }, { 0, 1, -4, 11, 125, -7, 2, 0 },
+ { 0, 1, -3, 8, 126, -6, 2, 0 }, { 0, 1, -3, 6, 127, -4, 1, 0 },
+ { 0, 1, -2, 4, 127, -3, 1, 0 }, { 0, 0, -1, 2, 128, -1, 0, 0 },
+#else
+#error "Invalid value of UPSCALE_NORMATIVE_TAPS"
+#endif // UPSCALE_NORMATIVE_TAPS == 8
+};
+
+// Filters for interpolation (full-band) - no filtering for integer pixels
+#define filteredinterp_filters1000 av1_resize_filter_normative
+
+// Filters for factor of 2 downsampling.
+// Both kernels are symmetric, so only one half is stored.
+// - "symeven": every stored coefficient is applied to a mirrored pair of
+//   samples, giving a full kernel of 2 * 4 taps (2 * (56 + 12 - 3 - 1) = 128).
+// - "symodd": element 0 is the centre tap and the remaining coefficients are
+//   applied to mirrored pairs, giving 2 * 4 - 1 taps
+//   (64 + 2 * (35 + 0 - 3) = 128).
+static const int16_t av1_down2_symeven_half_filter[] = { 56, 12, -3, -1 };
+static const int16_t av1_down2_symodd_half_filter[] = { 64, 35, 0, -3 };
+
+// Picks the interpolation filter bank for resampling a line of in_length
+// samples to out_length samples. The ratio out_length / in_length is compared
+// in sixteenths against the thresholds 16/16, 13/16, 11/16 and 9/16; the
+// first threshold that is met selects the bank, and anything below 9/16
+// falls back to filteredinterp_filters500.
+static const InterpKernel *choose_interp_filter(int in_length, int out_length) {
+  const int scaled_out = out_length * 16;
+
+  if (scaled_out >= in_length * 16) return filteredinterp_filters1000;
+  if (scaled_out >= in_length * 13) return filteredinterp_filters875;
+  if (scaled_out >= in_length * 11) return filteredinterp_filters750;
+  if (scaled_out >= in_length * 9) return filteredinterp_filters625;
+  return filteredinterp_filters500;
+}
+
+// Resamples one line of 8-bit samples from in_length to out_length entries.
+//   input / in_length    source line and its number of samples
+//   output / out_length  destination line and its number of samples
+//   interp_filters       flat filter bank: one row of interp_taps coefficients
+//                        per sub-pixel phase (row index = sub_pel)
+//   interp_taps          number of taps per row
+// Positions are tracked in fixed point with RS_SCALE_SUBPEL_BITS fractional
+// bits. Taps that would fall outside [0, in_length - 1] read the nearest edge
+// sample instead.
+static void interpolate_core(const uint8_t *const input, int in_length,
+ uint8_t *output, int out_length,
+ const int16_t *interp_filters, int interp_taps) {
+ // Source-position increment per output sample (rounded to nearest).
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ // Source position of output sample 0: half of (in_length - out_length) /
+ // out_length. The two branches keep the shifted operand non-negative and
+ // apply the sign afterwards.
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint8_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ // x1: first output index whose leftmost tap (int_pel - interp_taps / 2 + 1)
+ // is not before the start of the input.
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ // x2: last output index whose rightmost tap (int_pel + interp_taps / 2) is
+ // still inside the input.
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ // No output sample has a fully interior window: clamp on both sides for
+ // every output sample.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ } else {
+ // Initial part.
+ // Only the left edge can be crossed here, so only the lower clamp is used.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // Middle part.
+ // x and y carry over from the loop above; no clamping is applied here.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ // End part.
+ // Only the right edge can be crossed here, so only the upper clamp is used.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ }
+ }
+}
+
+// Resamples one line of 8-bit samples from in_length to out_length entries,
+// using the SUBPEL_TAPS-tap filter bank that choose_interp_filter() selects
+// for this length ratio.
+static void interpolate(const uint8_t *const input, int in_length,
+                        uint8_t *output, int out_length) {
+  const InterpKernel *const kernels =
+      choose_interp_filter(in_length, out_length);
+  // interpolate_core() expects the bank as a flat coefficient array.
+  const int16_t *const flat_taps = &kernels[0][0];
+
+  interpolate_core(input, in_length, output, out_length, flat_taps,
+                   SUBPEL_TAPS);
+}
+
+// Returns the source-position step per output pixel when stretching in_length
+// samples to out_length samples, as a fixed-point value with
+// RS_SCALE_SUBPEL_BITS fractional bits (rounded to nearest).
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length) {
+  const int scaled_in = in_length << RS_SCALE_SUBPEL_BITS;
+  const int rounding = out_length / 2;
+  return (scaled_in + rounding) / out_length;
+}
+
+// Returns the initial fractional source position for the upscaling
+// convolution, masked to RS_SCALE_SUBPEL_MASK.
+//   in_length / out_length  downscaled and upscaled line lengths
+//   x_step_qn               per-pixel step from
+//                           av1_get_upscale_convolve_step()
+// The value combines the centring offset, RS_SCALE_EXTRA_OFF and half of the
+// error accumulated over the whole line by the rounded step.
+static int32_t get_upscale_convolve_x0(int in_length, int out_length,
+                                       int32_t x_step_qn) {
+  // Difference between the distance covered by out_length rounded steps and
+  // the exact input length.
+  const int err = out_length * x_step_qn - (in_length << RS_SCALE_SUBPEL_BITS);
+  const int half_gap = (out_length - in_length) << (RS_SCALE_SUBPEL_BITS - 1);
+  const int32_t centred = (-half_gap + out_length / 2) / out_length;
+  const int32_t x0 = centred + RS_SCALE_EXTRA_OFF - err / 2;
+
+  // Mask on the unsigned representation so a negative x0 wraps consistently.
+  return (int32_t)((uint32_t)x0 & RS_SCALE_SUBPEL_MASK);
+}
+
+#ifndef __clang_analyzer__
+// Downsamples an 8-bit line by two with the even-length symmetric kernel.
+// One output sample is written for every even input position i; coefficient
+// j weights the mirrored pair input[i - j] and input[i + 1 + j]. Reads that
+// would fall outside [0, length - 1] use the nearest edge sample.
+static void down2_symeven(const uint8_t *const input, int length,
+                          uint8_t *output) {
+  // Only half of the kernel is stored; the full kernel has 2 * half_taps taps.
+  const int16_t *const taps = av1_down2_symeven_half_filter;
+  const int half_taps = sizeof(av1_down2_symeven_half_filter) /
+                        sizeof(av1_down2_symeven_half_filter[0]);
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint8_t *dst = output;
+  int pos = 0;
+  int tap;
+
+  // [safe_begin, safe_end) is the range of positions that need no clamping.
+  // Both limits are bumped to even values because pos advances in steps of 2.
+  int safe_begin = half_taps;
+  int safe_end = length - half_taps;
+  safe_begin += safe_begin & 1;
+  safe_end += safe_end & 1;
+
+  if (safe_begin > safe_end) {
+    // Short input: clamp on both sides for every output sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding;
+      for (tap = 0; tap < half_taps; ++tap) {
+        const int left = AOMMAX(pos - tap, 0);
+        const int right = AOMMIN(pos + 1 + tap, last);
+        acc += (input[left] + input[right]) * taps[tap];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    return;
+  }
+
+  // Leading samples: only the left edge can be crossed.
+  for (; pos < safe_begin; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      const int left = AOMMAX(pos - tap, 0);
+      acc += (input[left] + input[pos + 1 + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Interior samples: no clamping.
+  for (; pos < safe_end; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      acc += (input[pos - tap] + input[pos + 1 + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Trailing samples: only the right edge can be crossed.
+  for (; pos < length; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      const int right = AOMMIN(pos + 1 + tap, last);
+      acc += (input[pos - tap] + input[right]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+}
+#endif
+
+// Downsamples an 8-bit line by two with the odd-length symmetric kernel.
+// One output sample is written for every even input position i; taps[0]
+// weights input[i] and coefficient j >= 1 weights the mirrored pair
+// input[i - j] and input[i + j]. Reads that would fall outside
+// [0, length - 1] use the nearest edge sample.
+static void down2_symodd(const uint8_t *const input, int length,
+                         uint8_t *output) {
+  // Only half of the kernel is stored; the full kernel has
+  // 2 * half_taps - 1 taps.
+  const int16_t *const taps = av1_down2_symodd_half_filter;
+  const int half_taps = sizeof(av1_down2_symodd_half_filter) /
+                        sizeof(av1_down2_symodd_half_filter[0]);
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint8_t *dst = output;
+  int pos = 0;
+  int tap;
+
+  // [safe_begin, safe_end) is the range of positions that need no clamping.
+  // Both limits are bumped to even values because pos advances in steps of 2.
+  int safe_begin = half_taps - 1;
+  int safe_end = length - half_taps + 1;
+  safe_begin += safe_begin & 1;
+  safe_end += safe_end & 1;
+
+  if (safe_begin > safe_end) {
+    // Short input: clamp on both sides for every output sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (tap = 1; tap < half_taps; ++tap) {
+        const int left = AOMMAX(pos - tap, 0);
+        const int right = AOMMIN(pos + tap, last);
+        acc += (input[left] + input[right]) * taps[tap];
+      }
+      *dst++ = clip_pixel(acc >> FILTER_BITS);
+    }
+    return;
+  }
+
+  // Leading samples: only the left edge can be crossed.
+  for (; pos < safe_begin; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      const int left = AOMMAX(pos - tap, 0);
+      acc += (input[left] + input[pos + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Interior samples: no clamping.
+  for (; pos < safe_end; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      acc += (input[pos - tap] + input[pos + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+  // Trailing samples: only the right edge can be crossed.
+  for (; pos < length; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      const int right = AOMMIN(pos + tap, last);
+      acc += (input[pos - tap] + input[right]) * taps[tap];
+    }
+    *dst++ = clip_pixel(acc >> FILTER_BITS);
+  }
+}
+
+// Returns the line length left after halving `length` `steps` times, rounding
+// up at every step. A non-positive `steps` returns `length` unchanged.
+static int get_down2_length(int length, int steps) {
+  int halved = length;
+  for (int left = steps; left > 0; --left) {
+    halved = (halved + 1) >> 1;
+  }
+  return halved;
+}
+
+// Returns how many factor-of-2 downsampling passes can be applied to
+// in_length while the halved length (rounded up) stays >= out_length.
+static int get_down2_steps(int in_length, int out_length) {
+  int steps = 0;
+  for (;;) {
+    const int halved = get_down2_length(in_length, 1);
+    if (halved < out_length) break;
+    ++steps;
+    in_length = halved;
+    // Special case: once the length reaches 1, every further call to
+    // get_down2_length() will also return 1, which would loop forever.
+    if (in_length == 1) break;
+  }
+  return steps;
+}
+
+// Resizes one 8-bit line from `length` to `olength` samples.
+// Equal lengths are copied verbatim. Otherwise the line is halved as many
+// times as get_down2_steps() allows, and interpolate() covers whatever ratio
+// is left. `otmp` is scratch space for the intermediate halved lines; it must
+// be non-NULL whenever at least one halving pass is needed.
+static void resize_multistep(const uint8_t *const input, int length,
+                             uint8_t *output, int olength, uint8_t *otmp) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+
+  const int steps = get_down2_steps(length, olength);
+  if (steps <= 0) {
+    interpolate(input, length, output, olength);
+    return;
+  }
+
+  assert(otmp != NULL);
+  // Two scratch regions inside otmp, used alternately so a pass never reads
+  // and writes the same region.
+  uint8_t *const scratch_even = otmp;
+  uint8_t *const scratch_odd = otmp + get_down2_length(length, 1);
+  const uint8_t *stage_in = input;
+  uint8_t *stage_out = NULL;
+  int cur_length = length;
+
+  for (int s = 0; s < steps; ++s) {
+    const int next_length = get_down2_length(cur_length, 1);
+    const int lands_on_target = (s == steps - 1) && (next_length == olength);
+
+    // Write the last pass straight into `output` when it already has the
+    // requested length.
+    if (lands_on_target) {
+      stage_out = output;
+    } else {
+      stage_out = (s & 1) ? scratch_odd : scratch_even;
+    }
+
+    if (cur_length & 1) {
+      down2_symodd(stage_in, cur_length, stage_out);
+    } else {
+      down2_symeven(stage_in, cur_length, stage_out);
+    }
+
+    stage_in = stage_out;
+    cur_length = next_length;
+  }
+
+  if (cur_length != olength) {
+    interpolate(stage_out, cur_length, output, olength);
+  }
+}
+
+// Gathers `len` samples of one image column (starting at img, `stride`
+// elements apart) into the contiguous array `arr`.
+static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) {
+  const uint8_t *src = img;
+  uint8_t *dst = arr;
+  for (int remaining = len; remaining > 0; --remaining) {
+    *dst++ = *src;
+    src += stride;
+  }
+}
+
+// Scatters `len` samples from the contiguous array `arr` into one image
+// column (starting at img, `stride` elements apart).
+static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) {
+  uint8_t *dst = img;
+  const uint8_t *src = arr;
+  for (int remaining = len; remaining > 0; --remaining) {
+    *dst = *src++;
+    dst += stride;
+  }
+}
+
+// Resizes an 8-bit plane from width x height to width2 x height2.
+// The plane is first resized horizontally, row by row, into an intermediate
+// width2 x height buffer; every column of that buffer is then resized
+// vertically into `output`. If any scratch allocation fails the function
+// returns without writing to `output`.
+static void resize_plane(const uint8_t *const input, int height, int width,
+                         int in_stride, uint8_t *output, int height2,
+                         int width2, int out_stride) {
+  // Horizontally resized plane (width2 x height).
+  uint8_t *intbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * width2 * height);
+  // Scratch for resize_multistep(); shared by the row and column passes.
+  uint8_t *tmpbuf =
+      (uint8_t *)aom_malloc(sizeof(uint8_t) * AOMMAX(width, height));
+  // One column before and after vertical resizing.
+  uint8_t *arrbuf = (uint8_t *)aom_malloc(sizeof(uint8_t) * height);
+  uint8_t *arrbuf2 = (uint8_t *)aom_malloc(sizeof(uint8_t) * height2);
+
+  if (intbuf != NULL && tmpbuf != NULL && arrbuf != NULL && arrbuf2 != NULL) {
+    assert(width > 0);
+    assert(height > 0);
+    assert(width2 > 0);
+    assert(height2 > 0);
+
+    for (int row = 0; row < height; ++row) {
+      resize_multistep(input + in_stride * row, width, intbuf + width2 * row,
+                       width2, tmpbuf);
+    }
+    for (int col = 0; col < width2; ++col) {
+      fill_col_to_arr(intbuf + col, width2, height, arrbuf);
+      resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf);
+      fill_arr_to_col(output + col, out_stride, height2, arrbuf2);
+    }
+  }
+
+  aom_free(intbuf);
+  aom_free(tmpbuf);
+  aom_free(arrbuf);
+  aom_free(arrbuf2);
+}
+
+// Horizontally upscales one tile-column rectangle of an 8-bit plane with the
+// normative filter bank.
+//   input, height, width, in_stride       source rectangle
+//   output, height2, width2, out_stride   destination rectangle
+//                                         (height2 must equal height)
+//   x_step_qn, x0_qn                      fixed-point step and start position
+//                                         forwarded to av1_convolve_horiz_rs()
+//   pad_left, pad_right                   non-zero to replicate the edge pixel
+//                                         into the border columns on that side
+// The border columns of `input` are overwritten temporarily and restored
+// before returning, so `input` is unchanged afterwards.
+// If a scratch buffer cannot be allocated the function returns without
+// touching `input` or `output`, the same silent-failure convention that
+// resize_plane() uses. Previously the NULL result was passed straight to
+// memcpy().
+// NOTE(review): the void return type gives callers no way to observe this
+// failure; confirm whether an error should be reported through the caller.
+static void upscale_normative_rect(const uint8_t *const input, int height,
+                                   int width, int in_stride, uint8_t *output,
+                                   int height2, int width2, int out_stride,
+                                   int x_step_qn, int x0_qn, int pad_left,
+                                   int pad_right) {
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  uint8_t *tmp_left = NULL;
+  uint8_t *tmp_right = NULL;
+  uint8_t *const in_tl = (uint8_t *)(input - border_cols);  // Cast off 'const'
+  uint8_t *const in_tr = (uint8_t *)(input + width);
+
+  // Allocate every scratch buffer before modifying the input, so that an
+  // allocation failure cannot leave the borders half overwritten.
+  if (pad_left) {
+    tmp_left = (uint8_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (tmp_left == NULL) return;
+  }
+  if (pad_right) {
+    tmp_right =
+        (uint8_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (tmp_right == NULL) {
+      aom_free(tmp_left);
+      return;
+    }
+  }
+
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_cols);
+      memset(in_tl + i * in_stride, input[i * in_stride], border_cols);
+    }
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_cols);
+      memset(in_tr + i * in_stride, input[i * in_stride + width - 1],
+             border_cols);
+    }
+  }
+
+  av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
+                        height2, &av1_resize_filter_normative[0][0], x0_qn,
+                        x_step_qn);
+
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_cols);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_cols);
+    }
+    aom_free(tmp_right);
+  }
+}
+
+// High-bit-depth counterpart of interpolate_core(): resamples one line of
+// 16-bit samples from in_length to out_length entries.
+//   input / in_length    source line and its number of samples
+//   output / out_length  destination line and its number of samples
+//   bd                   bit depth passed to clip_pixel_highbd()
+//   interp_filters       flat filter bank: one row of interp_taps coefficients
+//                        per sub-pixel phase (row index = sub_pel)
+//   interp_taps          number of taps per row
+// Positions are tracked in fixed point with RS_SCALE_SUBPEL_BITS fractional
+// bits. Taps that would fall outside [0, in_length - 1] read the nearest edge
+// sample instead.
+static void highbd_interpolate_core(const uint16_t *const input, int in_length,
+ uint16_t *output, int out_length, int bd,
+ const int16_t *interp_filters,
+ int interp_taps) {
+ // Source-position increment per output sample (rounded to nearest).
+ const int32_t delta =
+ (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) /
+ out_length;
+ // Source position of output sample 0: half of (in_length - out_length) /
+ // out_length. The two branches keep the shifted operand non-negative and
+ // apply the sign afterwards.
+ const int32_t offset =
+ in_length > out_length
+ ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length
+ : -(((int32_t)(out_length - in_length)
+ << (RS_SCALE_SUBPEL_BITS - 1)) +
+ out_length / 2) /
+ out_length;
+ uint16_t *optr = output;
+ int x, x1, x2, sum, k, int_pel, sub_pel;
+ int32_t y;
+
+ // x1: first output index whose leftmost tap (int_pel - interp_taps / 2 + 1)
+ // is not before the start of the input.
+ x = 0;
+ y = offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) {
+ x++;
+ y += delta;
+ }
+ x1 = x;
+ // x2: last output index whose rightmost tap (int_pel + interp_taps / 2) is
+ // still inside the input.
+ x = out_length - 1;
+ y = delta * x + offset + RS_SCALE_EXTRA_OFF;
+ while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >=
+ in_length) {
+ x--;
+ y -= delta;
+ }
+ x2 = x;
+ if (x1 > x2) {
+ // No output sample has a fully interior window: clamp on both sides for
+ // every output sample.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length;
+ ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k) {
+ const int pk = int_pel - interp_taps / 2 + 1 + k;
+ sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)];
+ }
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ } else {
+ // Initial part.
+ // Only the left edge can be crossed here, so only the lower clamp is used.
+ for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // Middle part.
+ // x and y carry over from the loop above; no clamping is applied here.
+ for (; x <= x2; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ // End part.
+ // Only the right edge can be crossed here, so only the upper clamp is used.
+ for (; x < out_length; ++x, y += delta) {
+ int_pel = y >> RS_SCALE_SUBPEL_BITS;
+ sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK;
+ const int16_t *filter = &interp_filters[sub_pel * interp_taps];
+ sum = 0;
+ for (k = 0; k < interp_taps; ++k)
+ sum += filter[k] *
+ input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)];
+ *optr++ = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ }
+ }
+}
+
+// Resamples one line of 16-bit samples from in_length to out_length entries,
+// using the SUBPEL_TAPS-tap filter bank that choose_interp_filter() selects
+// for this length ratio. `bd` is the bit depth used for clipping.
+static void highbd_interpolate(const uint16_t *const input, int in_length,
+                               uint16_t *output, int out_length, int bd) {
+  const InterpKernel *const kernels =
+      choose_interp_filter(in_length, out_length);
+  // highbd_interpolate_core() expects the bank as a flat coefficient array.
+  const int16_t *const flat_taps = &kernels[0][0];
+
+  highbd_interpolate_core(input, in_length, output, out_length, bd, flat_taps,
+                          SUBPEL_TAPS);
+}
+
+#ifndef __clang_analyzer__
+// Downsamples a 16-bit line by two with the even-length symmetric kernel.
+// One output sample is written for every even input position i; coefficient
+// j weights the mirrored pair input[i - j] and input[i + 1 + j]. Reads that
+// would fall outside [0, length - 1] use the nearest edge sample. Results
+// are clipped for bit depth `bd`.
+static void highbd_down2_symeven(const uint16_t *const input, int length,
+                                 uint16_t *output, int bd) {
+  // Only half of the kernel is stored; the full kernel has 2 * half_taps taps.
+  const int16_t *const taps = av1_down2_symeven_half_filter;
+  const int half_taps = sizeof(av1_down2_symeven_half_filter) /
+                        sizeof(av1_down2_symeven_half_filter[0]);
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint16_t *dst = output;
+  int pos = 0;
+  int tap;
+
+  // [safe_begin, safe_end) is the range of positions that need no clamping.
+  // Both limits are bumped to even values because pos advances in steps of 2.
+  int safe_begin = half_taps;
+  int safe_end = length - half_taps;
+  safe_begin += safe_begin & 1;
+  safe_end += safe_end & 1;
+
+  if (safe_begin > safe_end) {
+    // Short input: clamp on both sides for every output sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding;
+      for (tap = 0; tap < half_taps; ++tap) {
+        const int left = AOMMAX(0, pos - tap);
+        const int right = AOMMIN(pos + 1 + tap, last);
+        acc += (input[left] + input[right]) * taps[tap];
+      }
+      *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+    }
+    return;
+  }
+
+  // Leading samples: only the left edge can be crossed.
+  for (; pos < safe_begin; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      const int left = AOMMAX(0, pos - tap);
+      acc += (input[left] + input[pos + 1 + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Interior samples: no clamping.
+  for (; pos < safe_end; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      acc += (input[pos - tap] + input[pos + 1 + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Trailing samples: only the right edge can be crossed.
+  for (; pos < length; pos += 2) {
+    int acc = rounding;
+    for (tap = 0; tap < half_taps; ++tap) {
+      const int right = AOMMIN(pos + 1 + tap, last);
+      acc += (input[pos - tap] + input[right]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+}
+
+// Downsamples a 16-bit line by two with the odd-length symmetric kernel.
+// One output sample is written for every even input position i; taps[0]
+// weights input[i] and coefficient j >= 1 weights the mirrored pair
+// input[i - j] and input[i + j]. Reads that would fall outside
+// [0, length - 1] use the nearest edge sample. Results are clipped for bit
+// depth `bd`.
+static void highbd_down2_symodd(const uint16_t *const input, int length,
+                                uint16_t *output, int bd) {
+  // Only half of the kernel is stored; the full kernel has
+  // 2 * half_taps - 1 taps.
+  const int16_t *const taps = av1_down2_symodd_half_filter;
+  const int half_taps = sizeof(av1_down2_symodd_half_filter) /
+                        sizeof(av1_down2_symodd_half_filter[0]);
+  const int rounding = 1 << (FILTER_BITS - 1);
+  const int last = length - 1;
+  uint16_t *dst = output;
+  int pos = 0;
+  int tap;
+
+  // [safe_begin, safe_end) is the range of positions that need no clamping.
+  // Both limits are bumped to even values because pos advances in steps of 2.
+  int safe_begin = half_taps - 1;
+  int safe_end = length - half_taps + 1;
+  safe_begin += safe_begin & 1;
+  safe_end += safe_end & 1;
+
+  if (safe_begin > safe_end) {
+    // Short input: clamp on both sides for every output sample.
+    for (; pos < length; pos += 2) {
+      int acc = rounding + input[pos] * taps[0];
+      for (tap = 1; tap < half_taps; ++tap) {
+        const int left = AOMMAX(pos - tap, 0);
+        const int right = AOMMIN(pos + tap, last);
+        acc += (input[left] + input[right]) * taps[tap];
+      }
+      *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+    }
+    return;
+  }
+
+  // Leading samples: only the left edge can be crossed.
+  for (; pos < safe_begin; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      const int left = AOMMAX(pos - tap, 0);
+      acc += (input[left] + input[pos + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Interior samples: no clamping.
+  for (; pos < safe_end; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      acc += (input[pos - tap] + input[pos + tap]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+  // Trailing samples: only the right edge can be crossed.
+  for (; pos < length; pos += 2) {
+    int acc = rounding + input[pos] * taps[0];
+    for (tap = 1; tap < half_taps; ++tap) {
+      const int right = AOMMIN(pos + tap, last);
+      acc += (input[pos - tap] + input[right]) * taps[tap];
+    }
+    *dst++ = clip_pixel_highbd(acc >> FILTER_BITS, bd);
+  }
+}
+#endif
+
+// Resizes one 16-bit line from `length` to `olength` samples.
+// Equal lengths are copied verbatim. Otherwise the line is halved as many
+// times as get_down2_steps() allows, and highbd_interpolate() covers whatever
+// ratio is left. `otmp` is scratch space for the intermediate halved lines;
+// it must be non-NULL whenever at least one halving pass is needed. `bd` is
+// the bit depth used for clipping.
+static void highbd_resize_multistep(const uint16_t *const input, int length,
+                                    uint16_t *output, int olength,
+                                    uint16_t *otmp, int bd) {
+  if (length == olength) {
+    memcpy(output, input, sizeof(output[0]) * length);
+    return;
+  }
+
+  const int steps = get_down2_steps(length, olength);
+  if (steps <= 0) {
+    highbd_interpolate(input, length, output, olength, bd);
+    return;
+  }
+
+  assert(otmp != NULL);
+  // Two scratch regions inside otmp, used alternately so a pass never reads
+  // and writes the same region.
+  uint16_t *const scratch_even = otmp;
+  uint16_t *const scratch_odd = otmp + get_down2_length(length, 1);
+  const uint16_t *stage_in = input;
+  uint16_t *stage_out = NULL;
+  int cur_length = length;
+
+  for (int s = 0; s < steps; ++s) {
+    const int next_length = get_down2_length(cur_length, 1);
+    const int lands_on_target = (s == steps - 1) && (next_length == olength);
+
+    // Write the last pass straight into `output` when it already has the
+    // requested length.
+    if (lands_on_target) {
+      stage_out = output;
+    } else {
+      stage_out = (s & 1) ? scratch_odd : scratch_even;
+    }
+
+    if (cur_length & 1) {
+      highbd_down2_symodd(stage_in, cur_length, stage_out, bd);
+    } else {
+      highbd_down2_symeven(stage_in, cur_length, stage_out, bd);
+    }
+
+    stage_in = stage_out;
+    cur_length = next_length;
+  }
+
+  if (cur_length != olength) {
+    highbd_interpolate(stage_out, cur_length, output, olength, bd);
+  }
+}
+
+// Gathers `len` 16-bit samples of one image column (starting at img,
+// `stride` elements apart) into the contiguous array `arr`.
+static void highbd_fill_col_to_arr(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  const uint16_t *src = img;
+  uint16_t *dst = arr;
+  for (int remaining = len; remaining > 0; --remaining) {
+    *dst++ = *src;
+    src += stride;
+  }
+}
+
+// Scatters `len` 16-bit samples from the contiguous array `arr` into one
+// image column (starting at img, `stride` elements apart).
+static void highbd_fill_arr_to_col(uint16_t *img, int stride, int len,
+                                   uint16_t *arr) {
+  uint16_t *dst = img;
+  const uint16_t *src = arr;
+  for (int remaining = len; remaining > 0; --remaining) {
+    *dst = *src++;
+    dst += stride;
+  }
+}
+
+// Resizes a high-bit-depth plane from width x height to width2 x height2.
+// `input` and `output` are passed as uint8_t pointers and converted with
+// CONVERT_TO_SHORTPTR() before use. The plane is first resized horizontally,
+// row by row, into an intermediate width2 x height buffer; every column of
+// that buffer is then resized vertically into `output`. If any scratch
+// allocation fails the function returns without writing to `output`.
+static void highbd_resize_plane(const uint8_t *const input, int height,
+                                int width, int in_stride, uint8_t *output,
+                                int height2, int width2, int out_stride,
+                                int bd) {
+  // Horizontally resized plane (width2 x height).
+  uint16_t *intbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * width2 * height);
+  // Scratch for highbd_resize_multistep(); shared by both passes.
+  uint16_t *tmpbuf =
+      (uint16_t *)aom_malloc(sizeof(uint16_t) * AOMMAX(width, height));
+  // One column before and after vertical resizing.
+  uint16_t *arrbuf = (uint16_t *)aom_malloc(sizeof(uint16_t) * height);
+  uint16_t *arrbuf2 = (uint16_t *)aom_malloc(sizeof(uint16_t) * height2);
+
+  if (intbuf != NULL && tmpbuf != NULL && arrbuf != NULL && arrbuf2 != NULL) {
+    for (int row = 0; row < height; ++row) {
+      highbd_resize_multistep(CONVERT_TO_SHORTPTR(input + in_stride * row),
+                              width, intbuf + width2 * row, width2, tmpbuf, bd);
+    }
+    for (int col = 0; col < width2; ++col) {
+      highbd_fill_col_to_arr(intbuf + col, width2, height, arrbuf);
+      highbd_resize_multistep(arrbuf, height, arrbuf2, height2, tmpbuf, bd);
+      highbd_fill_arr_to_col(CONVERT_TO_SHORTPTR(output + col), out_stride,
+                             height2, arrbuf2);
+    }
+  }
+
+  aom_free(intbuf);
+  aom_free(tmpbuf);
+  aom_free(arrbuf);
+  aom_free(arrbuf2);
+}
+
+// Horizontally upscales one tile-column rectangle of a high-bit-depth plane
+// with the normative filter bank.
+//   input, height, width, in_stride       source rectangle (converted with
+//                                         CONVERT_TO_SHORTPTR())
+//   output, height2, width2, out_stride   destination rectangle
+//                                         (height2 must equal height)
+//   x_step_qn, x0_qn                      fixed-point step and start position
+//                                         forwarded to
+//                                         av1_highbd_convolve_horiz_rs()
+//   pad_left, pad_right                   non-zero to replicate the edge pixel
+//                                         into the border columns on that side
+//   bd                                    bit depth forwarded to the convolve
+// The border columns of the input are overwritten temporarily and restored
+// before returning, so the input is unchanged afterwards.
+// If a scratch buffer cannot be allocated the function returns without
+// touching the input or `output`, the same silent-failure convention that
+// highbd_resize_plane() uses. Previously the NULL result was passed straight
+// to memcpy().
+// NOTE(review): the void return type gives callers no way to observe this
+// failure; confirm whether an error should be reported through the caller.
+static void highbd_upscale_normative_rect(const uint8_t *const input,
+                                          int height, int width, int in_stride,
+                                          uint8_t *output, int height2,
+                                          int width2, int out_stride,
+                                          int x_step_qn, int x0_qn,
+                                          int pad_left, int pad_right, int bd) {
+  assert(width > 0);
+  assert(height > 0);
+  assert(width2 > 0);
+  assert(height2 > 0);
+  assert(height2 == height);
+
+  // Extend the left/right pixels of the tile column if needed
+  // (either because we can't sample from other tiles, or because we're at
+  // a frame edge).
+  // Save the overwritten pixels into tmp_left and tmp_right.
+  // Note: Because we pass input-1 to av1_convolve_horiz_rs, we need one extra
+  // column of border pixels compared to what we'd naively think.
+  const int border_cols = UPSCALE_NORMATIVE_TAPS / 2 + 1;
+  const int border_size = border_cols * sizeof(uint16_t);
+  uint16_t *tmp_left = NULL;
+  uint16_t *tmp_right = NULL;
+  uint16_t *const input16 = CONVERT_TO_SHORTPTR(input);
+  uint16_t *const in_tl = input16 - border_cols;
+  uint16_t *const in_tr = input16 + width;
+
+  // Allocate every scratch buffer before modifying the input, so that an
+  // allocation failure cannot leave the borders half overwritten.
+  if (pad_left) {
+    tmp_left = (uint16_t *)aom_malloc(sizeof(*tmp_left) * border_cols * height);
+    if (tmp_left == NULL) return;
+  }
+  if (pad_right) {
+    tmp_right =
+        (uint16_t *)aom_malloc(sizeof(*tmp_right) * border_cols * height);
+    if (tmp_right == NULL) {
+      aom_free(tmp_left);
+      return;
+    }
+  }
+
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_left + i * border_cols, in_tl + i * in_stride, border_size);
+      aom_memset16(in_tl + i * in_stride, input16[i * in_stride], border_cols);
+    }
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(tmp_right + i * border_cols, in_tr + i * in_stride, border_size);
+      aom_memset16(in_tr + i * in_stride, input16[i * in_stride + width - 1],
+                   border_cols);
+    }
+  }
+
+  av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
+                               CONVERT_TO_SHORTPTR(output), out_stride, width2,
+                               height2, &av1_resize_filter_normative[0][0],
+                               x0_qn, x_step_qn, bd);
+
+  // Restore the left/right border pixels
+  if (pad_left) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tl + i * in_stride, tmp_left + i * border_cols, border_size);
+    }
+    aom_free(tmp_left);
+  }
+  if (pad_right) {
+    for (int i = 0; i < height; i++) {
+      memcpy(in_tr + i * in_stride, tmp_right + i * border_cols, border_size);
+    }
+    aom_free(tmp_right);
+  }
+}
+
+// Resizes an 8-bit frame whose chroma planes are half the luma size in both
+// directions: luma goes from width x height to owidth x oheight, and each
+// chroma plane from (width / 2) x (height / 2) to (owidth / 2) x
+// (oheight / 2).
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  const int uv_height = height / 2;
+  const int uv_width = width / 2;
+  const int ouv_height = oheight / 2;
+  const int ouv_width = owidth / 2;
+
+  resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  resize_plane(u, uv_height, uv_width, uv_stride, ou, ouv_height, ouv_width,
+               ouv_stride);
+  resize_plane(v, uv_height, uv_width, uv_stride, ov, ouv_height, ouv_width,
+               ouv_stride);
+}
+
+// Resizes an 8-bit frame whose chroma planes are half the luma width but
+// full height: luma goes from width x height to owidth x oheight, and each
+// chroma plane from (width / 2) x height to (owidth / 2) x oheight.
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  const int uv_width = width / 2;
+  const int ouv_width = owidth / 2;
+
+  resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  resize_plane(u, height, uv_width, uv_stride, ou, oheight, ouv_width,
+               ouv_stride);
+  resize_plane(v, height, uv_width, uv_stride, ov, oheight, ouv_width,
+               ouv_stride);
+}
+
+// Resizes an 8-bit frame whose three planes all have the same dimensions:
+// every plane goes from width x height to owidth x oheight.
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+                         const uint8_t *const u, const uint8_t *const v,
+                         int uv_stride, int height, int width, uint8_t *oy,
+                         int oy_stride, uint8_t *ou, uint8_t *ov,
+                         int ouv_stride, int oheight, int owidth) {
+  // Luma.
+  resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride);
+  // Chroma, same size as luma.
+  resize_plane(u, height, width, uv_stride, ou, oheight, owidth, ouv_stride);
+  resize_plane(v, height, width, uv_stride, ov, oheight, owidth, ouv_stride);
+}
+
+// High-bit-depth variant of av1_resize_frame420(): luma goes from
+// width x height to owidth x oheight, and each chroma plane from
+// (width / 2) x (height / 2) to (owidth / 2) x (oheight / 2). `bd` is the
+// bit depth forwarded to highbd_resize_plane().
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  const int uv_height = height / 2;
+  const int uv_width = width / 2;
+  const int ouv_height = oheight / 2;
+  const int ouv_width = owidth / 2;
+
+  highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                      oy_stride, bd);
+  highbd_resize_plane(u, uv_height, uv_width, uv_stride, ou, ouv_height,
+                      ouv_width, ouv_stride, bd);
+  highbd_resize_plane(v, uv_height, uv_width, uv_stride, ov, ouv_height,
+                      ouv_width, ouv_stride, bd);
+}
+
+// High-bit-depth variant of av1_resize_frame422(): luma goes from
+// width x height to owidth x oheight, and each chroma plane from
+// (width / 2) x height to (owidth / 2) x oheight. `bd` is the bit depth
+// forwarded to highbd_resize_plane().
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  const int uv_width = width / 2;
+  const int ouv_width = owidth / 2;
+
+  highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                      oy_stride, bd);
+  highbd_resize_plane(u, height, uv_width, uv_stride, ou, oheight, ouv_width,
+                      ouv_stride, bd);
+  highbd_resize_plane(v, height, uv_width, uv_stride, ov, oheight, ouv_width,
+                      ouv_stride, bd);
+}
+
+// High-bit-depth variant of av1_resize_frame444(): every plane goes from
+// width x height to owidth x oheight. `bd` is the bit depth forwarded to
+// highbd_resize_plane().
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+                                const uint8_t *const u, const uint8_t *const v,
+                                int uv_stride, int height, int width,
+                                uint8_t *oy, int oy_stride, uint8_t *ou,
+                                uint8_t *ov, int ouv_stride, int oheight,
+                                int owidth, int bd) {
+  // Luma.
+  highbd_resize_plane(y, height, width, y_stride, oy, oheight, owidth,
+                      oy_stride, bd);
+  // Chroma, same size as luma.
+  highbd_resize_plane(u, height, width, uv_stride, ou, oheight, owidth,
+                      ouv_stride, bd);
+  highbd_resize_plane(v, height, width, uv_stride, ov, oheight, owidth,
+                      ouv_stride, bd);
+}
+
+// Resizes every plane of `src` into `dst`, using the crop dimensions and
+// strides stored in each buffer, then extends the borders of `dst`.
+// Planes are processed by the high-bit-depth path when `src` has
+// YV12_FLAG_HIGHBITDEPTH set, in which case `bd` is the bit depth used.
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                 YV12_BUFFER_CONFIG *dst, int bd,
+                                 const int num_planes) {
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  for (int plane = 0; plane < AOMMIN(num_planes, MAX_MB_PLANE); ++plane) {
+    // Index 0 holds the luma geometry, index 1 the shared chroma geometry.
+    const int is_uv = plane > 0;
+
+    if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+      highbd_resize_plane(src->buffers[plane], src->crop_heights[is_uv],
+                          src->crop_widths[is_uv], src->strides[is_uv],
+                          dst->buffers[plane], dst->crop_heights[is_uv],
+                          dst->crop_widths[is_uv], dst->strides[is_uv], bd);
+    } else {
+      resize_plane(src->buffers[plane], src->crop_heights[is_uv],
+                   src->crop_widths[is_uv], src->strides[is_uv],
+                   dst->buffers[plane], dst->crop_heights[is_uv],
+                   dst->crop_widths[is_uv], dst->strides[is_uv]);
+    }
+  }
+  aom_extend_frame_borders(dst, num_planes);
+}
+
+// Horizontally upscales `rows` rows of one plane from the downscaled width
+// (derived from cm->width) to the upscaled width (derived from
+// cm->superres_upscaled_width), one tile column at a time.
+//   cm                frame-level state: sizes, superres denominator, tile
+//                     layout, sequence parameters
+//   src, src_stride   downscaled plane
+//   dst, dst_stride   upscaled plane
+//   plane             plane index; planes > 0 use the chroma subsampling
+//   rows              number of rows to process
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int plane, int rows) {
+ const int is_uv = (plane > 0);
+ // Horizontal subsampling shift; applied to chroma planes only.
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int downscaled_plane_width = ROUND_POWER_OF_TWO(cm->width, ss_x);
+ const int upscaled_plane_width =
+ ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+ const int superres_denom = cm->superres_scale_denominator;
+
+ TileInfo tile_col;
+ // Step and initial phase are computed once for the whole plane; the phase
+ // is then carried from one tile column to the next (see end of loop).
+ const int32_t x_step_qn = av1_get_upscale_convolve_step(
+ downscaled_plane_width, upscaled_plane_width);
+ int32_t x0_qn = get_upscale_convolve_x0(downscaled_plane_width,
+ upscaled_plane_width, x_step_qn);
+
+ for (int j = 0; j < cm->tile_cols; j++) {
+ av1_tile_set_col(&tile_col, cm, j);
+ // Determine the limits of this tile column in both the source
+ // and destination images.
+ // Note: The actual location which we start sampling from is
+ // (downscaled_x0 - 1 + (x0_qn/2^14)), and this quantity increases
+ // by exactly dst_width * (x_step_qn/2^14) pixels each iteration.
+ const int downscaled_x0 = tile_col.mi_col_start << (MI_SIZE_LOG2 - ss_x);
+ const int downscaled_x1 = tile_col.mi_col_end << (MI_SIZE_LOG2 - ss_x);
+ const int src_width = downscaled_x1 - downscaled_x0;
+
+ const int upscaled_x0 = (downscaled_x0 * superres_denom) / SCALE_NUMERATOR;
+ int upscaled_x1;
+ if (j == cm->tile_cols - 1) {
+ // Note that we can't just use AOMMIN here - due to rounding,
+ // (downscaled_x1 * superres_denom) / SCALE_NUMERATOR may be less than
+ // upscaled_plane_width.
+ upscaled_x1 = upscaled_plane_width;
+ } else {
+ upscaled_x1 = (downscaled_x1 * superres_denom) / SCALE_NUMERATOR;
+ }
+
+ const uint8_t *const src_ptr = src + downscaled_x0;
+ uint8_t *const dst_ptr = dst + upscaled_x0;
+ const int dst_width = upscaled_x1 - upscaled_x0;
+
+ // Only the outermost tile columns replicate their edge pixel into the
+ // border.
+ const int pad_left = (j == 0);
+ const int pad_right = (j == cm->tile_cols - 1);
+
+ if (cm->seq_params.use_highbitdepth)
+ highbd_upscale_normative_rect(src_ptr, rows, src_width, src_stride,
+ dst_ptr, rows, dst_width, dst_stride,
+ x_step_qn, x0_qn, pad_left, pad_right,
+ cm->seq_params.bit_depth);
+ else
+ upscale_normative_rect(src_ptr, rows, src_width, src_stride, dst_ptr,
+ rows, dst_width, dst_stride, x_step_qn, x0_qn,
+ pad_left, pad_right);
+
+ // Update the fractional pixel offset to prepare for the next tile column.
+ x0_qn += (dst_width * x_step_qn) - (src_width << RS_SCALE_SUBPEL_BITS);
+ }
+}
+
+// Upscales every plane of `src` into `dst` with
+// av1_upscale_normative_rows(), covering all cropped rows of each plane, and
+// then extends the borders of `dst`.
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+                                            const YV12_BUFFER_CONFIG *src,
+                                            YV12_BUFFER_CONFIG *dst) {
+  const int num_planes = av1_num_planes(cm);
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    // Index 0 holds the luma geometry, index 1 the shared chroma geometry.
+    const int is_uv = (plane > 0);
+    const int plane_rows = src->crop_heights[is_uv];
+
+    av1_upscale_normative_rows(cm, src->buffers[plane], src->strides[is_uv],
+                               dst->buffers[plane], dst->strides[is_uv], plane,
+                               plane_rows);
+  }
+
+  aom_extend_frame_borders(dst, num_planes);
+}
+
+// Returns a frame whose luma crop size matches cm->width x cm->height.
+//
+// If 'unscaled' already has those dimensions it is returned untouched.
+// Otherwise 'unscaled' is resized into 'scaled' with
+// av1_resize_and_extend_frame() and 'scaled' is returned.
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled) {
+ const int num_planes = av1_num_planes(cm);
+ if (cm->width != unscaled->y_crop_width ||
+ cm->height != unscaled->y_crop_height) {
+ av1_resize_and_extend_frame(unscaled, scaled, (int)cm->seq_params.bit_depth,
+ num_planes);
+ return scaled;
+ } else {
+ return unscaled;
+ }
+}
+
+// Calculates the scaled dimension given the original dimension and the scale
+// denominator.
+//
+// *dim is replaced in place by (*dim * SCALE_NUMERATOR) / denom, rounded to
+// the nearest integer (denom / 2 is added before the division). When denom
+// equals SCALE_NUMERATOR the value is left unchanged.
+static void calculate_scaled_size_helper(int *dim, int denom) {
+ if (denom != SCALE_NUMERATOR) {
+ // Alternative kept for reference, to be used if *dim must be even:
+ // *dim = (*dim * SCALE_NUMERATOR + denom) / (2 * denom);
+ // *dim <<= 1;
+ *dim = (*dim * SCALE_NUMERATOR + denom / 2) / (denom);
+ }
+}
+
+// Scales both *width and *height in place by SCALE_NUMERATOR / resize_denom
+// using calculate_scaled_size_helper().
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom) {
+ calculate_scaled_size_helper(width, resize_denom);
+ calculate_scaled_size_helper(height, resize_denom);
+}
+
+// Scales *width in place by SCALE_NUMERATOR / superres_denom using
+// calculate_scaled_size_helper(). 'height' is accepted but never read or
+// written: only the width is changed here.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+ int superres_denom) {
+ (void)height;
+ calculate_scaled_size_helper(width, superres_denom);
+}
+
+// Maps a superres-scaled width back towards the unscaled width:
+// *width becomes (*width * denom) / SCALE_NUMERATOR. 'height' is never read
+// or written. Nothing changes when denom equals SCALE_NUMERATOR.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom) {
+ if (denom != SCALE_NUMERATOR) {
+ // Note: the forward direction (calculate_scaled_size_helper()) adds
+ // denom / 2 before dividing, i.e. rounds to nearest. Here the integer
+ // division truncates, so the result is rounded *down*.
+ *width = *width * denom / SCALE_NUMERATOR;
+ (void)height;
+ }
+}
+
+// Copy only the config data from 'src' to 'dst'.
+// The fields copied are: bit_depth, color_primaries,
+// transfer_characteristics, matrix_coefficients, monochrome,
+// chroma_sample_position and color_range. No other member of 'dst' is
+// touched by this function.
+static void copy_buffer_config(const YV12_BUFFER_CONFIG *const src,
+ YV12_BUFFER_CONFIG *const dst) {
+ dst->bit_depth = src->bit_depth;
+ dst->color_primaries = src->color_primaries;
+ dst->transfer_characteristics = src->transfer_characteristics;
+ dst->matrix_coefficients = src->matrix_coefficients;
+ dst->monochrome = src->monochrome;
+ dst->chroma_sample_position = src->chroma_sample_position;
+ dst->color_range = src->color_range;
+}
+
+// TODO(afergs): Look for in-place upscaling
+// TODO(afergs): aom_ vs av1_ functions? Which can I use?
+// Upscale decoded image.
+//
+// Returns immediately when av1_superres_scaled(cm) is false. Otherwise:
+//  1. the current frame (get_frame_new_buffer(cm)) is copied into a
+//     temporary buffer of size ALIGN_POWER_OF_TWO(cm->width, 3) x cm->height;
+//  2. the current frame buffer is reallocated at
+//     cm->superres_upscaled_width x cm->superres_upscaled_height;
+//  3. the temporary copy is upscaled back into the current frame with
+//     av1_upscale_normative_and_extend_frame();
+//  4. the temporary buffer is freed.
+//
+// pool: when non-NULL, the frame buffer is released and reallocated through
+//       the pool's callbacks; when NULL, aom_alloc_frame_buffer() is used
+//       and the config data is saved and restored around it.
+//
+// Every failure is reported through aom_internal_error() with
+// AOM_CODEC_MEM_ERROR.
+// NOTE(review): none of the error paths frees copy_buffer; confirm whether
+// aom_internal_error() returns to this function.
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool) {
+ const int num_planes = av1_num_planes(cm);
+ if (!av1_superres_scaled(cm)) return;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+
+ YV12_BUFFER_CONFIG copy_buffer;
+ memset(&copy_buffer, 0, sizeof(copy_buffer));
+
+ YV12_BUFFER_CONFIG *const frame_to_show = get_frame_new_buffer(cm);
+
+ // The temporary copy is allocated with the width rounded up via
+ // ALIGN_POWER_OF_TWO(cm->width, 3).
+ const int aligned_width = ALIGN_POWER_OF_TWO(cm->width, 3);
+ if (aom_alloc_frame_buffer(
+ &copy_buffer, aligned_width, cm->height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate copy buffer for superres upscaling");
+
+ // Copy function assumes the frames are the same size.
+ // Note that it does not copy YV12_BUFFER_CONFIG config data.
+ aom_yv12_copy_frame(frame_to_show, &copy_buffer, num_planes);
+
+ assert(copy_buffer.y_crop_width == aligned_width);
+ assert(copy_buffer.y_crop_height == cm->height);
+
+ // Realloc the current frame buffer at a higher resolution in place.
+ if (pool != NULL) {
+ // Use callbacks if on the decoder.
+ aom_codec_frame_buffer_t *fb =
+ &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer;
+ aom_release_frame_buffer_cb_fn_t release_fb_cb = pool->release_fb_cb;
+ aom_get_frame_buffer_cb_fn_t cb = pool->get_fb_cb;
+ void *cb_priv = pool->cb_priv;
+
+ // Realloc with callback does not release the frame buffer - release first.
+ if (release_fb_cb(cb_priv, fb))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to free current frame buffer before superres upscaling");
+
+ // aom_realloc_frame_buffer() leaves config data for frame_to_show intact
+ if (aom_realloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, fb, cb, cb_priv))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate current frame buffer for superres upscaling");
+ } else {
+ // Make a copy of the config data for frame_to_show in copy_buffer
+ copy_buffer_config(frame_to_show, &copy_buffer);
+
+ // Don't use callbacks on the encoder.
+ // aom_alloc_frame_buffer() clears the config data for frame_to_show
+ if (aom_alloc_frame_buffer(
+ frame_to_show, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate current frame buffer for superres upscaling");
+
+ // Restore config data back to frame_to_show
+ copy_buffer_config(&copy_buffer, frame_to_show);
+ }
+ // TODO(afergs): verify frame_to_show is correct after realloc
+ // encoder:
+ // decoder:
+
+ assert(frame_to_show->y_crop_width == cm->superres_upscaled_width);
+ assert(frame_to_show->y_crop_height == cm->superres_upscaled_height);
+
+ // Scale up and back into frame_to_show.
+ assert(frame_to_show->y_crop_width != cm->width);
+ av1_upscale_normative_and_extend_frame(cm, &copy_buffer, frame_to_show);
+
+ // Free the copy buffer
+ aom_free_frame_buffer(&copy_buffer);
+}
diff --git a/third_party/aom/av1/common/resize.h b/third_party/aom/av1/common/resize.h
new file mode 100644
index 0000000000..9a59a8d63b
--- /dev/null
+++ b/third_party/aom/av1/common/resize.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESIZE_H_
+#define AOM_AV1_COMMON_RESIZE_H_
+
+#include <stdio.h>
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2, int width2,
+ int out_stride);
+void av1_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+void av1_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width, uint8_t *oy,
+ int oy_stride, uint8_t *ou, uint8_t *ov,
+ int ouv_stride, int oheight, int owidth);
+
+void av1_highbd_resize_plane(const uint8_t *const input, int height, int width,
+ int in_stride, uint8_t *output, int height2,
+ int width2, int out_stride, int bd);
+void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride,
+ const uint8_t *const u, const uint8_t *const v,
+ int uv_stride, int height, int width,
+ uint8_t *oy, int oy_stride, uint8_t *ou,
+ uint8_t *ov, int ouv_stride, int oheight,
+ int owidth, int bd);
+void av1_resize_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int bd,
+ const int num_planes);
+
+void av1_upscale_normative_rows(const AV1_COMMON *cm, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int plane, int rows);
+void av1_upscale_normative_and_extend_frame(const AV1_COMMON *cm,
+ const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+YV12_BUFFER_CONFIG *av1_scale_if_required(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *unscaled,
+ YV12_BUFFER_CONFIG *scaled);
+
+// Calculates the scaled dimensions from the given original dimensions and the
+// resize scale denominator.
+void av1_calculate_scaled_size(int *width, int *height, int resize_denom);
+
+// Similar to above, but calculates scaled dimensions after superres from the
+// given original dimensions and superres scale denominator.
+void av1_calculate_scaled_superres_size(int *width, int *height,
+ int superres_denom);
+
+// Inverse of av1_calculate_scaled_superres_size() above: calculates the
+// original dimensions from the given scaled dimensions and the scale
+// denominator.
+void av1_calculate_unscaled_superres_size(int *width, int *height, int denom);
+
+void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool);
+
+// Returns 1 if a superres upscaled frame is scaled and 0 otherwise.
+// The decision is made purely by comparing cm->width with
+// cm->superres_upscaled_width; the scale denominator is not consulted.
+static INLINE int av1_superres_scaled(const AV1_COMMON *cm) {
+ // Note: for some corner cases (e.g. cm->width of 1), there may be no scaling
+ // required even though cm->superres_scale_denominator != SCALE_NUMERATOR.
+ // So, the following check is more accurate.
+ return !(cm->width == cm->superres_upscaled_width);
+}
+
+#define UPSCALE_NORMATIVE_TAPS 8
+extern const int16_t av1_resize_filter_normative[1 << RS_SUBPEL_BITS]
+ [UPSCALE_NORMATIVE_TAPS];
+
+int32_t av1_get_upscale_convolve_step(int in_length, int out_length);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESIZE_H_
diff --git a/third_party/aom/av1/common/restoration.c b/third_party/aom/av1/common/restoration.c
new file mode 100644
index 0000000000..d276a915b5
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.c
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ */
+
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/restoration.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+#include "aom_ports/mem.h"
+
+// Parameter sets for the self-guided filter, one entry per set
+// (SGRPROJ_PARAMS entries in total).
+// The 's' values are calculated based on original 'r' and 'e' values in the
+// spec using GenSgrprojVtable().
+// Note: Setting r = 0 skips the filter; with corresponding s = -1 (invalid).
+// NOTE(review): each entry looks like { { r[0], r[1] }, { s[0], s[1] } },
+// inferred from the 0 / -1 pairing in the last six entries; confirm against
+// the definition of sgr_params_type.
+const sgr_params_type sgr_params[SGRPROJ_PARAMS] = {
+ { { 2, 1 }, { 140, 3236 } }, { { 2, 1 }, { 112, 2158 } },
+ { { 2, 1 }, { 93, 1618 } }, { { 2, 1 }, { 80, 1438 } },
+ { { 2, 1 }, { 70, 1295 } }, { { 2, 1 }, { 58, 1177 } },
+ { { 2, 1 }, { 47, 1079 } }, { { 2, 1 }, { 37, 996 } },
+ { { 2, 1 }, { 30, 925 } }, { { 2, 1 }, { 25, 863 } },
+ { { 0, 1 }, { -1, 2589 } }, { { 0, 1 }, { -1, 1618 } },
+ { { 0, 1 }, { -1, 1177 } }, { { 0, 1 }, { -1, 925 } },
+ { { 2, 0 }, { 56, -1 } }, { { 2, 0 }, { 22, -1 } },
+};
+
+// Returns the rectangle covering one whole plane of the frame.
+//
+// The rectangle starts at (0, 0). Its bottom is derived from cm->height and
+// its right edge from cm->superres_upscaled_width (i.e. the width after
+// superres upscaling). For chroma (is_uv != 0) each dimension is reduced with
+// ROUND_POWER_OF_TWO() by the corresponding subsampling flag; for luma the
+// shift is 0.
+AV1PixelRect av1_whole_frame_rect(const AV1_COMMON *cm, int is_uv) {
+ AV1PixelRect rect;
+
+ // Subsampling only applies to the chroma planes.
+ int ss_x = is_uv && cm->seq_params.subsampling_x;
+ int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ rect.top = 0;
+ rect.bottom = ROUND_POWER_OF_TWO(cm->height, ss_y);
+ rect.left = 0;
+ rect.right = ROUND_POWER_OF_TWO(cm->superres_upscaled_width, ss_x);
+ return rect;
+}
+
+// Count horizontal or vertical units per tile (use a width or height for
+// tile_size, respectively). We basically want to divide the tile size by the
+// size of a restoration unit. Rather than rounding up unconditionally as you
+// might expect, we round to nearest, which models the way a right or bottom
+// restoration unit can extend to up to 150% its normal width or height. The
+// max with 1 is to deal with tiles that are smaller than half of a restoration
+// unit.
+//
+// unit_size: size of one restoration unit along the chosen axis.
+// tile_size: extent of the tile along the same axis.
+// Returns max((tile_size + unit_size / 2) / unit_size, 1), using integer
+// arithmetic.
+int av1_lr_count_units_in_tile(int unit_size, int tile_size) {
+ return AOMMAX((tile_size + (unit_size >> 1)) / unit_size, 1);
+}
+
+// (Re)allocates rsi->unit_info and fills in the unit counts of 'rsi' for one
+// plane.
+//
+// cm:    supplies the frame dimensions (through av1_whole_frame_rect()) and
+//        is passed to CHECK_MEM_ERROR for allocation-failure reporting.
+// rsi:   restoration info to update; rsi->restoration_unit_size must already
+//        be set. Any previous rsi->unit_info is freed first.
+// is_uv: non-zero for a chroma plane.
+void av1_alloc_restoration_struct(AV1_COMMON *cm, RestorationInfo *rsi,
+ int is_uv) {
+ // We need to allocate enough space for restoration units to cover the
+ // largest tile.
+ // NOTE(review): despite the "tile" wording used in this function, the
+ // rectangle below is the whole-frame rectangle and ntiles is fixed at 1, so
+ // the allocation covers the whole plane as a single tile.
+ const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const int max_tile_w = tile_rect.right - tile_rect.left;
+ const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+ // To calculate hpertile and vpertile (horizontal and vertical units per
+ // tile), we basically want to divide the largest tile width or height by the
+ // size of a restoration unit. Rather than rounding up unconditionally as you
+ // might expect, we round to nearest, which models the way a right or bottom
+ // restoration unit can extend to up to 150% its normal width or height. The
+ // max with 1 is to deal with tiles that are smaller than half of a
+ // restoration unit.
+ const int unit_size = rsi->restoration_unit_size;
+ const int hpertile = av1_lr_count_units_in_tile(unit_size, max_tile_w);
+ const int vpertile = av1_lr_count_units_in_tile(unit_size, max_tile_h);
+
+ rsi->units_per_tile = hpertile * vpertile;
+ rsi->horz_units_per_tile = hpertile;
+ rsi->vert_units_per_tile = vpertile;
+
+ const int ntiles = 1;
+ const int nunits = ntiles * rsi->units_per_tile;
+
+ // Drop the old array before allocating the new one; its contents are not
+ // carried over.
+ aom_free(rsi->unit_info);
+ CHECK_MEM_ERROR(cm, rsi->unit_info,
+ (RestorationUnitInfo *)aom_memalign(
+ 16, sizeof(*rsi->unit_info) * nunits));
+}
+
+// Frees rst_info->unit_info and resets the pointer to NULL so that a later
+// free or reallocation does not see a stale pointer.
+void av1_free_restoration_struct(RestorationInfo *rst_info) {
+ aom_free(rst_info->unit_info);
+ rst_info->unit_info = NULL;
+}
+
+// This block is compiled out (#if 0). It is the generator referred to by the
+// comment on sgr_params.
+#if 0
+// Pair of values for each sgrproj parameter:
+// Index 0 corresponds to r[0], e[0]
+// Index 1 corresponds to r[1], e[1]
+int sgrproj_mtable[SGRPROJ_PARAMS][2];
+
+// Fills sgrproj_mtable from the r and e members of each sgr_params entry.
+// For an enabled filter with n = (2r + 1)^2 the stored value is
+// (1 << SGRPROJ_MTABLE_BITS) / (n * n * e), rounded to nearest.
+static void GenSgrprojVtable() {
+ for (int i = 0; i < SGRPROJ_PARAMS; ++i) {
+ const sgr_params_type *const params = &sgr_params[i];
+ for (int j = 0; j < 2; ++j) {
+ const int e = params->e[j];
+ const int r = params->r[j];
+ if (r == 0) { // filter is disabled
+ sgrproj_mtable[i][j] = -1; // mark invalid
+ } else { // filter is enabled
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const int n2e = n * n * e;
+ assert(n2e != 0);
+ sgrproj_mtable[i][j] = (((1 << SGRPROJ_MTABLE_BITS) + n2e / 2) / n2e);
+ }
+ }
+ }
+}
+#endif
+
+// Precalculation hook for loop restoration. The only work it could do, the
+// call to GenSgrprojVtable(), is compiled out, so this function is currently
+// a no-op.
+//
+// The parameter list is spelled (void) so that the definition is a proper
+// prototype: in C an empty "()" leaves the parameters unspecified and lets
+// callers pass arguments unchecked. This stays compatible with an existing
+// "()" declaration.
+void av1_loop_restoration_precal(void) {
+#if 0
+ GenSgrprojVtable();
+#endif
+}
+
+// Pads an 8-bit plane in place by replicating its edge pixels.
+//
+// data:        pointer to the top-left pixel of the width x height area.
+// stride:      distance in pixels between the starts of consecutive rows.
+// border_horz: number of columns written to the left and to the right of
+//              every row.
+// border_vert: number of rows written above and below the area.
+// The memory around 'data' must be large enough to hold these borders.
+static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert) {
+ uint8_t *data_p;
+ int i;
+ // Horizontal pass: repeat the first and the last pixel of each row.
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ memset(data_p - border_horz, data_p[0], border_horz);
+ memset(data_p + width, data_p[width - 1], border_horz);
+ }
+ // Vertical pass: data_p now points at the start of row 0 including its left
+ // border, so each copy carries the freshly written side borders too.
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p, width + 2 * border_horz);
+ }
+ for (i = height; i < height + border_vert; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ width + 2 * border_horz);
+ }
+}
+
+// 16-bit counterpart of extend_frame_lowbd(): pads a plane of uint16_t pixels
+// in place by replicating its edge pixels. 'stride', 'width' and the border
+// sizes are all counted in pixels, not bytes.
+static void extend_frame_highbd(uint16_t *data, int width, int height,
+ int stride, int border_horz, int border_vert) {
+ uint16_t *data_p;
+ int i, j;
+ // Horizontal pass: repeat the first and the last pixel of each row.
+ for (i = 0; i < height; ++i) {
+ data_p = data + i * stride;
+ for (j = -border_horz; j < 0; ++j) data_p[j] = data_p[0];
+ for (j = width; j < width + border_horz; ++j) data_p[j] = data_p[width - 1];
+ }
+ // Vertical pass: copy the first / last row, side borders included, into the
+ // rows above / below the area.
+ data_p = data - border_horz;
+ for (i = -border_vert; i < 0; ++i) {
+ memcpy(data_p + i * stride, data_p,
+ (width + 2 * border_horz) * sizeof(uint16_t));
+ }
+ for (i = height; i < height + border_vert; ++i) {
+ memcpy(data_p + i * stride, data_p + (height - 1) * stride,
+ (width + 2 * border_horz) * sizeof(uint16_t));
+ }
+}
+
+// Pads a plane in place by edge replication, dispatching on bit depth.
+// When 'highbd' is non-zero, 'data' is converted with CONVERT_TO_SHORTPTR()
+// and handled by extend_frame_highbd(); otherwise extend_frame_lowbd() is
+// used on 'data' directly.
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd) {
+ if (highbd)
+ extend_frame_highbd(CONVERT_TO_SHORTPTR(data), width, height, stride,
+ border_horz, border_vert);
+ else
+ extend_frame_lowbd(data, width, height, stride, border_horz, border_vert);
+}
+
+// Copies a width x height block of 8-bit pixels from 'src' to 'dst', one row
+// at a time. The two buffers may have different strides; each row is copied
+// with memcpy(), so source and destination rows must not overlap.
+static void copy_tile_lowbd(int width, int height, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride) {
+ for (int i = 0; i < height; ++i)
+ memcpy(dst + i * dst_stride, src + i * src_stride, width);
+}
+
+// Copies a width x height block of 16-bit pixels from 'src' to 'dst', one row
+// at a time. Width and strides are counted in pixels; each row is copied with
+// memcpy(), so source and destination rows must not overlap.
+static void copy_tile_highbd(int width, int height, const uint16_t *src,
+ int src_stride, uint16_t *dst, int dst_stride) {
+ for (int i = 0; i < height; ++i)
+ memcpy(dst + i * dst_stride, src + i * src_stride, width * sizeof(*dst));
+}
+
+// Copies a width x height block of pixels, dispatching on bit depth.
+// When 'highbd' is non-zero both pointers are converted with
+// CONVERT_TO_SHORTPTR() and copy_tile_highbd() is used; otherwise
+// copy_tile_lowbd() is used on the pointers as given.
+static void copy_tile(int width, int height, const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int highbd) {
+ if (highbd)
+ copy_tile_highbd(width, height, CONVERT_TO_SHORTPTR(src), src_stride,
+ CONVERT_TO_SHORTPTR(dst), dst_stride);
+ else
+ copy_tile_lowbd(width, height, src, src_stride, dst, dst_stride);
+}
+
+// Yields a uint8_t pointer suitable for memcpy(): for high bit depth (hbd
+// non-zero) 'd' is passed through CONVERT_TO_SHORTPTR() and cast back to
+// uint8_t *, otherwise 'd' is used unchanged.
+#define REAL_PTR(hbd, d) ((hbd) ? (uint8_t *)CONVERT_TO_SHORTPTR(d) : (d))
+
+// With striped loop restoration, the filtering for each 64-pixel stripe gets
+// most of its input from the output of CDEF (stored in data8), but we need to
+// fill out a border of 3 pixels above/below the stripe according to the
+// following
+// rules:
+//
+// * At a frame boundary, we copy the outermost row of CDEF pixels three times.
+// This extension is done by a call to extend_frame() at the start of the loop
+// restoration process, so the value of copy_above/copy_below doesn't strictly
+// matter.
+// However, by setting *copy_above = *copy_below = 1 whenever loop filtering
+// across tiles is disabled, we can allow
+// {setup,restore}_processing_stripe_boundary to assume that the top/bottom
+// data has always been copied, simplifying the behaviour at the left and
+// right edges of tiles.
+//
+// * If we're at a tile boundary and loop filtering across tiles is enabled,
+// then there is a logical stripe which is 64 pixels high, but which is split
+// into an 8px high and a 56px high stripe so that the processing (and
+// coefficient set usage) can be aligned to tiles.
+// In this case, we use the 3 rows of CDEF output across the boundary for
+// context; this corresponds to leaving the frame buffer as-is.
+//
+// * If we're at a tile boundary and loop filtering across tiles is disabled,
+// then we take the outermost row of CDEF pixels *within the current tile*
+// and copy it three times. Thus we behave exactly as if the tile were a full
+// frame.
+//
+// * Otherwise, we're at a stripe boundary within a tile. In that case, we
+// take 2 rows of deblocked pixels and extend them to 3 rows of context.
+//
+// The distinction between the latter two cases is handled by the
+// av1_loop_restoration_save_boundary_lines() function, so here we just need
+// to decide if we're overwriting the above/below boundary pixels or not.
+//
+// What this function computes: *copy_above and *copy_below both start as 1.
+// *copy_above is cleared when the stripe starts at tile_rect->top, and
+// *copy_below is cleared when the stripe reaches tile_rect->bottom.
+// ss_y is the vertical subsampling shift applied to the stripe height and to
+// the restoration unit offset.
+static void get_stripe_boundary_info(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int ss_y,
+ int *copy_above, int *copy_below) {
+ *copy_above = 1;
+ *copy_below = 1;
+
+ const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ // The first stripe of a tile is shorter than the others by runit_offset.
+ const int first_stripe_in_tile = (limits->v_start == tile_rect->top);
+ const int this_stripe_height =
+ full_stripe_height - (first_stripe_in_tile ? runit_offset : 0);
+ const int last_stripe_in_tile =
+ (limits->v_start + this_stripe_height >= tile_rect->bottom);
+
+ if (first_stripe_in_tile) *copy_above = 0;
+ if (last_stripe_in_tile) *copy_below = 0;
+}
+
+// Overwrite the border pixels around a processing stripe so that the conditions
+// listed above get_stripe_boundary_info() are preserved.
+// We save the pixels which get overwritten into a temporary buffer, so that
+// they can be restored by restore_processing_stripe_boundary() after we've
+// processed the stripe.
+//
+// limits gives the rectangular limits of the remaining stripes for the current
+// restoration unit. rsb is the stored stripe boundaries (taken from either
+// deblock or CDEF output as necessary).
+//
+// rsb_row is the index of the first boundary row to read from the rsb
+// buffers for this stripe. use_highbd is 0 or 1 and is used as a shift to
+// turn pixel counts into byte counts. h is the height of the stripe,
+// starting at limits->v_start. data8 / data_stride describe the frame plane
+// being modified. rlbs receives the saved lines. copy_above / copy_below
+// select which of the two borders are touched at all.
+//
+// opt selects between two behaviours:
+// * opt == 0: all RESTORATION_BORDER lines on a side are saved and replaced
+//   with data from rsb.
+// * opt != 0: only the outermost line on a side is saved, and it is replaced
+//   with a copy of the neighbouring frame line one row closer to the stripe;
+//   rsb data is not read.
+static void setup_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
+ int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
+ RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
+ // Offsets within the line buffers. The buffer logically starts at column
+ // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
+ // has column x0 in the buffer.
+ const int buf_stride = rsb->stripe_boundary_stride;
+ const int buf_x0_off = limits->h_start;
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ // Bytes per line: the pixel count is doubled for high bit depth.
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ // Replace RESTORATION_BORDER pixels above the top of the stripe
+ // We expand RESTORATION_CTX_VERT=2 lines from rsb->stripe_boundary_above
+ // to fill RESTORATION_BORDER=3 lines of above pixels. This is done by
+ // duplicating the topmost of the 2 lines (see the AOMMAX call when
+ // calculating buf_row, which adds 0, 0, 1 to rsb_row for i = -3, -2, -1).
+ //
+ // Special case: If we're at the top of a tile, which isn't on the topmost
+ // tile row, and we're allowed to loop filter across tiles, then we have a
+ // logical 64-pixel-high stripe which has been split into an 8-pixel high
+ // stripe and a 56-pixel high stripe (the current one). So, in this case,
+ // we want to leave the boundary alone!
+ if (!opt) {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *buf =
+ rsb->stripe_boundary_above + (buf_off << use_highbd);
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_above
+ memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
+ REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+ }
+ }
+
+ // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
+ // The second buffer row is repeated, so buf_row is rsb_row plus 0, 1, 1
+ // for i = 0, 1, 2.
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
+ const int buf_off = buf_x0_off + buf_row * buf_stride;
+ const uint8_t *src =
+ rsb->stripe_boundary_below + (buf_off << use_highbd);
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ // Save old pixels, then replace with data from stripe_boundary_below
+ memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+ }
+ }
+ } else {
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only save and overwrite i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ // Save old pixels, then replace them with a copy of the frame line
+ // directly below (row -RESTORATION_BORDER + 1 relative to the stripe top).
+ memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd,
+ data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+ line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_end = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+ // Only save and overwrite i=2 line.
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ // Save old pixels, then replace them with a copy of the frame line
+ // directly above (row 1 relative to the stripe bottom).
+ memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
+ memcpy(REAL_PTR(use_highbd, dst8),
+ REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
+ }
+ }
+}
+
+// This function restores the boundary lines modified by
+// setup_processing_stripe_boundary.
+//
+// Note: We need to be careful when handling the corners of the processing
+// unit, because (eg.) the top-left corner is considered to be part of
+// both the left and top borders. This means that, depending on the
+// loop_filter_across_tiles_enabled flag, the corner pixels might get
+// overwritten twice, once as part of the "top" border and once as part
+// of the "left" border (or similar for other corners).
+//
+// Everything works out fine as long as we make sure to reverse the order
+// when restoring, ie. we need to restore the left/right borders followed
+// by the top/bottom borders.
+//
+// The arguments must match those given to the corresponding setup call:
+// the same limits, h, copy_above, copy_below and opt decide which lines of
+// rlbs are written back, and to where. Lines below the stripe are only
+// written back while their row is less than
+// limits->v_end + RESTORATION_BORDER.
+static void restore_processing_stripe_boundary(
+ const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
+ int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
+ int copy_below, int opt) {
+ const int line_width =
+ (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
+ // Bytes per line: the pixel count is doubled for high bit depth.
+ const int line_size = line_width << use_highbd;
+
+ const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
+
+ if (!opt) {
+ // All RESTORATION_BORDER lines on each side were saved; put them back.
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+ for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+ uint8_t *dst8 = data8_tl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8),
+ rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+ }
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ for (int i = 0; i < RESTORATION_BORDER; ++i) {
+ // Stop once the row lies RESTORATION_BORDER or more rows past v_end.
+ if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+
+ uint8_t *dst8 = data8_bl + i * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+ }
+ }
+ } else {
+ // Only the outermost line on each side was saved; put that one back.
+ if (copy_above) {
+ uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+ // Only restore i=-RESTORATION_BORDER line.
+ uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+ }
+
+ if (copy_below) {
+ const int stripe_bottom = limits->v_start + h;
+ uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+ // Only restore i=2 line.
+ if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+ uint8_t *dst8 = data8_bl + 2 * data_stride;
+ memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+ }
+ }
+ }
+}
+
+// Applies the Wiener filter stored in rui->wiener_info to one stripe of an
+// 8-bit plane, reading from 'src' and writing to 'dst'.
+//
+// The stripe is processed in column chunks of procunit_width pixels by
+// calling av1_wiener_convolve_add_src() on each chunk over the full
+// stripe_height. 'tmpbuf' is unused. 'bit_depth' is only checked by the
+// assert, which requires it to be 8.
+static void wiener_filter_stripe(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ (void)tmpbuf;
+ (void)bit_depth;
+ assert(bit_depth == 8);
+ const ConvolveParams conv_params = get_conv_params_wiener(8);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ // Width of this chunk: the remaining width rounded up to a multiple of 16,
+ // capped at procunit_width. The last chunk may therefore extend past
+ // stripe_width.
+ int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+ const uint8_t *src_p = src + j;
+ uint8_t *dst_p = dst + j;
+ // NOTE(review): the literal 16 passed after each filter is presumably the
+ // filter step; confirm against av1_wiener_convolve_add_src().
+ av1_wiener_convolve_add_src(
+ src_p, src_stride, dst_p, dst_stride, rui->wiener_info.hfilter, 16,
+ rui->wiener_info.vfilter, 16, w, stripe_height, &conv_params);
+ }
+}
+
+/* Calculate windowed sums (if sqr=0) or sums of squares (if sqr=1)
+ over the input. The window is of size (2r + 1)x(2r + 1), and we
+ specialize to r = 1 (boxsum1) and r = 2 (boxsum2); boxsum() asserts on
+ any other value of r.
+
+ Each loop follows the same format: We keep a window's worth of input
+ in individual variables and select data out of that as appropriate.
+
+ The computation is done in two passes: a vertical pass from src into dst,
+ then a horizontal pass in place on dst. At the edges only the terms that
+ lie inside the width x height area are summed.
+*/
+// boxsum1: r = 1, i.e. a 3x3 window.
+static void boxsum1(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 3-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+
+ // First row: only two terms are available.
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 1) * src_stride + j]
+ // b = src[(i ) * src_stride + j]
+ // c = src[(i + 1) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j];
+ }
+ // Last two rows: no further input is read.
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ } else {
+ // Same as above, but every input value is squared before summing.
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+
+ dst[j] = a + b;
+ for (i = 1; i < height - 2; ++i) {
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = src[(i + 2) * src_stride + j] * src[(i + 2) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[(i + 1) * dst_stride + j] = b + c;
+ }
+ }
+
+ // Horizontal sum over 3-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+
+ dst[i * dst_stride] = a + b;
+ for (j = 1; j < width - 2; ++j) {
+ // Loop invariant: At the start of each iteration, a, b and c hold the
+ // vertical sums that were in
+ // a = dst[i * dst_stride + (j - 1)]
+ // b = dst[i * dst_stride + (j )]
+ // c = dst[i * dst_stride + (j + 1)]
+ // before this pass overwrote them.
+ dst[i * dst_stride + j] = a + b + c;
+ a = b;
+ b = c;
+ c = dst[i * dst_stride + (j + 2)];
+ }
+ dst[i * dst_stride + j] = a + b + c;
+ dst[i * dst_stride + (j + 1)] = b + c;
+ }
+}
+
+static void boxsum2(int32_t *src, int width, int height, int src_stride,
+ int sqr, int32_t *dst, int dst_stride) {
+ int i, j, a, b, c, d, e;
+ assert(width > 2 * SGRPROJ_BORDER_HORZ);
+ assert(height > 2 * SGRPROJ_BORDER_VERT);
+
+ // Vertical sum over 5-pixel regions, from src into dst.
+ if (!sqr) {
+ for (j = 0; j < width; ++j) {
+ a = src[j];
+ b = src[src_stride + j];
+ c = src[2 * src_stride + j];
+ d = src[3 * src_stride + j];
+ e = src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ // Loop invariant: At the start of each iteration,
+ // a = src[(i - 2) * src_stride + j]
+ // b = src[(i - 1) * src_stride + j]
+ // c = src[(i ) * src_stride + j]
+ // d = src[(i + 1) * src_stride + j]
+ // e = src[(i + 2) * src_stride + j]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ } else {
+ for (j = 0; j < width; ++j) {
+ a = src[j] * src[j];
+ b = src[src_stride + j] * src[src_stride + j];
+ c = src[2 * src_stride + j] * src[2 * src_stride + j];
+ d = src[3 * src_stride + j] * src[3 * src_stride + j];
+ e = src[4 * src_stride + j] * src[4 * src_stride + j];
+
+ dst[j] = a + b + c;
+ dst[dst_stride + j] = a + b + c + d;
+ for (i = 2; i < height - 3; ++i) {
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = src[(i + 3) * src_stride + j] * src[(i + 3) * src_stride + j];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[(i + 1) * dst_stride + j] = b + c + d + e;
+ dst[(i + 2) * dst_stride + j] = c + d + e;
+ }
+ }
+
+ // Horizontal sum over 5-pixel regions of dst
+ for (i = 0; i < height; ++i) {
+ a = dst[i * dst_stride];
+ b = dst[i * dst_stride + 1];
+ c = dst[i * dst_stride + 2];
+ d = dst[i * dst_stride + 3];
+ e = dst[i * dst_stride + 4];
+
+ dst[i * dst_stride] = a + b + c;
+ dst[i * dst_stride + 1] = a + b + c + d;
+ for (j = 2; j < width - 3; ++j) {
+ // Loop invariant at iteration start (dst as left by the vertical pass):
+ // a = dst[i * dst_stride + (j - 2)]
+ // b = dst[i * dst_stride + (j - 1)]
+ // c = dst[i * dst_stride + (j )]
+ // d = dst[i * dst_stride + (j + 1)]
+ // e = dst[i * dst_stride + (j + 2)]
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ a = b;
+ b = c;
+ c = d;
+ d = e;
+ e = dst[i * dst_stride + (j + 3)];
+ }
+ dst[i * dst_stride + j] = a + b + c + d + e;
+ dst[i * dst_stride + (j + 1)] = b + c + d + e;
+ dst[i * dst_stride + (j + 2)] = c + d + e;
+ }
+}
+
+static void boxsum(int32_t *src, int width, int height, int src_stride, int r,
+ int sqr, int32_t *dst, int dst_stride) {
+ if (r == 1)
+ boxsum1(src, width, height, src_stride, sqr, dst, dst_stride);
+ else if (r == 2)
+ boxsum2(src, width, height, src_stride, sqr, dst, dst_stride);
+ else
+ assert(0 && "Invalid value of r in self-guided filter");
+}
+
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params) {
+ if (params->r[0] == 0) {
+ xq[0] = 0;
+ xq[1] = (1 << SGRPROJ_PRJ_BITS) - xqd[1];
+ } else if (params->r[1] == 0) {
+ xq[0] = xqd[0];
+ xq[1] = 0;
+ } else {
+ xq[0] = xqd[0];
+ xq[1] = (1 << SGRPROJ_PRJ_BITS) - xq[0] - xqd[1];
+ }
+}
+
+const int32_t x_by_xplus1[256] = {
+ // Special case: Map 0 -> 1 (corresponding to a value of 1/256)
+ // instead of 0. See comments in selfguided_restoration_internal() for why
+ 1, 128, 171, 192, 205, 213, 219, 224, 228, 230, 233, 235, 236, 238, 239,
+ 240, 241, 242, 243, 243, 244, 244, 245, 245, 246, 246, 247, 247, 247, 247,
+ 248, 248, 248, 248, 249, 249, 249, 249, 249, 250, 250, 250, 250, 250, 250,
+ 250, 251, 251, 251, 251, 251, 251, 251, 251, 251, 251, 252, 252, 252, 252,
+ 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253,
+ 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 253, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254, 254,
+ 254, 254, 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 256,
+};
+
+const int32_t one_by_x[MAX_NELEM] = {
+ 4096, 2048, 1365, 1024, 819, 683, 585, 512, 455, 410, 372, 341, 315,
+ 293, 273, 256, 241, 228, 216, 205, 195, 186, 178, 171, 164,
+};
+
+static void calculate_intermediate_result(int32_t *dgd, int width, int height,
+ int dgd_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx,
+ int pass, int32_t *A, int32_t *B) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ const int step = pass == 0 ? 1 : 2;
+ int i, j;
+
+ assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
+ assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
+ "Need SGRPROJ_BORDER_* >= r+1");
+
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 0, B, buf_stride);
+ boxsum(dgd - dgd_stride * SGRPROJ_BORDER_VERT - SGRPROJ_BORDER_HORZ,
+ width_ext, height_ext, dgd_stride, r, 1, A, buf_stride);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ // Calculate the eventual A[] and B[] arrays. Include a 1-pixel border - ie,
+ // for a 64x64 processing unit, we calculate 66x66 pixels of A[] and B[].
+ for (i = -1; i < height + 1; i += step) {
+ for (j = -1; j < width + 1; ++j) {
+ const int k = i * buf_stride + j;
+ const int n = (2 * r + 1) * (2 * r + 1);
+
+ // a < 2^16 * n < 2^22 regardless of bit depth
+ uint32_t a = ROUND_POWER_OF_TWO(A[k], 2 * (bit_depth - 8));
+ // b < 2^8 * n < 2^14 regardless of bit depth
+ uint32_t b = ROUND_POWER_OF_TWO(B[k], bit_depth - 8);
+
+ // Each term in calculating p = a * n - b * b is < 2^16 * n^2 < 2^28,
+ // and p itself satisfies p < 2^14 * n^2 < 2^26.
+ // This bound on p is due to:
+ // https://en.wikipedia.org/wiki/Popoviciu's_inequality_on_variances
+ //
+ // Note: Sometimes, in high bit depth, we can end up with a*n < b*b.
+ // This is an artefact of rounding, and can only happen if all pixels
+ // are (almost) identical, so in this case we saturate to p=0.
+ uint32_t p = (a * n < b * b) ? 0 : a * n - b * b;
+
+ const uint32_t s = params->s[radius_idx];
+
+ // p * s < (2^14 * n^2) * round(2^20 / n^2 eps) < 2^34 / eps < 2^32
+ // as long as eps >= 4. So p * s fits into a uint32_t, and z < 2^12
+ // (this holds even after accounting for the rounding in s)
+ const uint32_t z = ROUND_POWER_OF_TWO(p * s, SGRPROJ_MTABLE_BITS);
+
+ // Note: We have to be quite careful about the value of A[k].
+ // This is used as a blend factor between individual pixel values and the
+ // local mean. So it logically has a range of [0, 256], including both
+ // endpoints.
+ //
+ // This is a pain for hardware, as we'd like something which can be stored
+ // in exactly 8 bits.
+ // Further, in the calculation of B[k] below, if z == 0 and r == 2,
+ // then A[k] "should be" 0. But then we can end up setting B[k] to a value
+ // slightly above 2^(8 + bit depth), due to rounding in the value of
+ // one_by_x[25-1].
+ //
+ // Thus we saturate so that, when z == 0, A[k] is set to 1 instead of 0.
+ // This fixes the above issues (256 - A[k] fits in a uint8, and we can't
+ // overflow), without significantly affecting the final result: z == 0
+ // implies that the image is essentially "flat", so the local mean and
+ // individual pixel values are very similar.
+ //
+ // Note that saturating on the other side, ie. requiring A[k] <= 255,
+ // would be a bad idea, as that corresponds to the case where the image
+ // is very variable, when we want to preserve the local pixel value as
+ // much as possible.
+ A[k] = x_by_xplus1[AOMMIN(z, 255)]; // in range [1, 256]
+
+ // SGRPROJ_SGR - A[k] < 2^8 (from above), B[k] < 2^(bit_depth) * n,
+ // one_by_x[n - 1] = round(2^12 / n)
+ // => the product here is < 2^(20 + bit_depth) <= 2^32,
+ // and B[k] is set to a value < 2^(8 + bit depth)
+ // This holds even with the rounding in one_by_x and in the overall
+ // result, as long as SGRPROJ_SGR - A[k] is strictly less than 2^8.
+ B[k] = (int32_t)ROUND_POWER_OF_TWO((uint32_t)(SGRPROJ_SGR - A[k]) *
+ (uint32_t)B[k] *
+ (uint32_t)one_by_x[n - 1],
+ SGRPROJ_RECIP_BITS);
+ }
+ }
+}
+
+static void selfguided_restoration_fast_internal(
+ int32_t *dgd, int width, int height, int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth, int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 1, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ (void)r;
+ assert(r == 2);
+ for (i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a = (A[k - buf_stride] + A[k + buf_stride]) * 6 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 5;
+ const int32_t b = (B[k - buf_stride] + B[k + buf_stride]) * 6 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ } else { // odd row
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 4;
+ const int32_t a = A[k] * 6 + (A[k - 1] + A[k + 1]) * 5;
+ const int32_t b = B[k] * 6 + (B[k - 1] + B[k + 1]) * 5;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] =
+ ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+ }
+}
+
+static void selfguided_restoration_internal(int32_t *dgd, int width, int height,
+ int dgd_stride, int32_t *dst,
+ int dst_stride, int bit_depth,
+ int sgr_params_idx,
+ int radius_idx) {
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 16 bytes, for consistency
+ // with the SIMD version of this function.
+ int buf_stride = ((width_ext + 3) & ~3) + 16;
+ int32_t A_[RESTORATION_PROC_UNIT_PELS];
+ int32_t B_[RESTORATION_PROC_UNIT_PELS];
+ int32_t *A = A_;
+ int32_t *B = B_;
+ int i, j;
+ calculate_intermediate_result(dgd, width, height, dgd_stride, bit_depth,
+ sgr_params_idx, radius_idx, 0, A, B);
+ A += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+ B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
+
+ // Use the A[] and B[] arrays to calculate the filtered image
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int k = i * buf_stride + j;
+ const int l = i * dgd_stride + j;
+ const int m = i * dst_stride + j;
+ const int nb = 5;
+ const int32_t a =
+ (A[k] + A[k - 1] + A[k + 1] + A[k - buf_stride] + A[k + buf_stride]) *
+ 4 +
+ (A[k - 1 - buf_stride] + A[k - 1 + buf_stride] +
+ A[k + 1 - buf_stride] + A[k + 1 + buf_stride]) *
+ 3;
+ const int32_t b =
+ (B[k] + B[k - 1] + B[k + 1] + B[k - buf_stride] + B[k + buf_stride]) *
+ 4 +
+ (B[k - 1 - buf_stride] + B[k - 1 + buf_stride] +
+ B[k + 1 - buf_stride] + B[k + 1 + buf_stride]) *
+ 3;
+ const int32_t v = a * dgd[l] + b;
+ dst[m] = ROUND_POWER_OF_TWO(v, SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ }
+ }
+}
+
+int av1_selfguided_restoration_c(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0, int32_t *flt1,
+ int flt_stride, int sgr_params_idx,
+ int bit_depth, int highbd) {
+ int32_t dgd32_[RESTORATION_PROC_UNIT_PELS];
+ const int dgd32_stride = width + 2 * SGRPROJ_BORDER_HORZ;
+ int32_t *dgd32 =
+ dgd32_ + dgd32_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
+
+ if (highbd) {
+ const uint16_t *dgd16 = CONVERT_TO_SHORTPTR(dgd8);
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd16[i * dgd_stride + j];
+ }
+ }
+ } else {
+ for (int i = -SGRPROJ_BORDER_VERT; i < height + SGRPROJ_BORDER_VERT; ++i) {
+ for (int j = -SGRPROJ_BORDER_HORZ; j < width + SGRPROJ_BORDER_HORZ; ++j) {
+ dgd32[i * dgd32_stride + j] = dgd8[i * dgd_stride + j];
+ }
+ }
+ }
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // If params->r[i] == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+
+ if (params->r[0] > 0)
+ selfguided_restoration_fast_internal(dgd32, width, height, dgd32_stride,
+ flt0, flt_stride, bit_depth,
+ sgr_params_idx, 0);
+ if (params->r[1] > 0)
+ selfguided_restoration_internal(dgd32, width, height, dgd32_stride, flt1,
+ flt_stride, bit_depth, sgr_params_idx, 1);
+ return 0;
+}
+
+void apply_selfguided_restoration_c(const uint8_t *dat8, int width, int height,
+ int stride, int eps, const int *xqd,
+ uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth,
+ int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+
+ const int ret = av1_selfguided_restoration_c(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ (void)ret;
+ assert(!ret);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; ++j) {
+ const int k = i * width + j;
+ uint8_t *dst8ij = dst8 + i * dst_stride + j;
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+
+ const uint16_t pre_u = highbd ? *CONVERT_TO_SHORTPTR(dat8ij) : *dat8ij;
+ const int32_t u = (int32_t)pre_u << SGRPROJ_RST_BITS;
+ int32_t v = u << SGRPROJ_PRJ_BITS;
+ // If params->r[i] == 0 then filter i was skipped in
+ // av1_selfguided_restoration_c; its term is omitted, as if flt<i>[k] == u
+ if (params->r[0] > 0) v += xq[0] * (flt0[k] - u);
+ if (params->r[1] > 0) v += xq[1] * (flt1[k] - u);
+ const int16_t w =
+ (int16_t)ROUND_POWER_OF_TWO(v, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ const uint16_t out = clip_pixel_highbd(w, bit_depth);
+ if (highbd)
+ *CONVERT_TO_SHORTPTR(dst8ij) = out;
+ else
+ *dst8ij = (uint8_t)out;
+ }
+ }
+}
+
+static void sgrproj_filter_stripe(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ (void)bit_depth;
+ assert(bit_depth == 8);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);
+ apply_selfguided_restoration(src + j, w, stripe_height, src_stride,
+ rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+ dst + j, dst_stride, tmpbuf, bit_depth, 0);
+ }
+}
+
+static void wiener_filter_stripe_highbd(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src8,
+ int src_stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth) {
+ (void)tmpbuf;
+ const ConvolveParams conv_params = get_conv_params_wiener(bit_depth);
+
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
+ const uint8_t *src8_p = src8 + j;
+ uint8_t *dst8_p = dst8 + j;
+ av1_highbd_wiener_convolve_add_src(src8_p, src_stride, dst8_p, dst_stride,
+ rui->wiener_info.hfilter, 16,
+ rui->wiener_info.vfilter, 16, w,
+ stripe_height, &conv_params, bit_depth);
+ }
+}
+
+static void sgrproj_filter_stripe_highbd(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width,
+ const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride,
+ int32_t *tmpbuf, int bit_depth) {
+ for (int j = 0; j < stripe_width; j += procunit_width) {
+ int w = AOMMIN(procunit_width, stripe_width - j);
+ apply_selfguided_restoration(src8 + j, w, stripe_height, src_stride,
+ rui->sgrproj_info.ep, rui->sgrproj_info.xqd,
+ dst8 + j, dst_stride, tmpbuf, bit_depth, 1);
+ }
+}
+
+typedef void (*stripe_filter_fun)(const RestorationUnitInfo *rui,
+ int stripe_width, int stripe_height,
+ int procunit_width, const uint8_t *src,
+ int src_stride, uint8_t *dst, int dst_stride,
+ int32_t *tmpbuf, int bit_depth);
+
+#define NUM_STRIPE_FILTERS 4
+
+static const stripe_filter_fun stripe_filters[NUM_STRIPE_FILTERS] = {
+ wiener_filter_stripe, sgrproj_filter_stripe, wiener_filter_stripe_highbd,
+ sgrproj_filter_stripe_highbd
+};
+
+// Filter one restoration unit
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int optimized_lr) {
+ RestorationType unit_rtype = rui->restoration_type;
+
+ int unit_h = limits->v_end - limits->v_start;
+ int unit_w = limits->h_end - limits->h_start;
+ uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start;
+ uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start;
+
+ if (unit_rtype == RESTORE_NONE) {
+ copy_tile(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, highbd);
+ return;
+ }
+
+ const int filter_idx = 2 * highbd + (unit_rtype == RESTORE_SGRPROJ);
+ assert(filter_idx < NUM_STRIPE_FILTERS);
+ const stripe_filter_fun stripe_filter = stripe_filters[filter_idx];
+
+ const int procunit_width = RESTORATION_PROC_UNIT_SIZE >> ss_x;
+
+ // Convolve the whole tile one stripe at a time
+ RestorationTileLimits remaining_stripes = *limits;
+ int i = 0;
+ while (i < unit_h) {
+ int copy_above, copy_below;
+ remaining_stripes.v_start = limits->v_start + i;
+
+ get_stripe_boundary_info(&remaining_stripes, tile_rect, ss_y, &copy_above,
+ &copy_below);
+
+ const int full_stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+ const int runit_offset = RESTORATION_UNIT_OFFSET >> ss_y;
+
+ // Work out where this stripe's boundaries are within
+ // rsb->stripe_boundary_{above,below}
+ const int tile_stripe =
+ (remaining_stripes.v_start - tile_rect->top + runit_offset) /
+ full_stripe_height;
+ const int frame_stripe = tile_stripe0 + tile_stripe;
+ const int rsb_row = RESTORATION_CTX_VERT * frame_stripe;
+
+ // Calculate this stripe's height, based on two rules:
+ // * The topmost stripe in each tile is 8 luma pixels shorter than usual.
+ // * We can't extend past the end of the current restoration unit
+ const int nominal_stripe_height =
+ full_stripe_height - ((tile_stripe == 0) ? runit_offset : 0);
+ const int h = AOMMIN(nominal_stripe_height,
+ remaining_stripes.v_end - remaining_stripes.v_start);
+
+ setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
+ h, data8, stride, rlbs, copy_above,
+ copy_below, optimized_lr);
+
+ stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
+ dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
+
+ restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
+ data8, stride, copy_above, copy_below,
+ optimized_lr);
+
+ i += h;
+ }
+}
+
+static void filter_frame_on_tile(int tile_row, int tile_col, void *priv,
+ AV1_COMMON *cm) {
+ (void)tile_col;
+ FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+ ctxt->tile_stripe0 = (tile_row == 0) ? 0 : cm->rst_end_stripe[tile_row - 1];
+}
+
+static void filter_frame_on_unit(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect,
+ int rest_unit_idx, void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ FilterFrameCtxt *ctxt = (FilterFrameCtxt *)priv;
+ const RestorationInfo *rsi = ctxt->rsi;
+
+ av1_loop_restoration_filter_unit(
+ limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, rlbs, tile_rect,
+ ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd, ctxt->bit_depth,
+ ctxt->data8, ctxt->data_stride, ctxt->dst8, ctxt->dst_stride, tmpbuf,
+ rsi->optimized_lr);
+}
+
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ int num_planes) {
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int bit_depth = seq_params->bit_depth;
+ const int highbd = seq_params->use_highbitdepth;
+ lr_ctxt->dst = &cm->rst_frame;
+
+ const int frame_width = frame->crop_widths[0];
+ const int frame_height = frame->crop_heights[0];
+ if (aom_realloc_frame_buffer(
+ lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, highbd, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL) < 0)
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate restoration dst buffer");
+
+ lr_ctxt->on_rest_unit = filter_frame_on_unit;
+ lr_ctxt->frame = frame;
+ for (int plane = 0; plane < num_planes; ++plane) {
+ RestorationInfo *rsi = &cm->rst_info[plane];
+ RestorationType rtype = rsi->frame_restoration_type;
+ rsi->optimized_lr = optimized_lr;
+
+ if (rtype == RESTORE_NONE) {
+ continue;
+ }
+
+ const int is_uv = plane > 0;
+ const int plane_width = frame->crop_widths[is_uv];
+ const int plane_height = frame->crop_heights[is_uv];
+ FilterFrameCtxt *lr_plane_ctxt = &lr_ctxt->ctxt[plane];
+
+ extend_frame(frame->buffers[plane], plane_width, plane_height,
+ frame->strides[is_uv], RESTORATION_BORDER, RESTORATION_BORDER,
+ highbd);
+
+ lr_plane_ctxt->rsi = rsi;
+ lr_plane_ctxt->ss_x = is_uv && seq_params->subsampling_x;
+ lr_plane_ctxt->ss_y = is_uv && seq_params->subsampling_y;
+ lr_plane_ctxt->highbd = highbd;
+ lr_plane_ctxt->bit_depth = bit_depth;
+ lr_plane_ctxt->data8 = frame->buffers[plane];
+ lr_plane_ctxt->dst8 = lr_ctxt->dst->buffers[plane];
+ lr_plane_ctxt->data_stride = frame->strides[is_uv];
+ lr_plane_ctxt->dst_stride = lr_ctxt->dst->strides[is_uv];
+ lr_plane_ctxt->tile_rect = av1_whole_frame_rect(cm, is_uv);
+ filter_frame_on_tile(LR_TILE_ROW, LR_TILE_COL, lr_plane_ctxt, cm);
+ }
+}
+
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ AV1_COMMON *cm, int num_planes) {
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ static const copy_fun copy_funs[3] = {
+ aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+ };
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ AV1PixelRect tile_rect = loop_rest_ctxt->ctxt[plane].tile_rect;
+ copy_funs[plane](loop_rest_ctxt->dst, loop_rest_ctxt->frame, tile_rect.left,
+ tile_rect.right, tile_rect.top, tile_rect.bottom);
+ }
+}
+
+static void foreach_rest_unit_in_planes(AV1LrStruct *lr_ctxt, AV1_COMMON *cm,
+ int num_planes) {
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+ for (int plane = 0; plane < num_planes; ++plane) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) {
+ continue;
+ }
+
+ av1_foreach_rest_unit_in_plane(cm, plane, lr_ctxt->on_rest_unit,
+ &ctxt[plane], &ctxt[plane].tile_rect,
+ cm->rst_tmpbuf, cm->rlbs);
+ }
+}
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ void *lr_ctxt) {
+ assert(!cm->all_lossless);
+ const int num_planes = av1_num_planes(cm);
+
+ AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+ av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+ optimized_lr, num_planes);
+
+ foreach_rest_unit_in_planes(loop_rest_ctxt, cm, num_planes);
+
+ av1_loop_restoration_copy_planes(loop_rest_ctxt, cm, num_planes);
+}
+
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync) {
+ const int tile_w = tile_rect->right - tile_rect->left;
+ const int ext_size = unit_size * 3 / 2;
+ int x0 = 0, j = 0;
+ while (x0 < tile_w) {
+ int remaining_w = tile_w - x0;
+ int w = (remaining_w < ext_size) ? remaining_w : unit_size;
+
+ limits->h_start = tile_rect->left + x0;
+ limits->h_end = tile_rect->left + x0 + w;
+ assert(limits->h_end <= tile_rect->right);
+
+ const int unit_idx = unit_idx0 + row_number * hunits_per_tile + j;
+
+ // No sync for even numbered rows
+ // For odd numbered rows, Loop Restoration of current block requires the LR
+ // of top-right and bottom-right blocks to be completed
+
+ // top-right sync
+ on_sync_read(lr_sync, row_number, j, plane);
+ if ((row_number + 1) < vunits_per_tile)
+ // bottom-right sync
+ on_sync_read(lr_sync, row_number + 2, j, plane);
+
+ on_rest_unit(limits, tile_rect, unit_idx, priv, tmpbuf, rlbs);
+
+ on_sync_write(lr_sync, row_number, j, hunits_per_tile, plane);
+
+ x0 += w;
+ ++j;
+ }
+}
+
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane) {
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+}
+
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane) {
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+}
+
+static void foreach_rest_unit_in_tile(
+ const AV1PixelRect *tile_rect, int tile_row, int tile_col, int tile_cols,
+ int hunits_per_tile, int vunits_per_tile, int units_per_tile, int unit_size,
+ int ss_y, int plane, rest_unit_visitor_t on_rest_unit, void *priv,
+ int32_t *tmpbuf, RestorationLineBuffers *rlbs) {
+ const int tile_h = tile_rect->bottom - tile_rect->top;
+ const int ext_size = unit_size * 3 / 2;
+
+ const int tile_idx = tile_col + tile_row * tile_cols;
+ const int unit_idx0 = tile_idx * units_per_tile;
+
+ int y0 = 0, i = 0;
+ while (y0 < tile_h) {
+ int remaining_h = tile_h - y0;
+ int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+ RestorationTileLimits limits;
+ limits.v_start = tile_rect->top + y0;
+ limits.v_end = tile_rect->top + y0 + h;
+ assert(limits.v_end <= tile_rect->bottom);
+ // Offset the tile upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(tile_rect->top, limits.v_start - voffset);
+ if (limits.v_end < tile_rect->bottom) limits.v_end -= voffset;
+
+ av1_foreach_rest_unit_in_row(
+ &limits, tile_rect, on_rest_unit, i, unit_size, unit_idx0,
+ hunits_per_tile, vunits_per_tile, plane, priv, tmpbuf, rlbs,
+ av1_lr_sync_read_dummy, av1_lr_sync_write_dummy, NULL);
+
+ y0 += h;
+ ++i;
+ }
+}
+
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, AV1PixelRect *tile_rect,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+
+ foreach_rest_unit_in_tile(tile_rect, LR_TILE_ROW, LR_TILE_COL, LR_TILE_COLS,
+ rsi->horz_units_per_tile, rsi->vert_units_per_tile,
+ rsi->units_per_tile, rsi->restoration_unit_size,
+ ss_y, plane, on_rest_unit, priv, tmpbuf, rlbs);
+}
+
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1) {
+ assert(rcol0 && rcol1 && rrow0 && rrow1);
+
+ if (bsize != cm->seq_params.sb_size) return 0;
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) return 0;
+
+ assert(!cm->all_lossless);
+
+ const int is_uv = plane > 0;
+
+ const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+ const int tile_w = tile_rect.right - tile_rect.left;
+ const int tile_h = tile_rect.bottom - tile_rect.top;
+
+ const int mi_top = 0;
+ const int mi_left = 0;
+
+ // Compute the mi-unit corners of the superblock relative to the top-left of
+ // the tile
+ const int mi_rel_row0 = mi_row - mi_top;
+ const int mi_rel_col0 = mi_col - mi_left;
+ const int mi_rel_row1 = mi_rel_row0 + mi_size_high[bsize];
+ const int mi_rel_col1 = mi_rel_col0 + mi_size_wide[bsize];
+
+ const RestorationInfo *rsi = &cm->rst_info[plane];
+ const int size = rsi->restoration_unit_size;
+
+ // Calculate the number of restoration units in this tile (which might be
+ // strictly less than rsi->horz_units_per_tile and rsi->vert_units_per_tile)
+ const int horz_units = av1_lr_count_units_in_tile(size, tile_w);
+ const int vert_units = av1_lr_count_units_in_tile(size, tile_h);
+
+ // The size of an MI-unit on this plane of the image
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+ const int mi_size_x = MI_SIZE >> ss_x;
+ const int mi_size_y = MI_SIZE >> ss_y;
+
+ // Write m for the relative mi column or row, D for the superres denominator
+ // and N for the superres numerator. If u is the upscaled pixel offset then
+ // we can write the downscaled pixel offset in two ways as:
+ //
+ // MI_SIZE * m = N / D u
+ //
+ // from which we get u = D * MI_SIZE * m / N
+ const int mi_to_num_x = av1_superres_scaled(cm)
+ ? mi_size_x * cm->superres_scale_denominator
+ : mi_size_x;
+ const int mi_to_num_y = mi_size_y;
+ const int denom_x = av1_superres_scaled(cm) ? size * SCALE_NUMERATOR : size;
+ const int denom_y = size;
+
+ const int rnd_x = denom_x - 1;
+ const int rnd_y = denom_y - 1;
+
+ // rcol0/rrow0 should be the first column/row of restoration units (relative
+ // to the top-left of the tile) that doesn't start left of/above
+ // mi_col/mi_row. For this calculation, we need to round up the division (if
+ // the sb starts at runit column 10.1, the first matching runit has column
+ // index 11)
+ *rcol0 = (mi_rel_col0 * mi_to_num_x + rnd_x) / denom_x;
+ *rrow0 = (mi_rel_row0 * mi_to_num_y + rnd_y) / denom_y;
+
+ // rcol1/rrow1 is the equivalent calculation, but for the superblock
+ // below-right. If we're at the bottom or right of the tile, this restoration
+ // unit might not exist, in which case we'll clamp accordingly.
+ *rcol1 = AOMMIN((mi_rel_col1 * mi_to_num_x + rnd_x) / denom_x, horz_units);
+ *rrow1 = AOMMIN((mi_rel_row1 * mi_to_num_y + rnd_y) / denom_y, vert_units);
+
+ return *rcol0 < *rcol1 && *rrow0 < *rrow1;
+}
+
+// Extend to left and right
+static void extend_lines(uint8_t *buf, int width, int height, int stride,
+ int extend, int use_highbitdepth) {
+ for (int i = 0; i < height; ++i) {
+ if (use_highbitdepth) {
+ uint16_t *buf16 = (uint16_t *)buf;
+ aom_memset16(buf16 - extend, buf16[0], extend);
+ aom_memset16(buf16 + width, buf16[width - 1], extend);
+ } else {
+ memset(buf - extend, buf[0], extend);
+ memset(buf + width, buf[width - 1], extend);
+ }
+ buf += stride;
+ }
+}
+
+static void save_deblock_boundary_lines(  // Save frame rows starting at 'row' as above/below context for 'stripe'
+ const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, int plane, int row,
+ int stripe, int use_highbd, int is_above,
+ RestorationStripeBoundaries *boundaries) {
+ const int is_uv = plane > 0;
+ const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+ const int src_stride = frame->strides[is_uv] << use_highbd;  // byte stride (shifted when use_highbd is set)
+ const uint8_t *src_rows = src_buf + row * src_stride;
+
+ uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above
+ : boundaries->stripe_boundary_below;
+ uint8_t *bdry_start = bdry_buf + (RESTORATION_EXTRA_HORZ << use_highbd);  // skip the left-hand padding
+ const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+ uint8_t *bdry_rows = bdry_start + RESTORATION_CTX_VERT * stripe * bdry_stride;  // this stripe's RESTORATION_CTX_VERT rows
+
+ // There is a rare case in which a processing stripe can end 1px above the
+ // crop border. In this case, we do want to use deblocked pixels from below
+ // the stripe (hence why we ended up in this function), but instead of
+ // fetching 2 "below" rows we need to fetch one and duplicate it.
+ // This is equivalent to clamping the sample locations against the crop border
+ const int lines_to_save =
+ AOMMIN(RESTORATION_CTX_VERT, frame->crop_heights[is_uv] - row);
+ assert(lines_to_save == 1 || lines_to_save == 2);
+
+ int upscaled_width;
+ int line_bytes;
+ if (av1_superres_scaled(cm)) {  // superres active: upscale the rows while saving them
+ const int ss_x = is_uv && cm->seq_params.subsampling_x;
+ upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
+ line_bytes = upscaled_width << use_highbd;
+ if (use_highbd)  // the upscaler is given CONVERT_TO_BYTEPTR pointers and the unshifted strides
+ av1_upscale_normative_rows(
+ cm, CONVERT_TO_BYTEPTR(src_rows), frame->strides[is_uv],
+ CONVERT_TO_BYTEPTR(bdry_rows), boundaries->stripe_boundary_stride,
+ plane, lines_to_save);
+ else
+ av1_upscale_normative_rows(cm, src_rows, frame->strides[is_uv], bdry_rows,
+ boundaries->stripe_boundary_stride, plane,
+ lines_to_save);
+ } else {  // no superres: plain row-by-row copy at the cropped width
+ upscaled_width = frame->crop_widths[is_uv];
+ line_bytes = upscaled_width << use_highbd;
+ for (int i = 0; i < lines_to_save; i++) {
+ memcpy(bdry_rows + i * bdry_stride, src_rows + i * src_stride,
+ line_bytes);
+ }
+ }
+ // If only one line was saved, duplicate it into the second context row
+ if (lines_to_save == 1)
+ memcpy(bdry_rows + bdry_stride, bdry_rows, line_bytes);
+
+ extend_lines(bdry_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+ RESTORATION_EXTRA_HORZ, use_highbd);  // replicate edge pixels into the left/right padding
+}
+
+// Fill both context rows of 'stripe' in the above (is_above != 0) or below
+// boundary buffer with copies of the single frame row 'row' of 'plane', then
+// pad each saved row on the left and right by RESTORATION_EXTRA_HORZ pixels.
+static void save_cdef_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                     const AV1_COMMON *cm, int plane, int row,
+                                     int stripe, int use_highbd, int is_above,
+                                     RestorationStripeBoundaries *boundaries) {
+  const int is_uv = plane > 0;
+
+  // Source: start of the requested row, addressed in bytes.
+  const int src_stride = frame->strides[is_uv] << use_highbd;
+  const uint8_t *frame_buf = REAL_PTR(use_highbd, frame->buffers[plane]);
+  const uint8_t *src_row = frame_buf + row * src_stride;
+
+  // Destination: this stripe's slot, just past the left-hand padding.
+  const int bdry_stride = boundaries->stripe_boundary_stride << use_highbd;
+  uint8_t *bdry_base = is_above ? boundaries->stripe_boundary_above
+                                : boundaries->stripe_boundary_below;
+  uint8_t *dst_rows = bdry_base + (RESTORATION_EXTRA_HORZ << use_highbd) +
+                      RESTORATION_CTX_VERT * stripe * bdry_stride;
+
+  // Superres has already been applied by the time this function is called,
+  // so the row is taken as-is from the upscaled frame; only the width to
+  // copy depends on whether superres is in use.
+  int upscaled_width = frame->crop_widths[is_uv];
+  if (av1_superres_scaled(cm)) {
+    const int ss_x = is_uv && cm->seq_params.subsampling_x;
+    upscaled_width = (cm->superres_upscaled_width + ss_x) >> ss_x;
+  }
+  const int line_bytes = upscaled_width << use_highbd;
+
+  // Every context row receives the same source row: the border is an
+  // extension of this tile's outermost CDEF row rather than deblocked
+  // pixels from the tile above/below.
+  uint8_t *dst = dst_rows;
+  for (int n = RESTORATION_CTX_VERT; n > 0; --n, dst += bdry_stride) {
+    memcpy(dst, src_row, line_bytes);
+  }
+
+  extend_lines(dst_rows, upscaled_width, RESTORATION_CTX_VERT, bdry_stride,
+               RESTORATION_EXTRA_HORZ, use_highbd);
+}
+
+// Walk the processing stripes of 'plane' from top to bottom and save the rows
+// that serve as context above and below each stripe.
+//
+// With after_cdef == 0, deblocked rows are saved for every boundary that has
+// a stripe above it (frame_stripe > 0) or rows below it (y1 < plane height).
+// With after_cdef != 0, exactly the remaining boundaries are filled, using
+// copies of the outermost row inside the stripe.
+static void save_tile_row_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+                                         int use_highbd, int plane,
+                                         AV1_COMMON *cm, int after_cdef) {
+  const int is_uv = plane > 0;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int stripe_height = RESTORATION_PROC_UNIT_SIZE >> ss_y;
+  const int stripe_off = RESTORATION_UNIT_OFFSET >> ss_y;
+  const int plane_height = ROUND_POWER_OF_TWO(cm->height, ss_y);
+
+  // The whole frame is treated as one tile here, so the stripe index within
+  // the tile is also the stripe index within the frame.
+  const AV1PixelRect tile_rect = av1_whole_frame_rect(cm, is_uv);
+  RestorationStripeBoundaries *boundaries = &cm->rst_info[plane].boundaries;
+
+  for (int frame_stripe = 0;; ++frame_stripe) {
+    // Stripes are shifted up by stripe_off, so the first one is shorter and
+    // its start is clamped to the top of the tile.
+    const int rel_end = (frame_stripe + 1) * stripe_height - stripe_off;
+    const int rel_start = AOMMAX(0, rel_end - stripe_height);
+
+    const int y0 = tile_rect.top + rel_start;
+    if (y0 >= tile_rect.bottom) break;
+    const int y1 = AOMMIN(tile_rect.top + rel_end, tile_rect.bottom);
+
+    // CDEF pixels are only used at the top and bottom of the frame as a
+    // whole; every other boundary uses deblocked pixels for context.
+    const int deblock_above = frame_stripe > 0;
+    const int deblock_below = y1 < plane_height;
+
+    if (after_cdef) {
+      // Fill exactly the boundaries that did not receive deblocked context,
+      // using the outermost row inside the stripe.
+      if (!deblock_above) {
+        save_cdef_boundary_lines(frame, cm, plane, y0, frame_stripe,
+                                 use_highbd, 1, boundaries);
+      }
+      if (!deblock_below) {
+        save_cdef_boundary_lines(frame, cm, plane, y1 - 1, frame_stripe,
+                                 use_highbd, 0, boundaries);
+      }
+      continue;
+    }
+
+    // Before CDEF: save deblocked context where needed.
+    if (deblock_above) {
+      save_deblock_boundary_lines(frame, cm, plane, y0 - RESTORATION_CTX_VERT,
+                                  frame_stripe, use_highbd, 1, boundaries);
+    }
+    if (deblock_below) {
+      save_deblock_boundary_lines(frame, cm, plane, y1, frame_stripe,
+                                  use_highbd, 0, boundaries);
+    }
+  }
+}
+
+// For every plane, save the context rows above and below each processing
+// stripe (RESTORATION_PROC_UNIT_SIZE >> ss_y rows high) for use as boundary
+// data by loop restoration. They are stored in cm->rst_info[p].boundaries.
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int after_cdef) {
+ const int num_planes = av1_num_planes(cm);
+ const int use_highbd = cm->seq_params.use_highbitdepth;
+ for (int p = 0; p < num_planes; ++p) {
+ save_tile_row_boundary_lines(frame, use_highbd, p, cm, after_cdef);
+ }
+}
diff --git a/third_party/aom/av1/common/restoration.h b/third_party/aom/av1/common/restoration.h
new file mode 100644
index 0000000000..d834f9270f
--- /dev/null
+++ b/third_party/aom/av1/common/restoration.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_RESTORATION_H_
+#define AOM_AV1_COMMON_RESTORATION_H_
+
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x)-0.5) : (int)((x) + 0.5))
+
+#define RESTORATION_PROC_UNIT_SIZE 64
+
+// Filter tile grid offset upwards compared to the superblock grid
+#define RESTORATION_UNIT_OFFSET 8
+
+#define SGRPROJ_BORDER_VERT 3 // Vertical border used for Sgr
+#define SGRPROJ_BORDER_HORZ 3 // Horizontal border used for Sgr
+
+#define WIENER_BORDER_VERT 2 // Vertical border used for Wiener
+#define WIENER_HALFWIN 3
+#define WIENER_BORDER_HORZ (WIENER_HALFWIN) // Horizontal border for Wiener
+
+// RESTORATION_BORDER_VERT determines line buffer requirement for LR.
+// Should be set at the max of SGRPROJ_BORDER_VERT and WIENER_BORDER_VERT.
+// Note the line buffer needed is twice the value of this macro.
+#if SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+#define RESTORATION_BORDER_VERT (SGRPROJ_BORDER_VERT)
+#else
+#define RESTORATION_BORDER_VERT (WIENER_BORDER_VERT)
+#endif // SGRPROJ_BORDER_VERT >= WIENER_BORDER_VERT
+
+#if SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+#define RESTORATION_BORDER_HORZ (SGRPROJ_BORDER_HORZ)
+#else
+#define RESTORATION_BORDER_HORZ (WIENER_BORDER_HORZ)
+#endif // SGRPROJ_BORDER_HORZ >= WIENER_BORDER_HORZ
+
+// How many border pixels do we need for each processing unit?
+#define RESTORATION_BORDER 3
+
+// How many rows of deblocked pixels do we save above/below each processing
+// stripe?
+#define RESTORATION_CTX_VERT 2
+
+// Additional pixels to the left and right in above/below buffers
+// It is RESTORATION_BORDER_HORZ rounded up to get nicer buffer alignment
+#define RESTORATION_EXTRA_HORZ 4
+
+// Pad with up to 20 more pixels (much less may actually be needed)
+#define RESTORATION_PADDING 20
+#define RESTORATION_PROC_UNIT_PELS \
+ ((RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_HORZ * 2 + \
+ RESTORATION_PADDING) * \
+ (RESTORATION_PROC_UNIT_SIZE + RESTORATION_BORDER_VERT * 2 + \
+ RESTORATION_PADDING))
+
+#define RESTORATION_UNITSIZE_MAX 256
+#define RESTORATION_UNITPELS_HORZ_MAX \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_HORZ + 16)
+#define RESTORATION_UNITPELS_VERT_MAX \
+ ((RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_BORDER_VERT + \
+ RESTORATION_UNIT_OFFSET))
+#define RESTORATION_UNITPELS_MAX \
+ (RESTORATION_UNITPELS_HORZ_MAX * RESTORATION_UNITPELS_VERT_MAX)
+
+// Two 32-bit buffers needed for the restored versions from two filters
+// TODO(debargha, rupert): Refactor to not need the large tilesize to be stored
+// on the decoder side.
+#define SGRPROJ_TMPBUF_SIZE (RESTORATION_UNITPELS_MAX * 2 * sizeof(int32_t))
+
+#define SGRPROJ_EXTBUF_SIZE (0)
+#define SGRPROJ_PARAMS_BITS 4
+#define SGRPROJ_PARAMS (1 << SGRPROJ_PARAMS_BITS)
+
+// Precision bits for projection
+#define SGRPROJ_PRJ_BITS 7
+// Restoration precision bits generated higher than source before projection
+#define SGRPROJ_RST_BITS 4
+// Internal precision bits for core selfguided_restoration
+#define SGRPROJ_SGR_BITS 8
+#define SGRPROJ_SGR (1 << SGRPROJ_SGR_BITS)
+
+#define SGRPROJ_PRJ_MIN0 (-(1 << SGRPROJ_PRJ_BITS) * 3 / 4)
+#define SGRPROJ_PRJ_MAX0 (SGRPROJ_PRJ_MIN0 + (1 << SGRPROJ_PRJ_BITS) - 1)
+#define SGRPROJ_PRJ_MIN1 (-(1 << SGRPROJ_PRJ_BITS) / 4)
+#define SGRPROJ_PRJ_MAX1 (SGRPROJ_PRJ_MIN1 + (1 << SGRPROJ_PRJ_BITS) - 1)
+
+#define SGRPROJ_PRJ_SUBEXP_K 4
+
+#define SGRPROJ_BITS (SGRPROJ_PRJ_BITS * 2 + SGRPROJ_PARAMS_BITS)
+
+#define MAX_RADIUS 2 // Only 1, 2, 3 allowed
+#define MAX_NELEM ((2 * MAX_RADIUS + 1) * (2 * MAX_RADIUS + 1))
+#define SGRPROJ_MTABLE_BITS 20
+#define SGRPROJ_RECIP_BITS 12
+
+#define WIENER_HALFWIN1 (WIENER_HALFWIN + 1)
+#define WIENER_WIN (2 * WIENER_HALFWIN + 1)
+#define WIENER_WIN2 ((WIENER_WIN) * (WIENER_WIN))
+#define WIENER_TMPBUF_SIZE (0)
+#define WIENER_EXTBUF_SIZE (0)
+
+// If WIENER_WIN_CHROMA == WIENER_WIN - 2, that implies 5x5 filters are used for
+// chroma. To use 7x7 for chroma set WIENER_WIN_CHROMA to WIENER_WIN.
+#define WIENER_WIN_CHROMA (WIENER_WIN - 2)
+#define WIENER_WIN2_CHROMA ((WIENER_WIN_CHROMA) * (WIENER_WIN_CHROMA))
+
+#define WIENER_FILT_PREC_BITS 7
+#define WIENER_FILT_STEP (1 << WIENER_FILT_PREC_BITS)
+
+// Central values for the taps
+#define WIENER_FILT_TAP0_MIDV (3)
+#define WIENER_FILT_TAP1_MIDV (-7)
+#define WIENER_FILT_TAP2_MIDV (15)
+#define WIENER_FILT_TAP3_MIDV \
+ (WIENER_FILT_STEP - 2 * (WIENER_FILT_TAP0_MIDV + WIENER_FILT_TAP1_MIDV + \
+ WIENER_FILT_TAP2_MIDV))
+
+#define WIENER_FILT_TAP0_BITS 4
+#define WIENER_FILT_TAP1_BITS 5
+#define WIENER_FILT_TAP2_BITS 6
+
+#define WIENER_FILT_BITS \
+ ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
+#define WIENER_FILT_TAP0_MINV \
+ (WIENER_FILT_TAP0_MIDV - (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MINV \
+ (WIENER_FILT_TAP1_MIDV - (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MINV \
+ (WIENER_FILT_TAP2_MIDV - (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_MAXV \
+ (WIENER_FILT_TAP0_MIDV - 1 + (1 << WIENER_FILT_TAP0_BITS) / 2)
+#define WIENER_FILT_TAP1_MAXV \
+ (WIENER_FILT_TAP1_MIDV - 1 + (1 << WIENER_FILT_TAP1_BITS) / 2)
+#define WIENER_FILT_TAP2_MAXV \
+ (WIENER_FILT_TAP2_MIDV - 1 + (1 << WIENER_FILT_TAP2_BITS) / 2)
+
+#define WIENER_FILT_TAP0_SUBEXP_K 1
+#define WIENER_FILT_TAP1_SUBEXP_K 2
+#define WIENER_FILT_TAP2_SUBEXP_K 3
+
+// Max of SGRPROJ_TMPBUF_SIZE, DOMAINTXFMRF_TMPBUF_SIZE, WIENER_TMPBUF_SIZE
+#define RESTORATION_TMPBUF_SIZE (SGRPROJ_TMPBUF_SIZE)
+
+// Max of SGRPROJ_EXTBUF_SIZE, WIENER_EXTBUF_SIZE
+#define RESTORATION_EXTBUF_SIZE (WIENER_EXTBUF_SIZE)
+
+// Check the assumptions of the existing code
+#if SUBPEL_TAPS != WIENER_WIN + 1
+#error "Wiener filter currently only works if SUBPEL_TAPS == WIENER_WIN + 1"
+#endif
+#if WIENER_FILT_PREC_BITS != 7
+#error "Wiener filter currently only works if WIENER_FILT_PREC_BITS == 7"
+#endif
+
+#define LR_TILE_ROW 0
+#define LR_TILE_COL 0
+#define LR_TILE_COLS 1
+
+typedef struct {
+ int r[2]; // radii
+ int s[2]; // sgr parameters for r[0] and r[1], based on GenSgrprojVtable()
+} sgr_params_type;
+
+typedef struct {
+ RestorationType restoration_type;
+ WienerInfo wiener_info;
+ SgrprojInfo sgrproj_info;
+} RestorationUnitInfo;
+
+// A restoration line buffer needs space for two lines plus a horizontal filter
+// margin of RESTORATION_EXTRA_HORZ on each side.
+#define RESTORATION_LINEBUFFER_WIDTH \
+ (RESTORATION_UNITSIZE_MAX * 3 / 2 + 2 * RESTORATION_EXTRA_HORZ)
+
+// Similarly, the column buffers (used when we're at a vertical tile edge
+// that we can't filter across) need space for one processing unit's worth
+// of pixels, plus the top/bottom border width
+#define RESTORATION_COLBUFFER_HEIGHT \
+ (RESTORATION_PROC_UNIT_SIZE + 2 * RESTORATION_BORDER)
+
+typedef struct {
+ // Temporary buffers to save/restore 3 lines above/below the restoration
+ // stripe.
+ uint16_t tmp_save_above[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+ uint16_t tmp_save_below[RESTORATION_BORDER][RESTORATION_LINEBUFFER_WIDTH];
+} RestorationLineBuffers;
+
+typedef struct {
+ uint8_t *stripe_boundary_above;  // saved context rows from above each stripe (RESTORATION_CTX_VERT rows per stripe)
+ uint8_t *stripe_boundary_below;  // saved context rows from below each stripe (RESTORATION_CTX_VERT rows per stripe)
+ int stripe_boundary_stride;  // row pitch of both buffers; users shift it by use_highbd to get a byte pitch
+ int stripe_boundary_size;  // NOTE(review): units not visible in this header - confirm at the allocation site
+} RestorationStripeBoundaries;
+
+typedef struct {
+ RestorationType frame_restoration_type;
+ int restoration_unit_size;
+
+ // Fields below here are allocated and initialised by
+ // av1_alloc_restoration_struct. (horz_)units_per_tile give the number of
+ // restoration units in (one row of) the largest tile in the frame. The data
+ // in unit_info is laid out with units_per_tile entries for each tile, which
+ // have stride horz_units_per_tile.
+ //
+ // Even if there are tiles of different sizes, the data in unit_info is laid
+ // out as if all tiles are of full size.
+ int units_per_tile;
+ int vert_units_per_tile, horz_units_per_tile;
+ RestorationUnitInfo *unit_info;
+ RestorationStripeBoundaries boundaries;
+ int optimized_lr;
+} RestorationInfo;
+
+static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {  // Reset both projection coefficients to the middle of their allowed ranges
+ sgrproj_info->xqd[0] = (SGRPROJ_PRJ_MIN0 + SGRPROJ_PRJ_MAX0) / 2;  // (-96 + 31) / 2 == -32 with the constants above
+ sgrproj_info->xqd[1] = (SGRPROJ_PRJ_MIN1 + SGRPROJ_PRJ_MAX1) / 2;  // (-32 + 95) / 2 == 31 with the constants above
+}
+
+static INLINE void set_default_wiener(WienerInfo *wiener_info) {  // Reset the vertical and horizontal filters to the same symmetric default taps
+ wiener_info->vfilter[0] = wiener_info->hfilter[0] = WIENER_FILT_TAP0_MIDV;
+ wiener_info->vfilter[1] = wiener_info->hfilter[1] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[2] = wiener_info->hfilter[2] = WIENER_FILT_TAP2_MIDV;
+ wiener_info->vfilter[WIENER_HALFWIN] = wiener_info->hfilter[WIENER_HALFWIN] =  // centre tap
+ -2 *
+ (WIENER_FILT_TAP2_MIDV + WIENER_FILT_TAP1_MIDV + WIENER_FILT_TAP0_MIDV);  // chosen so the seven stored taps sum to zero
+ wiener_info->vfilter[4] = wiener_info->hfilter[4] = WIENER_FILT_TAP2_MIDV;  // taps 4..6 mirror taps 2..0
+ wiener_info->vfilter[5] = wiener_info->hfilter[5] = WIENER_FILT_TAP1_MIDV;
+ wiener_info->vfilter[6] = wiener_info->hfilter[6] = WIENER_FILT_TAP0_MIDV;
+}
+
+typedef struct {
+ int h_start, h_end, v_start, v_end;
+} RestorationTileLimits;
+
+typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect,
+ int rest_unit_idx, void *priv,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs);
+
+typedef struct FilterFrameCtxt {
+ const RestorationInfo *rsi;
+ int tile_stripe0;
+ int ss_x, ss_y;
+ int highbd, bit_depth;
+ uint8_t *data8, *dst8;
+ int data_stride, dst_stride;
+ AV1PixelRect tile_rect;
+} FilterFrameCtxt;
+
+typedef struct AV1LrStruct {
+ rest_unit_visitor_t on_rest_unit;
+ FilterFrameCtxt ctxt[MAX_MB_PLANE];
+ YV12_BUFFER_CONFIG *frame;
+ YV12_BUFFER_CONFIG *dst;
+} AV1LrStruct;
+
+extern const sgr_params_type sgr_params[SGRPROJ_PARAMS];
+extern int sgrproj_mtable[SGRPROJ_PARAMS][2];
+extern const int32_t x_by_xplus1[256];
+extern const int32_t one_by_x[MAX_NELEM];
+
+void av1_alloc_restoration_struct(struct AV1Common *cm, RestorationInfo *rsi,
+ int is_uv);
+void av1_free_restoration_struct(RestorationInfo *rst_info);
+
+void extend_frame(uint8_t *data, int width, int height, int stride,
+ int border_horz, int border_vert, int highbd);
+void decode_xq(const int *xqd, int *xq, const sgr_params_type *params);
+
+// Filter a single loop restoration unit.
+//
+// limits is the limits of the unit. rui gives the mode to use for this unit
+// and its coefficients. If striped loop restoration is enabled, rsb contains
+// deblocked pixels to use for stripe boundaries; rlbs is just some space to
+// use as a scratch buffer. tile_rect gives the limits of the tile containing
+// this unit. tile_stripe0 is the index of the first stripe in this tile.
+//
+// ss_x and ss_y are flags which should be 1 if this is a plane with
+// horizontal/vertical subsampling, respectively. highbd is a flag which should
+// be 1 in high bit depth mode, in which case bit_depth is the bit depth.
+//
+// data8 is the frame data (pointing at the top-left corner of the frame, not
+// the restoration unit) and stride is its stride. dst8 is the buffer where the
+// results will be written and has stride dst_stride. Like data8, dst8 should
+// point at the top-left corner of the frame.
+//
+// Finally tmpbuf is a scratch buffer used by the sgrproj filter which should
+// be at least SGRPROJ_TMPBUF_SIZE big.
+void av1_loop_restoration_filter_unit(
+ const RestorationTileLimits *limits, const RestorationUnitInfo *rui,
+ const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
+ const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
+ int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf, int optimized_lr);
+
+void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm, int optimized_lr,
+ void *lr_ctxt);
+// Takes no arguments. Declared with (void) so that this is a real prototype
+// in C: an empty parameter list would leave the arguments unchecked.
+void av1_loop_restoration_precal(void);
+
+typedef void (*rest_tile_start_visitor_t)(int tile_row, int tile_col,
+ void *priv);
+struct AV1LrSyncData;
+
+typedef void (*sync_read_fn_t)(void *const lr_sync, int r, int c, int plane);
+
+typedef void (*sync_write_fn_t)(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+
+// Call on_rest_unit for each loop restoration unit in the plane.
+void av1_foreach_rest_unit_in_plane(const struct AV1Common *cm, int plane,
+ rest_unit_visitor_t on_rest_unit,
+ void *priv, AV1PixelRect *tile_rect,
+ int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs);
+
+// Return 1 iff the block at mi_row, mi_col with size bsize is a
+// top-level superblock containing the top-left corner of at least one
+// loop restoration unit.
+//
+// If the block is a top-level superblock, the function writes to
+// *rcol0, *rcol1, *rrow0, *rrow1. The rectangle of restoration unit
+// indices given by [*rcol0, *rcol1) x [*rrow0, *rrow1) is relative
+// to the top-left of the current tile. (There is no tile_tl_idx
+// output; the signature below takes only the four bounds.)
+int av1_loop_restoration_corners_in_sb(const struct AV1Common *cm, int plane,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rcol0, int *rcol1, int *rrow0,
+ int *rrow1);
+
+void av1_loop_restoration_save_boundary_lines(const YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int after_cdef);
+void av1_loop_restoration_filter_frame_init(AV1LrStruct *lr_ctxt,
+ YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, int num_planes);
+void av1_loop_restoration_copy_planes(AV1LrStruct *loop_rest_ctxt,
+ struct AV1Common *cm, int num_planes);
+void av1_foreach_rest_unit_in_row(
+ RestorationTileLimits *limits, const AV1PixelRect *tile_rect,
+ rest_unit_visitor_t on_rest_unit, int row_number, int unit_size,
+ int unit_idx0, int hunits_per_tile, int vunits_per_tile, int plane,
+ void *priv, int32_t *tmpbuf, RestorationLineBuffers *rlbs,
+ sync_read_fn_t on_sync_read, sync_write_fn_t on_sync_write,
+ struct AV1LrSyncData *const lr_sync);
+AV1PixelRect av1_whole_frame_rect(const struct AV1Common *cm, int is_uv);
+int av1_lr_count_units_in_tile(int unit_size, int tile_size);
+void av1_lr_sync_read_dummy(void *const lr_sync, int r, int c, int plane);
+void av1_lr_sync_write_dummy(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_RESTORATION_H_
diff --git a/third_party/aom/av1/common/scale.c b/third_party/aom/av1/common/scale.c
new file mode 100644
index 0000000000..c525fe2296
--- /dev/null
+++ b/third_party/aom/av1/common/scale.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/scale.h"
+#include "aom_dsp/aom_filter.h"
+
+// Map the horizontal position 'val' (expected in q4 precision) through the
+// horizontal fixed point scale factor of 'sf'. The product is reduced by
+// (REF_SCALE_SHIFT - SCALE_EXTRA_BITS) bits with signed rounding.
+static INLINE int scaled_x(int val, const struct scale_factors *sf) {
+  const int scale_fp = sf->x_scale_fp;
+  // Offset proportional to how far the scale is from 1 << REF_SCALE_SHIFT
+  const int bias =
+      (scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+  const int64_t scaled = (int64_t)val * scale_fp + bias;
+  const int drop_bits = REF_SCALE_SHIFT - SCALE_EXTRA_BITS;
+  return (int)ROUND_POWER_OF_TWO_SIGNED_64(scaled, drop_bits);
+}
+
+// Map the vertical position 'val' (expected in q4 precision) through the
+// vertical fixed point scale factor of 'sf'. The product is reduced by
+// (REF_SCALE_SHIFT - SCALE_EXTRA_BITS) bits with signed rounding.
+static INLINE int scaled_y(int val, const struct scale_factors *sf) {
+  const int scale_fp = sf->y_scale_fp;
+  // Offset proportional to how far the scale is from 1 << REF_SCALE_SHIFT
+  const int bias =
+      (scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
+  const int64_t scaled = (int64_t)val * scale_fp + bias;
+  const int drop_bits = REF_SCALE_SHIFT - SCALE_EXTRA_BITS;
+  return (int)ROUND_POWER_OF_TWO_SIGNED_64(scaled, drop_bits);
+}
+
+// Convert 'val' (expected in q4 precision) to the precision produced by the
+// scaled path when no scaling is in effect, i.e. multiply it by
+// 2^SCALE_EXTRA_BITS. 'sf' is unused; the parameter exists so the function
+// can be stored in the scale_value_x / scale_value_y function pointers.
+static int unscaled_value(int val, const struct scale_factors *sf) {
+  (void)sf;
+  // Multiply instead of shifting: 'val' is signed, and left-shifting a
+  // negative int is undefined behaviour in C.
+  return val * (1 << SCALE_EXTRA_BITS);
+}
+
+// Return other_size / this_size as a fixed point value with REF_SCALE_SHIFT
+// fractional bits, rounded to nearest.
+//
+// The factor is computed once per reference frame so that decoding and
+// encoding routines can use multiplication and shifting instead of division;
+// hardware implementations can compute it in the device driver.
+static int get_fixed_point_scale_factor(int other_size, int this_size) {
+  const int numerator = other_size << REF_SCALE_SHIFT;
+  const int half_divisor = this_size / 2;
+  return (numerator + half_divisor) / this_size;
+}
+
+// Reduce a scale with REF_SCALE_SHIFT fractional bits to SCALE_SUBPEL_BITS bits.
+static int fixed_point_scale_to_coarse_point_scale(int scale_fp) {
+ return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
+}
+
+// Scale the motion vector 'mvq4' (q4 precision) applied at the integer
+// position (x, y). Each component is the scaled position of the displaced
+// point minus the scaled position of (x, y) itself.
+MV32 av1_scale_mv(const MV *mvq4, int x, int y,
+                  const struct scale_factors *sf) {
+  const int base_x_q4 = x << SUBPEL_BITS;
+  const int base_y_q4 = y << SUBPEL_BITS;
+
+  const int scaled_base_x = scaled_x(base_x_q4, sf);
+  const int scaled_base_y = scaled_y(base_y_q4, sf);
+
+  const int row_delta = scaled_y(base_y_q4 + mvq4->row, sf) - scaled_base_y;
+  const int col_delta = scaled_x(base_x_q4 + mvq4->col, sf) - scaled_base_x;
+
+  // Positional initialiser, same member order as the original: row, col
+  const MV32 scaled_mv = { row_delta, col_delta };
+  return scaled_mv;
+}
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,  // Fill *sf for predicting a this_w x this_h frame from an other_w x other_h reference
+ int other_h, int this_w, int this_h) {
+ if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+ sf->x_scale_fp = REF_INVALID_SCALE;
+ sf->y_scale_fp = REF_INVALID_SCALE;
+ return;  // no other member of *sf is written on this path
+ }
+
+ sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
+ sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);
+
+ sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
+ sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);
+
+ if (av1_is_scaled(sf)) {  // true when either factor differs from REF_NO_SCALE
+ sf->scale_value_x = scaled_x;
+ sf->scale_value_y = scaled_y;
+ } else {
+ sf->scale_value_x = unscaled_value;
+ sf->scale_value_y = unscaled_value;
+ }
+
+ // AV1 convolve functions, indexed [subpel_x != 0][subpel_y != 0][is_compound]
+ // Special case convolve functions should produce the same result as
+ // av1_convolve_2d.
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->convolve[0][0][0] = av1_convolve_2d_copy_sr;
+ // subpel_x_q4 == 0
+ sf->convolve[0][1][0] = av1_convolve_y_sr;
+ // subpel_y_q4 == 0
+ sf->convolve[1][0][0] = av1_convolve_x_sr;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->convolve[1][1][0] = av1_convolve_2d_sr;
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->convolve[0][0][1] = av1_jnt_convolve_2d_copy;
+ // subpel_x_q4 == 0
+ sf->convolve[0][1][1] = av1_jnt_convolve_y;
+ // subpel_y_q4 == 0
+ sf->convolve[1][0][1] = av1_jnt_convolve_x;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->convolve[1][1][1] = av1_jnt_convolve_2d;
+ // AV1 High BD convolve functions, same indexing as sf->convolve
+ // Special case convolve functions should produce the same result as
+ // av1_highbd_convolve_2d.
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->highbd_convolve[0][0][0] = av1_highbd_convolve_2d_copy_sr;
+ // subpel_x_q4 == 0
+ sf->highbd_convolve[0][1][0] = av1_highbd_convolve_y_sr;
+ // subpel_y_q4 == 0
+ sf->highbd_convolve[1][0][0] = av1_highbd_convolve_x_sr;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->highbd_convolve[1][1][0] = av1_highbd_convolve_2d_sr;
+ // subpel_x_q4 == 0 && subpel_y_q4 == 0
+ sf->highbd_convolve[0][0][1] = av1_highbd_jnt_convolve_2d_copy;
+ // subpel_x_q4 == 0
+ sf->highbd_convolve[0][1][1] = av1_highbd_jnt_convolve_y;
+ // subpel_y_q4 == 0
+ sf->highbd_convolve[1][0][1] = av1_highbd_jnt_convolve_x;
+ // subpel_x_q4 != 0 && subpel_y_q4 != 0
+ sf->highbd_convolve[1][1][1] = av1_highbd_jnt_convolve_2d;
+}
diff --git a/third_party/aom/av1/common/scale.h b/third_party/aom/av1/common/scale.h
new file mode 100644
index 0000000000..748e958c35
--- /dev/null
+++ b/third_party/aom/av1/common/scale.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCALE_H_
+#define AOM_AV1_COMMON_SCALE_H_
+
+#include "av1/common/convolve.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SCALE_NUMERATOR 8
+
+// Scale factors are fixed point values with REF_SCALE_SHIFT fractional bits.
+#define REF_SCALE_SHIFT 14
+// Scale factor meaning "no scaling" (1.0 in the fixed point format above).
+#define REF_NO_SCALE (1 << REF_SCALE_SHIFT)
+// Sentinel stored in x_scale_fp / y_scale_fp when the reference size is not
+// usable. Parenthesised so the macro expands safely inside any expression.
+#define REF_INVALID_SCALE (-1)
+
+struct scale_factors {
+ int x_scale_fp; // horizontal fixed point scale factor (REF_SCALE_SHIFT fractional bits), or REF_INVALID_SCALE
+ int y_scale_fp; // vertical fixed point scale factor (REF_SCALE_SHIFT fractional bits), or REF_INVALID_SCALE
+ int x_step_q4;  // x_scale_fp reduced to SCALE_SUBPEL_BITS fractional bits
+ int y_step_q4;  // y_scale_fp reduced to SCALE_SUBPEL_BITS fractional bits
+
+ int (*scale_value_x)(int val, const struct scale_factors *sf);  // maps a q4 horizontal position; set by av1_setup_scale_factors_for_frame
+ int (*scale_value_y)(int val, const struct scale_factors *sf);  // maps a q4 vertical position; set by av1_setup_scale_factors_for_frame
+
+ // convolve_fn_ptr[subpel_x != 0][subpel_y != 0][is_compound]
+ aom_convolve_fn_t convolve[2][2][2];
+ aom_highbd_convolve_fn_t highbd_convolve[2][2][2];
+};
+
+MV32 av1_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
+
+void av1_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w,
+ int other_h, int this_w, int this_h);
+
+// Return 1 when neither scale factor of 'sf' is REF_INVALID_SCALE, else 0.
+static INLINE int av1_is_valid_scale(const struct scale_factors *sf) {
+  return !(sf->x_scale_fp == REF_INVALID_SCALE ||
+           sf->y_scale_fp == REF_INVALID_SCALE);
+}
+
+// Return 1 when 'sf' holds valid scale factors and at least one of them
+// differs from REF_NO_SCALE, else 0.
+static INLINE int av1_is_scaled(const struct scale_factors *sf) {
+  if (!av1_is_valid_scale(sf)) return 0;
+  return sf->x_scale_fp != REF_NO_SCALE || sf->y_scale_fp != REF_NO_SCALE;
+}
+
+// Return 1 when a reference of ref_width x ref_height may be used for a frame
+// of this_width x this_height: in each dimension the reference is at most
+// twice as large and at least one sixteenth as large as the current frame.
+static INLINE int valid_ref_frame_size(int ref_width, int ref_height,
+                                       int this_width, int this_height) {
+  // Reference more than 2x larger in either dimension
+  if (2 * this_width < ref_width || 2 * this_height < ref_height) return 0;
+  // Reference must not be more than 16x smaller in either dimension
+  return this_width <= 16 * ref_width && this_height <= 16 * ref_height;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCALE_H_
diff --git a/third_party/aom/av1/common/scan.c b/third_party/aom/av1/common/scan.c
new file mode 100644
index 0000000000..31a787b537
--- /dev/null
+++ b/third_party/aom/av1/common/scan.c
@@ -0,0 +1,3735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/common_data.h"
+#include "av1/common/scan.h"
+
+// Zig-zag order over a row-major 4x4 index grid: anti-diagonals are visited in
+// turn with alternating direction. Presumably the default scan for 4x4 blocks
+// -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x4[16]) = {
+ 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
+};
+
+// Column-by-column order: entry k == (k % 4) * 4 + k / 4, i.e. stride-4
+// indices walked down one column at a time. Presumably the column scan for
+// 4x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+// Identity table: entry k == k for k = 0..15 (row-by-row order over a
+// row-major grid). Presumably the row scan for 4x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+
+// Diagonal order over a row-major index grid 4 wide and 8 tall: each
+// anti-diagonal is listed in increasing index order. Presumably the default
+// scan for 4x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x8[32]) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 27, 30, 31,
+};
+
+// Column-by-column order: entry k == (k % 8) * 4 + k / 8, i.e. stride-4
+// indices walked down one 8-entry column at a time. Presumably the column
+// scan for 4x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x8[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+
+// Identity table: entry k == k for k = 0..31 (row-by-row order over a
+// row-major grid). Presumably the row scan for 4x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+// Diagonal order over a row-major index grid 8 wide and 4 tall: each
+// anti-diagonal is listed in decreasing index order. Presumably the default
+// scan for 8x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x4[32]) = {
+ 0, 8, 1, 16, 9, 2, 24, 17, 10, 3, 25, 18, 11, 4, 26, 19,
+ 12, 5, 27, 20, 13, 6, 28, 21, 14, 7, 29, 22, 15, 30, 23, 31,
+};
+
+// Column-by-column order: entry k == (k % 4) * 8 + k / 4, i.e. stride-8
+// indices walked down one 4-entry column at a time. Presumably the column
+// scan for 8x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x4[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+
+// Identity table: entry k == k for k = 0..31 (row-by-row order over a
+// row-major grid). Presumably the row scan for 8x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+
+// Diagonal order over a row-major index grid 4 wide and 16 tall: each
+// anti-diagonal is listed in increasing index order. Presumably the default
+// scan for 4x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_4x16[64]) = {
+ 0, 1, 4, 2, 5, 8, 3, 6, 9, 12, 7, 10, 13, 16, 11, 14,
+ 17, 20, 15, 18, 21, 24, 19, 22, 25, 28, 23, 26, 29, 32, 27, 30,
+ 33, 36, 31, 34, 37, 40, 35, 38, 41, 44, 39, 42, 45, 48, 43, 46,
+ 49, 52, 47, 50, 53, 56, 51, 54, 57, 60, 55, 58, 61, 59, 62, 63,
+};
+
+// Diagonal order over a row-major index grid 16 wide and 4 tall: each
+// anti-diagonal is listed in decreasing index order. Presumably the default
+// scan for 16x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x4[64]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 49, 34, 19, 4, 50, 35,
+ 20, 5, 51, 36, 21, 6, 52, 37, 22, 7, 53, 38, 23, 8, 54, 39,
+ 24, 9, 55, 40, 25, 10, 56, 41, 26, 11, 57, 42, 27, 12, 58, 43,
+ 28, 13, 59, 44, 29, 14, 60, 45, 30, 15, 61, 46, 31, 62, 47, 63,
+};
+
+// Identity table: entry k == k for k = 0..63 (row-by-row order over a
+// row-major grid). Presumably the row scan for 4x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+// Identity table: entry k == k for k = 0..63 (row-by-row order over a
+// row-major grid). Presumably the row scan for 16x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+// Column-by-column order: entry k == (k % 16) * 4 + k / 16, i.e. stride-4
+// indices walked down one 16-entry column at a time. Presumably the column
+// scan for 4x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x16[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+
+// Column-by-column order: entry k == (k % 4) * 16 + k / 4, i.e. stride-16
+// indices walked down one 4-entry column at a time. Presumably the column
+// scan for 16x4 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x4[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+
+// Diagonal order over a row-major index grid 8 wide and 32 tall: each
+// anti-diagonal is listed in increasing index order. Presumably the default
+// scan for 8x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x32[256]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 128, 87, 94, 101, 108, 115, 122, 129, 136, 95, 102, 109, 116,
+ 123, 130, 137, 144, 103, 110, 117, 124, 131, 138, 145, 152, 111, 118, 125,
+ 132, 139, 146, 153, 160, 119, 126, 133, 140, 147, 154, 161, 168, 127, 134,
+ 141, 148, 155, 162, 169, 176, 135, 142, 149, 156, 163, 170, 177, 184, 143,
+ 150, 157, 164, 171, 178, 185, 192, 151, 158, 165, 172, 179, 186, 193, 200,
+ 159, 166, 173, 180, 187, 194, 201, 208, 167, 174, 181, 188, 195, 202, 209,
+ 216, 175, 182, 189, 196, 203, 210, 217, 224, 183, 190, 197, 204, 211, 218,
+ 225, 232, 191, 198, 205, 212, 219, 226, 233, 240, 199, 206, 213, 220, 227,
+ 234, 241, 248, 207, 214, 221, 228, 235, 242, 249, 215, 222, 229, 236, 243,
+ 250, 223, 230, 237, 244, 251, 231, 238, 245, 252, 239, 246, 253, 247, 254,
+ 255,
+};
+
+// Diagonal order over a row-major index grid 32 wide and 8 tall: each
+// anti-diagonal is listed in decreasing index order. Presumably the default
+// scan for 32x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x8[256]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 225, 194, 163, 132, 101, 70, 39, 8, 226,
+ 195, 164, 133, 102, 71, 40, 9, 227, 196, 165, 134, 103, 72, 41, 10,
+ 228, 197, 166, 135, 104, 73, 42, 11, 229, 198, 167, 136, 105, 74, 43,
+ 12, 230, 199, 168, 137, 106, 75, 44, 13, 231, 200, 169, 138, 107, 76,
+ 45, 14, 232, 201, 170, 139, 108, 77, 46, 15, 233, 202, 171, 140, 109,
+ 78, 47, 16, 234, 203, 172, 141, 110, 79, 48, 17, 235, 204, 173, 142,
+ 111, 80, 49, 18, 236, 205, 174, 143, 112, 81, 50, 19, 237, 206, 175,
+ 144, 113, 82, 51, 20, 238, 207, 176, 145, 114, 83, 52, 21, 239, 208,
+ 177, 146, 115, 84, 53, 22, 240, 209, 178, 147, 116, 85, 54, 23, 241,
+ 210, 179, 148, 117, 86, 55, 24, 242, 211, 180, 149, 118, 87, 56, 25,
+ 243, 212, 181, 150, 119, 88, 57, 26, 244, 213, 182, 151, 120, 89, 58,
+ 27, 245, 214, 183, 152, 121, 90, 59, 28, 246, 215, 184, 153, 122, 91,
+ 60, 29, 247, 216, 185, 154, 123, 92, 61, 30, 248, 217, 186, 155, 124,
+ 93, 62, 31, 249, 218, 187, 156, 125, 94, 63, 250, 219, 188, 157, 126,
+ 95, 251, 220, 189, 158, 127, 252, 221, 190, 159, 253, 222, 191, 254, 223,
+ 255,
+};
+
+// Identity table: entry k == k for k = 0..255 (row-by-row order over a
+// row-major grid). Presumably the row scan for 8x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+// Identity table: entry k == k for k = 0..255 (row-by-row order over a
+// row-major grid). Presumably the row scan for 32x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+// Column-by-column order: entry k == (k % 32) * 8 + k / 32, i.e. stride-8
+// indices walked down one 32-entry column at a time. Presumably the column
+// scan for 8x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x32[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+// Column-by-column order: entry k == (k % 8) * 32 + k / 8, i.e. stride-32
+// indices walked down one 8-entry column at a time. Presumably the column
+// scan for 32x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x8[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+// Zig-zag order over a row-major 8x8 index grid: anti-diagonals are visited in
+// turn with alternating direction. Presumably the default scan for 8x8 blocks
+// -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
+ 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5,
+ 12, 19, 26, 33, 40, 48, 41, 34, 27, 20, 13, 6, 7, 14, 21, 28,
+ 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, 30, 37, 44, 51,
+ 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63
+};
+
+// Column-by-column order: entry k == (k % 8) * 8 + k / 8, i.e. stride-8
+// indices walked down one column at a time. Presumably the column scan for
+// 8x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+// Identity table: entry k == k for k = 0..63 (row-by-row order over a
+// row-major grid). Presumably the row scan for 8x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+// Diagonal order over a row-major index grid 8 wide and 16 tall: each
+// anti-diagonal is listed in increasing index order. Presumably the default
+// scan for 8x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_8x16[128]) = {
+ 0, 1, 8, 2, 9, 16, 3, 10, 17, 24, 4, 11, 18, 25, 32,
+ 5, 12, 19, 26, 33, 40, 6, 13, 20, 27, 34, 41, 48, 7, 14,
+ 21, 28, 35, 42, 49, 56, 15, 22, 29, 36, 43, 50, 57, 64, 23,
+ 30, 37, 44, 51, 58, 65, 72, 31, 38, 45, 52, 59, 66, 73, 80,
+ 39, 46, 53, 60, 67, 74, 81, 88, 47, 54, 61, 68, 75, 82, 89,
+ 96, 55, 62, 69, 76, 83, 90, 97, 104, 63, 70, 77, 84, 91, 98,
+ 105, 112, 71, 78, 85, 92, 99, 106, 113, 120, 79, 86, 93, 100, 107,
+ 114, 121, 87, 94, 101, 108, 115, 122, 95, 102, 109, 116, 123, 103, 110,
+ 117, 124, 111, 118, 125, 119, 126, 127,
+};
+
+// Diagonal order over a row-major index grid 16 wide and 8 tall: each
+// anti-diagonal is listed in decreasing index order. Presumably the default
+// scan for 16x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x8[128]) = {
+ 0, 16, 1, 32, 17, 2, 48, 33, 18, 3, 64, 49, 34, 19, 4, 80,
+ 65, 50, 35, 20, 5, 96, 81, 66, 51, 36, 21, 6, 112, 97, 82, 67,
+ 52, 37, 22, 7, 113, 98, 83, 68, 53, 38, 23, 8, 114, 99, 84, 69,
+ 54, 39, 24, 9, 115, 100, 85, 70, 55, 40, 25, 10, 116, 101, 86, 71,
+ 56, 41, 26, 11, 117, 102, 87, 72, 57, 42, 27, 12, 118, 103, 88, 73,
+ 58, 43, 28, 13, 119, 104, 89, 74, 59, 44, 29, 14, 120, 105, 90, 75,
+ 60, 45, 30, 15, 121, 106, 91, 76, 61, 46, 31, 122, 107, 92, 77, 62,
+ 47, 123, 108, 93, 78, 63, 124, 109, 94, 79, 125, 110, 95, 126, 111, 127,
+};
+
+// Column-by-column order: entry k == (k % 16) * 8 + k / 16, i.e. stride-8
+// indices walked down one 16-entry column at a time. Presumably the column
+// scan for 8x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x16[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+// Column-by-column order: entry k == (k % 8) * 16 + k / 8, i.e. stride-16
+// indices walked down one 8-entry column at a time. Presumably the column
+// scan for 16x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x8[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+// Identity table: entry k == k for k = 0..127 (row-by-row order over a
+// row-major grid). Presumably the row scan for 8x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+// Identity table: entry k == k for k = 0..127 (row-by-row order over a
+// row-major grid). Presumably the row scan for 16x8 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+// Diagonal order over a row-major index grid 16 wide and 32 tall: each
+// anti-diagonal is listed in increasing index order. Presumably the default
+// scan for 16x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x32[512]) = {
+ 0, 1, 16, 2, 17, 32, 3, 18, 33, 48, 4, 19, 34, 49, 64,
+ 5, 20, 35, 50, 65, 80, 6, 21, 36, 51, 66, 81, 96, 7, 22,
+ 37, 52, 67, 82, 97, 112, 8, 23, 38, 53, 68, 83, 98, 113, 128,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 10, 25, 40, 55, 70,
+ 85, 100, 115, 130, 145, 160, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 12, 27, 42, 57, 72, 87, 102, 117, 132, 147, 162, 177,
+ 192, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 14, 29, 44, 59, 74, 89, 104, 119, 134, 149, 164, 179, 194, 209, 224,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 31, 46, 61, 76, 91, 106, 121, 136, 151, 166, 181, 196, 211, 226,
+ 241, 256, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227,
+ 242, 257, 272, 63, 78, 93, 108, 123, 138, 153, 168, 183, 198, 213, 228,
+ 243, 258, 273, 288, 79, 94, 109, 124, 139, 154, 169, 184, 199, 214, 229,
+ 244, 259, 274, 289, 304, 95, 110, 125, 140, 155, 170, 185, 200, 215, 230,
+ 245, 260, 275, 290, 305, 320, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 261, 276, 291, 306, 321, 336, 127, 142, 157, 172, 187, 202, 217, 232,
+ 247, 262, 277, 292, 307, 322, 337, 352, 143, 158, 173, 188, 203, 218, 233,
+ 248, 263, 278, 293, 308, 323, 338, 353, 368, 159, 174, 189, 204, 219, 234,
+ 249, 264, 279, 294, 309, 324, 339, 354, 369, 384, 175, 190, 205, 220, 235,
+ 250, 265, 280, 295, 310, 325, 340, 355, 370, 385, 400, 191, 206, 221, 236,
+ 251, 266, 281, 296, 311, 326, 341, 356, 371, 386, 401, 416, 207, 222, 237,
+ 252, 267, 282, 297, 312, 327, 342, 357, 372, 387, 402, 417, 432, 223, 238,
+ 253, 268, 283, 298, 313, 328, 343, 358, 373, 388, 403, 418, 433, 448, 239,
+ 254, 269, 284, 299, 314, 329, 344, 359, 374, 389, 404, 419, 434, 449, 464,
+ 255, 270, 285, 300, 315, 330, 345, 360, 375, 390, 405, 420, 435, 450, 465,
+ 480, 271, 286, 301, 316, 331, 346, 361, 376, 391, 406, 421, 436, 451, 466,
+ 481, 496, 287, 302, 317, 332, 347, 362, 377, 392, 407, 422, 437, 452, 467,
+ 482, 497, 303, 318, 333, 348, 363, 378, 393, 408, 423, 438, 453, 468, 483,
+ 498, 319, 334, 349, 364, 379, 394, 409, 424, 439, 454, 469, 484, 499, 335,
+ 350, 365, 380, 395, 410, 425, 440, 455, 470, 485, 500, 351, 366, 381, 396,
+ 411, 426, 441, 456, 471, 486, 501, 367, 382, 397, 412, 427, 442, 457, 472,
+ 487, 502, 383, 398, 413, 428, 443, 458, 473, 488, 503, 399, 414, 429, 444,
+ 459, 474, 489, 504, 415, 430, 445, 460, 475, 490, 505, 431, 446, 461, 476,
+ 491, 506, 447, 462, 477, 492, 507, 463, 478, 493, 508, 479, 494, 509, 495,
+ 510, 511,
+};
+
+// Diagonal order over a row-major index grid 32 wide and 16 tall: each
+// anti-diagonal is listed in decreasing index order. Presumably the default
+// scan for 32x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x16[512]) = {
+ 0, 32, 1, 64, 33, 2, 96, 65, 34, 3, 128, 97, 66, 35, 4,
+ 160, 129, 98, 67, 36, 5, 192, 161, 130, 99, 68, 37, 6, 224, 193,
+ 162, 131, 100, 69, 38, 7, 256, 225, 194, 163, 132, 101, 70, 39, 8,
+ 288, 257, 226, 195, 164, 133, 102, 71, 40, 9, 320, 289, 258, 227, 196,
+ 165, 134, 103, 72, 41, 10, 352, 321, 290, 259, 228, 197, 166, 135, 104,
+ 73, 42, 11, 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43,
+ 12, 416, 385, 354, 323, 292, 261, 230, 199, 168, 137, 106, 75, 44, 13,
+ 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107, 76, 45, 14,
+ 480, 449, 418, 387, 356, 325, 294, 263, 232, 201, 170, 139, 108, 77, 46,
+ 15, 481, 450, 419, 388, 357, 326, 295, 264, 233, 202, 171, 140, 109, 78,
+ 47, 16, 482, 451, 420, 389, 358, 327, 296, 265, 234, 203, 172, 141, 110,
+ 79, 48, 17, 483, 452, 421, 390, 359, 328, 297, 266, 235, 204, 173, 142,
+ 111, 80, 49, 18, 484, 453, 422, 391, 360, 329, 298, 267, 236, 205, 174,
+ 143, 112, 81, 50, 19, 485, 454, 423, 392, 361, 330, 299, 268, 237, 206,
+ 175, 144, 113, 82, 51, 20, 486, 455, 424, 393, 362, 331, 300, 269, 238,
+ 207, 176, 145, 114, 83, 52, 21, 487, 456, 425, 394, 363, 332, 301, 270,
+ 239, 208, 177, 146, 115, 84, 53, 22, 488, 457, 426, 395, 364, 333, 302,
+ 271, 240, 209, 178, 147, 116, 85, 54, 23, 489, 458, 427, 396, 365, 334,
+ 303, 272, 241, 210, 179, 148, 117, 86, 55, 24, 490, 459, 428, 397, 366,
+ 335, 304, 273, 242, 211, 180, 149, 118, 87, 56, 25, 491, 460, 429, 398,
+ 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57, 26, 492, 461, 430,
+ 399, 368, 337, 306, 275, 244, 213, 182, 151, 120, 89, 58, 27, 493, 462,
+ 431, 400, 369, 338, 307, 276, 245, 214, 183, 152, 121, 90, 59, 28, 494,
+ 463, 432, 401, 370, 339, 308, 277, 246, 215, 184, 153, 122, 91, 60, 29,
+ 495, 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92, 61,
+ 30, 496, 465, 434, 403, 372, 341, 310, 279, 248, 217, 186, 155, 124, 93,
+ 62, 31, 497, 466, 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125,
+ 94, 63, 498, 467, 436, 405, 374, 343, 312, 281, 250, 219, 188, 157, 126,
+ 95, 499, 468, 437, 406, 375, 344, 313, 282, 251, 220, 189, 158, 127, 500,
+ 469, 438, 407, 376, 345, 314, 283, 252, 221, 190, 159, 501, 470, 439, 408,
+ 377, 346, 315, 284, 253, 222, 191, 502, 471, 440, 409, 378, 347, 316, 285,
+ 254, 223, 503, 472, 441, 410, 379, 348, 317, 286, 255, 504, 473, 442, 411,
+ 380, 349, 318, 287, 505, 474, 443, 412, 381, 350, 319, 506, 475, 444, 413,
+ 382, 351, 507, 476, 445, 414, 383, 508, 477, 446, 415, 509, 478, 447, 510,
+ 479, 511,
+};
+
+// Column-by-column order: entry k == (k % 32) * 16 + k / 32, i.e. stride-16
+// indices walked down one 32-entry column at a time. Presumably the column
+// scan for 16x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x32[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+// Column-by-column order: entry k == (k % 16) * 32 + k / 16, i.e. stride-32
+// indices walked down one 16-entry column at a time. Presumably the column
+// scan for 32x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x16[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+// Identity table: entry k == k for k = 0..511 (row-by-row order over a
+// row-major grid). Presumably the row scan for 16x32 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+// Identity table: entry k == k for k = 0..511 (row-by-row order over a
+// row-major grid). Presumably the row scan for 32x16 blocks -- TODO confirm.
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_16x16[256]) = {
+ 0, 1, 16, 32, 17, 2, 3, 18, 33, 48, 64, 49, 34, 19, 4,
+ 5, 20, 35, 50, 65, 80, 96, 81, 66, 51, 36, 21, 6, 7, 22,
+ 37, 52, 67, 82, 97, 112, 128, 113, 98, 83, 68, 53, 38, 23, 8,
+ 9, 24, 39, 54, 69, 84, 99, 114, 129, 144, 160, 145, 130, 115, 100,
+ 85, 70, 55, 40, 25, 10, 11, 26, 41, 56, 71, 86, 101, 116, 131,
+ 146, 161, 176, 192, 177, 162, 147, 132, 117, 102, 87, 72, 57, 42, 27,
+ 12, 13, 28, 43, 58, 73, 88, 103, 118, 133, 148, 163, 178, 193, 208,
+ 224, 209, 194, 179, 164, 149, 134, 119, 104, 89, 74, 59, 44, 29, 14,
+ 15, 30, 45, 60, 75, 90, 105, 120, 135, 150, 165, 180, 195, 210, 225,
+ 240, 241, 226, 211, 196, 181, 166, 151, 136, 121, 106, 91, 76, 61, 46,
+ 31, 47, 62, 77, 92, 107, 122, 137, 152, 167, 182, 197, 212, 227, 242,
+ 243, 228, 213, 198, 183, 168, 153, 138, 123, 108, 93, 78, 63, 79, 94,
+ 109, 124, 139, 154, 169, 184, 199, 214, 229, 244, 245, 230, 215, 200, 185,
+ 170, 155, 140, 125, 110, 95, 111, 126, 141, 156, 171, 186, 201, 216, 231,
+ 246, 247, 232, 217, 202, 187, 172, 157, 142, 127, 143, 158, 173, 188, 203,
+ 218, 233, 248, 249, 234, 219, 204, 189, 174, 159, 175, 190, 205, 220, 235,
+ 250, 251, 236, 221, 206, 191, 207, 222, 237, 252, 253, 238, 223, 239, 254,
+ 255
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
+ 0, 1, 32, 64, 33, 2, 3, 34, 65, 96, 128, 97, 66,
+ 35, 4, 5, 36, 67, 98, 129, 160, 192, 161, 130, 99, 68,
+ 37, 6, 7, 38, 69, 100, 131, 162, 193, 224, 256, 225, 194,
+ 163, 132, 101, 70, 39, 8, 9, 40, 71, 102, 133, 164, 195,
+ 226, 257, 288, 320, 289, 258, 227, 196, 165, 134, 103, 72, 41,
+ 10, 11, 42, 73, 104, 135, 166, 197, 228, 259, 290, 321, 352,
+ 384, 353, 322, 291, 260, 229, 198, 167, 136, 105, 74, 43, 12,
+ 13, 44, 75, 106, 137, 168, 199, 230, 261, 292, 323, 354, 385,
+ 416, 448, 417, 386, 355, 324, 293, 262, 231, 200, 169, 138, 107,
+ 76, 45, 14, 15, 46, 77, 108, 139, 170, 201, 232, 263, 294,
+ 325, 356, 387, 418, 449, 480, 512, 481, 450, 419, 388, 357, 326,
+ 295, 264, 233, 202, 171, 140, 109, 78, 47, 16, 17, 48, 79,
+ 110, 141, 172, 203, 234, 265, 296, 327, 358, 389, 420, 451, 482,
+ 513, 544, 576, 545, 514, 483, 452, 421, 390, 359, 328, 297, 266,
+ 235, 204, 173, 142, 111, 80, 49, 18, 19, 50, 81, 112, 143,
+ 174, 205, 236, 267, 298, 329, 360, 391, 422, 453, 484, 515, 546,
+ 577, 608, 640, 609, 578, 547, 516, 485, 454, 423, 392, 361, 330,
+ 299, 268, 237, 206, 175, 144, 113, 82, 51, 20, 21, 52, 83,
+ 114, 145, 176, 207, 238, 269, 300, 331, 362, 393, 424, 455, 486,
+ 517, 548, 579, 610, 641, 672, 704, 673, 642, 611, 580, 549, 518,
+ 487, 456, 425, 394, 363, 332, 301, 270, 239, 208, 177, 146, 115,
+ 84, 53, 22, 23, 54, 85, 116, 147, 178, 209, 240, 271, 302,
+ 333, 364, 395, 426, 457, 488, 519, 550, 581, 612, 643, 674, 705,
+ 736, 768, 737, 706, 675, 644, 613, 582, 551, 520, 489, 458, 427,
+ 396, 365, 334, 303, 272, 241, 210, 179, 148, 117, 86, 55, 24,
+ 25, 56, 87, 118, 149, 180, 211, 242, 273, 304, 335, 366, 397,
+ 428, 459, 490, 521, 552, 583, 614, 645, 676, 707, 738, 769, 800,
+ 832, 801, 770, 739, 708, 677, 646, 615, 584, 553, 522, 491, 460,
+ 429, 398, 367, 336, 305, 274, 243, 212, 181, 150, 119, 88, 57,
+ 26, 27, 58, 89, 120, 151, 182, 213, 244, 275, 306, 337, 368,
+ 399, 430, 461, 492, 523, 554, 585, 616, 647, 678, 709, 740, 771,
+ 802, 833, 864, 896, 865, 834, 803, 772, 741, 710, 679, 648, 617,
+ 586, 555, 524, 493, 462, 431, 400, 369, 338, 307, 276, 245, 214,
+ 183, 152, 121, 90, 59, 28, 29, 60, 91, 122, 153, 184, 215,
+ 246, 277, 308, 339, 370, 401, 432, 463, 494, 525, 556, 587, 618,
+ 649, 680, 711, 742, 773, 804, 835, 866, 897, 928, 960, 929, 898,
+ 867, 836, 805, 774, 743, 712, 681, 650, 619, 588, 557, 526, 495,
+ 464, 433, 402, 371, 340, 309, 278, 247, 216, 185, 154, 123, 92,
+ 61, 30, 31, 62, 93, 124, 155, 186, 217, 248, 279, 310, 341,
+ 372, 403, 434, 465, 496, 527, 558, 589, 620, 651, 682, 713, 744,
+ 775, 806, 837, 868, 899, 930, 961, 992, 993, 962, 931, 900, 869,
+ 838, 807, 776, 745, 714, 683, 652, 621, 590, 559, 528, 497, 466,
+ 435, 404, 373, 342, 311, 280, 249, 218, 187, 156, 125, 94, 63,
+ 95, 126, 157, 188, 219, 250, 281, 312, 343, 374, 405, 436, 467,
+ 498, 529, 560, 591, 622, 653, 684, 715, 746, 777, 808, 839, 870,
+ 901, 932, 963, 994, 995, 964, 933, 902, 871, 840, 809, 778, 747,
+ 716, 685, 654, 623, 592, 561, 530, 499, 468, 437, 406, 375, 344,
+ 313, 282, 251, 220, 189, 158, 127, 159, 190, 221, 252, 283, 314,
+ 345, 376, 407, 438, 469, 500, 531, 562, 593, 624, 655, 686, 717,
+ 748, 779, 810, 841, 872, 903, 934, 965, 996, 997, 966, 935, 904,
+ 873, 842, 811, 780, 749, 718, 687, 656, 625, 594, 563, 532, 501,
+ 470, 439, 408, 377, 346, 315, 284, 253, 222, 191, 223, 254, 285,
+ 316, 347, 378, 409, 440, 471, 502, 533, 564, 595, 626, 657, 688,
+ 719, 750, 781, 812, 843, 874, 905, 936, 967, 998, 999, 968, 937,
+ 906, 875, 844, 813, 782, 751, 720, 689, 658, 627, 596, 565, 534,
+ 503, 472, 441, 410, 379, 348, 317, 286, 255, 287, 318, 349, 380,
+ 411, 442, 473, 504, 535, 566, 597, 628, 659, 690, 721, 752, 783,
+ 814, 845, 876, 907, 938, 969, 1000, 1001, 970, 939, 908, 877, 846,
+ 815, 784, 753, 722, 691, 660, 629, 598, 567, 536, 505, 474, 443,
+ 412, 381, 350, 319, 351, 382, 413, 444, 475, 506, 537, 568, 599,
+ 630, 661, 692, 723, 754, 785, 816, 847, 878, 909, 940, 971, 1002,
+ 1003, 972, 941, 910, 879, 848, 817, 786, 755, 724, 693, 662, 631,
+ 600, 569, 538, 507, 476, 445, 414, 383, 415, 446, 477, 508, 539,
+ 570, 601, 632, 663, 694, 725, 756, 787, 818, 849, 880, 911, 942,
+ 973, 1004, 1005, 974, 943, 912, 881, 850, 819, 788, 757, 726, 695,
+ 664, 633, 602, 571, 540, 509, 478, 447, 479, 510, 541, 572, 603,
+ 634, 665, 696, 727, 758, 789, 820, 851, 882, 913, 944, 975, 1006,
+ 1007, 976, 945, 914, 883, 852, 821, 790, 759, 728, 697, 666, 635,
+ 604, 573, 542, 511, 543, 574, 605, 636, 667, 698, 729, 760, 791,
+ 822, 853, 884, 915, 946, 977, 1008, 1009, 978, 947, 916, 885, 854,
+ 823, 792, 761, 730, 699, 668, 637, 606, 575, 607, 638, 669, 700,
+ 731, 762, 793, 824, 855, 886, 917, 948, 979, 1010, 1011, 980, 949,
+ 918, 887, 856, 825, 794, 763, 732, 701, 670, 639, 671, 702, 733,
+ 764, 795, 826, 857, 888, 919, 950, 981, 1012, 1013, 982, 951, 920,
+ 889, 858, 827, 796, 765, 734, 703, 735, 766, 797, 828, 859, 890,
+ 921, 952, 983, 1014, 1015, 984, 953, 922, 891, 860, 829, 798, 767,
+ 799, 830, 861, 892, 923, 954, 985, 1016, 1017, 986, 955, 924, 893,
+ 862, 831, 863, 894, 925, 956, 987, 1018, 1019, 988, 957, 926, 895,
+ 927, 958, 989, 1020, 1021, 990, 959, 991, 1022, 1023
+};
+
+// Neighborhood 2-tuples for various scans and blocksizes,
+// in {top, left} order for each position in corresponding scan order.
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 4, 4, 1, 4, 1, 1, 2, 2, 2, 5, 5,
+ 8, 8, 8, 9, 12, 6, 9, 3, 6, 7, 10, 10, 13, 11, 14, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1,
+ 1, 2, 5, 6, 9, 10, 13, 2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6,
+ 9, 2, 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16,
+ 11, 14, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 0,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 1, 1,
+ 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 2, 2, 3,
+ 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x8_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4,
+ 4, 5, 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12,
+ 13, 16, 14, 17, 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21,
+ 24, 22, 25, 23, 26, 24, 24, 25, 28, 26, 29, 27, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 8, 1, 1, 8, 8, 2, 9, 9, 16, 10,
+ 17, 2, 2, 16, 16, 3, 10, 17, 24, 11, 18, 18, 25, 3, 3, 4, 11,
+ 19, 26, 12, 19, 4, 4, 20, 27, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 0, 0, 1, 8, 9, 16, 17, 24, 1,
+ 1, 2, 9, 10, 17, 18, 25, 2, 2, 3, 10, 11, 18, 19, 26, 3, 3,
+ 4, 11, 12, 19, 20, 27, 4, 4, 5, 12, 13, 20, 21, 28, 5, 5, 6,
+ 13, 14, 21, 22, 29, 6, 6, 7, 14, 15, 22, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x4_neighbors[33 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0,
+ 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8,
+ 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17,
+ 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29, 23, 30, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 4, 1, 1, 4, 4, 2, 5, 5, 8, 6, 9, 2,
+ 2, 8, 8, 3, 6, 9, 12, 7, 10, 10, 13, 12, 12, 13, 16, 11, 14, 14, 17,
+ 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
+ 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
+ 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
+ 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
+ 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 16, 1, 1, 16, 16, 2, 17, 17, 32, 18, 33, 2,
+ 2, 32, 32, 3, 18, 33, 48, 19, 34, 34, 49, 3, 3, 4, 19, 35, 50, 20, 35,
+ 4, 4, 36, 51, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
+ 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
+ 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
+ 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
+ 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5,
+ 8, 6, 9, 7, 10, 8, 8, 9, 12, 10, 13, 11, 14, 12, 12, 13, 16, 14, 17,
+ 15, 18, 16, 16, 17, 20, 18, 21, 19, 22, 20, 20, 21, 24, 22, 25, 23, 26, 24,
+ 24, 25, 28, 26, 29, 27, 30, 28, 28, 29, 32, 30, 33, 31, 34, 32, 32, 33, 36,
+ 34, 37, 35, 38, 36, 36, 37, 40, 38, 41, 39, 42, 40, 40, 41, 44, 42, 45, 43,
+ 46, 44, 44, 45, 48, 46, 49, 47, 50, 48, 48, 49, 52, 50, 53, 51, 54, 52, 52,
+ 53, 56, 54, 57, 55, 58, 56, 56, 57, 60, 58, 61, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 0, 0, 1, 16, 2, 17,
+ 3, 18, 4, 19, 5, 20, 6, 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12,
+ 27, 13, 28, 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36,
+ 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31,
+ 46, 32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54, 40, 55,
+ 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_4x16_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 4, 4, 8, 8, 12, 12, 16, 16, 20, 20, 24, 24, 28, 28, 32,
+ 32, 36, 36, 40, 40, 44, 44, 48, 48, 52, 52, 56, 56, 0, 0, 1, 4, 5, 8,
+ 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 32, 33, 36, 37, 40, 41, 44, 45,
+ 48, 49, 52, 53, 56, 57, 60, 1, 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21,
+ 22, 25, 26, 29, 30, 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58,
+ 61, 2, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31, 34,
+ 35, 38, 39, 42, 43, 46, 47, 50, 51, 54, 55, 58, 59, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x4_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 1, 16, 17, 32, 33, 48, 1, 1, 2,
+ 17, 18, 33, 34, 49, 2, 2, 3, 18, 19, 34, 35, 50, 3, 3, 4, 19, 20, 35,
+ 36, 51, 4, 4, 5, 20, 21, 36, 37, 52, 5, 5, 6, 21, 22, 37, 38, 53, 6,
+ 6, 7, 22, 23, 38, 39, 54, 7, 7, 8, 23, 24, 39, 40, 55, 8, 8, 9, 24,
+ 25, 40, 41, 56, 9, 9, 10, 25, 26, 41, 42, 57, 10, 10, 11, 26, 27, 42, 43,
+ 58, 11, 11, 12, 27, 28, 43, 44, 59, 12, 12, 13, 28, 29, 44, 45, 60, 13, 13,
+ 14, 29, 30, 45, 46, 61, 14, 14, 15, 30, 31, 46, 47, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 120, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107,
+ 114, 114, 121, 121, 128, 128, 128, 87, 94, 94, 101, 101, 108, 108, 115,
+ 115, 122, 122, 129, 129, 136, 136, 136, 95, 102, 102, 109, 109, 116, 116,
+ 123, 123, 130, 130, 137, 137, 144, 144, 144, 103, 110, 110, 117, 117, 124,
+ 124, 131, 131, 138, 138, 145, 145, 152, 152, 152, 111, 118, 118, 125, 125,
+ 132, 132, 139, 139, 146, 146, 153, 153, 160, 160, 160, 119, 126, 126, 133,
+ 133, 140, 140, 147, 147, 154, 154, 161, 161, 168, 168, 168, 127, 134, 134,
+ 141, 141, 148, 148, 155, 155, 162, 162, 169, 169, 176, 176, 176, 135, 142,
+ 142, 149, 149, 156, 156, 163, 163, 170, 170, 177, 177, 184, 184, 184, 143,
+ 150, 150, 157, 157, 164, 164, 171, 171, 178, 178, 185, 185, 192, 192, 192,
+ 151, 158, 158, 165, 165, 172, 172, 179, 179, 186, 186, 193, 193, 200, 200,
+ 200, 159, 166, 166, 173, 173, 180, 180, 187, 187, 194, 194, 201, 201, 208,
+ 208, 208, 167, 174, 174, 181, 181, 188, 188, 195, 195, 202, 202, 209, 209,
+ 216, 216, 216, 175, 182, 182, 189, 189, 196, 196, 203, 203, 210, 210, 217,
+ 217, 224, 224, 224, 183, 190, 190, 197, 197, 204, 204, 211, 211, 218, 218,
+ 225, 225, 232, 232, 232, 191, 198, 198, 205, 205, 212, 212, 219, 219, 226,
+ 226, 233, 233, 240, 240, 240, 199, 206, 206, 213, 213, 220, 220, 227, 227,
+ 234, 234, 241, 241, 248, 207, 214, 214, 221, 221, 228, 228, 235, 235, 242,
+ 242, 249, 215, 222, 222, 229, 229, 236, 236, 243, 243, 250, 223, 230, 230,
+ 237, 237, 244, 244, 251, 231, 238, 238, 245, 245, 252, 239, 246, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 8, 8,
+ 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194, 225, 9,
+ 9, 9, 40, 40, 71, 71, 102, 102, 133, 133, 164, 164, 195, 195, 226,
+ 10, 10, 10, 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196,
+ 227, 11, 11, 11, 42, 42, 73, 73, 104, 104, 135, 135, 166, 166, 197,
+ 197, 228, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168,
+ 168, 199, 199, 230, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138,
+ 169, 169, 200, 200, 231, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139,
+ 139, 170, 170, 201, 201, 232, 16, 16, 16, 47, 47, 78, 78, 109, 109,
+ 140, 140, 171, 171, 202, 202, 233, 17, 17, 17, 48, 48, 79, 79, 110,
+ 110, 141, 141, 172, 172, 203, 203, 234, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 19, 19, 19, 50, 50, 81,
+ 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 20, 20, 20, 51, 51,
+ 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 21, 21, 21, 52,
+ 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238, 22, 22, 22,
+ 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208, 208, 239, 23, 23,
+ 23, 54, 54, 85, 85, 116, 116, 147, 147, 178, 178, 209, 209, 240, 24,
+ 24, 24, 55, 55, 86, 86, 117, 117, 148, 148, 179, 179, 210, 210, 241,
+ 25, 25, 25, 56, 56, 87, 87, 118, 118, 149, 149, 180, 180, 211, 211,
+ 242, 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
+ 212, 243, 27, 27, 27, 58, 58, 89, 89, 120, 120, 151, 151, 182, 182,
+ 213, 213, 244, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
+ 183, 214, 214, 245, 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153,
+ 184, 184, 215, 215, 246, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154,
+ 154, 185, 185, 216, 216, 247, 31, 62, 62, 93, 93, 124, 124, 155, 155,
+ 186, 186, 217, 217, 248, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218,
+ 218, 249, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 127, 158, 158,
+ 189, 189, 220, 220, 251, 159, 190, 190, 221, 221, 252, 191, 222, 222, 253,
+ 223, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 120, 120, 121, 128, 122, 129, 123, 130, 124, 131, 125, 132, 126, 133,
+ 127, 134, 128, 128, 129, 136, 130, 137, 131, 138, 132, 139, 133, 140, 134,
+ 141, 135, 142, 136, 136, 137, 144, 138, 145, 139, 146, 140, 147, 141, 148,
+ 142, 149, 143, 150, 144, 144, 145, 152, 146, 153, 147, 154, 148, 155, 149,
+ 156, 150, 157, 151, 158, 152, 152, 153, 160, 154, 161, 155, 162, 156, 163,
+ 157, 164, 158, 165, 159, 166, 160, 160, 161, 168, 162, 169, 163, 170, 164,
+ 171, 165, 172, 166, 173, 167, 174, 168, 168, 169, 176, 170, 177, 171, 178,
+ 172, 179, 173, 180, 174, 181, 175, 182, 176, 176, 177, 184, 178, 185, 179,
+ 186, 180, 187, 181, 188, 182, 189, 183, 190, 184, 184, 185, 192, 186, 193,
+ 187, 194, 188, 195, 189, 196, 190, 197, 191, 198, 192, 192, 193, 200, 194,
+ 201, 195, 202, 196, 203, 197, 204, 198, 205, 199, 206, 200, 200, 201, 208,
+ 202, 209, 203, 210, 204, 211, 205, 212, 206, 213, 207, 214, 208, 208, 209,
+ 216, 210, 217, 211, 218, 212, 219, 213, 220, 214, 221, 215, 222, 216, 216,
+ 217, 224, 218, 225, 219, 226, 220, 227, 221, 228, 222, 229, 223, 230, 224,
+ 224, 225, 232, 226, 233, 227, 234, 228, 235, 229, 236, 230, 237, 231, 238,
+ 232, 232, 233, 240, 234, 241, 235, 242, 236, 243, 237, 244, 238, 245, 239,
+ 246, 240, 240, 241, 248, 242, 249, 243, 250, 244, 251, 245, 252, 246, 253,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x32_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48,
+ 48, 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104,
+ 112, 112, 120, 120, 128, 128, 136, 136, 144, 144, 152, 152, 160, 160, 168,
+ 168, 176, 176, 184, 184, 192, 192, 200, 200, 208, 208, 216, 216, 224, 224,
+ 232, 232, 240, 240, 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33,
+ 40, 41, 48, 49, 56, 57, 64, 65, 72, 73, 80, 81, 88, 89, 96,
+ 97, 104, 105, 112, 113, 120, 121, 128, 129, 136, 137, 144, 145, 152, 153,
+ 160, 161, 168, 169, 176, 177, 184, 185, 192, 193, 200, 201, 208, 209, 216,
+ 217, 224, 225, 232, 233, 240, 241, 248, 1, 1, 2, 9, 10, 17, 18,
+ 25, 26, 33, 34, 41, 42, 49, 50, 57, 58, 65, 66, 73, 74, 81,
+ 82, 89, 90, 97, 98, 105, 106, 113, 114, 121, 122, 129, 130, 137, 138,
+ 145, 146, 153, 154, 161, 162, 169, 170, 177, 178, 185, 186, 193, 194, 201,
+ 202, 209, 210, 217, 218, 225, 226, 233, 234, 241, 242, 249, 2, 2, 3,
+ 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58, 59, 66,
+ 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122, 123,
+ 130, 131, 138, 139, 146, 147, 154, 155, 162, 163, 170, 171, 178, 179, 186,
+ 187, 194, 195, 202, 203, 210, 211, 218, 219, 226, 227, 234, 235, 242, 243,
+ 250, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
+ 52, 59, 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108,
+ 115, 116, 123, 124, 131, 132, 139, 140, 147, 148, 155, 156, 163, 164, 171,
+ 172, 179, 180, 187, 188, 195, 196, 203, 204, 211, 212, 219, 220, 227, 228,
+ 235, 236, 243, 244, 251, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36,
+ 37, 44, 45, 52, 53, 60, 61, 68, 69, 76, 77, 84, 85, 92, 93,
+ 100, 101, 108, 109, 116, 117, 124, 125, 132, 133, 140, 141, 148, 149, 156,
+ 157, 164, 165, 172, 173, 180, 181, 188, 189, 196, 197, 204, 205, 212, 213,
+ 220, 221, 228, 229, 236, 237, 244, 245, 252, 5, 5, 6, 13, 14, 21,
+ 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 62, 69, 70, 77, 78,
+ 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125, 126, 133, 134, 141,
+ 142, 149, 150, 157, 158, 165, 166, 173, 174, 181, 182, 189, 190, 197, 198,
+ 205, 206, 213, 214, 221, 222, 229, 230, 237, 238, 245, 246, 253, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 63,
+ 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 127, 134, 135, 142, 143, 150, 151, 158, 159, 166, 167, 174, 175, 182, 183,
+ 190, 191, 198, 199, 206, 207, 214, 215, 222, 223, 230, 231, 238, 239, 246,
+ 247, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x8_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
+ 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193, 224,
+ 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194, 225,
+ 2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226,
+ 3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196, 227,
+ 4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200, 231,
+ 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201, 232,
+ 9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233,
+ 10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202, 203, 234,
+ 11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207, 238,
+ 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207, 208, 239,
+ 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240,
+ 17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209, 210, 241,
+ 18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244,
+ 21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214, 215, 246,
+ 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247,
+ 24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216, 217, 248,
+ 25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218, 219, 250,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251,
+ 28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222, 253,
+ 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1,
+ 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17,
+ 18, 25, 26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27,
+ 34, 35, 42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43,
+ 44, 51, 52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53,
+ 60, 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6,
+ 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1,
+ 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17,
+ 11, 18, 12, 19, 13, 20, 14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20,
+ 27, 21, 28, 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36,
+ 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39,
+ 46, 40, 40, 41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48,
+ 49, 56, 50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 2, 2, 2, 9, 9, 16, 16,
+ 16, 24, 24, 17, 24, 10, 17, 3, 10, 3, 3, 4, 4, 4, 11, 11, 18, 18, 25,
+ 25, 32, 32, 32, 40, 40, 33, 40, 26, 33, 19, 26, 12, 19, 5, 12, 5, 5, 6,
+ 6, 6, 13, 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 49, 56, 42, 49,
+ 35, 42, 28, 35, 21, 28, 14, 21, 7, 14, 15, 22, 22, 29, 29, 36, 36, 43, 43,
+ 50, 50, 57, 51, 58, 44, 51, 37, 44, 30, 37, 23, 30, 31, 38, 38, 45, 45, 52,
+ 52, 59, 53, 60, 46, 53, 39, 46, 47, 54, 54, 61, 55, 62, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 8, 8, 8, 2, 2, 2,
+ 9, 9, 16, 16, 16, 3, 3, 3, 10, 10, 17, 17, 24, 24, 24,
+ 4, 4, 4, 11, 11, 18, 18, 25, 25, 32, 32, 32, 5, 5, 5,
+ 12, 12, 19, 19, 26, 26, 33, 33, 40, 40, 40, 6, 6, 6, 13,
+ 13, 20, 20, 27, 27, 34, 34, 41, 41, 48, 48, 48, 7, 14, 14,
+ 21, 21, 28, 28, 35, 35, 42, 42, 49, 49, 56, 56, 56, 15, 22,
+ 22, 29, 29, 36, 36, 43, 43, 50, 50, 57, 57, 64, 64, 64, 23,
+ 30, 30, 37, 37, 44, 44, 51, 51, 58, 58, 65, 65, 72, 72, 72,
+ 31, 38, 38, 45, 45, 52, 52, 59, 59, 66, 66, 73, 73, 80, 80,
+ 80, 39, 46, 46, 53, 53, 60, 60, 67, 67, 74, 74, 81, 81, 88,
+ 88, 88, 47, 54, 54, 61, 61, 68, 68, 75, 75, 82, 82, 89, 89,
+ 96, 96, 96, 55, 62, 62, 69, 69, 76, 76, 83, 83, 90, 90, 97,
+ 97, 104, 104, 104, 63, 70, 70, 77, 77, 84, 84, 91, 91, 98, 98,
+ 105, 105, 112, 112, 112, 71, 78, 78, 85, 85, 92, 92, 99, 99, 106,
+ 106, 113, 113, 120, 79, 86, 86, 93, 93, 100, 100, 107, 107, 114, 114,
+ 121, 87, 94, 94, 101, 101, 108, 108, 115, 115, 122, 95, 102, 102, 109,
+ 109, 116, 116, 123, 103, 110, 110, 117, 117, 124, 111, 118, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 8, 8,
+ 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98, 113, 9,
+ 9, 9, 24, 24, 39, 39, 54, 54, 69, 69, 84, 84, 99, 99, 114,
+ 10, 10, 10, 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100,
+ 115, 11, 11, 11, 26, 26, 41, 41, 56, 56, 71, 71, 86, 86, 101,
+ 101, 116, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88,
+ 88, 103, 103, 118, 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74,
+ 89, 89, 104, 104, 119, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90,
+ 90, 105, 105, 120, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 47, 62, 62, 77, 77, 92, 92, 107, 107, 122, 63, 78, 78, 93,
+ 93, 108, 108, 123, 79, 94, 94, 109, 109, 124, 95, 110, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48,
+ 56, 56, 64, 64, 72, 72, 80, 80, 88, 88, 96, 96, 104, 104, 112, 112,
+ 0, 0, 1, 8, 9, 16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56,
+ 57, 64, 65, 72, 73, 80, 81, 88, 89, 96, 97, 104, 105, 112, 113, 120,
+ 1, 1, 2, 9, 10, 17, 18, 25, 26, 33, 34, 41, 42, 49, 50, 57,
+ 58, 65, 66, 73, 74, 81, 82, 89, 90, 97, 98, 105, 106, 113, 114, 121,
+ 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 50, 51, 58,
+ 59, 66, 67, 74, 75, 82, 83, 90, 91, 98, 99, 106, 107, 114, 115, 122,
+ 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51, 52, 59,
+ 60, 67, 68, 75, 76, 83, 84, 91, 92, 99, 100, 107, 108, 115, 116, 123,
+ 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60,
+ 61, 68, 69, 76, 77, 84, 85, 92, 93, 100, 101, 108, 109, 116, 117, 124,
+ 5, 5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61,
+ 62, 69, 70, 77, 78, 85, 86, 93, 94, 101, 102, 109, 110, 117, 118, 125,
+ 6, 6, 7, 14, 15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62,
+ 63, 70, 71, 78, 79, 86, 87, 94, 95, 102, 103, 110, 111, 118, 119, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+ 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+ 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+ 2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+ 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+ 4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+ 5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+ 6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+ 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+ 9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+ 10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+ 12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+ 13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+ 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_8x16_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 0, 0, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13,
+ 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20, 14,
+ 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28,
+ 22, 29, 23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29,
+ 36, 30, 37, 31, 38, 32, 32, 33, 40, 34, 41, 35, 42, 36, 43,
+ 37, 44, 38, 45, 39, 46, 40, 40, 41, 48, 42, 49, 43, 50, 44,
+ 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56, 50, 57, 51, 58,
+ 52, 59, 53, 60, 54, 61, 55, 62, 56, 56, 57, 64, 58, 65, 59,
+ 66, 60, 67, 61, 68, 62, 69, 63, 70, 64, 64, 65, 72, 66, 73,
+ 67, 74, 68, 75, 69, 76, 70, 77, 71, 78, 72, 72, 73, 80, 74,
+ 81, 75, 82, 76, 83, 77, 84, 78, 85, 79, 86, 80, 80, 81, 88,
+ 82, 89, 83, 90, 84, 91, 85, 92, 86, 93, 87, 94, 88, 88, 89,
+ 96, 90, 97, 91, 98, 92, 99, 93, 100, 94, 101, 95, 102, 96, 96,
+ 97, 104, 98, 105, 99, 106, 100, 107, 101, 108, 102, 109, 103, 110, 104,
+ 104, 105, 112, 106, 113, 107, 114, 108, 115, 109, 116, 110, 117, 111, 118,
+ 112, 112, 113, 120, 114, 121, 115, 122, 116, 123, 117, 124, 118, 125, 119,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x8_neighbors[129 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 16, 16, 16, 2, 2, 2,
+ 17, 17, 32, 32, 32, 3, 3, 3, 18, 18, 33, 33, 48, 48, 48,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 5, 5, 5,
+ 20, 20, 35, 35, 50, 50, 65, 65, 80, 80, 80, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 7, 7, 7,
+ 22, 22, 37, 37, 52, 52, 67, 67, 82, 82, 97, 97, 112, 112, 112,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 9, 9, 9, 24, 24, 39, 39, 54, 54, 69,
+ 69, 84, 84, 99, 99, 114, 114, 129, 129, 144, 144, 144, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 11, 11, 11, 26, 26, 41, 41, 56, 56,
+ 71, 71, 86, 86, 101, 101, 116, 116, 131, 131, 146, 146, 161, 161, 176,
+ 176, 176, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 13, 13, 13, 28, 28, 43, 43, 58, 58, 73, 73, 88, 88, 103, 103,
+ 118, 118, 133, 133, 148, 148, 163, 163, 178, 178, 193, 193, 208, 208, 208,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 15, 30, 30, 45, 45, 60, 60, 75, 75, 90, 90, 105, 105,
+ 120, 120, 135, 135, 150, 150, 165, 165, 180, 180, 195, 195, 210, 210, 225,
+ 225, 240, 240, 240, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106,
+ 121, 121, 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226,
+ 226, 241, 241, 256, 256, 256, 47, 62, 62, 77, 77, 92, 92, 107, 107,
+ 122, 122, 137, 137, 152, 152, 167, 167, 182, 182, 197, 197, 212, 212, 227,
+ 227, 242, 242, 257, 257, 272, 272, 272, 63, 78, 78, 93, 93, 108, 108,
+ 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198, 213, 213, 228,
+ 228, 243, 243, 258, 258, 273, 273, 288, 288, 288, 79, 94, 94, 109, 109,
+ 124, 124, 139, 139, 154, 154, 169, 169, 184, 184, 199, 199, 214, 214, 229,
+ 229, 244, 244, 259, 259, 274, 274, 289, 289, 304, 304, 304, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 245, 260, 260, 275, 275, 290, 290, 305, 305, 320, 320, 320, 111,
+ 126, 126, 141, 141, 156, 156, 171, 171, 186, 186, 201, 201, 216, 216, 231,
+ 231, 246, 246, 261, 261, 276, 276, 291, 291, 306, 306, 321, 321, 336, 336,
+ 336, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202, 202, 217, 217, 232,
+ 232, 247, 247, 262, 262, 277, 277, 292, 292, 307, 307, 322, 322, 337, 337,
+ 352, 352, 352, 143, 158, 158, 173, 173, 188, 188, 203, 203, 218, 218, 233,
+ 233, 248, 248, 263, 263, 278, 278, 293, 293, 308, 308, 323, 323, 338, 338,
+ 353, 353, 368, 368, 368, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 249, 264, 264, 279, 279, 294, 294, 309, 309, 324, 324, 339, 339,
+ 354, 354, 369, 369, 384, 384, 384, 175, 190, 190, 205, 205, 220, 220, 235,
+ 235, 250, 250, 265, 265, 280, 280, 295, 295, 310, 310, 325, 325, 340, 340,
+ 355, 355, 370, 370, 385, 385, 400, 400, 400, 191, 206, 206, 221, 221, 236,
+ 236, 251, 251, 266, 266, 281, 281, 296, 296, 311, 311, 326, 326, 341, 341,
+ 356, 356, 371, 371, 386, 386, 401, 401, 416, 416, 416, 207, 222, 222, 237,
+ 237, 252, 252, 267, 267, 282, 282, 297, 297, 312, 312, 327, 327, 342, 342,
+ 357, 357, 372, 372, 387, 387, 402, 402, 417, 417, 432, 432, 432, 223, 238,
+ 238, 253, 253, 268, 268, 283, 283, 298, 298, 313, 313, 328, 328, 343, 343,
+ 358, 358, 373, 373, 388, 388, 403, 403, 418, 418, 433, 433, 448, 448, 448,
+ 239, 254, 254, 269, 269, 284, 284, 299, 299, 314, 314, 329, 329, 344, 344,
+ 359, 359, 374, 374, 389, 389, 404, 404, 419, 419, 434, 434, 449, 449, 464,
+ 464, 464, 255, 270, 270, 285, 285, 300, 300, 315, 315, 330, 330, 345, 345,
+ 360, 360, 375, 375, 390, 390, 405, 405, 420, 420, 435, 435, 450, 450, 465,
+ 465, 480, 480, 480, 271, 286, 286, 301, 301, 316, 316, 331, 331, 346, 346,
+ 361, 361, 376, 376, 391, 391, 406, 406, 421, 421, 436, 436, 451, 451, 466,
+ 466, 481, 481, 496, 287, 302, 302, 317, 317, 332, 332, 347, 347, 362, 362,
+ 377, 377, 392, 392, 407, 407, 422, 422, 437, 437, 452, 452, 467, 467, 482,
+ 482, 497, 303, 318, 318, 333, 333, 348, 348, 363, 363, 378, 378, 393, 393,
+ 408, 408, 423, 423, 438, 438, 453, 453, 468, 468, 483, 483, 498, 319, 334,
+ 334, 349, 349, 364, 364, 379, 379, 394, 394, 409, 409, 424, 424, 439, 439,
+ 454, 454, 469, 469, 484, 484, 499, 335, 350, 350, 365, 365, 380, 380, 395,
+ 395, 410, 410, 425, 425, 440, 440, 455, 455, 470, 470, 485, 485, 500, 351,
+ 366, 366, 381, 381, 396, 396, 411, 411, 426, 426, 441, 441, 456, 456, 471,
+ 471, 486, 486, 501, 367, 382, 382, 397, 397, 412, 412, 427, 427, 442, 442,
+ 457, 457, 472, 472, 487, 487, 502, 383, 398, 398, 413, 413, 428, 428, 443,
+ 443, 458, 458, 473, 473, 488, 488, 503, 399, 414, 414, 429, 429, 444, 444,
+ 459, 459, 474, 474, 489, 489, 504, 415, 430, 430, 445, 445, 460, 460, 475,
+ 475, 490, 490, 505, 431, 446, 446, 461, 461, 476, 476, 491, 491, 506, 447,
+ 462, 462, 477, 477, 492, 492, 507, 463, 478, 478, 493, 493, 508, 479, 494,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 1, 1, 1, 32, 32, 32, 2, 2, 2,
+ 33, 33, 64, 64, 64, 3, 3, 3, 34, 34, 65, 65, 96, 96, 96,
+ 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128, 5, 5, 5,
+ 36, 36, 67, 67, 98, 98, 129, 129, 160, 160, 160, 6, 6, 6, 37,
+ 37, 68, 68, 99, 99, 130, 130, 161, 161, 192, 192, 192, 7, 7, 7,
+ 38, 38, 69, 69, 100, 100, 131, 131, 162, 162, 193, 193, 224, 224, 224,
+ 8, 8, 8, 39, 39, 70, 70, 101, 101, 132, 132, 163, 163, 194, 194,
+ 225, 225, 256, 256, 256, 9, 9, 9, 40, 40, 71, 71, 102, 102, 133,
+ 133, 164, 164, 195, 195, 226, 226, 257, 257, 288, 288, 288, 10, 10, 10,
+ 41, 41, 72, 72, 103, 103, 134, 134, 165, 165, 196, 196, 227, 227, 258,
+ 258, 289, 289, 320, 320, 320, 11, 11, 11, 42, 42, 73, 73, 104, 104,
+ 135, 135, 166, 166, 197, 197, 228, 228, 259, 259, 290, 290, 321, 321, 352,
+ 352, 352, 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167,
+ 198, 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 13, 13, 13, 44, 44, 75, 75, 106, 106, 137, 137, 168, 168, 199, 199,
+ 230, 230, 261, 261, 292, 292, 323, 323, 354, 354, 385, 385, 416, 416, 416,
+ 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169, 169, 200, 200,
+ 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386, 386, 417, 417, 448,
+ 448, 448, 15, 15, 15, 46, 46, 77, 77, 108, 108, 139, 139, 170, 170,
+ 201, 201, 232, 232, 263, 263, 294, 294, 325, 325, 356, 356, 387, 387, 418,
+ 418, 449, 449, 480, 16, 16, 16, 47, 47, 78, 78, 109, 109, 140, 140,
+ 171, 171, 202, 202, 233, 233, 264, 264, 295, 295, 326, 326, 357, 357, 388,
+ 388, 419, 419, 450, 450, 481, 17, 17, 17, 48, 48, 79, 79, 110, 110,
+ 141, 141, 172, 172, 203, 203, 234, 234, 265, 265, 296, 296, 327, 327, 358,
+ 358, 389, 389, 420, 420, 451, 451, 482, 18, 18, 18, 49, 49, 80, 80,
+ 111, 111, 142, 142, 173, 173, 204, 204, 235, 235, 266, 266, 297, 297, 328,
+ 328, 359, 359, 390, 390, 421, 421, 452, 452, 483, 19, 19, 19, 50, 50,
+ 81, 81, 112, 112, 143, 143, 174, 174, 205, 205, 236, 236, 267, 267, 298,
+ 298, 329, 329, 360, 360, 391, 391, 422, 422, 453, 453, 484, 20, 20, 20,
+ 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206, 206, 237, 237, 268,
+ 268, 299, 299, 330, 330, 361, 361, 392, 392, 423, 423, 454, 454, 485, 21,
+ 21, 21, 52, 52, 83, 83, 114, 114, 145, 145, 176, 176, 207, 207, 238,
+ 238, 269, 269, 300, 300, 331, 331, 362, 362, 393, 393, 424, 424, 455, 455,
+ 486, 22, 22, 22, 53, 53, 84, 84, 115, 115, 146, 146, 177, 177, 208,
+ 208, 239, 239, 270, 270, 301, 301, 332, 332, 363, 363, 394, 394, 425, 425,
+ 456, 456, 487, 23, 23, 23, 54, 54, 85, 85, 116, 116, 147, 147, 178,
+ 178, 209, 209, 240, 240, 271, 271, 302, 302, 333, 333, 364, 364, 395, 395,
+ 426, 426, 457, 457, 488, 24, 24, 24, 55, 55, 86, 86, 117, 117, 148,
+ 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334, 334, 365, 365,
+ 396, 396, 427, 427, 458, 458, 489, 25, 25, 25, 56, 56, 87, 87, 118,
+ 118, 149, 149, 180, 180, 211, 211, 242, 242, 273, 273, 304, 304, 335, 335,
+ 366, 366, 397, 397, 428, 428, 459, 459, 490, 26, 26, 26, 57, 57, 88,
+ 88, 119, 119, 150, 150, 181, 181, 212, 212, 243, 243, 274, 274, 305, 305,
+ 336, 336, 367, 367, 398, 398, 429, 429, 460, 460, 491, 27, 27, 27, 58,
+ 58, 89, 89, 120, 120, 151, 151, 182, 182, 213, 213, 244, 244, 275, 275,
+ 306, 306, 337, 337, 368, 368, 399, 399, 430, 430, 461, 461, 492, 28, 28,
+ 28, 59, 59, 90, 90, 121, 121, 152, 152, 183, 183, 214, 214, 245, 245,
+ 276, 276, 307, 307, 338, 338, 369, 369, 400, 400, 431, 431, 462, 462, 493,
+ 29, 29, 29, 60, 60, 91, 91, 122, 122, 153, 153, 184, 184, 215, 215,
+ 246, 246, 277, 277, 308, 308, 339, 339, 370, 370, 401, 401, 432, 432, 463,
+ 463, 494, 30, 30, 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185,
+ 216, 216, 247, 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433,
+ 433, 464, 464, 495, 31, 62, 62, 93, 93, 124, 124, 155, 155, 186, 186,
+ 217, 217, 248, 248, 279, 279, 310, 310, 341, 341, 372, 372, 403, 403, 434,
+ 434, 465, 465, 496, 63, 94, 94, 125, 125, 156, 156, 187, 187, 218, 218,
+ 249, 249, 280, 280, 311, 311, 342, 342, 373, 373, 404, 404, 435, 435, 466,
+ 466, 497, 95, 126, 126, 157, 157, 188, 188, 219, 219, 250, 250, 281, 281,
+ 312, 312, 343, 343, 374, 374, 405, 405, 436, 436, 467, 467, 498, 127, 158,
+ 158, 189, 189, 220, 220, 251, 251, 282, 282, 313, 313, 344, 344, 375, 375,
+ 406, 406, 437, 437, 468, 468, 499, 159, 190, 190, 221, 221, 252, 252, 283,
+ 283, 314, 314, 345, 345, 376, 376, 407, 407, 438, 438, 469, 469, 500, 191,
+ 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377, 377, 408, 408, 439,
+ 439, 470, 470, 501, 223, 254, 254, 285, 285, 316, 316, 347, 347, 378, 378,
+ 409, 409, 440, 440, 471, 471, 502, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 287, 318, 318, 349, 349, 380, 380,
+ 411, 411, 442, 442, 473, 473, 504, 319, 350, 350, 381, 381, 412, 412, 443,
+ 443, 474, 474, 505, 351, 382, 382, 413, 413, 444, 444, 475, 475, 506, 383,
+ 414, 414, 445, 445, 476, 476, 507, 415, 446, 446, 477, 477, 508, 447, 478,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 240, 240, 256, 256, 272, 272, 288, 288, 304, 304, 320, 320, 336,
+ 336, 352, 352, 368, 368, 384, 384, 400, 400, 416, 416, 432, 432, 448, 448,
+ 464, 464, 480, 480, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65,
+ 80, 81, 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192,
+ 193, 208, 209, 224, 225, 240, 241, 256, 257, 272, 273, 288, 289, 304, 305,
+ 320, 321, 336, 337, 352, 353, 368, 369, 384, 385, 400, 401, 416, 417, 432,
+ 433, 448, 449, 464, 465, 480, 481, 496, 1, 1, 2, 17, 18, 33, 34,
+ 49, 50, 65, 66, 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161,
+ 162, 177, 178, 193, 194, 209, 210, 225, 226, 241, 242, 257, 258, 273, 274,
+ 289, 290, 305, 306, 321, 322, 337, 338, 353, 354, 369, 370, 385, 386, 401,
+ 402, 417, 418, 433, 434, 449, 450, 465, 466, 481, 482, 497, 2, 2, 3,
+ 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114, 115, 130,
+ 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211, 226, 227, 242, 243,
+ 258, 259, 274, 275, 290, 291, 306, 307, 322, 323, 338, 339, 354, 355, 370,
+ 371, 386, 387, 402, 403, 418, 419, 434, 435, 450, 451, 466, 467, 482, 483,
+ 498, 3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99,
+ 100, 115, 116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+ 227, 228, 243, 244, 259, 260, 275, 276, 291, 292, 307, 308, 323, 324, 339,
+ 340, 355, 356, 371, 372, 387, 388, 403, 404, 419, 420, 435, 436, 451, 452,
+ 467, 468, 483, 484, 499, 4, 4, 5, 20, 21, 36, 37, 52, 53, 68,
+ 69, 84, 85, 100, 101, 116, 117, 132, 133, 148, 149, 164, 165, 180, 181,
+ 196, 197, 212, 213, 228, 229, 244, 245, 260, 261, 276, 277, 292, 293, 308,
+ 309, 324, 325, 340, 341, 356, 357, 372, 373, 388, 389, 404, 405, 420, 421,
+ 436, 437, 452, 453, 468, 469, 484, 485, 500, 5, 5, 6, 21, 22, 37,
+ 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133, 134, 149, 150,
+ 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 246, 261, 262, 277,
+ 278, 293, 294, 309, 310, 325, 326, 341, 342, 357, 358, 373, 374, 389, 390,
+ 405, 406, 421, 422, 437, 438, 453, 454, 469, 470, 485, 486, 501, 6, 6,
+ 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118, 119,
+ 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231, 246,
+ 247, 262, 263, 278, 279, 294, 295, 310, 311, 326, 327, 342, 343, 358, 359,
+ 374, 375, 390, 391, 406, 407, 422, 423, 438, 439, 454, 455, 470, 471, 486,
+ 487, 502, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88,
+ 103, 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215,
+ 216, 231, 232, 247, 248, 263, 264, 279, 280, 295, 296, 311, 312, 327, 328,
+ 343, 344, 359, 360, 375, 376, 391, 392, 407, 408, 423, 424, 439, 440, 455,
+ 456, 471, 472, 487, 488, 503, 8, 8, 9, 24, 25, 40, 41, 56, 57,
+ 72, 73, 88, 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184,
+ 185, 200, 201, 216, 217, 232, 233, 248, 249, 264, 265, 280, 281, 296, 297,
+ 312, 313, 328, 329, 344, 345, 360, 361, 376, 377, 392, 393, 408, 409, 424,
+ 425, 440, 441, 456, 457, 472, 473, 488, 489, 504, 9, 9, 10, 25, 26,
+ 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121, 122, 137, 138, 153,
+ 154, 169, 170, 185, 186, 201, 202, 217, 218, 233, 234, 249, 250, 265, 266,
+ 281, 282, 297, 298, 313, 314, 329, 330, 345, 346, 361, 362, 377, 378, 393,
+ 394, 409, 410, 425, 426, 441, 442, 457, 458, 473, 474, 489, 490, 505, 10,
+ 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+ 123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219, 234, 235,
+ 250, 251, 266, 267, 282, 283, 298, 299, 314, 315, 330, 331, 346, 347, 362,
+ 363, 378, 379, 394, 395, 410, 411, 426, 427, 442, 443, 458, 459, 474, 475,
+ 490, 491, 506, 11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91,
+ 92, 107, 108, 123, 124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204,
+ 219, 220, 235, 236, 251, 252, 267, 268, 283, 284, 299, 300, 315, 316, 331,
+ 332, 347, 348, 363, 364, 379, 380, 395, 396, 411, 412, 427, 428, 443, 444,
+ 459, 460, 475, 476, 491, 492, 507, 12, 12, 13, 28, 29, 44, 45, 60,
+ 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141, 156, 157, 172, 173,
+ 188, 189, 204, 205, 220, 221, 236, 237, 252, 253, 268, 269, 284, 285, 300,
+ 301, 316, 317, 332, 333, 348, 349, 364, 365, 380, 381, 396, 397, 412, 413,
+ 428, 429, 444, 445, 460, 461, 476, 477, 492, 493, 508, 13, 13, 14, 29,
+ 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126, 141, 142,
+ 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253, 254, 269,
+ 270, 285, 286, 301, 302, 317, 318, 333, 334, 349, 350, 365, 366, 381, 382,
+ 397, 398, 413, 414, 429, 430, 445, 446, 461, 462, 477, 478, 493, 494, 509,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 255, 270, 271, 286, 287, 302, 303, 318, 319, 334, 335, 350, 351,
+ 366, 367, 382, 383, 398, 399, 414, 415, 430, 431, 446, 447, 462, 463, 478,
+ 479, 494, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192,
+ 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416, 416,
+ 448, 448, 0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161,
+ 192, 193, 224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+ 417, 448, 449, 480, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130,
+ 161, 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 2, 2, 3, 34, 35, 66, 67, 98, 99,
+ 130, 131, 162, 163, 194, 195, 226, 227, 258, 259, 290, 291, 322, 323, 354,
+ 355, 386, 387, 418, 419, 450, 451, 482, 3, 3, 4, 35, 36, 67, 68,
+ 99, 100, 131, 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323,
+ 324, 355, 356, 387, 388, 419, 420, 451, 452, 483, 4, 4, 5, 36, 37,
+ 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228, 229, 260, 261, 292,
+ 293, 324, 325, 356, 357, 388, 389, 420, 421, 452, 453, 484, 5, 5, 6,
+ 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198, 229, 230, 261,
+ 262, 293, 294, 325, 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 6,
+ 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199, 230,
+ 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422, 423, 454, 455,
+ 486, 7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199,
+ 200, 231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423, 424,
+ 455, 456, 487, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393,
+ 424, 425, 456, 457, 488, 9, 9, 10, 41, 42, 73, 74, 105, 106, 137,
+ 138, 169, 170, 201, 202, 233, 234, 265, 266, 297, 298, 329, 330, 361, 362,
+ 393, 394, 425, 426, 457, 458, 489, 10, 10, 11, 42, 43, 74, 75, 106,
+ 107, 138, 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331,
+ 362, 363, 394, 395, 426, 427, 458, 459, 490, 11, 11, 12, 43, 44, 75,
+ 76, 107, 108, 139, 140, 171, 172, 203, 204, 235, 236, 267, 268, 299, 300,
+ 331, 332, 363, 364, 395, 396, 427, 428, 459, 460, 491, 12, 12, 13, 44,
+ 45, 76, 77, 108, 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269,
+ 300, 301, 332, 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 13, 13,
+ 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205, 206, 237, 238,
+ 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429, 430, 461, 462, 493,
+ 14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206, 207,
+ 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399, 430, 431, 462,
+ 463, 494, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176,
+ 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400, 431,
+ 432, 463, 464, 495, 16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145,
+ 176, 177, 208, 209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400,
+ 401, 432, 433, 464, 465, 496, 17, 17, 18, 49, 50, 81, 82, 113, 114,
+ 145, 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 18, 18, 19, 50, 51, 82, 83,
+ 114, 115, 146, 147, 178, 179, 210, 211, 242, 243, 274, 275, 306, 307, 338,
+ 339, 370, 371, 402, 403, 434, 435, 466, 467, 498, 19, 19, 20, 51, 52,
+ 83, 84, 115, 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307,
+ 308, 339, 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 20, 20, 21,
+ 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212, 213, 244, 245, 276,
+ 277, 308, 309, 340, 341, 372, 373, 404, 405, 436, 437, 468, 469, 500, 21,
+ 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213, 214, 245,
+ 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470,
+ 501, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+ 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407, 438, 439,
+ 470, 471, 502, 23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183,
+ 184, 215, 216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+ 439, 440, 471, 472, 503, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377,
+ 408, 409, 440, 441, 472, 473, 504, 25, 25, 26, 57, 58, 89, 90, 121,
+ 122, 153, 154, 185, 186, 217, 218, 249, 250, 281, 282, 313, 314, 345, 346,
+ 377, 378, 409, 410, 441, 442, 473, 474, 505, 26, 26, 27, 58, 59, 90,
+ 91, 122, 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315,
+ 346, 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 27, 27, 28, 59,
+ 60, 91, 92, 123, 124, 155, 156, 187, 188, 219, 220, 251, 252, 283, 284,
+ 315, 316, 347, 348, 379, 380, 411, 412, 443, 444, 475, 476, 507, 28, 28,
+ 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253,
+ 284, 285, 316, 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508,
+ 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221, 222,
+ 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414, 445, 446, 477,
+ 478, 509, 30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191,
+ 222, 223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415, 446,
+ 447, 478, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x32_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 240, 240, 241, 256, 242, 257, 243, 258, 244, 259, 245, 260, 246,
+ 261, 247, 262, 248, 263, 249, 264, 250, 265, 251, 266, 252, 267, 253, 268,
+ 254, 269, 255, 270, 256, 256, 257, 272, 258, 273, 259, 274, 260, 275, 261,
+ 276, 262, 277, 263, 278, 264, 279, 265, 280, 266, 281, 267, 282, 268, 283,
+ 269, 284, 270, 285, 271, 286, 272, 272, 273, 288, 274, 289, 275, 290, 276,
+ 291, 277, 292, 278, 293, 279, 294, 280, 295, 281, 296, 282, 297, 283, 298,
+ 284, 299, 285, 300, 286, 301, 287, 302, 288, 288, 289, 304, 290, 305, 291,
+ 306, 292, 307, 293, 308, 294, 309, 295, 310, 296, 311, 297, 312, 298, 313,
+ 299, 314, 300, 315, 301, 316, 302, 317, 303, 318, 304, 304, 305, 320, 306,
+ 321, 307, 322, 308, 323, 309, 324, 310, 325, 311, 326, 312, 327, 313, 328,
+ 314, 329, 315, 330, 316, 331, 317, 332, 318, 333, 319, 334, 320, 320, 321,
+ 336, 322, 337, 323, 338, 324, 339, 325, 340, 326, 341, 327, 342, 328, 343,
+ 329, 344, 330, 345, 331, 346, 332, 347, 333, 348, 334, 349, 335, 350, 336,
+ 336, 337, 352, 338, 353, 339, 354, 340, 355, 341, 356, 342, 357, 343, 358,
+ 344, 359, 345, 360, 346, 361, 347, 362, 348, 363, 349, 364, 350, 365, 351,
+ 366, 352, 352, 353, 368, 354, 369, 355, 370, 356, 371, 357, 372, 358, 373,
+ 359, 374, 360, 375, 361, 376, 362, 377, 363, 378, 364, 379, 365, 380, 366,
+ 381, 367, 382, 368, 368, 369, 384, 370, 385, 371, 386, 372, 387, 373, 388,
+ 374, 389, 375, 390, 376, 391, 377, 392, 378, 393, 379, 394, 380, 395, 381,
+ 396, 382, 397, 383, 398, 384, 384, 385, 400, 386, 401, 387, 402, 388, 403,
+ 389, 404, 390, 405, 391, 406, 392, 407, 393, 408, 394, 409, 395, 410, 396,
+ 411, 397, 412, 398, 413, 399, 414, 400, 400, 401, 416, 402, 417, 403, 418,
+ 404, 419, 405, 420, 406, 421, 407, 422, 408, 423, 409, 424, 410, 425, 411,
+ 426, 412, 427, 413, 428, 414, 429, 415, 430, 416, 416, 417, 432, 418, 433,
+ 419, 434, 420, 435, 421, 436, 422, 437, 423, 438, 424, 439, 425, 440, 426,
+ 441, 427, 442, 428, 443, 429, 444, 430, 445, 431, 446, 432, 432, 433, 448,
+ 434, 449, 435, 450, 436, 451, 437, 452, 438, 453, 439, 454, 440, 455, 441,
+ 456, 442, 457, 443, 458, 444, 459, 445, 460, 446, 461, 447, 462, 448, 448,
+ 449, 464, 450, 465, 451, 466, 452, 467, 453, 468, 454, 469, 455, 470, 456,
+ 471, 457, 472, 458, 473, 459, 474, 460, 475, 461, 476, 462, 477, 463, 478,
+ 464, 464, 465, 480, 466, 481, 467, 482, 468, 483, 469, 484, 470, 485, 471,
+ 486, 472, 487, 473, 488, 474, 489, 475, 490, 476, 491, 477, 492, 478, 493,
+ 479, 494, 480, 480, 481, 496, 482, 497, 483, 498, 484, 499, 485, 500, 486,
+ 501, 487, 502, 488, 503, 489, 504, 490, 505, 491, 506, 492, 507, 493, 508,
+ 494, 509, 495, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x16_neighbors[513 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21,
+ 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28,
+ 29, 29, 30, 30, 0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5,
+ 36, 6, 37, 7, 38, 8, 39, 9, 40, 10, 41, 11, 42, 12, 43,
+ 13, 44, 14, 45, 15, 46, 16, 47, 17, 48, 18, 49, 19, 50, 20,
+ 51, 21, 52, 22, 53, 23, 54, 24, 55, 25, 56, 26, 57, 27, 58,
+ 28, 59, 29, 60, 30, 61, 31, 62, 32, 32, 33, 64, 34, 65, 35,
+ 66, 36, 67, 37, 68, 38, 69, 39, 70, 40, 71, 41, 72, 42, 73,
+ 43, 74, 44, 75, 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50,
+ 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88,
+ 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65,
+ 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110, 80,
+ 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117, 87, 118,
+ 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124, 94, 125, 95,
+ 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102, 133,
+ 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108, 139, 109, 140, 110,
+ 141, 111, 142, 112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+ 118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154, 124, 155, 125,
+ 156, 126, 157, 127, 158, 128, 128, 129, 160, 130, 161, 131, 162, 132, 163,
+ 133, 164, 134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170, 140,
+ 171, 141, 172, 142, 173, 143, 174, 144, 175, 145, 176, 146, 177, 147, 178,
+ 148, 179, 149, 180, 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155,
+ 186, 156, 187, 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193,
+ 163, 194, 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170,
+ 201, 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215, 185,
+ 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222, 192, 192,
+ 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229, 199, 230, 200,
+ 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236, 206, 237, 207, 238,
+ 208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244, 214, 245, 215,
+ 246, 216, 247, 217, 248, 218, 249, 219, 250, 220, 251, 221, 252, 222, 253,
+ 223, 254, 224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260, 230,
+ 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266, 236, 267, 237, 268,
+ 238, 269, 239, 270, 240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245,
+ 276, 246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282, 252, 283,
+ 253, 284, 254, 285, 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260,
+ 291, 261, 292, 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+ 268, 299, 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275,
+ 306, 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320, 290,
+ 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327, 297, 328,
+ 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334, 304, 335, 305,
+ 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341, 311, 342, 312, 343,
+ 313, 344, 314, 345, 315, 346, 316, 347, 317, 348, 318, 349, 319, 350, 320,
+ 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356, 326, 357, 327, 358,
+ 328, 359, 329, 360, 330, 361, 331, 362, 332, 363, 333, 364, 334, 365, 335,
+ 366, 336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372, 342, 373,
+ 343, 374, 344, 375, 345, 376, 346, 377, 347, 378, 348, 379, 349, 380, 350,
+ 381, 351, 382, 352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+ 358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365,
+ 396, 366, 397, 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403,
+ 373, 404, 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380,
+ 411, 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395,
+ 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432, 402, 433,
+ 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439, 409, 440, 410,
+ 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446, 416, 416, 417, 448,
+ 418, 449, 419, 450, 420, 451, 421, 452, 422, 453, 423, 454, 424, 455, 425,
+ 456, 426, 457, 427, 458, 428, 459, 429, 460, 430, 461, 431, 462, 432, 463,
+ 433, 464, 434, 465, 435, 466, 436, 467, 437, 468, 438, 469, 439, 470, 440,
+ 471, 441, 472, 442, 473, 443, 474, 444, 475, 445, 476, 446, 477, 447, 478,
+ 448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484, 454, 485, 455,
+ 486, 456, 487, 457, 488, 458, 489, 459, 490, 460, 491, 461, 492, 462, 493,
+ 463, 494, 464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470,
+ 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508,
+ 478, 509, 479, 510, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96,
+ 96, 112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208, 208,
+ 224, 224, 0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81,
+ 96, 97, 112, 113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208,
+ 209, 224, 225, 240, 1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66,
+ 81, 82, 97, 98, 113, 114, 129, 130, 145, 146, 161, 162, 177, 178, 193,
+ 194, 209, 210, 225, 226, 241, 2, 2, 3, 18, 19, 34, 35, 50, 51,
+ 66, 67, 82, 83, 98, 99, 114, 115, 130, 131, 146, 147, 162, 163, 178,
+ 179, 194, 195, 210, 211, 226, 227, 242, 3, 3, 4, 19, 20, 35, 36,
+ 51, 52, 67, 68, 83, 84, 99, 100, 115, 116, 131, 132, 147, 148, 163,
+ 164, 179, 180, 195, 196, 211, 212, 227, 228, 243, 4, 4, 5, 20, 21,
+ 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116, 117, 132, 133, 148,
+ 149, 164, 165, 180, 181, 196, 197, 212, 213, 228, 229, 244, 5, 5, 6,
+ 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117, 118, 133,
+ 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214, 229, 230, 245, 6,
+ 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+ 119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215, 230, 231,
+ 246, 7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103,
+ 104, 119, 120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+ 231, 232, 247, 8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88,
+ 89, 104, 105, 120, 121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201,
+ 216, 217, 232, 233, 248, 9, 9, 10, 25, 26, 41, 42, 57, 58, 73,
+ 74, 89, 90, 105, 106, 121, 122, 137, 138, 153, 154, 169, 170, 185, 186,
+ 201, 202, 217, 218, 233, 234, 249, 10, 10, 11, 26, 27, 42, 43, 58,
+ 59, 74, 75, 90, 91, 106, 107, 122, 123, 138, 139, 154, 155, 170, 171,
+ 186, 187, 202, 203, 218, 219, 234, 235, 250, 11, 11, 12, 27, 28, 43,
+ 44, 59, 60, 75, 76, 91, 92, 107, 108, 123, 124, 139, 140, 155, 156,
+ 171, 172, 187, 188, 203, 204, 219, 220, 235, 236, 251, 12, 12, 13, 28,
+ 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124, 125, 140, 141,
+ 156, 157, 172, 173, 188, 189, 204, 205, 220, 221, 236, 237, 252, 13, 13,
+ 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125, 126,
+ 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222, 237, 238, 253,
+ 14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111,
+ 126, 127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223, 238,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
+ 14, 14, 0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6,
+ 21, 7, 22, 8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28,
+ 14, 29, 15, 30, 16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21,
+ 36, 22, 37, 23, 38, 24, 39, 25, 40, 26, 41, 27, 42, 28, 43,
+ 29, 44, 30, 45, 31, 46, 32, 32, 33, 48, 34, 49, 35, 50, 36,
+ 51, 37, 52, 38, 53, 39, 54, 40, 55, 41, 56, 42, 57, 43, 58,
+ 44, 59, 45, 60, 46, 61, 47, 62, 48, 48, 49, 64, 50, 65, 51,
+ 66, 52, 67, 53, 68, 54, 69, 55, 70, 56, 71, 57, 72, 58, 73,
+ 59, 74, 60, 75, 61, 76, 62, 77, 63, 78, 64, 64, 65, 80, 66,
+ 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86, 72, 87, 73, 88,
+ 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94, 80, 80, 81,
+ 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102, 88, 103,
+ 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110, 96,
+ 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117, 103, 118,
+ 104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110, 125, 111,
+ 126, 112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118, 133,
+ 119, 134, 120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+ 141, 127, 142, 128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148,
+ 134, 149, 135, 150, 136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141,
+ 156, 142, 157, 143, 158, 144, 144, 145, 160, 146, 161, 147, 162, 148, 163,
+ 149, 164, 150, 165, 151, 166, 152, 167, 153, 168, 154, 169, 155, 170, 156,
+ 171, 157, 172, 158, 173, 159, 174, 160, 160, 161, 176, 162, 177, 163, 178,
+ 164, 179, 165, 180, 166, 181, 167, 182, 168, 183, 169, 184, 170, 185, 171,
+ 186, 172, 187, 173, 188, 174, 189, 175, 190, 176, 176, 177, 192, 178, 193,
+ 179, 194, 180, 195, 181, 196, 182, 197, 183, 198, 184, 199, 185, 200, 186,
+ 201, 187, 202, 188, 203, 189, 204, 190, 205, 191, 206, 192, 192, 193, 208,
+ 194, 209, 195, 210, 196, 211, 197, 212, 198, 213, 199, 214, 200, 215, 201,
+ 216, 202, 217, 203, 218, 204, 219, 205, 220, 206, 221, 207, 222, 208, 208,
+ 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214, 229, 215, 230, 216,
+ 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222, 237, 223, 238,
+ 224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230, 245, 231,
+ 246, 232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238, 253,
+ 239, 254, 0, 0,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 2, 2, 2,
+ 17, 17, 32, 32, 32, 48, 48, 33, 48, 18, 33, 3, 18, 3, 3,
+ 4, 4, 4, 19, 19, 34, 34, 49, 49, 64, 64, 64, 80, 80, 65,
+ 80, 50, 65, 35, 50, 20, 35, 5, 20, 5, 5, 6, 6, 6, 21,
+ 21, 36, 36, 51, 51, 66, 66, 81, 81, 96, 96, 96, 112, 112, 97,
+ 112, 82, 97, 67, 82, 52, 67, 37, 52, 22, 37, 7, 22, 7, 7,
+ 8, 8, 8, 23, 23, 38, 38, 53, 53, 68, 68, 83, 83, 98, 98,
+ 113, 113, 128, 128, 128, 144, 144, 129, 144, 114, 129, 99, 114, 84, 99,
+ 69, 84, 54, 69, 39, 54, 24, 39, 9, 24, 9, 9, 10, 10, 10,
+ 25, 25, 40, 40, 55, 55, 70, 70, 85, 85, 100, 100, 115, 115, 130,
+ 130, 145, 145, 160, 160, 160, 176, 176, 161, 176, 146, 161, 131, 146, 116,
+ 131, 101, 116, 86, 101, 71, 86, 56, 71, 41, 56, 26, 41, 11, 26,
+ 11, 11, 12, 12, 12, 27, 27, 42, 42, 57, 57, 72, 72, 87, 87,
+ 102, 102, 117, 117, 132, 132, 147, 147, 162, 162, 177, 177, 192, 192, 192,
+ 208, 208, 193, 208, 178, 193, 163, 178, 148, 163, 133, 148, 118, 133, 103,
+ 118, 88, 103, 73, 88, 58, 73, 43, 58, 28, 43, 13, 28, 13, 13,
+ 14, 14, 14, 29, 29, 44, 44, 59, 59, 74, 74, 89, 89, 104, 104,
+ 119, 119, 134, 134, 149, 149, 164, 164, 179, 179, 194, 194, 209, 209, 224,
+ 224, 224, 225, 240, 210, 225, 195, 210, 180, 195, 165, 180, 150, 165, 135,
+ 150, 120, 135, 105, 120, 90, 105, 75, 90, 60, 75, 45, 60, 30, 45,
+ 15, 30, 31, 46, 46, 61, 61, 76, 76, 91, 91, 106, 106, 121, 121,
+ 136, 136, 151, 151, 166, 166, 181, 181, 196, 196, 211, 211, 226, 226, 241,
+ 227, 242, 212, 227, 197, 212, 182, 197, 167, 182, 152, 167, 137, 152, 122,
+ 137, 107, 122, 92, 107, 77, 92, 62, 77, 47, 62, 63, 78, 78, 93,
+ 93, 108, 108, 123, 123, 138, 138, 153, 153, 168, 168, 183, 183, 198, 198,
+ 213, 213, 228, 228, 243, 229, 244, 214, 229, 199, 214, 184, 199, 169, 184,
+ 154, 169, 139, 154, 124, 139, 109, 124, 94, 109, 79, 94, 95, 110, 110,
+ 125, 125, 140, 140, 155, 155, 170, 170, 185, 185, 200, 200, 215, 215, 230,
+ 230, 245, 231, 246, 216, 231, 201, 216, 186, 201, 171, 186, 156, 171, 141,
+ 156, 126, 141, 111, 126, 127, 142, 142, 157, 157, 172, 172, 187, 187, 202,
+ 202, 217, 217, 232, 232, 247, 233, 248, 218, 233, 203, 218, 188, 203, 173,
+ 188, 158, 173, 143, 158, 159, 174, 174, 189, 189, 204, 204, 219, 219, 234,
+ 234, 249, 235, 250, 220, 235, 205, 220, 190, 205, 175, 190, 191, 206, 206,
+ 221, 221, 236, 236, 251, 237, 252, 222, 237, 207, 222, 223, 238, 238, 253,
+ 239, 254, 0, 0
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160,
+ 192, 192, 224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384,
+ 416, 416, 448, 448, 480, 480, 512, 512, 544, 544, 576, 576, 608, 608,
+ 640, 640, 672, 672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832,
+ 864, 864, 896, 896, 928, 928, 960, 960, 0, 0, 1, 32, 33, 64,
+ 65, 96, 97, 128, 129, 160, 161, 192, 193, 224, 225, 256, 257, 288,
+ 289, 320, 321, 352, 353, 384, 385, 416, 417, 448, 449, 480, 481, 512,
+ 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673, 704, 705, 736,
+ 737, 768, 769, 800, 801, 832, 833, 864, 865, 896, 897, 928, 929, 960,
+ 961, 992, 1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161,
+ 162, 193, 194, 225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385,
+ 386, 417, 418, 449, 450, 481, 482, 513, 514, 545, 546, 577, 578, 609,
+ 610, 641, 642, 673, 674, 705, 706, 737, 738, 769, 770, 801, 802, 833,
+ 834, 865, 866, 897, 898, 929, 930, 961, 962, 993, 2, 2, 3, 34,
+ 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195, 226, 227, 258,
+ 259, 290, 291, 322, 323, 354, 355, 386, 387, 418, 419, 450, 451, 482,
+ 483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675, 706,
+ 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898, 899, 930,
+ 931, 962, 963, 994, 3, 3, 4, 35, 36, 67, 68, 99, 100, 131,
+ 132, 163, 164, 195, 196, 227, 228, 259, 260, 291, 292, 323, 324, 355,
+ 356, 387, 388, 419, 420, 451, 452, 483, 484, 515, 516, 547, 548, 579,
+ 580, 611, 612, 643, 644, 675, 676, 707, 708, 739, 740, 771, 772, 803,
+ 804, 835, 836, 867, 868, 899, 900, 931, 932, 963, 964, 995, 4, 4,
+ 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197, 228,
+ 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420, 421, 452,
+ 453, 484, 485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676,
+ 677, 708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+ 901, 932, 933, 964, 965, 996, 5, 5, 6, 37, 38, 69, 70, 101,
+ 102, 133, 134, 165, 166, 197, 198, 229, 230, 261, 262, 293, 294, 325,
+ 326, 357, 358, 389, 390, 421, 422, 453, 454, 485, 486, 517, 518, 549,
+ 550, 581, 582, 613, 614, 645, 646, 677, 678, 709, 710, 741, 742, 773,
+ 774, 805, 806, 837, 838, 869, 870, 901, 902, 933, 934, 965, 966, 997,
+ 6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198,
+ 199, 230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+ 423, 454, 455, 486, 487, 518, 519, 550, 551, 582, 583, 614, 615, 646,
+ 647, 678, 679, 710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870,
+ 871, 902, 903, 934, 935, 966, 967, 998, 7, 7, 8, 39, 40, 71,
+ 72, 103, 104, 135, 136, 167, 168, 199, 200, 231, 232, 263, 264, 295,
+ 296, 327, 328, 359, 360, 391, 392, 423, 424, 455, 456, 487, 488, 519,
+ 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680, 711, 712, 743,
+ 744, 775, 776, 807, 808, 839, 840, 871, 872, 903, 904, 935, 936, 967,
+ 968, 999, 8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168,
+ 169, 200, 201, 232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392,
+ 393, 424, 425, 456, 457, 488, 489, 520, 521, 552, 553, 584, 585, 616,
+ 617, 648, 649, 680, 681, 712, 713, 744, 745, 776, 777, 808, 809, 840,
+ 841, 872, 873, 904, 905, 936, 937, 968, 969, 1000, 9, 9, 10, 41,
+ 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202, 233, 234, 265,
+ 266, 297, 298, 329, 330, 361, 362, 393, 394, 425, 426, 457, 458, 489,
+ 490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682, 713,
+ 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905, 906, 937,
+ 938, 969, 970, 1001, 10, 10, 11, 42, 43, 74, 75, 106, 107, 138,
+ 139, 170, 171, 202, 203, 234, 235, 266, 267, 298, 299, 330, 331, 362,
+ 363, 394, 395, 426, 427, 458, 459, 490, 491, 522, 523, 554, 555, 586,
+ 587, 618, 619, 650, 651, 682, 683, 714, 715, 746, 747, 778, 779, 810,
+ 811, 842, 843, 874, 875, 906, 907, 938, 939, 970, 971, 1002, 11, 11,
+ 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203, 204, 235,
+ 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396, 427, 428, 459,
+ 460, 491, 492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683,
+ 684, 715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+ 908, 939, 940, 971, 972, 1003, 12, 12, 13, 44, 45, 76, 77, 108,
+ 109, 140, 141, 172, 173, 204, 205, 236, 237, 268, 269, 300, 301, 332,
+ 333, 364, 365, 396, 397, 428, 429, 460, 461, 492, 493, 524, 525, 556,
+ 557, 588, 589, 620, 621, 652, 653, 684, 685, 716, 717, 748, 749, 780,
+ 781, 812, 813, 844, 845, 876, 877, 908, 909, 940, 941, 972, 973, 1004,
+ 13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+ 206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398, 429,
+ 430, 461, 462, 493, 494, 525, 526, 557, 558, 589, 590, 621, 622, 653,
+ 654, 685, 686, 717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877,
+ 878, 909, 910, 941, 942, 973, 974, 1005, 14, 14, 15, 46, 47, 78,
+ 79, 110, 111, 142, 143, 174, 175, 206, 207, 238, 239, 270, 271, 302,
+ 303, 334, 335, 366, 367, 398, 399, 430, 431, 462, 463, 494, 495, 526,
+ 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687, 718, 719, 750,
+ 751, 782, 783, 814, 815, 846, 847, 878, 879, 910, 911, 942, 943, 974,
+ 975, 1006, 15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175,
+ 176, 207, 208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399,
+ 400, 431, 432, 463, 464, 495, 496, 527, 528, 559, 560, 591, 592, 623,
+ 624, 655, 656, 687, 688, 719, 720, 751, 752, 783, 784, 815, 816, 847,
+ 848, 879, 880, 911, 912, 943, 944, 975, 976, 1007, 16, 16, 17, 48,
+ 49, 80, 81, 112, 113, 144, 145, 176, 177, 208, 209, 240, 241, 272,
+ 273, 304, 305, 336, 337, 368, 369, 400, 401, 432, 433, 464, 465, 496,
+ 497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689, 720,
+ 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912, 913, 944,
+ 945, 976, 977, 1008, 17, 17, 18, 49, 50, 81, 82, 113, 114, 145,
+ 146, 177, 178, 209, 210, 241, 242, 273, 274, 305, 306, 337, 338, 369,
+ 370, 401, 402, 433, 434, 465, 466, 497, 498, 529, 530, 561, 562, 593,
+ 594, 625, 626, 657, 658, 689, 690, 721, 722, 753, 754, 785, 786, 817,
+ 818, 849, 850, 881, 882, 913, 914, 945, 946, 977, 978, 1009, 18, 18,
+ 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210, 211, 242,
+ 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403, 434, 435, 466,
+ 467, 498, 499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690,
+ 691, 722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+ 915, 946, 947, 978, 979, 1010, 19, 19, 20, 51, 52, 83, 84, 115,
+ 116, 147, 148, 179, 180, 211, 212, 243, 244, 275, 276, 307, 308, 339,
+ 340, 371, 372, 403, 404, 435, 436, 467, 468, 499, 500, 531, 532, 563,
+ 564, 595, 596, 627, 628, 659, 660, 691, 692, 723, 724, 755, 756, 787,
+ 788, 819, 820, 851, 852, 883, 884, 915, 916, 947, 948, 979, 980, 1011,
+ 20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+ 213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405, 436,
+ 437, 468, 469, 500, 501, 532, 533, 564, 565, 596, 597, 628, 629, 660,
+ 661, 692, 693, 724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884,
+ 885, 916, 917, 948, 949, 980, 981, 1012, 21, 21, 22, 53, 54, 85,
+ 86, 117, 118, 149, 150, 181, 182, 213, 214, 245, 246, 277, 278, 309,
+ 310, 341, 342, 373, 374, 405, 406, 437, 438, 469, 470, 501, 502, 533,
+ 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694, 725, 726, 757,
+ 758, 789, 790, 821, 822, 853, 854, 885, 886, 917, 918, 949, 950, 981,
+ 982, 1013, 22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182,
+ 183, 214, 215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406,
+ 407, 438, 439, 470, 471, 502, 503, 534, 535, 566, 567, 598, 599, 630,
+ 631, 662, 663, 694, 695, 726, 727, 758, 759, 790, 791, 822, 823, 854,
+ 855, 886, 887, 918, 919, 950, 951, 982, 983, 1014, 23, 23, 24, 55,
+ 56, 87, 88, 119, 120, 151, 152, 183, 184, 215, 216, 247, 248, 279,
+ 280, 311, 312, 343, 344, 375, 376, 407, 408, 439, 440, 471, 472, 503,
+ 504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696, 727,
+ 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919, 920, 951,
+ 952, 983, 984, 1015, 24, 24, 25, 56, 57, 88, 89, 120, 121, 152,
+ 153, 184, 185, 216, 217, 248, 249, 280, 281, 312, 313, 344, 345, 376,
+ 377, 408, 409, 440, 441, 472, 473, 504, 505, 536, 537, 568, 569, 600,
+ 601, 632, 633, 664, 665, 696, 697, 728, 729, 760, 761, 792, 793, 824,
+ 825, 856, 857, 888, 889, 920, 921, 952, 953, 984, 985, 1016, 25, 25,
+ 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217, 218, 249,
+ 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410, 441, 442, 473,
+ 474, 505, 506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697,
+ 698, 729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+ 922, 953, 954, 985, 986, 1017, 26, 26, 27, 58, 59, 90, 91, 122,
+ 123, 154, 155, 186, 187, 218, 219, 250, 251, 282, 283, 314, 315, 346,
+ 347, 378, 379, 410, 411, 442, 443, 474, 475, 506, 507, 538, 539, 570,
+ 571, 602, 603, 634, 635, 666, 667, 698, 699, 730, 731, 762, 763, 794,
+ 795, 826, 827, 858, 859, 890, 891, 922, 923, 954, 955, 986, 987, 1018,
+ 27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+ 220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412, 443,
+ 444, 475, 476, 507, 508, 539, 540, 571, 572, 603, 604, 635, 636, 667,
+ 668, 699, 700, 731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891,
+ 892, 923, 924, 955, 956, 987, 988, 1019, 28, 28, 29, 60, 61, 92,
+ 93, 124, 125, 156, 157, 188, 189, 220, 221, 252, 253, 284, 285, 316,
+ 317, 348, 349, 380, 381, 412, 413, 444, 445, 476, 477, 508, 509, 540,
+ 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701, 732, 733, 764,
+ 765, 796, 797, 828, 829, 860, 861, 892, 893, 924, 925, 956, 957, 988,
+ 989, 1020, 29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189,
+ 190, 221, 222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413,
+ 414, 445, 446, 477, 478, 509, 510, 541, 542, 573, 574, 605, 606, 637,
+ 638, 669, 670, 701, 702, 733, 734, 765, 766, 797, 798, 829, 830, 861,
+ 862, 893, 894, 925, 926, 957, 958, 989, 990, 1021, 30, 30, 31, 62,
+ 63, 94, 95, 126, 127, 158, 159, 190, 191, 222, 223, 254, 255, 286,
+ 287, 318, 319, 350, 351, 382, 383, 414, 415, 446, 447, 478, 479, 510,
+ 511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703, 734,
+ 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926, 927, 958,
+ 959, 990, 991, 1022, 0, 0,
+};
+/* Neighbor table for the 32x32 row scan, stored as a flat int16_t array.
+ * Read two values at a time (assumes MAX_NEIGHBORS == 2 -- TODO confirm),
+ * the pairs visit positions in raster order: for a position in the first
+ * row both values are the index to its left, for a position in the first
+ * column both values are the index directly above, and elsewhere the pair
+ * is (index above, index to the left). Position 0 and the final extra
+ * pair (the 1025th) are both "0, 0".
+ */
+DECLARE_ALIGNED(16, static const int16_t,
+ mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12,
+ 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19,
+ 20, 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26,
+ 27, 27, 28, 28, 29, 29, 30, 30, 0, 0, 1, 32, 2, 33,
+ 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9, 40,
+ 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46, 16, 47,
+ 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+ 24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61,
+ 31, 62, 32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68,
+ 38, 69, 39, 70, 40, 71, 41, 72, 42, 73, 43, 74, 44, 75,
+ 45, 76, 46, 77, 47, 78, 48, 79, 49, 80, 50, 81, 51, 82,
+ 52, 83, 53, 84, 54, 85, 55, 86, 56, 87, 57, 88, 58, 89,
+ 59, 90, 60, 91, 61, 92, 62, 93, 63, 94, 64, 64, 65, 96,
+ 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71, 102, 72, 103,
+ 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78, 109, 79, 110,
+ 80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+ 87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+ 94, 125, 95, 126, 96, 96, 97, 128, 98, 129, 99, 130, 100, 131,
+ 101, 132, 102, 133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138,
+ 108, 139, 109, 140, 110, 141, 111, 142, 112, 143, 113, 144, 114, 145,
+ 115, 146, 116, 147, 117, 148, 118, 149, 119, 150, 120, 151, 121, 152,
+ 122, 153, 123, 154, 124, 155, 125, 156, 126, 157, 127, 158, 128, 128,
+ 129, 160, 130, 161, 131, 162, 132, 163, 133, 164, 134, 165, 135, 166,
+ 136, 167, 137, 168, 138, 169, 139, 170, 140, 171, 141, 172, 142, 173,
+ 143, 174, 144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+ 150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186, 156, 187,
+ 157, 188, 158, 189, 159, 190, 160, 160, 161, 192, 162, 193, 163, 194,
+ 164, 195, 165, 196, 166, 197, 167, 198, 168, 199, 169, 200, 170, 201,
+ 171, 202, 172, 203, 173, 204, 174, 205, 175, 206, 176, 207, 177, 208,
+ 178, 209, 179, 210, 180, 211, 181, 212, 182, 213, 183, 214, 184, 215,
+ 185, 216, 186, 217, 187, 218, 188, 219, 189, 220, 190, 221, 191, 222,
+ 192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228, 198, 229,
+ 199, 230, 200, 231, 201, 232, 202, 233, 203, 234, 204, 235, 205, 236,
+ 206, 237, 207, 238, 208, 239, 209, 240, 210, 241, 211, 242, 212, 243,
+ 213, 244, 214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+ 220, 251, 221, 252, 222, 253, 223, 254, 224, 224, 225, 256, 226, 257,
+ 227, 258, 228, 259, 229, 260, 230, 261, 231, 262, 232, 263, 233, 264,
+ 234, 265, 235, 266, 236, 267, 237, 268, 238, 269, 239, 270, 240, 271,
+ 241, 272, 242, 273, 243, 274, 244, 275, 245, 276, 246, 277, 247, 278,
+ 248, 279, 249, 280, 250, 281, 251, 282, 252, 283, 253, 284, 254, 285,
+ 255, 286, 256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+ 262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298, 268, 299,
+ 269, 300, 270, 301, 271, 302, 272, 303, 273, 304, 274, 305, 275, 306,
+ 276, 307, 277, 308, 278, 309, 279, 310, 280, 311, 281, 312, 282, 313,
+ 283, 314, 284, 315, 285, 316, 286, 317, 287, 318, 288, 288, 289, 320,
+ 290, 321, 291, 322, 292, 323, 293, 324, 294, 325, 295, 326, 296, 327,
+ 297, 328, 298, 329, 299, 330, 300, 331, 301, 332, 302, 333, 303, 334,
+ 304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340, 310, 341,
+ 311, 342, 312, 343, 313, 344, 314, 345, 315, 346, 316, 347, 317, 348,
+ 318, 349, 319, 350, 320, 320, 321, 352, 322, 353, 323, 354, 324, 355,
+ 325, 356, 326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+ 332, 363, 333, 364, 334, 365, 335, 366, 336, 367, 337, 368, 338, 369,
+ 339, 370, 340, 371, 341, 372, 342, 373, 343, 374, 344, 375, 345, 376,
+ 346, 377, 347, 378, 348, 379, 349, 380, 350, 381, 351, 382, 352, 352,
+ 353, 384, 354, 385, 355, 386, 356, 387, 357, 388, 358, 389, 359, 390,
+ 360, 391, 361, 392, 362, 393, 363, 394, 364, 395, 365, 396, 366, 397,
+ 367, 398, 368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+ 374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410, 380, 411,
+ 381, 412, 382, 413, 383, 414, 384, 384, 385, 416, 386, 417, 387, 418,
+ 388, 419, 389, 420, 390, 421, 391, 422, 392, 423, 393, 424, 394, 425,
+ 395, 426, 396, 427, 397, 428, 398, 429, 399, 430, 400, 431, 401, 432,
+ 402, 433, 403, 434, 404, 435, 405, 436, 406, 437, 407, 438, 408, 439,
+ 409, 440, 410, 441, 411, 442, 412, 443, 413, 444, 414, 445, 415, 446,
+ 416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452, 422, 453,
+ 423, 454, 424, 455, 425, 456, 426, 457, 427, 458, 428, 459, 429, 460,
+ 430, 461, 431, 462, 432, 463, 433, 464, 434, 465, 435, 466, 436, 467,
+ 437, 468, 438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+ 444, 475, 445, 476, 446, 477, 447, 478, 448, 448, 449, 480, 450, 481,
+ 451, 482, 452, 483, 453, 484, 454, 485, 455, 486, 456, 487, 457, 488,
+ 458, 489, 459, 490, 460, 491, 461, 492, 462, 493, 463, 494, 464, 495,
+ 465, 496, 466, 497, 467, 498, 468, 499, 469, 500, 470, 501, 471, 502,
+ 472, 503, 473, 504, 474, 505, 475, 506, 476, 507, 477, 508, 478, 509,
+ 479, 510, 480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+ 486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522, 492, 523,
+ 493, 524, 494, 525, 495, 526, 496, 527, 497, 528, 498, 529, 499, 530,
+ 500, 531, 501, 532, 502, 533, 503, 534, 504, 535, 505, 536, 506, 537,
+ 507, 538, 508, 539, 509, 540, 510, 541, 511, 542, 512, 512, 513, 544,
+ 514, 545, 515, 546, 516, 547, 517, 548, 518, 549, 519, 550, 520, 551,
+ 521, 552, 522, 553, 523, 554, 524, 555, 525, 556, 526, 557, 527, 558,
+ 528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564, 534, 565,
+ 535, 566, 536, 567, 537, 568, 538, 569, 539, 570, 540, 571, 541, 572,
+ 542, 573, 543, 574, 544, 544, 545, 576, 546, 577, 547, 578, 548, 579,
+ 549, 580, 550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+ 556, 587, 557, 588, 558, 589, 559, 590, 560, 591, 561, 592, 562, 593,
+ 563, 594, 564, 595, 565, 596, 566, 597, 567, 598, 568, 599, 569, 600,
+ 570, 601, 571, 602, 572, 603, 573, 604, 574, 605, 575, 606, 576, 576,
+ 577, 608, 578, 609, 579, 610, 580, 611, 581, 612, 582, 613, 583, 614,
+ 584, 615, 585, 616, 586, 617, 587, 618, 588, 619, 589, 620, 590, 621,
+ 591, 622, 592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+ 598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634, 604, 635,
+ 605, 636, 606, 637, 607, 638, 608, 608, 609, 640, 610, 641, 611, 642,
+ 612, 643, 613, 644, 614, 645, 615, 646, 616, 647, 617, 648, 618, 649,
+ 619, 650, 620, 651, 621, 652, 622, 653, 623, 654, 624, 655, 625, 656,
+ 626, 657, 627, 658, 628, 659, 629, 660, 630, 661, 631, 662, 632, 663,
+ 633, 664, 634, 665, 635, 666, 636, 667, 637, 668, 638, 669, 639, 670,
+ 640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676, 646, 677,
+ 647, 678, 648, 679, 649, 680, 650, 681, 651, 682, 652, 683, 653, 684,
+ 654, 685, 655, 686, 656, 687, 657, 688, 658, 689, 659, 690, 660, 691,
+ 661, 692, 662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+ 668, 699, 669, 700, 670, 701, 671, 702, 672, 672, 673, 704, 674, 705,
+ 675, 706, 676, 707, 677, 708, 678, 709, 679, 710, 680, 711, 681, 712,
+ 682, 713, 683, 714, 684, 715, 685, 716, 686, 717, 687, 718, 688, 719,
+ 689, 720, 690, 721, 691, 722, 692, 723, 693, 724, 694, 725, 695, 726,
+ 696, 727, 697, 728, 698, 729, 699, 730, 700, 731, 701, 732, 702, 733,
+ 703, 734, 704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+ 710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746, 716, 747,
+ 717, 748, 718, 749, 719, 750, 720, 751, 721, 752, 722, 753, 723, 754,
+ 724, 755, 725, 756, 726, 757, 727, 758, 728, 759, 729, 760, 730, 761,
+ 731, 762, 732, 763, 733, 764, 734, 765, 735, 766, 736, 736, 737, 768,
+ 738, 769, 739, 770, 740, 771, 741, 772, 742, 773, 743, 774, 744, 775,
+ 745, 776, 746, 777, 747, 778, 748, 779, 749, 780, 750, 781, 751, 782,
+ 752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788, 758, 789,
+ 759, 790, 760, 791, 761, 792, 762, 793, 763, 794, 764, 795, 765, 796,
+ 766, 797, 767, 798, 768, 768, 769, 800, 770, 801, 771, 802, 772, 803,
+ 773, 804, 774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+ 780, 811, 781, 812, 782, 813, 783, 814, 784, 815, 785, 816, 786, 817,
+ 787, 818, 788, 819, 789, 820, 790, 821, 791, 822, 792, 823, 793, 824,
+ 794, 825, 795, 826, 796, 827, 797, 828, 798, 829, 799, 830, 800, 800,
+ 801, 832, 802, 833, 803, 834, 804, 835, 805, 836, 806, 837, 807, 838,
+ 808, 839, 809, 840, 810, 841, 811, 842, 812, 843, 813, 844, 814, 845,
+ 815, 846, 816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+ 822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858, 828, 859,
+ 829, 860, 830, 861, 831, 862, 832, 832, 833, 864, 834, 865, 835, 866,
+ 836, 867, 837, 868, 838, 869, 839, 870, 840, 871, 841, 872, 842, 873,
+ 843, 874, 844, 875, 845, 876, 846, 877, 847, 878, 848, 879, 849, 880,
+ 850, 881, 851, 882, 852, 883, 853, 884, 854, 885, 855, 886, 856, 887,
+ 857, 888, 858, 889, 859, 890, 860, 891, 861, 892, 862, 893, 863, 894,
+ 864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900, 870, 901,
+ 871, 902, 872, 903, 873, 904, 874, 905, 875, 906, 876, 907, 877, 908,
+ 878, 909, 879, 910, 880, 911, 881, 912, 882, 913, 883, 914, 884, 915,
+ 885, 916, 886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+ 892, 923, 893, 924, 894, 925, 895, 926, 896, 896, 897, 928, 898, 929,
+ 899, 930, 900, 931, 901, 932, 902, 933, 903, 934, 904, 935, 905, 936,
+ 906, 937, 907, 938, 908, 939, 909, 940, 910, 941, 911, 942, 912, 943,
+ 913, 944, 914, 945, 915, 946, 916, 947, 917, 948, 918, 949, 919, 950,
+ 920, 951, 921, 952, 922, 953, 923, 954, 924, 955, 925, 956, 926, 957,
+ 927, 958, 928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+ 934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970, 940, 971,
+ 941, 972, 942, 973, 943, 974, 944, 975, 945, 976, 946, 977, 947, 978,
+ 948, 979, 949, 980, 950, 981, 951, 982, 952, 983, 953, 984, 954, 985,
+ 955, 986, 956, 987, 957, 988, 958, 989, 959, 990, 960, 960, 961, 992,
+ 962, 993, 963, 994, 964, 995, 965, 996, 966, 997, 967, 998, 968, 999,
+ 969, 1000, 970, 1001, 971, 1002, 972, 1003, 973, 1004, 974, 1005, 975, 1006,
+ 976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981, 1012, 982, 1013,
+ 983, 1014, 984, 1015, 985, 1016, 986, 1017, 987, 1018, 988, 1019, 989, 1020,
+ 990, 1021, 991, 1022, 0, 0,
+};
+/* Neighbor table for the default 32x32 scan, stored as a flat int16_t
+ * array of 1025 * MAX_NEIGHBORS values. Presumably one group of
+ * MAX_NEIGHBORS raster indices per scan position, listed in scan order,
+ * plus one trailing "0, 0" terminator group -- verify against the code
+ * that indexes this table. NOTE(review): unlike the sibling tables, the
+ * initializer's last element has no trailing comma; harmless in C.
+ */
+DECLARE_ALIGNED(16, static const int16_t,
+ default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+ 0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 2, 2,
+ 2, 33, 33, 64, 64, 64, 96, 96, 65, 96, 34, 65, 3, 34,
+ 3, 3, 4, 4, 4, 35, 35, 66, 66, 97, 97, 128, 128, 128,
+ 160, 160, 129, 160, 98, 129, 67, 98, 36, 67, 5, 36, 5, 5,
+ 6, 6, 6, 37, 37, 68, 68, 99, 99, 130, 130, 161, 161, 192,
+ 192, 192, 224, 224, 193, 224, 162, 193, 131, 162, 100, 131, 69, 100,
+ 38, 69, 7, 38, 7, 7, 8, 8, 8, 39, 39, 70, 70, 101,
+ 101, 132, 132, 163, 163, 194, 194, 225, 225, 256, 256, 256, 288, 288,
+ 257, 288, 226, 257, 195, 226, 164, 195, 133, 164, 102, 133, 71, 102,
+ 40, 71, 9, 40, 9, 9, 10, 10, 10, 41, 41, 72, 72, 103,
+ 103, 134, 134, 165, 165, 196, 196, 227, 227, 258, 258, 289, 289, 320,
+ 320, 320, 352, 352, 321, 352, 290, 321, 259, 290, 228, 259, 197, 228,
+ 166, 197, 135, 166, 104, 135, 73, 104, 42, 73, 11, 42, 11, 11,
+ 12, 12, 12, 43, 43, 74, 74, 105, 105, 136, 136, 167, 167, 198,
+ 198, 229, 229, 260, 260, 291, 291, 322, 322, 353, 353, 384, 384, 384,
+ 416, 416, 385, 416, 354, 385, 323, 354, 292, 323, 261, 292, 230, 261,
+ 199, 230, 168, 199, 137, 168, 106, 137, 75, 106, 44, 75, 13, 44,
+ 13, 13, 14, 14, 14, 45, 45, 76, 76, 107, 107, 138, 138, 169,
+ 169, 200, 200, 231, 231, 262, 262, 293, 293, 324, 324, 355, 355, 386,
+ 386, 417, 417, 448, 448, 448, 480, 480, 449, 480, 418, 449, 387, 418,
+ 356, 387, 325, 356, 294, 325, 263, 294, 232, 263, 201, 232, 170, 201,
+ 139, 170, 108, 139, 77, 108, 46, 77, 15, 46, 15, 15, 16, 16,
+ 16, 47, 47, 78, 78, 109, 109, 140, 140, 171, 171, 202, 202, 233,
+ 233, 264, 264, 295, 295, 326, 326, 357, 357, 388, 388, 419, 419, 450,
+ 450, 481, 481, 512, 512, 512, 544, 544, 513, 544, 482, 513, 451, 482,
+ 420, 451, 389, 420, 358, 389, 327, 358, 296, 327, 265, 296, 234, 265,
+ 203, 234, 172, 203, 141, 172, 110, 141, 79, 110, 48, 79, 17, 48,
+ 17, 17, 18, 18, 18, 49, 49, 80, 80, 111, 111, 142, 142, 173,
+ 173, 204, 204, 235, 235, 266, 266, 297, 297, 328, 328, 359, 359, 390,
+ 390, 421, 421, 452, 452, 483, 483, 514, 514, 545, 545, 576, 576, 576,
+ 608, 608, 577, 608, 546, 577, 515, 546, 484, 515, 453, 484, 422, 453,
+ 391, 422, 360, 391, 329, 360, 298, 329, 267, 298, 236, 267, 205, 236,
+ 174, 205, 143, 174, 112, 143, 81, 112, 50, 81, 19, 50, 19, 19,
+ 20, 20, 20, 51, 51, 82, 82, 113, 113, 144, 144, 175, 175, 206,
+ 206, 237, 237, 268, 268, 299, 299, 330, 330, 361, 361, 392, 392, 423,
+ 423, 454, 454, 485, 485, 516, 516, 547, 547, 578, 578, 609, 609, 640,
+ 640, 640, 672, 672, 641, 672, 610, 641, 579, 610, 548, 579, 517, 548,
+ 486, 517, 455, 486, 424, 455, 393, 424, 362, 393, 331, 362, 300, 331,
+ 269, 300, 238, 269, 207, 238, 176, 207, 145, 176, 114, 145, 83, 114,
+ 52, 83, 21, 52, 21, 21, 22, 22, 22, 53, 53, 84, 84, 115,
+ 115, 146, 146, 177, 177, 208, 208, 239, 239, 270, 270, 301, 301, 332,
+ 332, 363, 363, 394, 394, 425, 425, 456, 456, 487, 487, 518, 518, 549,
+ 549, 580, 580, 611, 611, 642, 642, 673, 673, 704, 704, 704, 736, 736,
+ 705, 736, 674, 705, 643, 674, 612, 643, 581, 612, 550, 581, 519, 550,
+ 488, 519, 457, 488, 426, 457, 395, 426, 364, 395, 333, 364, 302, 333,
+ 271, 302, 240, 271, 209, 240, 178, 209, 147, 178, 116, 147, 85, 116,
+ 54, 85, 23, 54, 23, 23, 24, 24, 24, 55, 55, 86, 86, 117,
+ 117, 148, 148, 179, 179, 210, 210, 241, 241, 272, 272, 303, 303, 334,
+ 334, 365, 365, 396, 396, 427, 427, 458, 458, 489, 489, 520, 520, 551,
+ 551, 582, 582, 613, 613, 644, 644, 675, 675, 706, 706, 737, 737, 768,
+ 768, 768, 800, 800, 769, 800, 738, 769, 707, 738, 676, 707, 645, 676,
+ 614, 645, 583, 614, 552, 583, 521, 552, 490, 521, 459, 490, 428, 459,
+ 397, 428, 366, 397, 335, 366, 304, 335, 273, 304, 242, 273, 211, 242,
+ 180, 211, 149, 180, 118, 149, 87, 118, 56, 87, 25, 56, 25, 25,
+ 26, 26, 26, 57, 57, 88, 88, 119, 119, 150, 150, 181, 181, 212,
+ 212, 243, 243, 274, 274, 305, 305, 336, 336, 367, 367, 398, 398, 429,
+ 429, 460, 460, 491, 491, 522, 522, 553, 553, 584, 584, 615, 615, 646,
+ 646, 677, 677, 708, 708, 739, 739, 770, 770, 801, 801, 832, 832, 832,
+ 864, 864, 833, 864, 802, 833, 771, 802, 740, 771, 709, 740, 678, 709,
+ 647, 678, 616, 647, 585, 616, 554, 585, 523, 554, 492, 523, 461, 492,
+ 430, 461, 399, 430, 368, 399, 337, 368, 306, 337, 275, 306, 244, 275,
+ 213, 244, 182, 213, 151, 182, 120, 151, 89, 120, 58, 89, 27, 58,
+ 27, 27, 28, 28, 28, 59, 59, 90, 90, 121, 121, 152, 152, 183,
+ 183, 214, 214, 245, 245, 276, 276, 307, 307, 338, 338, 369, 369, 400,
+ 400, 431, 431, 462, 462, 493, 493, 524, 524, 555, 555, 586, 586, 617,
+ 617, 648, 648, 679, 679, 710, 710, 741, 741, 772, 772, 803, 803, 834,
+ 834, 865, 865, 896, 896, 896, 928, 928, 897, 928, 866, 897, 835, 866,
+ 804, 835, 773, 804, 742, 773, 711, 742, 680, 711, 649, 680, 618, 649,
+ 587, 618, 556, 587, 525, 556, 494, 525, 463, 494, 432, 463, 401, 432,
+ 370, 401, 339, 370, 308, 339, 277, 308, 246, 277, 215, 246, 184, 215,
+ 153, 184, 122, 153, 91, 122, 60, 91, 29, 60, 29, 29, 30, 30,
+ 30, 61, 61, 92, 92, 123, 123, 154, 154, 185, 185, 216, 216, 247,
+ 247, 278, 278, 309, 309, 340, 340, 371, 371, 402, 402, 433, 433, 464,
+ 464, 495, 495, 526, 526, 557, 557, 588, 588, 619, 619, 650, 650, 681,
+ 681, 712, 712, 743, 743, 774, 774, 805, 805, 836, 836, 867, 867, 898,
+ 898, 929, 929, 960, 960, 960, 961, 992, 930, 961, 899, 930, 868, 899,
+ 837, 868, 806, 837, 775, 806, 744, 775, 713, 744, 682, 713, 651, 682,
+ 620, 651, 589, 620, 558, 589, 527, 558, 496, 527, 465, 496, 434, 465,
+ 403, 434, 372, 403, 341, 372, 310, 341, 279, 310, 248, 279, 217, 248,
+ 186, 217, 155, 186, 124, 155, 93, 124, 62, 93, 31, 62, 63, 94,
+ 94, 125, 125, 156, 156, 187, 187, 218, 218, 249, 249, 280, 280, 311,
+ 311, 342, 342, 373, 373, 404, 404, 435, 435, 466, 466, 497, 497, 528,
+ 528, 559, 559, 590, 590, 621, 621, 652, 652, 683, 683, 714, 714, 745,
+ 745, 776, 776, 807, 807, 838, 838, 869, 869, 900, 900, 931, 931, 962,
+ 962, 993, 963, 994, 932, 963, 901, 932, 870, 901, 839, 870, 808, 839,
+ 777, 808, 746, 777, 715, 746, 684, 715, 653, 684, 622, 653, 591, 622,
+ 560, 591, 529, 560, 498, 529, 467, 498, 436, 467, 405, 436, 374, 405,
+ 343, 374, 312, 343, 281, 312, 250, 281, 219, 250, 188, 219, 157, 188,
+ 126, 157, 95, 126, 127, 158, 158, 189, 189, 220, 220, 251, 251, 282,
+ 282, 313, 313, 344, 344, 375, 375, 406, 406, 437, 437, 468, 468, 499,
+ 499, 530, 530, 561, 561, 592, 592, 623, 623, 654, 654, 685, 685, 716,
+ 716, 747, 747, 778, 778, 809, 809, 840, 840, 871, 871, 902, 902, 933,
+ 933, 964, 964, 995, 965, 996, 934, 965, 903, 934, 872, 903, 841, 872,
+ 810, 841, 779, 810, 748, 779, 717, 748, 686, 717, 655, 686, 624, 655,
+ 593, 624, 562, 593, 531, 562, 500, 531, 469, 500, 438, 469, 407, 438,
+ 376, 407, 345, 376, 314, 345, 283, 314, 252, 283, 221, 252, 190, 221,
+ 159, 190, 191, 222, 222, 253, 253, 284, 284, 315, 315, 346, 346, 377,
+ 377, 408, 408, 439, 439, 470, 470, 501, 501, 532, 532, 563, 563, 594,
+ 594, 625, 625, 656, 656, 687, 687, 718, 718, 749, 749, 780, 780, 811,
+ 811, 842, 842, 873, 873, 904, 904, 935, 935, 966, 966, 997, 967, 998,
+ 936, 967, 905, 936, 874, 905, 843, 874, 812, 843, 781, 812, 750, 781,
+ 719, 750, 688, 719, 657, 688, 626, 657, 595, 626, 564, 595, 533, 564,
+ 502, 533, 471, 502, 440, 471, 409, 440, 378, 409, 347, 378, 316, 347,
+ 285, 316, 254, 285, 223, 254, 255, 286, 286, 317, 317, 348, 348, 379,
+ 379, 410, 410, 441, 441, 472, 472, 503, 503, 534, 534, 565, 565, 596,
+ 596, 627, 627, 658, 658, 689, 689, 720, 720, 751, 751, 782, 782, 813,
+ 813, 844, 844, 875, 875, 906, 906, 937, 937, 968, 968, 999, 969, 1000,
+ 938, 969, 907, 938, 876, 907, 845, 876, 814, 845, 783, 814, 752, 783,
+ 721, 752, 690, 721, 659, 690, 628, 659, 597, 628, 566, 597, 535, 566,
+ 504, 535, 473, 504, 442, 473, 411, 442, 380, 411, 349, 380, 318, 349,
+ 287, 318, 319, 350, 350, 381, 381, 412, 412, 443, 443, 474, 474, 505,
+ 505, 536, 536, 567, 567, 598, 598, 629, 629, 660, 660, 691, 691, 722,
+ 722, 753, 753, 784, 784, 815, 815, 846, 846, 877, 877, 908, 908, 939,
+ 939, 970, 970, 1001, 971, 1002, 940, 971, 909, 940, 878, 909, 847, 878,
+ 816, 847, 785, 816, 754, 785, 723, 754, 692, 723, 661, 692, 630, 661,
+ 599, 630, 568, 599, 537, 568, 506, 537, 475, 506, 444, 475, 413, 444,
+ 382, 413, 351, 382, 383, 414, 414, 445, 445, 476, 476, 507, 507, 538,
+ 538, 569, 569, 600, 600, 631, 631, 662, 662, 693, 693, 724, 724, 755,
+ 755, 786, 786, 817, 817, 848, 848, 879, 879, 910, 910, 941, 941, 972,
+ 972, 1003, 973, 1004, 942, 973, 911, 942, 880, 911, 849, 880, 818, 849,
+ 787, 818, 756, 787, 725, 756, 694, 725, 663, 694, 632, 663, 601, 632,
+ 570, 601, 539, 570, 508, 539, 477, 508, 446, 477, 415, 446, 447, 478,
+ 478, 509, 509, 540, 540, 571, 571, 602, 602, 633, 633, 664, 664, 695,
+ 695, 726, 726, 757, 757, 788, 788, 819, 819, 850, 850, 881, 881, 912,
+ 912, 943, 943, 974, 974, 1005, 975, 1006, 944, 975, 913, 944, 882, 913,
+ 851, 882, 820, 851, 789, 820, 758, 789, 727, 758, 696, 727, 665, 696,
+ 634, 665, 603, 634, 572, 603, 541, 572, 510, 541, 479, 510, 511, 542,
+ 542, 573, 573, 604, 604, 635, 635, 666, 666, 697, 697, 728, 728, 759,
+ 759, 790, 790, 821, 821, 852, 852, 883, 883, 914, 914, 945, 945, 976,
+ 976, 1007, 977, 1008, 946, 977, 915, 946, 884, 915, 853, 884, 822, 853,
+ 791, 822, 760, 791, 729, 760, 698, 729, 667, 698, 636, 667, 605, 636,
+ 574, 605, 543, 574, 575, 606, 606, 637, 637, 668, 668, 699, 699, 730,
+ 730, 761, 761, 792, 792, 823, 823, 854, 854, 885, 885, 916, 916, 947,
+ 947, 978, 978, 1009, 979, 1010, 948, 979, 917, 948, 886, 917, 855, 886,
+ 824, 855, 793, 824, 762, 793, 731, 762, 700, 731, 669, 700, 638, 669,
+ 607, 638, 639, 670, 670, 701, 701, 732, 732, 763, 763, 794, 794, 825,
+ 825, 856, 856, 887, 887, 918, 918, 949, 949, 980, 980, 1011, 981, 1012,
+ 950, 981, 919, 950, 888, 919, 857, 888, 826, 857, 795, 826, 764, 795,
+ 733, 764, 702, 733, 671, 702, 703, 734, 734, 765, 765, 796, 796, 827,
+ 827, 858, 858, 889, 889, 920, 920, 951, 951, 982, 982, 1013, 983, 1014,
+ 952, 983, 921, 952, 890, 921, 859, 890, 828, 859, 797, 828, 766, 797,
+ 735, 766, 767, 798, 798, 829, 829, 860, 860, 891, 891, 922, 922, 953,
+ 953, 984, 984, 1015, 985, 1016, 954, 985, 923, 954, 892, 923, 861, 892,
+ 830, 861, 799, 830, 831, 862, 862, 893, 893, 924, 924, 955, 955, 986,
+ 986, 1017, 987, 1018, 956, 987, 925, 956, 894, 925, 863, 894, 895, 926,
+ 926, 957, 957, 988, 988, 1019, 989, 1020, 958, 989, 927, 958, 959, 990,
+ 990, 1021, 991, 1022, 0, 0
+};
+/* Default 4x4 inverse-scan table: a permutation of 0..15. Presumably
+ * indexed by raster position and yielding that position's rank in the
+ * default scan -- confirm against callers. Read as a 4x4 grid, the ranks
+ * follow a zig-zag over anti-diagonals whose direction alternates.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x4[16]) = {
+ 0, 1, 5, 6, 2, 4, 7, 12, 3, 8, 11, 13, 9, 10, 14, 15
+};
+/* Column-order 4x4 inverse-scan table: read as 4 rows of 4 entries,
+ * entry (r, c) equals c * 4 + r, i.e. positions are ranked column by
+ * column.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x4[16]) = {
+ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+/* Row-order 4x4 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..15).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x4[16]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+/* Default 4x8 inverse-scan table: a permutation of 0..31. Read as 8 rows
+ * of 4 entries, the ranks advance along anti-diagonals, each one walked
+ * from its top-right end to its bottom-left end. This table is the
+ * transpose of av1_default_iscan_8x4.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x8[32]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 29, 25, 28, 30, 31,
+};
+/* Column-order 4x8 inverse-scan table: read as 8 rows of 4 entries,
+ * entry (r, c) equals c * 8 + r.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x8[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
+};
+/* Row-order 4x8 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..31).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x8[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+/* Default 8x4 inverse-scan table: a permutation of 0..31. Read as 4 rows
+ * of 8 entries, the ranks advance along anti-diagonals, each one walked
+ * from its bottom-left end to its top-right end. This table is the
+ * transpose of av1_default_iscan_4x8.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x4[32]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 1, 4, 8, 12, 16, 20, 24, 28,
+ 3, 7, 11, 15, 19, 23, 27, 30, 6, 10, 14, 18, 22, 26, 29, 31,
+};
+/* Column-order 8x4 inverse-scan table: read as 4 rows of 8 entries,
+ * entry (r, c) equals c * 4 + r.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x4[32]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29,
+ 2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31,
+};
+/* Row-order 8x4 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..31).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x4[32]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+};
+/* Default 4x16 inverse-scan table: a permutation of 0..63. Read as 16
+ * rows of 4 entries, the ranks advance along anti-diagonals, each one
+ * walked from its top-right end to its bottom-left end. This table is
+ * the transpose of av1_default_iscan_16x4.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_4x16[64]) = {
+ 0, 1, 3, 6, 2, 4, 7, 10, 5, 8, 11, 14, 9, 12, 15, 18,
+ 13, 16, 19, 22, 17, 20, 23, 26, 21, 24, 27, 30, 25, 28, 31, 34,
+ 29, 32, 35, 38, 33, 36, 39, 42, 37, 40, 43, 46, 41, 44, 47, 50,
+ 45, 48, 51, 54, 49, 52, 55, 58, 53, 56, 59, 61, 57, 60, 62, 63,
+};
+/* Default 16x4 inverse-scan table: a permutation of 0..63. Read as 4
+ * rows of 16 entries, the ranks advance along anti-diagonals, each one
+ * walked from its bottom-left end to its top-right end. This table is
+ * the transpose of av1_default_iscan_4x16.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x4[64]) = {
+ 0, 2, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57,
+ 1, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 62,
+ 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 61, 63,
+};
+/* Row-order 4x16 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..63).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_4x16[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+/* Row-order 16x4 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..63).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x4[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+/* Column-order 4x16 inverse-scan table: read as 16 rows of 4 entries,
+ * entry (r, c) equals c * 16 + r.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_4x16[64]) = {
+ 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51,
+ 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55,
+ 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59,
+ 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63,
+};
+/* Column-order 16x4 inverse-scan table: read as 4 rows of 16 entries,
+ * entry (r, c) equals c * 4 + r.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x4[64]) = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
+ 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61,
+ 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62,
+ 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63,
+};
+/* Default 8x32 inverse-scan table with 256 entries. The initializer is
+ * wrapped at 15 values per line, which does not match the block width;
+ * re-read it as 32 rows of 8 entries to see the layout. The leading rows
+ * (0 1 3 6 10 15 21 28 / 2 4 7 11 ...) show ranks advancing along
+ * anti-diagonals from top-right to bottom-left.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x32[256]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29,
+ 36, 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38,
+ 45, 52, 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47,
+ 54, 61, 68, 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56,
+ 63, 70, 77, 84, 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65,
+ 72, 79, 86, 93, 100, 59, 66, 73, 80, 87, 94, 101, 108, 67, 74,
+ 81, 88, 95, 102, 109, 116, 75, 82, 89, 96, 103, 110, 117, 124, 83,
+ 90, 97, 104, 111, 118, 125, 132, 91, 98, 105, 112, 119, 126, 133, 140,
+ 99, 106, 113, 120, 127, 134, 141, 148, 107, 114, 121, 128, 135, 142, 149,
+ 156, 115, 122, 129, 136, 143, 150, 157, 164, 123, 130, 137, 144, 151, 158,
+ 165, 172, 131, 138, 145, 152, 159, 166, 173, 180, 139, 146, 153, 160, 167,
+ 174, 181, 188, 147, 154, 161, 168, 175, 182, 189, 196, 155, 162, 169, 176,
+ 183, 190, 197, 204, 163, 170, 177, 184, 191, 198, 205, 212, 171, 178, 185,
+ 192, 199, 206, 213, 220, 179, 186, 193, 200, 207, 214, 221, 228, 187, 194,
+ 201, 208, 215, 222, 229, 235, 195, 202, 209, 216, 223, 230, 236, 241, 203,
+ 210, 217, 224, 231, 237, 242, 246, 211, 218, 225, 232, 238, 243, 247, 250,
+ 219, 226, 233, 239, 244, 248, 251, 253, 227, 234, 240, 245, 249, 252, 254,
+ 255,
+};
+/* Default 32x8 inverse-scan table with 256 entries. The initializer is
+ * wrapped at 15 values per line, which does not match the block width;
+ * re-read it as 8 rows of 32 entries to see the layout. The first row
+ * (0 2 5 9 14 20 27 35 ...) and the start of the second (1 4 8 13 ...)
+ * show ranks advancing along anti-diagonals from bottom-left to
+ * top-right.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x8[256]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91,
+ 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187, 195, 203, 211,
+ 219, 227, 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73,
+ 81, 89, 97, 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193,
+ 201, 209, 217, 225, 233, 240, 6, 11, 17, 24, 32, 40, 48, 56, 64,
+ 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184,
+ 192, 200, 208, 216, 224, 232, 239, 245, 10, 16, 23, 31, 39, 47, 55,
+ 63, 71, 79, 87, 95, 103, 111, 119, 127, 135, 143, 151, 159, 167, 175,
+ 183, 191, 199, 207, 215, 223, 231, 238, 244, 249, 15, 22, 30, 38, 46,
+ 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142, 150, 158, 166,
+ 174, 182, 190, 198, 206, 214, 222, 230, 237, 243, 248, 252, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 236, 242, 247, 251, 254, 28,
+ 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148,
+ 156, 164, 172, 180, 188, 196, 204, 212, 220, 228, 235, 241, 246, 250, 253,
+ 255,
+};
+/* Row-order 8x32 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..255).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x32[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+/* Row-order 32x8 inverse-scan table: the identity mapping (entry i is i
+ * for i in 0..255).
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x8[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+/* Column-order 8x32 inverse-scan table: read as 32 rows of 8 entries
+ * (the initializer shows two such rows per line), entry (r, c) equals
+ * c * 32 + r.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x32[256]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 1, 33, 65, 97, 129, 161, 193, 225,
+ 2, 34, 66, 98, 130, 162, 194, 226, 3, 35, 67, 99, 131, 163, 195, 227,
+ 4, 36, 68, 100, 132, 164, 196, 228, 5, 37, 69, 101, 133, 165, 197, 229,
+ 6, 38, 70, 102, 134, 166, 198, 230, 7, 39, 71, 103, 135, 167, 199, 231,
+ 8, 40, 72, 104, 136, 168, 200, 232, 9, 41, 73, 105, 137, 169, 201, 233,
+ 10, 42, 74, 106, 138, 170, 202, 234, 11, 43, 75, 107, 139, 171, 203, 235,
+ 12, 44, 76, 108, 140, 172, 204, 236, 13, 45, 77, 109, 141, 173, 205, 237,
+ 14, 46, 78, 110, 142, 174, 206, 238, 15, 47, 79, 111, 143, 175, 207, 239,
+ 16, 48, 80, 112, 144, 176, 208, 240, 17, 49, 81, 113, 145, 177, 209, 241,
+ 18, 50, 82, 114, 146, 178, 210, 242, 19, 51, 83, 115, 147, 179, 211, 243,
+ 20, 52, 84, 116, 148, 180, 212, 244, 21, 53, 85, 117, 149, 181, 213, 245,
+ 22, 54, 86, 118, 150, 182, 214, 246, 23, 55, 87, 119, 151, 183, 215, 247,
+ 24, 56, 88, 120, 152, 184, 216, 248, 25, 57, 89, 121, 153, 185, 217, 249,
+ 26, 58, 90, 122, 154, 186, 218, 250, 27, 59, 91, 123, 155, 187, 219, 251,
+ 28, 60, 92, 124, 156, 188, 220, 252, 29, 61, 93, 125, 157, 189, 221, 253,
+ 30, 62, 94, 126, 158, 190, 222, 254, 31, 63, 95, 127, 159, 191, 223, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x8[256]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112,
+ 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232,
+ 240, 248, 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97,
+ 105, 113, 121, 129, 137, 145, 153, 161, 169, 177, 185, 193, 201, 209, 217,
+ 225, 233, 241, 249, 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82,
+ 90, 98, 106, 114, 122, 130, 138, 146, 154, 162, 170, 178, 186, 194, 202,
+ 210, 218, 226, 234, 242, 250, 3, 11, 19, 27, 35, 43, 51, 59, 67,
+ 75, 83, 91, 99, 107, 115, 123, 131, 139, 147, 155, 163, 171, 179, 187,
+ 195, 203, 211, 219, 227, 235, 243, 251, 4, 12, 20, 28, 36, 44, 52,
+ 60, 68, 76, 84, 92, 100, 108, 116, 124, 132, 140, 148, 156, 164, 172,
+ 180, 188, 196, 204, 212, 220, 228, 236, 244, 252, 5, 13, 21, 29, 37,
+ 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125, 133, 141, 149, 157,
+ 165, 173, 181, 189, 197, 205, 213, 221, 229, 237, 245, 253, 6, 14, 22,
+ 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126, 134, 142,
+ 150, 158, 166, 174, 182, 190, 198, 206, 214, 222, 230, 238, 246, 254, 7,
+ 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+ 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x8[64]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57,
+ 2, 10, 18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59,
+ 4, 12, 20, 28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61,
+ 6, 14, 22, 30, 38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x8[64]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x8[64]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 2, 4, 7, 13, 16, 26, 29, 42,
+ 3, 8, 12, 17, 25, 30, 41, 43, 9, 11, 18, 24, 31, 40, 44, 53,
+ 10, 19, 23, 32, 39, 45, 52, 54, 20, 22, 33, 38, 46, 51, 55, 60,
+ 21, 34, 37, 47, 50, 56, 59, 61, 35, 36, 48, 49, 57, 58, 62, 63
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_8x16[128]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 2, 4, 7, 11, 16, 22, 29, 36,
+ 5, 8, 12, 17, 23, 30, 37, 44, 9, 13, 18, 24, 31, 38, 45, 52,
+ 14, 19, 25, 32, 39, 46, 53, 60, 20, 26, 33, 40, 47, 54, 61, 68,
+ 27, 34, 41, 48, 55, 62, 69, 76, 35, 42, 49, 56, 63, 70, 77, 84,
+ 43, 50, 57, 64, 71, 78, 85, 92, 51, 58, 65, 72, 79, 86, 93, 100,
+ 59, 66, 73, 80, 87, 94, 101, 107, 67, 74, 81, 88, 95, 102, 108, 113,
+ 75, 82, 89, 96, 103, 109, 114, 118, 83, 90, 97, 104, 110, 115, 119, 122,
+ 91, 98, 105, 111, 116, 120, 123, 125, 99, 106, 112, 117, 121, 124, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x8[128]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99,
+ 1, 4, 8, 13, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106,
+ 3, 7, 12, 18, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 112,
+ 6, 11, 17, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 111, 117,
+ 10, 16, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 110, 116, 121,
+ 15, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 109, 115, 120, 124,
+ 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 108, 114, 119, 123, 126,
+ 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 107, 113, 118, 122, 125, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_8x16[128]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 1, 17, 33, 49, 65, 81, 97, 113,
+ 2, 18, 34, 50, 66, 82, 98, 114, 3, 19, 35, 51, 67, 83, 99, 115,
+ 4, 20, 36, 52, 68, 84, 100, 116, 5, 21, 37, 53, 69, 85, 101, 117,
+ 6, 22, 38, 54, 70, 86, 102, 118, 7, 23, 39, 55, 71, 87, 103, 119,
+ 8, 24, 40, 56, 72, 88, 104, 120, 9, 25, 41, 57, 73, 89, 105, 121,
+ 10, 26, 42, 58, 74, 90, 106, 122, 11, 27, 43, 59, 75, 91, 107, 123,
+ 12, 28, 44, 60, 76, 92, 108, 124, 13, 29, 45, 61, 77, 93, 109, 125,
+ 14, 30, 46, 62, 78, 94, 110, 126, 15, 31, 47, 63, 79, 95, 111, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x8[128]) = {
+ 0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120,
+ 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, 97, 105, 113, 121,
+ 2, 10, 18, 26, 34, 42, 50, 58, 66, 74, 82, 90, 98, 106, 114, 122,
+ 3, 11, 19, 27, 35, 43, 51, 59, 67, 75, 83, 91, 99, 107, 115, 123,
+ 4, 12, 20, 28, 36, 44, 52, 60, 68, 76, 84, 92, 100, 108, 116, 124,
+ 5, 13, 21, 29, 37, 45, 53, 61, 69, 77, 85, 93, 101, 109, 117, 125,
+ 6, 14, 22, 30, 38, 46, 54, 62, 70, 78, 86, 94, 102, 110, 118, 126,
+ 7, 15, 23, 31, 39, 47, 55, 63, 71, 79, 87, 95, 103, 111, 119, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_8x16[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x8[128]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x32[512]) = {
+ 0, 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 66, 78, 91, 105,
+ 120, 2, 4, 7, 11, 16, 22, 29, 37, 46, 56, 67, 79, 92, 106,
+ 121, 136, 5, 8, 12, 17, 23, 30, 38, 47, 57, 68, 80, 93, 107,
+ 122, 137, 152, 9, 13, 18, 24, 31, 39, 48, 58, 69, 81, 94, 108,
+ 123, 138, 153, 168, 14, 19, 25, 32, 40, 49, 59, 70, 82, 95, 109,
+ 124, 139, 154, 169, 184, 20, 26, 33, 41, 50, 60, 71, 83, 96, 110,
+ 125, 140, 155, 170, 185, 200, 27, 34, 42, 51, 61, 72, 84, 97, 111,
+ 126, 141, 156, 171, 186, 201, 216, 35, 43, 52, 62, 73, 85, 98, 112,
+ 127, 142, 157, 172, 187, 202, 217, 232, 44, 53, 63, 74, 86, 99, 113,
+ 128, 143, 158, 173, 188, 203, 218, 233, 248, 54, 64, 75, 87, 100, 114,
+ 129, 144, 159, 174, 189, 204, 219, 234, 249, 264, 65, 76, 88, 101, 115,
+ 130, 145, 160, 175, 190, 205, 220, 235, 250, 265, 280, 77, 89, 102, 116,
+ 131, 146, 161, 176, 191, 206, 221, 236, 251, 266, 281, 296, 90, 103, 117,
+ 132, 147, 162, 177, 192, 207, 222, 237, 252, 267, 282, 297, 312, 104, 118,
+ 133, 148, 163, 178, 193, 208, 223, 238, 253, 268, 283, 298, 313, 328, 119,
+ 134, 149, 164, 179, 194, 209, 224, 239, 254, 269, 284, 299, 314, 329, 344,
+ 135, 150, 165, 180, 195, 210, 225, 240, 255, 270, 285, 300, 315, 330, 345,
+ 360, 151, 166, 181, 196, 211, 226, 241, 256, 271, 286, 301, 316, 331, 346,
+ 361, 376, 167, 182, 197, 212, 227, 242, 257, 272, 287, 302, 317, 332, 347,
+ 362, 377, 392, 183, 198, 213, 228, 243, 258, 273, 288, 303, 318, 333, 348,
+ 363, 378, 393, 407, 199, 214, 229, 244, 259, 274, 289, 304, 319, 334, 349,
+ 364, 379, 394, 408, 421, 215, 230, 245, 260, 275, 290, 305, 320, 335, 350,
+ 365, 380, 395, 409, 422, 434, 231, 246, 261, 276, 291, 306, 321, 336, 351,
+ 366, 381, 396, 410, 423, 435, 446, 247, 262, 277, 292, 307, 322, 337, 352,
+ 367, 382, 397, 411, 424, 436, 447, 457, 263, 278, 293, 308, 323, 338, 353,
+ 368, 383, 398, 412, 425, 437, 448, 458, 467, 279, 294, 309, 324, 339, 354,
+ 369, 384, 399, 413, 426, 438, 449, 459, 468, 476, 295, 310, 325, 340, 355,
+ 370, 385, 400, 414, 427, 439, 450, 460, 469, 477, 484, 311, 326, 341, 356,
+ 371, 386, 401, 415, 428, 440, 451, 461, 470, 478, 485, 491, 327, 342, 357,
+ 372, 387, 402, 416, 429, 441, 452, 462, 471, 479, 486, 492, 497, 343, 358,
+ 373, 388, 403, 417, 430, 442, 453, 463, 472, 480, 487, 493, 498, 502, 359,
+ 374, 389, 404, 418, 431, 443, 454, 464, 473, 481, 488, 494, 499, 503, 506,
+ 375, 390, 405, 419, 432, 444, 455, 465, 474, 482, 489, 495, 500, 504, 507,
+ 509, 391, 406, 420, 433, 445, 456, 466, 475, 483, 490, 496, 501, 505, 508,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x16[512]) = {
+ 0, 2, 5, 9, 14, 20, 27, 35, 44, 54, 65, 77, 90, 104, 119,
+ 135, 151, 167, 183, 199, 215, 231, 247, 263, 279, 295, 311, 327, 343, 359,
+ 375, 391, 1, 4, 8, 13, 19, 26, 34, 43, 53, 64, 76, 89, 103,
+ 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278, 294, 310, 326, 342,
+ 358, 374, 390, 406, 3, 7, 12, 18, 25, 33, 42, 52, 63, 75, 88,
+ 102, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309, 325,
+ 341, 357, 373, 389, 405, 420, 6, 11, 17, 24, 32, 41, 51, 62, 74,
+ 87, 101, 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308,
+ 324, 340, 356, 372, 388, 404, 419, 433, 10, 16, 23, 31, 40, 50, 61,
+ 73, 86, 100, 115, 131, 147, 163, 179, 195, 211, 227, 243, 259, 275, 291,
+ 307, 323, 339, 355, 371, 387, 403, 418, 432, 445, 15, 22, 30, 39, 49,
+ 60, 72, 85, 99, 114, 130, 146, 162, 178, 194, 210, 226, 242, 258, 274,
+ 290, 306, 322, 338, 354, 370, 386, 402, 417, 431, 444, 456, 21, 29, 38,
+ 48, 59, 71, 84, 98, 113, 129, 145, 161, 177, 193, 209, 225, 241, 257,
+ 273, 289, 305, 321, 337, 353, 369, 385, 401, 416, 430, 443, 455, 466, 28,
+ 37, 47, 58, 70, 83, 97, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 415, 429, 442, 454, 465,
+ 475, 36, 46, 57, 69, 82, 96, 111, 127, 143, 159, 175, 191, 207, 223,
+ 239, 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 414, 428, 441, 453,
+ 464, 474, 483, 45, 56, 68, 81, 95, 110, 126, 142, 158, 174, 190, 206,
+ 222, 238, 254, 270, 286, 302, 318, 334, 350, 366, 382, 398, 413, 427, 440,
+ 452, 463, 473, 482, 490, 55, 67, 80, 94, 109, 125, 141, 157, 173, 189,
+ 205, 221, 237, 253, 269, 285, 301, 317, 333, 349, 365, 381, 397, 412, 426,
+ 439, 451, 462, 472, 481, 489, 496, 66, 79, 93, 108, 124, 140, 156, 172,
+ 188, 204, 220, 236, 252, 268, 284, 300, 316, 332, 348, 364, 380, 396, 411,
+ 425, 438, 450, 461, 471, 480, 488, 495, 501, 78, 92, 107, 123, 139, 155,
+ 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363, 379, 395,
+ 410, 424, 437, 449, 460, 470, 479, 487, 494, 500, 505, 91, 106, 122, 138,
+ 154, 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378,
+ 394, 409, 423, 436, 448, 459, 469, 478, 486, 493, 499, 504, 508, 105, 121,
+ 137, 153, 169, 185, 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361,
+ 377, 393, 408, 422, 435, 447, 458, 468, 477, 485, 492, 498, 503, 507, 510,
+ 120, 136, 152, 168, 184, 200, 216, 232, 248, 264, 280, 296, 312, 328, 344,
+ 360, 376, 392, 407, 421, 434, 446, 457, 467, 476, 484, 491, 497, 502, 506,
+ 509, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x32[512]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+ 1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+ 2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+ 3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+ 5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+ 6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+ 8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+ 9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+ 10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459, 491,
+ 12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460, 492,
+ 13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462, 494,
+ 15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463, 495,
+ 16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464, 496,
+ 17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466, 498,
+ 19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467, 499,
+ 20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469, 501,
+ 22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470, 502,
+ 23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471, 503,
+ 24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473, 505,
+ 26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474, 506,
+ 27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476, 508,
+ 29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477, 509,
+ 30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478, 510,
+ 31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x16[512]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224,
+ 240, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464,
+ 480, 496, 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193,
+ 209, 225, 241, 257, 273, 289, 305, 321, 337, 353, 369, 385, 401, 417, 433,
+ 449, 465, 481, 497, 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162,
+ 178, 194, 210, 226, 242, 258, 274, 290, 306, 322, 338, 354, 370, 386, 402,
+ 418, 434, 450, 466, 482, 498, 3, 19, 35, 51, 67, 83, 99, 115, 131,
+ 147, 163, 179, 195, 211, 227, 243, 259, 275, 291, 307, 323, 339, 355, 371,
+ 387, 403, 419, 435, 451, 467, 483, 499, 4, 20, 36, 52, 68, 84, 100,
+ 116, 132, 148, 164, 180, 196, 212, 228, 244, 260, 276, 292, 308, 324, 340,
+ 356, 372, 388, 404, 420, 436, 452, 468, 484, 500, 5, 21, 37, 53, 69,
+ 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245, 261, 277, 293, 309,
+ 325, 341, 357, 373, 389, 405, 421, 437, 453, 469, 485, 501, 6, 22, 38,
+ 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246, 262, 278,
+ 294, 310, 326, 342, 358, 374, 390, 406, 422, 438, 454, 470, 486, 502, 7,
+ 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 263, 279, 295, 311, 327, 343, 359, 375, 391, 407, 423, 439, 455, 471, 487,
+ 503, 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216,
+ 232, 248, 264, 280, 296, 312, 328, 344, 360, 376, 392, 408, 424, 440, 456,
+ 472, 488, 504, 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185,
+ 201, 217, 233, 249, 265, 281, 297, 313, 329, 345, 361, 377, 393, 409, 425,
+ 441, 457, 473, 489, 505, 10, 26, 42, 58, 74, 90, 106, 122, 138, 154,
+ 170, 186, 202, 218, 234, 250, 266, 282, 298, 314, 330, 346, 362, 378, 394,
+ 410, 426, 442, 458, 474, 490, 506, 11, 27, 43, 59, 75, 91, 107, 123,
+ 139, 155, 171, 187, 203, 219, 235, 251, 267, 283, 299, 315, 331, 347, 363,
+ 379, 395, 411, 427, 443, 459, 475, 491, 507, 12, 28, 44, 60, 76, 92,
+ 108, 124, 140, 156, 172, 188, 204, 220, 236, 252, 268, 284, 300, 316, 332,
+ 348, 364, 380, 396, 412, 428, 444, 460, 476, 492, 508, 13, 29, 45, 61,
+ 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253, 269, 285, 301,
+ 317, 333, 349, 365, 381, 397, 413, 429, 445, 461, 477, 493, 509, 14, 30,
+ 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254, 270,
+ 286, 302, 318, 334, 350, 366, 382, 398, 414, 430, 446, 462, 478, 494, 510,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239,
+ 255, 271, 287, 303, 319, 335, 351, 367, 383, 399, 415, 431, 447, 463, 479,
+ 495, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x32[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x16[512]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269,
+ 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284,
+ 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
+ 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314,
+ 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329,
+ 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344,
+ 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359,
+ 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374,
+ 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404,
+ 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419,
+ 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434,
+ 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449,
+ 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464,
+ 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+ 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
+ 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509,
+ 510, 511,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_16x16[256]) = {
+ 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+ 1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+ 2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+ 3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+ 4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+ 5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+ 6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+ 7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+ 8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+ 9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+ 10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+ 11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+ 12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+ 13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+ 14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+ 15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_16x16[256]) = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
+ 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
+ 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
+ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
+ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224,
+ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
+ 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_16x16[256]) = {
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90, 91, 119,
+ 120, 2, 4, 7, 13, 16, 26, 29, 43, 46, 64, 67, 89, 92, 118,
+ 121, 150, 3, 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117,
+ 122, 149, 151, 9, 11, 18, 24, 31, 41, 48, 62, 69, 87, 94, 116,
+ 123, 148, 152, 177, 10, 19, 23, 32, 40, 49, 61, 70, 86, 95, 115,
+ 124, 147, 153, 176, 178, 20, 22, 33, 39, 50, 60, 71, 85, 96, 114,
+ 125, 146, 154, 175, 179, 200, 21, 34, 38, 51, 59, 72, 84, 97, 113,
+ 126, 145, 155, 174, 180, 199, 201, 35, 37, 52, 58, 73, 83, 98, 112,
+ 127, 144, 156, 173, 181, 198, 202, 219, 36, 53, 57, 74, 82, 99, 111,
+ 128, 143, 157, 172, 182, 197, 203, 218, 220, 54, 56, 75, 81, 100, 110,
+ 129, 142, 158, 171, 183, 196, 204, 217, 221, 234, 55, 76, 80, 101, 109,
+ 130, 141, 159, 170, 184, 195, 205, 216, 222, 233, 235, 77, 79, 102, 108,
+ 131, 140, 160, 169, 185, 194, 206, 215, 223, 232, 236, 245, 78, 103, 107,
+ 132, 139, 161, 168, 186, 193, 207, 214, 224, 231, 237, 244, 246, 104, 106,
+ 133, 138, 162, 167, 187, 192, 208, 213, 225, 230, 238, 243, 247, 252, 105,
+ 134, 137, 163, 166, 188, 191, 209, 212, 226, 229, 239, 242, 248, 251, 253,
+ 135, 136, 164, 165, 189, 190, 210, 211, 227, 228, 240, 241, 249, 250, 254,
+ 255
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mcol_iscan_32x32[1024]) = {
+ 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+ 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864,
+ 896, 928, 960, 992, 1, 33, 65, 97, 129, 161, 193, 225, 257, 289,
+ 321, 353, 385, 417, 449, 481, 513, 545, 577, 609, 641, 673, 705, 737,
+ 769, 801, 833, 865, 897, 929, 961, 993, 2, 34, 66, 98, 130, 162,
+ 194, 226, 258, 290, 322, 354, 386, 418, 450, 482, 514, 546, 578, 610,
+ 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962, 994, 3, 35,
+ 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+ 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931,
+ 963, 995, 4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356,
+ 388, 420, 452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804,
+ 836, 868, 900, 932, 964, 996, 5, 37, 69, 101, 133, 165, 197, 229,
+ 261, 293, 325, 357, 389, 421, 453, 485, 517, 549, 581, 613, 645, 677,
+ 709, 741, 773, 805, 837, 869, 901, 933, 965, 997, 6, 38, 70, 102,
+ 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486, 518, 550,
+ 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966, 998,
+ 7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+ 455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871,
+ 903, 935, 967, 999, 8, 40, 72, 104, 136, 168, 200, 232, 264, 296,
+ 328, 360, 392, 424, 456, 488, 520, 552, 584, 616, 648, 680, 712, 744,
+ 776, 808, 840, 872, 904, 936, 968, 1000, 9, 41, 73, 105, 137, 169,
+ 201, 233, 265, 297, 329, 361, 393, 425, 457, 489, 521, 553, 585, 617,
+ 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969, 1001, 10, 42,
+ 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458, 490,
+ 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906, 938,
+ 970, 1002, 11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363,
+ 395, 427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+ 843, 875, 907, 939, 971, 1003, 12, 44, 76, 108, 140, 172, 204, 236,
+ 268, 300, 332, 364, 396, 428, 460, 492, 524, 556, 588, 620, 652, 684,
+ 716, 748, 780, 812, 844, 876, 908, 940, 972, 1004, 13, 45, 77, 109,
+ 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461, 493, 525, 557,
+ 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909, 941, 973, 1005,
+ 14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430,
+ 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878,
+ 910, 942, 974, 1006, 15, 47, 79, 111, 143, 175, 207, 239, 271, 303,
+ 335, 367, 399, 431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751,
+ 783, 815, 847, 879, 911, 943, 975, 1007, 16, 48, 80, 112, 144, 176,
+ 208, 240, 272, 304, 336, 368, 400, 432, 464, 496, 528, 560, 592, 624,
+ 656, 688, 720, 752, 784, 816, 848, 880, 912, 944, 976, 1008, 17, 49,
+ 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465, 497,
+ 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913, 945,
+ 977, 1009, 18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370,
+ 402, 434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818,
+ 850, 882, 914, 946, 978, 1010, 19, 51, 83, 115, 147, 179, 211, 243,
+ 275, 307, 339, 371, 403, 435, 467, 499, 531, 563, 595, 627, 659, 691,
+ 723, 755, 787, 819, 851, 883, 915, 947, 979, 1011, 20, 52, 84, 116,
+ 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468, 500, 532, 564,
+ 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916, 948, 980, 1012,
+ 21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437,
+ 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885,
+ 917, 949, 981, 1013, 22, 54, 86, 118, 150, 182, 214, 246, 278, 310,
+ 342, 374, 406, 438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758,
+ 790, 822, 854, 886, 918, 950, 982, 1014, 23, 55, 87, 119, 151, 183,
+ 215, 247, 279, 311, 343, 375, 407, 439, 471, 503, 535, 567, 599, 631,
+ 663, 695, 727, 759, 791, 823, 855, 887, 919, 951, 983, 1015, 24, 56,
+ 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472, 504,
+ 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920, 952,
+ 984, 1016, 25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377,
+ 409, 441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825,
+ 857, 889, 921, 953, 985, 1017, 26, 58, 90, 122, 154, 186, 218, 250,
+ 282, 314, 346, 378, 410, 442, 474, 506, 538, 570, 602, 634, 666, 698,
+ 730, 762, 794, 826, 858, 890, 922, 954, 986, 1018, 27, 59, 91, 123,
+ 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475, 507, 539, 571,
+ 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923, 955, 987, 1019,
+ 28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444,
+ 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892,
+ 924, 956, 988, 1020, 29, 61, 93, 125, 157, 189, 221, 253, 285, 317,
+ 349, 381, 413, 445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765,
+ 797, 829, 861, 893, 925, 957, 989, 1021, 30, 62, 94, 126, 158, 190,
+ 222, 254, 286, 318, 350, 382, 414, 446, 478, 510, 542, 574, 606, 638,
+ 670, 702, 734, 766, 798, 830, 862, 894, 926, 958, 990, 1022, 31, 63,
+ 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479, 511,
+ 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927, 959,
+ 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_mrow_iscan_32x32[1024]) = {  // identity table: element i holds i, for i = 0..1023
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
+ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
+ 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
+ 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
+ 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+ 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194,
+ 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+ 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233,
+ 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246,
+ 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259,
+ 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272,
+ 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285,
+ 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+ 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311,
+ 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324,
+ 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337,
+ 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350,
+ 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363,
+ 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376,
+ 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389,
+ 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402,
+ 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+ 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+ 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+ 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454,
+ 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467,
+ 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480,
+ 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493,
+ 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506,
+ 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519,
+ 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532,
+ 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545,
+ 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558,
+ 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571,
+ 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584,
+ 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+ 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610,
+ 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623,
+ 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636,
+ 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649,
+ 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662,
+ 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675,
+ 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
+ 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701,
+ 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+ 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727,
+ 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740,
+ 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753,
+ 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766,
+ 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779,
+ 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792,
+ 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805,
+ 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818,
+ 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+ 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+ 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+ 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870,
+ 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883,
+ 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896,
+ 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909,
+ 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922,
+ 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935,
+ 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948,
+ 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961,
+ 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974,
+ 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987,
+ 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000,
+ 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013,
+ 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_default_iscan_32x32[1024]) = {  // read as a 32-wide raster, values follow a zig-zag over anti-diagonals; presumably the inverse of default_scan_32x32 (not visible here) -- TODO confirm
+ 0, 1, 5, 6, 14, 15, 27, 28, 44, 45, 65, 66, 90,
+ 91, 119, 120, 152, 153, 189, 190, 230, 231, 275, 276, 324, 325,
+ 377, 378, 434, 435, 495, 496, 2, 4, 7, 13, 16, 26, 29,
+ 43, 46, 64, 67, 89, 92, 118, 121, 151, 154, 188, 191, 229,
+ 232, 274, 277, 323, 326, 376, 379, 433, 436, 494, 497, 558, 3,
+ 8, 12, 17, 25, 30, 42, 47, 63, 68, 88, 93, 117, 122,
+ 150, 155, 187, 192, 228, 233, 273, 278, 322, 327, 375, 380, 432,
+ 437, 493, 498, 557, 559, 9, 11, 18, 24, 31, 41, 48, 62,
+ 69, 87, 94, 116, 123, 149, 156, 186, 193, 227, 234, 272, 279,
+ 321, 328, 374, 381, 431, 438, 492, 499, 556, 560, 617, 10, 19,
+ 23, 32, 40, 49, 61, 70, 86, 95, 115, 124, 148, 157, 185,
+ 194, 226, 235, 271, 280, 320, 329, 373, 382, 430, 439, 491, 500,
+ 555, 561, 616, 618, 20, 22, 33, 39, 50, 60, 71, 85, 96,
+ 114, 125, 147, 158, 184, 195, 225, 236, 270, 281, 319, 330, 372,
+ 383, 429, 440, 490, 501, 554, 562, 615, 619, 672, 21, 34, 38,
+ 51, 59, 72, 84, 97, 113, 126, 146, 159, 183, 196, 224, 237,
+ 269, 282, 318, 331, 371, 384, 428, 441, 489, 502, 553, 563, 614,
+ 620, 671, 673, 35, 37, 52, 58, 73, 83, 98, 112, 127, 145,
+ 160, 182, 197, 223, 238, 268, 283, 317, 332, 370, 385, 427, 442,
+ 488, 503, 552, 564, 613, 621, 670, 674, 723, 36, 53, 57, 74,
+ 82, 99, 111, 128, 144, 161, 181, 198, 222, 239, 267, 284, 316,
+ 333, 369, 386, 426, 443, 487, 504, 551, 565, 612, 622, 669, 675,
+ 722, 724, 54, 56, 75, 81, 100, 110, 129, 143, 162, 180, 199,
+ 221, 240, 266, 285, 315, 334, 368, 387, 425, 444, 486, 505, 550,
+ 566, 611, 623, 668, 676, 721, 725, 770, 55, 76, 80, 101, 109,
+ 130, 142, 163, 179, 200, 220, 241, 265, 286, 314, 335, 367, 388,
+ 424, 445, 485, 506, 549, 567, 610, 624, 667, 677, 720, 726, 769,
+ 771, 77, 79, 102, 108, 131, 141, 164, 178, 201, 219, 242, 264,
+ 287, 313, 336, 366, 389, 423, 446, 484, 507, 548, 568, 609, 625,
+ 666, 678, 719, 727, 768, 772, 813, 78, 103, 107, 132, 140, 165,
+ 177, 202, 218, 243, 263, 288, 312, 337, 365, 390, 422, 447, 483,
+ 508, 547, 569, 608, 626, 665, 679, 718, 728, 767, 773, 812, 814,
+ 104, 106, 133, 139, 166, 176, 203, 217, 244, 262, 289, 311, 338,
+ 364, 391, 421, 448, 482, 509, 546, 570, 607, 627, 664, 680, 717,
+ 729, 766, 774, 811, 815, 852, 105, 134, 138, 167, 175, 204, 216,
+ 245, 261, 290, 310, 339, 363, 392, 420, 449, 481, 510, 545, 571,
+ 606, 628, 663, 681, 716, 730, 765, 775, 810, 816, 851, 853, 135,
+ 137, 168, 174, 205, 215, 246, 260, 291, 309, 340, 362, 393, 419,
+ 450, 480, 511, 544, 572, 605, 629, 662, 682, 715, 731, 764, 776,
+ 809, 817, 850, 854, 887, 136, 169, 173, 206, 214, 247, 259, 292,
+ 308, 341, 361, 394, 418, 451, 479, 512, 543, 573, 604, 630, 661,
+ 683, 714, 732, 763, 777, 808, 818, 849, 855, 886, 888, 170, 172,
+ 207, 213, 248, 258, 293, 307, 342, 360, 395, 417, 452, 478, 513,
+ 542, 574, 603, 631, 660, 684, 713, 733, 762, 778, 807, 819, 848,
+ 856, 885, 889, 918, 171, 208, 212, 249, 257, 294, 306, 343, 359,
+ 396, 416, 453, 477, 514, 541, 575, 602, 632, 659, 685, 712, 734,
+ 761, 779, 806, 820, 847, 857, 884, 890, 917, 919, 209, 211, 250,
+ 256, 295, 305, 344, 358, 397, 415, 454, 476, 515, 540, 576, 601,
+ 633, 658, 686, 711, 735, 760, 780, 805, 821, 846, 858, 883, 891,
+ 916, 920, 945, 210, 251, 255, 296, 304, 345, 357, 398, 414, 455,
+ 475, 516, 539, 577, 600, 634, 657, 687, 710, 736, 759, 781, 804,
+ 822, 845, 859, 882, 892, 915, 921, 944, 946, 252, 254, 297, 303,
+ 346, 356, 399, 413, 456, 474, 517, 538, 578, 599, 635, 656, 688,
+ 709, 737, 758, 782, 803, 823, 844, 860, 881, 893, 914, 922, 943,
+ 947, 968, 253, 298, 302, 347, 355, 400, 412, 457, 473, 518, 537,
+ 579, 598, 636, 655, 689, 708, 738, 757, 783, 802, 824, 843, 861,
+ 880, 894, 913, 923, 942, 948, 967, 969, 299, 301, 348, 354, 401,
+ 411, 458, 472, 519, 536, 580, 597, 637, 654, 690, 707, 739, 756,
+ 784, 801, 825, 842, 862, 879, 895, 912, 924, 941, 949, 966, 970,
+ 987, 300, 349, 353, 402, 410, 459, 471, 520, 535, 581, 596, 638,
+ 653, 691, 706, 740, 755, 785, 800, 826, 841, 863, 878, 896, 911,
+ 925, 940, 950, 965, 971, 986, 988, 350, 352, 403, 409, 460, 470,
+ 521, 534, 582, 595, 639, 652, 692, 705, 741, 754, 786, 799, 827,
+ 840, 864, 877, 897, 910, 926, 939, 951, 964, 972, 985, 989, 1002,
+ 351, 404, 408, 461, 469, 522, 533, 583, 594, 640, 651, 693, 704,
+ 742, 753, 787, 798, 828, 839, 865, 876, 898, 909, 927, 938, 952,
+ 963, 973, 984, 990, 1001, 1003, 405, 407, 462, 468, 523, 532, 584,
+ 593, 641, 650, 694, 703, 743, 752, 788, 797, 829, 838, 866, 875,
+ 899, 908, 928, 937, 953, 962, 974, 983, 991, 1000, 1004, 1013, 406,
+ 463, 467, 524, 531, 585, 592, 642, 649, 695, 702, 744, 751, 789,
+ 796, 830, 837, 867, 874, 900, 907, 929, 936, 954, 961, 975, 982,
+ 992, 999, 1005, 1012, 1014, 464, 466, 525, 530, 586, 591, 643, 648,
+ 696, 701, 745, 750, 790, 795, 831, 836, 868, 873, 901, 906, 930,
+ 935, 955, 960, 976, 981, 993, 998, 1006, 1011, 1015, 1020, 465, 526,
+ 529, 587, 590, 644, 647, 697, 700, 746, 749, 791, 794, 832, 835,
+ 869, 872, 902, 905, 931, 934, 956, 959, 977, 980, 994, 997, 1007,
+ 1010, 1016, 1019, 1021, 527, 528, 588, 589, 645, 646, 698, 699, 747,
+ 748, 792, 793, 833, 834, 870, 871, 903, 904, 932, 933, 957, 958,
+ 978, 979, 995, 996, 1008, 1009, 1017, 1018, 1022, 1023
+};
+
+const SCAN_ORDER av1_default_scan_orders[TX_SIZES] = {  // one row per square size; each row lists the scan, iscan and neighbors tables
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16, default_scan_16x16_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zero, so the tx64 row below reuses the 32x32 tables of the row above.
+ { default_scan_32x32, av1_default_iscan_32x32, default_scan_32x32_neighbors },
+};
+
+const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES] = {
+ {
+ // TX_4X4
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { default_scan_4x4, av1_default_iscan_4x4, default_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ { mrow_scan_4x4, av1_mrow_iscan_4x4, mrow_scan_4x4_neighbors },
+ { mcol_scan_4x4, av1_mcol_iscan_4x4, mcol_scan_4x4_neighbors },
+ },
+ {
+ // TX_8X8
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { default_scan_8x8, av1_default_iscan_8x8, default_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ { mrow_scan_8x8, av1_mrow_iscan_8x8, mrow_scan_8x8_neighbors },
+ { mcol_scan_8x8, av1_mcol_iscan_8x8, mcol_scan_8x8_neighbors },
+ },
+ {
+ // TX_16X16
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { default_scan_16x16, av1_default_iscan_16x16,
+ default_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ { mrow_scan_16x16, av1_mrow_iscan_16x16, mrow_scan_16x16_neighbors },
+ { mcol_scan_16x16, av1_mcol_iscan_16x16, mcol_scan_16x16_neighbors },
+ },
+ {
+ // TX_32X32
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_64X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_4X8
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { default_scan_4x8, av1_default_iscan_4x8, default_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ { mrow_scan_4x8, av1_mrow_iscan_4x8, mrow_scan_4x8_neighbors },
+ { mcol_scan_4x8, av1_mcol_iscan_4x8, mcol_scan_4x8_neighbors },
+ },
+ {
+ // TX_8X4
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { default_scan_8x4, av1_default_iscan_8x4, default_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ { mrow_scan_8x4, av1_mrow_iscan_8x4, mrow_scan_8x4_neighbors },
+ { mcol_scan_8x4, av1_mcol_iscan_8x4, mcol_scan_8x4_neighbors },
+ },
+ {
+ // TX_8X16
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { default_scan_8x16, av1_default_iscan_8x16,
+ default_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ { mrow_scan_8x16, av1_mrow_iscan_8x16, mrow_scan_8x16_neighbors },
+ { mcol_scan_8x16, av1_mcol_iscan_8x16, mcol_scan_8x16_neighbors },
+ },
+ {
+ // TX_16X8
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { default_scan_16x8, av1_default_iscan_16x8,
+ default_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ { mrow_scan_16x8, av1_mrow_iscan_16x8, mrow_scan_16x8_neighbors },
+ { mcol_scan_16x8, av1_mcol_iscan_16x8, mcol_scan_16x8_neighbors },
+ },
+ {
+ // TX_16X32
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ },
+ {
+ // TX_32X16
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ },
+ {
+ // TX_32X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_64X32
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { default_scan_32x32, av1_default_iscan_32x32,
+ default_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ { mrow_scan_32x32, av1_mrow_iscan_32x32, mrow_scan_32x32_neighbors },
+ { mcol_scan_32x32, av1_mcol_iscan_32x32, mcol_scan_32x32_neighbors },
+ },
+ {
+ // TX_4X16
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { default_scan_4x16, av1_default_iscan_4x16,
+ default_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ { mrow_scan_4x16, av1_mrow_iscan_4x16, mrow_scan_4x16_neighbors },
+ { mcol_scan_4x16, av1_mcol_iscan_4x16, mcol_scan_4x16_neighbors },
+ },
+ {
+ // TX_16X4
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { default_scan_16x4, av1_default_iscan_16x4,
+ default_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ { mrow_scan_16x4, av1_mrow_iscan_16x4, mrow_scan_16x4_neighbors },
+ { mcol_scan_16x4, av1_mcol_iscan_16x4, mcol_scan_16x4_neighbors },
+ },
+ {
+ // TX_8X32
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { default_scan_8x32, av1_default_iscan_8x32,
+ default_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ { mrow_scan_8x32, av1_mrow_iscan_8x32, mrow_scan_8x32_neighbors },
+ { mcol_scan_8x32, av1_mcol_iscan_8x32, mcol_scan_8x32_neighbors },
+ },
+ {
+ // TX_32X8
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { default_scan_32x8, av1_default_iscan_32x8,
+ default_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ { mrow_scan_32x8, av1_mrow_iscan_32x8, mrow_scan_32x8_neighbors },
+ { mcol_scan_32x8, av1_mcol_iscan_32x8, mcol_scan_32x8_neighbors },
+ },
+ {
+ // TX_16X64
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { default_scan_16x32, av1_default_iscan_16x32,
+ default_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ { mrow_scan_16x32, av1_mrow_iscan_16x32, mrow_scan_16x32_neighbors },
+ { mcol_scan_16x32, av1_mcol_iscan_16x32, mcol_scan_16x32_neighbors },
+ },
+ {
+ // TX_64X16
+ // Half of the coefficients of tx64 at higher frequencies are set to
+ // zeros. So tx32's scan order is used.
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { default_scan_32x16, av1_default_iscan_32x16,
+ default_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ { mrow_scan_32x16, av1_mrow_iscan_32x16, mrow_scan_32x16_neighbors },
+ { mcol_scan_32x16, av1_mcol_iscan_32x16, mcol_scan_32x16_neighbors },
+ },
+};
diff --git a/third_party/aom/av1/common/scan.h b/third_party/aom/av1/common/scan.h
new file mode 100644
index 0000000000..233dc0efa2
--- /dev/null
+++ b/third_party/aom/av1/common/scan.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SCAN_H_
+#define AOM_AV1_COMMON_SCAN_H_
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_NEIGHBORS 2
+
+typedef enum SCAN_MODE {
+ SCAN_MODE_ZIG_ZAG,
+ SCAN_MODE_COL_DIAG,
+ SCAN_MODE_ROW_DIAG,
+ SCAN_MODE_COL_1D,
+ SCAN_MODE_ROW_1D,
+ SCAN_MODES
+} SCAN_MODE;
+
+extern const SCAN_ORDER av1_default_scan_orders[TX_SIZES];
+extern const SCAN_ORDER av1_scan_orders[TX_SIZES_ALL][TX_TYPES];
+
+void av1_deliver_eob_threshold(const AV1_COMMON *cm, MACROBLOCKD *xd);
+
+// Returns a pointer to the av1_scan_orders entry selected by the given
+// transform size and transform type. The indices are used as-is; no range
+// checking is done here.
+static INLINE const SCAN_ORDER *get_default_scan(TX_SIZE tx_size,
+ TX_TYPE tx_type) {
+ return &av1_scan_orders[tx_size][tx_type];
+}
+
+// Thin wrapper: forwards both arguments unchanged to get_default_scan().
+static INLINE const SCAN_ORDER *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
+ return get_default_scan(tx_size, tx_type);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SCAN_H_
diff --git a/third_party/aom/av1/common/seg_common.c b/third_party/aom/av1/common/seg_common.c
new file mode 100644
index 0000000000..cd189ad769
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/quant_common.h"
+
+// Per-feature tables indexed by SEG_LVL_FEATURES.
+// Only seven initializers are listed in each table; any remaining element
+// (up to SEG_LVL_MAX) is zero-initialized by the C aggregate-initialization
+// rules.
+
+// Non-zero when the feature's data may be negative (checked by the asserts in
+// av1_set_segdata()).
+static const int seg_feature_data_signed[SEG_LVL_MAX] = { 1, 1, 1, 1, 1, 0, 0 };
+
+// Largest magnitude accepted for the feature's data (checked by the asserts
+// in av1_set_segdata()).
+static const int seg_feature_data_max[SEG_LVL_MAX] = {
+ MAXQ, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, MAX_LOOP_FILTER, 7, 0
+};
+
+// These functions provide access to new segment level features.
+// Eventually these functions may be "optimized out" but for the moment,
+// the coding mechanism is still subject to change so these provide a
+// convenient single point of change.
+
+// Clears the feature data and feature mask arrays of `seg` via av1_zero
+// (presumably a zero-fill macro defined elsewhere -- confirm). The other
+// fields of the structure are not touched here.
+void av1_clearall_segfeatures(struct segmentation *seg) {
+ av1_zero(seg->feature_data);
+ av1_zero(seg->feature_mask);
+}
+
+// Derive the two summary fields of `seg` from its per-segment feature masks:
+//   - segid_preskip:     1 if any segment enables a feature whose id is
+//                        SEG_LVL_REF_FRAME or higher, otherwise 0.
+//   - last_active_segid: highest segment index with at least one enabled
+//                        feature (0 when no feature is enabled at all).
+void calculate_segdata(struct segmentation *seg) {
+  uint8_t preskip = 0;
+  int last_active = 0;
+
+  for (int seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
+    const unsigned int mask = seg->feature_mask[seg_id];
+    for (int feat = 0; feat < SEG_LVL_MAX; ++feat) {
+      if (!(mask & (1 << feat))) continue;
+      if (feat >= SEG_LVL_REF_FRAME) preskip = 1;
+      last_active = seg_id;
+    }
+  }
+
+  seg->segid_preskip = preskip;
+  seg->last_active_segid = last_active;
+}
+
+// Sets bit `feature_id` in the feature mask of segment `segment_id`.
+// Neither index is range-checked.
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] |= 1 << feature_id;
+}
+
+// Returns the seg_feature_data_max table entry for `feature_id`
+// (no range check on the index).
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_max[feature_id];
+}
+
+// Returns the seg_feature_data_signed table entry for `feature_id`
+// (non-zero means negative data values are allowed; no range check).
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id) {
+ return seg_feature_data_signed[feature_id];
+}
+
+// The 'seg_data' given for each segment can be either deltas (from the default
+// value chosen for the frame) or absolute values.
+//
+// Valid range for abs values is (0-127 for MB_LVL_ALT_Q), (0-63 for
+// SEGMENT_ALT_LF)
+// Valid range for delta values are (+/-127 for MB_LVL_ALT_Q), (+/-63 for
+// SEGMENT_ALT_LF)
+//
+// abs_delta = SEGMENT_DELTADATA (deltas) abs_delta = SEGMENT_ABSDATA (use
+// the absolute values given).
+
+// Stores `seg_data` as the data value of feature `feature_id` for segment
+// `segment_id`.
+//
+// The range checks are asserts only, so they vanish when assert() is compiled
+// out: a negative value requires the feature to be marked signed, and the
+// magnitude must not exceed seg_feature_data_max[feature_id].
+// The value is stored into an int16_t slot (see struct segmentation).
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data) {
+ if (seg_data < 0) {
+ assert(seg_feature_data_signed[feature_id]);
+ assert(-seg_data <= seg_feature_data_max[feature_id]);
+ } else {
+ assert(seg_data <= seg_feature_data_max[feature_id]);
+ }
+
+ seg->feature_data[segment_id][feature_id] = seg_data;
+}
+
+// TBD? Functions to read and write segment data with range / validity checking
diff --git a/third_party/aom/av1/common/seg_common.h b/third_party/aom/av1/common/seg_common.h
new file mode 100644
index 0000000000..8c35bba86c
--- /dev/null
+++ b/third_party/aom/av1/common/seg_common.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_SEG_COMMON_H_
+#define AOM_AV1_COMMON_SEG_COMMON_H_
+
+#include "aom_dsp/prob.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_SEGMENTS 8
+#define SEG_TREE_PROBS (MAX_SEGMENTS - 1)
+
+#define SEG_TEMPORAL_PRED_CTXS 3
+#define SPATIAL_PREDICTION_PROBS 3
+
+typedef enum {
+ SEG_LVL_ALT_Q, // Use alternate Quantizer ....
+ SEG_LVL_ALT_LF_Y_V, // Use alternate loop filter value on y plane vertical
+ SEG_LVL_ALT_LF_Y_H, // Use alternate loop filter value on y plane horizontal
+ SEG_LVL_ALT_LF_U, // Use alternate loop filter value on u plane
+ SEG_LVL_ALT_LF_V, // Use alternate loop filter value on v plane
+ SEG_LVL_REF_FRAME, // Optional Segment reference frame
+ SEG_LVL_SKIP, // Optional Segment (0,0) + skip mode
+ SEG_LVL_GLOBALMV,
+ SEG_LVL_MAX
+} SEG_LVL_FEATURES;
+
+struct segmentation {
+ uint8_t enabled;
+ uint8_t update_map;
+ uint8_t update_data;
+ uint8_t temporal_update;
+
+ int16_t feature_data[MAX_SEGMENTS][SEG_LVL_MAX];
+ unsigned int feature_mask[MAX_SEGMENTS];
+ int last_active_segid; // The highest numbered segment id that has some
+ // enabled feature.
+ uint8_t segid_preskip; // Whether the segment id will be read before the
+ // skip syntax element.
+ // 1: the segment id will be read first.
+ // 0: the skip syntax element will be read first.
+};
+
+struct segmentation_probs {
+ aom_cdf_prob tree_cdf[CDF_SIZE(MAX_SEGMENTS)];
+ aom_cdf_prob pred_cdf[SEG_TEMPORAL_PRED_CTXS][CDF_SIZE(2)];
+ aom_cdf_prob spatial_pred_seg_cdf[SPATIAL_PREDICTION_PROBS]
+ [CDF_SIZE(MAX_SEGMENTS)];
+};
+
+// Returns non-zero when segmentation is enabled and bit `feature_id` is set
+// in the feature mask of segment `segment_id`; returns 0 otherwise.
+static INLINE int segfeature_active(const struct segmentation *seg,
+ int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->enabled && (seg->feature_mask[segment_id] & (1 << feature_id));
+}
+
+// Copies the per-segment feature masks and feature data, together with the
+// derived segid_preskip / last_active_segid fields, from `src` to `dst`.
+// The enabled / update_* / temporal_update flags of `dst` are not modified.
+static INLINE void segfeatures_copy(struct segmentation *dst,
+                                    const struct segmentation *src) {
+  for (int seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
+    dst->feature_mask[seg_id] = src->feature_mask[seg_id];
+    for (int feat = 0; feat < SEG_LVL_MAX; ++feat)
+      dst->feature_data[seg_id][feat] = src->feature_data[seg_id][feat];
+  }
+  dst->last_active_segid = src->last_active_segid;
+  dst->segid_preskip = src->segid_preskip;
+}
+
+void av1_clearall_segfeatures(struct segmentation *seg);
+
+void av1_enable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void calculate_segdata(struct segmentation *seg);
+
+int av1_seg_feature_data_max(SEG_LVL_FEATURES feature_id);
+
+int av1_is_segfeature_signed(SEG_LVL_FEATURES feature_id);
+
+void av1_set_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id, int seg_data);
+
+// Returns the stored data value of feature `feature_id` for segment
+// `segment_id` (the int16_t slot is widened to int). It does not check
+// whether the feature is actually enabled for that segment.
+static INLINE int get_segdata(const struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ return seg->feature_data[segment_id][feature_id];
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_SEG_COMMON_H_
diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c
new file mode 100644
index 0000000000..8df4c9a09d
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.c
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/reconinter.h"
+
+// Pick the loop-filter sync granularity (nsync) from the frame width.
+static INLINE int get_sync_range(int width) {
+  // The thresholds were picked by testing; e.g. for 4k video a value of 4
+  // gives the best performance.
+  if (width < 640) return 1;
+  if (width <= 1280) return 2;
+  if (width <= 4096) return 4;
+  return 8;
+}
+
+// Sync granularity (nsync) for loop restoration.
+// The width-based table (same thresholds as get_sync_range()) is compiled
+// out with `#if 0`; the active branch ignores `width` and always returns 1.
+static INLINE int get_lr_sync_range(int width) {
+#if 0
+ // nsync numbers are picked by testing. For example, for 4k
+ // video, using 4 gives best performance.
+ if (width < 640)
+ return 1;
+ else if (width <= 1280)
+ return 2;
+ else if (width <= 4096)
+ return 4;
+ else
+ return 8;
+#else
+ (void)width;
+ return 1;
+#endif
+}
+
+// Allocate memory for lf row synchronization
+//
+// Fills in `lf_sync` for `rows` superblock rows and `num_workers` workers:
+//  - (CONFIG_MULTITHREAD only) per plane, `rows` mutexes and `rows`
+//    condition variables, plus one job-queue mutex, all initialized here;
+//  - `num_workers` LFWorkerData slots;
+//  - per plane, a `rows`-entry cur_sb_col progress array (contents are NOT
+//    initialized here);
+//  - a job queue with room for rows * MAX_MB_PLANE * 2 entries.
+// sync_range is derived from `width` via get_sync_range().
+// Allocation failures go through CHECK_MEM_ERROR (defined elsewhere;
+// presumably reports the error through `cm` -- confirm).
+static void loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows,
+ int width, int num_workers) {
+ lf_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ CHECK_MEM_ERROR(cm, lf_sync->mutex_[j],
+ aom_malloc(sizeof(*(lf_sync->mutex_[j])) * rows));
+ if (lf_sync->mutex_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->cond_[j],
+ aom_malloc(sizeof(*(lf_sync->cond_[j])) * rows));
+ if (lf_sync->cond_[j]) {
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->job_mutex,
+ aom_malloc(sizeof(*(lf_sync->job_mutex))));
+ if (lf_sync->job_mutex) {
+ pthread_mutex_init(lf_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lf_sync->lfdata,
+ aom_malloc(num_workers * sizeof(*(lf_sync->lfdata))));
+ lf_sync->num_workers = num_workers;
+
+ for (int j = 0; j < MAX_MB_PLANE; j++) {
+ CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lf_sync->cur_sb_col[j])) * rows));
+ }
+ // Two filtering directions per (row, plane) pair, hence the factor 2.
+ CHECK_MEM_ERROR(
+ cm, lf_sync->job_queue,
+ aom_malloc(sizeof(*(lf_sync->job_queue)) * rows * MAX_MB_PLANE * 2));
+ // Set up nsync.
+ lf_sync->sync_range = get_sync_range(width);
+}
+
+// Deallocate lf synchronization related mutex and data
+//
+// Safe to call with lf_sync == NULL (no-op). Destroys and frees the per-plane
+// mutex/condvar arrays (lf_sync->rows entries each) and the job mutex when
+// CONFIG_MULTITHREAD is set, frees lfdata, the cur_sb_col arrays and the job
+// queue, then zeroes the whole structure.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync) {
+ if (lf_sync != NULL) {
+ int j;
+#if CONFIG_MULTITHREAD
+ int i;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ if (lf_sync->mutex_[j] != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->mutex_[j][i]);
+ }
+ aom_free(lf_sync->mutex_[j]);
+ }
+ if (lf_sync->cond_[j] != NULL) {
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->cond_[j][i]);
+ }
+ aom_free(lf_sync->cond_[j]);
+ }
+ }
+ if (lf_sync->job_mutex != NULL) {
+ pthread_mutex_destroy(lf_sync->job_mutex);
+ aom_free(lf_sync->job_mutex);
+ }
+#endif // CONFIG_MULTITHREAD
+ aom_free(lf_sync->lfdata);
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ aom_free(lf_sync->cur_sb_col[j]);
+ }
+
+ aom_free(lf_sync->job_queue);
+ // clear the structure as the source of this call may be a resize in which
+ // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*lf_sync);
+ }
+}
+
+// Initializes one worker's LFWorkerData: stores the frame buffer, `cm` and
+// `xd` pointers and, for each of the MAX_MB_PLANE planes, copies the `dst`
+// buffer descriptor and the subsampling factors from xd->plane.
+static void loop_filter_data_reset(LFWorkerData *lf_data,
+ YV12_BUFFER_CONFIG *frame_buffer,
+ struct AV1Common *cm, MACROBLOCKD *xd) {
+ struct macroblockd_plane *pd = xd->plane;
+ lf_data->frame_buffer = frame_buffer;
+ lf_data->cm = cm;
+ lf_data->xd = xd;
+ for (int i = 0; i < MAX_MB_PLANE; i++) {
+ memcpy(&lf_data->planes[i].dst, &pd[i].dst, sizeof(lf_data->planes[i].dst));
+ lf_data->planes[i].subsampling_x = pd[i].subsampling_x;
+ lf_data->planes[i].subsampling_y = pd[i].subsampling_y;
+ }
+}
+
+// Blocks until row r - 1 of `plane` has published enough progress for column
+// `c` of row `r` to proceed.
+//
+// Nothing happens for r == 0, or when `c` has any of the low bits of
+// (nsync - 1) set (the mask test relies on nsync being a power of two).
+// Otherwise the caller waits on row r - 1's condition variable while
+// c > cur_sb_col[plane][r - 1] - nsync.
+// Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_read(AV1LfSync *const lf_sync, int r, int c,
+ int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &lf_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ // Loop guards against spurious wakeups and early broadcasts.
+ while (c > lf_sync->cur_sb_col[plane][r - 1] - nsync) {
+ pthread_cond_wait(&lf_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Publishes progress of row `r` of `plane` after column `c` was processed and
+// wakes any waiters on that row.
+//
+// For columns before the last one, progress is only published when `c` is a
+// multiple of nsync. For the last column (c >= sb_cols - 1) the stored value
+// is sb_cols + nsync, which satisfies the wait condition in sync_read() for
+// every column index below sb_cols.
+// Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ const int nsync = lf_sync->sync_range;
+ int cur;
+ // Only signal when there are enough filtered SB for next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&lf_sync->mutex_[plane][r]);
+
+ lf_sync->cur_sb_col[plane][r] = cur;
+
+ pthread_cond_broadcast(&lf_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&lf_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lf_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Rebuilds the loop-filter job queue in lf_sync->job_queue and resets the
+// enqueue/dequeue counters.
+//
+// One job is created per (dir, plane, mi_row) with mi_row stepping from
+// `start` to `stop` (exclusive) by MAX_MIB_SIZE. All dir == 0 jobs are queued
+// before any dir == 1 job. Planes in [plane_start, plane_end) are skipped
+// when their filter level is zero; when plane 0 is visited and both of its
+// filter levels are zero, the `break` abandons the remaining planes of that
+// direction as well.
+static void enqueue_lf_jobs(AV1LfSync *lf_sync, AV1_COMMON *cm, int start,
+ int stop, int plane_start, int plane_end) {
+ int mi_row, plane, dir;
+ AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
+ lf_sync->jobs_enqueued = 0;
+ lf_sync->jobs_dequeued = 0;
+
+ for (dir = 0; dir < 2; dir++) {
+ for (plane = plane_start; plane < plane_end; plane++) {
+ if (plane == 0 && !(cm->lf.filter_level[0]) && !(cm->lf.filter_level[1]))
+ break;
+ else if (plane == 1 && !(cm->lf.filter_level_u))
+ continue;
+ else if (plane == 2 && !(cm->lf.filter_level_v))
+ continue;
+ for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+ lf_job_queue->mi_row = mi_row;
+ lf_job_queue->plane = plane;
+ lf_job_queue->dir = dir;
+ lf_job_queue++;
+ lf_sync->jobs_enqueued++;
+ }
+ }
+ }
+}
+
+// Pops the next pending loop-filter job from the queue under job_mutex.
+// Returns a pointer into lf_sync->job_queue, or NULL once every enqueued job
+// has been handed out.
+// NOTE: when CONFIG_MULTITHREAD is 0 the queue is never consulted and this
+// always returns NULL.
+AV1LfMTInfo *get_lf_job_info(AV1LfSync *lf_sync) {
+ AV1LfMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lf_sync->job_mutex);
+
+ if (lf_sync->jobs_dequeued < lf_sync->jobs_enqueued) {
+ cur_job_info = lf_sync->job_queue + lf_sync->jobs_dequeued;
+ lf_sync->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(lf_sync->job_mutex);
+#else
+ (void)lf_sync;
+#endif
+
+ return cur_job_info;
+}
+
+// Implement row loopfiltering for each thread.
+//
+// Repeatedly pulls jobs from the shared queue until it is empty. Each job
+// names a superblock row (mi_row), a plane and a direction:
+//  - dir == 0: filter each superblock column with
+//    av1_filter_block_plane_vert() and publish progress through sync_write();
+//  - dir == 1: for each superblock column, wait through sync_read() on the
+//    dir == 0 progress of rows r - 1 and r, then call
+//    av1_filter_block_plane_horz().
+static INLINE void thread_loop_filter_rows(
+ const YV12_BUFFER_CONFIG *const frame_buffer, AV1_COMMON *const cm,
+ struct macroblockd_plane *planes, MACROBLOCKD *xd,
+ AV1LfSync *const lf_sync) {
+ const int sb_cols =
+ ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ int mi_row, mi_col, plane, dir;
+ int r, c;
+
+ while (1) {
+ AV1LfMTInfo *cur_job_info = get_lf_job_info(lf_sync);
+
+ if (cur_job_info != NULL) {
+ mi_row = cur_job_info->mi_row;
+ plane = cur_job_info->plane;
+ dir = cur_job_info->dir;
+ r = mi_row >> MAX_MIB_SIZE_LOG2;
+
+ if (dir == 0) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + 1);
+
+ av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ sync_write(lf_sync, r, c, sb_cols, plane);
+ }
+ } else if (dir == 1) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+ c = mi_col >> MAX_MIB_SIZE_LOG2;
+
+ // Wait for vertical edge filtering of the top-right block to be
+ // completed
+ sync_read(lf_sync, r, c, plane);
+
+ // Wait for vertical edge filtering of the right block to be
+ // completed
+ // (passing r + 1 makes sync_read() wait on row r's own progress)
+ sync_read(lf_sync, r + 1, c, plane);
+
+ av1_setup_dst_planes(planes, cm->seq_params.sb_size, frame_buffer,
+ mi_row, mi_col, plane, plane + 1);
+ av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+ mi_col);
+ }
+ }
+ } else {
+ break;
+ }
+ }
+}
+
+// Row-based multi-threaded loopfilter hook
+//
+// arg1 is the shared AV1LfSync, arg2 this worker's LFWorkerData (both set up
+// in loop_filter_rows_mt()). Always returns 1.
+static int loop_filter_row_worker(void *arg1, void *arg2) {
+ AV1LfSync *const lf_sync = (AV1LfSync *)arg1;
+ LFWorkerData *const lf_data = (LFWorkerData *)arg2;
+ thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->xd, lf_sync);
+ return 1;
+}
+
+// Runs the loop filter over mi rows [start, stop) for planes
+// [plane_start, plane_end) using `nworkers` workers.
+//
+// Steps: (re)allocate `lf_sync` when it is unset, the superblock row count
+// changed, or more workers are requested than it was sized for; reset the
+// per-row progress markers; build the job queue; start all workers; wait for
+// all of them.
+static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int start, int stop,
+ int plane_start, int plane_end,
+ AVxWorker *workers, int nworkers,
+ AV1LfSync *lf_sync) {
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ // Number of superblock rows and cols
+ const int sb_rows =
+ ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+ const int num_workers = nworkers;
+ int i;
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ av1_loop_filter_dealloc(lf_sync);
+ loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ // (memset with -1 sets every byte to 0xFF, which reads back as -1 for int.)
+ for (i = 0; i < MAX_MB_PLANE; i++) {
+ memset(lf_sync->cur_sb_col[i], -1,
+ sizeof(*(lf_sync->cur_sb_col[i])) * sb_rows);
+ }
+
+ enqueue_lf_jobs(lf_sync, cm, start, stop, plane_start, plane_end);
+
+ // Set up loopfilter thread data.
+ for (i = 0; i < num_workers; ++i) {
+ AVxWorker *const worker = &workers[i];
+ LFWorkerData *const lf_data = &lf_sync->lfdata[i];
+
+ worker->hook = loop_filter_row_worker;
+ worker->data1 = lf_sync;
+ worker->data2 = lf_data;
+
+ // Loopfilter data
+ loop_filter_data_reset(lf_data, frame, cm, xd);
+
+ // Start loopfiltering
+ // The last worker goes through execute() instead of launch()
+ // (presumably running on the calling thread -- confirm against the
+ // worker interface).
+ if (i == num_workers - 1) {
+ winterface->execute(worker);
+ } else {
+ winterface->launch(worker);
+ }
+ }
+
+ // Wait till all rows are finished
+ for (i = 0; i < num_workers; ++i) {
+ winterface->sync(&workers[i]);
+ }
+}
+
+// Multi-threaded loop filter entry point for one frame.
+//
+// By default all cm->mi_rows rows are filtered. When `partial_frame` is set
+// and the frame has more than 8 mi rows, filtering starts at the middle row
+// rounded down to a multiple of 8 and covers max(mi_rows / 8, 8) rows.
+// Calls av1_loop_filter_frame_init() and then hands the row range to
+// loop_filter_rows_mt().
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane_start, int plane_end,
+ int partial_frame, AVxWorker *workers,
+ int num_workers, AV1LfSync *lf_sync) {
+ int start_mi_row, end_mi_row, mi_rows_to_filter;
+
+ start_mi_row = 0;
+ mi_rows_to_filter = cm->mi_rows;
+ if (partial_frame && cm->mi_rows > 8) {
+ start_mi_row = cm->mi_rows >> 1;
+ start_mi_row &= 0xfffffff8;
+ mi_rows_to_filter = AOMMAX(cm->mi_rows / 8, 8);
+ }
+ end_mi_row = start_mi_row + mi_rows_to_filter;
+ av1_loop_filter_frame_init(cm, plane_start, plane_end);
+
+ loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
+ plane_end, workers, num_workers, lf_sync);
+}
+
+// Loop-restoration counterpart of sync_read(): `lr_sync` is an AV1LrSync
+// passed as void * so the function fits the sync callback signature used in
+// loop_restoration_row_worker().
+// For r != 0 and `c` with none of the low (nsync - 1) bits set, waits on row
+// r - 1's condition variable while c > cur_sb_col[plane][r - 1] - nsync.
+// Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void lr_sync_read(void *const lr_sync, int r, int c, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+
+ if (r && !(c & (nsync - 1))) {
+ pthread_mutex_t *const mutex = &loop_res_sync->mutex_[plane][r - 1];
+ pthread_mutex_lock(mutex);
+
+ while (c > loop_res_sync->cur_sb_col[plane][r - 1] - nsync) {
+ pthread_cond_wait(&loop_res_sync->cond_[plane][r - 1], mutex);
+ }
+ pthread_mutex_unlock(mutex);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Loop-restoration counterpart of sync_write(): `lr_sync` is an AV1LrSync
+// passed as void *.
+// Publishes progress for row `r` of `plane`: column `c` itself when it is a
+// multiple of nsync, or sb_cols + nsync once the last column
+// (c >= sb_cols - 1) is done, then broadcasts on the row's condition variable.
+// Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void lr_sync_write(void *const lr_sync, int r, int c,
+ const int sb_cols, int plane) {
+#if CONFIG_MULTITHREAD
+ AV1LrSync *const loop_res_sync = (AV1LrSync *)lr_sync;
+ const int nsync = loop_res_sync->sync_range;
+ int cur;
+ // Only signal when there are enough filtered SB for next row to run.
+ int sig = 1;
+
+ if (c < sb_cols - 1) {
+ cur = c;
+ if (c % nsync) sig = 0;
+ } else {
+ cur = sb_cols + nsync;
+ }
+
+ if (sig) {
+ pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]);
+
+ loop_res_sync->cur_sb_col[plane][r] = cur;
+
+ pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]);
+ pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]);
+ }
+#else
+ (void)lr_sync;
+ (void)r;
+ (void)c;
+ (void)sb_cols;
+ (void)plane;
+#endif // CONFIG_MULTITHREAD
+}
+
+// Allocate memory for loop restoration row synchronization
+//
+// Fills in `lr_sync` for `num_rows_lr` restoration-unit rows, `num_planes`
+// planes and `num_workers` workers:
+//  - (CONFIG_MULTITHREAD only) per plane, `num_rows_lr` mutexes and condition
+//    variables, plus one job-queue mutex;
+//  - `num_workers` LRWorkerData slots. Workers 0 .. num_workers - 2 each get
+//    their own rst_tmpbuf and RestorationLineBuffers; the LAST worker borrows
+//    cm->rst_tmpbuf and cm->rlbs, which are therefore not owned by lr_sync;
+//  - per plane, a `num_rows_lr`-entry cur_sb_col array (not initialized here);
+//  - a job queue with room for num_rows_lr * num_planes entries.
+// lr_sync->num_workers is only written after the per-worker buffers have been
+// set up. Allocation failures go through CHECK_MEM_ERROR (defined elsewhere).
+static void loop_restoration_alloc(AV1LrSync *lr_sync, AV1_COMMON *cm,
+ int num_workers, int num_rows_lr,
+ int num_planes, int width) {
+ lr_sync->rows = num_rows_lr;
+ lr_sync->num_planes = num_planes;
+#if CONFIG_MULTITHREAD
+ {
+ int i, j;
+
+ for (j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(cm, lr_sync->mutex_[j],
+ aom_malloc(sizeof(*(lr_sync->mutex_[j])) * num_rows_lr));
+ if (lr_sync->mutex_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_mutex_init(&lr_sync->mutex_[j][i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->cond_[j],
+ aom_malloc(sizeof(*(lr_sync->cond_[j])) * num_rows_lr));
+ if (lr_sync->cond_[j]) {
+ for (i = 0; i < num_rows_lr; ++i) {
+ pthread_cond_init(&lr_sync->cond_[j][i], NULL);
+ }
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lr_sync->job_mutex,
+ aom_malloc(sizeof(*(lr_sync->job_mutex))));
+ if (lr_sync->job_mutex) {
+ pthread_mutex_init(lr_sync->job_mutex, NULL);
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata,
+ aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata))));
+
+ for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) {
+ if (worker_idx < num_workers - 1) {
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rst_tmpbuf,
+ (int32_t *)aom_memalign(16, RESTORATION_TMPBUF_SIZE));
+ CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata[worker_idx].rlbs,
+ aom_malloc(sizeof(RestorationLineBuffers)));
+
+ } else {
+ lr_sync->lrworkerdata[worker_idx].rst_tmpbuf = cm->rst_tmpbuf;
+ lr_sync->lrworkerdata[worker_idx].rlbs = cm->rlbs;
+ }
+ }
+
+ lr_sync->num_workers = num_workers;
+
+ for (int j = 0; j < num_planes; j++) {
+ CHECK_MEM_ERROR(
+ cm, lr_sync->cur_sb_col[j],
+ aom_malloc(sizeof(*(lr_sync->cur_sb_col[j])) * num_rows_lr));
+ }
+ CHECK_MEM_ERROR(
+ cm, lr_sync->job_queue,
+ aom_malloc(sizeof(*(lr_sync->job_queue)) * num_rows_lr * num_planes));
+ // Set up nsync.
+ lr_sync->sync_range = get_lr_sync_range(width);
+}
+
+// Deallocate loop restoration synchronization related mutex and data
+//
+// Safe to call with lr_sync == NULL (no-op). Frees the synchronization
+// primitives, the cur_sb_col arrays, the job queue and the worker data, then
+// zeroes the structure.
+//
+// `num_workers` controls which per-worker buffers are freed: entries
+// 0 .. num_workers - 2 of lr_sync->lrworkerdata. The last entry is skipped
+// because loop_restoration_alloc() points it at cm->rst_tmpbuf / cm->rlbs.
+// NOTE(review): `num_workers` therefore has to be the count used when the
+// data was allocated; a larger value indexes past the array, a smaller one
+// leaks buffers.
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers) {
+ if (lr_sync != NULL) {
+ int j;
+#if CONFIG_MULTITHREAD
+ int i;
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ if (lr_sync->mutex_[j] != NULL) {
+ for (i = 0; i < lr_sync->rows; ++i) {
+ pthread_mutex_destroy(&lr_sync->mutex_[j][i]);
+ }
+ aom_free(lr_sync->mutex_[j]);
+ }
+ if (lr_sync->cond_[j] != NULL) {
+ for (i = 0; i < lr_sync->rows; ++i) {
+ pthread_cond_destroy(&lr_sync->cond_[j][i]);
+ }
+ aom_free(lr_sync->cond_[j]);
+ }
+ }
+ if (lr_sync->job_mutex != NULL) {
+ pthread_mutex_destroy(lr_sync->job_mutex);
+ aom_free(lr_sync->job_mutex);
+ }
+#endif // CONFIG_MULTITHREAD
+ for (j = 0; j < MAX_MB_PLANE; j++) {
+ aom_free(lr_sync->cur_sb_col[j]);
+ }
+
+ aom_free(lr_sync->job_queue);
+
+ if (lr_sync->lrworkerdata) {
+ for (int worker_idx = 0; worker_idx < num_workers - 1; worker_idx++) {
+ LRWorkerData *const workerdata_data =
+ lr_sync->lrworkerdata + worker_idx;
+
+ aom_free(workerdata_data->rst_tmpbuf);
+ aom_free(workerdata_data->rlbs);
+ }
+ aom_free(lr_sync->lrworkerdata);
+ }
+
+ // clear the structure as the source of this call may be a resize in which
+ // case this call will be followed by an _alloc() which may fail.
+ av1_zero(*lr_sync);
+ }
+}
+
+// Rebuilds the loop-restoration job queue in lr_sync->job_queue and resets
+// the enqueue/dequeue counters.
+//
+// One job is created per restoration-unit row of every plane whose
+// frame_restoration_type is not RESTORE_NONE. Jobs for even unit rows are
+// written to the front of the queue (starting at index 0); jobs for odd unit
+// rows are written after them (starting at index num_even_lr_jobs), so all
+// even-row jobs are dequeued before any odd-row job.
+// sync_mode is the row parity (i & 1). v_copy_start / v_copy_end give the row
+// range later copied back by the worker: even rows shrink their range by
+// RESTORATION_BORDER (except at the tile top/bottom), odd rows grow it by
+// RESTORATION_BORDER, clamped to the tile.
+static void enqueue_lr_jobs(AV1LrSync *lr_sync, AV1LrStruct *lr_ctxt,
+ AV1_COMMON *cm) {
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+ const int num_planes = av1_num_planes(cm);
+ AV1LrMTInfo *lr_job_queue = lr_sync->job_queue;
+ int32_t lr_job_counter[2], num_even_lr_jobs = 0;
+ lr_sync->jobs_enqueued = 0;
+ lr_sync->jobs_dequeued = 0;
+
+ // First pass: count the even unit rows over all active planes.
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ num_even_lr_jobs =
+ num_even_lr_jobs + ((ctxt[plane].rsi->vert_units_per_tile + 1) >> 1);
+ }
+ // Write cursors: [0] for even rows, [1] for odd rows.
+ lr_job_counter[0] = 0;
+ lr_job_counter[1] = num_even_lr_jobs;
+
+ for (int plane = 0; plane < num_planes; plane++) {
+ if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+ const int is_uv = plane > 0;
+ const int ss_y = is_uv && cm->seq_params.subsampling_y;
+
+ AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+ const int unit_size = ctxt[plane].rsi->restoration_unit_size;
+
+ const int tile_h = tile_rect.bottom - tile_rect.top;
+ const int ext_size = unit_size * 3 / 2;
+
+ int y0 = 0, i = 0;
+ while (y0 < tile_h) {
+ int remaining_h = tile_h - y0;
+ // A remainder shorter than 1.5 units is folded into the current unit.
+ int h = (remaining_h < ext_size) ? remaining_h : unit_size;
+
+ RestorationTileLimits limits;
+ limits.v_start = tile_rect.top + y0;
+ limits.v_end = tile_rect.top + y0 + h;
+ assert(limits.v_end <= tile_rect.bottom);
+ // Offset the tile upwards to align with the restoration processing stripe
+ const int voffset = RESTORATION_UNIT_OFFSET >> ss_y;
+ limits.v_start = AOMMAX(tile_rect.top, limits.v_start - voffset);
+ if (limits.v_end < tile_rect.bottom) limits.v_end -= voffset;
+
+ assert(lr_job_counter[0] <= num_even_lr_jobs);
+
+ lr_job_queue[lr_job_counter[i & 1]].lr_unit_row = i;
+ lr_job_queue[lr_job_counter[i & 1]].plane = plane;
+ lr_job_queue[lr_job_counter[i & 1]].v_start = limits.v_start;
+ lr_job_queue[lr_job_counter[i & 1]].v_end = limits.v_end;
+ lr_job_queue[lr_job_counter[i & 1]].sync_mode = i & 1;
+ if ((i & 1) == 0) {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ limits.v_start + RESTORATION_BORDER;
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ limits.v_end - RESTORATION_BORDER;
+ if (i == 0) {
+ assert(limits.v_start == tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start = tile_rect.top;
+ }
+ if (i == (ctxt[plane].rsi->vert_units_per_tile - 1)) {
+ assert(limits.v_end == tile_rect.bottom);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end = tile_rect.bottom;
+ }
+ } else {
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_start =
+ AOMMAX(limits.v_start - RESTORATION_BORDER, tile_rect.top);
+ lr_job_queue[lr_job_counter[i & 1]].v_copy_end =
+ AOMMIN(limits.v_end + RESTORATION_BORDER, tile_rect.bottom);
+ }
+ lr_job_counter[i & 1]++;
+ lr_sync->jobs_enqueued++;
+
+ y0 += h;
+ ++i;
+ }
+ }
+}
+
+// Pops the next pending loop-restoration job from the queue under job_mutex.
+// Returns a pointer into lr_sync->job_queue, or NULL once every enqueued job
+// has been handed out.
+// NOTE: when CONFIG_MULTITHREAD is 0 the queue is never consulted and this
+// always returns NULL.
+AV1LrMTInfo *get_lr_job_info(AV1LrSync *lr_sync) {
+ AV1LrMTInfo *cur_job_info = NULL;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(lr_sync->job_mutex);
+
+ if (lr_sync->jobs_dequeued < lr_sync->jobs_enqueued) {
+ cur_job_info = lr_sync->job_queue + lr_sync->jobs_dequeued;
+ lr_sync->jobs_dequeued++;
+ }
+
+ pthread_mutex_unlock(lr_sync->job_mutex);
+#else
+ (void)lr_sync;
+#endif
+
+ return cur_job_info;
+}
+
+// Implement row loop restoration for each thread.
+//
+// Worker hook: arg1 is the shared AV1LrSync, arg2 this worker's LRWorkerData.
+// Pulls jobs until the queue is empty. For each job it restores one row of
+// restoration units via av1_foreach_rest_unit_in_row() and then copies rows
+// [v_copy_start, v_copy_end) of the plane from lr_ctxt->dst to lr_ctxt->frame
+// with the plane's partial-copy function. Always returns 1.
+static int loop_restoration_row_worker(void *arg1, void *arg2) {
+ AV1LrSync *const lr_sync = (AV1LrSync *)arg1;
+ LRWorkerData *lrworkerdata = (LRWorkerData *)arg2;
+ AV1LrStruct *lr_ctxt = (AV1LrStruct *)lrworkerdata->lr_ctxt;
+ FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+ int lr_unit_row;
+ int plane;
+ const int tile_row = LR_TILE_ROW;
+ const int tile_col = LR_TILE_COL;
+ const int tile_cols = LR_TILE_COLS;
+ const int tile_idx = tile_col + tile_row * tile_cols;
+ typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc, int hstart, int hend,
+ int vstart, int vend);
+ // Indexed by plane: 0 -> y, 1 -> u, 2 -> v.
+ static const copy_fun copy_funs[3] = {
+ aom_yv12_partial_copy_y, aom_yv12_partial_copy_u, aom_yv12_partial_copy_v
+ };
+
+ while (1) {
+ AV1LrMTInfo *cur_job_info = get_lr_job_info(lr_sync);
+ if (cur_job_info != NULL) {
+ RestorationTileLimits limits;
+ sync_read_fn_t on_sync_read;
+ sync_write_fn_t on_sync_write;
+ limits.v_start = cur_job_info->v_start;
+ limits.v_end = cur_job_info->v_end;
+ lr_unit_row = cur_job_info->lr_unit_row;
+ plane = cur_job_info->plane;
+ const int unit_idx0 = tile_idx * ctxt[plane].rsi->units_per_tile;
+
+ // sync_mode == 1 implies only sync read is required in LR Multi-threading
+ // sync_mode == 0 implies only sync write is required.
+ on_sync_read =
+ cur_job_info->sync_mode == 1 ? lr_sync_read : av1_lr_sync_read_dummy;
+ on_sync_write = cur_job_info->sync_mode == 0 ? lr_sync_write
+ : av1_lr_sync_write_dummy;
+
+ av1_foreach_rest_unit_in_row(
+ &limits, &(ctxt[plane].tile_rect), lr_ctxt->on_rest_unit, lr_unit_row,
+ ctxt[plane].rsi->restoration_unit_size, unit_idx0,
+ ctxt[plane].rsi->horz_units_per_tile,
+ ctxt[plane].rsi->vert_units_per_tile, plane, &ctxt[plane],
+ lrworkerdata->rst_tmpbuf, lrworkerdata->rlbs, on_sync_read,
+ on_sync_write, lr_sync);
+
+ copy_funs[plane](lr_ctxt->dst, lr_ctxt->frame, ctxt[plane].tile_rect.left,
+ ctxt[plane].tile_rect.right, cur_job_info->v_copy_start,
+ cur_job_info->v_copy_end);
+ } else {
+ break;
+ }
+ }
+ return 1;
+}
+
+// Runs loop restoration for all planes of the frame using `nworkers` workers.
+//
+// Steps:
+//  1. Find the largest number of restoration-unit rows over all planes that
+//     have restoration enabled.
+//  2. (Re)allocate `lr_sync` when it is unset, or when the row count, plane
+//     count or required worker count no longer fits the existing allocation.
+//  3. Reset the per-row progress markers, build the job queue, start every
+//     worker on loop_restoration_row_worker() and wait for all of them.
+static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt,
+                                           AVxWorker *workers, int nworkers,
+                                           AV1LrSync *lr_sync, AV1_COMMON *cm) {
+  FilterFrameCtxt *ctxt = lr_ctxt->ctxt;
+
+  const int num_planes = av1_num_planes(cm);
+
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int num_rows_lr = 0;
+
+  for (int plane = 0; plane < num_planes; plane++) {
+    if (cm->rst_info[plane].frame_restoration_type == RESTORE_NONE) continue;
+
+    const AV1PixelRect tile_rect = ctxt[plane].tile_rect;
+    const int max_tile_h = tile_rect.bottom - tile_rect.top;
+
+    const int unit_size = cm->rst_info[plane].restoration_unit_size;
+
+    num_rows_lr =
+        AOMMAX(num_rows_lr, av1_lr_count_units_in_tile(unit_size, max_tile_h));
+  }
+
+  const int num_workers = nworkers;
+  int i;
+  assert(MAX_MB_PLANE == 3);
+
+  if (!lr_sync->sync_range || num_rows_lr != lr_sync->rows ||
+      num_workers > lr_sync->num_workers || num_planes != lr_sync->num_planes) {
+    // Release the old buffers using the worker count they were ALLOCATED with
+    // (lr_sync->num_workers), not the newly requested count. Passing the new
+    // count would index past the end of lrworkerdata when the worker count
+    // grew (and free the borrowed cm->rst_tmpbuf / cm->rlbs held by the old
+    // last slot), or leak per-worker buffers when it shrank.
+    av1_loop_restoration_dealloc(lr_sync, lr_sync->num_workers);
+    loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes,
+                           cm->width);
+  }
+
+  // Initialize cur_sb_col to -1 for all SB rows.
+  for (i = 0; i < num_planes; i++) {
+    memset(lr_sync->cur_sb_col[i], -1,
+           sizeof(*(lr_sync->cur_sb_col[i])) * num_rows_lr);
+  }
+
+  enqueue_lr_jobs(lr_sync, lr_ctxt, cm);
+
+  // Set up looprestoration thread data.
+  for (i = 0; i < num_workers; ++i) {
+    AVxWorker *const worker = &workers[i];
+    lr_sync->lrworkerdata[i].lr_ctxt = (void *)lr_ctxt;
+    worker->hook = loop_restoration_row_worker;
+    worker->data1 = lr_sync;
+    worker->data2 = &lr_sync->lrworkerdata[i];
+
+    // Start loop restoration; the last worker goes through execute() instead
+    // of launch().
+    if (i == num_workers - 1) {
+      winterface->execute(worker);
+    } else {
+      winterface->launch(worker);
+    }
+  }
+
+  // Wait till all rows are finished
+  for (i = 0; i < num_workers; ++i) {
+    winterface->sync(&workers[i]);
+  }
+}
+
+// Multi-threaded loop restoration entry point for one frame.
+//
+// `lr_ctxt` is an AV1LrStruct passed as void *. The frame must not be
+// all-lossless (asserted). Initializes the restoration context with
+// av1_loop_restoration_filter_frame_init() and then runs
+// foreach_rest_unit_in_planes_mt() with the supplied workers.
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ AV1_COMMON *cm, int optimized_lr,
+ AVxWorker *workers, int num_workers,
+ AV1LrSync *lr_sync, void *lr_ctxt) {
+ assert(!cm->all_lossless);
+
+ const int num_planes = av1_num_planes(cm);
+
+ AV1LrStruct *loop_rest_ctxt = (AV1LrStruct *)lr_ctxt;
+
+ av1_loop_restoration_filter_frame_init(loop_rest_ctxt, frame, cm,
+ optimized_lr, num_planes);
+
+ foreach_rest_unit_in_planes_mt(loop_rest_ctxt, workers, num_workers, lr_sync,
+ cm);
+}
diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h
new file mode 100644
index 0000000000..23d61d72a3
--- /dev/null
+++ b/third_party/aom/av1/common/thread_common.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_THREAD_COMMON_H_
+#define AOM_AV1_COMMON_THREAD_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+
+typedef struct AV1LfMTInfo {  // element type of AV1LfSync::job_queue
+ int mi_row;  // NOTE(review): presumably the mi row the job starts at - confirm
+ int plane;  // NOTE(review): presumably the plane to filter - confirm
+ int dir;  // NOTE(review): presumably the filtering direction - confirm
+} AV1LfMTInfo;
+
+// Loopfilter row synchronization state shared by the loop filter workers.
+typedef struct AV1LfSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];  // one pointer per plane
+ pthread_cond_t *cond_[MAX_MB_PLANE];  // one pointer per plane
+#endif
+ // Per plane, the loop-filtered superblock index reached in each row.
+ int *cur_sb_col[MAX_MB_PLANE];
+ // The optimal sync_range for different resolution and platform should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;
+ int rows;
+
+ // Row-based parallel loopfilter data
+ LFWorkerData *lfdata;
+ int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;  // NOTE(review): presumably guards the job queue - confirm
+#endif
+ AV1LfMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+} AV1LfSync;
+
+typedef struct AV1LrMTInfo {  // element type of AV1LrSync::job_queue
+ int v_start;  // NOTE(review): presumably first row of the job's range - confirm
+ int v_end;  // NOTE(review): presumably end row of the job's range - confirm
+ int lr_unit_row;
+ int plane;
+ int sync_mode;
+ int v_copy_start;
+ int v_copy_end;
+} AV1LrMTInfo;
+
+typedef struct LoopRestorationWorkerData {  // handed to each worker as data2
+ int32_t *rst_tmpbuf;
+ void *rlbs;
+ void *lr_ctxt;  // set to the frame's restoration context for every worker
+} LRWorkerData;
+
+// Loop restoration row synchronization state shared by the workers.
+typedef struct AV1LrSyncData {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_[MAX_MB_PLANE];
+ pthread_cond_t *cond_[MAX_MB_PLANE];
+#endif
+ // Per-plane loop-restoration block index of each row; reset to -1 per frame.
+ int *cur_sb_col[MAX_MB_PLANE];
+ // The optimal sync_range for different resolution and platform should be
+ // determined by testing. Currently, it is chosen to be a power-of-2 number.
+ int sync_range;  // a value of 0 makes the frame-level driver (re)allocate
+ int rows;  // a row count different from the frame's forces reallocation
+ int num_planes;  // a plane count different from the frame's forces reallocation
+
+ int num_workers;  // asking for more workers than this forces reallocation
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex;
+#endif
+ // Per-worker loop-restoration data; entry i is handed to worker i as data2.
+ LRWorkerData *lrworkerdata;
+
+ AV1LrMTInfo *job_queue;
+ int jobs_enqueued;
+ int jobs_dequeued;
+} AV1LrSync;
+
+// Deallocate loopfilter synchronization related mutex and data.
+void av1_loop_filter_dealloc(AV1LfSync *lf_sync);
+// Multi-threaded loop filter entry point for one frame.
+void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
+ struct macroblockd *mbd, int plane_start,
+ int plane_end, int partial_frame,
+ AVxWorker *workers, int num_workers,
+ AV1LfSync *lf_sync);
+void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
+ struct AV1Common *cm,
+ int optimized_lr, AVxWorker *workers,
+ int num_workers, AV1LrSync *lr_sync,
+ void *lr_ctxt);  // lr_ctxt is used as an AV1LrStruct *
+void av1_loop_restoration_dealloc(AV1LrSync *lr_sync, int num_workers);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_THREAD_COMMON_H_
diff --git a/third_party/aom/av1/common/tile_common.c b/third_party/aom/av1/common/tile_common.c
new file mode 100644
index 0000000000..1b413487f0
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/tile_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+void av1_tile_init(TileInfo *tile, const AV1_COMMON *cm, int row, int col) {
+ av1_tile_set_row(tile, cm, row);  // fills tile_row and mi_row_start/end
+ av1_tile_set_col(tile, cm, col);  // fills tile_col and mi_col_start/end
+}
+
+// Returns the smallest k >= 0 for which (blk_size << k) >= target.
+// blk_size must be positive whenever target is, or the loop never ends.
+static int tile_log2(int blk_size, int target) {
+  int shift = 0;
+  while ((blk_size << shift) < target) ++shift;
+  return shift;
+}
+
+void av1_get_tile_limits(AV1_COMMON *const cm) {
+  // Frame size in superblocks; a partial superblock counts as a whole one.
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int sb_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, mib_log2) >> mib_log2;
+  const int sb_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, mib_log2) >> mib_log2;
+
+  // Convert the pixel limits on tile width and area to superblock units.
+  const int sb_log2 = mib_log2 + MI_SIZE_LOG2;
+  const int area_limit_sb = MAX_TILE_AREA >> (2 * sb_log2);
+  cm->max_tile_width_sb = MAX_TILE_WIDTH >> sb_log2;
+  cm->min_log2_tile_cols = tile_log2(cm->max_tile_width_sb, sb_cols);
+  cm->max_log2_tile_cols = tile_log2(1, AOMMIN(sb_cols, MAX_TILE_COLS));
+  cm->max_log2_tile_rows = tile_log2(1, AOMMIN(sb_rows, MAX_TILE_ROWS));
+  cm->min_log2_tiles = tile_log2(area_limit_sb, sb_cols * sb_rows);
+  cm->min_log2_tiles = AOMMAX(cm->min_log2_tiles, cm->min_log2_tile_cols);
+}
+
+void av1_calculate_tile_cols(AV1_COMMON *const cm) {
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  // Frame size in superblocks; a partial superblock counts as a whole one.
+  const int sb_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, mib_log2) >> mib_log2;
+  const int sb_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, mib_log2) >> mib_log2;
+
+  if (!cm->uniform_tile_spacing_flag) {
+    // Explicit spacing: tile_cols and tile_col_start_sb[] are taken as given.
+    int widest_sb = 1;
+    int area_limit_sb = sb_rows * sb_cols;
+    cm->log2_tile_cols = tile_log2(1, cm->tile_cols);
+    for (int col = 0; col < cm->tile_cols; ++col) {
+      const int width_sb =
+          cm->tile_col_start_sb[col + 1] - cm->tile_col_start_sb[col];
+      widest_sb = AOMMAX(widest_sb, width_sb);
+    }
+    if (cm->min_log2_tiles) area_limit_sb >>= (cm->min_log2_tiles + 1);
+    cm->max_tile_height_sb = AOMMAX(area_limit_sb / widest_sb, 1);
+    return;
+  }
+
+  // Uniform spacing: every tile is width_sb superblocks wide, except that
+  // the last one stops at the right edge of the frame.
+  const int width_sb =
+      ALIGN_POWER_OF_TWO(sb_cols, cm->log2_tile_cols) >> cm->log2_tile_cols;
+  assert(width_sb > 0);
+  int num_tiles = 0;
+  for (int sb = 0; sb < sb_cols; sb += width_sb) {
+    cm->tile_col_start_sb[num_tiles++] = sb;
+  }
+  cm->tile_cols = num_tiles;
+  cm->tile_col_start_sb[num_tiles] = sb_cols;
+  cm->min_log2_tile_rows = AOMMAX(cm->min_log2_tiles - cm->log2_tile_cols, 0);
+  cm->max_tile_height_sb = sb_rows >> cm->min_log2_tile_rows;
+  cm->tile_width = width_sb << mib_log2;
+  cm->tile_width = AOMMIN(cm->tile_width, cm->mi_cols);
+}
+
+void av1_calculate_tile_rows(AV1_COMMON *const cm) {
+  if (!cm->uniform_tile_spacing_flag) {
+    // Explicit spacing: only log2_tile_rows is derived, from tile_rows.
+    cm->log2_tile_rows = tile_log2(1, cm->tile_rows);
+    return;
+  }
+
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int sb_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, mib_log2) >> mib_log2;
+  // Uniform spacing: tiles of height_sb superblocks, the last one clipped.
+  const int height_sb =
+      ALIGN_POWER_OF_TWO(sb_rows, cm->log2_tile_rows) >> cm->log2_tile_rows;
+  assert(height_sb > 0);
+  int num_tiles = 0;
+  for (int sb = 0; sb < sb_rows; sb += height_sb) {
+    cm->tile_row_start_sb[num_tiles++] = sb;
+  }
+  cm->tile_rows = num_tiles;
+  cm->tile_row_start_sb[num_tiles] = sb_rows;
+  cm->tile_height = height_sb << mib_log2;
+  cm->tile_height = AOMMIN(cm->tile_height, cm->mi_rows);
+}
+
+void av1_tile_set_row(TileInfo *tile, const AV1_COMMON *cm, int row) {
+  assert(row < cm->tile_rows);
+  // Tile boundaries are stored in superblocks; convert them to mi units.
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int end = cm->tile_row_start_sb[row + 1] << mib_log2;
+  tile->tile_row = row;
+  tile->mi_row_start = cm->tile_row_start_sb[row] << mib_log2;
+  tile->mi_row_end = AOMMIN(end, cm->mi_rows);  // clip to the frame
+  assert(tile->mi_row_end > tile->mi_row_start);
+}
+
+void av1_tile_set_col(TileInfo *tile, const AV1_COMMON *cm, int col) {
+  assert(col < cm->tile_cols);
+  // Tile boundaries are stored in superblocks; convert them to mi units.
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int end = cm->tile_col_start_sb[col + 1] << mib_log2;
+  tile->tile_col = col;
+  tile->mi_col_start = cm->tile_col_start_sb[col] << mib_log2;
+  tile->mi_col_end = AOMMIN(end, cm->mi_cols);  // clip to the frame
+  assert(tile->mi_col_end > tile->mi_col_start);
+}
+
+// Returns the tile's height in superblocks; a partial one counts as whole.
+int av1_get_sb_rows_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int mi_rows = tile.mi_row_end - tile.mi_row_start;
+
+  return ALIGN_POWER_OF_TWO(mi_rows, mib_log2) >> mib_log2;
+}
+
+// Returns the tile's width in superblocks; a partial one counts as whole.
+int av1_get_sb_cols_in_tile(AV1_COMMON *cm, TileInfo tile) {
+  const int mib_log2 = cm->seq_params.mib_size_log2;
+  const int mi_cols = tile.mi_col_end - tile.mi_col_start;
+
+  return ALIGN_POWER_OF_TWO(mi_cols, mib_log2) >> mib_log2;
+}
+
+int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles) {
+  // Work on the frame size padded to a whole number of max superblocks.
+  const int padded = ALIGN_POWER_OF_TWO(mi_frame_size, MAX_MIB_SIZE_LOG2);
+
+  // Split the padded frame into the signalled number of tiles, rounding the
+  // tile size up to a multiple of the max superblock size: round up and
+  // shift right to get the size in max superblocks, then shift left again
+  // to express it in mi units.
+  const int shift = log2_tile_num + MAX_MIB_SIZE_LOG2;
+  const int tile_sbs = ALIGN_POWER_OF_TWO(padded, shift) >> shift;
+  const int mi_tile_size = tile_sbs << MAX_MIB_SIZE_LOG2;
+
+  if (!ntiles) return mi_tile_size;
+
+  // Tiles actually needed: ceil(padded / mi_tile_size). That is at most
+  // 1 << log2_tile_num, and strictly less when rounding the tile size up
+  // made the tiles big enough for fewer of them to cover the frame.
+  const int count = (padded + mi_tile_size - 1) / mi_tile_size;
+  assert(count <= (1 << log2_tile_num));
+  *ntiles = count;
+
+  return mi_tile_size;
+}
+
+// Returns the rectangle covered by the tile in the coordinates of the
+// requested plane (luma when is_uv is 0). The bottom-right corner is
+// clipped to the superres-upscaled frame size.
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm,
+                               int is_uv) {
+  // Only a chroma plane can be subsampled.
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  const int frame_w = cm->superres_upscaled_width;
+  const int frame_h = cm->superres_upscaled_height;
+
+  // Tile extents in luma pixels.
+  AV1PixelRect rect;
+  rect.left = tile_info->mi_col_start * MI_SIZE;
+  rect.top = tile_info->mi_row_start * MI_SIZE;
+  rect.right = tile_info->mi_col_end * MI_SIZE;
+  rect.bottom = tile_info->mi_row_end * MI_SIZE;
+
+  // Restoration units live in the upscaled frame, so with superres active
+  // both corners of the tile are mapped into upscaled coordinates.
+  if (av1_superres_scaled(cm)) {
+    const int denom = cm->superres_scale_denominator;
+    av1_calculate_unscaled_superres_size(&rect.left, &rect.top, denom);
+    av1_calculate_unscaled_superres_size(&rect.right, &rect.bottom, denom);
+  }
+
+  // Do not let the tile run past the bottom-right of the frame.
+  rect.right = AOMMIN(rect.right, frame_w);
+  rect.bottom = AOMMIN(rect.bottom, frame_h);
+
+  // Convert to the coordinates of the requested plane.
+  rect.left = ROUND_POWER_OF_TWO(rect.left, ss_x);
+  rect.top = ROUND_POWER_OF_TWO(rect.top, ss_y);
+  rect.right = ROUND_POWER_OF_TWO(rect.right, ss_x);
+  rect.bottom = ROUND_POWER_OF_TWO(rect.bottom, ss_y);
+
+  return rect;
+}
diff --git a/third_party/aom/av1/common/tile_common.h b/third_party/aom/av1/common/tile_common.h
new file mode 100644
index 0000000000..c03553dc6f
--- /dev/null
+++ b/third_party/aom/av1/common/tile_common.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TILE_COMMON_H_
+#define AOM_AV1_COMMON_TILE_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+struct AV1Common;
+
+#define DEFAULT_MAX_NUM_TG 1
+
+typedef struct TileInfo {
+ int mi_row_start, mi_row_end;  // in mi units; end is clipped to cm->mi_rows
+ int mi_col_start, mi_col_end;  // in mi units; end is clipped to cm->mi_cols
+ int tg_horz_boundary;  // not written by av1_tile_init() or the set_row/col helpers
+ int tile_row;  // row index passed to av1_tile_set_row()
+ int tile_col;  // column index passed to av1_tile_set_col()
+} TileInfo;
+
+// Initializes 'tile->mi_(row|col)_(start|end)' and 'tile->tile_(row|col)' for
+// (row, col) from 'cm->tile_(row|col)_start_sb' and 'cm->mi_(rows|cols)'.
+void av1_tile_init(TileInfo *tile, const struct AV1Common *cm, int row,
+ int col);
+// Row-only and column-only halves of av1_tile_init().
+void av1_tile_set_row(TileInfo *tile, const struct AV1Common *cm, int row);
+void av1_tile_set_col(TileInfo *tile, const struct AV1Common *cm, int col);
+void av1_get_tile_n_bits(int mi_cols, int *min_log2_tile_cols,
+ int *max_log2_tile_cols);  // NOTE(review): no definition in tile_common.c - confirm still needed
+
+// Calculate the correct tile size (width or height) for (1 << log2_tile_num)
+// tiles horizontally or vertically in the frame.
+int get_tile_size(int mi_frame_size, int log2_tile_num, int *ntiles);
+// Tile size in superblocks; a partial superblock counts as a whole one.
+int av1_get_sb_rows_in_tile(struct AV1Common *cm, TileInfo tile);
+int av1_get_sb_cols_in_tile(struct AV1Common *cm, TileInfo tile);
+
+typedef struct {
+ int left, top, right, bottom;  // pixel coordinates, see av1_get_tile_rect()
+} AV1PixelRect;
+
+// Return the pixel extents of the tile in the luma (is_uv == 0) or chroma plane.
+AV1PixelRect av1_get_tile_rect(const TileInfo *tile_info,
+ const struct AV1Common *cm, int is_uv);
+
+// Maximum tile width and area.
+// There is no maximum height: height is bounded by the area and width limits.
+// The minimum tile width or height is fixed at one superblock.
+#define MAX_TILE_WIDTH (4096) // Max Tile width in pixels
+#define MAX_TILE_AREA (4096 * 2304) // Maximum tile area in pixels
+// av1_calculate_tile_cols() reads min_log2_tiles, set by av1_get_tile_limits().
+void av1_get_tile_limits(struct AV1Common *const cm);
+void av1_calculate_tile_cols(struct AV1Common *const cm);
+void av1_calculate_tile_rows(struct AV1Common *const cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_TILE_COMMON_H_
diff --git a/third_party/aom/av1/common/timing.c b/third_party/aom/av1/common/timing.c
new file mode 100644
index 0000000000..49dbde78fb
--- /dev/null
+++ b/third_party/aom/av1/common/timing.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/timing.h"
+
+/* Tables for AV1 max bitrates for different levels of main and high tier.
+ * The tables are in Kbps instead of Mbps in the specification.
+ * Note that depending on the profile, a multiplier is needed.
+ */
+
+/* Max bitrate in kbps for each level index of the Main tier (read-only). */
+/* Entry [31] is a dummy: the decoder model is not applicable for level 31. */
+static const int32_t main_kbps[1 << LEVEL_BITS] = {
+  1500, 3000, 0, 0, 6000, 10000, 0, 0, 12000, 20000, 0,
+  0, 30000, 40000, 60000, 60000, 60000, 100000, 160000, 160000, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 26)
+};
+
+/* Max bitrate in kbps for each level index of the High tier (read-only). */
+/* Entry [31] is a dummy: the decoder model is not applicable for level 31. */
+static const int32_t high_kbps[1 << LEVEL_BITS] = {
+  0, 0, 0, 0, 0, 0, 0, 0,
+  30000, 50000, 0, 0, 100000, 160000, 240000, 240000,
+  240000, 480000, 800000, 800000, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, (1 << 26)
+};
+
+/* BitrateProfileFactor: read-only multiplier indexed by seq_profile. */
+static const int bitrate_profile_factor[1 << PROFILE_BITS] = {
+  1, 2, 3, 0, 0, 0, 0, 0
+};
+
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+                          int seq_tier) {
+  // Returns the maximum bitrate in bits per second. A non-zero seq_tier
+  // selects the High tier table, zero the Main tier table. Neither index is
+  // range-checked here, so callers must pass valid values.
+  const int32_t *const level_kbps = seq_tier ? high_kbps : main_kbps;
+  const int64_t kbps =
+      level_kbps[seq_level_idx] * bitrate_profile_factor[seq_profile];
+
+  // Already 64 bits wide here, so scaling from kbps to bps cannot overflow.
+  return kbps * 1000;
+}
+
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model) {
+ decoder_model->encoder_decoder_buffer_delay_length = 16;
+ decoder_model->buffer_removal_time_length = 10;
+ decoder_model->frame_presentation_time_length = 10;
+}  // num_units_in_decoding_tick is not written by this function
+
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params) {
+  // Decoder model: present, both buffer delays 0.5 s, low delay mode off.
+  op_params->decoder_model_param_present_flag = 1;
+  op_params->low_delay_mode_flag = 0;
+  op_params->decoder_buffer_delay = op_params->encoder_buffer_delay = 90000 / 2;
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
+
+void set_resource_availability_parameters(
+    aom_dec_model_op_parameters_t *op_params) {
+  // Defaults for resource availability mode.
+  op_params->decoder_model_param_present_flag = 0;
+  op_params->low_delay_mode_flag = 0;
+  op_params->decoder_buffer_delay = 70000;
+  op_params->encoder_buffer_delay = 20000;
+
+  op_params->display_model_param_present_flag = 1;
+  op_params->initial_display_delay = 8;  // 8 frames delay
+}
diff --git a/third_party/aom/av1/common/timing.h b/third_party/aom/av1/common/timing.h
new file mode 100644
index 0000000000..06939ae43a
--- /dev/null
+++ b/third_party/aom/av1/common/timing.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TIMING_H_
+#define AOM_AV1_COMMON_TIMING_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/enums.h"
+
+#define MAX_NUM_OP_POINTS 32
+
+typedef struct aom_timing {  // NOTE(review): looks like AV1 timing_info() fields - confirm
+ uint32_t num_units_in_display_tick;
+ uint32_t time_scale;
+ int equal_picture_interval;
+ uint32_t num_ticks_per_picture;
+} aom_timing_info_t;
+
+typedef struct aom_dec_model_info {
+ uint32_t num_units_in_decoding_tick;  // not set by set_aom_dec_model_info()
+ int encoder_decoder_buffer_delay_length;  // set_aom_dec_model_info() stores 16
+ int buffer_removal_time_length;  // set_aom_dec_model_info() stores 10
+ int frame_presentation_time_length;  // set_aom_dec_model_info() stores 10
+} aom_dec_model_info_t;
+
+typedef struct aom_dec_model_op_parameters {
+ int decoder_model_param_present_flag;  // 0 in resource availability mode
+ int64_t bitrate;  // not written by the setters declared in this header
+ int64_t buffer_size;  // not written by the setters declared in this header
+ uint32_t decoder_buffer_delay;  // defaults: 45000, or 70000 in resource availability mode
+ uint32_t encoder_buffer_delay;  // defaults: 45000, or 20000 in resource availability mode
+ int low_delay_mode_flag;  // both setters store 0
+ int display_model_param_present_flag;  // both setters store 1
+ int initial_display_delay;  // in frames; both setters store 8
+} aom_dec_model_op_parameters_t;
+
+typedef struct aom_op_timing_info_t {
+ uint32_t buffer_removal_time;  // NOTE(review): presumably per operating point - confirm
+} aom_op_timing_info_t;
+// Fills in the three *_length fields of the decoder model info.
+void set_aom_dec_model_info(aom_dec_model_info_t *decoder_model);
+// Defaults with the decoder model parameters marked present.
+void set_dec_model_op_parameters(aom_dec_model_op_parameters_t *op_params);
+// Resource availability mode defaults; decoder model parameters marked absent.
+void set_resource_availability_parameters(
+ aom_dec_model_op_parameters_t *op_params);
+// Maximum bitrate in bits per second for the profile, level index and tier.
+int64_t max_level_bitrate(BITSTREAM_PROFILE seq_profile, int seq_level_idx,
+ int seq_tier);
+
+#endif // AOM_AV1_COMMON_TIMING_H_
diff --git a/third_party/aom/av1/common/token_cdfs.h b/third_party/aom/av1/common/token_cdfs.h
new file mode 100644
index 0000000000..53e956450b
--- /dev/null
+++ b/third_party/aom/av1/common/token_cdfs.h
@@ -0,0 +1,3555 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TOKEN_CDFS_H_
+#define AOM_AV1_COMMON_TOKEN_CDFS_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/entropy.h"
+
+static const aom_cdf_prob  // the four q ctx entries below hold identical values
+ av1_default_dc_sign_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][DC_SIGN_CONTEXTS]
+ [CDF_SIZE(2)] = {
+ { {  // q ctx 0
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {  // q ctx 1
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {  // q ctx 2
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ { {  // q ctx 3
+ { AOM_CDF2(128 * 125) },
+ { AOM_CDF2(128 * 102) },
+ { AOM_CDF2(128 * 147) },
+ },
+ {
+ { AOM_CDF2(128 * 119) },
+ { AOM_CDF2(128 * 101) },
+ { AOM_CDF2(128 * 135) },
+ } },
+ };
+
+static const aom_cdf_prob  // defaults, indexed [q ctx][tx size][txb skip ctx]
+ av1_default_txb_skip_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS]
+ [CDF_SIZE(2)] = { { { { AOM_CDF2(31849) },  // q ctx 0, tx size 0
+ { AOM_CDF2(5892) },
+ { AOM_CDF2(12112) },
+ { AOM_CDF2(21935) },
+ { AOM_CDF2(20289) },
+ { AOM_CDF2(27473) },
+ { AOM_CDF2(32487) },
+ { AOM_CDF2(7654) },
+ { AOM_CDF2(19473) },
+ { AOM_CDF2(29984) },
+ { AOM_CDF2(9961) },
+ { AOM_CDF2(30242) },
+ { AOM_CDF2(32117) } },
+ { { AOM_CDF2(31548) },  // q ctx 0, tx size 1
+ { AOM_CDF2(1549) },
+ { AOM_CDF2(10130) },
+ { AOM_CDF2(16656) },
+ { AOM_CDF2(18591) },
+ { AOM_CDF2(26308) },
+ { AOM_CDF2(32537) },
+ { AOM_CDF2(5403) },
+ { AOM_CDF2(18096) },
+ { AOM_CDF2(30003) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(29957) },  // q ctx 0, tx size 2
+ { AOM_CDF2(5391) },
+ { AOM_CDF2(18039) },
+ { AOM_CDF2(23566) },
+ { AOM_CDF2(22431) },
+ { AOM_CDF2(25822) },
+ { AOM_CDF2(32197) },
+ { AOM_CDF2(3778) },
+ { AOM_CDF2(15336) },
+ { AOM_CDF2(28981) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(17920) },  // q ctx 0, tx size 3
+ { AOM_CDF2(1818) },
+ { AOM_CDF2(7282) },
+ { AOM_CDF2(25273) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(31554) },
+ { AOM_CDF2(32624) },
+ { AOM_CDF2(1366) },
+ { AOM_CDF2(15628) },
+ { AOM_CDF2(30462) },
+ { AOM_CDF2(146) },
+ { AOM_CDF2(5132) },
+ { AOM_CDF2(31657) } },
+ { { AOM_CDF2(6308) },  // q ctx 0, tx size 4
+ { AOM_CDF2(117) },
+ { AOM_CDF2(1638) },
+ { AOM_CDF2(2161) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(10923) },
+ { AOM_CDF2(30247) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(30371) },  // q ctx 1, tx size 0
+ { AOM_CDF2(7570) },
+ { AOM_CDF2(13155) },
+ { AOM_CDF2(20751) },
+ { AOM_CDF2(20969) },
+ { AOM_CDF2(27067) },
+ { AOM_CDF2(32013) },
+ { AOM_CDF2(5495) },
+ { AOM_CDF2(17942) },
+ { AOM_CDF2(28280) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31782) },  // q ctx 1, tx size 1
+ { AOM_CDF2(1836) },
+ { AOM_CDF2(10689) },
+ { AOM_CDF2(17604) },
+ { AOM_CDF2(21622) },
+ { AOM_CDF2(27518) },
+ { AOM_CDF2(32399) },
+ { AOM_CDF2(4419) },
+ { AOM_CDF2(16294) },
+ { AOM_CDF2(28345) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31901) },  // q ctx 1, tx size 2
+ { AOM_CDF2(10311) },
+ { AOM_CDF2(18047) },
+ { AOM_CDF2(24806) },
+ { AOM_CDF2(23288) },
+ { AOM_CDF2(27914) },
+ { AOM_CDF2(32296) },
+ { AOM_CDF2(4215) },
+ { AOM_CDF2(15756) },
+ { AOM_CDF2(28341) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(26726) },  // q ctx 1, tx size 3
+ { AOM_CDF2(1045) },
+ { AOM_CDF2(11703) },
+ { AOM_CDF2(20590) },
+ { AOM_CDF2(18554) },
+ { AOM_CDF2(25970) },
+ { AOM_CDF2(31938) },
+ { AOM_CDF2(5583) },
+ { AOM_CDF2(21313) },
+ { AOM_CDF2(29390) },
+ { AOM_CDF2(641) },
+ { AOM_CDF2(22265) },
+ { AOM_CDF2(31452) } },
+ { { AOM_CDF2(26584) },  // q ctx 1, tx size 4
+ { AOM_CDF2(188) },
+ { AOM_CDF2(8847) },
+ { AOM_CDF2(24519) },
+ { AOM_CDF2(22938) },
+ { AOM_CDF2(30583) },
+ { AOM_CDF2(32608) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(29614) },  // q ctx 2, tx size 0
+ { AOM_CDF2(9068) },
+ { AOM_CDF2(12924) },
+ { AOM_CDF2(19538) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(24619) },
+ { AOM_CDF2(30642) },
+ { AOM_CDF2(4119) },
+ { AOM_CDF2(16026) },
+ { AOM_CDF2(25657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31957) },  // q ctx 2, tx size 1
+ { AOM_CDF2(3230) },
+ { AOM_CDF2(11153) },
+ { AOM_CDF2(18123) },
+ { AOM_CDF2(20143) },
+ { AOM_CDF2(26536) },
+ { AOM_CDF2(31986) },
+ { AOM_CDF2(3050) },
+ { AOM_CDF2(14603) },
+ { AOM_CDF2(25155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32363) },  // q ctx 2, tx size 2
+ { AOM_CDF2(10692) },
+ { AOM_CDF2(19090) },
+ { AOM_CDF2(24357) },
+ { AOM_CDF2(24442) },
+ { AOM_CDF2(28312) },
+ { AOM_CDF2(32169) },
+ { AOM_CDF2(3648) },
+ { AOM_CDF2(15690) },
+ { AOM_CDF2(26815) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(30669) },  // q ctx 2, tx size 3
+ { AOM_CDF2(3832) },
+ { AOM_CDF2(11663) },
+ { AOM_CDF2(18889) },
+ { AOM_CDF2(19782) },
+ { AOM_CDF2(23313) },
+ { AOM_CDF2(31330) },
+ { AOM_CDF2(5124) },
+ { AOM_CDF2(18719) },
+ { AOM_CDF2(28468) },
+ { AOM_CDF2(3082) },
+ { AOM_CDF2(20982) },
+ { AOM_CDF2(29443) } },
+ { { AOM_CDF2(28573) },  // q ctx 2, tx size 4
+ { AOM_CDF2(3183) },
+ { AOM_CDF2(17802) },
+ { AOM_CDF2(25977) },
+ { AOM_CDF2(26677) },
+ { AOM_CDF2(27832) },
+ { AOM_CDF2(32387) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } },
+ { { { AOM_CDF2(26887) },  // q ctx 3, tx size 0
+ { AOM_CDF2(6729) },
+ { AOM_CDF2(10361) },
+ { AOM_CDF2(17442) },
+ { AOM_CDF2(15045) },
+ { AOM_CDF2(22478) },
+ { AOM_CDF2(29072) },
+ { AOM_CDF2(2713) },
+ { AOM_CDF2(11861) },
+ { AOM_CDF2(20773) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31903) },  // q ctx 3, tx size 1
+ { AOM_CDF2(2044) },
+ { AOM_CDF2(7528) },
+ { AOM_CDF2(14618) },
+ { AOM_CDF2(16182) },
+ { AOM_CDF2(24168) },
+ { AOM_CDF2(31037) },
+ { AOM_CDF2(2786) },
+ { AOM_CDF2(11194) },
+ { AOM_CDF2(20155) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(32510) },  // q ctx 3, tx size 2
+ { AOM_CDF2(8430) },
+ { AOM_CDF2(17318) },
+ { AOM_CDF2(24154) },
+ { AOM_CDF2(23674) },
+ { AOM_CDF2(28789) },
+ { AOM_CDF2(32139) },
+ { AOM_CDF2(3440) },
+ { AOM_CDF2(13117) },
+ { AOM_CDF2(22702) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } },
+ { { AOM_CDF2(31671) },  // q ctx 3, tx size 3
+ { AOM_CDF2(2056) },
+ { AOM_CDF2(11746) },
+ { AOM_CDF2(16852) },
+ { AOM_CDF2(18635) },
+ { AOM_CDF2(24715) },
+ { AOM_CDF2(31484) },
+ { AOM_CDF2(4656) },
+ { AOM_CDF2(16074) },
+ { AOM_CDF2(24704) },
+ { AOM_CDF2(1806) },
+ { AOM_CDF2(14645) },
+ { AOM_CDF2(25336) } },
+ { { AOM_CDF2(31539) },  // q ctx 3, tx size 4
+ { AOM_CDF2(8433) },
+ { AOM_CDF2(20576) },
+ { AOM_CDF2(27904) },
+ { AOM_CDF2(27852) },
+ { AOM_CDF2(30026) },
+ { AOM_CDF2(32441) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) } } } };
+
+static const aom_cdf_prob
+ av1_default_eob_extra_cdfs[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][CDF_SIZE(2)] = {
+ { { {
+ { AOM_CDF2(16961) },
+ { AOM_CDF2(17223) },
+ { AOM_CDF2(7621) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(19069) },
+ { AOM_CDF2(22525) },
+ { AOM_CDF2(13377) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20401) },
+ { AOM_CDF2(17025) },
+ { AOM_CDF2(12845) },
+ { AOM_CDF2(12873) },
+ { AOM_CDF2(14094) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20681) },
+ { AOM_CDF2(20701) },
+ { AOM_CDF2(15250) },
+ { AOM_CDF2(15017) },
+ { AOM_CDF2(14928) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(17194) },
+ { AOM_CDF2(16170) },
+ { AOM_CDF2(17695) },
+ { AOM_CDF2(13826) },
+ { AOM_CDF2(15810) },
+ { AOM_CDF2(12036) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(23959) },
+ { AOM_CDF2(20799) },
+ { AOM_CDF2(19021) },
+ { AOM_CDF2(16203) },
+ { AOM_CDF2(17886) },
+ { AOM_CDF2(14144) },
+ { AOM_CDF2(12010) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(27399) },
+ { AOM_CDF2(16327) },
+ { AOM_CDF2(18071) },
+ { AOM_CDF2(19584) },
+ { AOM_CDF2(20721) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(19560) },
+ { AOM_CDF2(10150) },
+ { AOM_CDF2(8805) },
+ },
+ {
+ { AOM_CDF2(24932) },
+ { AOM_CDF2(20833) },
+ { AOM_CDF2(12027) },
+ { AOM_CDF2(16670) },
+ { AOM_CDF2(19914) },
+ { AOM_CDF2(15106) },
+ { AOM_CDF2(17662) },
+ { AOM_CDF2(13783) },
+ { AOM_CDF2(28756) },
+ } },
+ { {
+ { AOM_CDF2(23406) },
+ { AOM_CDF2(21845) },
+ { AOM_CDF2(18432) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(17096) },
+ { AOM_CDF2(12561) },
+ { AOM_CDF2(17320) },
+ { AOM_CDF2(22395) },
+ { AOM_CDF2(21370) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(17471) },
+ { AOM_CDF2(20223) },
+ { AOM_CDF2(11357) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20335) },
+ { AOM_CDF2(21667) },
+ { AOM_CDF2(14818) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20430) },
+ { AOM_CDF2(20662) },
+ { AOM_CDF2(15367) },
+ { AOM_CDF2(16970) },
+ { AOM_CDF2(14657) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22117) },
+ { AOM_CDF2(22028) },
+ { AOM_CDF2(18650) },
+ { AOM_CDF2(16042) },
+ { AOM_CDF2(15885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(22409) },
+ { AOM_CDF2(21012) },
+ { AOM_CDF2(15650) },
+ { AOM_CDF2(17395) },
+ { AOM_CDF2(15469) },
+ { AOM_CDF2(20205) },
+ { AOM_CDF2(19511) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(24220) },
+ { AOM_CDF2(22480) },
+ { AOM_CDF2(17737) },
+ { AOM_CDF2(18916) },
+ { AOM_CDF2(19268) },
+ { AOM_CDF2(18412) },
+ { AOM_CDF2(18844) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(25991) },
+ { AOM_CDF2(20314) },
+ { AOM_CDF2(17731) },
+ { AOM_CDF2(19678) },
+ { AOM_CDF2(18649) },
+ { AOM_CDF2(17307) },
+ { AOM_CDF2(21798) },
+ { AOM_CDF2(17549) },
+ { AOM_CDF2(15630) },
+ },
+ {
+ { AOM_CDF2(26585) },
+ { AOM_CDF2(21469) },
+ { AOM_CDF2(20432) },
+ { AOM_CDF2(17735) },
+ { AOM_CDF2(19280) },
+ { AOM_CDF2(15235) },
+ { AOM_CDF2(20297) },
+ { AOM_CDF2(22471) },
+ { AOM_CDF2(28997) },
+ } },
+ { {
+ { AOM_CDF2(26605) },
+ { AOM_CDF2(11304) },
+ { AOM_CDF2(16726) },
+ { AOM_CDF2(16560) },
+ { AOM_CDF2(20866) },
+ { AOM_CDF2(23524) },
+ { AOM_CDF2(19878) },
+ { AOM_CDF2(13469) },
+ { AOM_CDF2(23084) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(18983) },
+ { AOM_CDF2(20512) },
+ { AOM_CDF2(14885) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20090) },
+ { AOM_CDF2(19444) },
+ { AOM_CDF2(17286) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19139) },
+ { AOM_CDF2(21487) },
+ { AOM_CDF2(18959) },
+ { AOM_CDF2(20910) },
+ { AOM_CDF2(19089) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20536) },
+ { AOM_CDF2(20664) },
+ { AOM_CDF2(20625) },
+ { AOM_CDF2(19123) },
+ { AOM_CDF2(14862) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19833) },
+ { AOM_CDF2(21502) },
+ { AOM_CDF2(17485) },
+ { AOM_CDF2(20267) },
+ { AOM_CDF2(18353) },
+ { AOM_CDF2(23329) },
+ { AOM_CDF2(21478) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22041) },
+ { AOM_CDF2(23434) },
+ { AOM_CDF2(20001) },
+ { AOM_CDF2(20554) },
+ { AOM_CDF2(20951) },
+ { AOM_CDF2(20145) },
+ { AOM_CDF2(15562) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(23312) },
+ { AOM_CDF2(21607) },
+ { AOM_CDF2(16526) },
+ { AOM_CDF2(18957) },
+ { AOM_CDF2(18034) },
+ { AOM_CDF2(18934) },
+ { AOM_CDF2(24247) },
+ { AOM_CDF2(16921) },
+ { AOM_CDF2(17080) },
+ },
+ {
+ { AOM_CDF2(26579) },
+ { AOM_CDF2(24910) },
+ { AOM_CDF2(18637) },
+ { AOM_CDF2(19800) },
+ { AOM_CDF2(20388) },
+ { AOM_CDF2(9887) },
+ { AOM_CDF2(15642) },
+ { AOM_CDF2(30198) },
+ { AOM_CDF2(24721) },
+ } },
+ { {
+ { AOM_CDF2(26998) },
+ { AOM_CDF2(16737) },
+ { AOM_CDF2(17838) },
+ { AOM_CDF2(18922) },
+ { AOM_CDF2(19515) },
+ { AOM_CDF2(18636) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(15776) },
+ { AOM_CDF2(22658) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } },
+ { { {
+ { AOM_CDF2(20177) },
+ { AOM_CDF2(20789) },
+ { AOM_CDF2(20262) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(21416) },
+ { AOM_CDF2(20855) },
+ { AOM_CDF2(23410) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(20238) },
+ { AOM_CDF2(21057) },
+ { AOM_CDF2(19159) },
+ { AOM_CDF2(22337) },
+ { AOM_CDF2(20159) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(20125) },
+ { AOM_CDF2(20559) },
+ { AOM_CDF2(21707) },
+ { AOM_CDF2(22296) },
+ { AOM_CDF2(17333) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(19941) },
+ { AOM_CDF2(20527) },
+ { AOM_CDF2(21470) },
+ { AOM_CDF2(22487) },
+ { AOM_CDF2(19558) },
+ { AOM_CDF2(22354) },
+ { AOM_CDF2(20331) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ },
+ {
+ { AOM_CDF2(22752) },
+ { AOM_CDF2(25006) },
+ { AOM_CDF2(22075) },
+ { AOM_CDF2(21576) },
+ { AOM_CDF2(17740) },
+ { AOM_CDF2(21690) },
+ { AOM_CDF2(19211) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } },
+ { {
+ { AOM_CDF2(21442) },
+ { AOM_CDF2(22358) },
+ { AOM_CDF2(18503) },
+ { AOM_CDF2(20291) },
+ { AOM_CDF2(19945) },
+ { AOM_CDF2(21294) },
+ { AOM_CDF2(21178) },
+ { AOM_CDF2(19400) },
+ { AOM_CDF2(10556) },
+ },
+ {
+ { AOM_CDF2(24648) },
+ { AOM_CDF2(24949) },
+ { AOM_CDF2(20708) },
+ { AOM_CDF2(23905) },
+ { AOM_CDF2(20501) },
+ { AOM_CDF2(9558) },
+ { AOM_CDF2(9423) },
+ { AOM_CDF2(30365) },
+ { AOM_CDF2(19253) },
+ } },
+ { {
+ { AOM_CDF2(26064) },
+ { AOM_CDF2(22098) },
+ { AOM_CDF2(19613) },
+ { AOM_CDF2(20525) },
+ { AOM_CDF2(17595) },
+ { AOM_CDF2(16618) },
+ { AOM_CDF2(20497) },
+ { AOM_CDF2(18989) },
+ { AOM_CDF2(15513) },
+ },
+ {
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ { AOM_CDF2(16384) },
+ } } }
+ };
+
+static const aom_cdf_prob /* Default eob_multi16 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF5(...) row of four values in CDF_SIZE(5) storage.
+    NOTE(review): the meaning of the final [2] index is not visible here. */
+ av1_default_eob_multi16_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 5)] = { { { { AOM_CDF5(840, 1039, 1980, 4895) },
+ { AOM_CDF5(370, 671, 1883, 4471) } },
+ { { AOM_CDF5(3247, 4950, 9688, 14563) },
+ { AOM_CDF5(1904, 3354, 7763, 14647) } } },
+ { { { AOM_CDF5(2125, 2551, 5165, 8946) },
+ { AOM_CDF5(513, 765, 1859, 6339) } },
+ { { AOM_CDF5(7637, 9498, 14259, 19108) },
+ { AOM_CDF5(2497, 4096, 8866, 16993) } } },
+ { { { AOM_CDF5(4016, 4897, 8881, 14968) },
+ { AOM_CDF5(716, 1105, 2646, 10056) } },
+ { { AOM_CDF5(11139, 13270, 18241, 23566) },
+ { AOM_CDF5(3192, 5032, 10297, 19755) } } },
+ { { { AOM_CDF5(6708, 8958, 14746, 22133) },
+ { AOM_CDF5(1222, 2074, 4783, 15410) } },
+ { { AOM_CDF5(19575, 21766, 26044, 29709) },
+ { AOM_CDF5(7297, 10767, 19273, 28194) } } } };
+
+static const aom_cdf_prob /* Default eob_multi32 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF6(...) row of five values in CDF_SIZE(6) storage.
+    NOTE(review): the meaning of the final [2] index is not visible here. */
+ av1_default_eob_multi32_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 6)] = { { { { AOM_CDF6(400, 520, 977, 2102, 6542) },
+ { AOM_CDF6(210, 405, 1315, 3326, 7537) } },
+ { { AOM_CDF6(2636, 4273, 7588, 11794, 20401) },
+ { AOM_CDF6(1786, 3179, 6902, 11357, 19054) } } },
+ { { { AOM_CDF6(989, 1249, 2019, 4151, 10785) },
+ { AOM_CDF6(313, 441, 1099, 2917, 8562) } },
+ { { AOM_CDF6(8394, 10352, 13932, 18855, 26014) },
+ { AOM_CDF6(2578, 4124, 8181, 13670, 24234) } } },
+ { { { AOM_CDF6(2515, 3003, 4452, 8162, 16041) },
+ { AOM_CDF6(574, 821, 1836, 5089, 13128) } },
+ { { AOM_CDF6(13468, 16303, 20361, 25105, 29281) },
+ { AOM_CDF6(3542, 5502, 10415, 16760, 25644) } } },
+ { { { AOM_CDF6(4617, 5709, 8446, 13584, 23135) },
+ { AOM_CDF6(1156, 1702, 3675, 9274, 20539) } },
+ { { AOM_CDF6(22086, 24282, 27010, 29770, 31743) },
+ { AOM_CDF6(7699, 10897, 20891, 26926, 31628) } } } };
+
+static const aom_cdf_prob /* Default eob_multi64 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF7(...) row of six values in CDF_SIZE(7) storage.
+    NOTE(review): the meaning of the final [2] index is not visible here. */
+ av1_default_eob_multi64_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 7)] = { { { { AOM_CDF7(329, 498, 1101, 1784, 3265, 7758) },
+ { AOM_CDF7(335, 730, 1459, 5494, 8755, 12997) } },
+ { { AOM_CDF7(3505, 5304, 10086, 13814, 17684, 23370) },
+ { AOM_CDF7(1563, 2700, 4876, 10911, 14706, 22480) } } },
+ { { { AOM_CDF7(1260, 1446, 2253, 3712, 6652, 13369) },
+ { AOM_CDF7(401, 605, 1029, 2563, 5845, 12626) } },
+ { { AOM_CDF7(8609, 10612, 14624, 18714, 22614, 29024) },
+ { AOM_CDF7(1923, 3127, 5867, 9703, 14277, 27100) } } },
+ { { { AOM_CDF7(2374, 2772, 4583, 7276, 12288, 19706) },
+ { AOM_CDF7(497, 810, 1315, 3000, 7004, 15641) } },
+ { { AOM_CDF7(15050, 17126, 21410, 24886, 28156, 30726) },
+ { AOM_CDF7(4034, 6290, 10235, 14982, 21214, 28491) } } },
+ { { { AOM_CDF7(6307, 7541, 12060, 16358, 22553, 27865) },
+ { AOM_CDF7(1289, 2320, 3971, 7926, 14153, 24291) } },
+ { { AOM_CDF7(24212, 25708, 28268, 30035, 31307, 32049) },
+ { AOM_CDF7(8726, 12378, 19409, 26450, 30038, 32462) } } } };
+
+static const aom_cdf_prob /* Default eob_multi128 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF8(...) row of seven values in CDF_SIZE(8) storage.
+    NOTE(review): the meaning of the final [2] index is not visible here. */
+ av1_default_eob_multi128_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 8)] = {
+ { { { AOM_CDF8(219, 482, 1140, 2091, 3680, 6028, 12586) },
+ { AOM_CDF8(371, 699, 1254, 4830, 9479, 12562, 17497) } },
+ { { AOM_CDF8(5245, 7456, 12880, 15852, 20033, 23932, 27608) },
+ { AOM_CDF8(2054, 3472, 5869, 14232, 18242, 20590, 26752) } } },
+ { { { AOM_CDF8(685, 933, 1488, 2714, 4766, 8562, 19254) },
+ { AOM_CDF8(217, 352, 618, 2303, 5261, 9969, 17472) } },
+ { { AOM_CDF8(8045, 11200, 15497, 19595, 23948, 27408, 30938) },
+ { AOM_CDF8(2310, 4160, 7471, 14997, 17931, 20768, 30240) } } },
+ { { { AOM_CDF8(1366, 1738, 2527, 5016, 9355, 15797, 24643) },
+ { AOM_CDF8(354, 558, 944, 2760, 7287, 14037, 21779) } },
+ { { AOM_CDF8(13627, 16246, 20173, 24429, 27948, 30415, 31863) },
+ { AOM_CDF8(6275, 9889, 14769, 23164, 27988, 30493, 32272) } } },
+ { { { AOM_CDF8(3472, 4885, 7489, 12481, 18517, 24536, 29635) },
+ { AOM_CDF8(886, 1731, 3271, 8469, 15569, 22126, 28383) } },
+ { { AOM_CDF8(24313, 26062, 28385, 30107, 31217, 31898, 32345) },
+ { AOM_CDF8(9165, 13282, 21150, 30286, 31894, 32571, 32712) } } }
+ };
+
+static const aom_cdf_prob /* Default eob_multi256 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF9(...) row of eight values in CDF_SIZE(9) storage.
+    NOTE(review): the meaning of the final [2] index is not visible here. */
+ av1_default_eob_multi256_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 9)] = {
+ { { { AOM_CDF9(310, 584, 1887, 3589, 6168, 8611, 11352, 15652) },
+ { AOM_CDF9(998, 1850, 2998, 5604, 17341, 19888, 22899, 25583) } },
+ { { AOM_CDF9(2520, 3240, 5952, 8870, 12577, 17558, 19954, 24168) },
+ { AOM_CDF9(2203, 4130, 7435, 10739, 20652, 23681, 25609, 27261) } } },
+ { { { AOM_CDF9(1448, 2109, 4151, 6263, 9329, 13260, 17944, 23300) },
+ { AOM_CDF9(399, 1019, 1749, 3038, 10444, 15546, 22739, 27294) } },
+ { { AOM_CDF9(6402, 8148, 12623, 15072, 18728, 22847, 26447, 29377) },
+ { AOM_CDF9(1674, 3252, 5734, 10159, 22397, 23802, 24821, 30940) } } },
+ { { { AOM_CDF9(3089, 3920, 6038, 9460, 14266, 19881, 25766, 29176) },
+ { AOM_CDF9(1084, 2358, 3488, 5122, 11483, 18103, 26023, 29799) } },
+ { { AOM_CDF9(11514, 13794, 17480, 20754, 24361, 27378, 29492, 31277) },
+ { AOM_CDF9(6571, 9610, 15516, 21826, 29092, 30829, 31842,
+ 32708) } } },
+ { { { AOM_CDF9(5348, 7113, 11820, 15924, 22106, 26777, 30334, 31757) },
+ { AOM_CDF9(2453, 4474, 6307, 8777, 16474, 22975, 29000, 31547) } },
+ { { AOM_CDF9(23110, 24597, 27140, 28894, 30167, 30927, 31392, 32094) },
+ { AOM_CDF9(9998, 17661, 25178, 28097, 31308, 32038, 32403,
+ 32695) } } }
+ };
+
+static const aom_cdf_prob /* Default eob_multi512 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF10(...) row of nine values in CDF_SIZE(10) storage.
+    Every index-1 row of the final [2] dimension is the same evenly spaced
+    row (3277, 6554, ..., 29491; steps of about 32768/10), while the index-0
+    rows differ per slot. NOTE(review): the meaning of the final [2] index,
+    and why its second entry is uniform, is not visible here. */
+ av1_default_eob_multi512_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 10)] = { { { { AOM_CDF10(641, 983, 3707, 5430, 10234, 14958, 18788,
+ 23412, 26061) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(5095, 6446, 9996, 13354, 16017, 17986, 20919,
+ 26129, 29140) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(1230, 2278, 5035, 7776, 11871, 15346, 19590,
+ 24584, 28749) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(7265, 9979, 15819, 19250, 21780, 23846, 26478,
+ 28396, 31811) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(2624, 3936, 6480, 9686, 13979, 17726, 23267,
+ 28410, 31078) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(12015, 14769, 19588, 22052, 24222, 25812,
+ 27300, 29219, 32114) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } },
+ { { { AOM_CDF10(5927, 7809, 10923, 14597, 19439, 24135, 28456,
+ 31142, 32060) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } },
+ { { AOM_CDF10(21093, 23043, 25742, 27658, 29097, 29716,
+ 30073, 30820, 31956) },
+ { AOM_CDF10(3277, 6554, 9830, 13107, 16384, 19661, 22938,
+ 26214, 29491) } } } };
+
+static const aom_cdf_prob /* Default eob_multi1024 CDF table ("eob" presumably
+    means end-of-block; confirm against the users of this table). Per the
+    declarator below it is indexed [TOKEN_CDF_Q_CTXS][PLANE_TYPES][2]; each
+    slot holds one AOM_CDF11(...) row of ten values in CDF_SIZE(11) storage.
+    Every index-1 row of the final [2] dimension is the same evenly spaced
+    row (2979, 5958, ..., 29789; steps of about 32768/11), while the index-0
+    rows differ per slot. NOTE(review): the meaning of the final [2] index,
+    and why its second entry is uniform, is not visible here. */
+ av1_default_eob_multi1024_cdfs[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][CDF_SIZE(
+ 11)] = { { { { AOM_CDF11(393, 421, 751, 1623, 3160, 6352, 13345, 18047,
+ 22571, 25830) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(1865, 1988, 2930, 4242, 10533, 16538, 21354,
+ 27255, 28546, 31784) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(696, 948, 3145, 5702, 9706, 13217, 17851,
+ 21856, 25692, 28034) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(2672, 3591, 9330, 17084, 22725, 24284, 26527,
+ 28027, 28377, 30876) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(2784, 3831, 7041, 10521, 14847, 18844, 23155,
+ 26682, 29229, 31045) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(9577, 12466, 17739, 20750, 22061, 23215, 24601,
+ 25483, 25843, 32056) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } },
+ { { { AOM_CDF11(6698, 8334, 11961, 15762, 20186, 23862, 27434,
+ 29326, 31082, 32050) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } },
+ { { AOM_CDF11(20569, 22426, 25569, 26859, 28053, 28913,
+ 29486, 29724, 29807, 32570) },
+ { AOM_CDF11(2979, 5958, 8937, 11916, 14895, 17873, 20852,
+ 23831, 26810, 29789) } } } };
+
+static const aom_cdf_prob av1_default_coeff_lps_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][LEVEL_CONTEXTS]
+ [CDF_SIZE(BR_CDF_SIZE)] = {
+ { { { { AOM_CDF4(14298, 20718, 24174) },
+ { AOM_CDF4(12536, 19601, 23789) },
+ { AOM_CDF4(8712, 15051, 19503) },
+ { AOM_CDF4(6170, 11327, 15434) },
+ { AOM_CDF4(4742, 8926, 12538) },
+ { AOM_CDF4(3803, 7317, 10546) },
+ { AOM_CDF4(1696, 3317, 4871) },
+ { AOM_CDF4(14392, 19951, 22756) },
+ { AOM_CDF4(15978, 23218, 26818) },
+ { AOM_CDF4(12187, 19474, 23889) },
+ { AOM_CDF4(9176, 15640, 20259) },
+ { AOM_CDF4(7068, 12655, 17028) },
+ { AOM_CDF4(5656, 10442, 14472) },
+ { AOM_CDF4(2580, 4992, 7244) },
+ { AOM_CDF4(12136, 18049, 21426) },
+ { AOM_CDF4(13784, 20721, 24481) },
+ { AOM_CDF4(10836, 17621, 21900) },
+ { AOM_CDF4(8372, 14444, 18847) },
+ { AOM_CDF4(6523, 11779, 16000) },
+ { AOM_CDF4(5337, 9898, 13760) },
+ { AOM_CDF4(3034, 5860, 8462) } },
+ { { AOM_CDF4(15967, 22905, 26286) },
+ { AOM_CDF4(13534, 20654, 24579) },
+ { AOM_CDF4(9504, 16092, 20535) },
+ { AOM_CDF4(6975, 12568, 16903) },
+ { AOM_CDF4(5364, 10091, 14020) },
+ { AOM_CDF4(4357, 8370, 11857) },
+ { AOM_CDF4(2506, 4934, 7218) },
+ { AOM_CDF4(23032, 28815, 30936) },
+ { AOM_CDF4(19540, 26704, 29719) },
+ { AOM_CDF4(15158, 22969, 27097) },
+ { AOM_CDF4(11408, 18865, 23650) },
+ { AOM_CDF4(8885, 15448, 20250) },
+ { AOM_CDF4(7108, 12853, 17416) },
+ { AOM_CDF4(4231, 8041, 11480) },
+ { AOM_CDF4(19823, 26490, 29156) },
+ { AOM_CDF4(18890, 25929, 28932) },
+ { AOM_CDF4(15660, 23491, 27433) },
+ { AOM_CDF4(12147, 19776, 24488) },
+ { AOM_CDF4(9728, 16774, 21649) },
+ { AOM_CDF4(7919, 14277, 19066) },
+ { AOM_CDF4(5440, 10170, 14185) } } },
+ { { { AOM_CDF4(14406, 20862, 24414) },
+ { AOM_CDF4(11824, 18907, 23109) },
+ { AOM_CDF4(8257, 14393, 18803) },
+ { AOM_CDF4(5860, 10747, 14778) },
+ { AOM_CDF4(4475, 8486, 11984) },
+ { AOM_CDF4(3606, 6954, 10043) },
+ { AOM_CDF4(1736, 3410, 5048) },
+ { AOM_CDF4(14430, 20046, 22882) },
+ { AOM_CDF4(15593, 22899, 26709) },
+ { AOM_CDF4(12102, 19368, 23811) },
+ { AOM_CDF4(9059, 15584, 20262) },
+ { AOM_CDF4(6999, 12603, 17048) },
+ { AOM_CDF4(5684, 10497, 14553) },
+ { AOM_CDF4(2822, 5438, 7862) },
+ { AOM_CDF4(15785, 21585, 24359) },
+ { AOM_CDF4(18347, 25229, 28266) },
+ { AOM_CDF4(14974, 22487, 26389) },
+ { AOM_CDF4(11423, 18681, 23271) },
+ { AOM_CDF4(8863, 15350, 20008) },
+ { AOM_CDF4(7153, 12852, 17278) },
+ { AOM_CDF4(3707, 7036, 9982) } },
+ { { AOM_CDF4(15460, 21696, 25469) },
+ { AOM_CDF4(12170, 19249, 23191) },
+ { AOM_CDF4(8723, 15027, 19332) },
+ { AOM_CDF4(6428, 11704, 15874) },
+ { AOM_CDF4(4922, 9292, 13052) },
+ { AOM_CDF4(4139, 7695, 11010) },
+ { AOM_CDF4(2291, 4508, 6598) },
+ { AOM_CDF4(19856, 26920, 29828) },
+ { AOM_CDF4(17923, 25289, 28792) },
+ { AOM_CDF4(14278, 21968, 26297) },
+ { AOM_CDF4(10910, 18136, 22950) },
+ { AOM_CDF4(8423, 14815, 19627) },
+ { AOM_CDF4(6771, 12283, 16774) },
+ { AOM_CDF4(4074, 7750, 11081) },
+ { AOM_CDF4(19852, 26074, 28672) },
+ { AOM_CDF4(19371, 26110, 28989) },
+ { AOM_CDF4(16265, 23873, 27663) },
+ { AOM_CDF4(12758, 20378, 24952) },
+ { AOM_CDF4(10095, 17098, 21961) },
+ { AOM_CDF4(8250, 14628, 19451) },
+ { AOM_CDF4(5205, 9745, 13622) } } },
+ { { { AOM_CDF4(10563, 16233, 19763) },
+ { AOM_CDF4(9794, 16022, 19804) },
+ { AOM_CDF4(6750, 11945, 15759) },
+ { AOM_CDF4(4963, 9186, 12752) },
+ { AOM_CDF4(3845, 7435, 10627) },
+ { AOM_CDF4(3051, 6085, 8834) },
+ { AOM_CDF4(1311, 2596, 3830) },
+ { AOM_CDF4(11246, 16404, 19689) },
+ { AOM_CDF4(12315, 18911, 22731) },
+ { AOM_CDF4(10557, 17095, 21289) },
+ { AOM_CDF4(8136, 14006, 18249) },
+ { AOM_CDF4(6348, 11474, 15565) },
+ { AOM_CDF4(5196, 9655, 13400) },
+ { AOM_CDF4(2349, 4526, 6587) },
+ { AOM_CDF4(13337, 18730, 21569) },
+ { AOM_CDF4(19306, 26071, 28882) },
+ { AOM_CDF4(15952, 23540, 27254) },
+ { AOM_CDF4(12409, 19934, 24430) },
+ { AOM_CDF4(9760, 16706, 21389) },
+ { AOM_CDF4(8004, 14220, 18818) },
+ { AOM_CDF4(4138, 7794, 10961) } },
+ { { AOM_CDF4(10870, 16684, 20949) },
+ { AOM_CDF4(9664, 15230, 18680) },
+ { AOM_CDF4(6886, 12109, 15408) },
+ { AOM_CDF4(4825, 8900, 12305) },
+ { AOM_CDF4(3630, 7162, 10314) },
+ { AOM_CDF4(3036, 6429, 9387) },
+ { AOM_CDF4(1671, 3296, 4940) },
+ { AOM_CDF4(13819, 19159, 23026) },
+ { AOM_CDF4(11984, 19108, 23120) },
+ { AOM_CDF4(10690, 17210, 21663) },
+ { AOM_CDF4(7984, 14154, 18333) },
+ { AOM_CDF4(6868, 12294, 16124) },
+ { AOM_CDF4(5274, 8994, 12868) },
+ { AOM_CDF4(2988, 5771, 8424) },
+ { AOM_CDF4(19736, 26647, 29141) },
+ { AOM_CDF4(18933, 26070, 28984) },
+ { AOM_CDF4(15779, 23048, 27200) },
+ { AOM_CDF4(12638, 20061, 24532) },
+ { AOM_CDF4(10692, 17545, 22220) },
+ { AOM_CDF4(9217, 15251, 20054) },
+ { AOM_CDF4(5078, 9284, 12594) } } },
+ { { { AOM_CDF4(2331, 3662, 5244) },
+ { AOM_CDF4(2891, 4771, 6145) },
+ { AOM_CDF4(4598, 7623, 9729) },
+ { AOM_CDF4(3520, 6845, 9199) },
+ { AOM_CDF4(3417, 6119, 9324) },
+ { AOM_CDF4(2601, 5412, 7385) },
+ { AOM_CDF4(600, 1173, 1744) },
+ { AOM_CDF4(7672, 13286, 17469) },
+ { AOM_CDF4(4232, 7792, 10793) },
+ { AOM_CDF4(2915, 5317, 7397) },
+ { AOM_CDF4(2318, 4356, 6152) },
+ { AOM_CDF4(2127, 4000, 5554) },
+ { AOM_CDF4(1850, 3478, 5275) },
+ { AOM_CDF4(977, 1933, 2843) },
+ { AOM_CDF4(18280, 24387, 27989) },
+ { AOM_CDF4(15852, 22671, 26185) },
+ { AOM_CDF4(13845, 20951, 24789) },
+ { AOM_CDF4(11055, 17966, 22129) },
+ { AOM_CDF4(9138, 15422, 19801) },
+ { AOM_CDF4(7454, 13145, 17456) },
+ { AOM_CDF4(3370, 6393, 9013) } },
+ { { AOM_CDF4(5842, 9229, 10838) },
+ { AOM_CDF4(2313, 3491, 4276) },
+ { AOM_CDF4(2998, 6104, 7496) },
+ { AOM_CDF4(2420, 7447, 9868) },
+ { AOM_CDF4(3034, 8495, 10923) },
+ { AOM_CDF4(4076, 8937, 10975) },
+ { AOM_CDF4(1086, 2370, 3299) },
+ { AOM_CDF4(9714, 17254, 20444) },
+ { AOM_CDF4(8543, 13698, 17123) },
+ { AOM_CDF4(4918, 9007, 11910) },
+ { AOM_CDF4(4129, 7532, 10553) },
+ { AOM_CDF4(2364, 5533, 8058) },
+ { AOM_CDF4(1834, 3546, 5563) },
+ { AOM_CDF4(1473, 2908, 4133) },
+ { AOM_CDF4(15405, 21193, 25619) },
+ { AOM_CDF4(15691, 21952, 26561) },
+ { AOM_CDF4(12962, 19194, 24165) },
+ { AOM_CDF4(10272, 17855, 22129) },
+ { AOM_CDF4(8588, 15270, 20718) },
+ { AOM_CDF4(8682, 14669, 19500) },
+ { AOM_CDF4(4870, 9636, 13205) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(14995, 21341, 24749) },
+ { AOM_CDF4(13158, 20289, 24601) },
+ { AOM_CDF4(8941, 15326, 19876) },
+ { AOM_CDF4(6297, 11541, 15807) },
+ { AOM_CDF4(4817, 9029, 12776) },
+ { AOM_CDF4(3731, 7273, 10627) },
+ { AOM_CDF4(1847, 3617, 5354) },
+ { AOM_CDF4(14472, 19659, 22343) },
+ { AOM_CDF4(16806, 24162, 27533) },
+ { AOM_CDF4(12900, 20404, 24713) },
+ { AOM_CDF4(9411, 16112, 20797) },
+ { AOM_CDF4(7056, 12697, 17148) },
+ { AOM_CDF4(5544, 10339, 14460) },
+ { AOM_CDF4(2954, 5704, 8319) },
+ { AOM_CDF4(12464, 18071, 21354) },
+ { AOM_CDF4(15482, 22528, 26034) },
+ { AOM_CDF4(12070, 19269, 23624) },
+ { AOM_CDF4(8953, 15406, 20106) },
+ { AOM_CDF4(7027, 12730, 17220) },
+ { AOM_CDF4(5887, 10913, 15140) },
+ { AOM_CDF4(3793, 7278, 10447) } },
+ { { AOM_CDF4(15571, 22232, 25749) },
+ { AOM_CDF4(14506, 21575, 25374) },
+ { AOM_CDF4(10189, 17089, 21569) },
+ { AOM_CDF4(7316, 13301, 17915) },
+ { AOM_CDF4(5783, 10912, 15190) },
+ { AOM_CDF4(4760, 9155, 13088) },
+ { AOM_CDF4(2993, 5966, 8774) },
+ { AOM_CDF4(23424, 28903, 30778) },
+ { AOM_CDF4(20775, 27666, 30290) },
+ { AOM_CDF4(16474, 24410, 28299) },
+ { AOM_CDF4(12471, 20180, 24987) },
+ { AOM_CDF4(9410, 16487, 21439) },
+ { AOM_CDF4(7536, 13614, 18529) },
+ { AOM_CDF4(5048, 9586, 13549) },
+ { AOM_CDF4(21090, 27290, 29756) },
+ { AOM_CDF4(20796, 27402, 30026) },
+ { AOM_CDF4(17819, 25485, 28969) },
+ { AOM_CDF4(13860, 21909, 26462) },
+ { AOM_CDF4(11002, 18494, 23529) },
+ { AOM_CDF4(8953, 15929, 20897) },
+ { AOM_CDF4(6448, 11918, 16454) } } },
+ { { { AOM_CDF4(15999, 22208, 25449) },
+ { AOM_CDF4(13050, 19988, 24122) },
+ { AOM_CDF4(8594, 14864, 19378) },
+ { AOM_CDF4(6033, 11079, 15238) },
+ { AOM_CDF4(4554, 8683, 12347) },
+ { AOM_CDF4(3672, 7139, 10337) },
+ { AOM_CDF4(1900, 3771, 5576) },
+ { AOM_CDF4(15788, 21340, 23949) },
+ { AOM_CDF4(16825, 24235, 27758) },
+ { AOM_CDF4(12873, 20402, 24810) },
+ { AOM_CDF4(9590, 16363, 21094) },
+ { AOM_CDF4(7352, 13209, 17733) },
+ { AOM_CDF4(5960, 10989, 15184) },
+ { AOM_CDF4(3232, 6234, 9007) },
+ { AOM_CDF4(15761, 20716, 23224) },
+ { AOM_CDF4(19318, 25989, 28759) },
+ { AOM_CDF4(15529, 23094, 26929) },
+ { AOM_CDF4(11662, 18989, 23641) },
+ { AOM_CDF4(8955, 15568, 20366) },
+ { AOM_CDF4(7281, 13106, 17708) },
+ { AOM_CDF4(4248, 8059, 11440) } },
+ { { AOM_CDF4(14899, 21217, 24503) },
+ { AOM_CDF4(13519, 20283, 24047) },
+ { AOM_CDF4(9429, 15966, 20365) },
+ { AOM_CDF4(6700, 12355, 16652) },
+ { AOM_CDF4(5088, 9704, 13716) },
+ { AOM_CDF4(4243, 8154, 11731) },
+ { AOM_CDF4(2702, 5364, 7861) },
+ { AOM_CDF4(22745, 28388, 30454) },
+ { AOM_CDF4(20235, 27146, 29922) },
+ { AOM_CDF4(15896, 23715, 27637) },
+ { AOM_CDF4(11840, 19350, 24131) },
+ { AOM_CDF4(9122, 15932, 20880) },
+ { AOM_CDF4(7488, 13581, 18362) },
+ { AOM_CDF4(5114, 9568, 13370) },
+ { AOM_CDF4(20845, 26553, 28932) },
+ { AOM_CDF4(20981, 27372, 29884) },
+ { AOM_CDF4(17781, 25335, 28785) },
+ { AOM_CDF4(13760, 21708, 26297) },
+ { AOM_CDF4(10975, 18415, 23365) },
+ { AOM_CDF4(9045, 15789, 20686) },
+ { AOM_CDF4(6130, 11199, 15423) } } },
+ { { { AOM_CDF4(13549, 19724, 23158) },
+ { AOM_CDF4(11844, 18382, 22246) },
+ { AOM_CDF4(7919, 13619, 17773) },
+ { AOM_CDF4(5486, 10143, 13946) },
+ { AOM_CDF4(4166, 7983, 11324) },
+ { AOM_CDF4(3364, 6506, 9427) },
+ { AOM_CDF4(1598, 3160, 4674) },
+ { AOM_CDF4(15281, 20979, 23781) },
+ { AOM_CDF4(14939, 22119, 25952) },
+ { AOM_CDF4(11363, 18407, 22812) },
+ { AOM_CDF4(8609, 14857, 19370) },
+ { AOM_CDF4(6737, 12184, 16480) },
+ { AOM_CDF4(5506, 10263, 14262) },
+ { AOM_CDF4(2990, 5786, 8380) },
+ { AOM_CDF4(20249, 25253, 27417) },
+ { AOM_CDF4(21070, 27518, 30001) },
+ { AOM_CDF4(16854, 24469, 28074) },
+ { AOM_CDF4(12864, 20486, 25000) },
+ { AOM_CDF4(9962, 16978, 21778) },
+ { AOM_CDF4(8074, 14338, 19048) },
+ { AOM_CDF4(4494, 8479, 11906) } },
+ { { AOM_CDF4(13960, 19617, 22829) },
+ { AOM_CDF4(11150, 17341, 21228) },
+ { AOM_CDF4(7150, 12964, 17190) },
+ { AOM_CDF4(5331, 10002, 13867) },
+ { AOM_CDF4(4167, 7744, 11057) },
+ { AOM_CDF4(3480, 6629, 9646) },
+ { AOM_CDF4(1883, 3784, 5686) },
+ { AOM_CDF4(18752, 25660, 28912) },
+ { AOM_CDF4(16968, 24586, 28030) },
+ { AOM_CDF4(13520, 21055, 25313) },
+ { AOM_CDF4(10453, 17626, 22280) },
+ { AOM_CDF4(8386, 14505, 19116) },
+ { AOM_CDF4(6742, 12595, 17008) },
+ { AOM_CDF4(4273, 8140, 11499) },
+ { AOM_CDF4(22120, 27827, 30233) },
+ { AOM_CDF4(20563, 27358, 29895) },
+ { AOM_CDF4(17076, 24644, 28153) },
+ { AOM_CDF4(13362, 20942, 25309) },
+ { AOM_CDF4(10794, 17965, 22695) },
+ { AOM_CDF4(9014, 15652, 20319) },
+ { AOM_CDF4(5708, 10512, 14497) } } },
+ { { { AOM_CDF4(5705, 10930, 15725) },
+ { AOM_CDF4(7946, 12765, 16115) },
+ { AOM_CDF4(6801, 12123, 16226) },
+ { AOM_CDF4(5462, 10135, 14200) },
+ { AOM_CDF4(4189, 8011, 11507) },
+ { AOM_CDF4(3191, 6229, 9408) },
+ { AOM_CDF4(1057, 2137, 3212) },
+ { AOM_CDF4(10018, 17067, 21491) },
+ { AOM_CDF4(7380, 12582, 16453) },
+ { AOM_CDF4(6068, 10845, 14339) },
+ { AOM_CDF4(5098, 9198, 12555) },
+ { AOM_CDF4(4312, 8010, 11119) },
+ { AOM_CDF4(3700, 6966, 9781) },
+ { AOM_CDF4(1693, 3326, 4887) },
+ { AOM_CDF4(18757, 24930, 27774) },
+ { AOM_CDF4(17648, 24596, 27817) },
+ { AOM_CDF4(14707, 22052, 26026) },
+ { AOM_CDF4(11720, 18852, 23292) },
+ { AOM_CDF4(9357, 15952, 20525) },
+ { AOM_CDF4(7810, 13753, 18210) },
+ { AOM_CDF4(3879, 7333, 10328) } },
+ { { AOM_CDF4(8278, 13242, 15922) },
+ { AOM_CDF4(10547, 15867, 18919) },
+ { AOM_CDF4(9106, 15842, 20609) },
+ { AOM_CDF4(6833, 13007, 17218) },
+ { AOM_CDF4(4811, 9712, 13923) },
+ { AOM_CDF4(3985, 7352, 11128) },
+ { AOM_CDF4(1688, 3458, 5262) },
+ { AOM_CDF4(12951, 21861, 26510) },
+ { AOM_CDF4(9788, 16044, 20276) },
+ { AOM_CDF4(6309, 11244, 14870) },
+ { AOM_CDF4(5183, 9349, 12566) },
+ { AOM_CDF4(4389, 8229, 11492) },
+ { AOM_CDF4(3633, 6945, 10620) },
+ { AOM_CDF4(3600, 6847, 9907) },
+ { AOM_CDF4(21748, 28137, 30255) },
+ { AOM_CDF4(19436, 26581, 29560) },
+ { AOM_CDF4(16359, 24201, 27953) },
+ { AOM_CDF4(13961, 21693, 25871) },
+ { AOM_CDF4(11544, 18686, 23322) },
+ { AOM_CDF4(9372, 16462, 20952) },
+ { AOM_CDF4(6138, 11210, 15390) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(16138, 22223, 25509) },
+ { AOM_CDF4(15347, 22430, 26332) },
+ { AOM_CDF4(9614, 16736, 21332) },
+ { AOM_CDF4(6600, 12275, 16907) },
+ { AOM_CDF4(4811, 9424, 13547) },
+ { AOM_CDF4(3748, 7809, 11420) },
+ { AOM_CDF4(2254, 4587, 6890) },
+ { AOM_CDF4(15196, 20284, 23177) },
+ { AOM_CDF4(18317, 25469, 28451) },
+ { AOM_CDF4(13918, 21651, 25842) },
+ { AOM_CDF4(10052, 17150, 21995) },
+ { AOM_CDF4(7499, 13630, 18587) },
+ { AOM_CDF4(6158, 11417, 16003) },
+ { AOM_CDF4(4014, 7785, 11252) },
+ { AOM_CDF4(15048, 21067, 24384) },
+ { AOM_CDF4(18202, 25346, 28553) },
+ { AOM_CDF4(14302, 22019, 26356) },
+ { AOM_CDF4(10839, 18139, 23166) },
+ { AOM_CDF4(8715, 15744, 20806) },
+ { AOM_CDF4(7536, 13576, 18544) },
+ { AOM_CDF4(5413, 10335, 14498) } },
+ { { AOM_CDF4(17394, 24501, 27895) },
+ { AOM_CDF4(15889, 23420, 27185) },
+ { AOM_CDF4(11561, 19133, 23870) },
+ { AOM_CDF4(8285, 14812, 19844) },
+ { AOM_CDF4(6496, 12043, 16550) },
+ { AOM_CDF4(4771, 9574, 13677) },
+ { AOM_CDF4(3603, 6830, 10144) },
+ { AOM_CDF4(21656, 27704, 30200) },
+ { AOM_CDF4(21324, 27915, 30511) },
+ { AOM_CDF4(17327, 25336, 28997) },
+ { AOM_CDF4(13417, 21381, 26033) },
+ { AOM_CDF4(10132, 17425, 22338) },
+ { AOM_CDF4(8580, 15016, 19633) },
+ { AOM_CDF4(5694, 11477, 16411) },
+ { AOM_CDF4(24116, 29780, 31450) },
+ { AOM_CDF4(23853, 29695, 31591) },
+ { AOM_CDF4(20085, 27614, 30428) },
+ { AOM_CDF4(15326, 24335, 28575) },
+ { AOM_CDF4(11814, 19472, 24810) },
+ { AOM_CDF4(10221, 18611, 24767) },
+ { AOM_CDF4(7689, 14558, 20321) } } },
+ { { { AOM_CDF4(16214, 22380, 25770) },
+ { AOM_CDF4(14213, 21304, 25295) },
+ { AOM_CDF4(9213, 15823, 20455) },
+ { AOM_CDF4(6395, 11758, 16139) },
+ { AOM_CDF4(4779, 9187, 13066) },
+ { AOM_CDF4(3821, 7501, 10953) },
+ { AOM_CDF4(2293, 4567, 6795) },
+ { AOM_CDF4(15859, 21283, 23820) },
+ { AOM_CDF4(18404, 25602, 28726) },
+ { AOM_CDF4(14325, 21980, 26206) },
+ { AOM_CDF4(10669, 17937, 22720) },
+ { AOM_CDF4(8297, 14642, 19447) },
+ { AOM_CDF4(6746, 12389, 16893) },
+ { AOM_CDF4(4324, 8251, 11770) },
+ { AOM_CDF4(16532, 21631, 24475) },
+ { AOM_CDF4(20667, 27150, 29668) },
+ { AOM_CDF4(16728, 24510, 28175) },
+ { AOM_CDF4(12861, 20645, 25332) },
+ { AOM_CDF4(10076, 17361, 22417) },
+ { AOM_CDF4(8395, 14940, 19963) },
+ { AOM_CDF4(5731, 10683, 14912) } },
+ { { AOM_CDF4(14433, 21155, 24938) },
+ { AOM_CDF4(14658, 21716, 25545) },
+ { AOM_CDF4(9923, 16824, 21557) },
+ { AOM_CDF4(6982, 13052, 17721) },
+ { AOM_CDF4(5419, 10503, 15050) },
+ { AOM_CDF4(4852, 9162, 13014) },
+ { AOM_CDF4(3271, 6395, 9630) },
+ { AOM_CDF4(22210, 27833, 30109) },
+ { AOM_CDF4(20750, 27368, 29821) },
+ { AOM_CDF4(16894, 24828, 28573) },
+ { AOM_CDF4(13247, 21276, 25757) },
+ { AOM_CDF4(10038, 17265, 22563) },
+ { AOM_CDF4(8587, 14947, 20327) },
+ { AOM_CDF4(5645, 11371, 15252) },
+ { AOM_CDF4(22027, 27526, 29714) },
+ { AOM_CDF4(23098, 29146, 31221) },
+ { AOM_CDF4(19886, 27341, 30272) },
+ { AOM_CDF4(15609, 23747, 28046) },
+ { AOM_CDF4(11993, 20065, 24939) },
+ { AOM_CDF4(9637, 18267, 23671) },
+ { AOM_CDF4(7625, 13801, 19144) } } },
+ { { { AOM_CDF4(14438, 20798, 24089) },
+ { AOM_CDF4(12621, 19203, 23097) },
+ { AOM_CDF4(8177, 14125, 18402) },
+ { AOM_CDF4(5674, 10501, 14456) },
+ { AOM_CDF4(4236, 8239, 11733) },
+ { AOM_CDF4(3447, 6750, 9806) },
+ { AOM_CDF4(1986, 3950, 5864) },
+ { AOM_CDF4(16208, 22099, 24930) },
+ { AOM_CDF4(16537, 24025, 27585) },
+ { AOM_CDF4(12780, 20381, 24867) },
+ { AOM_CDF4(9767, 16612, 21416) },
+ { AOM_CDF4(7686, 13738, 18398) },
+ { AOM_CDF4(6333, 11614, 15964) },
+ { AOM_CDF4(3941, 7571, 10836) },
+ { AOM_CDF4(22819, 27422, 29202) },
+ { AOM_CDF4(22224, 28514, 30721) },
+ { AOM_CDF4(17660, 25433, 28913) },
+ { AOM_CDF4(13574, 21482, 26002) },
+ { AOM_CDF4(10629, 17977, 22938) },
+ { AOM_CDF4(8612, 15298, 20265) },
+ { AOM_CDF4(5607, 10491, 14596) } },
+ { { AOM_CDF4(13569, 19800, 23206) },
+ { AOM_CDF4(13128, 19924, 23869) },
+ { AOM_CDF4(8329, 14841, 19403) },
+ { AOM_CDF4(6130, 10976, 15057) },
+ { AOM_CDF4(4682, 8839, 12518) },
+ { AOM_CDF4(3656, 7409, 10588) },
+ { AOM_CDF4(2577, 5099, 7412) },
+ { AOM_CDF4(22427, 28684, 30585) },
+ { AOM_CDF4(20913, 27750, 30139) },
+ { AOM_CDF4(15840, 24109, 27834) },
+ { AOM_CDF4(12308, 20029, 24569) },
+ { AOM_CDF4(10216, 16785, 21458) },
+ { AOM_CDF4(8309, 14203, 19113) },
+ { AOM_CDF4(6043, 11168, 15307) },
+ { AOM_CDF4(23166, 28901, 30998) },
+ { AOM_CDF4(21899, 28405, 30751) },
+ { AOM_CDF4(18413, 26091, 29443) },
+ { AOM_CDF4(15233, 23114, 27352) },
+ { AOM_CDF4(12683, 20472, 25288) },
+ { AOM_CDF4(10702, 18259, 23409) },
+ { AOM_CDF4(8125, 14464, 19226) } } },
+ { { { AOM_CDF4(9040, 14786, 18360) },
+ { AOM_CDF4(9979, 15718, 19415) },
+ { AOM_CDF4(7913, 13918, 18311) },
+ { AOM_CDF4(5859, 10889, 15184) },
+ { AOM_CDF4(4593, 8677, 12510) },
+ { AOM_CDF4(3820, 7396, 10791) },
+ { AOM_CDF4(1730, 3471, 5192) },
+ { AOM_CDF4(11803, 18365, 22709) },
+ { AOM_CDF4(11419, 18058, 22225) },
+ { AOM_CDF4(9418, 15774, 20243) },
+ { AOM_CDF4(7539, 13325, 17657) },
+ { AOM_CDF4(6233, 11317, 15384) },
+ { AOM_CDF4(5137, 9656, 13545) },
+ { AOM_CDF4(2977, 5774, 8349) },
+ { AOM_CDF4(21207, 27246, 29640) },
+ { AOM_CDF4(19547, 26578, 29497) },
+ { AOM_CDF4(16169, 23871, 27690) },
+ { AOM_CDF4(12820, 20458, 25018) },
+ { AOM_CDF4(10224, 17332, 22214) },
+ { AOM_CDF4(8526, 15048, 19884) },
+ { AOM_CDF4(5037, 9410, 13118) } },
+ { { AOM_CDF4(12339, 17329, 20140) },
+ { AOM_CDF4(13505, 19895, 23225) },
+ { AOM_CDF4(9847, 16944, 21564) },
+ { AOM_CDF4(7280, 13256, 18348) },
+ { AOM_CDF4(4712, 10009, 14454) },
+ { AOM_CDF4(4361, 7914, 12477) },
+ { AOM_CDF4(2870, 5628, 7995) },
+ { AOM_CDF4(20061, 25504, 28526) },
+ { AOM_CDF4(15235, 22878, 26145) },
+ { AOM_CDF4(12985, 19958, 24155) },
+ { AOM_CDF4(9782, 16641, 21403) },
+ { AOM_CDF4(9456, 16360, 20760) },
+ { AOM_CDF4(6855, 12940, 18557) },
+ { AOM_CDF4(5661, 10564, 15002) },
+ { AOM_CDF4(25656, 30602, 31894) },
+ { AOM_CDF4(22570, 29107, 31092) },
+ { AOM_CDF4(18917, 26423, 29541) },
+ { AOM_CDF4(15940, 23649, 27754) },
+ { AOM_CDF4(12803, 20581, 25219) },
+ { AOM_CDF4(11082, 18695, 23376) },
+ { AOM_CDF4(7939, 14373, 19005) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(18315, 24289, 27551) },
+ { AOM_CDF4(16854, 24068, 27835) },
+ { AOM_CDF4(10140, 17927, 23173) },
+ { AOM_CDF4(6722, 12982, 18267) },
+ { AOM_CDF4(4661, 9826, 14706) },
+ { AOM_CDF4(3832, 8165, 12294) },
+ { AOM_CDF4(2795, 6098, 9245) },
+ { AOM_CDF4(17145, 23326, 26672) },
+ { AOM_CDF4(20733, 27680, 30308) },
+ { AOM_CDF4(16032, 24461, 28546) },
+ { AOM_CDF4(11653, 20093, 25081) },
+ { AOM_CDF4(9290, 16429, 22086) },
+ { AOM_CDF4(7796, 14598, 19982) },
+ { AOM_CDF4(6502, 12378, 17441) },
+ { AOM_CDF4(21681, 27732, 30320) },
+ { AOM_CDF4(22389, 29044, 31261) },
+ { AOM_CDF4(19027, 26731, 30087) },
+ { AOM_CDF4(14739, 23755, 28624) },
+ { AOM_CDF4(11358, 20778, 25511) },
+ { AOM_CDF4(10995, 18073, 24190) },
+ { AOM_CDF4(9162, 14990, 20617) } },
+ { { AOM_CDF4(21425, 27952, 30388) },
+ { AOM_CDF4(18062, 25838, 29034) },
+ { AOM_CDF4(11956, 19881, 24808) },
+ { AOM_CDF4(7718, 15000, 20980) },
+ { AOM_CDF4(5702, 11254, 16143) },
+ { AOM_CDF4(4898, 9088, 16864) },
+ { AOM_CDF4(3679, 6776, 11907) },
+ { AOM_CDF4(23294, 30160, 31663) },
+ { AOM_CDF4(24397, 29896, 31836) },
+ { AOM_CDF4(19245, 27128, 30593) },
+ { AOM_CDF4(13202, 19825, 26404) },
+ { AOM_CDF4(11578, 19297, 23957) },
+ { AOM_CDF4(8073, 13297, 21370) },
+ { AOM_CDF4(5461, 10923, 19745) },
+ { AOM_CDF4(27367, 30521, 31934) },
+ { AOM_CDF4(24904, 30671, 31940) },
+ { AOM_CDF4(23075, 28460, 31299) },
+ { AOM_CDF4(14400, 23658, 30417) },
+ { AOM_CDF4(13885, 23882, 28325) },
+ { AOM_CDF4(14746, 22938, 27853) },
+ { AOM_CDF4(5461, 16384, 27307) } } },
+ { { { AOM_CDF4(18274, 24813, 27890) },
+ { AOM_CDF4(15537, 23149, 27003) },
+ { AOM_CDF4(9449, 16740, 21827) },
+ { AOM_CDF4(6700, 12498, 17261) },
+ { AOM_CDF4(4988, 9866, 14198) },
+ { AOM_CDF4(4236, 8147, 11902) },
+ { AOM_CDF4(2867, 5860, 8654) },
+ { AOM_CDF4(17124, 23171, 26101) },
+ { AOM_CDF4(20396, 27477, 30148) },
+ { AOM_CDF4(16573, 24629, 28492) },
+ { AOM_CDF4(12749, 20846, 25674) },
+ { AOM_CDF4(10233, 17878, 22818) },
+ { AOM_CDF4(8525, 15332, 20363) },
+ { AOM_CDF4(6283, 11632, 16255) },
+ { AOM_CDF4(20466, 26511, 29286) },
+ { AOM_CDF4(23059, 29174, 31191) },
+ { AOM_CDF4(19481, 27263, 30241) },
+ { AOM_CDF4(15458, 23631, 28137) },
+ { AOM_CDF4(12416, 20608, 25693) },
+ { AOM_CDF4(10261, 18011, 23261) },
+ { AOM_CDF4(8016, 14655, 19666) } },
+ { { AOM_CDF4(17616, 24586, 28112) },
+ { AOM_CDF4(15809, 23299, 27155) },
+ { AOM_CDF4(10767, 18890, 23793) },
+ { AOM_CDF4(7727, 14255, 18865) },
+ { AOM_CDF4(6129, 11926, 16882) },
+ { AOM_CDF4(4482, 9704, 14861) },
+ { AOM_CDF4(3277, 7452, 11522) },
+ { AOM_CDF4(22956, 28551, 30730) },
+ { AOM_CDF4(22724, 28937, 30961) },
+ { AOM_CDF4(18467, 26324, 29580) },
+ { AOM_CDF4(13234, 20713, 25649) },
+ { AOM_CDF4(11181, 17592, 22481) },
+ { AOM_CDF4(8291, 18358, 24576) },
+ { AOM_CDF4(7568, 11881, 14984) },
+ { AOM_CDF4(24948, 29001, 31147) },
+ { AOM_CDF4(25674, 30619, 32151) },
+ { AOM_CDF4(20841, 26793, 29603) },
+ { AOM_CDF4(14669, 24356, 28666) },
+ { AOM_CDF4(11334, 23593, 28219) },
+ { AOM_CDF4(8922, 14762, 22873) },
+ { AOM_CDF4(8301, 13544, 20535) } } },
+ { { { AOM_CDF4(17113, 23733, 27081) },
+ { AOM_CDF4(14139, 21406, 25452) },
+ { AOM_CDF4(8552, 15002, 19776) },
+ { AOM_CDF4(5871, 11120, 15378) },
+ { AOM_CDF4(4455, 8616, 12253) },
+ { AOM_CDF4(3469, 6910, 10386) },
+ { AOM_CDF4(2255, 4553, 6782) },
+ { AOM_CDF4(18224, 24376, 27053) },
+ { AOM_CDF4(19290, 26710, 29614) },
+ { AOM_CDF4(14936, 22991, 27184) },
+ { AOM_CDF4(11238, 18951, 23762) },
+ { AOM_CDF4(8786, 15617, 20588) },
+ { AOM_CDF4(7317, 13228, 18003) },
+ { AOM_CDF4(5101, 9512, 13493) },
+ { AOM_CDF4(22639, 28222, 30210) },
+ { AOM_CDF4(23216, 29331, 31307) },
+ { AOM_CDF4(19075, 26762, 29895) },
+ { AOM_CDF4(15014, 23113, 27457) },
+ { AOM_CDF4(11938, 19857, 24752) },
+ { AOM_CDF4(9942, 17280, 22282) },
+ { AOM_CDF4(7167, 13144, 17752) } },
+ { { AOM_CDF4(15820, 22738, 26488) },
+ { AOM_CDF4(13530, 20885, 25216) },
+ { AOM_CDF4(8395, 15530, 20452) },
+ { AOM_CDF4(6574, 12321, 16380) },
+ { AOM_CDF4(5353, 10419, 14568) },
+ { AOM_CDF4(4613, 8446, 12381) },
+ { AOM_CDF4(3440, 7158, 9903) },
+ { AOM_CDF4(24247, 29051, 31224) },
+ { AOM_CDF4(22118, 28058, 30369) },
+ { AOM_CDF4(16498, 24768, 28389) },
+ { AOM_CDF4(12920, 21175, 26137) },
+ { AOM_CDF4(10730, 18619, 25352) },
+ { AOM_CDF4(10187, 16279, 22791) },
+ { AOM_CDF4(9310, 14631, 22127) },
+ { AOM_CDF4(24970, 30558, 32057) },
+ { AOM_CDF4(24801, 29942, 31698) },
+ { AOM_CDF4(22432, 28453, 30855) },
+ { AOM_CDF4(19054, 25680, 29580) },
+ { AOM_CDF4(14392, 23036, 28109) },
+ { AOM_CDF4(12495, 20947, 26650) },
+ { AOM_CDF4(12442, 20326, 26214) } } },
+ { { { AOM_CDF4(12162, 18785, 22648) },
+ { AOM_CDF4(12749, 19697, 23806) },
+ { AOM_CDF4(8580, 15297, 20346) },
+ { AOM_CDF4(6169, 11749, 16543) },
+ { AOM_CDF4(4836, 9391, 13448) },
+ { AOM_CDF4(3821, 7711, 11613) },
+ { AOM_CDF4(2228, 4601, 7070) },
+ { AOM_CDF4(16319, 24725, 28280) },
+ { AOM_CDF4(15698, 23277, 27168) },
+ { AOM_CDF4(12726, 20368, 25047) },
+ { AOM_CDF4(9912, 17015, 21976) },
+ { AOM_CDF4(7888, 14220, 19179) },
+ { AOM_CDF4(6777, 12284, 17018) },
+ { AOM_CDF4(4492, 8590, 12252) },
+ { AOM_CDF4(23249, 28904, 30947) },
+ { AOM_CDF4(21050, 27908, 30512) },
+ { AOM_CDF4(17440, 25340, 28949) },
+ { AOM_CDF4(14059, 22018, 26541) },
+ { AOM_CDF4(11288, 18903, 23898) },
+ { AOM_CDF4(9411, 16342, 21428) },
+ { AOM_CDF4(6278, 11588, 15944) } },
+ { { AOM_CDF4(13981, 20067, 23226) },
+ { AOM_CDF4(16922, 23580, 26783) },
+ { AOM_CDF4(11005, 19039, 24487) },
+ { AOM_CDF4(7389, 14218, 19798) },
+ { AOM_CDF4(5598, 11505, 17206) },
+ { AOM_CDF4(6090, 11213, 15659) },
+ { AOM_CDF4(3820, 7371, 10119) },
+ { AOM_CDF4(21082, 26925, 29675) },
+ { AOM_CDF4(21262, 28627, 31128) },
+ { AOM_CDF4(18392, 26454, 30437) },
+ { AOM_CDF4(14870, 22910, 27096) },
+ { AOM_CDF4(12620, 19484, 24908) },
+ { AOM_CDF4(9290, 16553, 22802) },
+ { AOM_CDF4(6668, 14288, 20004) },
+ { AOM_CDF4(27704, 31055, 31949) },
+ { AOM_CDF4(24709, 29978, 31788) },
+ { AOM_CDF4(21668, 29264, 31657) },
+ { AOM_CDF4(18295, 26968, 30074) },
+ { AOM_CDF4(16399, 24422, 29313) },
+ { AOM_CDF4(14347, 23026, 28104) },
+ { AOM_CDF4(12370, 19806, 24477) } } },
+ { { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) }, { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } }
+ };
+
+static const aom_cdf_prob av1_default_coeff_base_multi_cdfs
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS]
+ [CDF_SIZE(NUM_BASE_LEVELS + 2)] =
+ { { { { { AOM_CDF4(4034, 8930, 12727) },
+ { AOM_CDF4(18082, 29741, 31877) },
+ { AOM_CDF4(12596, 26124, 30493) },
+ { AOM_CDF4(9446, 21118, 27005) },
+ { AOM_CDF4(6308, 15141, 21279) },
+ { AOM_CDF4(2463, 6357, 9783) },
+ { AOM_CDF4(20667, 30546, 31929) },
+ { AOM_CDF4(13043, 26123, 30134) },
+ { AOM_CDF4(8151, 18757, 24778) },
+ { AOM_CDF4(5255, 12839, 18632) },
+ { AOM_CDF4(2820, 7206, 11161) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(15736, 27553, 30604) },
+ { AOM_CDF4(11210, 23794, 28787) },
+ { AOM_CDF4(5947, 13874, 19701) },
+ { AOM_CDF4(4215, 9323, 13891) },
+ { AOM_CDF4(2833, 6462, 10059) },
+ { AOM_CDF4(19605, 30393, 31582) },
+ { AOM_CDF4(13523, 26252, 30248) },
+ { AOM_CDF4(8446, 18622, 24512) },
+ { AOM_CDF4(3818, 10343, 15974) },
+ { AOM_CDF4(1481, 4117, 6796) },
+ { AOM_CDF4(22649, 31302, 32190) },
+ { AOM_CDF4(14829, 27127, 30449) },
+ { AOM_CDF4(8313, 17702, 23304) },
+ { AOM_CDF4(3022, 8301, 12786) },
+ { AOM_CDF4(1536, 4412, 7184) },
+ { AOM_CDF4(22354, 29774, 31372) },
+ { AOM_CDF4(14723, 25472, 29214) },
+ { AOM_CDF4(6673, 13745, 18662) },
+ { AOM_CDF4(2068, 5766, 9322) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6302, 16444, 21761) },
+ { AOM_CDF4(23040, 31538, 32475) },
+ { AOM_CDF4(15196, 28452, 31496) },
+ { AOM_CDF4(10020, 22946, 28514) },
+ { AOM_CDF4(6533, 16862, 23501) },
+ { AOM_CDF4(3538, 9816, 15076) },
+ { AOM_CDF4(24444, 31875, 32525) },
+ { AOM_CDF4(15881, 28924, 31635) },
+ { AOM_CDF4(9922, 22873, 28466) },
+ { AOM_CDF4(6527, 16966, 23691) },
+ { AOM_CDF4(4114, 11303, 17220) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20201, 30770, 32209) },
+ { AOM_CDF4(14754, 28071, 31258) },
+ { AOM_CDF4(8378, 20186, 26517) },
+ { AOM_CDF4(5916, 15299, 21978) },
+ { AOM_CDF4(4268, 11583, 17901) },
+ { AOM_CDF4(24361, 32025, 32581) },
+ { AOM_CDF4(18673, 30105, 31943) },
+ { AOM_CDF4(10196, 22244, 27576) },
+ { AOM_CDF4(5495, 14349, 20417) },
+ { AOM_CDF4(2676, 7415, 11498) },
+ { AOM_CDF4(24678, 31958, 32585) },
+ { AOM_CDF4(18629, 29906, 31831) },
+ { AOM_CDF4(9364, 20724, 26315) },
+ { AOM_CDF4(4641, 12318, 18094) },
+ { AOM_CDF4(2758, 7387, 11579) },
+ { AOM_CDF4(25433, 31842, 32469) },
+ { AOM_CDF4(18795, 29289, 31411) },
+ { AOM_CDF4(7644, 17584, 23592) },
+ { AOM_CDF4(3408, 9014, 15047) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4536, 10072, 14001) },
+ { AOM_CDF4(25459, 31416, 32206) },
+ { AOM_CDF4(16605, 28048, 30818) },
+ { AOM_CDF4(11008, 22857, 27719) },
+ { AOM_CDF4(6915, 16268, 22315) },
+ { AOM_CDF4(2625, 6812, 10537) },
+ { AOM_CDF4(24257, 31788, 32499) },
+ { AOM_CDF4(16880, 29454, 31879) },
+ { AOM_CDF4(11958, 25054, 29778) },
+ { AOM_CDF4(7916, 18718, 25084) },
+ { AOM_CDF4(3383, 8777, 13446) },
+ { AOM_CDF4(22720, 31603, 32393) },
+ { AOM_CDF4(14960, 28125, 31335) },
+ { AOM_CDF4(9731, 22210, 27928) },
+ { AOM_CDF4(6304, 15832, 22277) },
+ { AOM_CDF4(2910, 7818, 12166) },
+ { AOM_CDF4(20375, 30627, 32131) },
+ { AOM_CDF4(13904, 27284, 30887) },
+ { AOM_CDF4(9368, 21558, 27144) },
+ { AOM_CDF4(5937, 14966, 21119) },
+ { AOM_CDF4(2667, 7225, 11319) },
+ { AOM_CDF4(23970, 31470, 32378) },
+ { AOM_CDF4(17173, 29734, 32018) },
+ { AOM_CDF4(12795, 25441, 29965) },
+ { AOM_CDF4(8981, 19680, 25893) },
+ { AOM_CDF4(4728, 11372, 16902) },
+ { AOM_CDF4(24287, 31797, 32439) },
+ { AOM_CDF4(16703, 29145, 31696) },
+ { AOM_CDF4(10833, 23554, 28725) },
+ { AOM_CDF4(6468, 16566, 23057) },
+ { AOM_CDF4(2415, 6562, 10278) },
+ { AOM_CDF4(26610, 32395, 32659) },
+ { AOM_CDF4(18590, 30498, 32117) },
+ { AOM_CDF4(12420, 25756, 29950) },
+ { AOM_CDF4(7639, 18746, 24710) },
+ { AOM_CDF4(3001, 8086, 12347) },
+ { AOM_CDF4(25076, 32064, 32580) },
+ { AOM_CDF4(17946, 30128, 32028) },
+ { AOM_CDF4(12024, 24985, 29378) },
+ { AOM_CDF4(7517, 18390, 24304) },
+ { AOM_CDF4(3243, 8781, 13331) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6037, 16771, 21957) },
+ { AOM_CDF4(24774, 31704, 32426) },
+ { AOM_CDF4(16830, 28589, 31056) },
+ { AOM_CDF4(10602, 22828, 27760) },
+ { AOM_CDF4(6733, 16829, 23071) },
+ { AOM_CDF4(3250, 8914, 13556) },
+ { AOM_CDF4(25582, 32220, 32668) },
+ { AOM_CDF4(18659, 30342, 32223) },
+ { AOM_CDF4(12546, 26149, 30515) },
+ { AOM_CDF4(8420, 20451, 26801) },
+ { AOM_CDF4(4636, 12420, 18344) },
+ { AOM_CDF4(27581, 32362, 32639) },
+ { AOM_CDF4(18987, 30083, 31978) },
+ { AOM_CDF4(11327, 24248, 29084) },
+ { AOM_CDF4(7264, 17719, 24120) },
+ { AOM_CDF4(3995, 10768, 16169) },
+ { AOM_CDF4(25893, 31831, 32487) },
+ { AOM_CDF4(16577, 28587, 31379) },
+ { AOM_CDF4(10189, 22748, 28182) },
+ { AOM_CDF4(6832, 17094, 23556) },
+ { AOM_CDF4(3708, 10110, 15334) },
+ { AOM_CDF4(25904, 32282, 32656) },
+ { AOM_CDF4(19721, 30792, 32276) },
+ { AOM_CDF4(12819, 26243, 30411) },
+ { AOM_CDF4(8572, 20614, 26891) },
+ { AOM_CDF4(5364, 14059, 20467) },
+ { AOM_CDF4(26580, 32438, 32677) },
+ { AOM_CDF4(20852, 31225, 32340) },
+ { AOM_CDF4(12435, 25700, 29967) },
+ { AOM_CDF4(8691, 20825, 26976) },
+ { AOM_CDF4(4446, 12209, 17269) },
+ { AOM_CDF4(27350, 32429, 32696) },
+ { AOM_CDF4(21372, 30977, 32272) },
+ { AOM_CDF4(12673, 25270, 29853) },
+ { AOM_CDF4(9208, 20925, 26640) },
+ { AOM_CDF4(5018, 13351, 18732) },
+ { AOM_CDF4(27351, 32479, 32713) },
+ { AOM_CDF4(21398, 31209, 32387) },
+ { AOM_CDF4(12162, 25047, 29842) },
+ { AOM_CDF4(7896, 18691, 25319) },
+ { AOM_CDF4(4670, 12882, 18881) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5487, 10460, 13708) },
+ { AOM_CDF4(21597, 28303, 30674) },
+ { AOM_CDF4(11037, 21953, 26476) },
+ { AOM_CDF4(8147, 17962, 22952) },
+ { AOM_CDF4(5242, 13061, 18532) },
+ { AOM_CDF4(1889, 5208, 8182) },
+ { AOM_CDF4(26774, 32133, 32590) },
+ { AOM_CDF4(17844, 29564, 31767) },
+ { AOM_CDF4(11690, 24438, 29171) },
+ { AOM_CDF4(7542, 18215, 24459) },
+ { AOM_CDF4(2993, 8050, 12319) },
+ { AOM_CDF4(28023, 32328, 32591) },
+ { AOM_CDF4(18651, 30126, 31954) },
+ { AOM_CDF4(12164, 25146, 29589) },
+ { AOM_CDF4(7762, 18530, 24771) },
+ { AOM_CDF4(3492, 9183, 13920) },
+ { AOM_CDF4(27591, 32008, 32491) },
+ { AOM_CDF4(17149, 28853, 31510) },
+ { AOM_CDF4(11485, 24003, 28860) },
+ { AOM_CDF4(7697, 18086, 24210) },
+ { AOM_CDF4(3075, 7999, 12218) },
+ { AOM_CDF4(28268, 32482, 32654) },
+ { AOM_CDF4(19631, 31051, 32404) },
+ { AOM_CDF4(13860, 27260, 31020) },
+ { AOM_CDF4(9605, 21613, 27594) },
+ { AOM_CDF4(4876, 12162, 17908) },
+ { AOM_CDF4(27248, 32316, 32576) },
+ { AOM_CDF4(18955, 30457, 32075) },
+ { AOM_CDF4(11824, 23997, 28795) },
+ { AOM_CDF4(7346, 18196, 24647) },
+ { AOM_CDF4(3403, 9247, 14111) },
+ { AOM_CDF4(29711, 32655, 32735) },
+ { AOM_CDF4(21169, 31394, 32417) },
+ { AOM_CDF4(13487, 27198, 30957) },
+ { AOM_CDF4(8828, 21683, 27614) },
+ { AOM_CDF4(4270, 11451, 17038) },
+ { AOM_CDF4(28708, 32578, 32731) },
+ { AOM_CDF4(20120, 31241, 32482) },
+ { AOM_CDF4(13692, 27550, 31321) },
+ { AOM_CDF4(9418, 22514, 28439) },
+ { AOM_CDF4(4999, 13283, 19462) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5673, 14302, 19711) },
+ { AOM_CDF4(26251, 30701, 31834) },
+ { AOM_CDF4(12782, 23783, 27803) },
+ { AOM_CDF4(9127, 20657, 25808) },
+ { AOM_CDF4(6368, 16208, 21462) },
+ { AOM_CDF4(2465, 7177, 10822) },
+ { AOM_CDF4(29961, 32563, 32719) },
+ { AOM_CDF4(18318, 29891, 31949) },
+ { AOM_CDF4(11361, 24514, 29357) },
+ { AOM_CDF4(7900, 19603, 25607) },
+ { AOM_CDF4(4002, 10590, 15546) },
+ { AOM_CDF4(29637, 32310, 32595) },
+ { AOM_CDF4(18296, 29913, 31809) },
+ { AOM_CDF4(10144, 21515, 26871) },
+ { AOM_CDF4(5358, 14322, 20394) },
+ { AOM_CDF4(3067, 8362, 13346) },
+ { AOM_CDF4(28652, 32470, 32676) },
+ { AOM_CDF4(17538, 30771, 32209) },
+ { AOM_CDF4(13924, 26882, 30494) },
+ { AOM_CDF4(10496, 22837, 27869) },
+ { AOM_CDF4(7236, 16396, 21621) },
+ { AOM_CDF4(30743, 32687, 32746) },
+ { AOM_CDF4(23006, 31676, 32489) },
+ { AOM_CDF4(14494, 27828, 31120) },
+ { AOM_CDF4(10174, 22801, 28352) },
+ { AOM_CDF4(6242, 15281, 21043) },
+ { AOM_CDF4(25817, 32243, 32720) },
+ { AOM_CDF4(18618, 31367, 32325) },
+ { AOM_CDF4(13997, 28318, 31878) },
+ { AOM_CDF4(12255, 26534, 31383) },
+ { AOM_CDF4(9561, 21588, 28450) },
+ { AOM_CDF4(28188, 32635, 32724) },
+ { AOM_CDF4(22060, 32365, 32728) },
+ { AOM_CDF4(18102, 30690, 32528) },
+ { AOM_CDF4(14196, 28864, 31999) },
+ { AOM_CDF4(12262, 25792, 30865) },
+ { AOM_CDF4(24176, 32109, 32628) },
+ { AOM_CDF4(18280, 29681, 31963) },
+ { AOM_CDF4(10205, 23703, 29664) },
+ { AOM_CDF4(7889, 20025, 27676) },
+ { AOM_CDF4(6060, 16743, 23970) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5141, 7096, 8260) },
+ { AOM_CDF4(27186, 29022, 29789) },
+ { AOM_CDF4(6668, 12568, 15682) },
+ { AOM_CDF4(2172, 6181, 8638) },
+ { AOM_CDF4(1126, 3379, 4531) },
+ { AOM_CDF4(443, 1361, 2254) },
+ { AOM_CDF4(26083, 31153, 32436) },
+ { AOM_CDF4(13486, 24603, 28483) },
+ { AOM_CDF4(6508, 14840, 19910) },
+ { AOM_CDF4(3386, 8800, 13286) },
+ { AOM_CDF4(1530, 4322, 7054) },
+ { AOM_CDF4(29639, 32080, 32548) },
+ { AOM_CDF4(15897, 27552, 30290) },
+ { AOM_CDF4(8588, 20047, 25383) },
+ { AOM_CDF4(4889, 13339, 19269) },
+ { AOM_CDF4(2240, 6871, 10498) },
+ { AOM_CDF4(28165, 32197, 32517) },
+ { AOM_CDF4(20735, 30427, 31568) },
+ { AOM_CDF4(14325, 24671, 27692) },
+ { AOM_CDF4(5119, 12554, 17805) },
+ { AOM_CDF4(1810, 5441, 8261) },
+ { AOM_CDF4(31212, 32724, 32748) },
+ { AOM_CDF4(23352, 31766, 32545) },
+ { AOM_CDF4(14669, 27570, 31059) },
+ { AOM_CDF4(8492, 20894, 27272) },
+ { AOM_CDF4(3644, 10194, 15204) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(2461, 7013, 9371) },
+ { AOM_CDF4(24749, 29600, 30986) },
+ { AOM_CDF4(9466, 19037, 22417) },
+ { AOM_CDF4(3584, 9280, 14400) },
+ { AOM_CDF4(1505, 3929, 5433) },
+ { AOM_CDF4(677, 1500, 2736) },
+ { AOM_CDF4(23987, 30702, 32117) },
+ { AOM_CDF4(13554, 24571, 29263) },
+ { AOM_CDF4(6211, 14556, 21155) },
+ { AOM_CDF4(3135, 10972, 15625) },
+ { AOM_CDF4(2435, 7127, 11427) },
+ { AOM_CDF4(31300, 32532, 32550) },
+ { AOM_CDF4(14757, 30365, 31954) },
+ { AOM_CDF4(4405, 11612, 18553) },
+ { AOM_CDF4(580, 4132, 7322) },
+ { AOM_CDF4(1695, 10169, 14124) },
+ { AOM_CDF4(30008, 32282, 32591) },
+ { AOM_CDF4(19244, 30108, 31748) },
+ { AOM_CDF4(11180, 24158, 29555) },
+ { AOM_CDF4(5650, 14972, 19209) },
+ { AOM_CDF4(2114, 5109, 8456) },
+ { AOM_CDF4(31856, 32716, 32748) },
+ { AOM_CDF4(23012, 31664, 32572) },
+ { AOM_CDF4(13694, 26656, 30636) },
+ { AOM_CDF4(8142, 19508, 26093) },
+ { AOM_CDF4(4253, 10955, 16724) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(601, 983, 1311) },
+ { AOM_CDF4(18725, 23406, 28087) },
+ { AOM_CDF4(5461, 8192, 10923) },
+ { AOM_CDF4(3781, 15124, 21425) },
+ { AOM_CDF4(2587, 7761, 12072) },
+ { AOM_CDF4(106, 458, 810) },
+ { AOM_CDF4(22282, 29710, 31894) },
+ { AOM_CDF4(8508, 20926, 25984) },
+ { AOM_CDF4(3726, 12713, 18083) },
+ { AOM_CDF4(1620, 7112, 10893) },
+ { AOM_CDF4(729, 2236, 3495) },
+ { AOM_CDF4(30163, 32474, 32684) },
+ { AOM_CDF4(18304, 30464, 32000) },
+ { AOM_CDF4(11443, 26526, 29647) },
+ { AOM_CDF4(6007, 15292, 21299) },
+ { AOM_CDF4(2234, 6703, 8937) },
+ { AOM_CDF4(30954, 32177, 32571) },
+ { AOM_CDF4(17363, 29562, 31076) },
+ { AOM_CDF4(9686, 22464, 27410) },
+ { AOM_CDF4(8192, 16384, 21390) },
+ { AOM_CDF4(1755, 8046, 11264) },
+ { AOM_CDF4(31168, 32734, 32748) },
+ { AOM_CDF4(22486, 31441, 32471) },
+ { AOM_CDF4(12833, 25627, 29738) },
+ { AOM_CDF4(6980, 17379, 23122) },
+ { AOM_CDF4(3111, 8887, 13479) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(6041, 11854, 15927) },
+ { AOM_CDF4(20326, 30905, 32251) },
+ { AOM_CDF4(14164, 26831, 30725) },
+ { AOM_CDF4(9760, 20647, 26585) },
+ { AOM_CDF4(6416, 14953, 21219) },
+ { AOM_CDF4(2966, 7151, 10891) },
+ { AOM_CDF4(23567, 31374, 32254) },
+ { AOM_CDF4(14978, 27416, 30946) },
+ { AOM_CDF4(9434, 20225, 26254) },
+ { AOM_CDF4(6658, 14558, 20535) },
+ { AOM_CDF4(3916, 8677, 12989) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(18088, 29545, 31587) },
+ { AOM_CDF4(13062, 25843, 30073) },
+ { AOM_CDF4(8940, 16827, 22251) },
+ { AOM_CDF4(7654, 13220, 17973) },
+ { AOM_CDF4(5733, 10316, 14456) },
+ { AOM_CDF4(22879, 31388, 32114) },
+ { AOM_CDF4(15215, 27993, 30955) },
+ { AOM_CDF4(9397, 19445, 24978) },
+ { AOM_CDF4(3442, 9813, 15344) },
+ { AOM_CDF4(1368, 3936, 6532) },
+ { AOM_CDF4(25494, 32033, 32406) },
+ { AOM_CDF4(16772, 27963, 30718) },
+ { AOM_CDF4(9419, 18165, 23260) },
+ { AOM_CDF4(2677, 7501, 11797) },
+ { AOM_CDF4(1516, 4344, 7170) },
+ { AOM_CDF4(26556, 31454, 32101) },
+ { AOM_CDF4(17128, 27035, 30108) },
+ { AOM_CDF4(8324, 15344, 20249) },
+ { AOM_CDF4(1903, 5696, 9469) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8455, 19003, 24368) },
+ { AOM_CDF4(23563, 32021, 32604) },
+ { AOM_CDF4(16237, 29446, 31935) },
+ { AOM_CDF4(10724, 23999, 29358) },
+ { AOM_CDF4(6725, 17528, 24416) },
+ { AOM_CDF4(3927, 10927, 16825) },
+ { AOM_CDF4(26313, 32288, 32634) },
+ { AOM_CDF4(17430, 30095, 32095) },
+ { AOM_CDF4(11116, 24606, 29679) },
+ { AOM_CDF4(7195, 18384, 25269) },
+ { AOM_CDF4(4726, 12852, 19315) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(22822, 31648, 32483) },
+ { AOM_CDF4(16724, 29633, 31929) },
+ { AOM_CDF4(10261, 23033, 28725) },
+ { AOM_CDF4(7029, 17840, 24528) },
+ { AOM_CDF4(4867, 13886, 21502) },
+ { AOM_CDF4(25298, 31892, 32491) },
+ { AOM_CDF4(17809, 29330, 31512) },
+ { AOM_CDF4(9668, 21329, 26579) },
+ { AOM_CDF4(4774, 12956, 18976) },
+ { AOM_CDF4(2322, 7030, 11540) },
+ { AOM_CDF4(25472, 31920, 32543) },
+ { AOM_CDF4(17957, 29387, 31632) },
+ { AOM_CDF4(9196, 20593, 26400) },
+ { AOM_CDF4(4680, 12705, 19202) },
+ { AOM_CDF4(2917, 8456, 13436) },
+ { AOM_CDF4(26471, 32059, 32574) },
+ { AOM_CDF4(18458, 29783, 31909) },
+ { AOM_CDF4(8400, 19464, 25956) },
+ { AOM_CDF4(3812, 10973, 17206) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(6779, 13743, 17678) },
+ { AOM_CDF4(24806, 31797, 32457) },
+ { AOM_CDF4(17616, 29047, 31372) },
+ { AOM_CDF4(11063, 23175, 28003) },
+ { AOM_CDF4(6521, 16110, 22324) },
+ { AOM_CDF4(2764, 7504, 11654) },
+ { AOM_CDF4(25266, 32367, 32637) },
+ { AOM_CDF4(19054, 30553, 32175) },
+ { AOM_CDF4(12139, 25212, 29807) },
+ { AOM_CDF4(7311, 18162, 24704) },
+ { AOM_CDF4(3397, 9164, 14074) },
+ { AOM_CDF4(25988, 32208, 32522) },
+ { AOM_CDF4(16253, 28912, 31526) },
+ { AOM_CDF4(9151, 21387, 27372) },
+ { AOM_CDF4(5688, 14915, 21496) },
+ { AOM_CDF4(2717, 7627, 12004) },
+ { AOM_CDF4(23144, 31855, 32443) },
+ { AOM_CDF4(16070, 28491, 31325) },
+ { AOM_CDF4(8702, 20467, 26517) },
+ { AOM_CDF4(5243, 13956, 20367) },
+ { AOM_CDF4(2621, 7335, 11567) },
+ { AOM_CDF4(26636, 32340, 32630) },
+ { AOM_CDF4(19990, 31050, 32341) },
+ { AOM_CDF4(13243, 26105, 30315) },
+ { AOM_CDF4(8588, 19521, 25918) },
+ { AOM_CDF4(4717, 11585, 17304) },
+ { AOM_CDF4(25844, 32292, 32582) },
+ { AOM_CDF4(19090, 30635, 32097) },
+ { AOM_CDF4(11963, 24546, 28939) },
+ { AOM_CDF4(6218, 16087, 22354) },
+ { AOM_CDF4(2340, 6608, 10426) },
+ { AOM_CDF4(28046, 32576, 32694) },
+ { AOM_CDF4(21178, 31313, 32296) },
+ { AOM_CDF4(13486, 26184, 29870) },
+ { AOM_CDF4(7149, 17871, 23723) },
+ { AOM_CDF4(2833, 7958, 12259) },
+ { AOM_CDF4(27710, 32528, 32686) },
+ { AOM_CDF4(20674, 31076, 32268) },
+ { AOM_CDF4(12413, 24955, 29243) },
+ { AOM_CDF4(6676, 16927, 23097) },
+ { AOM_CDF4(2966, 8333, 12919) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8639, 19339, 24429) },
+ { AOM_CDF4(24404, 31837, 32525) },
+ { AOM_CDF4(16997, 29425, 31784) },
+ { AOM_CDF4(11253, 24234, 29149) },
+ { AOM_CDF4(6751, 17394, 24028) },
+ { AOM_CDF4(3490, 9830, 15191) },
+ { AOM_CDF4(26283, 32471, 32714) },
+ { AOM_CDF4(19599, 31168, 32442) },
+ { AOM_CDF4(13146, 26954, 30893) },
+ { AOM_CDF4(8214, 20588, 26890) },
+ { AOM_CDF4(4699, 13081, 19300) },
+ { AOM_CDF4(28212, 32458, 32669) },
+ { AOM_CDF4(18594, 30316, 32100) },
+ { AOM_CDF4(11219, 24408, 29234) },
+ { AOM_CDF4(6865, 17656, 24149) },
+ { AOM_CDF4(3678, 10362, 16006) },
+ { AOM_CDF4(25825, 32136, 32616) },
+ { AOM_CDF4(17313, 29853, 32021) },
+ { AOM_CDF4(11197, 24471, 29472) },
+ { AOM_CDF4(6947, 17781, 24405) },
+ { AOM_CDF4(3768, 10660, 16261) },
+ { AOM_CDF4(27352, 32500, 32706) },
+ { AOM_CDF4(20850, 31468, 32469) },
+ { AOM_CDF4(14021, 27707, 31133) },
+ { AOM_CDF4(8964, 21748, 27838) },
+ { AOM_CDF4(5437, 14665, 21187) },
+ { AOM_CDF4(26304, 32492, 32698) },
+ { AOM_CDF4(20409, 31380, 32385) },
+ { AOM_CDF4(13682, 27222, 30632) },
+ { AOM_CDF4(8974, 21236, 26685) },
+ { AOM_CDF4(4234, 11665, 16934) },
+ { AOM_CDF4(26273, 32357, 32711) },
+ { AOM_CDF4(20672, 31242, 32441) },
+ { AOM_CDF4(14172, 27254, 30902) },
+ { AOM_CDF4(9870, 21898, 27275) },
+ { AOM_CDF4(5164, 13506, 19270) },
+ { AOM_CDF4(26725, 32459, 32728) },
+ { AOM_CDF4(20991, 31442, 32527) },
+ { AOM_CDF4(13071, 26434, 30811) },
+ { AOM_CDF4(8184, 20090, 26742) },
+ { AOM_CDF4(4803, 13255, 19895) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7555, 14942, 18501) },
+ { AOM_CDF4(24410, 31178, 32287) },
+ { AOM_CDF4(14394, 26738, 30253) },
+ { AOM_CDF4(8413, 19554, 25195) },
+ { AOM_CDF4(4766, 12924, 18785) },
+ { AOM_CDF4(2029, 5806, 9207) },
+ { AOM_CDF4(26776, 32364, 32663) },
+ { AOM_CDF4(18732, 29967, 31931) },
+ { AOM_CDF4(11005, 23786, 28852) },
+ { AOM_CDF4(6466, 16909, 23510) },
+ { AOM_CDF4(3044, 8638, 13419) },
+ { AOM_CDF4(29208, 32582, 32704) },
+ { AOM_CDF4(20068, 30857, 32208) },
+ { AOM_CDF4(12003, 25085, 29595) },
+ { AOM_CDF4(6947, 17750, 24189) },
+ { AOM_CDF4(3245, 9103, 14007) },
+ { AOM_CDF4(27359, 32465, 32669) },
+ { AOM_CDF4(19421, 30614, 32174) },
+ { AOM_CDF4(11915, 25010, 29579) },
+ { AOM_CDF4(6950, 17676, 24074) },
+ { AOM_CDF4(3007, 8473, 13096) },
+ { AOM_CDF4(29002, 32676, 32735) },
+ { AOM_CDF4(22102, 31849, 32576) },
+ { AOM_CDF4(14408, 28009, 31405) },
+ { AOM_CDF4(9027, 21679, 27931) },
+ { AOM_CDF4(4694, 12678, 18748) },
+ { AOM_CDF4(28216, 32528, 32682) },
+ { AOM_CDF4(20849, 31264, 32318) },
+ { AOM_CDF4(12756, 25815, 29751) },
+ { AOM_CDF4(7565, 18801, 24923) },
+ { AOM_CDF4(3509, 9533, 14477) },
+ { AOM_CDF4(30133, 32687, 32739) },
+ { AOM_CDF4(23063, 31910, 32515) },
+ { AOM_CDF4(14588, 28051, 31132) },
+ { AOM_CDF4(9085, 21649, 27457) },
+ { AOM_CDF4(4261, 11654, 17264) },
+ { AOM_CDF4(29518, 32691, 32748) },
+ { AOM_CDF4(22451, 31959, 32613) },
+ { AOM_CDF4(14864, 28722, 31700) },
+ { AOM_CDF4(9695, 22964, 28716) },
+ { AOM_CDF4(4932, 13358, 19502) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(6465, 16958, 21688) },
+ { AOM_CDF4(25199, 31514, 32360) },
+ { AOM_CDF4(14774, 27149, 30607) },
+ { AOM_CDF4(9257, 21438, 26972) },
+ { AOM_CDF4(5723, 15183, 21882) },
+ { AOM_CDF4(3150, 8879, 13731) },
+ { AOM_CDF4(26989, 32262, 32682) },
+ { AOM_CDF4(17396, 29937, 32085) },
+ { AOM_CDF4(11387, 24901, 29784) },
+ { AOM_CDF4(7289, 18821, 25548) },
+ { AOM_CDF4(3734, 10577, 16086) },
+ { AOM_CDF4(29728, 32501, 32695) },
+ { AOM_CDF4(17431, 29701, 31903) },
+ { AOM_CDF4(9921, 22826, 28300) },
+ { AOM_CDF4(5896, 15434, 22068) },
+ { AOM_CDF4(3430, 9646, 14757) },
+ { AOM_CDF4(28614, 32511, 32705) },
+ { AOM_CDF4(19364, 30638, 32263) },
+ { AOM_CDF4(13129, 26254, 30402) },
+ { AOM_CDF4(8754, 20484, 26440) },
+ { AOM_CDF4(4378, 11607, 17110) },
+ { AOM_CDF4(30292, 32671, 32744) },
+ { AOM_CDF4(21780, 31603, 32501) },
+ { AOM_CDF4(14314, 27829, 31291) },
+ { AOM_CDF4(9611, 22327, 28263) },
+ { AOM_CDF4(4890, 13087, 19065) },
+ { AOM_CDF4(25862, 32567, 32733) },
+ { AOM_CDF4(20794, 32050, 32567) },
+ { AOM_CDF4(17243, 30625, 32254) },
+ { AOM_CDF4(13283, 27628, 31474) },
+ { AOM_CDF4(9669, 22532, 28918) },
+ { AOM_CDF4(27435, 32697, 32748) },
+ { AOM_CDF4(24922, 32390, 32714) },
+ { AOM_CDF4(21449, 31504, 32536) },
+ { AOM_CDF4(16392, 29729, 31832) },
+ { AOM_CDF4(11692, 24884, 29076) },
+ { AOM_CDF4(24193, 32290, 32735) },
+ { AOM_CDF4(18909, 31104, 32563) },
+ { AOM_CDF4(12236, 26841, 31403) },
+ { AOM_CDF4(8171, 21840, 29082) },
+ { AOM_CDF4(7224, 17280, 25275) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(3078, 6839, 9890) },
+ { AOM_CDF4(13837, 20450, 24479) },
+ { AOM_CDF4(5914, 14222, 19328) },
+ { AOM_CDF4(3866, 10267, 14762) },
+ { AOM_CDF4(2612, 7208, 11042) },
+ { AOM_CDF4(1067, 2991, 4776) },
+ { AOM_CDF4(25817, 31646, 32529) },
+ { AOM_CDF4(13708, 26338, 30385) },
+ { AOM_CDF4(7328, 18585, 24870) },
+ { AOM_CDF4(4691, 13080, 19276) },
+ { AOM_CDF4(1825, 5253, 8352) },
+ { AOM_CDF4(29386, 32315, 32624) },
+ { AOM_CDF4(17160, 29001, 31360) },
+ { AOM_CDF4(9602, 21862, 27396) },
+ { AOM_CDF4(5915, 15772, 22148) },
+ { AOM_CDF4(2786, 7779, 12047) },
+ { AOM_CDF4(29246, 32450, 32663) },
+ { AOM_CDF4(18696, 29929, 31818) },
+ { AOM_CDF4(10510, 23369, 28560) },
+ { AOM_CDF4(6229, 16499, 23125) },
+ { AOM_CDF4(2608, 7448, 11705) },
+ { AOM_CDF4(30753, 32710, 32748) },
+ { AOM_CDF4(21638, 31487, 32503) },
+ { AOM_CDF4(12937, 26854, 30870) },
+ { AOM_CDF4(8182, 20596, 26970) },
+ { AOM_CDF4(3637, 10269, 15497) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(5244, 12150, 16906) },
+ { AOM_CDF4(20486, 26858, 29701) },
+ { AOM_CDF4(7756, 18317, 23735) },
+ { AOM_CDF4(3452, 9256, 13146) },
+ { AOM_CDF4(2020, 5206, 8229) },
+ { AOM_CDF4(1801, 4993, 7903) },
+ { AOM_CDF4(27051, 31858, 32531) },
+ { AOM_CDF4(15988, 27531, 30619) },
+ { AOM_CDF4(9188, 21484, 26719) },
+ { AOM_CDF4(6273, 17186, 23800) },
+ { AOM_CDF4(3108, 9355, 14764) },
+ { AOM_CDF4(31076, 32520, 32680) },
+ { AOM_CDF4(18119, 30037, 31850) },
+ { AOM_CDF4(10244, 22969, 27472) },
+ { AOM_CDF4(4692, 14077, 19273) },
+ { AOM_CDF4(3694, 11677, 17556) },
+ { AOM_CDF4(30060, 32581, 32720) },
+ { AOM_CDF4(21011, 30775, 32120) },
+ { AOM_CDF4(11931, 24820, 29289) },
+ { AOM_CDF4(7119, 17662, 24356) },
+ { AOM_CDF4(3833, 10706, 16304) },
+ { AOM_CDF4(31954, 32731, 32748) },
+ { AOM_CDF4(23913, 31724, 32489) },
+ { AOM_CDF4(15520, 28060, 31286) },
+ { AOM_CDF4(11517, 23008, 28571) },
+ { AOM_CDF4(6193, 14508, 20629) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(1035, 2807, 4156) },
+ { AOM_CDF4(13162, 18138, 20939) },
+ { AOM_CDF4(2696, 6633, 8755) },
+ { AOM_CDF4(1373, 4161, 6853) },
+ { AOM_CDF4(1099, 2746, 4716) },
+ { AOM_CDF4(340, 1021, 1599) },
+ { AOM_CDF4(22826, 30419, 32135) },
+ { AOM_CDF4(10395, 21762, 26942) },
+ { AOM_CDF4(4726, 12407, 17361) },
+ { AOM_CDF4(2447, 7080, 10593) },
+ { AOM_CDF4(1227, 3717, 6011) },
+ { AOM_CDF4(28156, 31424, 31934) },
+ { AOM_CDF4(16915, 27754, 30373) },
+ { AOM_CDF4(9148, 20990, 26431) },
+ { AOM_CDF4(5950, 15515, 21148) },
+ { AOM_CDF4(2492, 7327, 11526) },
+ { AOM_CDF4(30602, 32477, 32670) },
+ { AOM_CDF4(20026, 29955, 31568) },
+ { AOM_CDF4(11220, 23628, 28105) },
+ { AOM_CDF4(6652, 17019, 22973) },
+ { AOM_CDF4(3064, 8536, 13043) },
+ { AOM_CDF4(31769, 32724, 32748) },
+ { AOM_CDF4(22230, 30887, 32373) },
+ { AOM_CDF4(12234, 25079, 29731) },
+ { AOM_CDF4(7326, 18816, 25353) },
+ { AOM_CDF4(3933, 10907, 16616) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(8896, 16227, 20630) },
+ { AOM_CDF4(23629, 31782, 32527) },
+ { AOM_CDF4(15173, 27755, 31321) },
+ { AOM_CDF4(10158, 21233, 27382) },
+ { AOM_CDF4(6420, 14857, 21558) },
+ { AOM_CDF4(3269, 8155, 12646) },
+ { AOM_CDF4(24835, 32009, 32496) },
+ { AOM_CDF4(16509, 28421, 31579) },
+ { AOM_CDF4(10957, 21514, 27418) },
+ { AOM_CDF4(7881, 15930, 22096) },
+ { AOM_CDF4(5388, 10960, 15918) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(20745, 30773, 32093) },
+ { AOM_CDF4(15200, 27221, 30861) },
+ { AOM_CDF4(13032, 20873, 25667) },
+ { AOM_CDF4(12285, 18663, 23494) },
+ { AOM_CDF4(11563, 17481, 21489) },
+ { AOM_CDF4(26260, 31982, 32320) },
+ { AOM_CDF4(15397, 28083, 31100) },
+ { AOM_CDF4(9742, 19217, 24824) },
+ { AOM_CDF4(3261, 9629, 15362) },
+ { AOM_CDF4(1480, 4322, 7499) },
+ { AOM_CDF4(27599, 32256, 32460) },
+ { AOM_CDF4(16857, 27659, 30774) },
+ { AOM_CDF4(9551, 18290, 23748) },
+ { AOM_CDF4(3052, 8933, 14103) },
+ { AOM_CDF4(2021, 5910, 9787) },
+ { AOM_CDF4(29005, 32015, 32392) },
+ { AOM_CDF4(17677, 27694, 30863) },
+ { AOM_CDF4(9204, 17356, 23219) },
+ { AOM_CDF4(2403, 7516, 12814) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10808, 22056, 26896) },
+ { AOM_CDF4(25739, 32313, 32676) },
+ { AOM_CDF4(17288, 30203, 32221) },
+ { AOM_CDF4(11359, 24878, 29896) },
+ { AOM_CDF4(6949, 17767, 24893) },
+ { AOM_CDF4(4287, 11796, 18071) },
+ { AOM_CDF4(27880, 32521, 32705) },
+ { AOM_CDF4(19038, 31004, 32414) },
+ { AOM_CDF4(12564, 26345, 30768) },
+ { AOM_CDF4(8269, 19947, 26779) },
+ { AOM_CDF4(5674, 14657, 21674) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(25742, 32319, 32671) },
+ { AOM_CDF4(19557, 31164, 32454) },
+ { AOM_CDF4(13381, 26381, 30755) },
+ { AOM_CDF4(10101, 21466, 26722) },
+ { AOM_CDF4(9209, 19650, 26825) },
+ { AOM_CDF4(27107, 31917, 32432) },
+ { AOM_CDF4(18056, 28893, 31203) },
+ { AOM_CDF4(10200, 21434, 26764) },
+ { AOM_CDF4(4660, 12913, 19502) },
+ { AOM_CDF4(2368, 6930, 12504) },
+ { AOM_CDF4(26960, 32158, 32613) },
+ { AOM_CDF4(18628, 30005, 32031) },
+ { AOM_CDF4(10233, 22442, 28232) },
+ { AOM_CDF4(5471, 14630, 21516) },
+ { AOM_CDF4(3235, 10767, 17109) },
+ { AOM_CDF4(27696, 32440, 32692) },
+ { AOM_CDF4(20032, 31167, 32438) },
+ { AOM_CDF4(8700, 21341, 28442) },
+ { AOM_CDF4(5662, 14831, 21795) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9704, 17294, 21132) },
+ { AOM_CDF4(26762, 32278, 32633) },
+ { AOM_CDF4(18382, 29620, 31819) },
+ { AOM_CDF4(10891, 23475, 28723) },
+ { AOM_CDF4(6358, 16583, 23309) },
+ { AOM_CDF4(3248, 9118, 14141) },
+ { AOM_CDF4(27204, 32573, 32699) },
+ { AOM_CDF4(19818, 30824, 32329) },
+ { AOM_CDF4(11772, 25120, 30041) },
+ { AOM_CDF4(6995, 18033, 25039) },
+ { AOM_CDF4(3752, 10442, 16098) },
+ { AOM_CDF4(27222, 32256, 32559) },
+ { AOM_CDF4(15356, 28399, 31475) },
+ { AOM_CDF4(8821, 20635, 27057) },
+ { AOM_CDF4(5511, 14404, 21239) },
+ { AOM_CDF4(2935, 8222, 13051) },
+ { AOM_CDF4(24875, 32120, 32529) },
+ { AOM_CDF4(15233, 28265, 31445) },
+ { AOM_CDF4(8605, 20570, 26932) },
+ { AOM_CDF4(5431, 14413, 21196) },
+ { AOM_CDF4(2994, 8341, 13223) },
+ { AOM_CDF4(28201, 32604, 32700) },
+ { AOM_CDF4(21041, 31446, 32456) },
+ { AOM_CDF4(13221, 26213, 30475) },
+ { AOM_CDF4(8255, 19385, 26037) },
+ { AOM_CDF4(4930, 12585, 18830) },
+ { AOM_CDF4(28768, 32448, 32627) },
+ { AOM_CDF4(19705, 30561, 32021) },
+ { AOM_CDF4(11572, 23589, 28220) },
+ { AOM_CDF4(5532, 15034, 21446) },
+ { AOM_CDF4(2460, 7150, 11456) },
+ { AOM_CDF4(29874, 32619, 32699) },
+ { AOM_CDF4(21621, 31071, 32201) },
+ { AOM_CDF4(12511, 24747, 28992) },
+ { AOM_CDF4(6281, 16395, 22748) },
+ { AOM_CDF4(3246, 9278, 14497) },
+ { AOM_CDF4(29715, 32625, 32712) },
+ { AOM_CDF4(20958, 31011, 32283) },
+ { AOM_CDF4(11233, 23671, 28806) },
+ { AOM_CDF4(6012, 16128, 22868) },
+ { AOM_CDF4(3427, 9851, 15414) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11016, 22111, 26794) },
+ { AOM_CDF4(25946, 32357, 32677) },
+ { AOM_CDF4(17890, 30452, 32252) },
+ { AOM_CDF4(11678, 25142, 29816) },
+ { AOM_CDF4(6720, 17534, 24584) },
+ { AOM_CDF4(4230, 11665, 17820) },
+ { AOM_CDF4(28400, 32623, 32747) },
+ { AOM_CDF4(21164, 31668, 32575) },
+ { AOM_CDF4(13572, 27388, 31182) },
+ { AOM_CDF4(8234, 20750, 27358) },
+ { AOM_CDF4(5065, 14055, 20897) },
+ { AOM_CDF4(28981, 32547, 32705) },
+ { AOM_CDF4(18681, 30543, 32239) },
+ { AOM_CDF4(10919, 24075, 29286) },
+ { AOM_CDF4(6431, 17199, 24077) },
+ { AOM_CDF4(3819, 10464, 16618) },
+ { AOM_CDF4(26870, 32467, 32693) },
+ { AOM_CDF4(19041, 30831, 32347) },
+ { AOM_CDF4(11794, 25211, 30016) },
+ { AOM_CDF4(6888, 18019, 24970) },
+ { AOM_CDF4(4370, 12363, 18992) },
+ { AOM_CDF4(29578, 32670, 32744) },
+ { AOM_CDF4(23159, 32007, 32613) },
+ { AOM_CDF4(15315, 28669, 31676) },
+ { AOM_CDF4(9298, 22607, 28782) },
+ { AOM_CDF4(6144, 15913, 22968) },
+ { AOM_CDF4(28110, 32499, 32669) },
+ { AOM_CDF4(21574, 30937, 32015) },
+ { AOM_CDF4(12759, 24818, 28727) },
+ { AOM_CDF4(6545, 16761, 23042) },
+ { AOM_CDF4(3649, 10597, 16833) },
+ { AOM_CDF4(28163, 32552, 32728) },
+ { AOM_CDF4(22101, 31469, 32464) },
+ { AOM_CDF4(13160, 25472, 30143) },
+ { AOM_CDF4(7303, 18684, 25468) },
+ { AOM_CDF4(5241, 13975, 20955) },
+ { AOM_CDF4(28400, 32631, 32744) },
+ { AOM_CDF4(22104, 31793, 32603) },
+ { AOM_CDF4(13557, 26571, 30846) },
+ { AOM_CDF4(7749, 19861, 26675) },
+ { AOM_CDF4(4873, 14030, 21234) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(9800, 17635, 21073) },
+ { AOM_CDF4(26153, 31885, 32527) },
+ { AOM_CDF4(15038, 27852, 31006) },
+ { AOM_CDF4(8718, 20564, 26486) },
+ { AOM_CDF4(5128, 14076, 20514) },
+ { AOM_CDF4(2636, 7566, 11925) },
+ { AOM_CDF4(27551, 32504, 32701) },
+ { AOM_CDF4(18310, 30054, 32100) },
+ { AOM_CDF4(10211, 23420, 29082) },
+ { AOM_CDF4(6222, 16876, 23916) },
+ { AOM_CDF4(3462, 9954, 15498) },
+ { AOM_CDF4(29991, 32633, 32721) },
+ { AOM_CDF4(19883, 30751, 32201) },
+ { AOM_CDF4(11141, 24184, 29285) },
+ { AOM_CDF4(6420, 16940, 23774) },
+ { AOM_CDF4(3392, 9753, 15118) },
+ { AOM_CDF4(28465, 32616, 32712) },
+ { AOM_CDF4(19850, 30702, 32244) },
+ { AOM_CDF4(10983, 24024, 29223) },
+ { AOM_CDF4(6294, 16770, 23582) },
+ { AOM_CDF4(3244, 9283, 14509) },
+ { AOM_CDF4(30023, 32717, 32748) },
+ { AOM_CDF4(22940, 32032, 32626) },
+ { AOM_CDF4(14282, 27928, 31473) },
+ { AOM_CDF4(8562, 21327, 27914) },
+ { AOM_CDF4(4846, 13393, 19919) },
+ { AOM_CDF4(29981, 32590, 32695) },
+ { AOM_CDF4(20465, 30963, 32166) },
+ { AOM_CDF4(11479, 23579, 28195) },
+ { AOM_CDF4(5916, 15648, 22073) },
+ { AOM_CDF4(3031, 8605, 13398) },
+ { AOM_CDF4(31146, 32691, 32739) },
+ { AOM_CDF4(23106, 31724, 32444) },
+ { AOM_CDF4(13783, 26738, 30439) },
+ { AOM_CDF4(7852, 19468, 25807) },
+ { AOM_CDF4(3860, 11124, 16853) },
+ { AOM_CDF4(31014, 32724, 32748) },
+ { AOM_CDF4(23629, 32109, 32628) },
+ { AOM_CDF4(14747, 28115, 31403) },
+ { AOM_CDF4(8545, 21242, 27478) },
+ { AOM_CDF4(4574, 12781, 19067) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9185, 19694, 24688) },
+ { AOM_CDF4(26081, 31985, 32621) },
+ { AOM_CDF4(16015, 29000, 31787) },
+ { AOM_CDF4(10542, 23690, 29206) },
+ { AOM_CDF4(6732, 17945, 24677) },
+ { AOM_CDF4(3916, 11039, 16722) },
+ { AOM_CDF4(28224, 32566, 32744) },
+ { AOM_CDF4(19100, 31138, 32485) },
+ { AOM_CDF4(12528, 26620, 30879) },
+ { AOM_CDF4(7741, 20277, 26885) },
+ { AOM_CDF4(4566, 12845, 18990) },
+ { AOM_CDF4(29933, 32593, 32718) },
+ { AOM_CDF4(17670, 30333, 32155) },
+ { AOM_CDF4(10385, 23600, 28909) },
+ { AOM_CDF4(6243, 16236, 22407) },
+ { AOM_CDF4(3976, 10389, 16017) },
+ { AOM_CDF4(28377, 32561, 32738) },
+ { AOM_CDF4(19366, 31175, 32482) },
+ { AOM_CDF4(13327, 27175, 31094) },
+ { AOM_CDF4(8258, 20769, 27143) },
+ { AOM_CDF4(4703, 13198, 19527) },
+ { AOM_CDF4(31086, 32706, 32748) },
+ { AOM_CDF4(22853, 31902, 32583) },
+ { AOM_CDF4(14759, 28186, 31419) },
+ { AOM_CDF4(9284, 22382, 28348) },
+ { AOM_CDF4(5585, 15192, 21868) },
+ { AOM_CDF4(28291, 32652, 32746) },
+ { AOM_CDF4(19849, 32107, 32571) },
+ { AOM_CDF4(14834, 26818, 29214) },
+ { AOM_CDF4(10306, 22594, 28672) },
+ { AOM_CDF4(6615, 17384, 23384) },
+ { AOM_CDF4(28947, 32604, 32745) },
+ { AOM_CDF4(25625, 32289, 32646) },
+ { AOM_CDF4(18758, 28672, 31403) },
+ { AOM_CDF4(10017, 23430, 28523) },
+ { AOM_CDF4(6862, 15269, 22131) },
+ { AOM_CDF4(23933, 32509, 32739) },
+ { AOM_CDF4(19927, 31495, 32631) },
+ { AOM_CDF4(11903, 26023, 30621) },
+ { AOM_CDF4(7026, 20094, 27252) },
+ { AOM_CDF4(5998, 18106, 24437) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4456, 11274, 15533) },
+ { AOM_CDF4(21219, 29079, 31616) },
+ { AOM_CDF4(11173, 23774, 28567) },
+ { AOM_CDF4(7282, 18293, 24263) },
+ { AOM_CDF4(4890, 13286, 19115) },
+ { AOM_CDF4(1890, 5508, 8659) },
+ { AOM_CDF4(26651, 32136, 32647) },
+ { AOM_CDF4(14630, 28254, 31455) },
+ { AOM_CDF4(8716, 21287, 27395) },
+ { AOM_CDF4(5615, 15331, 22008) },
+ { AOM_CDF4(2675, 7700, 12150) },
+ { AOM_CDF4(29954, 32526, 32690) },
+ { AOM_CDF4(16126, 28982, 31633) },
+ { AOM_CDF4(9030, 21361, 27352) },
+ { AOM_CDF4(5411, 14793, 21271) },
+ { AOM_CDF4(2943, 8422, 13163) },
+ { AOM_CDF4(29539, 32601, 32730) },
+ { AOM_CDF4(18125, 30385, 32201) },
+ { AOM_CDF4(10422, 24090, 29468) },
+ { AOM_CDF4(6468, 17487, 24438) },
+ { AOM_CDF4(2970, 8653, 13531) },
+ { AOM_CDF4(30912, 32715, 32748) },
+ { AOM_CDF4(20666, 31373, 32497) },
+ { AOM_CDF4(12509, 26640, 30917) },
+ { AOM_CDF4(8058, 20629, 27290) },
+ { AOM_CDF4(4231, 12006, 18052) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(10202, 20633, 25484) },
+ { AOM_CDF4(27336, 31445, 32352) },
+ { AOM_CDF4(12420, 24384, 28552) },
+ { AOM_CDF4(7648, 18115, 23856) },
+ { AOM_CDF4(5662, 14341, 19902) },
+ { AOM_CDF4(3611, 10328, 15390) },
+ { AOM_CDF4(30945, 32616, 32736) },
+ { AOM_CDF4(18682, 30505, 32253) },
+ { AOM_CDF4(11513, 25336, 30203) },
+ { AOM_CDF4(7449, 19452, 26148) },
+ { AOM_CDF4(4482, 13051, 18886) },
+ { AOM_CDF4(32022, 32690, 32747) },
+ { AOM_CDF4(18578, 30501, 32146) },
+ { AOM_CDF4(11249, 23368, 28631) },
+ { AOM_CDF4(5645, 16958, 22158) },
+ { AOM_CDF4(5009, 11444, 16637) },
+ { AOM_CDF4(31357, 32710, 32748) },
+ { AOM_CDF4(21552, 31494, 32504) },
+ { AOM_CDF4(13891, 27677, 31340) },
+ { AOM_CDF4(9051, 22098, 28172) },
+ { AOM_CDF4(5190, 13377, 19486) },
+ { AOM_CDF4(32364, 32740, 32748) },
+ { AOM_CDF4(24839, 31907, 32551) },
+ { AOM_CDF4(17160, 28779, 31696) },
+ { AOM_CDF4(12452, 24137, 29602) },
+ { AOM_CDF4(6165, 15389, 22477) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(2575, 7281, 11077) },
+ { AOM_CDF4(14002, 20866, 25402) },
+ { AOM_CDF4(6343, 15056, 19658) },
+ { AOM_CDF4(4474, 11858, 17041) },
+ { AOM_CDF4(2865, 8299, 12534) },
+ { AOM_CDF4(1344, 3949, 6391) },
+ { AOM_CDF4(24720, 31239, 32459) },
+ { AOM_CDF4(12585, 25356, 29968) },
+ { AOM_CDF4(7181, 18246, 24444) },
+ { AOM_CDF4(5025, 13667, 19885) },
+ { AOM_CDF4(2521, 7304, 11605) },
+ { AOM_CDF4(29908, 32252, 32584) },
+ { AOM_CDF4(17421, 29156, 31575) },
+ { AOM_CDF4(9889, 22188, 27782) },
+ { AOM_CDF4(5878, 15647, 22123) },
+ { AOM_CDF4(2814, 8665, 13323) },
+ { AOM_CDF4(30183, 32568, 32713) },
+ { AOM_CDF4(18528, 30195, 32049) },
+ { AOM_CDF4(10982, 24606, 29657) },
+ { AOM_CDF4(6957, 18165, 25231) },
+ { AOM_CDF4(3508, 10118, 15468) },
+ { AOM_CDF4(31761, 32736, 32748) },
+ { AOM_CDF4(21041, 31328, 32546) },
+ { AOM_CDF4(12568, 26732, 31166) },
+ { AOM_CDF4(8052, 20720, 27733) },
+ { AOM_CDF4(4336, 12192, 18396) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } },
+ { { { { AOM_CDF4(7062, 16472, 22319) },
+ { AOM_CDF4(24538, 32261, 32674) },
+ { AOM_CDF4(13675, 28041, 31779) },
+ { AOM_CDF4(8590, 20674, 27631) },
+ { AOM_CDF4(5685, 14675, 22013) },
+ { AOM_CDF4(3655, 9898, 15731) },
+ { AOM_CDF4(26493, 32418, 32658) },
+ { AOM_CDF4(16376, 29342, 32090) },
+ { AOM_CDF4(10594, 22649, 28970) },
+ { AOM_CDF4(8176, 17170, 24303) },
+ { AOM_CDF4(5605, 12694, 19139) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(23888, 31902, 32542) },
+ { AOM_CDF4(18612, 29687, 31987) },
+ { AOM_CDF4(16245, 24852, 29249) },
+ { AOM_CDF4(15765, 22608, 27559) },
+ { AOM_CDF4(19895, 24699, 27510) },
+ { AOM_CDF4(28401, 32212, 32457) },
+ { AOM_CDF4(15274, 27825, 30980) },
+ { AOM_CDF4(9364, 18128, 24332) },
+ { AOM_CDF4(2283, 8193, 15082) },
+ { AOM_CDF4(1228, 3972, 7881) },
+ { AOM_CDF4(29455, 32469, 32620) },
+ { AOM_CDF4(17981, 28245, 31388) },
+ { AOM_CDF4(10921, 20098, 26240) },
+ { AOM_CDF4(3743, 11829, 18657) },
+ { AOM_CDF4(2374, 9593, 15715) },
+ { AOM_CDF4(31068, 32466, 32635) },
+ { AOM_CDF4(20321, 29572, 31971) },
+ { AOM_CDF4(10771, 20255, 27119) },
+ { AOM_CDF4(2795, 10410, 17361) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(9320, 22102, 27840) },
+ { AOM_CDF4(27057, 32464, 32724) },
+ { AOM_CDF4(16331, 30268, 32309) },
+ { AOM_CDF4(10319, 23935, 29720) },
+ { AOM_CDF4(6189, 16448, 24106) },
+ { AOM_CDF4(3589, 10884, 18808) },
+ { AOM_CDF4(29026, 32624, 32748) },
+ { AOM_CDF4(19226, 31507, 32587) },
+ { AOM_CDF4(12692, 26921, 31203) },
+ { AOM_CDF4(7049, 19532, 27635) },
+ { AOM_CDF4(7727, 15669, 23252) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(28056, 32625, 32748) },
+ { AOM_CDF4(22383, 32075, 32669) },
+ { AOM_CDF4(15417, 27098, 31749) },
+ { AOM_CDF4(18127, 26493, 27190) },
+ { AOM_CDF4(5461, 16384, 21845) },
+ { AOM_CDF4(27982, 32091, 32584) },
+ { AOM_CDF4(19045, 29868, 31972) },
+ { AOM_CDF4(10397, 22266, 27932) },
+ { AOM_CDF4(5990, 13697, 21500) },
+ { AOM_CDF4(1792, 6912, 15104) },
+ { AOM_CDF4(28198, 32501, 32718) },
+ { AOM_CDF4(21534, 31521, 32569) },
+ { AOM_CDF4(11109, 25217, 30017) },
+ { AOM_CDF4(5671, 15124, 26151) },
+ { AOM_CDF4(4681, 14043, 18725) },
+ { AOM_CDF4(28688, 32580, 32741) },
+ { AOM_CDF4(22576, 32079, 32661) },
+ { AOM_CDF4(10627, 22141, 28340) },
+ { AOM_CDF4(9362, 14043, 28087) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7754, 16948, 22142) },
+ { AOM_CDF4(25670, 32330, 32691) },
+ { AOM_CDF4(15663, 29225, 31994) },
+ { AOM_CDF4(9878, 23288, 29158) },
+ { AOM_CDF4(6419, 17088, 24336) },
+ { AOM_CDF4(3859, 11003, 17039) },
+ { AOM_CDF4(27562, 32595, 32725) },
+ { AOM_CDF4(17575, 30588, 32399) },
+ { AOM_CDF4(10819, 24838, 30309) },
+ { AOM_CDF4(7124, 18686, 25916) },
+ { AOM_CDF4(4479, 12688, 19340) },
+ { AOM_CDF4(28385, 32476, 32673) },
+ { AOM_CDF4(15306, 29005, 31938) },
+ { AOM_CDF4(8937, 21615, 28322) },
+ { AOM_CDF4(5982, 15603, 22786) },
+ { AOM_CDF4(3620, 10267, 16136) },
+ { AOM_CDF4(27280, 32464, 32667) },
+ { AOM_CDF4(15607, 29160, 32004) },
+ { AOM_CDF4(9091, 22135, 28740) },
+ { AOM_CDF4(6232, 16632, 24020) },
+ { AOM_CDF4(4047, 11377, 17672) },
+ { AOM_CDF4(29220, 32630, 32718) },
+ { AOM_CDF4(19650, 31220, 32462) },
+ { AOM_CDF4(13050, 26312, 30827) },
+ { AOM_CDF4(9228, 20870, 27468) },
+ { AOM_CDF4(6146, 15149, 21971) },
+ { AOM_CDF4(30169, 32481, 32623) },
+ { AOM_CDF4(17212, 29311, 31554) },
+ { AOM_CDF4(9911, 21311, 26882) },
+ { AOM_CDF4(4487, 13314, 20372) },
+ { AOM_CDF4(2570, 7772, 12889) },
+ { AOM_CDF4(30924, 32613, 32708) },
+ { AOM_CDF4(19490, 30206, 32107) },
+ { AOM_CDF4(11232, 23998, 29276) },
+ { AOM_CDF4(6769, 17955, 25035) },
+ { AOM_CDF4(4398, 12623, 19214) },
+ { AOM_CDF4(30609, 32627, 32722) },
+ { AOM_CDF4(19370, 30582, 32287) },
+ { AOM_CDF4(10457, 23619, 29409) },
+ { AOM_CDF4(6443, 17637, 24834) },
+ { AOM_CDF4(4645, 13236, 20106) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8626, 20271, 26216) },
+ { AOM_CDF4(26707, 32406, 32711) },
+ { AOM_CDF4(16999, 30329, 32286) },
+ { AOM_CDF4(11445, 25123, 30286) },
+ { AOM_CDF4(6411, 18828, 25601) },
+ { AOM_CDF4(6801, 12458, 20248) },
+ { AOM_CDF4(29918, 32682, 32748) },
+ { AOM_CDF4(20649, 31739, 32618) },
+ { AOM_CDF4(12879, 27773, 31581) },
+ { AOM_CDF4(7896, 21751, 28244) },
+ { AOM_CDF4(5260, 14870, 23698) },
+ { AOM_CDF4(29252, 32593, 32731) },
+ { AOM_CDF4(17072, 30460, 32294) },
+ { AOM_CDF4(10653, 24143, 29365) },
+ { AOM_CDF4(6536, 17490, 23983) },
+ { AOM_CDF4(4929, 13170, 20085) },
+ { AOM_CDF4(28137, 32518, 32715) },
+ { AOM_CDF4(18171, 30784, 32407) },
+ { AOM_CDF4(11437, 25436, 30459) },
+ { AOM_CDF4(7252, 18534, 26176) },
+ { AOM_CDF4(4126, 13353, 20978) },
+ { AOM_CDF4(31162, 32726, 32748) },
+ { AOM_CDF4(23017, 32222, 32701) },
+ { AOM_CDF4(15629, 29233, 32046) },
+ { AOM_CDF4(9387, 22621, 29480) },
+ { AOM_CDF4(6922, 17616, 25010) },
+ { AOM_CDF4(28838, 32265, 32614) },
+ { AOM_CDF4(19701, 30206, 31920) },
+ { AOM_CDF4(11214, 22410, 27933) },
+ { AOM_CDF4(5320, 14177, 23034) },
+ { AOM_CDF4(5049, 12881, 17827) },
+ { AOM_CDF4(27484, 32471, 32734) },
+ { AOM_CDF4(21076, 31526, 32561) },
+ { AOM_CDF4(12707, 26303, 31211) },
+ { AOM_CDF4(8169, 21722, 28219) },
+ { AOM_CDF4(6045, 19406, 27042) },
+ { AOM_CDF4(27753, 32572, 32745) },
+ { AOM_CDF4(20832, 31878, 32653) },
+ { AOM_CDF4(13250, 27356, 31674) },
+ { AOM_CDF4(7718, 21508, 29858) },
+ { AOM_CDF4(7209, 18350, 25559) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(7876, 16901, 21741) },
+ { AOM_CDF4(24001, 31898, 32625) },
+ { AOM_CDF4(14529, 27959, 31451) },
+ { AOM_CDF4(8273, 20818, 27258) },
+ { AOM_CDF4(5278, 14673, 21510) },
+ { AOM_CDF4(2983, 8843, 14039) },
+ { AOM_CDF4(28016, 32574, 32732) },
+ { AOM_CDF4(17471, 30306, 32301) },
+ { AOM_CDF4(10224, 24063, 29728) },
+ { AOM_CDF4(6602, 17954, 25052) },
+ { AOM_CDF4(4002, 11585, 17759) },
+ { AOM_CDF4(30190, 32634, 32739) },
+ { AOM_CDF4(17497, 30282, 32270) },
+ { AOM_CDF4(10229, 23729, 29538) },
+ { AOM_CDF4(6344, 17211, 24440) },
+ { AOM_CDF4(3849, 11189, 17108) },
+ { AOM_CDF4(28570, 32583, 32726) },
+ { AOM_CDF4(17521, 30161, 32238) },
+ { AOM_CDF4(10153, 23565, 29378) },
+ { AOM_CDF4(6455, 17341, 24443) },
+ { AOM_CDF4(3907, 11042, 17024) },
+ { AOM_CDF4(30689, 32715, 32748) },
+ { AOM_CDF4(21546, 31840, 32610) },
+ { AOM_CDF4(13547, 27581, 31459) },
+ { AOM_CDF4(8912, 21757, 28309) },
+ { AOM_CDF4(5548, 15080, 22046) },
+ { AOM_CDF4(30783, 32540, 32685) },
+ { AOM_CDF4(17540, 29528, 31668) },
+ { AOM_CDF4(10160, 21468, 26783) },
+ { AOM_CDF4(4724, 13393, 20054) },
+ { AOM_CDF4(2702, 8174, 13102) },
+ { AOM_CDF4(31648, 32686, 32742) },
+ { AOM_CDF4(20954, 31094, 32337) },
+ { AOM_CDF4(12420, 25698, 30179) },
+ { AOM_CDF4(7304, 19320, 26248) },
+ { AOM_CDF4(4366, 12261, 18864) },
+ { AOM_CDF4(31581, 32723, 32748) },
+ { AOM_CDF4(21373, 31586, 32525) },
+ { AOM_CDF4(12744, 26625, 30885) },
+ { AOM_CDF4(7431, 20322, 26950) },
+ { AOM_CDF4(4692, 13323, 20111) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(7833, 18369, 24095) },
+ { AOM_CDF4(26650, 32273, 32702) },
+ { AOM_CDF4(16371, 29961, 32191) },
+ { AOM_CDF4(11055, 24082, 29629) },
+ { AOM_CDF4(6892, 18644, 25400) },
+ { AOM_CDF4(5006, 13057, 19240) },
+ { AOM_CDF4(29834, 32666, 32748) },
+ { AOM_CDF4(19577, 31335, 32570) },
+ { AOM_CDF4(12253, 26509, 31122) },
+ { AOM_CDF4(7991, 20772, 27711) },
+ { AOM_CDF4(5677, 15910, 23059) },
+ { AOM_CDF4(30109, 32532, 32720) },
+ { AOM_CDF4(16747, 30166, 32252) },
+ { AOM_CDF4(10134, 23542, 29184) },
+ { AOM_CDF4(5791, 16176, 23556) },
+ { AOM_CDF4(4362, 10414, 17284) },
+ { AOM_CDF4(29492, 32626, 32748) },
+ { AOM_CDF4(19894, 31402, 32525) },
+ { AOM_CDF4(12942, 27071, 30869) },
+ { AOM_CDF4(8346, 21216, 27405) },
+ { AOM_CDF4(6572, 17087, 23859) },
+ { AOM_CDF4(32035, 32735, 32748) },
+ { AOM_CDF4(22957, 31838, 32618) },
+ { AOM_CDF4(14724, 28572, 31772) },
+ { AOM_CDF4(10364, 23999, 29553) },
+ { AOM_CDF4(7004, 18433, 25655) },
+ { AOM_CDF4(27528, 32277, 32681) },
+ { AOM_CDF4(16959, 31171, 32096) },
+ { AOM_CDF4(10486, 23593, 27962) },
+ { AOM_CDF4(8192, 16384, 23211) },
+ { AOM_CDF4(8937, 17873, 20852) },
+ { AOM_CDF4(27715, 32002, 32615) },
+ { AOM_CDF4(15073, 29491, 31676) },
+ { AOM_CDF4(11264, 24576, 28672) },
+ { AOM_CDF4(2341, 18725, 23406) },
+ { AOM_CDF4(7282, 18204, 25486) },
+ { AOM_CDF4(28547, 32213, 32657) },
+ { AOM_CDF4(20788, 29773, 32239) },
+ { AOM_CDF4(6780, 21469, 30508) },
+ { AOM_CDF4(5958, 14895, 23831) },
+ { AOM_CDF4(16384, 21845, 27307) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(5992, 14304, 19765) },
+ { AOM_CDF4(22612, 31238, 32456) },
+ { AOM_CDF4(13456, 27162, 31087) },
+ { AOM_CDF4(8001, 20062, 26504) },
+ { AOM_CDF4(5168, 14105, 20764) },
+ { AOM_CDF4(2632, 7771, 12385) },
+ { AOM_CDF4(27034, 32344, 32709) },
+ { AOM_CDF4(15850, 29415, 31997) },
+ { AOM_CDF4(9494, 22776, 28841) },
+ { AOM_CDF4(6151, 16830, 23969) },
+ { AOM_CDF4(3461, 10039, 15722) },
+ { AOM_CDF4(30134, 32569, 32731) },
+ { AOM_CDF4(15638, 29422, 31945) },
+ { AOM_CDF4(9150, 21865, 28218) },
+ { AOM_CDF4(5647, 15719, 22676) },
+ { AOM_CDF4(3402, 9772, 15477) },
+ { AOM_CDF4(28530, 32586, 32735) },
+ { AOM_CDF4(17139, 30298, 32292) },
+ { AOM_CDF4(10200, 24039, 29685) },
+ { AOM_CDF4(6419, 17674, 24786) },
+ { AOM_CDF4(3544, 10225, 15824) },
+ { AOM_CDF4(31333, 32726, 32748) },
+ { AOM_CDF4(20618, 31487, 32544) },
+ { AOM_CDF4(12901, 27217, 31232) },
+ { AOM_CDF4(8624, 21734, 28171) },
+ { AOM_CDF4(5104, 14191, 20748) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(11206, 21090, 26561) },
+ { AOM_CDF4(28759, 32279, 32671) },
+ { AOM_CDF4(14171, 27952, 31569) },
+ { AOM_CDF4(9743, 22907, 29141) },
+ { AOM_CDF4(6871, 17886, 24868) },
+ { AOM_CDF4(4960, 13152, 19315) },
+ { AOM_CDF4(31077, 32661, 32748) },
+ { AOM_CDF4(19400, 31195, 32515) },
+ { AOM_CDF4(12752, 26858, 31040) },
+ { AOM_CDF4(8370, 22098, 28591) },
+ { AOM_CDF4(5457, 15373, 22298) },
+ { AOM_CDF4(31697, 32706, 32748) },
+ { AOM_CDF4(17860, 30657, 32333) },
+ { AOM_CDF4(12510, 24812, 29261) },
+ { AOM_CDF4(6180, 19124, 24722) },
+ { AOM_CDF4(5041, 13548, 17959) },
+ { AOM_CDF4(31552, 32716, 32748) },
+ { AOM_CDF4(21908, 31769, 32623) },
+ { AOM_CDF4(14470, 28201, 31565) },
+ { AOM_CDF4(9493, 22982, 28608) },
+ { AOM_CDF4(6858, 17240, 24137) },
+ { AOM_CDF4(32543, 32752, 32756) },
+ { AOM_CDF4(24286, 32097, 32666) },
+ { AOM_CDF4(15958, 29217, 32024) },
+ { AOM_CDF4(10207, 24234, 29958) },
+ { AOM_CDF4(6929, 18305, 25652) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } },
+ { { { AOM_CDF4(4137, 10847, 15682) },
+ { AOM_CDF4(17824, 27001, 30058) },
+ { AOM_CDF4(10204, 22796, 28291) },
+ { AOM_CDF4(6076, 15935, 22125) },
+ { AOM_CDF4(3852, 10937, 16816) },
+ { AOM_CDF4(2252, 6324, 10131) },
+ { AOM_CDF4(25840, 32016, 32662) },
+ { AOM_CDF4(15109, 28268, 31531) },
+ { AOM_CDF4(9385, 22231, 28340) },
+ { AOM_CDF4(6082, 16672, 23479) },
+ { AOM_CDF4(3318, 9427, 14681) },
+ { AOM_CDF4(30594, 32574, 32718) },
+ { AOM_CDF4(16836, 29552, 31859) },
+ { AOM_CDF4(9556, 22542, 28356) },
+ { AOM_CDF4(6305, 16725, 23540) },
+ { AOM_CDF4(3376, 9895, 15184) },
+ { AOM_CDF4(29383, 32617, 32745) },
+ { AOM_CDF4(18891, 30809, 32401) },
+ { AOM_CDF4(11688, 25942, 30687) },
+ { AOM_CDF4(7468, 19469, 26651) },
+ { AOM_CDF4(3909, 11358, 17012) },
+ { AOM_CDF4(31564, 32736, 32748) },
+ { AOM_CDF4(20906, 31611, 32600) },
+ { AOM_CDF4(13191, 27621, 31537) },
+ { AOM_CDF4(8768, 22029, 28676) },
+ { AOM_CDF4(5079, 14109, 20906) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } },
+ { { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) },
+ { AOM_CDF4(8192, 16384, 24576) } } } } };
+
+static const aom_cdf_prob av1_default_coeff_base_eob_multi_cdfs // File-local constant table of AOM_CDF3 initializers.
+ [TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES][SIG_COEF_CONTEXTS_EOB][CDF_SIZE( // Index order: q ctx, tx size, plane type, eob ctx.
+ NUM_BASE_LEVELS + 1)] = { { { { { AOM_CDF3(17837, 29055) }, // Initializer nests 4 x 5 x 2 x 4 AOM_CDF3 entries.
+ { AOM_CDF3(29600, 31446) },
+ { AOM_CDF3(30844, 31878) },
+ { AOM_CDF3(24926, 28948) } },
+ { { AOM_CDF3(21365, 30026) },
+ { AOM_CDF3(30512, 32423) },
+ { AOM_CDF3(31658, 32621) },
+ { AOM_CDF3(29630, 31881) } } },
+ { { { AOM_CDF3(5717, 26477) },
+ { AOM_CDF3(30491, 31703) },
+ { AOM_CDF3(31550, 32158) },
+ { AOM_CDF3(29648, 31491) } },
+ { { AOM_CDF3(12608, 27820) },
+ { AOM_CDF3(30680, 32225) },
+ { AOM_CDF3(30809, 32335) },
+ { AOM_CDF3(31299, 32423) } } },
+ { { { AOM_CDF3(1786, 12612) },
+ { AOM_CDF3(30663, 31625) },
+ { AOM_CDF3(32339, 32468) },
+ { AOM_CDF3(31148, 31833) } },
+ { { AOM_CDF3(18857, 23865) },
+ { AOM_CDF3(31428, 32428) },
+ { AOM_CDF3(31744, 32373) },
+ { AOM_CDF3(31775, 32526) } } },
+ { { { AOM_CDF3(1787, 2532) },
+ { AOM_CDF3(30832, 31662) },
+ { AOM_CDF3(31824, 32682) },
+ { AOM_CDF3(32133, 32569) } },
+ { { AOM_CDF3(13751, 22235) },
+ { AOM_CDF3(32089, 32409) },
+ { AOM_CDF3(27084, 27920) },
+ { AOM_CDF3(29291, 32594) } } },
+ { { { AOM_CDF3(1725, 3449) },
+ { AOM_CDF3(31102, 31935) },
+ { AOM_CDF3(32457, 32613) },
+ { AOM_CDF3(32412, 32649) } },
+ { { AOM_CDF3(10923, 21845) }, // NOTE(review): the last tx-size/plane group is (10923, 21845) in every q ctx - presumably a placeholder; confirm.
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(17560, 29888) },
+ { AOM_CDF3(29671, 31549) },
+ { AOM_CDF3(31007, 32056) },
+ { AOM_CDF3(27286, 30006) } },
+ { { AOM_CDF3(26594, 31212) },
+ { AOM_CDF3(31208, 32582) },
+ { AOM_CDF3(31835, 32637) },
+ { AOM_CDF3(30595, 32206) } } },
+ { { { AOM_CDF3(15239, 29932) },
+ { AOM_CDF3(31315, 32095) },
+ { AOM_CDF3(32130, 32434) },
+ { AOM_CDF3(30864, 31996) } },
+ { { AOM_CDF3(26279, 30968) },
+ { AOM_CDF3(31142, 32495) },
+ { AOM_CDF3(31713, 32540) },
+ { AOM_CDF3(31929, 32594) } } },
+ { { { AOM_CDF3(2644, 25198) },
+ { AOM_CDF3(32038, 32451) },
+ { AOM_CDF3(32639, 32695) },
+ { AOM_CDF3(32166, 32518) } },
+ { { AOM_CDF3(17187, 27668) },
+ { AOM_CDF3(31714, 32550) },
+ { AOM_CDF3(32283, 32678) },
+ { AOM_CDF3(31930, 32563) } } },
+ { { { AOM_CDF3(1044, 2257) },
+ { AOM_CDF3(30755, 31923) },
+ { AOM_CDF3(32208, 32693) },
+ { AOM_CDF3(32244, 32615) } },
+ { { AOM_CDF3(21317, 26207) },
+ { AOM_CDF3(29133, 30868) },
+ { AOM_CDF3(29311, 31231) },
+ { AOM_CDF3(29657, 31087) } } },
+ { { { AOM_CDF3(478, 1834) },
+ { AOM_CDF3(31005, 31987) },
+ { AOM_CDF3(32317, 32724) },
+ { AOM_CDF3(30865, 32648) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(20092, 30774) },
+ { AOM_CDF3(30695, 32020) },
+ { AOM_CDF3(31131, 32103) },
+ { AOM_CDF3(28666, 30870) } },
+ { { AOM_CDF3(27258, 31095) },
+ { AOM_CDF3(31804, 32623) },
+ { AOM_CDF3(31763, 32528) },
+ { AOM_CDF3(31438, 32506) } } },
+ { { { AOM_CDF3(18049, 30489) },
+ { AOM_CDF3(31706, 32286) },
+ { AOM_CDF3(32163, 32473) },
+ { AOM_CDF3(31550, 32184) } },
+ { { AOM_CDF3(27116, 30842) },
+ { AOM_CDF3(31971, 32598) },
+ { AOM_CDF3(32088, 32576) },
+ { AOM_CDF3(32067, 32664) } } },
+ { { { AOM_CDF3(12854, 29093) },
+ { AOM_CDF3(32272, 32558) },
+ { AOM_CDF3(32667, 32729) },
+ { AOM_CDF3(32306, 32585) } },
+ { { AOM_CDF3(25476, 30366) },
+ { AOM_CDF3(32169, 32687) },
+ { AOM_CDF3(32479, 32689) },
+ { AOM_CDF3(31673, 32634) } } },
+ { { { AOM_CDF3(2809, 19301) },
+ { AOM_CDF3(32205, 32622) },
+ { AOM_CDF3(32338, 32730) },
+ { AOM_CDF3(31786, 32616) } },
+ { { AOM_CDF3(22737, 29105) },
+ { AOM_CDF3(30810, 32362) },
+ { AOM_CDF3(30014, 32627) },
+ { AOM_CDF3(30528, 32574) } } },
+ { { { AOM_CDF3(935, 3382) },
+ { AOM_CDF3(30789, 31909) },
+ { AOM_CDF3(32466, 32756) },
+ { AOM_CDF3(30860, 32513) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } },
+ { { { { AOM_CDF3(22497, 31198) },
+ { AOM_CDF3(31715, 32495) },
+ { AOM_CDF3(31606, 32337) },
+ { AOM_CDF3(30388, 31990) } },
+ { { AOM_CDF3(27877, 31584) },
+ { AOM_CDF3(32170, 32728) },
+ { AOM_CDF3(32155, 32688) },
+ { AOM_CDF3(32219, 32702) } } },
+ { { { AOM_CDF3(21457, 31043) },
+ { AOM_CDF3(31951, 32483) },
+ { AOM_CDF3(32153, 32562) },
+ { AOM_CDF3(31473, 32215) } },
+ { { AOM_CDF3(27558, 31151) },
+ { AOM_CDF3(32020, 32640) },
+ { AOM_CDF3(32097, 32575) },
+ { AOM_CDF3(32242, 32719) } } },
+ { { { AOM_CDF3(19980, 30591) },
+ { AOM_CDF3(32219, 32597) },
+ { AOM_CDF3(32581, 32706) },
+ { AOM_CDF3(31803, 32287) } },
+ { { AOM_CDF3(26473, 30507) },
+ { AOM_CDF3(32431, 32723) },
+ { AOM_CDF3(32196, 32611) },
+ { AOM_CDF3(31588, 32528) } } },
+ { { { AOM_CDF3(24647, 30463) },
+ { AOM_CDF3(32412, 32695) },
+ { AOM_CDF3(32468, 32720) },
+ { AOM_CDF3(31269, 32523) } },
+ { { AOM_CDF3(28482, 31505) },
+ { AOM_CDF3(32152, 32701) },
+ { AOM_CDF3(31732, 32598) },
+ { AOM_CDF3(31767, 32712) } } },
+ { { { AOM_CDF3(12358, 24977) },
+ { AOM_CDF3(31331, 32385) },
+ { AOM_CDF3(32634, 32756) },
+ { AOM_CDF3(30411, 32548) } },
+ { { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) },
+ { AOM_CDF3(10923, 21845) } } } } };
+
+#endif // AOM_AV1_COMMON_TOKEN_CDFS_H_
diff --git a/third_party/aom/av1/common/txb_common.c b/third_party/aom/av1/common/txb_common.c
new file mode 100644
index 0000000000..c96d37cca7
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+const int8_t av1_coeff_band_4x4[16] = { 0, 1, 2, 3, 4, 5, 6, 7, // Identity table: entry i holds the value i.
+ 8, 9, 10, 11, 12, 13, 14, 15 };
+
+const int8_t av1_coeff_band_8x8[64] = { // Values range from 0 to 18. NOTE(review): presumably a band index per coefficient position - confirm with users.
+ 0, 1, 2, 2, 3, 3, 4, 4, 5, 6, 2, 2, 3, 3, 4, 4,
+ 7, 7, 8, 8, 9, 9, 10, 10, 7, 7, 8, 8, 9, 9, 10, 10,
+ 11, 11, 12, 12, 13, 13, 14, 14, 11, 11, 12, 12, 13, 13, 14, 14,
+ 15, 15, 16, 16, 17, 17, 18, 18, 15, 15, 16, 16, 17, 17, 18, 18,
+};
+
+const int8_t av1_coeff_band_16x16[256] = { // Values range from 0 to 21. NOTE(review): presumably a band index per coefficient position - confirm with users.
+ 0, 1, 4, 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 2, 3, 4,
+ 4, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7,
+ 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 5, 5, 6, 6, 7, 7, 7, 7, 8,
+ 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12,
+ 13, 13, 13, 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13,
+ 13, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 10, 10,
+ 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15,
+ 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15,
+ 16, 16, 16, 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16,
+ 16, 17, 17, 17, 17, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17,
+ 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18,
+ 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18,
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 18, 18, 18, 18, 19, 19, 19,
+ 19, 20, 20, 20, 20, 21, 21, 21, 21,
+};
+
+const int8_t av1_coeff_band_32x32[1024] = { // Values range from 0 to 24. NOTE(review): presumably a band index per coefficient position - confirm with users.
+ 0, 1, 4, 4, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 2, 3, 4, 4, 7, 7,
+ 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 12, 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10,
+ 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12,
+ 12, 5, 5, 6, 6, 7, 7, 7, 7, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11,
+ 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9,
+ 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11,
+ 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10,
+ 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11,
+ 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 8, 8, 8, 8,
+ 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16,
+ 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13,
+ 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14,
+ 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16,
+ 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14,
+ 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13,
+ 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15,
+ 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13,
+ 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16,
+ 16, 16, 16, 16, 16, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14,
+ 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 17,
+ 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
+ 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17,
+ 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20,
+ 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18,
+ 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20,
+ 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17,
+ 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20,
+ 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18,
+ 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20,
+ 20, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
+ 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 17, 17, 17, 17, 17,
+ 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19,
+ 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22,
+ 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24,
+ 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23,
+ 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21,
+ 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23,
+ 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22,
+ 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24,
+ 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21,
+ 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23,
+ 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22,
+ 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24,
+ 24, 24, 24, 24, 21, 21, 21, 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 23, 23, 23, 23, 23, 23, 23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
+};
+
+// Context (ctx) offset tables used when the TX class is TX_CLASS_2D.
+// TX column and row indices are clamped to 4.
+
+const int8_t av1_nz_map_ctx_offset_4x4[16] = { // Only the values 0, 1, 6 and 21 occur in this table.
+ 0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x8[64] = { // Only the values 0, 1, 6 and 21 occur; everything past the first few entries is 21.
+ 0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21,
+ 6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x16[256] = { // Only the values 0, 1, 6 and 21 occur; everything past the first few entries is 21.
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x32[1024] = {
+ 0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x4[32] = { // Only the values 0, 6, 16 and 21 occur in this table.
+ 0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x16[128] = { // Only the values 0, 6, 11 and 21 occur; entries 1..15 are all 11.
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21,
+ 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x8[128] = { // Only the values 0, 6, 16 and 21 occur; a "16, 16" pair recurs every 16 entries.
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x32[512] = { // Only the values 0, 6, 11 and 21 occur; entries 1..31 are all 11.
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x16[512] = { // Only the values 0, 6, 16 and 21 occur; a "16, 16" pair recurs every 32 entries.
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x64[1024] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_64x32[1024] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16,
+ 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_4x16[64] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_16x4[64] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_8x32[256] = {
+ 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21,
+ 21, 21, 21, 21, 21, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t av1_nz_map_ctx_offset_32x8[256] = {
+ 0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 6, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 16, 16, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21, 21, 21, 21, 21,
+};
+
+const int8_t *av1_nz_map_ctx_offset[19] = { // one offset table per TX_SIZE; several sizes share a table
+ av1_nz_map_ctx_offset_4x4, // TX_4x4
+ av1_nz_map_ctx_offset_8x8, // TX_8x8
+ av1_nz_map_ctx_offset_16x16, // TX_16x16
+ av1_nz_map_ctx_offset_32x32, // TX_32x32
+ av1_nz_map_ctx_offset_32x32, // TX_64x64 (shares the 32x32 table; slot name assumes the usual TX_SIZE enum order)
+ av1_nz_map_ctx_offset_4x16, // TX_4x8
+ av1_nz_map_ctx_offset_8x4, // TX_8x4
+ av1_nz_map_ctx_offset_8x32, // TX_8x16
+ av1_nz_map_ctx_offset_16x8, // TX_16x8
+ av1_nz_map_ctx_offset_16x32, // TX_16x32
+ av1_nz_map_ctx_offset_32x16, // TX_32x16
+ av1_nz_map_ctx_offset_32x64, // TX_32x64
+ av1_nz_map_ctx_offset_64x32, // TX_64x32
+ av1_nz_map_ctx_offset_4x16, // TX_4x16
+ av1_nz_map_ctx_offset_16x4, // TX_16x4
+ av1_nz_map_ctx_offset_8x32, // TX_8x32
+ av1_nz_map_ctx_offset_32x8, // TX_32x8
+ av1_nz_map_ctx_offset_16x32, // TX_16x64
+ av1_nz_map_ctx_offset_64x32, // TX_64x16
+};
+
+void av1_init_lv_map(AV1_COMMON *cm) { // precomputes cm->coeff_ctx_table.base_ctx_table
+ LV_MAP_CTX_TABLE *coeff_ctx_table = &cm->coeff_ctx_table;
+ for (int row = 0; row < 2; ++row) {
+ for (int col = 0; col < 2; ++col) {
+ for (int sig_mag = 0; sig_mag < 3; ++sig_mag) {
+ for (int count = 0; count < BASE_CONTEXT_POSITION_NUM + 1; ++count) {
+ if (row == 0 && col == 0 && count > 5) continue; // combination skipped: table entry is left unwritten
+ if ((row == 0 || col == 0) && count > 8) continue; // combination skipped: table entry is left unwritten
+
+ coeff_ctx_table->base_ctx_table[row][col][sig_mag][count] =
+ get_base_ctx_from_count_mag(row, col, count, sig_mag);
+ }
+ }
+ }
+ }
+}
+
+const int16_t k_eob_group_start[12] = { 0, 1, 2, 3, 5, 9,
+ 17, 33, 65, 129, 257, 513 }; // first value of each group; start[i + 1] - start[i] == 1 << k_eob_offset_bits[i]
+const int16_t k_eob_offset_bits[12] = { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; // group i spans 1 << bits[i] values
diff --git a/third_party/aom/av1/common/txb_common.h b/third_party/aom/av1/common/txb_common.h
new file mode 100644
index 0000000000..1dda51f8b4
--- /dev/null
+++ b/third_party/aom/av1/common/txb_common.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_TXB_COMMON_H_
+#define AOM_AV1_COMMON_TXB_COMMON_H_
+
+extern const int16_t k_eob_group_start[12];
+extern const int16_t k_eob_offset_bits[12];
+
+extern const int8_t av1_coeff_band_4x4[16];
+
+extern const int8_t av1_coeff_band_8x8[64];
+
+extern const int8_t av1_coeff_band_16x16[256];
+
+extern const int8_t av1_coeff_band_32x32[1024];
+
+extern const int8_t *av1_nz_map_ctx_offset[TX_SIZES_ALL]; // the definition is sized [19], so TX_SIZES_ALL must equal 19; used as [tx_size][coeff_idx]
+
+typedef struct txb_ctx { // output of get_txb_ctx()
+ int txb_skip_ctx; // written by get_txb_ctx()
+ int dc_sign_ctx; // written by get_txb_ctx(); one of 0, 1, 2
+} TXB_CTX;
+
+static const int base_level_count_to_index[13] = { // count (0..12) -> index 0..3, read by get_base_ctx_from_count_mag()
+ 0, 0, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3,
+};
+
+static const TX_CLASS tx_type_to_class[TX_TYPES] = { // V_* types -> VERT, H_* types -> HORIZ, all others -> 2D
+ TX_CLASS_2D, // DCT_DCT
+ TX_CLASS_2D, // ADST_DCT
+ TX_CLASS_2D, // DCT_ADST
+ TX_CLASS_2D, // ADST_ADST
+ TX_CLASS_2D, // FLIPADST_DCT
+ TX_CLASS_2D, // DCT_FLIPADST
+ TX_CLASS_2D, // FLIPADST_FLIPADST
+ TX_CLASS_2D, // ADST_FLIPADST
+ TX_CLASS_2D, // FLIPADST_ADST
+ TX_CLASS_2D, // IDTX
+ TX_CLASS_VERT, // V_DCT
+ TX_CLASS_HORIZ, // H_DCT
+ TX_CLASS_VERT, // V_ADST
+ TX_CLASS_HORIZ, // H_ADST
+ TX_CLASS_VERT, // V_FLIPADST
+ TX_CLASS_HORIZ, // H_FLIPADST
+};
+
+static INLINE int get_txb_bwl(TX_SIZE tx_size) { // returns tx_size_wide_log2[] of the adjusted size
+ tx_size = av1_get_adjusted_tx_size(tx_size); // lookup uses the adjusted size, not the one passed in
+ return tx_size_wide_log2[tx_size];
+}
+
+static INLINE int get_txb_wide(TX_SIZE tx_size) { // returns tx_size_wide[] of the adjusted size
+ tx_size = av1_get_adjusted_tx_size(tx_size); // lookup uses the adjusted size, not the one passed in
+ return tx_size_wide[tx_size];
+}
+
+static INLINE int get_txb_high(TX_SIZE tx_size) { // returns tx_size_high[] of the adjusted size
+ tx_size = av1_get_adjusted_tx_size(tx_size); // lookup uses the adjusted size, not the one passed in
+ return tx_size_high[tx_size];
+}
+
+static INLINE uint8_t *set_levels(uint8_t *const levels_buf, const int width) {
+ return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); // skips TX_PAD_TOP rows of (width + TX_PAD_HOR) entries
+}
+
+static INLINE int get_padded_idx(const int idx, const int bwl) {
+ return idx + ((idx >> bwl) << TX_PAD_HOR_LOG2); // adds (1 << TX_PAD_HOR_LOG2) per full row, where row = idx >> bwl
+}
+
+static INLINE int get_base_ctx_from_count_mag(int row, int col, int count,
+ int sig_mag) {
+ const int ctx = base_level_count_to_index[count]; // count must be 0..12 (table has 13 entries); ctx is 0..3
+ int ctx_idx = -1;
+
+ if (row == 0 && col == 0) { // position (0, 0): results 0..6
+ if (sig_mag >= 2) return ctx_idx = 0;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 1;
+ else
+ ctx_idx = 2;
+
+ return ctx_idx;
+ }
+
+ ctx_idx = 3 + ctx;
+ assert(ctx_idx <= 6);
+ return ctx_idx;
+ } else if (row == 0) { // rest of row 0: results 6..11
+ if (sig_mag >= 2) return ctx_idx = 6;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 7;
+ else
+ ctx_idx = 8;
+ return ctx_idx;
+ }
+
+ ctx_idx = 9 + ctx;
+ assert(ctx_idx <= 11);
+ return ctx_idx;
+ } else if (col == 0) { // rest of column 0: results 12..17
+ if (sig_mag >= 2) return ctx_idx = 12;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 13;
+ else
+ ctx_idx = 14;
+
+ return ctx_idx;
+ }
+
+ ctx_idx = 15 + ctx;
+ assert(ctx_idx <= 17);
+ // TODO(angiebird): turn this on once the optimization is finalized
+ // assert(ctx_idx < 28);
+ } else { // any other position: results 18..24
+ if (sig_mag >= 2) return ctx_idx = 18;
+ if (sig_mag == 1) {
+ if (count >= 2)
+ ctx_idx = 19;
+ else
+ ctx_idx = 20;
+ return ctx_idx;
+ }
+
+ ctx_idx = 21 + ctx;
+
+ assert(ctx_idx <= 24);
+ }
+ return ctx_idx; // reached only from the sig_mag == 0 path of the last two branches
+}
+
+static INLINE int get_br_ctx_2d(const uint8_t *const levels,
+ const int c, // raster order
+ const int bwl) {
+ assert(c > 0); // position 0 is not handled here; get_br_ctx() has the c == 0 case
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = (1 << bwl) + TX_PAD_HOR; // row pitch of the padded levels buffer
+ const int pos = row * stride + col;
+ int mag = AOMMIN(levels[pos + 1], MAX_BASE_BR_RANGE) +
+ AOMMIN(levels[pos + stride], MAX_BASE_BR_RANGE) +
+ AOMMIN(levels[pos + 1 + stride], MAX_BASE_BR_RANGE); // right, below and below-right neighbours, each capped
+ mag = AOMMIN((mag + 1) >> 1, 6); // rounded half of the sum, limited to 0..6
+ //((row | col) < 2) is equivalent to ((row < 2) && (col < 2))
+ if ((row | col) < 2) return mag + 7; // 7..13
+ return mag + 14; // 14..20
+}
+
+static AOM_FORCE_INLINE int get_br_ctx(const uint8_t *const levels,
+ const int c, // raster order
+ const int bwl, const TX_CLASS tx_class) {
+ const int row = c >> bwl;
+ const int col = c - (row << bwl);
+ const int stride = (1 << bwl) + TX_PAD_HOR; // row pitch of the padded levels buffer
+ const int pos = row * stride + col;
+ int mag = levels[pos + 1]; // right neighbour (unlike get_br_ctx_2d, values are not capped here)
+ mag += levels[pos + stride]; // neighbour below
+ switch (tx_class) { // the third neighbour depends on the transform class
+ case TX_CLASS_2D:
+ mag += levels[pos + stride + 1]; // below-right
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag; // 0..6
+ if ((row < 2) && (col < 2)) return mag + 7; // 7..13
+ break;
+ case TX_CLASS_HORIZ:
+ mag += levels[pos + 2]; // two to the right
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag;
+ if (col == 0) return mag + 7;
+ break;
+ case TX_CLASS_VERT:
+ mag += levels[pos + (stride << 1)]; // two below
+ mag = AOMMIN((mag + 1) >> 1, 6);
+ if (c == 0) return mag;
+ if (row == 0) return mag + 7;
+ break;
+ default: break; // unknown class: mag stays the unhalved, unclamped two-neighbour sum
+ }
+
+ return mag + 14; // 14..20 for the three known classes
+}
+
+static const uint8_t clip_max3[256] = { // clip_max3[x] == (x < 3 ? x : 3) for every 8-bit x
+ 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
+};
+
+static AOM_FORCE_INLINE int get_nz_mag(const uint8_t *const levels, // get_lower_levels_ctx() passes levels already offset to the current coefficient
+ const int bwl, const TX_CLASS tx_class) {
+ int mag; // sum of five neighbour levels, each clipped to 3, so 0..15
+
+ // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+ mag = clip_max3[levels[1]]; // { 0, 1 }
+ mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR]]; // { 1, 0 }
+
+ if (tx_class == TX_CLASS_2D) {
+ mag += clip_max3[levels[(1 << bwl) + TX_PAD_HOR + 1]]; // { 1, 1 }
+ mag += clip_max3[levels[2]]; // { 0, 2 }
+ mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 }
+ } else if (tx_class == TX_CLASS_VERT) {
+ mag += clip_max3[levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)]]; // { 2, 0 }
+ mag += clip_max3[levels[(3 << bwl) + (3 << TX_PAD_HOR_LOG2)]]; // { 3, 0 }
+ mag += clip_max3[levels[(4 << bwl) + (4 << TX_PAD_HOR_LOG2)]]; // { 4, 0 }
+ } else { // any class other than 2D or VERT is treated as horizontal
+ mag += clip_max3[levels[2]]; // { 0, 2 }
+ mag += clip_max3[levels[3]]; // { 0, 3 }
+ mag += clip_max3[levels[4]]; // { 0, 4 }
+ }
+
+ return mag;
+}
+
+#define NZ_MAP_CTX_0 SIG_COEF_CONTEXTS_2D // offset used for index 0 of nz_map_ctx_offset_1d
+#define NZ_MAP_CTX_5 (NZ_MAP_CTX_0 + 5) // offset used for index 1
+#define NZ_MAP_CTX_10 (NZ_MAP_CTX_0 + 10) // offset used for indices 2..31
+
+static const int nz_map_ctx_offset_1d[32] = { // indexed by col (HORIZ) or row (VERT) in get_nz_map_ctx_from_stats()
+ NZ_MAP_CTX_0, NZ_MAP_CTX_5, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+ NZ_MAP_CTX_10, NZ_MAP_CTX_10,
+};
+
+static AOM_FORCE_INLINE int get_nz_map_ctx_from_stats(
+ const int stats, // neighbour magnitude sum; get_lower_levels_ctx() passes the get_nz_mag() result
+ const int coeff_idx, // raster order
+ const int bwl, const TX_SIZE tx_size, const TX_CLASS tx_class) {
+ // tx_class == 0(TX_CLASS_2D)
+ if ((tx_class | coeff_idx) == 0) return 0; // both zero: 2-D class at position 0
+ int ctx = (stats + 1) >> 1;
+ ctx = AOMMIN(ctx, 4); // rounded half of stats, limited to at most 4
+ switch (tx_class) {
+ case TX_CLASS_2D: {
+ // This is the algorithm to generate av1_nz_map_ctx_offset[][]
+ // const int width = tx_size_wide[tx_size];
+ // const int height = tx_size_high[tx_size];
+ // if (width < height) {
+ // if (row < 2) return 11 + ctx;
+ // } else if (width > height) {
+ // if (col < 2) return 16 + ctx;
+ // }
+ // if (row + col < 2) return ctx + 1;
+ // if (row + col < 4) return 5 + ctx + 1;
+ // return 21 + ctx;
+ return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+ }
+ case TX_CLASS_HORIZ: {
+ const int row = coeff_idx >> bwl;
+ const int col = coeff_idx - (row << bwl);
+ return ctx + nz_map_ctx_offset_1d[col]; // offset depends on the column only
+ break; // not reached: follows a return
+ }
+ case TX_CLASS_VERT: {
+ const int row = coeff_idx >> bwl;
+ return ctx + nz_map_ctx_offset_1d[row]; // offset depends on the row only
+ break; // not reached: follows a return
+ }
+ default: break;
+ }
+ return 0; // only reached for a tx_class outside the three cases above
+}
+
+typedef aom_cdf_prob (*base_cdf_arr)[CDF_SIZE(4)]; // pointer to arrays of CDF_SIZE(4) aom_cdf_prob elements
+typedef aom_cdf_prob (*br_cdf_arr)[CDF_SIZE(BR_CDF_SIZE)]; // pointer to arrays of CDF_SIZE(BR_CDF_SIZE) aom_cdf_prob elements
+
+static INLINE int get_lower_levels_ctx_eob(int bwl, int height, int scan_idx) { // returns 0..3 by where scan_idx falls in the block
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (height << bwl) / 8) return 1; // height << bwl is height * (1 << bwl); first eighth
+ if (scan_idx <= (height << bwl) / 4) return 2; // up to the first quarter
+ return 3;
+}
+
+static INLINE int get_lower_levels_ctx_2d(const uint8_t *levels, int coeff_idx,
+ int bwl, TX_SIZE tx_size) {
+ assert(coeff_idx > 0); // position 0 is not handled by this helper
+ int mag;
+ // Note: AOMMIN(level, 3) is useless for decoder since level < 3.
+ levels = levels + get_padded_idx(coeff_idx, bwl); // move to the current coefficient in the padded buffer
+ mag = AOMMIN(levels[1], 3); // { 0, 1 }
+ mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR], 3); // { 1, 0 }
+ mag += AOMMIN(levels[(1 << bwl) + TX_PAD_HOR + 1], 3); // { 1, 1 }
+ mag += AOMMIN(levels[2], 3); // { 0, 2 }
+ mag += AOMMIN(levels[(2 << bwl) + (2 << TX_PAD_HOR_LOG2)], 3); // { 2, 0 }
+
+ const int ctx = AOMMIN((mag + 1) >> 1, 4); // same halving and clamp as get_nz_map_ctx_from_stats()
+ return ctx + av1_nz_map_ctx_offset[tx_size][coeff_idx];
+}
+static AOM_FORCE_INLINE int get_lower_levels_ctx(const uint8_t *levels, // base of the levels buffer; the offset to coeff_idx is applied below
+ int coeff_idx, int bwl,
+ TX_SIZE tx_size,
+ TX_CLASS tx_class) {
+ const int stats =
+ get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class); // neighbour magnitude sum at coeff_idx
+ return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+}
+
+static INLINE int get_lower_levels_ctx_general(int is_last, int scan_idx,
+ int bwl, int height,
+ const uint8_t *levels,
+ int coeff_idx, TX_SIZE tx_size,
+ TX_CLASS tx_class) {
+ if (is_last) { // same thresholds as get_lower_levels_ctx_eob(), written with shifts
+ if (scan_idx == 0) return 0;
+ if (scan_idx <= (height << bwl) >> 3) return 1;
+ if (scan_idx <= (height << bwl) >> 2) return 2;
+ return 3;
+ }
+ return get_lower_levels_ctx(levels, coeff_idx, bwl, tx_size, tx_class); // otherwise use the neighbour-based context
+}
+
+static INLINE void set_dc_sign(int *cul_level, int dc_val) { // tags *cul_level above COEFF_CONTEXT_BITS; unchanged when dc_val == 0
+ if (dc_val < 0)
+ *cul_level |= 1 << COEFF_CONTEXT_BITS; // negative: field value 1, which get_txb_ctx() reads as -1
+ else if (dc_val > 0)
+ *cul_level += 2 << COEFF_CONTEXT_BITS; // positive: field value 2, which get_txb_ctx() reads as +1
+}
+
+static INLINE void get_txb_ctx(const BLOCK_SIZE plane_bsize,
+ const TX_SIZE tx_size, const int plane,
+ const ENTROPY_CONTEXT *const a, // read for tx_size_wide_unit[tx_size] entries
+ const ENTROPY_CONTEXT *const l, // read for tx_size_high_unit[tx_size] entries
+ TXB_CTX *const txb_ctx) { // both fields are written
+#define MAX_TX_SIZE_UNIT 16
+ static const int8_t signs[3] = { 0, -1, 1 }; // indexed by the field above COEFF_CONTEXT_BITS: 0 -> 0, 1 -> -1, 2 -> +1
+ static const int8_t dc_sign_contexts[4 * MAX_TX_SIZE_UNIT + 1] = { // index = sum + 2 * MAX_TX_SIZE_UNIT: negative -> 1, zero -> 0, positive -> 2
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
+ };
+ const int txb_w_unit = tx_size_wide_unit[tx_size];
+ const int txb_h_unit = tx_size_high_unit[tx_size];
+ int dc_sign = 0; // running sum of the neighbours' -1/0/+1 signs
+ int k = 0;
+
+ do { // do/while: a[0] is always read
+ const unsigned int sign = ((uint8_t)a[k]) >> COEFF_CONTEXT_BITS;
+ assert(sign <= 2);
+ dc_sign += signs[sign];
+ } while (++k < txb_w_unit);
+
+ k = 0;
+ do { // do/while: l[0] is always read
+ const unsigned int sign = ((uint8_t)l[k]) >> COEFF_CONTEXT_BITS;
+ assert(sign <= 2);
+ dc_sign += signs[sign];
+ } while (++k < txb_h_unit);
+
+ txb_ctx->dc_sign_ctx = dc_sign_contexts[dc_sign + 2 * MAX_TX_SIZE_UNIT]; // bias keeps the index non-negative
+
+ if (plane == 0) {
+ if (plane_bsize == txsize_to_bsize[tx_size]) { // transform covers the whole plane block
+ txb_ctx->txb_skip_ctx = 0;
+ } else {
+ // This is the algorithm to generate table skip_contexts[min][max].
+ // if (!max)
+ // txb_skip_ctx = 1;
+ // else if (!min)
+ // txb_skip_ctx = 2 + (max > 3);
+ // else if (max <= 3)
+ // txb_skip_ctx = 4;
+ // else if (min <= 3)
+ // txb_skip_ctx = 5;
+ // else
+ // txb_skip_ctx = 6;
+ static const uint8_t skip_contexts[5][5] = { { 1, 2, 2, 2, 3 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 5 },
+ { 1, 4, 4, 4, 6 } };
+ int top = 0;
+ int left = 0;
+
+ k = 0;
+ do {
+ top |= a[k]; // bitwise OR of all above entries
+ } while (++k < txb_w_unit);
+ top &= COEFF_CONTEXT_MASK; // keep only the bits selected by COEFF_CONTEXT_MASK
+
+ k = 0;
+ do {
+ left |= l[k]; // bitwise OR of all left entries
+ } while (++k < txb_h_unit);
+ left &= COEFF_CONTEXT_MASK;
+ const int max = AOMMIN(top | left, 4); // note: bitwise OR of top and left, not their arithmetic maximum
+ const int min = AOMMIN(AOMMIN(top, left), 4);
+
+ txb_ctx->txb_skip_ctx = skip_contexts[min][max];
+ }
+ } else { // planes other than 0
+ const int ctx_base = get_entropy_context(tx_size, a, l);
+ const int ctx_offset = (num_pels_log2_lookup[plane_bsize] >
+ num_pels_log2_lookup[txsize_to_bsize[tx_size]])
+ ? 10
+ : 7; // 10 when the plane block is larger than the transform's block size, else 7
+ txb_ctx->txb_skip_ctx = ctx_base + ctx_offset;
+ }
+#undef MAX_TX_SIZE_UNIT
+}
+
+void av1_init_lv_map(AV1_COMMON *cm); // precomputes cm->coeff_ctx_table.base_ctx_table
+
+#endif // AOM_AV1_COMMON_TXB_COMMON_H_
diff --git a/third_party/aom/av1/common/warped_motion.c b/third_party/aom/av1/common/warped_motion.c
new file mode 100644
index 0000000000..4144c43896
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.c
@@ -0,0 +1,1148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+#include "av1/common/scale.h"
+
+#define WARP_ERROR_BLOCK 32
+
+/* clang-format off */
+// Error-weighting lookup table used by error_measure() and
+// highbd_error_measure(), which read it at index (255 + e) (and 256 + e for
+// the high bit-depth interpolation), where e is a pixel difference.
+// The entries are symmetric about index 255, whose value is 0, and rise to
+// 16384 at both ends; the last two entries (indices 510 and 511) are both
+// 16384.
+// NOTE(review): from spot checks the entries look like
+// round(16384 * (|e| / 255)^0.7), which is presumably what "pow 0.7" below
+// refers to -- confirm against whatever generated the table.
+static const int error_measure_lut[512] = {
+ // pow 0.7
+ 16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
+ 16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
+ 15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
+ 15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
+ 14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
+ 14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
+ 14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
+ 13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
+ 13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
+ 12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
+ 12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
+ 12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
+ 11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
+ 11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
+ 10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
+ 10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
+ 10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
+ 9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
+ 9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
+ 8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
+ 8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
+ 7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
+ 7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
+ 6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
+ 6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
+ 5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
+ 5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
+ 4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
+ 3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
+ 3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
+ 2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
+ 1323, 1187, 1045, 894, 731, 550, 339, 0,
+ 339, 550, 731, 894, 1045, 1187, 1323, 1452,
+ 1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
+ 2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
+ 3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
+ 3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
+ 4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
+ 5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
+ 5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
+ 6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
+ 6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
+ 7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
+ 7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
+ 8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
+ 8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
+ 9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
+ 9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
+ 10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
+ 10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
+ 11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
+ 11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
+ 11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
+ 12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
+ 12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
+ 13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
+ 13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
+ 13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
+ 14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
+ 14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
+ 15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
+ 15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
+ 15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
+ 16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
+};
+/* clang-format on */
+
+// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
+// at a time. The zoom/rotation/shear in the model are applied to the
+// "fractional" position of each pixel, which therefore varies within
+// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
+// We need an extra 2 taps to fit this in, for a total of 8 taps.
+//
+// Layout: the table is declared with WARPEDPIXEL_PREC_SHIFTS * 3 + 1 rows of
+// 8 taps: one group of rows per unit interval ([-1, 0), [0, 1), [1, 2)) plus
+// a final "dummy" row that duplicates the last real row. The warp filters in
+// this file select a row with
+//   ROUND_POWER_OF_TWO(s, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS
+// and assert that the result is at most WARPEDPIXEL_PREC_SHIFTS * 3, i.e. the
+// dummy row is a legal index.
+// NOTE(review): the rows spot-checked each sum to 128, presumably
+// 1 << FILTER_BITS -- confirm.
+// NOTE(review): rows are only provided for WARPEDPIXEL_PREC_BITS == 6 and
+// == 5; any other value leaves the initializer empty.
+/* clang-format off */
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
+ { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
+ { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
+ { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
+ { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
+ { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
+ { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
+ { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
+ { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
+ { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
+ { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
+ { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
+ { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
+ { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
+ { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
+ { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
+ { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
+ { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
+ { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
+ { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
+ { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
+ { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
+ { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
+ { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
+ { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
+ { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
+ { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
+ { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
+ { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
+ { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
+ { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
+ { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
+
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
+ { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
+ {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
+ {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
+ {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
+ {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
+ {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
+ {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
+ {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
+ {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
+ {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
+ {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
+ {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
+ {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
+ {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
+ {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
+ { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},
+
+ // [1, 2)
+ { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
+ { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
+ { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
+ { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
+ { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
+ { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
+ { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
+ { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
+ { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
+ { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
+ { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
+ { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
+ { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
+ { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
+ { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
+ { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
+ { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
+ { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
+ { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
+ { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
+ { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
+ { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
+ { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
+ { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
+ { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
+ { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
+ { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
+ { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
+ { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
+ { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
+ { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
+ { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
+ // dummy (replicate row index 191)
+ { 0, 0, 0, 0, 2, 127, - 1, 0 },
+
+#elif WARPEDPIXEL_PREC_BITS == 5
+ // [-1, 0)
+ {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
+ {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
+ {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
+ {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
+ {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
+ {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
+ {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
+ {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
+ {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
+ {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
+ {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
+ {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
+ {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
+ {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
+ {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
+ {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
+ // [0, 1)
+ { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
+ { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
+ {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
+ {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
+ {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
+ {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
+ {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
+ {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
+ {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
+ {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
+ {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
+ {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
+ {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
+ {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
+ {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
+ { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
+ // [1, 2)
+ {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
+ {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
+ {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
+ {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
+ {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
+ {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
+ {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
+ {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
+ {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
+ {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
+ {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
+ {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
+ {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
+ {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
+ {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
+ {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
+ // dummy (replicate row index 95)
+ {0, 0, 0, -1, 4, 127, -3, 1},
+
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+
+/* clang-format on */
+
+#define DIV_LUT_PREC_BITS 14
+#define DIV_LUT_BITS 8
+#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
+
+// Table of multipliers indexed by resolve_divisor_64() and
+// resolve_divisor_32(). It holds DIV_LUT_NUM + 1 entries, so an index equal
+// to DIV_LUT_NUM (which those functions' asserts permit) is still in range.
+// NOTE(review): from spot checks the entries look like
+//   round((1 << DIV_LUT_PREC_BITS) * DIV_LUT_NUM / (DIV_LUT_NUM + i)),
+// i.e. 1 / (1 + i / DIV_LUT_NUM) with DIV_LUT_PREC_BITS fractional bits --
+// confirm against whatever generated the table.
+static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
+ 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
+ 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
+ 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
+ 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
+ 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
+ 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
+ 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
+ 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
+ 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
+ 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
+ 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
+ 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
+ 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
+ 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
+ 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
+ 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
+ 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
+ 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
+ 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
+ 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
+ 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
+ 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
+ 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
+ 8240, 8224, 8208, 8192,
+};
+
+// Approximates the reciprocal of a 64-bit divisor D in fixed point, so that
+// 1/D ~= y / 2^(*shift). The multiplier y is looked up in div_lut (precision
+// DIV_LUT_PREC_BITS) and returned; the matching exponent is written to *shift.
+// NOTE(review): D is presumably required to be non-zero (get_msb of 0) --
+// confirm against get_msb's contract.
+static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
+  const unsigned int upper = (unsigned int)(D >> 32);
+  // Position of the most significant set bit of D.
+  const int msb = upper ? get_msb(upper) + 32 : get_msb((unsigned int)D);
+  // What is left of D once that top bit has been cleared.
+  const int64_t rem = D - ((uint64_t)1 << msb);
+
+  // Reduce (or widen) the remainder to its top DIV_LUT_BITS bits; this is the
+  // table index.
+  int64_t idx;
+  if (msb <= DIV_LUT_BITS) {
+    idx = rem << (DIV_LUT_BITS - msb);
+  } else {
+    idx = ROUND_POWER_OF_TWO_64(rem, msb - DIV_LUT_BITS);
+  }
+  assert(idx <= DIV_LUT_NUM);
+
+  *shift = (int16_t)(msb + DIV_LUT_PREC_BITS);
+  return div_lut[idx];
+}
+
+// 32-bit counterpart of resolve_divisor_64(): returns a multiplier y from
+// div_lut and stores an exponent in *shift such that 1/D ~= y / 2^(*shift).
+// NOTE(review): D is presumably required to be non-zero (get_msb of 0) --
+// confirm against get_msb's contract.
+static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
+  // Position of the most significant set bit of D.
+  const int msb = get_msb(D);
+  // What is left of D once that top bit has been cleared.
+  const int32_t rem = D - ((uint32_t)1 << msb);
+
+  // Reduce (or widen) the remainder to its top DIV_LUT_BITS bits; this is the
+  // table index.
+  int32_t idx;
+  if (msb <= DIV_LUT_BITS) {
+    idx = rem << (DIV_LUT_BITS - msb);
+  } else {
+    idx = ROUND_POWER_OF_TWO(rem, msb - DIV_LUT_BITS);
+  }
+  assert(idx <= DIV_LUT_NUM);
+
+  *shift = (int16_t)(msb + DIV_LUT_PREC_BITS);
+  return div_lut[idx];
+}
+
+// Returns non-zero when the model's wmmat[2] coefficient is strictly positive,
+// which is the only validity condition checked here.
+static int is_affine_valid(const WarpedMotionParams *const wm) {
+  return wm->wmmat[2] > 0;
+}
+
+// Returns 1 when both shear constraints hold, 0 otherwise:
+//   4 * |alpha| + 7 * |beta|  < (1 << WARPEDMODEL_PREC_BITS)
+//   4 * |gamma| + 4 * |delta| < (1 << WARPEDMODEL_PREC_BITS)
+static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
+                                   int16_t delta) {
+  const int limit = 1 << WARPEDMODEL_PREC_BITS;
+  const int horiz_extent = 4 * abs(alpha) + 7 * abs(beta);
+  const int vert_extent = 4 * abs(gamma) + 4 * abs(delta);
+  return horiz_extent < limit && vert_extent < limit;
+}
+
+// Derives the shear parameters alpha, beta, gamma and delta from
+// wm->wmmat[2..5] and stores them in wm, each rounded to a multiple of
+// (1 << WARP_PARAM_REDUCE_BITS).
+// Returns 1 on success, or 0 when the affine set is invalid or the resulting
+// shear is not allowed. wm is left untouched only if is_affine_valid() fails;
+// in the "shear not allowed" case the four fields have already been written.
+int get_shear_params(WarpedMotionParams *wm) {
+  if (!is_affine_valid(wm)) return 0;
+
+  const int32_t *const m = wm->wmmat;
+  const int unity = 1 << WARPEDMODEL_PREC_BITS;
+
+  wm->alpha = clamp(m[2] - unity, INT16_MIN, INT16_MAX);
+  wm->beta = clamp(m[3], INT16_MIN, INT16_MAX);
+
+  // Fixed-point reciprocal of m[2]: 1 / m[2] ~= recip / 2^recip_shift.
+  int16_t recip_shift;
+  const int sign = (m[2] < 0) ? -1 : 1;
+  const int16_t recip = resolve_divisor_32(abs(m[2]), &recip_shift) * sign;
+
+  // gamma = m[4] / m[2], scaled by (1 << WARPEDMODEL_PREC_BITS).
+  const int64_t gamma_num = ((int64_t)m[4] * unity) * recip;
+  wm->gamma = clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(gamma_num, recip_shift),
+                    INT16_MIN, INT16_MAX);
+
+  // delta = m[5] - m[3] * m[4] / m[2] - 1 (in the same fixed-point scale).
+  const int64_t cross = ((int64_t)m[3] * m[4]) * recip;
+  wm->delta =
+      clamp(m[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(cross, recip_shift) - unity,
+            INT16_MIN, INT16_MAX);
+
+  // Quantise every parameter to a multiple of (1 << WARP_PARAM_REDUCE_BITS).
+  const int step = 1 << WARP_PARAM_REDUCE_BITS;
+  wm->alpha =
+      ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) * step;
+  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) * step;
+  wm->gamma =
+      ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) * step;
+  wm->delta =
+      ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) * step;
+
+  return is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta) !=
+         0;
+}
+
+// Error weight for a high bit-depth pixel difference. |err| is split into a
+// coarse part (|err| >> (bd - 8)) and a fractional part (the low bd - 8 bits);
+// the result linearly interpolates between two adjacent error_measure_lut
+// entries and is scaled by (1 << (bd - 8)) relative to a single table entry.
+static INLINE int highbd_error_measure(int err, int bd) {
+  const int extra_bits = bd - 8;
+  const int scale = 1 << extra_bits;
+  const int magnitude = abs(err);
+  const int coarse = magnitude >> extra_bits;
+  const int frac = magnitude & (scale - 1);
+  const int *const lut = &error_measure_lut[255 + coarse];
+  return lut[0] * (scale - frac) + lut[1] * frac;
+}
+
+/* High bit-depth (uint16_t sample) warp filter.
+
+   For every 8x8 block of the p_width x p_height prediction area starting at
+   (p_col, p_row), the block centre is projected through the affine model
+   `mat`, a 15x8 intermediate is produced by the horizontal filter, and the
+   vertical filter then writes up to 8x8 output samples.
+
+   mat            : model coefficients; mat[0..5] are read.
+   ref            : reference samples; reads are clamped to
+                    [0, width - 1] x [0, height - 1] and addressed with
+                    `stride`.
+   pred, p_stride : destination for final pixels (non-compound, or compound
+                    with do_average set).
+   subsampling_x/y: plane subsampling shifts applied when projecting.
+   bd             : bit depth, used for offsets and final clipping.
+   conv_params    : rounding amounts and the compound buffer
+                    (dst/dst_stride); when is_compound is set and do_average
+                    is not, the rounded sum is stored to conv_params->dst
+                    instead of pred.
+   alpha, beta, gamma, delta : per-pixel shear increments of the filter
+                    phase.
+
+   The block-centre projection is evaluated in 64-bit arithmetic: the products
+   mat[k] * src_x / src_y can exceed the int32_t range, and signed overflow is
+   undefined behaviour. Results are unchanged whenever the 32-bit computation
+   did not overflow.
+
+   Note: For an explanation of the warp algorithm, and some notes on bit widths
+   for hardware implementations, see the comments above av1_warp_affine_c
+*/
+void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
+                              int width, int height, int stride, uint16_t *pred,
+                              int p_col, int p_row, int p_width, int p_height,
+                              int p_stride, int subsampling_x,
+                              int subsampling_y, int bd,
+                              ConvolveParams *conv_params, int16_t alpha,
+                              int16_t beta, int16_t gamma, int16_t delta) {
+  int32_t tmp[15 * 8];
+  const int reduce_bits_horiz =
+      conv_params->round_0 +
+      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+  const int reduce_bits_vert = conv_params->is_compound
+                                   ? conv_params->round_1
+                                   : 2 * FILTER_BITS - reduce_bits_horiz;
+  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+  const int offset_bits_horiz = bd + FILTER_BITS - 1;
+  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+  const int round_bits =
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  (void)max_bits_horiz;
+  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+  for (int i = p_row; i < p_row + p_height; i += 8) {
+    for (int j = p_col; j < p_col + p_width; j += 8) {
+      // Calculate the center of this 8x8 block,
+      // project to luma coordinates (if in a subsampled chroma plane),
+      // apply the affine transformation,
+      // then convert back to the original coordinates (if necessary)
+      const int32_t src_x = (j + 4) << subsampling_x;
+      const int32_t src_y = (i + 4) << subsampling_y;
+      // 64-bit intermediates: the products below may not fit in int32_t.
+      const int64_t dst_x =
+          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
+      const int64_t dst_y =
+          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
+      const int64_t x4 = dst_x >> subsampling_x;
+      const int64_t y4 = dst_y >> subsampling_y;
+
+      // Split into integer sample position and fractional phase.
+      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sx4 = (int32_t)(x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1));
+      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
+      int32_t sy4 = (int32_t)(y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1));
+
+      // Move the phase from the block centre to its top-left sample.
+      sx4 += alpha * (-4) + beta * (-4);
+      sy4 += gamma * (-4) + delta * (-4);
+
+      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+      // Horizontal filter
+      for (int k = -7; k < 8; ++k) {
+        // Clamp the source row to the frame.
+        const int iy = clamp(iy4 + k, 0, height - 1);
+
+        int sx = sx4 + beta * (k + 4);
+        for (int l = -4; l < 4; ++l) {
+          int ix = ix4 + l - 3;
+          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_horiz;
+          for (int m = 0; m < 8; ++m) {
+            // Clamp the source column to the frame.
+            const int sample_x = clamp(ix + m, 0, width - 1);
+            sum += ref[iy * stride + sample_x] * coeffs[m];
+          }
+          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+          assert(0 <= sum && sum < (1 << max_bits_horiz));
+          tmp[(k + 7) * 8 + (l + 4)] = sum;
+          sx += alpha;
+        }
+      }
+
+      // Vertical filter
+      // The loop bounds crop the output when the prediction area ends inside
+      // this 8x8 block.
+      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+        int sy = sy4 + delta * (k + 4);
+        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+                           WARPEDPIXEL_PREC_SHIFTS;
+          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+          const int16_t *coeffs = warped_filter[offs];
+
+          int32_t sum = 1 << offset_bits_vert;
+          for (int m = 0; m < 8; ++m) {
+            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+          }
+
+          if (conv_params->is_compound) {
+            CONV_BUF_TYPE *p =
+                &conv_params
+                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+                           (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+            if (conv_params->do_average) {
+              // Second prediction: blend with the value already stored in the
+              // compound buffer and emit the final pixel.
+              uint16_t *dst16 =
+                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+              int32_t tmp32 = *p;
+              if (conv_params->use_jnt_comp_avg) {
+                tmp32 = tmp32 * conv_params->fwd_offset +
+                        sum * conv_params->bck_offset;
+                tmp32 = tmp32 >> DIST_PRECISION_BITS;
+              } else {
+                tmp32 += sum;
+                tmp32 = tmp32 >> 1;
+              }
+              // Remove the rounding offsets before the final round and clip.
+              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+                      (1 << (offset_bits - conv_params->round_1 - 1));
+              *dst16 =
+                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
+            } else {
+              // First prediction: keep the offset intermediate for later.
+              *p = sum;
+            }
+          } else {
+            uint16_t *p =
+                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+            assert(0 <= sum && sum < (1 << (bd + 2)));
+            // Undo the horizontal and vertical offsets, then clip to bd bits.
+            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
+          }
+          sy += gamma;
+        }
+      }
+    }
+  }
+}
+
+// Warps one plane of a high bit-depth frame with the model in wm.
+// ref8 and pred8 are converted with CONVERT_TO_SHORTPTR before use; all other
+// arguments are forwarded unchanged to av1_highbd_warp_affine together with
+// the model's matrix and shear parameters.
+// For ROTZOOM models, wm->wmmat[4] and wm->wmmat[5] are first derived from
+// wm->wmmat[3] and wm->wmmat[2], so wm is modified.
+static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
+                              int width, int height, int stride,
+                              const uint8_t *const pred8, int p_col, int p_row,
+                              int p_width, int p_height, int p_stride,
+                              int subsampling_x, int subsampling_y, int bd,
+                              ConvolveParams *conv_params) {
+  assert(wm->wmtype <= AFFINE);
+
+  // Fill in the second matrix row for a rotation/zoom model.
+  if (wm->wmtype == ROTZOOM) {
+    wm->wmmat[4] = -wm->wmmat[3];
+    wm->wmmat[5] = wm->wmmat[2];
+  }
+
+  const int32_t *const coeffs = wm->wmmat;
+  uint16_t *dst16 = CONVERT_TO_SHORTPTR(pred8);
+  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(ref8);
+
+  av1_highbd_warp_affine(coeffs, src16, width, height, stride, dst16, p_col,
+                         p_row, p_width, p_height, p_stride, subsampling_x,
+                         subsampling_y, bd, conv_params, wm->alpha, wm->beta,
+                         wm->gamma, wm->delta);
+}
+
+// Sums highbd_error_measure(dst - ref, bd) over a p_width x p_height area.
+// ref is addressed with `stride`, dst with `p_stride`.
+static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
+                                  const uint16_t *const dst, int p_width,
+                                  int p_height, int p_stride, int bd) {
+  int64_t total = 0;
+  for (int row = 0; row < p_height; ++row) {
+    const int ref_base = row * stride;
+    const int dst_base = row * p_stride;
+    for (int col = 0; col < p_width; ++col) {
+      const int diff = dst[col + dst_base] - ref[col + ref_base];
+      total += highbd_error_measure(diff, bd);
+    }
+  }
+  return total;
+}
+
+// Accumulates the warp error of a high bit-depth plane: the area starting at
+// (p_col, p_row) is processed in tiles of at most
+// WARP_ERROR_BLOCK x WARP_ERROR_BLOCK; each tile is warped into a local
+// buffer and compared with dst8 via highbd_frame_error().
+// Returns the accumulated error, stopping early (and returning the partial
+// sum) as soon as it exceeds best_error.
+static int64_t highbd_warp_error(
+    WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
+    int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
+    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
+    int64_t best_error) {
+  const int row_end = p_row + p_height;
+  const int col_end = p_col + p_width;
+  const int max_tile_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int max_tile_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  uint16_t warped[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+  int64_t total = 0;
+
+  ConvolveParams conv_params = get_conv_params(0, 0, bd);
+  conv_params.use_jnt_comp_avg = 0;
+
+  for (int y = p_row; y < row_end; y += WARP_ERROR_BLOCK) {
+    for (int x = p_col; x < col_end; x += WARP_ERROR_BLOCK) {
+      // Shrink the last tile in each direction so that no extra 8x8 blocks
+      // are warped when the area is not a multiple of WARP_ERROR_BLOCK.
+      const int tile_w = AOMMIN(max_tile_w, col_end - x);
+      const int tile_h = AOMMIN(max_tile_h, row_end - y);
+
+      highbd_warp_plane(wm, ref8, width, height, stride,
+                        CONVERT_TO_BYTEPTR(warped), x, y, tile_w, tile_h,
+                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
+                        &conv_params);
+
+      total += highbd_frame_error(
+          warped, WARP_ERROR_BLOCK,
+          CONVERT_TO_SHORTPTR(dst8) + x + y * p_stride, tile_w, tile_h,
+          p_stride, bd);
+      if (total > best_error) return total;
+    }
+  }
+  return total;
+}
+
+// Error weight for a pixel difference: the error_measure_lut entry at index
+// 255 + err.
+static INLINE int error_measure(int err) {
+  const int idx = 255 + err;
+  return error_measure_lut[idx];
+}
+
+/* The warp filter for ROTZOOM and AFFINE models works as follows:
+ * Split the input into 8x8 blocks
+ * For each block, project the point (4, 4) within the block, to get the
+ overall block position. Split into integer and fractional coordinates,
+ maintaining full WARPEDMODEL precision
+ * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
+ variable horizontal offset. This means that, while the rows of the
+ intermediate buffer align with the rows of the *reference* image, the
+ columns align with the columns of the *destination* image.
+ * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
+ destination is too small we crop the output at this stage). Each pixel has
+ a variable vertical offset, so that the resulting rows are aligned with
+ the rows of the destination image.
+
+ To accomplish these alignments, we factor the warp matrix as a
+ product of two shear / asymmetric zoom matrices:
+ / a b \ = / 1 0 \ * / 1+alpha beta \
+ \ c d / \ gamma 1+delta / \ 0 1 /
+ where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
+ The horizontal shear (with alpha and beta) is applied first,
+ then the vertical shear (with gamma and delta) is applied second.
+
+ The only limitation is that, to fit this in a fixed 8-tap filter size,
+ the fractional pixel offsets must stay within +-1. Since the horizontal filter
+ generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
+ within the block, the parameters must satisfy
+ 4 * |alpha| + 7 * |beta| < 1 and 4 * |gamma| + 4 * |delta| < 1
+ (in units of 1 << WARPEDMODEL_PREC_BITS; is_affine_shear_allowed() rejects
+ equality) for this filter to be applicable.
+
+ Note: This function assumes that the caller has done all of the relevant
+ checks, i.e. that we have a ROTZOOM or AFFINE model, that mat[4] and mat[5]
+ (wmmat[4] and wmmat[5] of the model) are set appropriately (if using a
+ ROTZOOM model), and that alpha, beta, gamma, delta are all in range.
+
+ TODO(david.barker): Maybe support scaled references?
+*/
+/* A note on hardware implementation:
+ The warp filter is intended to be implementable using the same hardware as
+ the high-precision convolve filters from the loop-restoration and
+ convolve-round experiments.
+
+ For a single filter stage, considering all of the coefficient sets for the
+ warp filter and the regular convolution filter, an input in the range
+ [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
+ before rounding.
+
+ Allowing for some changes to the filter coefficient sets, call the range
+ [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
+ we can replace this by the range [0, 256 * 2^k], which can be stored in an
+ unsigned value with 8 + k bits.
+
+ This allows the derivation of the appropriate bit widths and offsets for
+ the various intermediate values: If
+
+ F := FILTER_BITS = 7 (or else the above ranges need adjusting)
+ So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
+ intermediate value.
+ H := ROUND0_BITS
+ V := VERSHEAR_REDUCE_PREC_BITS
+ (and note that we must have H + V = 2*F for the output to have the same
+ scale as the input)
+
+ then we end up with the following offsets and ranges:
+ Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
+ uint{bd + F + 1}
+ After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
+ Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
+ uint{bd + 2*F + 2 - H}
+ After rounding: The final value, before undoing the offset, fits into a
+ uint{bd + 2}.
+
+ Then we need to undo the offsets before clamping to a pixel. Note that,
+ if we do this at the end, the amount to subtract is actually independent
+ of H and V:
+
+ offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
+ (1 << ((bd + 2*F - H) - V))
+ == (1 << (bd - 1)) + (1 << bd)
+
+ This allows us to entirely avoid clamping in both the warp filter and
+ the convolve-round experiment. As of the time of writing, the Wiener filter
+ from loop-restoration can encode a central coefficient up to 216, which
+ leads to a maximum value of about 282 * 2^k after applying the offset.
+ So in that case we still need to clamp.
+*/
+// C implementation of the affine warp filter for 8-bit samples (bd is fixed
+// at 8 below).
+//
+// The p_width x p_height prediction block whose top-left corner is
+// (p_col, p_row) is processed in 8x8 tiles. For every tile the tile centre is
+// mapped through 'mat', split into an integer position and a fractional
+// phase, and then filtered with an 8-tap horizontal pass (15 rows into 'tmp')
+// followed by an 8-tap vertical pass.
+//
+// mat            - model coefficients; entries 0..5 are read here.
+// ref            - reference samples, 'stride' elements per row; reads are
+//                  clamped to [0, width - 1] x [0, height - 1].
+// pred           - 8-bit output, 'p_stride' elements per row. Not written
+//                  when conv_params->is_compound is set and
+//                  conv_params->do_average is clear; conv_params->dst is
+//                  written instead.
+// alpha, beta    - per-column / per-row increments of the horizontal phase.
+// gamma, delta   - per-column / per-row increments of the vertical phase.
+void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha, int16_t beta,
+ int16_t gamma, int16_t delta) {
+ // Horizontally filtered samples: 15 rows (k = -7..7) of 8 columns each.
+ int32_t tmp[15 * 8];
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ // In the non-compound case the two rounding stages together remove
+ // 2 * FILTER_BITS, so the output has the same scale as the input.
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ // max_bits_horiz is only read inside an assert(); the cast keeps builds
+ // with asserts compiled out free of unused-variable warnings.
+ (void)max_bits_horiz;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ for (int i = p_row; i < p_row + p_height; i += 8) {
+ for (int j = p_col; j < p_col + p_width; j += 8) {
+ // Calculate the center of this 8x8 block,
+ // project to luma coordinates (if in a subsampled chroma plane),
+ // apply the affine transformation,
+ // then convert back to the original coordinates (if necessary)
+ const int32_t src_x = (j + 4) << subsampling_x;
+ const int32_t src_y = (i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ // Split the projected centre into integer sample position (ix4, iy4)
+ // and fractional phase (sx4, sy4) with WARPEDMODEL_PREC_BITS precision.
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Move the phase from the tile centre to the tile's (-4, -4) corner.
+ sx4 += alpha * (-4) + beta * (-4);
+ sy4 += gamma * (-4) + delta * (-4);
+
+ // Clear the low WARP_PARAM_REDUCE_BITS bits of both phases.
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ for (int k = -7; k < 8; ++k) {
+ // Clamp to top/bottom edge of the frame
+ const int iy = clamp(iy4 + k, 0, height - 1);
+
+ int sx = sx4 + beta * (k + 4);
+
+ for (int l = -4; l < 4; ++l) {
+ int ix = ix4 + l - 3;
+ // At this point, sx = sx4 + alpha * l + beta * k
+ const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ // Start from the horizontal offset so the rounded sum is non-negative
+ // (checked by the assert below).
+ int32_t sum = 1 << offset_bits_horiz;
+ for (int m = 0; m < 8; ++m) {
+ // Clamp to left/right edge of the frame
+ const int sample_x = clamp(ix + m, 0, width - 1);
+
+ sum += ref[iy * stride + sample_x] * coeffs[m];
+ }
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
+ assert(0 <= sum && sum < (1 << max_bits_horiz));
+ tmp[(k + 7) * 8 + (l + 4)] = sum;
+ sx += alpha;
+ }
+ }
+
+ // Vertical filter
+ // The AOMMIN() bounds stop at the bottom/right edge of the prediction
+ // block when fewer than 8 rows/columns of this tile lie inside it.
+ for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+ for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
+ // At this point, sy = sy4 + gamma * l + delta * k
+ const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
+ WARPEDPIXEL_PREC_SHIFTS;
+ assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
+ const int16_t *coeffs = warped_filter[offs];
+
+ int32_t sum = 1 << offset_bits_vert;
+ for (int m = 0; m < 8; ++m) {
+ sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+ }
+
+ if (conv_params->is_compound) {
+ // Compound prediction: either store the intermediate value in
+ // conv_params->dst, or combine it with the value already stored
+ // there and write the final pixel to 'pred'.
+ CONV_BUF_TYPE *p =
+ &conv_params
+ ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
+ (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ if (conv_params->do_average) {
+ uint8_t *dst8 =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ int32_t tmp32 = *p;
+ if (conv_params->use_jnt_comp_avg) {
+ // Weighted combination using fwd_offset / bck_offset.
+ tmp32 = tmp32 * conv_params->fwd_offset +
+ sum * conv_params->bck_offset;
+ tmp32 = tmp32 >> DIST_PRECISION_BITS;
+ } else {
+ // Plain average of the two predictions.
+ tmp32 += sum;
+ tmp32 = tmp32 >> 1;
+ }
+ // Remove the two filter offsets before the final rounding.
+ tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
+ } else {
+ *p = sum;
+ }
+ } else {
+ uint8_t *p =
+ &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+ sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
+ assert(0 <= sum && sum < (1 << (bd + 2)));
+ // Subtract the combined horizontal and vertical offsets.
+ *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
+ }
+ sy += gamma;
+ }
+ }
+ }
+ }
+}
+
+// Warps one plane of 8-bit samples with the model in 'wm' by forwarding to
+// av1_warp_affine(). The model type must be at most AFFINE. For a ROTZOOM
+// model, wmmat[4] and wmmat[5] are first overwritten from wmmat[3] and
+// wmmat[2], so 'wm' is modified by this call.
+static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
+                       int width, int height, int stride, uint8_t *pred,
+                       int p_col, int p_row, int p_width, int p_height,
+                       int p_stride, int subsampling_x, int subsampling_y,
+                       ConvolveParams *conv_params) {
+  assert(wm->wmtype <= AFFINE);
+
+  if (wm->wmtype == ROTZOOM) {
+    wm->wmmat[5] = wm->wmmat[2];
+    wm->wmmat[4] = -wm->wmmat[3];
+  }
+
+  const int32_t *const model = wm->wmmat;
+  const int16_t shear_alpha = wm->alpha;
+  const int16_t shear_beta = wm->beta;
+  const int16_t shear_gamma = wm->gamma;
+  const int16_t shear_delta = wm->delta;
+
+  av1_warp_affine(model, ref, width, height, stride, pred, p_col, p_row,
+                  p_width, p_height, p_stride, subsampling_x, subsampling_y,
+                  conv_params, shear_alpha, shear_beta, shear_gamma,
+                  shear_delta);
+}
+
+// Accumulates error_measure(dst - ref) over a p_width x p_height window.
+// 'ref' is addressed with 'stride' and 'dst' with 'p_stride'.
+static int64_t frame_error(const uint8_t *const ref, int stride,
+                           const uint8_t *const dst, int p_width, int p_height,
+                           int p_stride) {
+  int64_t total = 0;
+  for (int row = 0; row < p_height; ++row) {
+    const int ref_base = row * stride;
+    const int dst_base = row * p_stride;
+    for (int col = 0; col < p_width; ++col) {
+      const int diff = dst[col + dst_base] - ref[col + ref_base];
+      total += (int64_t)error_measure(diff);
+    }
+  }
+  return total;
+}
+
+// Warps 'ref' with 'wm' in tiles of at most WARP_ERROR_BLOCK x
+// WARP_ERROR_BLOCK samples and sums the error of each warped tile against
+// the matching area of 'dst'. Returns early, with the partial sum, as soon
+// as the running total exceeds 'best_error'.
+static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
+                          int width, int height, int stride,
+                          const uint8_t *const dst, int p_col, int p_row,
+                          int p_width, int p_height, int p_stride,
+                          int subsampling_x, int subsampling_y,
+                          int64_t best_error) {
+  const int tile_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
+  const int tile_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
+  const int row_end = p_row + p_height;
+  const int col_end = p_col + p_width;
+  uint8_t warped[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
+  int64_t total = 0;
+
+  ConvolveParams conv_params = get_conv_params(0, 0, 8);
+  conv_params.use_jnt_comp_avg = 0;
+
+  for (int row = p_row; row < row_end; row += WARP_ERROR_BLOCK) {
+    for (int col = p_col; col < col_end; col += WARP_ERROR_BLOCK) {
+      // Shrink the last tile in each direction so that no samples beyond
+      // the requested area are warped when the area is not a multiple of
+      // WARP_ERROR_BLOCK.
+      const int cur_w = AOMMIN(tile_w, col_end - col);
+      const int cur_h = AOMMIN(tile_h, row_end - row);
+
+      warp_plane(wm, ref, width, height, stride, warped, col, row, cur_w,
+                 cur_h, WARP_ERROR_BLOCK, subsampling_x, subsampling_y,
+                 &conv_params);
+
+      total += frame_error(warped, WARP_ERROR_BLOCK,
+                           dst + col + row * p_stride, cur_w, cur_h, p_stride);
+      if (total > best_error) return total;
+    }
+  }
+  return total;
+}
+
+// Returns the summed error between 'ref' and 'dst' over a p_width x p_height
+// area. When 'use_hbd' is non-zero both buffers are converted with
+// CONVERT_TO_SHORTPTR() and handed to the high bit-depth routine together
+// with 'bd'; otherwise the 8-bit routine is used and 'bd' is ignored.
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+                        uint8_t *dst, int p_width, int p_height, int p_stride) {
+  if (!use_hbd) {
+    return frame_error(ref, stride, dst, p_width, p_height, p_stride);
+  }
+  return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
+                            CONVERT_TO_SHORTPTR(dst), p_width, p_height,
+                            p_stride, bd);
+}
+
+// Returns the error between 'dst' and the result of warping 'ref' with 'wm'.
+// For model types up to AFFINE the shear parameters are derived first; if
+// get_shear_params() reports failure the function returns 1 without warping.
+// NOTE(review): 1 is a very small error value for an unusable model - confirm
+// that every caller validates the model separately.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+                       const uint8_t *ref, int width, int height, int stride,
+                       uint8_t *dst, int p_col, int p_row, int p_width,
+                       int p_height, int p_stride, int subsampling_x,
+                       int subsampling_y, int64_t best_error) {
+  if (wm->wmtype <= AFFINE && !get_shear_params(wm)) {
+    return 1;
+  }
+
+  if (!use_hbd) {
+    return warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
+                      p_width, p_height, p_stride, subsampling_x,
+                      subsampling_y, best_error);
+  }
+  return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
+                           p_width, p_height, p_stride, subsampling_x,
+                           subsampling_y, bd, best_error);
+}
+
+// Warps one plane of 'ref' into 'pred' using 'wm'. Dispatches to the high
+// bit-depth implementation (which also receives 'bd') when 'use_hbd' is
+// non-zero, and to the 8-bit implementation otherwise.
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+                    const uint8_t *ref, int width, int height, int stride,
+                    uint8_t *pred, int p_col, int p_row, int p_width,
+                    int p_height, int p_stride, int subsampling_x,
+                    int subsampling_y, ConvolveParams *conv_params) {
+  if (!use_hbd) {
+    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
+               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
+    return;
+  }
+  highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
+                    p_width, p_height, p_stride, subsampling_x, subsampling_y,
+                    bd, conv_params);
+}
+
+// Samples whose source/destination displacement reaches this magnitude in
+// either direction are left out of the least-squares fit.
+#define LS_MV_MAX 256 // max mv in 1/8-pel
+// Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
+#define LS_STEP 8
+
+// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
+// the precision needed is:
+// (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
+// (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
+// 1 [for sign] +
+// LEAST_SQUARES_SAMPLES_MAX_BITS
+// [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
+// The value is 23
+// NOTE(review): the stated value depends on MAX_SB_SIZE_LOG2, which is
+// defined elsewhere - confirm it still matches.
+#define LS_MAT_RANGE_BITS \
+ ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
+
+// Bit-depth reduction from the full-range
+#define LS_MAT_DOWN_BITS 2
+
+// bits range of A, Bx and By after downshifting
+#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
+#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
+#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
+
+// With LS_STEP = 8, every term summed inside the macros below is a multiple
+// of 4, so the two least-significant bits of each sum are 0. The macros
+// therefore shift right by 2 (which loses nothing) in addition to the
+// LS_MAT_DOWN_BITS precision reduction.
+#define LS_SQUARE(a) \
+ (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT1(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
+ (2 + LS_MAT_DOWN_BITS))
+#define LS_PRODUCT2(a, b) \
+ (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
+ (2 + LS_MAT_DOWN_BITS))
+
+#define USE_LIMITED_PREC_MULT 0
+
+#if USE_LIMITED_PREC_MULT
+
+#define MUL_PREC_BITS 16
+// Reduces D to at most MUL_PREC_BITS significant bits. Returns the reduced
+// multiplier and stores in *shift the number of low bits that were rounded
+// away (0 when D already fits, and also when D is 0).
+static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
+  *shift = 0;
+  if (D == 0) return 0;
+
+  const unsigned int upper = (unsigned int)(D >> 32);
+  const int msb =
+      (int16_t)(upper ? get_msb(upper) + 32 : get_msb((unsigned int)D));
+
+  // Small enough already: use the value unchanged.
+  if (msb < MUL_PREC_BITS) return (uint16_t)D;
+
+  const int dropped_bits = msb + 1 - MUL_PREC_BITS;
+  *shift = dropped_bits;
+  return (uint16_t)ROUND_POWER_OF_TWO_64(D, dropped_bits);
+}
+
+// Limited-precision variant: computes Px * iDet scaled down by 'shift' bits
+// for an off-diagonal model coefficient. |Px| is first reduced to a 16-bit
+// multiplier; the bits dropped by that reduction are taken off 'shift'. The
+// result is clamped to
+// [-WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1].
+//
+// The previous version declared an unused local and ended with an
+// unreachable 'return' of that uninitialised local; both are removed.
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  int16_t mshift;
+  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
+  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= mshift;
+  if (shift > 0) {
+    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
+                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+  // A non-positive remaining shift scales up instead; multiplying keeps the
+  // operation defined for negative v, unlike a left shift.
+  return (int32_t)clamp(v * (1 << (-shift)),
+                        -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                        WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+// Limited-precision variant for a diagonal model coefficient: same
+// computation as the off-diagonal helper, but the clamp range is centred on
+// (1 << WARPEDMODEL_PREC_BITS).
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  int16_t dropped_bits;
+  const uint16_t magnitude = resolve_multiplier_64(llabs(Px), &dropped_bits);
+  const int32_t product =
+      (int32_t)magnitude * (int32_t)iDet * (Px < 0 ? -1 : 1);
+  shift -= dropped_bits;
+
+  if (shift <= 0) {
+    return (int32_t)clamp(
+        product * (1 << (-shift)),
+        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+  }
+  return (int32_t)clamp(
+      ROUND_POWER_OF_TWO_SIGNED(product, shift),
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+#else
+
+// Full-precision variant: multiplies Px by iDet in 64 bits, rounds away
+// 'shift' bits (sign-aware) and clamps the result to
+// [-WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1, WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1].
+static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
+  const int64_t product = Px * (int64_t)iDet;
+  const int64_t rounded = ROUND_POWER_OF_TWO_SIGNED_64(product, shift);
+  return (int32_t)clamp64(rounded, -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+
+// Full-precision variant for a diagonal coefficient: multiplies Px by iDet
+// in 64 bits, rounds away 'shift' bits (sign-aware) and clamps the result to
+// a range centred on (1 << WARPEDMODEL_PREC_BITS).
+static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
+  const int64_t product = Px * (int64_t)iDet;
+  const int64_t rounded = ROUND_POWER_OF_TWO_SIGNED_64(product, shift);
+  return (int32_t)clamp64(
+      rounded,
+      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
+      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
+}
+#endif // USE_LIMITED_PREC_MULT
+
+// Fits an affine model to 'np' point correspondences by integer least
+// squares and stores it in wm->wmmat[0..7].
+//
+// np           - number of correspondences.
+// pts1, pts2   - source and destination points, stored as (x, y) pairs.
+// bsize        - block size; selects the block dimensions used to locate the
+//                block centre.
+// mvy, mvx     - block motion vector; mvx is scaled up by
+//                (WARPEDMODEL_PREC_BITS - 3) bits below.
+// mi_row/col   - block position, multiplied by MI_SIZE to get sample units.
+//
+// Returns 1 when the normal-equation matrix is singular (determinant 0), in
+// which case 'wm' is left untouched, and 0 otherwise.
+static int find_affine_int(int np, const int *pts1, const int *pts2,
+                           BLOCK_SIZE bsize, int mvy, int mvx,
+                           WarpedMotionParams *wm, int mi_row, int mi_col) {
+  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
+  int32_t Bx[2] = { 0, 0 };
+  int32_t By[2] = { 0, 0 };
+
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
+  const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
+  const int suy = rsuy * 8;
+  const int sux = rsux * 8;
+  const int duy = suy + mvy;
+  const int dux = sux + mvx;
+  const int isuy = (mi_row * MI_SIZE + rsuy);
+  const int isux = (mi_col * MI_SIZE + rsux);
+
+  // Assume the center pixel of the block has exactly the same motion vector
+  // as transmitted for the block. First shift the origin of the source
+  // points to the block center, and the origin of the destination points to
+  // the block center added to the motion vector transmitted.
+  // Let (xi, yi) denote the source points and (xi', yi') denote destination
+  // points after origin shifting, for i = 0, 1, 2, .... n-1.
+  // Then if  P = [x0, y0,
+  //               x1, y1
+  //               x2, y2,
+  //                ....
+  //              ]
+  //          q = [x0', x1', x2', ... ]'
+  //          r = [y0', y1', y2', ... ]'
+  // the least squares problems that need to be solved are:
+  //          [h1, h2]' = inv(P'P)P'q and
+  //          [h3, h4]' = inv(P'P)P'r
+  // where the affine transformation is given by:
+  //          x' = h1.x + h2.y
+  //          y' = h3.x + h4.y
+  //
+  // The loop below computes: A = P'P, Bx = P'q, By = P'r
+  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
+  // Contribution from neighbor block
+  for (int i = 0; i < np; i++) {
+    const int dx = pts2[i * 2] - dux;
+    const int dy = pts2[i * 2 + 1] - duy;
+    const int sx = pts1[i * 2] - sux;
+    const int sy = pts1[i * 2 + 1] - suy;
+    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
+    // selection is done in find_samples(). Also, global offset can be removed
+    // while collecting samples.
+    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
+      A[0][0] += LS_SQUARE(sx);
+      A[0][1] += LS_PRODUCT1(sx, sy);
+      A[1][1] += LS_SQUARE(sy);
+      Bx[0] += LS_PRODUCT2(sx, dx);
+      Bx[1] += LS_PRODUCT1(sy, dx);
+      By[0] += LS_PRODUCT1(sx, dy);
+      By[1] += LS_PRODUCT2(sy, dy);
+    }
+  }
+
+  // Just for debugging, and can be removed later.
+  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
+  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
+  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
+  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
+  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
+  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
+  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
+
+  int64_t Det;
+  int16_t iDet, shift;
+
+  // Compute Determinant of A (A is symmetric, so A[1][0] == A[0][1] and only
+  // A[0][1] is accumulated above).
+  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
+  if (Det == 0) return 1;
+  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
+  shift -= WARPEDMODEL_PREC_BITS;
+  if (shift < 0) {
+    // iDet carries the sign of Det and may be negative. Left-shifting a
+    // negative value is undefined behaviour in C, so scale by multiplying
+    // with a power of two instead.
+    iDet = (int16_t)(iDet * (1 << (-shift)));
+    shift = 0;
+  }
+
+  int64_t Px[2], Py[2];
+
+  // These divided by the Det, are the least squares solutions
+  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
+  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
+  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
+  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
+
+  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
+  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
+  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
+  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
+
+  // Note: In the vx, vy expressions below, the max value of each of the
+  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
+  // for the first term so that the overall sum in the worst case fits
+  // within 32 bits overall.
+  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
+                isuy * wm->wmmat[3]);
+  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
+               (isux * wm->wmmat[4] +
+                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
+  wm->wmmat[0] =
+      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+  wm->wmmat[1] =
+      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
+
+  wm->wmmat[6] = wm->wmmat[7] = 0;
+  return 0;
+}
+
+// Fits an AFFINE model to the given point correspondences and derives its
+// shear parameters. Returns 0 on success and 1 when either the fit fails or
+// get_shear_params() rejects the model (the fast warp filter cannot be used
+// with it).
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+                    int mvx, WarpedMotionParams *wm_params, int mi_row,
+                    int mi_col) {
+  assert(wm_params->wmtype == AFFINE);
+
+  const int fit_failed = find_affine_int(np, pts1, pts2, bsize, mvy, mvx,
+                                         wm_params, mi_row, mi_col);
+  if (fit_failed) {
+    return 1;
+  }
+
+  // The shear check only runs when the fit itself succeeded.
+  if (get_shear_params(wm_params)) {
+    return 0;
+  }
+  return 1;
+}
diff --git a/third_party/aom/av1/common/warped_motion.h b/third_party/aom/av1/common/warped_motion.h
new file mode 100644
index 0000000000..a1a4f067dc
--- /dev/null
+++ b/third_party/aom/av1/common/warped_motion.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_WARPED_MOTION_H_
+#define AOM_AV1_COMMON_WARPED_MOTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/mv.h"
+#include "av1/common/convolve.h"
+
+#define MAX_PARAMDIM 9
+// log2 of the maximum number of samples used in a least-squares fit.
+#define LEAST_SQUARES_SAMPLES_MAX_BITS 3
+// Maximum number of samples in a least-squares fit (8).
+#define LEAST_SQUARES_SAMPLES_MAX (1 << LEAST_SQUARES_SAMPLES_MAX_BITS)
+// Two array entries per sample.
+#define SAMPLES_ARRAY_SIZE (LEAST_SQUARES_SAMPLES_MAX * 2)
+#define WARPED_MOTION_DEBUG 0
+#define DEFAULT_WMTYPE AFFINE
+
+// Table of 8-tap filter kernels, one row per filter phase.
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8];
+
+// Index table for padding on the left: entry [r][c] equals max(c, r + 1), so
+// in row r every index below r + 1 is replaced by r + 1.
+// NOTE(review): presumably used as byte-shuffle masks by the SIMD warp
+// filters - confirm against the users of this table.
+static const uint8_t warp_pad_left[14][16] = {
+ { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 2, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 3, 3, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 4, 4, 4, 4, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 5, 5, 5, 5, 5, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 6, 6, 6, 6, 6, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 7, 7, 7, 7, 7, 7, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 11, 12, 13, 14, 15 },
+ { 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15 },
+ { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 14, 15 },
+ { 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 14, 15 },
+ { 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 15 },
+ { 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15 },
+};
+
+// Index table for padding on the right: entry [r][c] equals min(c, 14 - r),
+// so in row r every index above 14 - r is replaced by 14 - r.
+// NOTE(review): presumably used as byte-shuffle masks by the SIMD warp
+// filters - confirm against the users of this table.
+static const uint8_t warp_pad_right[14][16] = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 14 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 13 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 12, 12, 12 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 11, 11, 11, 11 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 9, 9, 9, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 },
+ { 0, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 },
+ { 0, 1, 2, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 },
+ { 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 },
+ { 0, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
+};
+
+// Returns the error between the result of applying motion 'wm' to the frame
+// described by 'ref' and the frame described by 'dst'.
+int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *dst, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, int64_t best_error);
+
+// Returns the error between the frame described by 'ref' and the frame
+// described by 'dst'.
+int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
+ uint8_t *dst, int p_width, int p_height, int p_stride);
+
+// Warps the plane described by 'ref' with motion 'wm' and writes the
+// prediction to 'pred'. 'use_hbd' selects between the high bit-depth and the
+// 8-bit implementation.
+void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
+ const uint8_t *ref, int width, int height, int stride,
+ uint8_t *pred, int p_col, int p_row, int p_width,
+ int p_height, int p_stride, int subsampling_x,
+ int subsampling_y, ConvolveParams *conv_params);
+
+// Fits an affine model to 'np' point pairs (pts1 -> pts2) and stores it in
+// 'wm_params'. Returns 0 on success and 1 on failure.
+int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
+ int mvx, WarpedMotionParams *wm_params, int mi_row,
+ int mi_col);
+
+// Derives the shear parameters of 'wm'. Callers treat a zero return as "model
+// not usable".
+// NOTE(review): the definition is not visible here - confirm the exact
+// contract against the implementation.
+int get_shear_params(WarpedMotionParams *wm);
+#endif // AOM_AV1_COMMON_WARPED_MOTION_H_
diff --git a/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 0000000000..8aa14696f6
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// SSE4.1 horizontal 8-tap resampling filter for 8-bit samples.
+//
+// Output columns are produced four at a time; for each group of four columns
+// all 'h' rows are filtered before moving on. The fixed-point position of
+// output column x is x0_qn + x * x_step_qn: its bits above
+// RS_SCALE_SUBPEL_BITS select the source column and its sub-pel bits select
+// one of the 8-tap kernels stored back to back in 'x_filters'.
+//
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const int16_t *x_filters, int x0_qn,
+ int x_step_qn) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+ // Step back so that the 8 taps are positioned around the source column.
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const uint8_t *src_y;
+ uint8_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ // Kernel indices for the four output columns of this group.
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ // The kernels depend only on the column, so load them once per group.
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint8_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint8_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 8-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_8 = xx_loadl_64(src_x0);
+ const __m128i src1_8 = xx_loadl_64(src_x1);
+ const __m128i src2_8 = xx_loadl_64(src_x2);
+ const __m128i src3_8 = xx_loadl_64(src_x3);
+
+ // Now zero-extend up to 16-bit precision, i.e.
+ // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+ const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+ const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+ const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+ const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ // After the second reduction each 32-bit lane holds the full 8-tap sum
+ // for one of the four output columns.
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Pack 16-bit values into 8-bit values, i.e.
+ // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+ // -> [ 0 0 0 0 0 0 DC BA ]
+ // The unsigned saturation of this pack also clips the result to 0..255.
+ const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+ // Write to the output
+ xx_storel_32(&dst_y[x], shifted_8);
+ }
+ }
+}
+
+// SSE4.1 horizontal 8-tap resampling filter for 16-bit sample buffers with a
+// bit depth 'bd' of 8, 10 or 12.
+//
+// Output columns are produced four at a time; for each group of four columns
+// all 'h' rows are filtered before moving on. The fixed-point position of
+// output column x is x0_qn + x * x_step_qn: its bits above
+// RS_SCALE_SUBPEL_BITS select the source column and its sub-pel bits select
+// one of the 8-tap kernels stored back to back in 'x_filters'.
+//
+// Note: If the crop width is not a multiple of 4, then, unlike the C version,
+// this function will overwrite some of the padding on the right hand side of
+// the frame. This padding appears to be trashed anyway, so this should not
+// affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filters,
+ int x0_qn, int x_step_qn, int bd) {
+ assert(UPSCALE_NORMATIVE_TAPS == 8);
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ // Step back so that the 8 taps are positioned around the source column.
+ src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+ const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+ const uint16_t *src_y;
+ uint16_t *dst_y;
+ int x_qn = x0_qn;
+ for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+ // Kernel indices for the four output columns of this group.
+ const int x_filter_idx0 =
+ ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx1 =
+ ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx2 =
+ ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+ const int x_filter_idx3 =
+ ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+ assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+ assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+ const int16_t *const x_filter0 =
+ &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter1 =
+ &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter2 =
+ &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+ const int16_t *const x_filter3 =
+ &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+ // The kernels depend only on the column, so load them once per group.
+ const __m128i fil0_16 = xx_loadu_128(x_filter0);
+ const __m128i fil1_16 = xx_loadu_128(x_filter1);
+ const __m128i fil2_16 = xx_loadu_128(x_filter2);
+ const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+ src_y = src;
+ dst_y = dst;
+ for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+ const uint16_t *const src_x0 =
+ &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x1 =
+ &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x2 =
+ &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+ const uint16_t *const src_x3 =
+ &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+ // Load up the source data. This is 16-bit input data, so each load
+ // gets 8 pixels.
+ const __m128i src0_16 = xx_loadu_128(src_x0);
+ const __m128i src1_16 = xx_loadu_128(src_x1);
+ const __m128i src2_16 = xx_loadu_128(src_x2);
+ const __m128i src3_16 = xx_loadu_128(src_x3);
+
+ // Multiply by filter coefficients (results in a 32-bit value),
+ // and add adjacent pairs, i.e.
+ // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+ // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+ const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+ const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+ const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+ const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+ // Reduce horizontally and add, i.e.
+ // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+ const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+ const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+ // After the second reduction each 32-bit lane holds the full 8-tap sum
+ // for one of the four output columns.
+ const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+ // Divide down by (1 << FILTER_BITS), rounding to nearest.
+ const __m128i shifted_32 =
+ _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+ // Pack 32-bit values into 16-bit values, i.e.
+ // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+ // The unsigned saturation of this pack clips negative results to 0.
+ const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+ // Clip the values at (1 << bd) - 1
+ // NOTE(review): _mm_min_epi16 compares as signed 16-bit, so this relies
+ // on the filtered values staying below 32768 - confirm for bd == 12.
+ const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+ // Write to the output
+ xx_storel_64(&dst_y[x], clipped_16);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
new file mode 100644
index 0000000000..d9fb537856
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_convolve_scale_sse4.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+//
+// For every output column x the source position advances by x_step_qn
+// (starting at subpel_x_qn); the bits above SCALE_SUBPEL_BITS select the
+// source column and the bits below select the 8-tap kernel. Each of the h
+// rows is filtered at that position and the result is written transposed,
+// i.e. to dst[y + x * h].
+//
+//   src, src_stride  8-bit source and the distance between its rows
+//   dst              receives w * h int16_t values, column after column
+//   w, h             number of output columns / number of rows to filter
+//   filter_params    kernel table, read through
+//                    av1_get_interp_filter_subpel_kernel()
+//   round            right shift applied to every filtered sum
+static void hfilter8(const uint8_t *src, int src_stride, int16_t *dst, int w,
+ int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params, unsigned round) {
+ const int bd = 8;
+ const int ntaps = 8;
+
+ // Step back so that the eight taps start ntaps / 2 - 1 (= 3) pixels before
+ // the nominal sample position.
+ src -= ntaps / 2 - 1;
+
+ // Added to every sum before the shift: half of (1 << round) for rounding,
+ // plus a constant offset of 1 << (bd + FILTER_BITS - 1).
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint8_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+ const __m128i zero = _mm_castps_si128(_mm_setzero_ps());
+
+ // Vector path: four consecutive rows per iteration.
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint8_t *const src0 = src_col + y * src_stride;
+ const uint8_t *const src1 = src0 + 1 * src_stride;
+ const uint8_t *const src2 = src0 + 2 * src_stride;
+ const uint8_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 8-bit input data; each load is just
+ // loading the lower half of the register and gets 8 pixels
+ const __m128i data08 = _mm_loadl_epi64((__m128i *)src0);
+ const __m128i data18 = _mm_loadl_epi64((__m128i *)src1);
+ const __m128i data28 = _mm_loadl_epi64((__m128i *)src2);
+ const __m128i data38 = _mm_loadl_epi64((__m128i *)src3);
+
+ // Now zero-extend up to 16-bit precision by interleaving with
+ // zeros. Drop the upper half of each register (which just had zeros)
+ const __m128i data0lo = _mm_unpacklo_epi8(data08, zero);
+ const __m128i data1lo = _mm_unpacklo_epi8(data18, zero);
+ const __m128i data2lo = _mm_unpacklo_epi8(data28, zero);
+ const __m128i data3lo = _mm_unpacklo_epi8(data38, zero);
+
+ // Multiply by coefficients
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ // After the two rounds of horizontal adds each 32-bit lane holds one
+ // complete 8-tap sum, for rows y, y + 1, y + 2 and y + 3 in that order.
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ // Pack to 16 bits with unsigned saturation; only the low four values
+ // (64 bits) are stored below.
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ // Scalar path for the rows the vector loop did not cover (fewer than four).
+ // NOTE(review): the rounding half is presumably added inside
+ // ROUND_POWER_OF_TWO (defined elsewhere) so that this matches round_add
+ // above - confirm.
+ for (; y < h; ++y) {
+ const uint8_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+
+// Reads eight consecutive 16-bit values from src (unaligned load), multiplies
+// them lane by lane with coeff and adds adjacent products, giving four 32-bit
+// partial sums.
+static __m128i convolve_16_8(const int16_t *src, __m128i coeff) {
+  return _mm_madd_epi16(_mm_loadu_si128((__m128i *)src), coeff);
+}
+
+// A specialised version of vfilter, the vertical filter for
+// av1_convolve_2d_scale_sse4_1. This version only supports 8 tap filters.
+//
+// src is read in transposed form: the eight taps for output pixel (x, y) are
+// the consecutive values starting at
+// src + x * src_stride + (y_qn >> SCALE_SUBPEL_BITS), where y_qn starts at
+// subpel_y_qn and advances by y_step_qn per output row.
+//
+// What happens to each filtered value depends on conv_params:
+//   - !is_compound: the offset sub32 is removed, the value is rounded by
+//     `bits` and written to dst as an 8-bit pixel.
+//   - is_compound && !do_average: the 16-bit intermediate is stored to
+//     conv_params->dst.
+//   - is_compound && do_average: the value is combined with the one already
+//     in conv_params->dst (weighted with fwd_offset / bck_offset and shifted
+//     by DIST_PRECISION_BITS when use_jnt_comp_avg is set, plain
+//     (a + b) >> 1 otherwise) and then finished as in the first case.
+static void vfilter8(const int16_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h, int subpel_y_qn,
+ int y_step_qn, const InterpFilterParams *filter_params,
+ const ConvolveParams *conv_params, int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int ntaps = 8;
+
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Offset that is subtracted again right before the final `bits` shift.
+ // NOTE(review): sub32 is narrowed to 16-bit lanes below; this presumably
+ // relies on the value fitting in int16_t - confirm.
+ const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ const __m128i sub = _mm_set1_epi16(sub32);
+
+ CONV_BUF_TYPE *dst16 = conv_params->dst;
+ const int dst16_stride = conv_params->dst_stride;
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+ const __m128i bits_const = _mm_set1_epi16(((1 << bits) >> 1));
+ const __m128i round_shift_add =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+ // Interleave the two weights as (w0, w1, w0, w1, ...) so that a single
+ // _mm_madd_epi16 on interleaved (previous, current) samples yields
+ // previous * w0 + current * w1.
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ int y_qn = subpel_y_qn;
+ for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+ const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+ // Vector path: four output pixels (x .. x + 3) per iteration.
+ int x;
+ for (x = 0; x <= w - 4; x += 4) {
+ const int16_t *const src0 = src_y + x * src_stride;
+ const int16_t *const src1 = src0 + 1 * src_stride;
+ const int16_t *const src2 = src0 + 2 * src_stride;
+ const int16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load the eight intermediate values for each of the four output pixels
+ // and multiply them by the kernel (conv0..conv3 each hold four partial
+ // sums).
+ const __m128i conv0 = convolve_16_8(src0, coeff0716);
+ const __m128i conv1 = convolve_16_8(src1, coeff0716);
+ const __m128i conv2 = convolve_16_8(src2, coeff0716);
+ const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+ // Now reduce horizontally to get one lane for each result
+ const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+ const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+ __m128i conv = _mm_hadd_epi32(conv01, conv23);
+
+ conv = _mm_add_epi32(conv, res_add_const);
+ // Divide down by (1 << round_1), rounding to nearest. (sub32 is
+ // subtracted further down, right before the final shift.)
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+ uint8_t *dst_x = dst + y * dst_stride + x;
+ CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+ __m128i result;
+ __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ const __m128i p_16 = _mm_loadl_epi64((__m128i *)dst_16_x);
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, shifted_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ shifted_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ shifted_16 = _mm_srai_epi16(_mm_add_epi16(p_16, shifted_16), 1);
+ }
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ // Saturate to 8 bits and store four pixels with one 32-bit write.
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ } else {
+ _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+ }
+ } else {
+ const __m128i subbed = _mm_sub_epi16(shifted_16, sub);
+ result = _mm_sra_epi16(_mm_add_epi16(subbed, bits_const), bits_shift);
+ // Saturate to 8 bits and store four pixels with one 32-bit write.
+ const __m128i result_8 = _mm_packus_epi16(result, result);
+ *(uint32_t *)dst_x = _mm_cvtsi128_si32(result_8);
+ }
+ }
+ // Scalar path for the pixels the vector loop did not cover (fewer than
+ // four per row); it follows the same three cases as above.
+ for (; x < w; ++x) {
+ const int16_t *src_x = src_y + x * src_stride;
+ int32_t sum = 1 << offset_bits;
+ for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+ CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+
+ if (conv_params->is_compound) {
+ if (conv_params->do_average) {
+ int32_t tmp = dst16[y * dst16_stride + x];
+ if (conv_params->use_jnt_comp_avg) {
+ tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+ tmp = tmp >> DIST_PRECISION_BITS;
+ } else {
+ tmp += res;
+ tmp = tmp >> 1;
+ }
+ /* Subtract round offset and convolve round */
+ tmp = tmp - sub32;
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ } else {
+ dst16[y * dst16_stride + x] = res;
+ }
+ } else {
+ /* Subtract round offset and convolve round */
+ int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
+ }
+ }
+ }
+}
+// 8-bit scaled 2D convolution: a horizontal 8-tap pass (hfilter8) into a
+// transposed intermediate buffer on the stack, followed by a vertical 8-tap
+// pass (vfilter8) that writes dst8 and/or conv_params->dst. Both filters must
+// have exactly eight taps.
+void av1_convolve_2d_scale_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst8, int dst8_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_qn, const int x_step_qn,
+                                  const int subpel_y_qn, const int y_step_qn,
+                                  ConvolveParams *conv_params) {
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // Number of source rows the vertical pass can touch: the integer position
+  // of the last output row plus one full kernel.
+  const int last_row_qn = (h - 1) * y_step_qn + subpel_y_qn;
+  const int im_h = (last_row_qn >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
+
+  // TODO(yaowu): remove unnecessary initializations
+  int16_t tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE] = { 0 };
+
+  // horizontal filter, starting ytaps / 2 - 1 rows above the block
+  const int fo_vert = ytaps / 2 - 1;
+  hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h, subpel_x_qn,
+           x_step_qn, filter_params_x, conv_params->round_0);
+
+  // vertical filter (input is transposed, so im_h acts as its stride)
+  vfilter8(tmp, im_h, dst8, dst8_stride, w, h, subpel_y_qn, y_step_qn,
+           filter_params_y, conv_params, 8);
+}
+
+// A specialised version of hfilter, the horizontal filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+//
+// Same scheme as the 8-bit hfilter8, but the source samples are 16 bits wide
+// and the bit depth is passed in: for every output column x the bits of x_qn
+// above SCALE_SUBPEL_BITS select the source column and the bits below select
+// the kernel; all h rows are filtered at that position and written
+// transposed to dst[y + x * h].
+//
+//   src, src_stride  16-bit source and the distance between its rows
+//   dst              receives w * h int16_t values, column after column
+//   round            right shift applied to every filtered sum
+//   bd               bit depth, used only for the constant offset
+static void highbd_hfilter8(const uint16_t *src, int src_stride, int16_t *dst,
+ int w, int h, int subpel_x_qn, int x_step_qn,
+ const InterpFilterParams *filter_params,
+ unsigned round, int bd) {
+ const int ntaps = 8;
+
+ // Step back so that the eight taps start ntaps / 2 - 1 (= 3) pixels before
+ // the nominal sample position.
+ src -= ntaps / 2 - 1;
+
+ // Added to every sum before the shift: half of (1 << round) for rounding,
+ // plus a constant offset of 1 << (bd + FILTER_BITS - 1).
+ int32_t round_add32 = (1 << round) / 2 + (1 << (bd + FILTER_BITS - 1));
+ const __m128i round_add = _mm_set1_epi32(round_add32);
+ const __m128i round_shift = _mm_cvtsi32_si128(round);
+
+ int x_qn = subpel_x_qn;
+ for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
+ const uint16_t *const src_col = src + (x_qn >> SCALE_SUBPEL_BITS);
+ const int filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+ assert(filter_idx < SUBPEL_SHIFTS);
+ const int16_t *filter =
+ av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+ // Load the filter coefficients
+ const __m128i coefflo = _mm_loadu_si128((__m128i *)filter);
+
+ // Vector path: four consecutive rows per iteration.
+ int y;
+ for (y = 0; y <= h - 4; y += 4) {
+ const uint16_t *const src0 = src_col + y * src_stride;
+ const uint16_t *const src1 = src0 + 1 * src_stride;
+ const uint16_t *const src2 = src0 + 2 * src_stride;
+ const uint16_t *const src3 = src0 + 3 * src_stride;
+
+ // Load up source data. This is 16-bit input data, so each load gets the 8
+ // pixels we need.
+ const __m128i data0lo = _mm_loadu_si128((__m128i *)src0);
+ const __m128i data1lo = _mm_loadu_si128((__m128i *)src1);
+ const __m128i data2lo = _mm_loadu_si128((__m128i *)src2);
+ const __m128i data3lo = _mm_loadu_si128((__m128i *)src3);
+
+ // Multiply by coefficients
+ // NOTE(review): _mm_madd_epi16 treats its inputs as signed 16-bit lanes,
+ // so this presumably assumes samples below 1 << 15 - confirm.
+ const __m128i conv0lo = _mm_madd_epi16(data0lo, coefflo);
+ const __m128i conv1lo = _mm_madd_epi16(data1lo, coefflo);
+ const __m128i conv2lo = _mm_madd_epi16(data2lo, coefflo);
+ const __m128i conv3lo = _mm_madd_epi16(data3lo, coefflo);
+
+ // Reduce horizontally and add
+ // After the two rounds of horizontal adds each 32-bit lane holds one
+ // complete 8-tap sum, for rows y, y + 1, y + 2 and y + 3 in that order.
+ const __m128i conv01lo = _mm_hadd_epi32(conv0lo, conv1lo);
+ const __m128i conv23lo = _mm_hadd_epi32(conv2lo, conv3lo);
+ const __m128i conv = _mm_hadd_epi32(conv01lo, conv23lo);
+
+ // Divide down by (1 << round), rounding to nearest.
+ __m128i shifted =
+ _mm_sra_epi32(_mm_add_epi32(conv, round_add), round_shift);
+
+ // Pack to 16 bits with unsigned saturation; only the low four values
+ // (64 bits) are stored below.
+ shifted = _mm_packus_epi32(shifted, shifted);
+ // Write transposed to the output
+ _mm_storel_epi64((__m128i *)(dst + y + x * h), shifted);
+ }
+ // Scalar path for the rows the vector loop did not cover (fewer than four).
+ for (; y < h; ++y) {
+ const uint16_t *const src_row = src_col + y * src_stride;
+
+ int32_t sum = (1 << (bd + FILTER_BITS - 1));
+ for (int k = 0; k < ntaps; ++k) {
+ sum += filter[k] * src_row[k];
+ }
+
+ dst[y + x * h] = ROUND_POWER_OF_TWO(sum, round);
+ }
+ }
+}
+// A specialised version of vfilter, the vertical filter for
+// av1_highbd_convolve_2d_scale_sse4_1. This version only supports 8 tap
+// filters.
+//
+// src is read in transposed form: the eight taps for output pixel (x, y) are
+// the consecutive values starting at
+// src + x * src_stride + (y_qn >> SCALE_SUBPEL_BITS), where y_qn starts at
+// subpel_y_qn and advances by y_step_qn per output row.
+//
+// What happens to each filtered value depends on conv_params:
+//   - !is_compound: the offset sub32 is removed, the value is rounded by
+//     `bits`, clipped to the bit depth and written to dst.
+//   - is_compound && !do_average: the 16-bit intermediate is stored to
+//     conv_params->dst.
+//   - is_compound && do_average: the value is combined with the one already
+//     in conv_params->dst (weighted with fwd_offset / bck_offset and shifted
+//     by DIST_PRECISION_BITS when use_jnt_comp_avg is set, plain
+//     (a + b) >> 1 otherwise) and then finished as in the first case.
+//
+// All vector arithmetic after the kernel multiply is done in 32-bit lanes so
+// that it matches the int32_t arithmetic of the scalar tail.
+static void highbd_vfilter8(const int16_t *src, int src_stride, uint16_t *dst,
+                            int dst_stride, int w, int h, int subpel_y_qn,
+                            int y_step_qn,
+                            const InterpFilterParams *filter_params,
+                            const ConvolveParams *conv_params, int bd) {
+  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  const int ntaps = 8;
+
+  const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+  // Offset that is subtracted again right before the final `bits` shift.
+  const int32_t sub32 = ((1 << (offset_bits - conv_params->round_1)) +
+                         (1 << (offset_bits - conv_params->round_1 - 1)));
+  const __m128i sub = _mm_set1_epi32(sub32);
+
+  CONV_BUF_TYPE *dst16 = conv_params->dst;
+  const int dst16_stride = conv_params->dst_stride;
+  const __m128i clip_pixel_ =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const int bits =
+      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+  const __m128i bits_shift = _mm_cvtsi32_si128(bits);
+  const __m128i bits_const = _mm_set1_epi32(((1 << bits) >> 1));
+  const __m128i round_shift_add =
+      _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits);
+
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m128i wt0 = _mm_set1_epi32(w0);
+  const __m128i wt1 = _mm_set1_epi32(w1);
+
+  int y_qn = subpel_y_qn;
+  for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
+    const int16_t *src_y = src + (y_qn >> SCALE_SUBPEL_BITS);
+    const int filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
+    assert(filter_idx < SUBPEL_SHIFTS);
+    const int16_t *filter =
+        av1_get_interp_filter_subpel_kernel(filter_params, filter_idx);
+
+    const __m128i coeff0716 = _mm_loadu_si128((__m128i *)filter);
+    // Vector path: four output pixels (x .. x + 3) per iteration.
+    int x;
+    for (x = 0; x <= w - 4; x += 4) {
+      const int16_t *const src0 = src_y + x * src_stride;
+      const int16_t *const src1 = src0 + 1 * src_stride;
+      const int16_t *const src2 = src0 + 2 * src_stride;
+      const int16_t *const src3 = src0 + 3 * src_stride;
+
+      // Load the eight intermediate values for each of the four output
+      // pixels and multiply them by the kernel (conv0..conv3 each hold four
+      // partial sums).
+      const __m128i conv0 = convolve_16_8(src0, coeff0716);
+      const __m128i conv1 = convolve_16_8(src1, coeff0716);
+      const __m128i conv2 = convolve_16_8(src2, coeff0716);
+      const __m128i conv3 = convolve_16_8(src3, coeff0716);
+
+      // Now reduce horizontally to get one lane for each result
+      const __m128i conv01 = _mm_hadd_epi32(conv0, conv1);
+      const __m128i conv23 = _mm_hadd_epi32(conv2, conv3);
+      __m128i conv = _mm_hadd_epi32(conv01, conv23);
+      conv = _mm_add_epi32(conv, res_add_const);
+
+      // Divide down by (1 << round_1), rounding to nearest. (sub32 is
+      // subtracted further down, right before the final shift.)
+      __m128i shifted =
+          _mm_sra_epi32(_mm_add_epi32(conv, round_shift_add), round_shift);
+
+      uint16_t *dst_x = dst + y * dst_stride + x;
+      CONV_BUF_TYPE *dst_16_x = dst16 + y * dst16_stride + x;
+
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          __m128i p_32 =
+              _mm_cvtepu16_epi32(_mm_loadl_epi64((__m128i *)dst_16_x));
+
+          if (conv_params->use_jnt_comp_avg) {
+            shifted = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+                                    _mm_mullo_epi32(shifted, wt1));
+            shifted = _mm_srai_epi32(shifted, DIST_PRECISION_BITS);
+          } else {
+            shifted = _mm_srai_epi32(_mm_add_epi32(p_32, shifted), 1);
+          }
+          __m128i res32 = _mm_sub_epi32(shifted, sub);
+          res32 = _mm_sra_epi32(_mm_add_epi32(res32, bits_const), bits_shift);
+
+          // packus leaves unsigned 16-bit lanes, so the upper clip has to be
+          // an unsigned minimum as well.
+          __m128i res16 = _mm_packus_epi32(res32, res32);
+          res16 = _mm_min_epu16(res16, clip_pixel_);
+          _mm_storel_epi64((__m128i *)dst_x, res16);
+        } else {
+          __m128i shifted_16 = _mm_packus_epi32(shifted, shifted);
+          _mm_storel_epi64((__m128i *)dst_16_x, shifted_16);
+        }
+      } else {
+        const __m128i subbed = _mm_sub_epi32(shifted, sub);
+        // The lanes are 32 bits wide here, so the shift must be the 32-bit
+        // arithmetic shift (a 16-bit shift would treat each half of a lane
+        // separately).
+        __m128i result =
+            _mm_sra_epi32(_mm_add_epi32(subbed, bits_const), bits_shift);
+        result = _mm_packus_epi32(result, result);
+        result = _mm_min_epu16(result, clip_pixel_);
+        _mm_storel_epi64((__m128i *)dst_x, result);
+      }
+    }
+
+    // Scalar path for the pixels the vector loop did not cover (fewer than
+    // four per row); it follows the same three cases as above.
+    for (; x < w; ++x) {
+      const int16_t *src_x = src_y + x * src_stride;
+      int32_t sum = 1 << offset_bits;
+      for (int k = 0; k < ntaps; ++k) sum += filter[k] * src_x[k];
+      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
+      if (conv_params->is_compound) {
+        if (conv_params->do_average) {
+          int32_t tmp = dst16[y * dst16_stride + x];
+          if (conv_params->use_jnt_comp_avg) {
+            tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
+            tmp = tmp >> DIST_PRECISION_BITS;
+          } else {
+            tmp += res;
+            tmp = tmp >> 1;
+          }
+          /* Subtract round offset and convolve round */
+          tmp = tmp - sub32;
+          dst[y * dst_stride + x] =
+              clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+        } else {
+          dst16[y * dst16_stride + x] = res;
+        }
+      } else {
+        /* Subtract round offset and convolve round */
+        int32_t tmp = res - sub32;
+        dst[y * dst_stride + x] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
+      }
+    }
+  }
+}
+
+// High bit-depth scaled 2D convolution: a horizontal 8-tap pass
+// (highbd_hfilter8) into a zeroed, transposed intermediate buffer on the
+// stack, followed by a vertical 8-tap pass (highbd_vfilter8) that writes dst
+// and/or conv_params->dst. Both filters must have exactly eight taps.
+void av1_highbd_convolve_2d_scale_sse4_1(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
+    const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
+    ConvolveParams *conv_params, int bd) {
+  const int xtaps = filter_params_x->taps;
+  const int ytaps = filter_params_y->taps;
+  assert((xtaps == 8) && (ytaps == 8));
+  (void)xtaps;
+
+  // Number of source rows the vertical pass can touch: the integer position
+  // of the last output row plus one full kernel.
+  const int last_row_qn = (h - 1) * y_step_qn + subpel_y_qn;
+  const int im_h = (last_row_qn >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
+
+  // TODO(yaowu): Move this out of stack
+  DECLARE_ALIGNED(16, int16_t,
+                  tmp[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]);
+  memset(tmp, 0, sizeof(tmp));
+
+  // horizontal filter, starting ytaps / 2 - 1 rows above the block
+  const int fo_vert = ytaps / 2 - 1;
+  highbd_hfilter8(src - fo_vert * src_stride, src_stride, tmp, w, im_h,
+                  subpel_x_qn, x_step_qn, filter_params_x, conv_params->round_0,
+                  bd);
+
+  // vertical filter (input is transposed, so im_h acts as its stride)
+  highbd_vfilter8(tmp, im_h, dst, dst_stride, w, h, subpel_y_qn, y_step_qn,
+                  filter_params_y, conv_params, bd);
+}
diff --git a/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
new file mode 100644
index 0000000000..212d3bd723
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_highbd_convolve_sse4.c
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/filter.h"
+
+// Function-pointer type matching the signature of trans_save_4x4() and
+// trans_accum_save_4x4() below.
+typedef void (*TransposeSave)(int width, int pixelsNum, uint32_t *src,
+ int src_stride, uint16_t *dst, int dst_stride,
+ int bd);
+
+// Stores rows of 16-bit pixels from u[] to dst, one row per dst_stride.
+//   pixelsNum 0     : all 4 rows (u[0]..u[3]) are written
+//   pixelsNum 1/2/3 : only the first 1/2/3 rows are written
+//   any other value : nothing is written
+// With width == 2 each row is a 32-bit store (two pixels); otherwise each row
+// is a 64-bit store (four pixels).
+static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
+                       int dst_stride) {
+  int rows;
+  switch (pixelsNum) {
+    case 0: rows = 4; break;
+    case 1:
+    case 2:
+    case 3: rows = pixelsNum; break;
+    default: return;
+  }
+
+  for (int r = 0; r < rows; ++r) {
+    uint16_t *const row = dst + r * dst_stride;
+    if (2 == width) {
+      *(int *)row = _mm_cvtsi128_si32(u[r]);
+    } else {
+      _mm_storel_epi64((__m128i *)row, u[r]);
+    }
+  }
+}
+
+// Clamps every signed 16-bit lane of p[0..numVecs-1] in place to the range
+// [0, (1 << bd) - 1]: lanes above the maximum become the maximum, lanes that
+// are not greater than zero become zero.
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+  const __m128i lane_one = _mm_set1_epi16(1);
+  const __m128i upper = _mm_sub_epi16(_mm_slli_epi16(lane_one, bd), lane_one);
+  const __m128i lower = _mm_setzero_si128();
+
+  for (int k = 0; k < numVecs; k++) {
+    // Select `upper` where the lane exceeds it, the original lane elsewhere.
+    const __m128i too_big = _mm_cmpgt_epi16(p[k], upper);
+    const __m128i capped = _mm_or_si128(_mm_and_si128(too_big, upper),
+                                        _mm_andnot_si128(too_big, p[k]));
+    // Keep only the lanes that are strictly positive.
+    p[k] = _mm_and_si128(capped, _mm_cmpgt_epi16(capped, lower));
+  }
+}
+
+// Loads a 4x4 block of 32-bit sums (rows src_stride apart), rounds each value
+// by FILTER_BITS, packs to 16 bits with unsigned saturation, clips to bd and
+// transposes. On return the low 64 bits of u[k] hold column k of the input
+// block, i.e. row k of the transposed block.
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+  const __m128i half = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  __m128i row[4];
+
+  for (int r = 0; r < 4; ++r) {
+    const __m128i raw =
+        _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+    row[r] = _mm_srai_epi32(_mm_add_epi32(raw, half), FILTER_BITS);
+  }
+
+  // Two input rows per register: u[0] = rows 0 and 1, u[1] = rows 2 and 3.
+  u[0] = _mm_packus_epi32(row[0], row[1]);
+  u[1] = _mm_packus_epi32(row[2], row[3]);
+
+  highbd_clip(u, 2, bd);
+
+  // 4x4 transpose of 16-bit values through two rounds of interleaving.
+  const __m128i mix_lo = _mm_unpacklo_epi16(u[0], u[1]);
+  const __m128i mix_hi = _mm_unpackhi_epi16(u[0], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(mix_lo, mix_hi);
+  u[2] = _mm_unpackhi_epi16(mix_lo, mix_hi);
+
+  // Columns 1 and 3 sit in the upper halves of u[0] and u[2]; move them down.
+  u[1] = _mm_srli_si128(u[0], 8);
+  u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// Rounds, clips and transposes a 4x4 block of 32-bit sums and stores it.
+// pixelsNum = 0 : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : only the residual 1/2/3 rows of pixels will be saved.
+void trans_save_4x4(int width, int pixelsNum, uint32_t *src, int src_stride,
+                    uint16_t *dst, int dst_stride, int bd) {
+  __m128i rows[4];
+
+  transClipPixel(src, src_stride, rows, bd);
+  writePixel(rows, width, pixelsNum, dst, dst_stride);
+}
+
+// Like trans_save_4x4(), but each new pixel is averaged with the pixel
+// already in dst, (new + old + 1) >> 1 in 16-bit lanes, before the rows
+// selected by pixelsNum are written back. All four dst rows are read
+// regardless of pixelsNum.
+void trans_accum_save_4x4(int width, int pixelsNum, uint32_t *src,
+                          int src_stride, uint16_t *dst, int dst_stride,
+                          int bd) {
+  __m128i cur[4];
+  const __m128i one = _mm_set1_epi16(1);
+
+  transClipPixel(src, src_stride, cur, bd);
+
+  for (int r = 0; r < 4; ++r) {
+    const __m128i prev =
+        _mm_loadl_epi64((__m128i const *)(dst + r * dst_stride));
+    const __m128i sum = _mm_add_epi16(_mm_add_epi16(cur[r], prev), one);
+    cur[r] = _mm_srai_epi16(sum, 1);
+  }
+
+  writePixel(cur, width, pixelsNum, dst, dst_stride);
+}
+
+// Vertical convolutional filter
+
+// Function-pointer type matching the write2pixels*/write4pixels* helpers
+// below: u[0] holds the 32-bit sums, bd the bit depth, dst the destination.
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+// Rounds the four 32-bit lanes of u[0] by FILTER_BITS (round to nearest) and
+// packs them to 16 bits with unsigned saturation; the packed values appear in
+// both halves of u[0].
+static void highbdRndingPacks(__m128i *u) {
+  const __m128i half = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  const __m128i rounded =
+      _mm_srai_epi32(_mm_add_epi32(u[0], half), FILTER_BITS);
+  u[0] = _mm_packus_epi32(rounded, rounded);
+}
+
+// Rounds, packs and clips u[0], then stores its first two 16-bit pixels to
+// dst with one 32-bit write.
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  const int two_pixels = _mm_cvtsi128_si32(u[0]);
+  *(uint32_t *)dst = two_pixels;
+}
+
+// Rounds, packs and clips u[0], averages it with the pixels already in dst,
+// (old + new + 1) >> 1 in 16-bit lanes, and stores the first two pixels with
+// one 32-bit write. Note that 64 bits are read from dst.
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  const __m128i prev = _mm_loadl_epi64((__m128i const *)dst);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  const __m128i sum =
+      _mm_add_epi16(_mm_add_epi16(prev, u[0]), _mm_set1_epi16(1));
+  *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_srai_epi16(sum, 1));
+}
+
+// Two-pixel writers: index 0 stores the new pixels, index 1 averages them
+// with the pixels already in dst.
+WritePixels write2pixelsTab[2] = { write2pixelsOnly, write2pixelsAccum };
+
+// Rounds, packs and clips u[0], then stores its first four 16-bit pixels to
+// dst with one 64-bit write.
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  __m128i *const out = (__m128i *)dst;
+  _mm_storel_epi64(out, u[0]);
+}
+
+// Rounds, packs and clips u[0], averages it with the four pixels already in
+// dst, (old + new + 1) >> 1 in 16-bit lanes, and stores the result with one
+// 64-bit write.
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  const __m128i prev = _mm_loadl_epi64((__m128i const *)dst);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  const __m128i sum =
+      _mm_add_epi16(_mm_add_epi16(prev, u[0]), _mm_set1_epi16(1));
+  _mm_storel_epi64((__m128i *)dst, _mm_srai_epi16(sum, 1));
+}
+
+// Four-pixel writers: index 0 stores the new pixels, index 1 averages them
+// with the pixels already in dst.
+WritePixels write4pixelsTab[2] = { write4pixelsOnly, write4pixelsAccum };
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
new file mode 100644
index 0000000000..5db2ccf6c3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.c
@@ -0,0 +1,1945 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/av1_inv_txfm_avx2.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+// Powers of sqrt(2) in fixed point where 1.0 == 4096:
+// Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5.
+// Sized by TX_SIZES, so presumably indexed by a transform-size index
+// (confirm against the users of this table). Read-only, hence const.
+static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                4 * 4096, 4 * 5793 };
+
+// Stage 5 of the 16-point idct network, applied in place to x1[0..15].
+// Slots 4 and 7 are left untouched. _r and cos_bit are forwarded to the
+// weighted butterfly.
+static INLINE void idct16_stage5_avx2(__m256i *x1, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+
+  // Weighted butterfly on the (5, 6) pair.
+  btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+
+  // Add/sub butterflies; every pair touches distinct slots.
+  for (int i = 0; i < 2; ++i) {
+    btf_16_adds_subs_avx2(&x1[i], &x1[3 - i]);
+    btf_16_adds_subs_avx2(&x1[8 + i], &x1[11 - i]);
+    btf_16_adds_subs_avx2(&x1[15 - i], &x1[12 + i]);
+  }
+}
+
+// Stage 6 of the 16-point idct network, applied in place: add/sub butterflies
+// mirror slots 0..7, weighted butterflies handle (10, 13) and (11, 12).
+static INLINE void idct16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+
+  for (int i = 0; i < 4; ++i) {
+    btf_16_adds_subs_avx2(&x[i], &x[7 - i]);
+  }
+  for (int i = 0; i < 2; ++i) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[10 + i], &x[13 - i], _r,
+                    cos_bit);
+  }
+}
+
+// Final stage of the 16-point idct: combines x1[i] with x1[15 - i] and writes
+// the two results to output[i] and output[15 - i].
+static INLINE void idct16_stage7_avx2(__m256i *output, __m256i *x1) {
+  for (int i = 0; i < 8; ++i) {
+    const int mirror = 15 - i;
+    btf_16_adds_subs_out_avx2(&output[i], &output[mirror], x1[i], x1[mirror]);
+  }
+}
+
+// 16-point idct butterfly network using all of input[0..15]; results go to
+// output[0..15]. The cosine table and rounding constant are derived from
+// INV_COS_BIT; cos_bit is forwarded unchanged to the weighted butterflies.
+static void idct16_new_avx2(const __m256i *input, __m256i *output,
+                            int8_t cos_bit) {
+  // Stage-1 permutation: buf[i] = input[kInOrder[i]].
+  static const int8_t kInOrder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
+                                       1, 9, 5, 13, 3, 11, 7, 15 };
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[16];
+
+  // stage 1
+  for (int i = 0; i < 16; ++i) buf[i] = input[kInOrder[i]];
+
+  // stage 2
+  const __m256i w_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  const __m256i w_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  const __m256i w_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  const __m256i w_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  const __m256i w_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  const __m256i w_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  const __m256i w_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+  const __m256i w_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  btf_16_w16_avx2(w_p60_m04, w_p04_p60, &buf[8], &buf[15], _r, cos_bit);
+  btf_16_w16_avx2(w_p28_m36, w_p36_p28, &buf[9], &buf[14], _r, cos_bit);
+  btf_16_w16_avx2(w_p44_m20, w_p20_p44, &buf[10], &buf[13], _r, cos_bit);
+  btf_16_w16_avx2(w_p12_m52, w_p52_p12, &buf[11], &buf[12], _r, cos_bit);
+
+  // stage 3
+  const __m256i w_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i w_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i w_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  const __m256i w_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  btf_16_w16_avx2(w_p56_m08, w_p08_p56, &buf[4], &buf[7], _r, cos_bit);
+  btf_16_w16_avx2(w_p24_m40, w_p40_p24, &buf[5], &buf[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&buf[8], &buf[9]);
+  btf_16_adds_subs_avx2(&buf[11], &buf[10]);
+  btf_16_adds_subs_avx2(&buf[12], &buf[13]);
+  btf_16_adds_subs_avx2(&buf[15], &buf[14]);
+
+  // stage 4
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i w_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  const __m256i w_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_avx2(w_p32_p32, w_p32_m32, &buf[0], &buf[1], _r, cos_bit);
+  btf_16_w16_avx2(w_p48_m16, w_p16_p48, &buf[2], &buf[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&buf[4], &buf[5]);
+  btf_16_adds_subs_avx2(&buf[7], &buf[6]);
+  btf_16_w16_avx2(w_m16_p48, w_p48_p16, &buf[9], &buf[14], _r, cos_bit);
+  btf_16_w16_avx2(w_m48_m16, w_m16_p48, &buf[10], &buf[13], _r, cos_bit);
+
+  // stages 5-7
+  idct16_stage5_avx2(buf, cospi, _r, cos_bit);
+  idct16_stage6_avx2(buf, cospi, _r, cos_bit);
+  idct16_stage7_avx2(output, buf);
+}
+
+// 16-point idct variant that reads only input[0..7]. The first stages use the
+// single-input btf_16_w16_0_avx2 form to fill the slots that stage 1 leaves
+// unset. Writes output[0..15].
+static void idct16_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  // Stage 1 fills even slots only: buf[2 * i] = input[kInOrder[i]].
+  static const int8_t kInOrder[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[16];
+
+  // stage 1
+  for (int i = 0; i < 8; ++i) buf[2 * i] = input[kInOrder[i]];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[60], cospi[4], buf[8], buf[8], buf[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], buf[14], buf[9], buf[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], buf[10], buf[10], buf[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], buf[12], buf[11], buf[12]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[56], cospi[8], buf[4], buf[4], buf[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], buf[6], buf[5], buf[6]);
+  btf_16_adds_subs_avx2(&buf[8], &buf[9]);
+  btf_16_adds_subs_avx2(&buf[11], &buf[10]);
+  btf_16_adds_subs_avx2(&buf[12], &buf[13]);
+  btf_16_adds_subs_avx2(&buf[15], &buf[14]);
+
+  // stage 4
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  btf_16_w16_0_avx2(cospi[32], cospi[32], buf[0], buf[0], buf[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], buf[2], buf[2], buf[3]);
+  btf_16_adds_subs_avx2(&buf[4], &buf[5]);
+  btf_16_adds_subs_avx2(&buf[7], &buf[6]);
+  btf_16_w16_avx2(w_m16_p48, w_p48_p16, &buf[9], &buf[14], _r, cos_bit);
+  btf_16_w16_avx2(w_m48_m16, w_m16_p48, &buf[10], &buf[13], _r, cos_bit);
+
+  // stages 5-7
+  idct16_stage5_avx2(buf, cospi, _r, cos_bit);
+  idct16_stage6_avx2(buf, cospi, _r, cos_bit);
+  idct16_stage7_avx2(output, buf);
+}
+
+// 16-point idct variant that reads only input[0]. One single-input butterfly
+// yields two vectors, which are replicated over output[0..15] in the repeating
+// order 0, 1, 1, 0.
+static void idct16_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  __m256i pair[2];
+
+  // stage 1
+  pair[0] = input[0];
+
+  // stages 2-4: only the cospi[32] butterfly has any input
+  btf_16_w16_0_avx2(cospi[32], cospi[32], pair[0], pair[0], pair[1]);
+
+  // stages 5-6: ((i + 1) >> 1) & 1 selects pair[0] for i = 0, 3, 4, 7, ...
+  // and pair[1] for i = 1, 2, 5, 6, ...
+  for (int i = 0; i < 16; ++i) {
+    output[i] = pair[((i + 1) >> 1) & 1];
+  }
+}
+
+// Stage 3 of the 16-point iadst network: in-place add/sub butterfly between
+// x[i] and x[i + 8] for i = 0..7.
+static INLINE void iadst16_stage3_avx2(__m256i *x) {
+  for (int i = 0; i < 8; ++i) {
+    btf_16_adds_subs_avx2(&x[i], &x[i + 8]);
+  }
+}
+
+// Stage 4 of the 16-point iadst network: weighted butterflies on the adjacent
+// pairs (8,9), (10,11), (12,13), (14,15). Slots 0..7 are left untouched.
+static INLINE void iadst16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i w_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i w_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+
+  const __m256i w_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  btf_16_w16_avx2(w_p08_p56, w_p56_m08, &x[8], &x[9], _r, cos_bit);
+
+  const __m256i w_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  btf_16_w16_avx2(w_p40_p24, w_p24_m40, &x[10], &x[11], _r, cos_bit);
+
+  const __m256i w_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+  btf_16_w16_avx2(w_m56_p08, w_p08_p56, &x[12], &x[13], _r, cos_bit);
+
+  const __m256i w_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+  btf_16_w16_avx2(w_m24_p40, w_p40_p24, &x[14], &x[15], _r, cos_bit);
+}
+
+// Stage 5 of the 16-point iadst network: in-place add/sub butterfly between
+// slots four apart, within each half (0..7 and 8..15).
+static INLINE void iadst16_stage5_avx2(__m256i *x) {
+  for (int i = 0; i < 4; ++i) {
+    btf_16_adds_subs_avx2(&x[i], &x[i + 4]);
+    btf_16_adds_subs_avx2(&x[i + 8], &x[i + 12]);
+  }
+}
+
+// Stage 6 of the 16-point iadst network: the same two weighted butterflies
+// are applied to slots 4..7 and again to slots 12..15.
+static INLINE void iadst16_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i w_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  const __m256i w_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+
+  for (int base = 4; base < 16; base += 8) {
+    btf_16_w16_avx2(w_p16_p48, w_p48_m16, &x[base], &x[base + 1], _r,
+                    cos_bit);
+    btf_16_w16_avx2(w_m48_p16, w_p16_p48, &x[base + 2], &x[base + 3], _r,
+                    cos_bit);
+  }
+}
+
+// Stage 7 of the 16-point iadst network: within each group of four slots,
+// in-place add/sub butterfly between slots two apart.
+static INLINE void iadst16_stage7_avx2(__m256i *x) {
+  for (int base = 0; base < 16; base += 4) {
+    btf_16_adds_subs_avx2(&x[base], &x[base + 2]);
+    btf_16_adds_subs_avx2(&x[base + 1], &x[base + 3]);
+  }
+}
+
+// Stage 8 of the 16-point iadst network: one cospi[32] weighted butterfly on
+// the upper pair of every group of four, i.e. (2,3), (6,7), (10,11), (14,15).
+static INLINE void iadst16_stage8_avx2(__m256i *x1, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i w_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+
+  for (int base = 2; base < 16; base += 4) {
+    btf_16_w16_avx2(w_p32_p32, w_p32_m32, &x1[base], &x1[base + 1], _r,
+                    cos_bit);
+  }
+}
+
+// Final stage of the 16-point iadst: permutes x1 into output. Even output
+// slots are plain copies; odd output slots are negated with a saturating
+// subtract from zero.
+static INLINE void iadst16_stage9_avx2(__m256i *output, __m256i *x1) {
+  // output[i] takes its value from x1[kSrc[i]].
+  static const int8_t kSrc[16] = { 0, 8, 12, 4, 6, 14, 10, 2,
+                                   3, 11, 15, 7, 5, 13, 9, 1 };
+  const __m256i zero = _mm256_setzero_si256();
+
+  for (int i = 0; i < 16; i += 2) {
+    output[i] = x1[kSrc[i]];
+    output[i + 1] = _mm256_subs_epi16(zero, x1[kSrc[i + 1]]);
+  }
+}
+
+// 16-point iadst butterfly network using all of input[0..15]; results go to
+// output[0..15]. The cosine table and rounding constant are derived from
+// INV_COS_BIT; cos_bit is forwarded unchanged to the weighted butterflies.
+static void iadst16_new_avx2(const __m256i *input, __m256i *output,
+                             int8_t cos_bit) {
+  // Stage-1 permutation: buf[i] = input[kInOrder[i]].
+  static const int8_t kInOrder[16] = { 15, 0, 13, 2, 11, 4, 9, 6,
+                                       7, 8, 5, 10, 3, 12, 1, 14 };
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[16];
+
+  // stage 1
+  for (int i = 0; i < 16; ++i) buf[i] = input[kInOrder[i]];
+
+  // stage 2: pair k = (2k, 2k + 1) uses the weights
+  // (cospi[2 + 8k], cospi[62 - 8k]) and (cospi[62 - 8k], -cospi[2 + 8k]).
+  for (int k = 0; k < 8; ++k) {
+    const int32_t c_lo = cospi[2 + 8 * k];
+    const int32_t c_hi = cospi[62 - 8 * k];
+    const __m256i wa = pair_set_w16_epi16(c_lo, c_hi);
+    const __m256i wb = pair_set_w16_epi16(c_hi, -c_lo);
+    btf_16_w16_avx2(wa, wb, &buf[2 * k], &buf[2 * k + 1], _r, cos_bit);
+  }
+
+  // stages 3-9
+  iadst16_stage3_avx2(buf);
+  iadst16_stage4_avx2(buf, cospi, _r, cos_bit);
+  iadst16_stage5_avx2(buf);
+  iadst16_stage6_avx2(buf, cospi, _r, cos_bit);
+  iadst16_stage7_avx2(buf);
+  iadst16_stage8_avx2(buf, cospi, _r, cos_bit);
+  iadst16_stage9_avx2(output, buf);
+}
+
+// 16-point iadst variant that reads only input[0..7]. Stage 2 uses the
+// single-input btf_16_w16_0_avx2 form to produce both members of each pair
+// from the one slot stage 1 filled. Later stages are shared with the full
+// transform. Writes output[0..15].
+//   input   - source vectors; only indices 0..7 are read.
+//   output  - receives 16 result vectors.
+//   cos_bit - forwarded to the weighted butterflies in stages 4, 6 and 8.
+static void iadst16_low8_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1: the other member of each pair is written by stage 2.
+  __m256i x1[16];
+  x1[1] = input[0];
+  x1[3] = input[2];
+  x1[5] = input[4];
+  x1[7] = input[6];
+  x1[8] = input[7];
+  x1[10] = input[5];
+  x1[12] = input[3];
+  x1[14] = input[1];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], x1[1], x1[0], x1[1]);
+  btf_16_w16_0_avx2(cospi[54], -cospi[10], x1[3], x1[2], x1[3]);
+  btf_16_w16_0_avx2(cospi[46], -cospi[18], x1[5], x1[4], x1[5]);
+  btf_16_w16_0_avx2(cospi[38], -cospi[26], x1[7], x1[6], x1[7]);
+  btf_16_w16_0_avx2(cospi[34], cospi[30], x1[8], x1[8], x1[9]);
+  btf_16_w16_0_avx2(cospi[42], cospi[22], x1[10], x1[10], x1[11]);
+  btf_16_w16_0_avx2(cospi[50], cospi[14], x1[12], x1[12], x1[13]);
+  // Plain decimal index; a leading zero would make it an octal literal.
+  btf_16_w16_0_avx2(cospi[58], cospi[6], x1[14], x1[14], x1[15]);
+
+  iadst16_stage3_avx2(x1);
+  iadst16_stage4_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage5_avx2(x1);
+  iadst16_stage6_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage7_avx2(x1);
+  iadst16_stage8_avx2(x1, cospi, _r, cos_bit);
+  iadst16_stage9_avx2(output, x1);
+}
+
+// 16-point iadst variant that reads only input[0]. Stages 3, 5 and 7 are
+// plain copies here; stages 4 and 6 apply their weighted butterflies to the
+// copied pairs. Stages 8 and 9 are shared with the full transform.
+static void iadst16_low1_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[16];
+
+  // stage 1
+  buf[1] = input[0];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], -cospi[2], buf[1], buf[0], buf[1]);
+
+  // stage 3
+  buf[8] = buf[0];
+  buf[9] = buf[1];
+
+  // stage 4
+  const __m256i w_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i w_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  btf_16_w16_avx2(w_p08_p56, w_p56_m08, &buf[8], &buf[9], _r, cos_bit);
+
+  // stage 5: copy (0,1) to (4,5) and (8,9) to (12,13)
+  for (int base = 0; base < 16; base += 8) {
+    buf[base + 4] = buf[base];
+    buf[base + 5] = buf[base + 1];
+  }
+
+  // stage 6
+  const __m256i w_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  for (int base = 4; base < 16; base += 8) {
+    btf_16_w16_avx2(w_p16_p48, w_p48_m16, &buf[base], &buf[base + 1], _r,
+                    cos_bit);
+  }
+
+  // stage 7: copy the lower pair of every group of four to the upper pair
+  for (int base = 0; base < 16; base += 4) {
+    buf[base + 2] = buf[base];
+    buf[base + 3] = buf[base + 1];
+  }
+
+  // stages 8-9
+  iadst16_stage8_avx2(buf, cospi, _r, cos_bit);
+  iadst16_stage9_avx2(output, buf);
+}
+
+// Stage 3 of the 32-point idct, slots 16..31 only: for each group of four,
+// in-place add/sub butterflies on (base, base + 1) and (base + 3, base + 2).
+static INLINE void idct32_high16_stage3_avx2(__m256i *x) {
+  for (int base = 16; base < 32; base += 4) {
+    btf_16_adds_subs_avx2(&x[base], &x[base + 1]);
+    btf_16_adds_subs_avx2(&x[base + 3], &x[base + 2]);
+  }
+}
+
+// Stage 4 of the 32-point idct, slots 16..31 only: weighted butterflies on
+// the pairs (17,30), (18,29), (21,26) and (22,25).
+static INLINE void idct32_high16_stage4_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  // cospi[8] / cospi[56] weights for the outer two pairs.
+  const __m256i w_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i w_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i w_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+  btf_16_w16_avx2(w_m08_p56, w_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(w_m56_m08, w_m08_p56, &x[18], &x[29], _r, cos_bit);
+
+  // cospi[24] / cospi[40] weights for the inner two pairs.
+  const __m256i w_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i w_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+  const __m256i w_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  btf_16_w16_avx2(w_m40_p24, w_p24_p40, &x[21], &x[26], _r, cos_bit);
+  btf_16_w16_avx2(w_m24_m40, w_m40_p24, &x[22], &x[25], _r, cos_bit);
+}
+
+// Stage 5 of the 32-point idct, slots 8..31 only: weighted butterflies on
+// (9,14) and (10,13), then add/sub butterflies inside the two groups of eight
+// starting at slots 16 and 24.
+static INLINE void idct32_high24_stage5_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+
+  btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[9], &x[14], _r, cos_bit);
+  btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[10], &x[13], _r, cos_bit);
+
+  for (int base = 16; base < 32; base += 8) {
+    btf_16_adds_subs_avx2(&x[base], &x[base + 3]);
+    btf_16_adds_subs_avx2(&x[base + 1], &x[base + 2]);
+    btf_16_adds_subs_avx2(&x[base + 7], &x[base + 4]);
+    btf_16_adds_subs_avx2(&x[base + 6], &x[base + 5]);
+  }
+}
+
+// Stage 6 of the 32-point idct, slots 4..31 only: a weighted butterfly on
+// (5,6), add/sub butterflies inside slots 8..15, and weighted butterflies on
+// (18,29), (19,28), (20,27) and (21,26).
+static INLINE void idct32_high28_stage6_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[5], &x[6], _r, cos_bit);
+
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  for (int i = 0; i < 2; ++i) {
+    btf_16_adds_subs_avx2(&x[8 + i], &x[11 - i]);
+    btf_16_adds_subs_avx2(&x[15 - i], &x[12 + i]);
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[18 + i], &x[29 - i], _r,
+                    cos_bit);
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[20 + i], &x[27 - i], _r,
+                    cos_bit);
+  }
+}
+
+// Stage 7 of the 32-point idct, in place: mirrored add/sub butterflies on
+// slots 0..7, 16..23 and 24..31, plus weighted butterflies on (10,13) and
+// (11,12). Slots 8, 9, 14 and 15 are left untouched.
+static INLINE void idct32_stage7_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+
+  for (int i = 0; i < 4; ++i) {
+    btf_16_adds_subs_avx2(&x[i], &x[7 - i]);
+    btf_16_adds_subs_avx2(&x[16 + i], &x[23 - i]);
+    btf_16_adds_subs_avx2(&x[31 - i], &x[24 + i]);
+  }
+  for (int i = 0; i < 2; ++i) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[10 + i], &x[13 - i], _r,
+                    cos_bit);
+  }
+}
+
+// Stage 8 of the 32-point idct, in place: mirrored add/sub butterflies across
+// slots 0..15, plus weighted butterflies on (20,27), (21,26), (22,25), (23,24).
+static INLINE void idct32_stage8_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+
+  for (int i = 0; i < 8; ++i) {
+    btf_16_adds_subs_avx2(&x[i], &x[15 - i]);
+  }
+  for (int i = 0; i < 4; ++i) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[20 + i], &x[27 - i], _r,
+                    cos_bit);
+  }
+}
+
+// Final stage of the 32-point idct: combines x[i] with x[31 - i] and writes
+// the two results to output[i] and output[31 - i].
+static INLINE void idct32_stage9_avx2(__m256i *output, __m256i *x) {
+  for (int i = 0; i < 16; ++i) {
+    const int mirror = 31 - i;
+    btf_16_adds_subs_out_avx2(&output[i], &output[mirror], x[i], x[mirror]);
+  }
+}
+
+// 32-point idct variant that reads only input[0]. One single-input butterfly
+// yields two vectors, which are written symmetrically (output[i] and
+// output[31 - i] get the same vector) in the repeating order 0, 1, 1, 0.
+static void idct32_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  __m256i pair[2];
+
+  // stage 1
+  pair[0] = input[0];
+
+  // stages 2-5: only the cospi[32] butterfly has any input
+  btf_16_w16_0_avx2(cospi[32], cospi[32], pair[0], pair[0], pair[1]);
+
+  // stages 6-9: ((i + 1) >> 1) & 1 selects pair[0] for i = 0, 3, 4, 7, ...
+  // and pair[1] for i = 1, 2, 5, 6, ...
+  for (int i = 0; i < 16; ++i) {
+    const int sel = ((i + 1) >> 1) & 1;
+    output[i] = pair[sel];
+    output[31 - i] = pair[sel];
+  }
+}
+
+// 32-point idct variant that reads only input[0..7]. Slots that stage 1
+// leaves unset are filled by single-input butterflies or by plain copies
+// before the shared stage helpers run. Writes output[0..31].
+static void idct32_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  // Stage 1 fills every fourth slot: buf[4 * i] = input[kInOrder[i]].
+  static const int8_t kInOrder[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[32];
+
+  // stage 1
+  for (int i = 0; i < 8; ++i) buf[4 * i] = input[kInOrder[i]];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], buf[16], buf[16], buf[31]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], buf[28], buf[19], buf[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], buf[20], buf[20], buf[27]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], buf[24], buf[23], buf[24]);
+
+  // stage 3: slots 16..31 reduce to copies
+  btf_16_w16_0_avx2(cospi[60], cospi[4], buf[8], buf[8], buf[15]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], buf[12], buf[11], buf[12]);
+  for (int base = 16; base < 32; base += 4) {
+    buf[base + 1] = buf[base];
+    buf[base + 2] = buf[base + 3];
+  }
+
+  // stage 4: slots 8..15 reduce to copies
+  btf_16_w16_0_avx2(cospi[56], cospi[8], buf[4], buf[4], buf[7]);
+  for (int base = 8; base < 16; base += 4) {
+    buf[base + 1] = buf[base];
+    buf[base + 2] = buf[base + 3];
+  }
+  idct32_high16_stage4_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], buf[0], buf[0], buf[1]);
+  buf[5] = buf[4];
+  buf[6] = buf[7];
+  idct32_high24_stage5_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 6
+  buf[3] = buf[0];
+  buf[2] = buf[1];
+  idct32_high28_stage6_avx2(buf, cospi, _r, cos_bit);
+
+  // stages 7-9
+  idct32_stage7_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage8_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage9_avx2(output, buf);
+}
+
+// 32-point idct variant that reads only input[0..15]. Stage 1 fills the even
+// slots; the odd slots are produced by single-input butterflies in stages
+// 2-5. Writes output[0..31].
+static void idct32_low16_new_avx2(const __m256i *input, __m256i *output,
+                                  int8_t cos_bit) {
+  // Stage 1 fills even slots only: buf[2 * i] = input[kInOrder[i]].
+  static const int8_t kInOrder[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
+                                       1, 9, 5, 13, 3, 11, 7, 15 };
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[32];
+
+  // stage 1
+  for (int i = 0; i < 16; ++i) buf[2 * i] = input[kInOrder[i]];
+
+  // stage 2
+  btf_16_w16_0_avx2(cospi[62], cospi[2], buf[16], buf[16], buf[31]);
+  btf_16_w16_0_avx2(-cospi[34], cospi[30], buf[30], buf[17], buf[30]);
+  btf_16_w16_0_avx2(cospi[46], cospi[18], buf[18], buf[18], buf[29]);
+  btf_16_w16_0_avx2(-cospi[50], cospi[14], buf[28], buf[19], buf[28]);
+  btf_16_w16_0_avx2(cospi[54], cospi[10], buf[20], buf[20], buf[27]);
+  btf_16_w16_0_avx2(-cospi[42], cospi[22], buf[26], buf[21], buf[26]);
+  btf_16_w16_0_avx2(cospi[38], cospi[26], buf[22], buf[22], buf[25]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], buf[24], buf[23], buf[24]);
+
+  // stage 3
+  btf_16_w16_0_avx2(cospi[60], cospi[4], buf[8], buf[8], buf[15]);
+  btf_16_w16_0_avx2(-cospi[36], cospi[28], buf[14], buf[9], buf[14]);
+  btf_16_w16_0_avx2(cospi[44], cospi[20], buf[10], buf[10], buf[13]);
+  btf_16_w16_0_avx2(-cospi[52], cospi[12], buf[12], buf[11], buf[12]);
+  idct32_high16_stage3_avx2(buf);
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[56], cospi[8], buf[4], buf[4], buf[7]);
+  btf_16_w16_0_avx2(-cospi[40], cospi[24], buf[6], buf[5], buf[6]);
+  btf_16_adds_subs_avx2(&buf[8], &buf[9]);
+  btf_16_adds_subs_avx2(&buf[11], &buf[10]);
+  btf_16_adds_subs_avx2(&buf[12], &buf[13]);
+  btf_16_adds_subs_avx2(&buf[15], &buf[14]);
+  idct32_high16_stage4_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 5
+  btf_16_w16_0_avx2(cospi[32], cospi[32], buf[0], buf[0], buf[1]);
+  btf_16_w16_0_avx2(cospi[48], cospi[16], buf[2], buf[2], buf[3]);
+  btf_16_adds_subs_avx2(&buf[4], &buf[5]);
+  btf_16_adds_subs_avx2(&buf[7], &buf[6]);
+  idct32_high24_stage5_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 6
+  btf_16_adds_subs_avx2(&buf[0], &buf[3]);
+  btf_16_adds_subs_avx2(&buf[1], &buf[2]);
+  idct32_high28_stage6_avx2(buf, cospi, _r, cos_bit);
+
+  // stages 7-9
+  idct32_stage7_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage8_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage9_avx2(output, buf);
+}
+
+// 32-point idct butterfly network using all of input[0..31]; results go to
+// output[0..31]. The cosine table and rounding constant are derived from
+// INV_COS_BIT; cos_bit is forwarded unchanged to the weighted butterflies.
+static void idct32_new_avx2(const __m256i *input, __m256i *output,
+                            int8_t cos_bit) {
+  // Stage-1 permutation: buf[i] = input[kInOrder[i]].
+  static const int8_t kInOrder[32] = { 0, 16, 8,  24, 4, 20, 12, 28,
+                                       2, 18, 10, 26, 6, 22, 14, 30,
+                                       1, 17, 9,  25, 5, 21, 13, 29,
+                                       3, 19, 11, 27, 7, 23, 15, 31 };
+  // Stage-2 cosine index for the pair (16 + k, 31 - k).
+  static const int8_t kStage2Cos[8] = { 62, 30, 46, 14, 54, 22, 38, 6 };
+  (void)(cos_bit);
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  __m256i buf[32];
+
+  // stage 1
+  for (int i = 0; i < 32; ++i) buf[i] = input[kInOrder[i]];
+
+  // stage 2: with a = kStage2Cos[k], the weights are
+  // (cospi[a], -cospi[64 - a]) and (cospi[64 - a], cospi[a]).
+  for (int k = 0; k < 8; ++k) {
+    const int32_t c_a = cospi[kStage2Cos[k]];
+    const int32_t c_b = cospi[64 - kStage2Cos[k]];
+    const __m256i wa = pair_set_w16_epi16(c_a, -c_b);
+    const __m256i wb = pair_set_w16_epi16(c_b, c_a);
+    btf_16_w16_avx2(wa, wb, &buf[16 + k], &buf[31 - k], _r, cos_bit);
+  }
+
+  // stage 3
+  const __m256i w_p60_m04 = pair_set_w16_epi16(cospi[60], -cospi[4]);
+  const __m256i w_p04_p60 = pair_set_w16_epi16(cospi[4], cospi[60]);
+  const __m256i w_p28_m36 = pair_set_w16_epi16(cospi[28], -cospi[36]);
+  const __m256i w_p36_p28 = pair_set_w16_epi16(cospi[36], cospi[28]);
+  const __m256i w_p44_m20 = pair_set_w16_epi16(cospi[44], -cospi[20]);
+  const __m256i w_p20_p44 = pair_set_w16_epi16(cospi[20], cospi[44]);
+  const __m256i w_p12_m52 = pair_set_w16_epi16(cospi[12], -cospi[52]);
+  const __m256i w_p52_p12 = pair_set_w16_epi16(cospi[52], cospi[12]);
+  btf_16_w16_avx2(w_p60_m04, w_p04_p60, &buf[8], &buf[15], _r, cos_bit);
+  btf_16_w16_avx2(w_p28_m36, w_p36_p28, &buf[9], &buf[14], _r, cos_bit);
+  btf_16_w16_avx2(w_p44_m20, w_p20_p44, &buf[10], &buf[13], _r, cos_bit);
+  btf_16_w16_avx2(w_p12_m52, w_p52_p12, &buf[11], &buf[12], _r, cos_bit);
+  idct32_high16_stage3_avx2(buf);
+
+  // stage 4
+  const __m256i w_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+  const __m256i w_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+  const __m256i w_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+  const __m256i w_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+  btf_16_w16_avx2(w_p56_m08, w_p08_p56, &buf[4], &buf[7], _r, cos_bit);
+  btf_16_w16_avx2(w_p24_m40, w_p40_p24, &buf[5], &buf[6], _r, cos_bit);
+  btf_16_adds_subs_avx2(&buf[8], &buf[9]);
+  btf_16_adds_subs_avx2(&buf[11], &buf[10]);
+  btf_16_adds_subs_avx2(&buf[12], &buf[13]);
+  btf_16_adds_subs_avx2(&buf[15], &buf[14]);
+  idct32_high16_stage4_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 5
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i w_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+  const __m256i w_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+  const __m256i w_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+  btf_16_w16_avx2(w_p32_p32, w_p32_m32, &buf[0], &buf[1], _r, cos_bit);
+  btf_16_w16_avx2(w_p48_m16, w_p16_p48, &buf[2], &buf[3], _r, cos_bit);
+  btf_16_adds_subs_avx2(&buf[4], &buf[5]);
+  btf_16_adds_subs_avx2(&buf[7], &buf[6]);
+  idct32_high24_stage5_avx2(buf, cospi, _r, cos_bit);
+
+  // stage 6
+  btf_16_adds_subs_avx2(&buf[0], &buf[3]);
+  btf_16_adds_subs_avx2(&buf[1], &buf[2]);
+  idct32_high28_stage6_avx2(buf, cospi, _r, cos_bit);
+
+  // stages 7-9
+  idct32_stage7_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage8_avx2(buf, cospi, _r, cos_bit);
+  idct32_stage9_avx2(output, buf);
+}
+
+// Stage 4 of the 64-point inverse DCT for the x[32..63] half.
+// Four groups of lanes are processed; group g uses the table angles
+// a = {4, 36, 20, 52}[g] and b = 64 - a, and updates in place the pairs
+// (x[33 + 4g], x[62 - 4g]) and (x[34 + 4g], x[61 - 4g]).
+static INLINE void idct64_stage4_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngles[4] = { 4, 36, 20, 52 };
+  for (int g = 0; g < 4; ++g) {
+    const int a = kAngles[g];
+    const int b = 64 - a;
+    const int lo = 33 + 4 * g;
+    const int hi = 62 - 4 * g;
+    const __m256i w_ma_pb = pair_set_w16_epi16(-cospi[a], cospi[b]);
+    const __m256i w_pb_pa = pair_set_w16_epi16(cospi[b], cospi[a]);
+    const __m256i w_mb_ma = pair_set_w16_epi16(-cospi[b], -cospi[a]);
+    btf_16_w16_avx2(w_ma_pb, w_pb_pa, &x[lo], &x[hi], _r, cos_bit);
+    btf_16_w16_avx2(w_mb_ma, w_ma_pb, &x[lo + 1], &x[hi - 1], _r, cos_bit);
+  }
+}
+
+// Stage 5 of the 64-point inverse DCT for lanes x[16..63].
+// First applies btf_16_w16_avx2 to two groups inside x[16..31] (table angles
+// 8 and 40), then runs btf_16_adds_subs_avx2 over each block of eight lanes
+// in x[32..63]. All updates are in place.
+static INLINE void idct64_stage5_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngles[2] = { 8, 40 };
+  for (int g = 0; g < 2; ++g) {
+    const int a = kAngles[g];
+    const int b = 64 - a;
+    const int lo = 17 + 4 * g;
+    const int hi = 30 - 4 * g;
+    const __m256i w_ma_pb = pair_set_w16_epi16(-cospi[a], cospi[b]);
+    const __m256i w_pb_pa = pair_set_w16_epi16(cospi[b], cospi[a]);
+    const __m256i w_mb_ma = pair_set_w16_epi16(-cospi[b], -cospi[a]);
+    btf_16_w16_avx2(w_ma_pb, w_pb_pa, &x[lo], &x[hi], _r, cos_bit);
+    btf_16_w16_avx2(w_mb_ma, w_ma_pb, &x[lo + 1], &x[hi - 1], _r, cos_bit);
+  }
+  // Within each block of eight: (0,3) (1,2) (7,4) (6,5), first index is the
+  // first argument of btf_16_adds_subs_avx2.
+  for (int base = 32; base < 64; base += 8) {
+    btf_16_adds_subs_avx2(&x[base + 0], &x[base + 3]);
+    btf_16_adds_subs_avx2(&x[base + 1], &x[base + 2]);
+    btf_16_adds_subs_avx2(&x[base + 7], &x[base + 4]);
+    btf_16_adds_subs_avx2(&x[base + 6], &x[base + 5]);
+  }
+}
+
+// Stage 6 of the 64-point inverse DCT for the x[32..63] half.
+// Two groups (table angles 8 and 40) of four btf_16_w16_avx2 calls each:
+// group g pairs x[34 + 8g + k] with x[61 - 8g - k] for k = 0..3, using one
+// weight pair for k = 0, 1 and another for k = 2, 3.
+static INLINE void idct64_stage6_high32_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  static const int kAngles[2] = { 8, 40 };
+  for (int g = 0; g < 2; ++g) {
+    const int a = kAngles[g];
+    const int b = 64 - a;
+    const int lo = 34 + 8 * g;
+    const int hi = 61 - 8 * g;
+    const __m256i w_ma_pb = pair_set_w16_epi16(-cospi[a], cospi[b]);
+    const __m256i w_pb_pa = pair_set_w16_epi16(cospi[b], cospi[a]);
+    const __m256i w_mb_ma = pair_set_w16_epi16(-cospi[b], -cospi[a]);
+    for (int k = 0; k < 2; ++k) {
+      btf_16_w16_avx2(w_ma_pb, w_pb_pa, &x[lo + k], &x[hi - k], _r, cos_bit);
+    }
+    for (int k = 2; k < 4; ++k) {
+      btf_16_w16_avx2(w_mb_ma, w_ma_pb, &x[lo + k], &x[hi - k], _r, cos_bit);
+    }
+  }
+}
+
+// Stage 6 of the 64-point inverse DCT for lanes x[16..63].
+// Runs btf_16_adds_subs_avx2 over the two blocks of eight lanes in
+// x[16..31], then delegates x[32..63] to idct64_stage6_high32_avx2.
+static INLINE void idct64_stage6_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  // Within each block of eight: (0,3) (1,2) (7,4) (6,5).
+  for (int base = 16; base < 32; base += 8) {
+    btf_16_adds_subs_avx2(&x[base + 0], &x[base + 3]);
+    btf_16_adds_subs_avx2(&x[base + 1], &x[base + 2]);
+    btf_16_adds_subs_avx2(&x[base + 7], &x[base + 4]);
+    btf_16_adds_subs_avx2(&x[base + 6], &x[base + 5]);
+  }
+  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+}
+
+// Stage 7 of the 64-point inverse DCT for lanes x[16..63].
+// Applies btf_16_w16_avx2 to x[18..21] paired with x[29..26] (table angles
+// 16/48), then btf_16_adds_subs_avx2 across each block of sixteen lanes in
+// x[32..63]. Lanes x[16], x[17], x[22..25], x[30], x[31] are not touched.
+static INLINE void idct64_stage7_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  for (int k = 0; k < 2; ++k) {
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[18 + k], &x[29 - k], _r, cos_bit);
+  }
+  for (int k = 0; k < 2; ++k) {
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[20 + k], &x[27 - k], _r, cos_bit);
+  }
+  for (int base = 32; base < 64; base += 16) {
+    // Lower eight lanes of the block: ascending lane is the first argument.
+    for (int k = 0; k < 4; ++k) {
+      btf_16_adds_subs_avx2(&x[base + k], &x[base + 7 - k]);
+    }
+    // Upper eight lanes of the block: descending lane is the first argument.
+    for (int k = 0; k < 4; ++k) {
+      btf_16_adds_subs_avx2(&x[base + 15 - k], &x[base + 8 + k]);
+    }
+  }
+}
+
+// Stage 8 of the 64-point inverse DCT for lanes x[16..63].
+// Runs btf_16_adds_subs_avx2 across x[16..31], then btf_16_w16_avx2 (table
+// angles 16/48) on x[36..43] paired with x[59..52]. Lanes x[32..35],
+// x[44..51] and x[60..63] are not touched.
+static INLINE void idct64_stage8_high48_avx2(__m256i *x, const int32_t *cospi,
+                                             const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i w_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i w_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i w_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+  for (int k = 0; k < 4; ++k) {
+    btf_16_adds_subs_avx2(&x[16 + k], &x[23 - k]);
+  }
+  for (int k = 0; k < 4; ++k) {
+    btf_16_adds_subs_avx2(&x[31 - k], &x[24 + k]);
+  }
+  for (int k = 0; k < 4; ++k) {
+    btf_16_w16_avx2(w_m16_p48, w_p48_p16, &x[36 + k], &x[59 - k], _r, cos_bit);
+  }
+  for (int k = 0; k < 4; ++k) {
+    btf_16_w16_avx2(w_m48_m16, w_m16_p48, &x[40 + k], &x[55 - k], _r, cos_bit);
+  }
+}
+
+// Stage 9 of the 64-point inverse DCT.
+// btf_16_adds_subs_avx2 across x[0..15] and across each half of x[32..63];
+// btf_16_w16_avx2 (table angle 32) on x[20..23] paired with x[27..24].
+// Lanes x[16..19] and x[28..31] are not touched.
+static INLINE void idct64_stage9_avx2(__m256i *x, const int32_t *cospi,
+                                      const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  for (int k = 0; k < 8; ++k) {
+    btf_16_adds_subs_avx2(&x[k], &x[15 - k]);
+  }
+  for (int k = 0; k < 4; ++k) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[20 + k], &x[27 - k], _r, cos_bit);
+  }
+  for (int k = 0; k < 8; ++k) {
+    btf_16_adds_subs_avx2(&x[32 + k], &x[47 - k]);
+  }
+  for (int k = 0; k < 8; ++k) {
+    btf_16_adds_subs_avx2(&x[63 - k], &x[48 + k]);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT.
+// btf_16_adds_subs_avx2 across x[0..31] (lane k with lane 31 - k), then
+// btf_16_w16_avx2 (table angle 32) on x[40..47] paired with x[55..48].
+// Lanes x[32..39] and x[56..63] are not touched.
+static INLINE void idct64_stage10_avx2(__m256i *x, const int32_t *cospi,
+                                       const __m256i _r, int8_t cos_bit) {
+  (void)cos_bit;
+  const __m256i w_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+  const __m256i w_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  for (int k = 0; k < 16; ++k) {
+    btf_16_adds_subs_avx2(&x[k], &x[31 - k]);
+  }
+  for (int k = 0; k < 8; ++k) {
+    btf_16_w16_avx2(w_m32_p32, w_p32_p32, &x[40 + k], &x[55 - k], _r, cos_bit);
+  }
+}
+
+// Final stage of the 64-point inverse DCT: for k = 0..31 combines x[k] with
+// x[63 - k] through btf_16_adds_subs_out_avx2 and stores the two results in
+// output[k] and output[63 - k]. x itself is only read.
+static INLINE void idct64_stage11_avx2(__m256i *output, __m256i *x) {
+  for (int k = 0; k < 32; ++k) {
+    const int mirror = 63 - k;
+    btf_16_adds_subs_out_avx2(&output[k], &output[mirror], x[k], x[mirror]);
+  }
+}
+
+// 64-point inverse DCT for the case where only input[0] is read.
+// input[0] is scaled once through btf_16_w16_0_avx2 (both weights are
+// cospi[32]) and the two resulting vectors are fanned out to all 64 outputs.
+// cos_bit is unused; the cosine table always comes from
+// cospi_arr(INV_COS_BIT).
+static void idct64_low1_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+  // stage 1 ~ 6: only lane 0 carries data; stage 6 produces lanes 0 and 1.
+  __m256i dc[2];
+  dc[0] = input[0];
+  btf_16_w16_0_avx2(cospi[32], cospi[32], dc[0], dc[0], dc[1]);
+
+  // stage 7 ~ 11: output[k] and output[63 - k] take dc[1] when k % 4 is 1 or
+  // 2, and dc[0] otherwise.
+  for (int k = 0; k < 32; ++k) {
+    const int r = k & 3;
+    const __m256i v = dc[r == 1 || r == 2];
+    output[k] = v;
+    output[63 - k] = v;
+  }
+}
+
+// 64-point inverse DCT for the case where only input[0..7] are read; no
+// element of input beyond index 7 is accessed. The 64 results are written to
+// output[0..63] by idct64_stage11_avx2.
+//
+// cos_bit is cast to void but still forwarded to the stage helpers; the
+// cosine table is always taken from cospi_arr(INV_COS_BIT).
+//
+// Slots of x that the early stages never compute are filled by plain copies
+// from a neighbouring slot before the shared stage helpers run.
+static void idct64_low8_new_avx2(const __m256i *input, __m256i *output,
+                                 int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  // Rounding offset handed to every btf_16_w16_avx2 call.
+  const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1));
+  const __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+  const __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+  const __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+  const __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+  const __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+  const __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+  const __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+  const __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+  const __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+  const __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+  const __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+  const __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+  const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+  const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+  const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+  const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+  // stage 1: scatter the eight inputs into every eighth slot of x.
+  __m256i x[64];
+  x[0] = input[0];
+  x[8] = input[4];
+  x[16] = input[2];
+  x[24] = input[6];
+  x[32] = input[1];
+  x[40] = input[5];
+  x[48] = input[3];
+  x[56] = input[7];
+
+  // stage 2: each call reads one populated slot of x[32..63] and writes two.
+  btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+  btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+  btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+  btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+  // stage 3: same expansion for x[16..31], then neighbour copies in x[32..63].
+  btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+  btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+  x[33] = x[32];
+  x[38] = x[39];
+  x[41] = x[40];
+  x[46] = x[47];
+  x[49] = x[48];
+  x[54] = x[55];
+  x[57] = x[56];
+  x[62] = x[63];
+
+  // stage 4
+  btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+  x[17] = x[16];
+  x[22] = x[23];
+  x[25] = x[24];
+  x[30] = x[31];
+  btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x[33], &x[62], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x[38], &x[57], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x[41], &x[54], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x[46], &x[49], _r, cos_bit);
+
+  // stage 5
+  x[9] = x[8];
+  x[14] = x[15];
+  btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x[17], &x[30], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x[22], &x[25], _r, cos_bit);
+  x[35] = x[32];
+  x[34] = x[33];
+  x[36] = x[39];
+  x[37] = x[38];
+  x[43] = x[40];
+  x[42] = x[41];
+  x[44] = x[47];
+  x[45] = x[46];
+  x[51] = x[48];
+  x[50] = x[49];
+  x[52] = x[55];
+  x[53] = x[54];
+  x[59] = x[56];
+  x[58] = x[57];
+  x[60] = x[63];
+  x[61] = x[62];
+
+  // stage 6
+  btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+  btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+  x[19] = x[16];
+  x[18] = x[17];
+  x[20] = x[23];
+  x[21] = x[22];
+  x[27] = x[24];
+  x[26] = x[25];
+  x[28] = x[31];
+  x[29] = x[30];
+  idct64_stage6_high32_avx2(x, cospi, _r, cos_bit);
+
+  // stage 7
+  x[3] = x[0];
+  x[2] = x[1];
+  x[11] = x[8];
+  x[10] = x[9];
+  x[12] = x[15];
+  x[13] = x[14];
+  idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+  // stage 8: x[8], x[9], x[14] and x[15] pass through this stage unchanged.
+  x[7] = x[0];
+  x[6] = x[1];
+  x[5] = x[2];
+  x[4] = x[3];
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+  btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+  idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+  // stage 9 ~ 11: shared helpers operating on all of x.
+  idct64_stage9_avx2(x, cospi, _r, cos_bit);
+  idct64_stage10_avx2(x, cospi, _r, cos_bit);
+  idct64_stage11_avx2(output, x);
+}
+
+static void idct64_low16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) { /* 64-point inverse DCT variant that reads only
+ * input[0..15]; no element of input beyond index 15 is accessed in this
+ * function. The 64 results are stored to output[0..63] by
+ * idct64_stage11_avx2 at the end. cos_bit is cast to void but is still
+ * forwarded to the stage helpers; the cosine table itself is always taken
+ * from cospi_arr(INV_COS_BIT). */
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // rounding offset passed to btf_16_w16_avx2
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the 16 inputs into every fourth slot of x; the other slots are filled by later stages
+ __m256i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2: each call reads one populated slot of x[32..63] and writes two slots (one of them the slot just read)
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same expansion for x[16..31], then copy each computed x[32..63] value into its empty neighbour slot
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32]; // NOTE(review): idct64_low32_new_avx2 uses btf_16_adds_subs_avx2 on these pairs; the copies presumably stand in for that call when one operand is zero - confirm
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4: expand x[8] and x[12] into x[8..15], neighbour copies in x[16..31], shared helper for x[32..63]
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5: expand x[4] into x[4] and x[7], neighbour copies in x[8..15], shared helper for x[16..63]
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6: expand x[0] into x[0] and x[1], neighbour copies in x[4..7], two rotations in x[8..15], shared helper for x[16..63]
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 7: copies fill x[2] and x[3]; from here on every slot of x[0..15] holds a value
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 8: x[0..7] and x[10..13] are updated here; x[16..63] go through the shared helper
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ idct64_stage9_avx2(x, cospi, _r, cos_bit); // stages 9 and 10 update x in place
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
+ idct64_stage11_avx2(output, x); // stage 11 writes output[0..63]
+}
+
+static void idct64_low32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) { /* 64-point inverse DCT variant that reads only
+ * input[0..31]; no element of input beyond index 31 is accessed in this
+ * function. The 64 results are stored to output[0..63] by
+ * idct64_stage11_avx2 at the end. cos_bit is cast to void but is still
+ * forwarded to the stage helpers; the cosine table itself is always taken
+ * from cospi_arr(INV_COS_BIT). */
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m256i _r = _mm256_set1_epi32(1 << (INV_COS_BIT - 1)); // rounding offset passed to btf_16_w16_avx2
+
+ const __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ const __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ const __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ const __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ const __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the 32 inputs into the even slots of x; odd slots are produced by later stages
+ __m256i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2: each call reads one even slot of x[32..63] and writes two slots; afterwards all of x[32..63] is populated
+ btf_16_w16_0_avx2(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_w16_0_avx2(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_w16_0_avx2(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_w16_0_avx2(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_w16_0_avx2(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_w16_0_avx2(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_w16_0_avx2(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_w16_0_avx2(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_w16_0_avx2(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_w16_0_avx2(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_w16_0_avx2(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_w16_0_avx2(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_w16_0_avx2(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_w16_0_avx2(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_w16_0_avx2(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_w16_0_avx2(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same expansion for x[16..31], then btf_16_adds_subs_avx2 on adjacent pairs of x[32..63]
+ btf_16_w16_0_avx2(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_w16_0_avx2(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_w16_0_avx2(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_w16_0_avx2(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_w16_0_avx2(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_w16_0_avx2(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_w16_0_avx2(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_w16_0_avx2(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_avx2(&x[32], &x[33]);
+ btf_16_adds_subs_avx2(&x[35], &x[34]);
+ btf_16_adds_subs_avx2(&x[36], &x[37]);
+ btf_16_adds_subs_avx2(&x[39], &x[38]);
+ btf_16_adds_subs_avx2(&x[40], &x[41]);
+ btf_16_adds_subs_avx2(&x[43], &x[42]);
+ btf_16_adds_subs_avx2(&x[44], &x[45]);
+ btf_16_adds_subs_avx2(&x[47], &x[46]);
+ btf_16_adds_subs_avx2(&x[48], &x[49]);
+ btf_16_adds_subs_avx2(&x[51], &x[50]);
+ btf_16_adds_subs_avx2(&x[52], &x[53]);
+ btf_16_adds_subs_avx2(&x[55], &x[54]);
+ btf_16_adds_subs_avx2(&x[56], &x[57]);
+ btf_16_adds_subs_avx2(&x[59], &x[58]);
+ btf_16_adds_subs_avx2(&x[60], &x[61]);
+ btf_16_adds_subs_avx2(&x[63], &x[62]);
+
+ // stage 4: expand the even slots of x[8..15], adjacent-pair adds/subs in x[16..31], shared helper for x[32..63]
+ btf_16_w16_0_avx2(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_w16_0_avx2(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_w16_0_avx2(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_w16_0_avx2(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_avx2(&x[16], &x[17]);
+ btf_16_adds_subs_avx2(&x[19], &x[18]);
+ btf_16_adds_subs_avx2(&x[20], &x[21]);
+ btf_16_adds_subs_avx2(&x[23], &x[22]);
+ btf_16_adds_subs_avx2(&x[24], &x[25]);
+ btf_16_adds_subs_avx2(&x[27], &x[26]);
+ btf_16_adds_subs_avx2(&x[28], &x[29]);
+ btf_16_adds_subs_avx2(&x[31], &x[30]);
+ idct64_stage4_high32_avx2(x, cospi, _r, cos_bit);
+
+ // stage 5: expand x[4] and x[6] into x[4..7], adjacent-pair adds/subs in x[8..15], shared helper for x[16..63]
+ btf_16_w16_0_avx2(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_w16_0_avx2(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_avx2(&x[8], &x[9]);
+ btf_16_adds_subs_avx2(&x[11], &x[10]);
+ btf_16_adds_subs_avx2(&x[12], &x[13]);
+ btf_16_adds_subs_avx2(&x[15], &x[14]);
+ idct64_stage5_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 6: expand x[0] and x[2] into x[0..3]; after this every slot of x holds a value
+ btf_16_w16_0_avx2(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_w16_0_avx2(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_avx2(&x[4], &x[5]);
+ btf_16_adds_subs_avx2(&x[7], &x[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x[9], &x[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x[10], &x[13], _r, cos_bit);
+ idct64_stage6_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 7: x[0..3], x[5], x[6] and x[8..15] are updated here; x[16..63] go through the shared helper
+ btf_16_adds_subs_avx2(&x[0], &x[3]);
+ btf_16_adds_subs_avx2(&x[1], &x[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[5], &x[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x[8], &x[11]);
+ btf_16_adds_subs_avx2(&x[9], &x[10]);
+ btf_16_adds_subs_avx2(&x[15], &x[12]);
+ btf_16_adds_subs_avx2(&x[14], &x[13]);
+ idct64_stage7_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 8: x[0..7] and x[10..13] are updated here; x[16..63] go through the shared helper
+ btf_16_adds_subs_avx2(&x[0], &x[7]);
+ btf_16_adds_subs_avx2(&x[1], &x[6]);
+ btf_16_adds_subs_avx2(&x[2], &x[5]);
+ btf_16_adds_subs_avx2(&x[3], &x[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[10], &x[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x[11], &x[12], _r, cos_bit);
+ idct64_stage8_high48_avx2(x, cospi, _r, cos_bit);
+
+ // stage 9~11: stages 9 and 10 update x in place; stage 11 writes output[0..63]
+ idct64_stage9_avx2(x, cospi, _r, cos_bit);
+ idct64_stage10_avx2(x, cospi, _r, cos_bit);
+ idct64_stage11_avx2(output, x);
+}
+
+/* 1D functions process 16 pixels at one time.
+ * Lookup order is [size index][1-D transform type][variant].
+ * lowbd_inv_txfm2d_add_no_identity_avx2 indexes it with
+ * get_txw_idx()/get_txh_idx(), hitx_1d_tab[]/vitx_1d_tab[] and
+ * lowbd_txfm_all_1d_zeros_idx[eobx or eoby] respectively, and asserts that
+ * the selected entry is not NULL. NULL marks a combination with no kernel
+ * in this table. */
+static const transform_1d_avx2
+ lowbd_txfm_all_1d_zeros_w16_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ { // size index 0: no kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { NULL, NULL, NULL, NULL }, // size index 1: no kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { // size index 2: 16-point kernels (low1, low8, full); fourth variant unused
+ { idct16_low1_new_avx2, idct16_low8_new_avx2, idct16_new_avx2, NULL },
+ { iadst16_low1_new_avx2, iadst16_low8_new_avx2, iadst16_new_avx2,
+ NULL },
+ { NULL, NULL, NULL, NULL }, // third 1-D type: none here (presumably identity, handled elsewhere - confirm)
+ },
+ { { idct32_low1_new_avx2, idct32_low8_new_avx2, idct32_low16_new_avx2, // size index 3: 32-point DCT only
+ idct32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_avx2, idct64_low8_new_avx2, idct64_low16_new_avx2, // size index 4: 64-point DCT only; the widest variant reads input[0..31]
+ idct64_low32_new_avx2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+// Inverse 2-D transform for blocks whose row and column kernels both come
+// from lowbd_txfm_all_1d_zeros_w16_arr; the result is handed to
+// lowbd_write_buffer_16xn_avx2 together with output and stride.
+// only process w >= 16 h >= 16
+//
+// The block is handled in 16x16 tiles: a row pass over the strips of 16 rows
+// selected by eoby, then a column pass over every strip of 16 columns.
+static INLINE void lowbd_inv_txfm2d_add_no_identity_avx2(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  __m256i col_buf[64 * 16];
+  int eobx, eoby;
+  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+
+  // Strip counts, in units of 16 lanes.
+  const int strips_w = width >> 4;
+  const int nz_strips_w = (eobx + 16) >> 4;
+  const int nz_strips_h = (eoby + 16) >> 4;
+  const int input_stride = AOMMIN(32, width);
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+  const int needs_rect_scale = (rect_type == 1 || rect_type == -1);
+
+  // Pick the 1-D kernels from the eob-derived variant indices.
+  const int row_variant = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const int col_variant = lowbd_txfm_all_1d_zeros_idx[eoby];
+  const transform_1d_avx2 row_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]]
+                                     [row_variant];
+  const transform_1d_avx2 col_txfm =
+      lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]]
+                                     [col_variant];
+
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Row pass: one strip of 16 input rows per iteration.
+  for (int i = 0; i < nz_strips_h; i++) {
+    __m256i row_buf[64];
+    const int32_t *src_row = input + (i << 4) * input_stride;
+    for (int j = 0; j < nz_strips_w; ++j) {
+      __m256i *tile = row_buf + j * 16;
+      load_buffer_32bit_to_16bit_w16_avx2(src_row + j * 16, input_stride, tile,
+                                          16);
+      transpose_16bit_16x16_avx2(tile, tile);
+    }
+    if (needs_rect_scale) {
+      round_shift_avx2(row_buf, row_buf, input_stride);  // rect special code
+    }
+    row_txfm(row_buf, row_buf, cos_bit_row);
+    round_shift_16bit_w16_avx2(row_buf, width, shift[0]);
+
+    // Transpose each 16x16 tile into the column buffer; with lr_flip the
+    // tile is flipped first and stored at the mirrored strip position.
+    __m256i *dst = col_buf + (i << 4);
+    for (int j = 0; j < strips_w; ++j) {
+      if (lr_flip) {
+        __m256i flipped[16];
+        flip_buf_avx2(row_buf + 16 * j, flipped, 16);
+        const int offset = height * (strips_w - 1 - j);
+        transpose_16bit_16x16_avx2(flipped, dst + offset);
+      } else {
+        transpose_16bit_16x16_avx2(row_buf + 16 * j, dst + height * j);
+      }
+    }
+  }
+
+  // Column pass: transform every strip of 16 columns in place.
+  for (int i = 0; i < strips_w; i++) {
+    __m256i *strip = col_buf + i * height;
+    col_txfm(strip, strip, cos_bit_col);
+    round_shift_16bit_w16_avx2(strip, height, shift[1]);
+  }
+
+  // Hand each strip to the write-out helper, 16 output columns per strip.
+  for (int i = 0; i < strips_w; i++) {
+    lowbd_write_buffer_16xn_avx2(col_buf + i * height, output + 16 * i, stride,
+                                 ud_flip, height);
+  }
+}
+ /* Identity "row transform" for one strip of 16 columns.
+  *
+  * For each of `height` rows: loads 16 coefficients with
+  * load_32bit_to_16bit_w16_avx2(), computes
+  *   (src * NewSqrt2list[txw_idx] + _r) >> (NewSqrt2Bits - shift)
+  * in 32-bit lanes, and stores the result packed back to 16 bits with signed
+  * saturation in out[i]. When rect_type is 1 or -1 the source is first
+  * multiplied by NewInvSqrt2 through a rounding high multiply.
+  *
+  * out        destination, at least `height` vectors.
+  * input      first coefficient of the strip; advanced by `stride` per row.
+  * shift      folded into the final right shift and the rounding constant.
+  *            NOTE(review): the callers shown here pass shift[0] from
+  *            inv_txfm_shift_ls; the constants look written for shift <= 0 --
+  *            confirm.
+  */
+ static INLINE void iidentity_row_16xn_avx2(__m256i *out, const int32_t *input,
+ int stride, int shift, int height,
+ int txw_idx, int rect_type) {
+ const int32_t *input_row = input;
+ const __m256i scale = _mm256_set1_epi16(NewSqrt2list[txw_idx]);
+ const __m256i _r = _mm256_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+ (1 << (NewSqrt2Bits - shift - 1)));
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i scale__r = _mm256_unpacklo_epi16(scale, _r); // (scale, _r) pairs to match the (src, 1) pairs fed to madd
+ if (rect_type != 1 && rect_type != -1) {
+ for (int i = 0; i < height; ++i) {
+ const __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r); // src * scale + _r per 32-bit lane
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ } else {
+ const __m256i rect_scale =
+ _mm256_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+ for (int i = 0; i < height; ++i) {
+ __m256i src = load_32bit_to_16bit_w16_avx2(input_row);
+ src = _mm256_mulhrs_epi16(src, rect_scale); // extra NewInvSqrt2 scaling for the rect_type == +/-1 case
+ input_row += stride;
+ __m256i lo = _mm256_unpacklo_epi16(src, one);
+ __m256i hi = _mm256_unpackhi_epi16(src, one);
+ lo = _mm256_madd_epi16(lo, scale__r);
+ hi = _mm256_madd_epi16(hi, scale__r);
+ lo = _mm256_srai_epi32(lo, NewSqrt2Bits - shift);
+ hi = _mm256_srai_epi32(hi, NewSqrt2Bits - shift);
+ out[i] = _mm256_packs_epi32(lo, hi);
+ }
+ }
+ }
+
+ // Identity "column transform" plus reconstruction for a strip of 16 columns.
+ // Every vector in buf[0..height) is scaled by NewSqrt2list[txh_idx] with
+ // rounding, shifted right by NewSqrt2Bits, rounded and shifted right again by
+ // -shift, packed to 16 bits with saturation and handed to
+ // write_recon_w16_avx2() for the matching destination row.
+ static INLINE void iidentity_col_16xn_avx2(uint8_t *output, int stride,
+                                            __m256i *buf, int shift, int height,
+                                            int txh_idx) {
+   const int down_shift = -shift;
+   const __m256i ones = _mm256_set1_epi16(1);
+   const __m256i gain = _mm256_set1_epi16(NewSqrt2list[txh_idx]);
+   const __m256i gain_bias = _mm256_set1_epi16(1 << (NewSqrt2Bits - 1));
+   // (gain, bias) pairs line up with the interleaved (coeff, 1) pairs below,
+   // so a single madd yields coeff * gain + bias in each 32-bit lane.
+   const __m256i gain_and_bias = _mm256_unpacklo_epi16(gain, gain_bias);
+   const __m256i shift_bias = _mm256_set1_epi32(1 << (down_shift - 1));
+
+   uint8_t *dst = output;
+   for (int row = 0; row < height; ++row, dst += stride) {
+     const __m256i coeff = buf[row];
+     __m256i part[2];
+     part[0] = _mm256_unpacklo_epi16(coeff, ones);
+     part[1] = _mm256_unpackhi_epi16(coeff, ones);
+     for (int p = 0; p < 2; ++p) {
+       __m256i v = _mm256_madd_epi16(part[p], gain_and_bias);
+       v = _mm256_srai_epi32(v, NewSqrt2Bits);
+       v = _mm256_add_epi32(v, shift_bias);
+       part[p] = _mm256_srai_epi32(v, down_shift);
+     }
+     write_recon_w16_avx2(_mm256_packs_epi32(part[0], part[1]), dst);
+   }
+ }
+
+ // IDTX (identity in both directions): walks the block in strips of 16
+ // columns, running the identity row pass into a scratch buffer and then the
+ // identity column pass, which also writes to the destination. At most 32
+ // columns and 32 rows are processed. eob is not used.
+ static INLINE void lowbd_inv_txfm2d_add_idtx_avx2(const int32_t *input,
+                                                   uint8_t *output, int stride,
+                                                   TX_SIZE tx_size,
+                                                   int32_t eob) {
+   (void)eob;
+   const int width = tx_size_wide[tx_size];
+   const int height = tx_size_high[tx_size];
+   const int coded_width = AOMMIN(32, width);  // also the input row stride
+   const int coded_height = AOMMIN(32, height);
+   const int rect_type = get_rect_tx_log_ratio(width, height);
+   const int8_t *const shift = inv_txfm_shift_ls[tx_size];
+   const int txw_idx = get_txw_idx(tx_size);
+   const int txh_idx = get_txh_idx(tx_size);
+
+   __m256i strip[32];
+   int x0 = 0;
+   while (x0 < coded_width) {
+     iidentity_row_16xn_avx2(strip, input + x0, coded_width, shift[0],
+                             coded_height, txw_idx, rect_type);
+     iidentity_col_16xn_avx2(output + x0, stride, strip, shift[1],
+                             coded_height, txh_idx);
+     x0 += 16;
+   }
+ }
+
+ // Horizontal identity + vertical 1-D transform. For every strip of 16
+ // columns that can hold non-zero coefficients (derived from eobx) the
+ // identity row pass fills a scratch buffer with eoby + 1 rows, the column
+ // kernel selected by tx_type/eoby transforms it in place, and each result
+ // row is scaled by 2^shift[1] (rounding high multiply) and added to the
+ // destination, bottom-up when ud_flip is set.
+ static INLINE void lowbd_inv_txfm2d_add_h_identity_avx2(
+     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+     TX_SIZE tx_size, int eob) {
+   int eobx, eoby;
+   get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+
+   const int width = tx_size_wide[tx_size];
+   const int height = tx_size_high[tx_size];
+   const int txw_idx = get_txw_idx(tx_size);
+   const int txh_idx = get_txh_idx(tx_size);
+   const int8_t *const shift = inv_txfm_shift_ls[tx_size];
+   const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+   const int in_stride = AOMMIN(32, width);
+   const int strips = (eobx + 16) >> 4;
+   const int rect_type = get_rect_tx_log_ratio(width, height);
+
+   const transform_1d_avx2 col_txfm =
+       lowbd_txfm_all_1d_zeros_w16_arr[txh_idx][vitx_1d_tab[tx_type]]
+                                      [lowbd_txfm_all_1d_zeros_idx[eoby]];
+   assert(col_txfm != NULL);
+
+   int ud_flip, lr_flip;
+   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+   const __m256i post_scale = _mm256_set1_epi16(1 << (15 + shift[1]));
+   for (int s = 0; s < strips; ++s) {
+     const int x0 = s << 4;
+     __m256i strip[64];
+     iidentity_row_16xn_avx2(strip, input + x0, in_stride, shift[0], eoby + 1,
+                             txw_idx, rect_type);
+     col_txfm(strip, strip, cos_bit_col);
+
+     uint8_t *dst = output + x0;
+     for (int y = 0; y < height; ++y, dst += stride) {
+       const int src_row = ud_flip ? (height - 1 - y) : y;
+       write_recon_w16_avx2(_mm256_mulhrs_epi16(strip[src_row], post_scale),
+                            dst);
+     }
+   }
+ }
+ /* Vertical identity + horizontal 1-D transform, 16-bit AVX2 path.
+  *
+  * Works on bands of 16 rows; the number of bands comes from eoby. Each band
+  * is loaded and transposed tile by tile, optionally scaled for rect_type
+  * +/-1, run through the row kernel selected by tx_type/eobx, rounded by
+  * shift[0], transposed back into buf1 (strip order reversed and tiles
+  * flipped when lr_flip is set) and finally passed through
+  * iidentity_col_16xn_avx2(), which also writes to `output`.
+  *
+  * NOTE(review): ud_flip is fetched but not used in the visible body.
+  */
+ static INLINE void lowbd_inv_txfm2d_add_v_identity_avx2(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m256i buf1[64]; // one band: 16 vectors per 16-column strip
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div16 = txfm_size_col >> 4; // number of 16-column strips
+ const int buf_size_h_div16 = (eoby + 16) >> 4; // 16-row bands covering rows 0..eoby
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_avx2 row_txfm =
+ lowbd_txfm_all_1d_zeros_w16_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+
+ assert(row_txfm != NULL);
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div16; i++) {
+ __m256i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 16;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div16); ++j) { // load and transpose each 16x16 tile in place
+ __m256i *buf0_cur = buf0 + j * 16;
+ load_buffer_32bit_to_16bit_w16_avx2(input_row + j * 16, input_stride,
+ buf0_cur, 16);
+ transpose_16bit_16x16_avx2(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_avx2(buf0, buf0, input_stride); // rect special code: scale the first input_stride vectors by NewInvSqrt2
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf0, txfm_size_col, shift[0]);
+ __m256i *_buf1 = buf1; // plain alias of buf1
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div16; ++j) { // flip each tile and reverse the strip order
+ __m256i temp[16];
+ flip_buf_avx2(buf0 + 16 * j, temp, 16);
+ transpose_16bit_16x16_avx2(temp,
+ _buf1 + 16 * (buf_size_w_div16 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + 16 * j, _buf1 + 16 * j);
+ }
+ }
+ for (int j = 0; j < buf_size_w_div16; ++j) { // identity column pass + write for rows 16*i .. 16*i+15
+ iidentity_col_16xn_avx2(output + i * 16 * stride + j * 16, stride,
+ buf1 + j * 16, shift[1], 16, txh_idx);
+ }
+ }
+ }
+
+ // Picks the AVX2 kernel that matches the identity structure of tx_type and
+ // forwards every argument to it unchanged.
+ // Used for 16x16, 32x32, 32x64, 64x32, 64x64, 16x32, 32x16, 64x16 and 16x64
+ // (the sizes av1_lowbd_inv_txfm2d_add_avx2() routes here).
+ //   input    dequantized coefficients
+ //   output   8-bit destination, `stride` bytes per row
+ //   tx_type  2-D transform type, decides the kernel
+ //   tx_size  transform size
+ //   eob      end-of-block position, consumed by every kernel below
+ static INLINE void lowbd_inv_txfm2d_add_universe_avx2(
+     const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+     TX_SIZE tx_size, int eob) {
+   switch (tx_type) {
+     case DCT_DCT:
+     case ADST_DCT:   // ADST in vertical, DCT in horizontal
+     case DCT_ADST:   // DCT in vertical, ADST in horizontal
+     case ADST_ADST:  // ADST in both directions
+     case FLIPADST_DCT:
+     case DCT_FLIPADST:
+     case FLIPADST_FLIPADST:
+     case ADST_FLIPADST:
+     case FLIPADST_ADST:
+       lowbd_inv_txfm2d_add_no_identity_avx2(input, output, stride, tx_type,
+                                             tx_size, eob);
+       break;
+     case IDTX:
+       lowbd_inv_txfm2d_add_idtx_avx2(input, output, stride, tx_size, eob);
+       break;
+     case V_DCT:
+     case V_ADST:
+     case V_FLIPADST:
+       // Vertical transform only: the horizontal pass is the identity.
+       lowbd_inv_txfm2d_add_h_identity_avx2(input, output, stride, tx_type,
+                                            tx_size, eob);
+       break;
+     case H_DCT:
+     case H_ADST:
+     case H_FLIPADST:
+       // Horizontal transform only: the vertical pass is the identity.
+       lowbd_inv_txfm2d_add_v_identity_avx2(input, output, stride, tx_type,
+                                            tx_size, eob);
+       break;
+     default:
+       // Any tx_type not listed above goes to the SSSE3 implementation.
+       av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+       break;
+   }
+ }
+
+ // Low-bitdepth 2-D inverse transform + reconstruction, AVX2 entry point.
+ // The transform sizes listed in the switch are delegated to the SSSE3
+ // implementation; every other size is handled by the AVX2 kernels behind
+ // lowbd_inv_txfm2d_add_universe_avx2().
+ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+                                    int eob) {
+   switch (tx_size) {
+     case TX_4X4:
+     case TX_8X8:
+     case TX_4X8:
+     case TX_8X4:
+     case TX_8X16:
+     case TX_16X8:
+     case TX_4X16:
+     case TX_16X4:
+     case TX_8X32:
+     case TX_32X8:
+       av1_lowbd_inv_txfm2d_add_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+       return;
+     default:
+       // TX_16X16, TX_32X32, TX_64X64, TX_16X32, TX_32X16, TX_32X64,
+       // TX_64X32, TX_16X64, TX_64X16 and anything else.
+       break;
+   }
+   lowbd_inv_txfm2d_add_universe_avx2(input, output, stride, tx_type, tx_size,
+                                      eob);
+ }
+
+ // Inverse transform + add driven by a TxfmParam. Lossless blocks are handed
+ // to the C implementation; everything else takes the AVX2 low-bitdepth path
+ // with the tx_type, tx_size and eob stored in txfm_param.
+ void av1_inv_txfm_add_avx2(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                            const TxfmParam *txfm_param) {
+   if (txfm_param->lossless) {
+     av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+     return;
+   }
+   const TX_TYPE tx_type = txfm_param->tx_type;
+   av1_lowbd_inv_txfm2d_add_avx2(dqcoeff, dst, stride, tx_type,
+                                 txfm_param->tx_size, txfm_param->eob);
+ }
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
new file mode 100644
index 0000000000..f74cbaeaa5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_avx2.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ // Butterfly for the case where half of the input is zero: both outputs are
+ // derived from the single vector `in`, as rounding high multiplies
+ // (_mm256_mulhrs_epi16) by w0 * 8 and w1 * 8 respectively.
+ // Every macro argument is parenthesized so that compound expressions
+ // (e.g. a sum or a negated term) expand with the intended precedence.
+ // `in` is copied before the outputs are written, so out0/out1 may alias it.
+ #define btf_16_w16_0_avx2(w0, w1, in, out0, out1) \
+ { \
+ const __m256i _w0 = _mm256_set1_epi16((w0) * 8); \
+ const __m256i _w1 = _mm256_set1_epi16((w1) * 8); \
+ const __m256i _in = (in); \
+ (out0) = _mm256_mulhrs_epi16(_in, _w0); \
+ (out1) = _mm256_mulhrs_epi16(_in, _w1); \
+ }
+
+ // Multiplies `size` vectors of 16-bit lanes by NewInvSqrt2 * 8 with a
+ // rounding high multiply (_mm256_mulhrs_epi16). Element i of `output` only
+ // depends on element i of `input`, so both may point to the same buffer.
+ static INLINE void round_shift_avx2(const __m256i *input, __m256i *output,
+                                     int size) {
+   const __m256i inv_sqrt2 = _mm256_set1_epi16(NewInvSqrt2 * 8);
+   const __m256i *src = input;
+   __m256i *dst = output;
+   for (int remaining = size; remaining > 0; --remaining) {
+     *dst++ = _mm256_mulhrs_epi16(*src++, inv_sqrt2);
+   }
+ }
+
+ // Adds 16 residuals (16-bit lanes of `res`) to the 16 bytes at `output` and
+ // stores the sums back, saturated to 0..255.
+ static INLINE void write_recon_w16_avx2(__m256i res, uint8_t *output) {
+   __m128i *const dst = (__m128i *)(output);
+   const __m128i pred8 = _mm_loadu_si128((__m128i const *)(output));
+   const __m256i sum16 = _mm256_adds_epi16(_mm256_cvtepu8_epi16(pred8), res);
+   // packus operates per 128-bit lane, leaving pixels 0-7 in 64-bit element 0
+   // and pixels 8-15 in element 2; selector 0xA8 (168) moves elements 0 and 2
+   // into the low 128 bits.
+   const __m256i packed = _mm256_packus_epi16(sum16, sum16);
+   const __m256i gathered = _mm256_permute4x64_epi64(packed, 0xA8);
+   _mm_storeu_si128(dst, _mm256_castsi256_si128(gathered));
+ }
+
+ // Reconstructs a 16-pixel-wide column of `height` rows: destination row r
+ // receives in[r], or in[height - 1 - r] when flipud is non-zero.
+ static INLINE void lowbd_write_buffer_16xn_avx2(__m256i *in, uint8_t *output,
+                                                 int stride, int flipud,
+                                                 int height) {
+   const int last = height - 1;
+   for (int row = 0; row < height; ++row) {
+     const int src = flipud ? (last - row) : row;
+     write_recon_w16_avx2(in[src], output + row * stride);
+   }
+ }
+ /* Low-bitdepth 2-D inverse transform + reconstruction, AVX2 entry point.
+  * Declared here; the definition is in the matching AVX2 .c file.
+  *   input    32-bit coefficients
+  *   output   8-bit destination, `stride` bytes per row
+  *   tx_type  2-D transform type
+  *   tx_size  transform size
+  *   eob      end-of-block position
+  */
+ void av1_lowbd_inv_txfm2d_add_avx2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
new file mode 100644
index 0000000000..995bc3da44
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -0,0 +1,2923 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+
+// TODO(venkatsanampudi@ittiam.com): move this to header file
+
+ // Sqrt2, Sqrt2^2, Sqrt2^3, Sqrt2^4, Sqrt2^5 in fixed point, with 4096
+ // standing for 1.0 (5793 ~= sqrt(2) * 4096). Looked up with a transform-size
+ // index (txw_idx / txh_idx). The table is only ever read, so it is const.
+ static const int32_t NewSqrt2list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793,
+                                                 4 * 4096, 4 * 5793 };
+
+// TODO(binpengsmail@gmail.com): replace some for loop with do {} while
+ /* 4-point inverse DCT butterfly network on vectors of 16-bit lanes.
+  *   input   4 vectors, read in the order 0, 2, 1, 3
+  *   output  4 vectors
+  *   cos_bit ignored; the cosine table is always taken for INV_COS_BIT
+  * NOTE(review): __rounding is never named in the statements below; it is
+  * presumably picked up by name inside the btf_16_sse2 macro -- confirm.
+  */
+ static void idct4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1: reorder the inputs
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2: rotations on the pairs (x0, x1) and (x2, x3)
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3: final add/sub butterflies written straight to output
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+ }
+ /* 4-point inverse DCT butterfly network; same stages and constants as
+  * idct4_new_sse2, but the rotations go through btf_16_4p_sse2.
+  * NOTE(review): judging by the "w4"/"4p" names this is the variant for
+  * vectors that carry only 4 valid lanes -- confirm against the macro.
+  *   cos_bit ignored; the cosine table is always taken for INV_COS_BIT
+  */
+ void idct4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1: reorder the inputs
+ __m128i x[4];
+ x[0] = input[0];
+ x[1] = input[2];
+ x[2] = input[1];
+ x[3] = input[3];
+
+ // stage 2: rotations on the pairs (x0, x1) and (x2, x3)
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+
+ // stage 3: final add/sub butterflies written straight to output
+ btf_16_adds_subs_out_sse2(output[0], output[3], x[0], x[3]);
+ btf_16_adds_subs_out_sse2(output[1], output[2], x[1], x[2]);
+ }
+
+ // 8-point inverse DCT for the case where only input[0] is used: a single
+ // btf_16_ssse3 step with cospi[32] for both weights produces two vectors,
+ // which are then replicated into the 8 outputs. cos_bit is ignored
+ // (INV_COS_BIT is always used).
+ void idct8_low1_new_ssse3(const __m128i *input, __m128i *output,
+                           int8_t cos_bit) {
+   (void)cos_bit;
+   const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+   // stage 1
+   __m128i dc[2];
+   dc[0] = input[0];
+
+   // stages 2-3
+   btf_16_ssse3(cospi[32], cospi[32], dc[0], dc[0], dc[1]);
+
+   // stages 4-5: which of the two vectors each output row receives
+   static const int8_t src_of_row[8] = { 0, 1, 1, 0, 0, 1, 1, 0 };
+   for (int row = 0; row < 8; ++row) {
+     output[row] = dc[src_of_row[row]];
+   }
+ }
+ /* 8-point inverse DCT butterfly network on vectors of 16-bit lanes.
+  *   input   8 vectors, read in the order 0, 4, 2, 6, 1, 5, 3, 7
+  *   output  8 vectors
+  *   cos_bit ignored; the cosine table is always taken for INV_COS_BIT
+  * NOTE(review): __rounding is never named in the statements below; it is
+  * presumably picked up by name inside the btf_16_sse2 macro -- confirm.
+  */
+ void idct8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: reorder the inputs
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2: rotations on the upper half (x4..x7)
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3: rotations on the lower half, add/sub on the upper half
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5: final add/sub butterflies written straight to output
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+ }
+ /* 8-point inverse DCT butterfly network; same stages and constants as
+  * idct8_new_sse2, but the rotations go through btf_16_4p_sse2.
+  * NOTE(review): judging by the "w4"/"4p" names this is the variant for
+  * vectors that carry only 4 valid lanes -- confirm against the macro.
+  *   input   8 vectors, read in the order 0, 4, 2, 6, 1, 5, 3, 7
+  *   cos_bit ignored; the cosine table is always taken for INV_COS_BIT
+  */
+ void idct8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: reorder the inputs
+ __m128i x[8];
+ x[0] = input[0];
+ x[1] = input[4];
+ x[2] = input[2];
+ x[3] = input[6];
+ x[4] = input[1];
+ x[5] = input[5];
+ x[6] = input[3];
+ x[7] = input[7];
+
+ // stage 2: rotations on the upper half (x4..x7)
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+
+ // stage 3: rotations on the lower half, add/sub on the upper half
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+
+ // stage 4
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+
+ // stage 5: final add/sub butterflies written straight to output
+ btf_16_adds_subs_out_sse2(output[0], output[7], x[0], x[7]);
+ btf_16_adds_subs_out_sse2(output[1], output[6], x[1], x[6]);
+ btf_16_adds_subs_out_sse2(output[2], output[5], x[2], x[5]);
+ btf_16_adds_subs_out_sse2(output[3], output[4], x[3], x[4]);
+ }
+ /* Stage 5 of the 16-point inverse DCT, applied in place to x[0..15]:
+  * add/sub butterflies on (0,3), (1,2), (8,11), (9,10), (15,12), (14,13)
+  * and a cospi[32] rotation on (5,6).
+  * NOTE(review): __rounding and cos_bit are not named in the statements
+  * below; presumably the btf_16_sse2 macro refers to them by name -- confirm
+  * before renaming or removing either parameter.
+  */
+ static INLINE void idct16_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ }
+ /* Stage 6 of the 16-point inverse DCT, applied in place to x[0..15]:
+  * add/sub butterflies on (0,7), (1,6), (2,5), (3,4) and cospi[32] rotations
+  * on (10,13) and (11,12).
+  * NOTE(review): __rounding and cos_bit are not named in the statements
+  * below; presumably the btf_16_sse2 macro refers to them by name -- confirm
+  * before renaming or removing either parameter.
+  */
+ static INLINE void idct16_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ }
+ /* Stage 7 (final) of the 16-point inverse DCT: for i = 0..7 the pair
+  * (x[i], x[15 - i]) is combined by btf_16_adds_subs_out_sse2 into
+  * output[i] and output[15 - i].
+  */
+ static INLINE void idct16_stage7_sse2(__m128i *output, __m128i *x) {
+ btf_16_adds_subs_out_sse2(output[0], output[15], x[0], x[15]);
+ btf_16_adds_subs_out_sse2(output[1], output[14], x[1], x[14]);
+ btf_16_adds_subs_out_sse2(output[2], output[13], x[2], x[13]);
+ btf_16_adds_subs_out_sse2(output[3], output[12], x[3], x[12]);
+ btf_16_adds_subs_out_sse2(output[4], output[11], x[4], x[11]);
+ btf_16_adds_subs_out_sse2(output[5], output[10], x[5], x[10]);
+ btf_16_adds_subs_out_sse2(output[6], output[9], x[6], x[9]);
+ btf_16_adds_subs_out_sse2(output[7], output[8], x[7], x[8]);
+ }
+
+ // 16-point inverse DCT for the case where only input[0] is used: a single
+ // btf_16_ssse3 step with cospi[32] for both weights produces two vectors,
+ // which are then replicated into the 16 outputs. cos_bit is ignored
+ // (INV_COS_BIT is always used).
+ static void idct16_low1_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+   (void)cos_bit;
+   const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+   // stage 1
+   __m128i dc[2];
+   dc[0] = input[0];
+
+   // stages 2-4
+   btf_16_ssse3(cospi[32], cospi[32], dc[0], dc[0], dc[1]);
+
+   // stages 5-7: which of the two vectors each output row receives
+   static const int8_t src_of_row[16] = { 0, 1, 1, 0, 0, 1, 1, 0,
+                                          0, 1, 1, 0, 0, 1, 1, 0 };
+   for (int row = 0; row < 16; ++row) {
+     output[row] = dc[src_of_row[row]];
+   }
+ }
+ /* 16-point inverse DCT that reads only input[0..7]. The rotations of
+  * stages 2-4 whose second operand is not loaded are done with the
+  * single-input btf_16_ssse3 form; stages 5-7 reuse the shared
+  * idct16_stage5/6/7_sse2 helpers.
+  *   input   only elements 0..7 are read
+  *   output  16 vectors
+  *   cos_bit not used for the cosine table (INV_COS_BIT is); it is still
+  *           forwarded to the stage helpers
+  */
+ static void idct16_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1: only the even slots of x are loaded
+ __m128i x[16];
+ x[0] = input[0];
+ x[2] = input[4];
+ x[4] = input[2];
+ x[6] = input[6];
+ x[8] = input[1];
+ x[10] = input[5];
+ x[12] = input[3];
+ x[14] = input[7];
+
+ // stage 2: single-input rotations fill x[9], x[11], x[13], x[15]
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+
+ // stage 3: single-input rotations fill x[5] and x[7]
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4: single-input rotations fill x[1] and x[3]
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+ }
+ /* 16-point inverse DCT butterfly network on vectors of 16-bit lanes.
+  * Stages 1-4 are written out here; stages 5-7 are delegated to
+  * idct16_stage5/6/7_sse2.
+  *   input   16 vectors, reordered in stage 1
+  *   output  16 vectors
+  *   cos_bit not used for the cosine table (INV_COS_BIT is); it is still
+  *           forwarded to the stage helpers
+  */
+ void idct16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+
+ // stage 1: reorder the inputs
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2: rotations on x[8..15]
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3: rotations on x[4..7], add/sub on x[8..15]
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5~7: shared helpers
+ idct16_stage5_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage6_sse2(x, cospi, __rounding, cos_bit);
+ idct16_stage7_sse2(output, x);
+ }
+ /* 16-point inverse DCT butterfly network; same stages and constants as
+  * idct16_new_sse2, but the rotations go through btf_16_4p_sse2. Stages 5
+  * and 6 are therefore written out inline instead of calling
+  * idct16_stage5/6_sse2, which use btf_16_sse2; stage 7 has no rotation and
+  * is shared.
+  * NOTE(review): judging by the "w4"/"4p" names this is the variant for
+  * vectors that carry only 4 valid lanes -- confirm against the macro.
+  *   cos_bit ignored; the cosine table is always taken for INV_COS_BIT
+  */
+ void idct16_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: reorder the inputs
+ __m128i x[16];
+ x[0] = input[0];
+ x[1] = input[8];
+ x[2] = input[4];
+ x[3] = input[12];
+ x[4] = input[2];
+ x[5] = input[10];
+ x[6] = input[6];
+ x[7] = input[14];
+ x[8] = input[1];
+ x[9] = input[9];
+ x[10] = input[5];
+ x[11] = input[13];
+ x[12] = input[3];
+ x[13] = input[11];
+ x[14] = input[7];
+ x[15] = input[15];
+
+ // stage 2: rotations on x[8..15]
+ btf_16_4p_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_4p_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+
+ // stage 3: rotations on x[4..7], add/sub on x[8..15]
+ btf_16_4p_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_4p_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+
+ // stage 4
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_4p_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_4p_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+
+ // stage 5 (inline, 4p rotation)
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+
+ // stage 6 (inline, 4p rotations)
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_4p_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+
+ // stage 7: shared helper
+ idct16_stage7_sse2(output, x);
+ }
+ /* Stage 3 of the 32-point inverse DCT for the upper 16 entries: in-place
+  * add/sub butterflies on the neighbouring pairs of x[16..31], alternating
+  * btf_16_adds_subs_sse2(x[k], x[k + 1]) with
+  * btf_16_subs_adds_sse2(x[k + 3], x[k + 2]) for k = 16, 20, 24, 28.
+  */
+ static INLINE void idct32_high16_stage3_sse2(__m128i *x) {
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ }
+ /* Stage 4 of the 32-point inverse DCT for the upper 16 entries: in-place
+  * rotations on the pairs (17,30), (18,29), (21,26) and (22,25) using the
+  * cospi[8]/cospi[56] and cospi[24]/cospi[40] constants.
+  * NOTE(review): __rounding and cos_bit are not named in the statements
+  * below; presumably the btf_16_sse2 macro refers to them by name -- confirm
+  * before renaming or removing either parameter.
+  */
+ static INLINE void idct32_high16_stage4_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ }
+
+/* Stage 5 of the 32-point inverse DCT for x[8..31]. Two btf_16_sse2 calls
+ * with cospi[16]/cospi[48] weights update the pairs (9,14) and (10,13); the
+ * remaining calls run add/subtract butterflies over x[16..31] in groups of
+ * four, pairing the outer elements (i, i+3) and the inner elements
+ * (i+1, i+2) of each group. x[8], x[11], x[12] and x[15] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; only indices 16 and 48 are read here.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct32_high24_stage5_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ btf_16_adds_subs_sse2(x[16], x[19]);  // group 16..19
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);  // group 20..23, operands reversed
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);  // group 24..27
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);  // group 28..31, operands reversed
+ btf_16_subs_adds_sse2(x[30], x[29]);
+}
+
+/* Stage 6 of the 32-point inverse DCT for x[4..31]. In order, it:
+ *   - runs btf_16_sse2 with cospi[32] weights on the pair (5,6);
+ *   - runs add/subtract butterflies over x[8..15], pairing (8,11), (9,10),
+ *     (15,12) and (14,13);
+ *   - runs btf_16_sse2 with cospi[16]/cospi[48] weights on the pairs
+ *     (18,29), (19,28), (20,27) and (21,26).
+ * x[4], x[7], x[16], x[17], x[22..25], x[30] and x[31] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 16, 32 and 48 are read here.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct32_high28_stage6_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);  // same weights for 18 and 19
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);  // same weights for 20 and 21
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+}
+
+/* Stage 7 of the 32-point inverse DCT. In order, it:
+ *   - runs add/subtract butterflies over x[0..7], pairing i with 7 - i;
+ *   - runs btf_16_sse2 with cospi[32] weights on the pairs (10,13), (11,12);
+ *   - runs add/subtract butterflies over x[16..31] in two groups of eight,
+ *     pairing (16..19) with (23..20) and (31..28) with (24..27).
+ * x[8], x[9], x[14] and x[15] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; only index 32 is read here.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct32_stage7_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[23]);  // group 16..23
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);  // group 24..31, operands reversed
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+}
+
+/* Stage 8 of the 32-point inverse DCT. Runs add/subtract butterflies over
+ * x[0..15], pairing i with 15 - i, then runs btf_16_sse2 with cospi[32]
+ * weights on the pairs (20,27), (21,26), (22,25) and (23,24). x[16..19] and
+ * x[28..31] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; only index 32 is read here.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct32_stage8_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+}
+
+/* Stage 9 (last stage) of the 32-point inverse DCT. Each of the sixteen
+ * mirrored pairs (i, 31 - i) of x is passed to btf_16_adds_subs_out_sse2,
+ * with the matching pair of output slots as destination. Pairs are visited
+ * from the outside (0, 31) towards the centre (15, 16).
+ * NOTE(review): btf_16_adds_subs_out_sse2 is a macro defined elsewhere;
+ * presumably it stores the sum in its first and the difference in its
+ * second argument - confirm.
+ */
+static INLINE void idct32_stage9_sse2(__m128i *output, __m128i *x) {
+ for (int lo = 0; lo < 16; ++lo) {
+   const int hi = 31 - lo;
+   btf_16_adds_subs_out_sse2(output[lo], output[hi], x[lo], x[hi]);
+ }
+}
+
+/* 32-point inverse DCT for the case where only input[0] is read.
+ *
+ * input[0] is passed once through btf_16_ssse3 with cospi[32] as both
+ * weights, giving two values x[0] and x[1]. The 32 outputs are then filled
+ * with those two values in the repeating pattern 0,1,1,0 and the upper half
+ * mirrors the lower half (output[31 - i] == output[i]).
+ *
+ * input   source vectors; only element 0 is read.
+ * output  destination; elements 0..31 are all written.
+ * cos_bit unused; the cosine table is always taken at INV_COS_BIT.
+ */
+static void idct32_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // Stages 1-5 collapse to a single scaling of the one input that is used.
+ __m128i x[2];
+ x[0] = input[0];
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // Stages 6-9 only copy values: indices 0,3,4,7,... take x[0] and indices
+ // 1,2,5,6,... take x[1]; each value is also stored at the mirrored index.
+ for (int i = 0; i < 16; ++i) {
+   const int sel = ((i + 1) >> 1) & 1;
+   output[i] = x[sel];
+   output[31 - i] = x[sel];
+ }
+}
+
+/* 32-point inverse DCT for the case where only input[0..7] are read.
+ *
+ * The eight inputs are scattered into a 32-entry working array, expanded
+ * with btf_16_ssse3 (one input, two outputs) and plain copies, and then run
+ * through the shared stage 4..9 helpers. By the time stage 7 runs, every
+ * entry of x has been assigned.
+ * NOTE(review): the plain copies presumably stand in for add/subtract
+ * butterflies whose second operand would be zero in this reduced case -
+ * confirm against the full idct32 path.
+ *
+ * input   source vectors; elements 0..7 are read.
+ * output  destination; written by idct32_stage9_sse2.
+ * cos_bit marked unused via (void) but still forwarded to the stage
+ *         helpers; the cosine table is always taken at INV_COS_BIT.
+ */ static void idct32_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // passed to the stage helpers below
+
+ // stage 1: scatter the eight inputs to x[0], x[4], ..., x[28]
+ __m128i x[32];
+ x[0] = input[0];
+ x[4] = input[4];
+ x[8] = input[2];
+ x[12] = input[6];
+ x[16] = input[1];
+ x[20] = input[5];
+ x[24] = input[3];
+ x[28] = input[7];
+
+ // stage 2: each call reads one slot and writes two (x[16..31] side)
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3: same expansion for x[8..15], then duplicate stage-2 results
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+
+ // stage 4: expand x[4] into (x[4], x[7]); duplicate stage-3 results
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: expand x[0] into (x[0], x[1]); duplicate stage-4 results
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+ // stage 6: fill x[2], x[3] from x[1], x[0]
+ x[3] = x[0];
+ x[2] = x[1];
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);  // stage 7
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);  // stage 8
+ idct32_stage9_sse2(output, x);  // stage 9: writes output[0..31]
+}
+
+/* 32-point inverse DCT for the case where only input[0..15] are read.
+ *
+ * The sixteen inputs are scattered into the even slots of a 32-entry working
+ * array. Stages 2..5 use btf_16_ssse3 (one input, two outputs) to fill the
+ * odd slots, interleaved with add/subtract butterflies and the shared
+ * upper-half helpers; stages 6..9 are delegated entirely to shared helpers.
+ *
+ * input   source vectors; elements 0..15 are read.
+ * output  destination; written by idct32_stage9_sse2.
+ * cos_bit marked unused via (void) but still forwarded to the stage
+ *         helpers; the cosine table is always taken at INV_COS_BIT.
+ */ static void idct32_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));  // passed to the stage helpers below
+
+ // stage 1: scatter the sixteen inputs to the even slots of x
+ __m128i x[32];
+ x[0] = input[0];
+ x[2] = input[8];
+ x[4] = input[4];
+ x[6] = input[12];
+ x[8] = input[2];
+ x[10] = input[10];
+ x[12] = input[6];
+ x[14] = input[14];
+ x[16] = input[1];
+ x[18] = input[9];
+ x[20] = input[5];
+ x[22] = input[13];
+ x[24] = input[3];
+ x[26] = input[11];
+ x[28] = input[7];
+ x[30] = input[15];
+
+ // stage 2: fill the odd slots of x[16..31] from the even ones
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+
+ // stage 3: fill the odd slots of x[8..15], then the shared x[16..31] step
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4: fill x[5], x[7]; butterflies on x[8..15]; shared x[16..31] step
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: fill x[1], x[3]; butterflies on x[4..7]; shared x[8..31] step
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ btf_16_adds_subs_sse2(x[0], x[3]);  // stage 6 starts here
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);  // stage 7
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);  // stage 8
+ idct32_stage9_sse2(output, x);  // stage 9: writes output[0..31]
+}
+
+/* Full 32-point inverse DCT: reads input[0..31] and writes output[0..31].
+ *
+ * Stage 1 permutes the inputs into the working array x. Stages 2..5 apply
+ * btf_16_sse2 with the weight pairs declared at the top of the function,
+ * interleaved with add/subtract butterflies and the shared upper-half
+ * helpers. Stages 6..9 are delegated to the helpers shared with the reduced
+ * (low8 / low16) variants.
+ *
+ * input   source vectors; all 32 elements are read.
+ * output  destination; written by idct32_stage9_sse2.
+ * cos_bit marked unused via (void) but still forwarded to the stage
+ *         helpers; the cosine table is always taken at INV_COS_BIT.
+ *
+ * NOTE(review): __rounding and cos_bit are never named in the btf_16_sse2
+ * call sites, so that macro (defined elsewhere) presumably captures them from
+ * this scope - do not rename them without checking.
+ */
+static void idct32_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+ // Weight pairs; each name encodes the sign and cospi index of both halves.
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+ const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+
+ // stage 1: permute the 32 inputs into x
+ __m128i x[32];
+ x[0] = input[0];
+ x[1] = input[16];
+ x[2] = input[8];
+ x[3] = input[24];
+ x[4] = input[4];
+ x[5] = input[20];
+ x[6] = input[12];
+ x[7] = input[28];
+ x[8] = input[2];
+ x[9] = input[18];
+ x[10] = input[10];
+ x[11] = input[26];
+ x[12] = input[6];
+ x[13] = input[22];
+ x[14] = input[14];
+ x[15] = input[30];
+ x[16] = input[1];
+ x[17] = input[17];
+ x[18] = input[9];
+ x[19] = input[25];
+ x[20] = input[5];
+ x[21] = input[21];
+ x[22] = input[13];
+ x[23] = input[29];
+ x[24] = input[3];
+ x[25] = input[19];
+ x[26] = input[11];
+ x[27] = input[27];
+ x[28] = input[7];
+ x[29] = input[23];
+ x[30] = input[15];
+ x[31] = input[31];
+
+ // stage 2: x[16..31], pairing i with 47 - i
+ btf_16_sse2(cospi_p62_m02, cospi_p02_p62, x[16], x[31], x[16], x[31]);
+ btf_16_sse2(cospi_p30_m34, cospi_p34_p30, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_p46_m18, cospi_p18_p46, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_p14_m50, cospi_p50_p14, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_p54_m10, cospi_p10_p54, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_p22_m42, cospi_p42_p22, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_p38_m26, cospi_p26_p38, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_p06_m58, cospi_p58_p06, x[23], x[24], x[23], x[24]);
+
+ // stage 3: x[8..15], pairing i with 23 - i; then the shared x[16..31] step
+ btf_16_sse2(cospi_p60_m04, cospi_p04_p60, x[8], x[15], x[8], x[15]);
+ btf_16_sse2(cospi_p28_m36, cospi_p36_p28, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_p44_m20, cospi_p20_p44, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_p12_m52, cospi_p52_p12, x[11], x[12], x[11], x[12]);
+ idct32_high16_stage3_sse2(x);
+
+ // stage 4
+ btf_16_sse2(cospi_p56_m08, cospi_p08_p56, x[4], x[7], x[4], x[7]);
+ btf_16_sse2(cospi_p24_m40, cospi_p40_p24, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct32_high16_stage4_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);
+ btf_16_sse2(cospi_p48_m16, cospi_p16_p48, x[2], x[3], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ // The reversed pair uses the subs_adds form, exactly as the low16 variant
+ // and the 16-point transform do for (x[7], x[6]).
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ idct32_high24_stage5_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ idct32_high28_stage6_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7~9
+ idct32_stage7_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage8_sse2(x, cospi, __rounding, cos_bit);
+ idct32_stage9_sse2(output, x);
+}
+
+/* Stage 4 of the 64-point inverse DCT, upper half only. Applies btf_16_sse2
+ * to eight pairs taken from x[32..63]: (33,62), (34,61), (37,58), (38,57),
+ * (41,54), (42,53), (45,50) and (46,49). The weight pairs are built from
+ * cospi[4]/[60], cospi[28]/[36], cospi[20]/[44] and cospi[12]/[52]. Slots not
+ * listed are left untouched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 4, 12, 20, 28, 36, 44, 52, 60 are read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage4_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+}
+
+/* Stage 5 of the 64-point inverse DCT for x[16..63]. First applies
+ * btf_16_sse2 with cospi[8]/[56] and cospi[24]/[40] weights to the pairs
+ * (17,30), (18,29), (21,26) and (22,25). Then runs add/subtract butterflies
+ * over x[32..63] in groups of four, pairing the outer elements (i, i+3) and
+ * the inner elements (i+1, i+2) of each group; every second group passes its
+ * operands in reversed order to btf_16_subs_adds_sse2.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 8, 24, 40 and 56 are read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage5_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ btf_16_adds_subs_sse2(x[32], x[35]);  // group 32..35
+ btf_16_adds_subs_sse2(x[33], x[34]);
+ btf_16_subs_adds_sse2(x[39], x[36]);  // group 36..39
+ btf_16_subs_adds_sse2(x[38], x[37]);
+ btf_16_adds_subs_sse2(x[40], x[43]);  // group 40..43
+ btf_16_adds_subs_sse2(x[41], x[42]);
+ btf_16_subs_adds_sse2(x[47], x[44]);  // group 44..47
+ btf_16_subs_adds_sse2(x[46], x[45]);
+ btf_16_adds_subs_sse2(x[48], x[51]);  // group 48..51
+ btf_16_adds_subs_sse2(x[49], x[50]);
+ btf_16_subs_adds_sse2(x[55], x[52]);  // group 52..55
+ btf_16_subs_adds_sse2(x[54], x[53]);
+ btf_16_adds_subs_sse2(x[56], x[59]);  // group 56..59
+ btf_16_adds_subs_sse2(x[57], x[58]);
+ btf_16_subs_adds_sse2(x[63], x[60]);  // group 60..63
+ btf_16_subs_adds_sse2(x[62], x[61]);
+}
+
+/* Stage 6 of the 64-point inverse DCT, upper half only. Applies btf_16_sse2
+ * to eight pairs from x[32..63]: (34,61), (35,60), (36,59), (37,58) with
+ * cospi[8]/[56] weights and (42,53), (43,52), (44,51), (45,50) with
+ * cospi[24]/[40] weights. Within each set of four, the first two pairs share
+ * one weight combination and the last two share another.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 8, 24, 40 and 56 are read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage6_high32_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[34], x[61], x[34], x[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[35], x[60], x[35], x[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[45], x[50], x[45], x[50]);
+}
+
+/* Stage 6 of the 64-point inverse DCT for x[16..63]. Runs add/subtract
+ * butterflies over x[16..31] in groups of four (outer pair, then inner pair;
+ * every second group with reversed operands), then delegates the x[32..63]
+ * part of the stage to idct64_stage6_high32_sse2.
+ *
+ * x          working array, updated in place.
+ * cospi, __rounding, cos_bit  only forwarded to idct64_stage6_high32_sse2.
+ */ static INLINE void idct64_stage6_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ btf_16_adds_subs_sse2(x[16], x[19]);  // group 16..19
+ btf_16_adds_subs_sse2(x[17], x[18]);
+ btf_16_subs_adds_sse2(x[23], x[20]);  // group 20..23
+ btf_16_subs_adds_sse2(x[22], x[21]);
+ btf_16_adds_subs_sse2(x[24], x[27]);  // group 24..27
+ btf_16_adds_subs_sse2(x[25], x[26]);
+ btf_16_subs_adds_sse2(x[31], x[28]);  // group 28..31
+ btf_16_subs_adds_sse2(x[30], x[29]);
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+}
+
+/* Stage 7 of the 64-point inverse DCT for x[16..63]. First applies
+ * btf_16_sse2 with cospi[16]/[48] weights to the pairs (18,29), (19,28),
+ * (20,27) and (21,26). Then runs add/subtract butterflies over x[32..63] in
+ * groups of eight, pairing each element with its mirror inside the group;
+ * the second and fourth groups pass their operands in reversed order to
+ * btf_16_subs_adds_sse2.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 16 and 48 are read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage7_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[18], x[29], x[18], x[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[19], x[28], x[19], x[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[21], x[26], x[21], x[26]);
+ btf_16_adds_subs_sse2(x[32], x[39]);  // group 32..39
+ btf_16_adds_subs_sse2(x[33], x[38]);
+ btf_16_adds_subs_sse2(x[34], x[37]);
+ btf_16_adds_subs_sse2(x[35], x[36]);
+ btf_16_subs_adds_sse2(x[47], x[40]);  // group 40..47
+ btf_16_subs_adds_sse2(x[46], x[41]);
+ btf_16_subs_adds_sse2(x[45], x[42]);
+ btf_16_subs_adds_sse2(x[44], x[43]);
+ btf_16_adds_subs_sse2(x[48], x[55]);  // group 48..55
+ btf_16_adds_subs_sse2(x[49], x[54]);
+ btf_16_adds_subs_sse2(x[50], x[53]);
+ btf_16_adds_subs_sse2(x[51], x[52]);
+ btf_16_subs_adds_sse2(x[63], x[56]);  // group 56..63
+ btf_16_subs_adds_sse2(x[62], x[57]);
+ btf_16_subs_adds_sse2(x[61], x[58]);
+ btf_16_subs_adds_sse2(x[60], x[59]);
+}
+
+/* Stage 8 of the 64-point inverse DCT for x[16..63]. Runs add/subtract
+ * butterflies over x[16..31] in two groups of eight (each element paired
+ * with its mirror inside the group, the second group with reversed
+ * operands), then applies btf_16_sse2 with cospi[16]/[48] weights to the
+ * pairs (36..39, 59..56) and (40..43, 55..52). x[32..35], x[44..51] and
+ * x[60..63] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; indices 16 and 48 are read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage8_high48_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ btf_16_adds_subs_sse2(x[16], x[23]);  // group 16..23
+ btf_16_adds_subs_sse2(x[17], x[22]);
+ btf_16_adds_subs_sse2(x[18], x[21]);
+ btf_16_adds_subs_sse2(x[19], x[20]);
+ btf_16_subs_adds_sse2(x[31], x[24]);  // group 24..31
+ btf_16_subs_adds_sse2(x[30], x[25]);
+ btf_16_subs_adds_sse2(x[29], x[26]);
+ btf_16_subs_adds_sse2(x[28], x[27]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[36], x[59], x[36], x[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[37], x[58], x[37], x[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[39], x[56], x[39], x[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[43], x[52], x[43], x[52]);
+}
+
+/* Stage 9 of the 64-point inverse DCT. In order, it:
+ *   - runs add/subtract butterflies over x[0..15], pairing i with 15 - i;
+ *   - applies btf_16_sse2 with cospi[32] weights to the pairs (20,27),
+ *     (21,26), (22,25) and (23,24);
+ *   - runs add/subtract butterflies over x[32..63] in two groups of sixteen,
+ *     pairing each element with its mirror inside the group (second group
+ *     with reversed operands).
+ * x[16..19] and x[28..31] are not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; only index 32 is read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage9_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[15]);
+ btf_16_adds_subs_sse2(x[1], x[14]);
+ btf_16_adds_subs_sse2(x[2], x[13]);
+ btf_16_adds_subs_sse2(x[3], x[12]);
+ btf_16_adds_subs_sse2(x[4], x[11]);
+ btf_16_adds_subs_sse2(x[5], x[10]);
+ btf_16_adds_subs_sse2(x[6], x[9]);
+ btf_16_adds_subs_sse2(x[7], x[8]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[20], x[27], x[20], x[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[21], x[26], x[21], x[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[22], x[25], x[22], x[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[23], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[47]);  // group 32..47
+ btf_16_adds_subs_sse2(x[33], x[46]);
+ btf_16_adds_subs_sse2(x[34], x[45]);
+ btf_16_adds_subs_sse2(x[35], x[44]);
+ btf_16_adds_subs_sse2(x[36], x[43]);
+ btf_16_adds_subs_sse2(x[37], x[42]);
+ btf_16_adds_subs_sse2(x[38], x[41]);
+ btf_16_adds_subs_sse2(x[39], x[40]);
+ btf_16_subs_adds_sse2(x[63], x[48]);  // group 48..63
+ btf_16_subs_adds_sse2(x[62], x[49]);
+ btf_16_subs_adds_sse2(x[61], x[50]);
+ btf_16_subs_adds_sse2(x[60], x[51]);
+ btf_16_subs_adds_sse2(x[59], x[52]);
+ btf_16_subs_adds_sse2(x[58], x[53]);
+ btf_16_subs_adds_sse2(x[57], x[54]);
+ btf_16_subs_adds_sse2(x[56], x[55]);
+}
+
+/* Stage 10 of the 64-point inverse DCT. Runs add/subtract butterflies over
+ * x[0..31], pairing i with 31 - i, then applies btf_16_sse2 with cospi[32]
+ * weights to the eight pairs (40..47, 55..48). x[32..39] and x[56..63] are
+ * not touched.
+ *
+ * x          working array, updated in the slots named above.
+ * cospi      cosine table; only index 32 is read.
+ * __rounding, cos_bit  not referenced by name in this body.
+ * NOTE(review): presumably consumed implicitly by the btf_16_sse2 macro,
+ * which is defined outside this chunk - confirm.
+ */ static INLINE void idct64_stage10_sse2(__m128i *x, const int32_t *cospi,
+ const __m128i __rounding,
+ int8_t cos_bit) {
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ btf_16_adds_subs_sse2(x[0], x[31]);
+ btf_16_adds_subs_sse2(x[1], x[30]);
+ btf_16_adds_subs_sse2(x[2], x[29]);
+ btf_16_adds_subs_sse2(x[3], x[28]);
+ btf_16_adds_subs_sse2(x[4], x[27]);
+ btf_16_adds_subs_sse2(x[5], x[26]);
+ btf_16_adds_subs_sse2(x[6], x[25]);
+ btf_16_adds_subs_sse2(x[7], x[24]);
+ btf_16_adds_subs_sse2(x[8], x[23]);
+ btf_16_adds_subs_sse2(x[9], x[22]);
+ btf_16_adds_subs_sse2(x[10], x[21]);
+ btf_16_adds_subs_sse2(x[11], x[20]);
+ btf_16_adds_subs_sse2(x[12], x[19]);
+ btf_16_adds_subs_sse2(x[13], x[18]);
+ btf_16_adds_subs_sse2(x[14], x[17]);
+ btf_16_adds_subs_sse2(x[15], x[16]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[40], x[55], x[40], x[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[42], x[53], x[42], x[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[43], x[52], x[43], x[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[44], x[51], x[44], x[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[45], x[50], x[45], x[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[46], x[49], x[46], x[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[47], x[48], x[47], x[48]);
+}
+
+/* Stage 11 (last stage) of the 64-point inverse DCT. Each of the thirty-two
+ * mirrored pairs (i, 63 - i) of x is passed to btf_16_adds_subs_out_sse2,
+ * with the matching pair of output slots as destination. Pairs are visited
+ * from the outside (0, 63) towards the centre (31, 32).
+ * NOTE(review): btf_16_adds_subs_out_sse2 is a macro defined elsewhere;
+ * presumably it stores the sum in its first and the difference in its
+ * second argument - confirm.
+ */
+static INLINE void idct64_stage11_sse2(__m128i *output, __m128i *x) {
+ for (int lo = 0; lo < 32; ++lo) {
+   const int hi = 63 - lo;
+   btf_16_adds_subs_out_sse2(output[lo], output[hi], x[lo], x[hi]);
+ }
+}
+
+/* 64-point inverse DCT for the case where only input[0] is read.
+ *
+ * input[0] is passed once through btf_16_ssse3 with cospi[32] as both
+ * weights, giving two values x[0] and x[1]. The 64 outputs are then filled
+ * with those two values in the repeating pattern 0,1,1,0 and the upper half
+ * mirrors the lower half (output[63 - i] == output[i]).
+ *
+ * input   source vectors; only element 0 is read.
+ * output  destination; elements 0..63 are all written.
+ * cos_bit unused; the cosine table is always taken at INV_COS_BIT.
+ */
+static void idct64_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+
+ // Stages 1-6 collapse to a single scaling of the one input that is used.
+ // Only x[0] and x[1] of the working array are ever touched.
+ __m128i x[32];
+ x[0] = input[0];
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+
+ // Stages 7-11 only copy values: indices 0,3,4,7,... take x[0] and indices
+ // 1,2,5,6,... take x[1]; each value is also stored at the mirrored index.
+ for (int i = 0; i < 32; ++i) {
+   const int sel = ((i + 1) >> 1) & 1;
+   output[i] = x[sel];
+   output[63 - i] = x[sel];
+ }
+}
+
+/* 64-point inverse DCT for the case where only input[0..7] are read.
+ *
+ * The eight inputs are scattered into a 64-entry working array. Stages 2..8
+ * grow the populated set with btf_16_ssse3 (one input, two outputs), plain
+ * copies and btf_16_sse2 rotations; the x[16..63] parts of stages 6..8 and
+ * all of stages 9..11 are delegated to the shared stage helpers.
+ * NOTE(review): the plain copies presumably stand in for add/subtract
+ * butterflies whose second operand would be zero in this reduced case -
+ * confirm against the full idct64 path.
+ *
+ * input   source vectors; elements 0..7 are read.
+ * output  destination; written by idct64_stage11_sse2.
+ * cos_bit marked unused via (void) but still forwarded to the stage
+ *         helpers; the cosine table is always taken at INV_COS_BIT.
+ *
+ * NOTE(review): __rounding and cos_bit are never named in the btf_16_sse2
+ * call sites, so that macro (defined elsewhere) presumably captures them from
+ * this scope - do not rename them without checking.
+ */
+static void idct64_low8_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ // Weight pairs; each name encodes the sign and cospi index of both halves.
+ const __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ const __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ const __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ const __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ const __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ const __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ const __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ const __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ const __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ const __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ const __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ const __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: scatter the eight inputs to x[0], x[8], ..., x[56]
+ __m128i x[64];
+ x[0] = input[0];
+ x[8] = input[4];
+ x[16] = input[2];
+ x[24] = input[6];
+ x[32] = input[1];
+ x[40] = input[5];
+ x[48] = input[3];
+ x[56] = input[7];
+
+ // stage 2: each call reads one slot and writes two (x[32..63] side)
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same expansion for x[16..31], then duplicate stage-2 results
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[62] = x[63];
+
+ // stage 4
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ x[17] = x[16];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[30] = x[31];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x[33], x[62], x[33], x[62]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x[38], x[57], x[38], x[57]);
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x[41], x[54], x[41], x[54]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x[46], x[49], x[46], x[49]);
+
+ // stage 5
+ x[9] = x[8];
+ x[14] = x[15];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x[17], x[30], x[17], x[30]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x[22], x[25], x[22], x[25]);
+ x[35] = x[32];
+ x[34] = x[33];
+ x[36] = x[39];
+ x[37] = x[38];
+ x[43] = x[40];
+ x[42] = x[41];
+ x[44] = x[47];
+ x[45] = x[46];
+ x[51] = x[48];
+ x[50] = x[49];
+ x[52] = x[55];
+ x[53] = x[54];
+ x[59] = x[56];
+ x[58] = x[57];
+ x[60] = x[63];
+ x[61] = x[62];
+
+ // stage 6
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ x[19] = x[16];
+ x[18] = x[17];
+ x[20] = x[23];
+ x[21] = x[22];
+ x[27] = x[24];
+ x[26] = x[25];
+ x[28] = x[31];
+ x[29] = x[30];
+ idct64_stage6_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7
+ x[3] = x[0];
+ x[2] = x[1];
+ x[11] = x[8];
+ x[10] = x[9];
+ x[12] = x[15];
+ x[13] = x[14];
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8: x[8], x[9], x[14] and x[15] carry over unchanged
+ x[7] = x[0];
+ x[6] = x[1];
+ x[5] = x[2];
+ x[4] = x[3];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low16_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { // Reads only input[0..15]; by its name presumably the 64-point inverse DCT for blocks whose other coefficients are zero (confirm with callers).
+ (void)cos_bit; // tables below are built for INV_COS_BIT; cos_bit is named again only when forwarded to the stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // half of 1 << INV_COS_BIT; passed to the helpers and presumably also picked up by the btf_16_sse2 macro (confirm)
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: place the 16 inputs in every fourth slot of x; the other slots are written by later stages
+ __m128i x[64];
+ x[0] = input[0];
+ x[4] = input[8];
+ x[8] = input[4];
+ x[12] = input[12];
+ x[16] = input[2];
+ x[20] = input[10];
+ x[24] = input[6];
+ x[28] = input[14];
+ x[32] = input[1];
+ x[36] = input[9];
+ x[40] = input[5];
+ x[44] = input[13];
+ x[48] = input[3];
+ x[52] = input[11];
+ x[56] = input[7];
+ x[60] = input[15];
+
+ // stage 2: x[32..63]; each btf_16_ssse3 call names one source slot and two destination slots (apparently: two weights, source, two outputs)
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: same single-source pattern for x[16..31]; in x[32..63] each populated slot is copied to its still-empty neighbour
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ x[33] = x[32];
+ x[34] = x[35];
+ x[37] = x[36];
+ x[38] = x[39];
+ x[41] = x[40];
+ x[42] = x[43];
+ x[45] = x[44];
+ x[46] = x[47];
+ x[49] = x[48];
+ x[50] = x[51];
+ x[53] = x[52];
+ x[54] = x[55];
+ x[57] = x[56];
+ x[58] = x[59];
+ x[61] = x[60];
+ x[62] = x[63];
+
+ // stage 4: single-source pattern for x[8..15], neighbour copies in x[16..31], x[32..63] delegated to the shared helper
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ x[17] = x[16];
+ x[18] = x[19];
+ x[21] = x[20];
+ x[22] = x[23];
+ x[25] = x[24];
+ x[26] = x[27];
+ x[29] = x[28];
+ x[30] = x[31];
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: x[4]/x[7] from x[4], neighbour copies in x[8..15], the rest delegated to the helper
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ x[9] = x[8];
+ x[10] = x[11];
+ x[13] = x[12];
+ x[14] = x[15];
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6: x[0]/x[1] from x[0], neighbour copies for x[5]/x[6], two-input butterflies on (9,14) and (10,13)
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ x[5] = x[4];
+ x[6] = x[7];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7: x[2]/x[3] are copies of x[1]/x[0]; x[8..15] combined with the adds_subs / subs_adds macros
+ x[3] = x[0];
+ x[2] = x[1];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8: mirrored pairs (k, 7-k) within x[0..7], then butterflies on (10,13) and (11,12)
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+ // stage 9~11: delegated entirely to helpers; stage 11 is the call that receives output
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+static void idct64_low32_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { // Reads only input[0..31]; by its name presumably the 64-point inverse DCT for blocks whose upper 32 coefficients are zero (confirm with callers).
+ (void)cos_bit; // tables below are built for INV_COS_BIT; cos_bit is named again only when forwarded to the stage helpers
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // half of 1 << INV_COS_BIT; passed to the helpers and presumably also picked up by the btf_16_sse2 macro (confirm)
+
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ const __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ const __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ const __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+
+ // stage 1: place the 32 inputs in the even slots of x; odd slots are written by later stages
+ __m128i x[64];
+ x[0] = input[0];
+ x[2] = input[16];
+ x[4] = input[8];
+ x[6] = input[24];
+ x[8] = input[4];
+ x[10] = input[20];
+ x[12] = input[12];
+ x[14] = input[28];
+ x[16] = input[2];
+ x[18] = input[18];
+ x[20] = input[10];
+ x[22] = input[26];
+ x[24] = input[6];
+ x[26] = input[22];
+ x[28] = input[14];
+ x[30] = input[30];
+ x[32] = input[1];
+ x[34] = input[17];
+ x[36] = input[9];
+ x[38] = input[25];
+ x[40] = input[5];
+ x[42] = input[21];
+ x[44] = input[13];
+ x[46] = input[29];
+ x[48] = input[3];
+ x[50] = input[19];
+ x[52] = input[11];
+ x[54] = input[27];
+ x[56] = input[7];
+ x[58] = input[23];
+ x[60] = input[15];
+ x[62] = input[31];
+
+ // stage 2: x[32..63]; each btf_16_ssse3 call names one source slot and two destination slots (apparently: two weights, source, two outputs)
+ btf_16_ssse3(cospi[63], cospi[1], x[32], x[32], x[63]);
+ btf_16_ssse3(-cospi[33], cospi[31], x[62], x[33], x[62]);
+ btf_16_ssse3(cospi[47], cospi[17], x[34], x[34], x[61]);
+ btf_16_ssse3(-cospi[49], cospi[15], x[60], x[35], x[60]);
+ btf_16_ssse3(cospi[55], cospi[9], x[36], x[36], x[59]);
+ btf_16_ssse3(-cospi[41], cospi[23], x[58], x[37], x[58]);
+ btf_16_ssse3(cospi[39], cospi[25], x[38], x[38], x[57]);
+ btf_16_ssse3(-cospi[57], cospi[7], x[56], x[39], x[56]);
+ btf_16_ssse3(cospi[59], cospi[5], x[40], x[40], x[55]);
+ btf_16_ssse3(-cospi[37], cospi[27], x[54], x[41], x[54]);
+ btf_16_ssse3(cospi[43], cospi[21], x[42], x[42], x[53]);
+ btf_16_ssse3(-cospi[53], cospi[11], x[52], x[43], x[52]);
+ btf_16_ssse3(cospi[51], cospi[13], x[44], x[44], x[51]);
+ btf_16_ssse3(-cospi[45], cospi[19], x[50], x[45], x[50]);
+ btf_16_ssse3(cospi[35], cospi[29], x[46], x[46], x[49]);
+ btf_16_ssse3(-cospi[61], cospi[3], x[48], x[47], x[48]);
+
+ // stage 3: single-source pattern for x[16..31]; adjacent pairs of x[32..63] combined with the adds_subs / subs_adds macros
+ btf_16_ssse3(cospi[62], cospi[2], x[16], x[16], x[31]);
+ btf_16_ssse3(-cospi[34], cospi[30], x[30], x[17], x[30]);
+ btf_16_ssse3(cospi[46], cospi[18], x[18], x[18], x[29]);
+ btf_16_ssse3(-cospi[50], cospi[14], x[28], x[19], x[28]);
+ btf_16_ssse3(cospi[54], cospi[10], x[20], x[20], x[27]);
+ btf_16_ssse3(-cospi[42], cospi[22], x[26], x[21], x[26]);
+ btf_16_ssse3(cospi[38], cospi[26], x[22], x[22], x[25]);
+ btf_16_ssse3(-cospi[58], cospi[6], x[24], x[23], x[24]);
+ btf_16_adds_subs_sse2(x[32], x[33]);
+ btf_16_subs_adds_sse2(x[35], x[34]);
+ btf_16_adds_subs_sse2(x[36], x[37]);
+ btf_16_subs_adds_sse2(x[39], x[38]);
+ btf_16_adds_subs_sse2(x[40], x[41]);
+ btf_16_subs_adds_sse2(x[43], x[42]);
+ btf_16_adds_subs_sse2(x[44], x[45]);
+ btf_16_subs_adds_sse2(x[47], x[46]);
+ btf_16_adds_subs_sse2(x[48], x[49]);
+ btf_16_subs_adds_sse2(x[51], x[50]);
+ btf_16_adds_subs_sse2(x[52], x[53]);
+ btf_16_subs_adds_sse2(x[55], x[54]);
+ btf_16_adds_subs_sse2(x[56], x[57]);
+ btf_16_subs_adds_sse2(x[59], x[58]);
+ btf_16_adds_subs_sse2(x[60], x[61]);
+ btf_16_subs_adds_sse2(x[63], x[62]);
+
+ // stage 4: single-source pattern for x[8..15], adjacent pairs of x[16..31], x[32..63] delegated to the shared helper
+ btf_16_ssse3(cospi[60], cospi[4], x[8], x[8], x[15]);
+ btf_16_ssse3(-cospi[36], cospi[28], x[14], x[9], x[14]);
+ btf_16_ssse3(cospi[44], cospi[20], x[10], x[10], x[13]);
+ btf_16_ssse3(-cospi[52], cospi[12], x[12], x[11], x[12]);
+ btf_16_adds_subs_sse2(x[16], x[17]);
+ btf_16_subs_adds_sse2(x[19], x[18]);
+ btf_16_adds_subs_sse2(x[20], x[21]);
+ btf_16_subs_adds_sse2(x[23], x[22]);
+ btf_16_adds_subs_sse2(x[24], x[25]);
+ btf_16_subs_adds_sse2(x[27], x[26]);
+ btf_16_adds_subs_sse2(x[28], x[29]);
+ btf_16_subs_adds_sse2(x[31], x[30]);
+ idct64_stage4_high32_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 5: single-source pattern for x[4..7], adjacent pairs of x[8..15], the rest delegated to the helper
+ btf_16_ssse3(cospi[56], cospi[8], x[4], x[4], x[7]);
+ btf_16_ssse3(-cospi[40], cospi[24], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[9]);
+ btf_16_subs_adds_sse2(x[11], x[10]);
+ btf_16_adds_subs_sse2(x[12], x[13]);
+ btf_16_subs_adds_sse2(x[15], x[14]);
+ idct64_stage5_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 6: single-source pattern for x[0..3], adjacent pairs of x[4..7], two-input butterflies on (9,14) and (10,13)
+ btf_16_ssse3(cospi[32], cospi[32], x[0], x[0], x[1]);
+ btf_16_ssse3(cospi[48], cospi[16], x[2], x[2], x[3]);
+ btf_16_adds_subs_sse2(x[4], x[5]);
+ btf_16_subs_adds_sse2(x[7], x[6]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x[9], x[14], x[9], x[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x[10], x[13], x[10], x[13]);
+ idct64_stage6_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 7: mirrored pairs within x[0..3] and within x[8..15], one butterfly on (5,6)
+ btf_16_adds_subs_sse2(x[0], x[3]);
+ btf_16_adds_subs_sse2(x[1], x[2]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[5], x[6], x[5], x[6]);
+ btf_16_adds_subs_sse2(x[8], x[11]);
+ btf_16_adds_subs_sse2(x[9], x[10]);
+ btf_16_subs_adds_sse2(x[15], x[12]);
+ btf_16_subs_adds_sse2(x[14], x[13]);
+ idct64_stage7_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 8: mirrored pairs (k, 7-k) within x[0..7], then butterflies on (10,13) and (11,12)
+ btf_16_adds_subs_sse2(x[0], x[7]);
+ btf_16_adds_subs_sse2(x[1], x[6]);
+ btf_16_adds_subs_sse2(x[2], x[5]);
+ btf_16_adds_subs_sse2(x[3], x[4]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[10], x[13], x[10], x[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x[11], x[12], x[11], x[12]);
+ idct64_stage8_high48_sse2(x, cospi, __rounding, cos_bit);
+
+ // stage 9~11: delegated entirely to helpers; stage 11 is the call that receives output
+ idct64_stage9_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage10_sse2(x, cospi, __rounding, cos_bit);
+ idct64_stage11_sse2(output, x);
+}
+
+// iadst4 (by its name, the inverse 4-point ADST) over four vectors of eight
+// 16-bit lanes. input[0]/input[2] and input[1]/input[3] are interleaved so
+// that one _mm_madd_epi16 yields a two-term 32-bit partial sum per lane. For
+// every output row two partial sums are added, rounded, shifted right by
+// INV_COS_BIT and saturate-packed back to 16 bits.
+// All of input[0..3] is read before the first store to output.
+// cos_bit is ignored; the sinpi table is always taken at INV_COS_BIT.
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+  const __m128i round_bias = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // Per-row weights for the interleaved (in0, in2) pairs.
+  __m128i w02[4];
+  w02[0] = pair_set_epi16(sinpi[1], sinpi[4]);   // in0*sin1 + in2*sin4
+  w02[1] = pair_set_epi16(sinpi[2], -sinpi[1]);  // in0*sin2 - in2*sin1
+  w02[2] = pair_set_epi16(sinpi[3], -sinpi[3]);  // in0*sin3 - in2*sin3
+  w02[3] = pair_set_epi16(sinpi[4], sinpi[2]);   // in0*sin4 + in2*sin2
+
+  // Per-row weights for the interleaved (in1, in3) pairs.
+  __m128i w13[4];
+  w13[0] = pair_set_epi16(sinpi[3], sinpi[2]);    //  in1*sin3 + in3*sin2
+  w13[1] = pair_set_epi16(sinpi[3], -sinpi[4]);   //  in1*sin3 - in3*sin4
+  w13[2] = pair_set_epi16(0, sinpi[3]);           //  in3*sin3
+  w13[3] = pair_set_epi16(-sinpi[3], -sinpi[1]);  // -in1*sin3 - in3*sin1
+
+  // Index 0 holds the low four lanes, index 1 the high four lanes.
+  __m128i in02[2];
+  __m128i in13[2];
+  in02[0] = _mm_unpacklo_epi16(input[0], input[2]);
+  in02[1] = _mm_unpackhi_epi16(input[0], input[2]);
+  in13[0] = _mm_unpacklo_epi16(input[1], input[3]);
+  in13[1] = _mm_unpackhi_epi16(input[1], input[3]);
+
+  for (int row = 0; row < 4; ++row) {
+    __m128i half[2];
+    for (int h = 0; h < 2; ++h) {
+      const __m128i part02 = _mm_madd_epi16(in02[h], w02[row]);
+      const __m128i part13 = _mm_madd_epi16(in13[h], w13[row]);
+      __m128i acc = _mm_add_epi32(part02, part13);
+      acc = _mm_add_epi32(acc, round_bias);
+      half[h] = _mm_srai_epi32(acc, INV_COS_BIT);
+    }
+    output[row] = _mm_packs_epi32(half[0], half[1]);
+  }
+}
+
+// TODO(binpengsmail@gmail.com):
+// To explore the reuse of VP9 versions of corresponding SSE2 functions and
+// evaluate whether there is a possibility for further speedup.
+//
+// Same arithmetic as iadst4_new_sse2, restricted to the low four 16-bit lanes
+// of each input vector (only _mm_unpacklo_epi16 is used). Each output vector
+// holds the four results twice, because the 32-bit sums are packed with
+// themselves. All of input[0..3] is read before the first store to output.
+// cos_bit is ignored; the sinpi table is always taken at INV_COS_BIT.
+void iadst4_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *sinpi = sinpi_arr(INV_COS_BIT);
+  const __m128i round_bias = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // Per-row weights for the interleaved (in0, in2) pairs.
+  __m128i w02[4];
+  w02[0] = pair_set_epi16(sinpi[1], sinpi[4]);   // in0*sin1 + in2*sin4
+  w02[1] = pair_set_epi16(sinpi[2], -sinpi[1]);  // in0*sin2 - in2*sin1
+  w02[2] = pair_set_epi16(sinpi[3], -sinpi[3]);  // in0*sin3 - in2*sin3
+  w02[3] = pair_set_epi16(sinpi[4], sinpi[2]);   // in0*sin4 + in2*sin2
+
+  // Per-row weights for the interleaved (in1, in3) pairs.
+  __m128i w13[4];
+  w13[0] = pair_set_epi16(sinpi[3], sinpi[2]);    //  in1*sin3 + in3*sin2
+  w13[1] = pair_set_epi16(sinpi[3], -sinpi[4]);   //  in1*sin3 - in3*sin4
+  w13[2] = pair_set_epi16(0, sinpi[3]);           //  in3*sin3
+  w13[3] = pair_set_epi16(-sinpi[3], -sinpi[1]);  // -in1*sin3 - in3*sin1
+
+  const __m128i in02 = _mm_unpacklo_epi16(input[0], input[2]);
+  const __m128i in13 = _mm_unpacklo_epi16(input[1], input[3]);
+
+  for (int row = 0; row < 4; ++row) {
+    __m128i acc = _mm_add_epi32(_mm_madd_epi16(in02, w02[row]),
+                                _mm_madd_epi16(in13, w13[row]));
+    acc = _mm_add_epi32(acc, round_bias);
+    acc = _mm_srai_epi32(acc, INV_COS_BIT);
+    output[row] = _mm_packs_epi32(acc, acc);
+  }
+}
+
+static void iadst8_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { // Reads only input[0] and writes output[0..7]; by its name presumably the 8-point inverse ADST for a lone first coefficient (confirm with callers).
+ (void)cos_bit; // the cosine table below is always built for INV_COS_BIT
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // never named again in this function; presumably picked up by the btf_16_sse2 macro (confirm)
+
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: the single input goes to x[1]; no other slot is loaded from input
+ __m128i x[8];
+ x[1] = input[0];
+
+ // stage 2: x[0] and x[1] are both produced from x[1] (apparently: two weights, source, two outputs)
+ btf_16_ssse3(cospi[60], -cospi[4], x[1], x[0], x[1]);
+
+ // stage 3: plain copies into the upper half
+ x[4] = x[0];
+ x[5] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+
+ // stage 5: plain copies (0,1) -> (2,3) and (4,5) -> (6,7)
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+
+ // stage 6
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+
+ // stage 7: permute into output; odd-numbered outputs are negated by a saturating 16-bit subtraction from zero
+ output[0] = x[0];
+ output[1] = _mm_subs_epi16(__zero, x[4]);
+ output[2] = x[6];
+ output[3] = _mm_subs_epi16(__zero, x[2]);
+ output[4] = x[3];
+ output[5] = _mm_subs_epi16(__zero, x[7]);
+ output[6] = x[5];
+ output[7] = _mm_subs_epi16(__zero, x[1]);
+}
+
+// iadst8 (by its name, the inverse 8-point ADST) over eight vectors of 16-bit
+// lanes: reads input[0..7], writes output[0..7].
+// cos_bit is ignored; the cosine table is always taken at INV_COS_BIT.
+// __rounding is not named in the body below; it is presumably picked up by the
+// btf_16_sse2 macro, so it keeps its original name.
+void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 2 weights
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  // stage 4 weights
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  // stage 6 weights
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1: even slots take inputs 7, 5, 3, 1; odd slots take inputs 0, 2,
+  // 4, 6.
+  __m128i x[8];
+  for (int pair = 0; pair < 4; ++pair) {
+    x[2 * pair] = input[7 - 2 * pair];
+    x[2 * pair + 1] = input[2 * pair];
+  }
+
+  // stage 2: one butterfly per adjacent pair, each with its own weights.
+  btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+  // stage 3: combine slot k with slot k + 4.
+  for (int lane = 0; lane < 4; ++lane) {
+    btf_16_adds_subs_sse2(x[lane], x[lane + 4]);
+  }
+
+  // stage 4: only the upper half is touched.
+  btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+  // stage 5: within each group of four, combine slot k with slot k + 2.
+  for (int base = 0; base < 8; base += 4) {
+    btf_16_adds_subs_sse2(x[base], x[base + 2]);
+    btf_16_adds_subs_sse2(x[base + 1], x[base + 3]);
+  }
+
+  // stage 6: the last pair of each group of four.
+  for (int base = 2; base < 8; base += 4) {
+    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[base], x[base + 1], x[base],
+                x[base + 1]);
+  }
+
+  // stage 7: permute into output; odd-numbered outputs are negated with a
+  // saturating subtraction from zero.
+  static const int8_t src_slot[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };
+  for (int k = 0; k < 8; k += 2) {
+    output[k] = x[src_slot[k]];
+    output[k + 1] = _mm_subs_epi16(__zero, x[src_slot[k + 1]]);
+  }
+}
+
+// Same stage structure as iadst8_new_sse2, but every weighted butterfly uses
+// the btf_16_4p_sse2 macro (by its name, a four-lane variant; confirm).
+// Reads input[0..7], writes output[0..7].
+// cos_bit is ignored; the cosine table is always taken at INV_COS_BIT.
+// __rounding is not named in the body below; it is presumably picked up by the
+// btf_16_4p_sse2 macro, so it keeps its original name.
+void iadst8_w4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __zero = _mm_setzero_si128();
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 2 weights
+  const __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+  const __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+  const __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+  const __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+  const __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+  const __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+  const __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+  const __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+  // stage 4 weights
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+  // stage 6 weights
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  // stage 1: even slots take inputs 7, 5, 3, 1; odd slots take inputs 0, 2,
+  // 4, 6.
+  __m128i x[8];
+  for (int pair = 0; pair < 4; ++pair) {
+    x[2 * pair] = input[7 - 2 * pair];
+    x[2 * pair + 1] = input[2 * pair];
+  }
+
+  // stage 2: one butterfly per adjacent pair, each with its own weights.
+  btf_16_4p_sse2(cospi_p04_p60, cospi_p60_m04, x[0], x[1], x[0], x[1]);
+  btf_16_4p_sse2(cospi_p20_p44, cospi_p44_m20, x[2], x[3], x[2], x[3]);
+  btf_16_4p_sse2(cospi_p36_p28, cospi_p28_m36, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_p52_p12, cospi_p12_m52, x[6], x[7], x[6], x[7]);
+
+  // stage 3: combine slot k with slot k + 4.
+  for (int lane = 0; lane < 4; ++lane) {
+    btf_16_adds_subs_sse2(x[lane], x[lane + 4]);
+  }
+
+  // stage 4: only the upper half is touched.
+  btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+  btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+
+  // stage 5: within each group of four, combine slot k with slot k + 2.
+  for (int base = 0; base < 8; base += 4) {
+    btf_16_adds_subs_sse2(x[base], x[base + 2]);
+    btf_16_adds_subs_sse2(x[base + 1], x[base + 3]);
+  }
+
+  // stage 6: the last pair of each group of four.
+  for (int base = 2; base < 8; base += 4) {
+    btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[base], x[base + 1], x[base],
+                   x[base + 1]);
+  }
+
+  // stage 7: permute into output; odd-numbered outputs are negated with a
+  // saturating subtraction from zero.
+  static const int8_t src_slot[8] = { 0, 4, 6, 2, 3, 7, 5, 1 };
+  for (int k = 0; k < 8; k += 2) {
+    output[k] = x[src_slot[k]];
+    output[k + 1] = _mm_subs_epi16(__zero, x[src_slot[k + 1]]);
+  }
+}
+
+// Stage 3 of the 16-point inverse ADST helpers: applies
+// btf_16_adds_subs_sse2 to slot k and slot k + 8 for k = 0..7, in place.
+static INLINE void iadst16_stage3_ssse3(__m128i *x) {
+  for (int lane = 0; lane < 8; ++lane) {
+    btf_16_adds_subs_sse2(x[lane], x[lane + 8]);
+  }
+}
+
+// Stage 4 of the 16-point inverse ADST helpers: one weighted butterfly on
+// each adjacent pair of x[8..15], in place; x[0..7] are untouched.
+// __rounding and cos_bit are not named in the body; they are presumably
+// picked up by the btf_16_sse2 macro, so the parameter names must stay.
+static INLINE void iadst16_stage4_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  // w_first[k] / w_second[k] are the two weight vectors for pair k.
+  __m128i w_first[4];
+  __m128i w_second[4];
+  w_first[0] = pair_set_epi16(cospi[8], cospi[56]);    // p08_p56
+  w_second[0] = pair_set_epi16(cospi[56], -cospi[8]);  // p56_m08
+  w_first[1] = pair_set_epi16(cospi[40], cospi[24]);   // p40_p24
+  w_second[1] = pair_set_epi16(cospi[24], -cospi[40]); // p24_m40
+  w_first[2] = pair_set_epi16(-cospi[56], cospi[8]);   // m56_p08
+  w_second[2] = w_first[0];                            // p08_p56
+  w_first[3] = pair_set_epi16(-cospi[24], cospi[40]);  // m24_p40
+  w_second[3] = w_first[1];                            // p40_p24
+
+  for (int pair = 0; pair < 4; ++pair) {
+    const int pos = 8 + 2 * pair;
+    btf_16_sse2(w_first[pair], w_second[pair], x[pos], x[pos + 1], x[pos],
+                x[pos + 1]);
+  }
+}
+
+// Stage 5 of the 16-point inverse ADST helpers: within each half of x
+// (slots 0..7 and 8..15), applies btf_16_adds_subs_sse2 to slot k and
+// slot k + 4, in place.
+static INLINE void iadst16_stage5_ssse3(__m128i *x) {
+  for (int base = 0; base < 16; base += 8) {
+    for (int lane = 0; lane < 4; ++lane) {
+      btf_16_adds_subs_sse2(x[base + lane], x[base + lane + 4]);
+    }
+  }
+}
+
+// Stage 6 of the 16-point inverse ADST helpers: weighted butterflies on
+// pairs (4,5), (6,7), (12,13) and (14,15), in place; the other slots are
+// untouched.
+// __rounding and cos_bit are not named in the body; they are presumably
+// picked up by the btf_16_sse2 macro, so the parameter names must stay.
+static INLINE void iadst16_stage6_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+
+  // The same two butterflies are applied to the upper four slots of each
+  // half of x.
+  for (int base = 4; base < 16; base += 8) {
+    btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[base], x[base + 1], x[base],
+                x[base + 1]);
+    btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x[base + 2], x[base + 3],
+                x[base + 2], x[base + 3]);
+  }
+}
+
+// Stage 7 of the 16-point inverse ADST helpers: within each group of four
+// slots, applies btf_16_adds_subs_sse2 to slot k and slot k + 2, in place.
+static INLINE void iadst16_stage7_ssse3(__m128i *x) {
+  for (int base = 0; base < 16; base += 4) {
+    btf_16_adds_subs_sse2(x[base], x[base + 2]);
+    btf_16_adds_subs_sse2(x[base + 1], x[base + 3]);
+  }
+}
+
+// Stage 8 of the 16-point inverse ADST helpers: the same cospi[32] butterfly
+// on pairs (2,3), (6,7), (10,11) and (14,15), in place; the other slots are
+// untouched.
+// __rounding and cos_bit are not named in the body; they are presumably
+// picked up by the btf_16_sse2 macro, so the parameter names must stay.
+static INLINE void iadst16_stage8_ssse3(__m128i *x, const int32_t *cospi,
+                                        const __m128i __rounding,
+                                        int8_t cos_bit) {
+  const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+  for (int pos = 2; pos < 16; pos += 4) {
+    btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[pos], x[pos + 1], x[pos],
+                x[pos + 1]);
+  }
+}
+
+// Stage 9 of the 16-point inverse ADST helpers: permutes x[0..15] into
+// output[0..15]. Even-numbered outputs are plain copies; odd-numbered
+// outputs are negated with a saturating 16-bit subtraction from zero.
+// Elements are read and stored strictly in output order 0, 1, ..., 15.
+static INLINE void iadst16_stage9_ssse3(__m128i *output, __m128i *x) {
+  // src_slot[k] is the index in x that feeds output[k].
+  static const int8_t src_slot[16] = { 0, 8,  12, 4, 6, 14, 10, 2,
+                                       3, 11, 15, 7, 5, 13, 9,  1 };
+  const __m128i zero = _mm_setzero_si128();
+  for (int k = 0; k < 16; k += 2) {
+    output[k] = x[src_slot[k]];
+    output[k + 1] = _mm_subs_epi16(zero, x[src_slot[k + 1]]);
+  }
+}
+
+static void iadst16_low1_new_ssse3(const __m128i *input, __m128i *output,
+ int8_t cos_bit) { // Reads only input[0]; by its name presumably the 16-point inverse ADST for a lone first coefficient (confirm with callers).
+ (void)cos_bit; // the cosine table below is always built for INV_COS_BIT; cos_bit is named again only when forwarded to stage 8
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1)); // forwarded to stage 8 and presumably also picked up by the btf_16_sse2 macro (confirm)
+
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+
+ // stage 1: the single input goes to x[1]; no other slot is loaded from input
+ __m128i x[16];
+ x[1] = input[0];
+
+ // stage 2: x[0] and x[1] are both produced from x[1] (apparently: two weights, source, two outputs)
+ btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+
+ // stage 3: plain copies (0,1) -> (8,9)
+ x[8] = x[0];
+ x[9] = x[1];
+
+ // stage 4
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+
+ // stage 5: plain copies (0,1) -> (4,5) and (8,9) -> (12,13)
+ x[4] = x[0];
+ x[5] = x[1];
+ x[12] = x[8];
+ x[13] = x[9];
+
+ // stage 6
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+
+ // stage 7: plain copies of each pair into the pair two slots above it, so all 16 slots are set before stage 8
+ x[2] = x[0];
+ x[3] = x[1];
+ x[6] = x[4];
+ x[7] = x[5];
+ x[10] = x[8];
+ x[11] = x[9];
+ x[14] = x[12];
+ x[15] = x[13];
+
+ iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+ iadst16_stage9_ssse3(output, x);
+}
+
+// Reads only input[0..7]; by its name presumably the 16-point inverse ADST
+// for blocks whose upper eight coefficients are zero (confirm with callers).
+// Stage 2 uses the single-source btf_16_ssse3 macro; stages 3 to 9 are the
+// shared iadst16_stage*_ssse3 helpers.
+// cos_bit is only forwarded to the helpers; the cosine table is always taken
+// at INV_COS_BIT.
+static void iadst16_low8_new_ssse3(const __m128i *input, __m128i *output,
+                                   int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 1: odd slots 1, 3, 5, 7 take inputs 0, 2, 4, 6; even slots 8, 10,
+  // 12, 14 take inputs 7, 5, 3, 1. The remaining slots are written by stage 2.
+  __m128i x[16];
+  for (int k = 0; k < 4; ++k) {
+    x[2 * k + 1] = input[2 * k];
+    x[8 + 2 * k] = input[7 - 2 * k];
+  }
+
+  // stage 2: each call names one source slot and two destination slots.
+  btf_16_ssse3(cospi[62], -cospi[2], x[1], x[0], x[1]);
+  btf_16_ssse3(cospi[54], -cospi[10], x[3], x[2], x[3]);
+  btf_16_ssse3(cospi[46], -cospi[18], x[5], x[4], x[5]);
+  btf_16_ssse3(cospi[38], -cospi[26], x[7], x[6], x[7]);
+  btf_16_ssse3(cospi[34], cospi[30], x[8], x[8], x[9]);
+  btf_16_ssse3(cospi[42], cospi[22], x[10], x[10], x[11]);
+  btf_16_ssse3(cospi[50], cospi[14], x[12], x[12], x[13]);
+  btf_16_ssse3(cospi[58], cospi[6], x[14], x[14], x[15]);
+
+  // stage 3~9
+  iadst16_stage3_ssse3(x);
+  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage5_ssse3(x);
+  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage7_ssse3(x);
+  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage9_ssse3(output, x);
+}
+// iadst16 (by its name, the inverse 16-point ADST) over sixteen vectors of
+// 16-bit lanes: reads input[0..15], writes output[0..15].
+// Stage 2 applies one weighted butterfly per adjacent pair; stages 3 to 9 are
+// the shared iadst16_stage*_ssse3 helpers.
+// cos_bit is only forwarded to the helpers; the cosine table is always taken
+// at INV_COS_BIT. __rounding keeps its name because the btf_16_sse2 macro
+// presumably picks it up from scope.
+void iadst16_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int32_t *cospi = cospi_arr(INV_COS_BIT);
+  const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+
+  // stage 2 weights, one (first, second) pair per butterfly
+  const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+  const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+  const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+  const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+  const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+  const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+  const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+  const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+  const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+  const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+  const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+  const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+  const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+  const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+  const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+  const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+  // stage 1: even slots take inputs 15, 13, ..., 1; odd slots take inputs
+  // 0, 2, ..., 14.
+  __m128i x[16];
+  for (int pair = 0; pair < 8; ++pair) {
+    x[2 * pair] = input[15 - 2 * pair];
+    x[2 * pair + 1] = input[2 * pair];
+  }
+
+  // stage 2
+  btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+  btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+  btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+  btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+  btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+  btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+  btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+  btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+  // stage 3~9
+  iadst16_stage3_ssse3(x);
+  iadst16_stage4_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage5_ssse3(x);
+  iadst16_stage6_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage7_ssse3(x);
+  iadst16_stage8_ssse3(x, cospi, __rounding, cos_bit);
+  iadst16_stage9_ssse3(output, x);
+}
+/*
+ * 16-point inverse ADST for the 4-pixel-wide ("w4") path: every rotation goes
+ * through btf_16_4p_sse2, the odd-numbered stages through the shared
+ * iadst16_stageN_ssse3 helpers.
+ *
+ * input   16 vectors of 16-bit coefficients (read-only).
+ * output  16 result vectors, written by iadst16_stage9_ssse3().
+ * cos_bit NOTE(review): cast to void here, but the btf_16_4p_sse2 macro body
+ *         names `cos_bit` and `__rounding` directly. The cosine table and the
+ *         rounding term are built from INV_COS_BIT, so this presumably relies
+ *         on callers passing cos_bit == INV_COS_BIT - TODO confirm.
+ */
+void iadst16_w4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const int32_t *cospi = cospi_arr(INV_COS_BIT);
+ const __m128i __rounding = _mm_set1_epi32(1 << (INV_COS_BIT - 1));
+ // Rotation weights: each constant packs a pair of 16-bit cosine values.
+ const __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ const __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ const __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ const __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ const __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ const __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ const __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ const __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ const __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ const __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ const __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ const __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ const __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ const __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ const __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ const __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+ const __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ const __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ const __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ const __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ const __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ const __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ const __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ const __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ const __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+
+ // stage 1: input permutation - even slots take inputs 15, 13, ..., 1 and
+ // odd slots take inputs 0, 2, ..., 14.
+ __m128i x[16];
+ x[0] = input[15];
+ x[1] = input[0];
+ x[2] = input[13];
+ x[3] = input[2];
+ x[4] = input[11];
+ x[5] = input[4];
+ x[6] = input[9];
+ x[7] = input[6];
+ x[8] = input[7];
+ x[9] = input[8];
+ x[10] = input[5];
+ x[11] = input[10];
+ x[12] = input[3];
+ x[13] = input[12];
+ x[14] = input[1];
+ x[15] = input[14];
+
+ // stage 2: rotate every adjacent pair in place
+ btf_16_4p_sse2(cospi_p02_p62, cospi_p62_m02, x[0], x[1], x[0], x[1]);
+ btf_16_4p_sse2(cospi_p10_p54, cospi_p54_m10, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p18_p46, cospi_p46_m18, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_p26_p38, cospi_p38_m26, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p34_p30, cospi_p30_m34, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p42_p22, cospi_p22_m42, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p50_p14, cospi_p14_m50, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_p58_p06, cospi_p06_m58, x[14], x[15], x[14], x[15]);
+
+ // stage 3
+ iadst16_stage3_ssse3(x);
+
+ // stage 4: rotations touch only x[8..15]
+ btf_16_4p_sse2(cospi_p08_p56, cospi_p56_m08, x[8], x[9], x[8], x[9]);
+ btf_16_4p_sse2(cospi_p40_p24, cospi_p24_m40, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_m56_p08, cospi_p08_p56, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m24_p40, cospi_p40_p24, x[14], x[15], x[14], x[15]);
+
+ // stage 5
+ iadst16_stage5_ssse3(x);
+
+ // stage 6: rotations touch x[4..7] and x[12..15]
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[4], x[5], x[4], x[5]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p16_p48, cospi_p48_m16, x[12], x[13], x[12], x[13]);
+ btf_16_4p_sse2(cospi_m48_p16, cospi_p16_p48, x[14], x[15], x[14], x[15]);
+
+ // stage 7
+ iadst16_stage7_ssse3(x);
+
+ // stage 8: cospi[32] rotation on pairs (2,3), (6,7), (10,11), (14,15)
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[2], x[3], x[2], x[3]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[6], x[7], x[6], x[7]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[10], x[11], x[10], x[11]);
+ btf_16_4p_sse2(cospi_p32_p32, cospi_p32_m32, x[14], x[15], x[14], x[15]);
+
+ // stage 9: the helper writes the final values into output
+ iadst16_stage9_ssse3(output, x);
+}
+
+/*
+ * 4-point inverse identity transform: every vector is scaled by
+ * NewSqrt2 / 2^NewSqrt2Bits, evaluated as in + in * frac with a saturating
+ * add. cos_bit is unused.
+ */
+static void iidentity4_new_ssse3(const __m128i *input, __m128i *output,
+                                 int8_t cos_bit) {
+  // Fractional part of the scale, pre-shifted to Q15 so that
+  // _mm_mulhrs_epi16 performs a rounded multiply.
+  const int16_t frac = (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i frac_q15 = _mm_set1_epi16(frac << (15 - NewSqrt2Bits));
+  (void)cos_bit;
+  int idx = 0;
+  while (idx < 4) {
+    const __m128i scaled = _mm_mulhrs_epi16(input[idx], frac_q15);
+    output[idx] = _mm_adds_epi16(scaled, input[idx]);
+    ++idx;
+  }
+}
+
+/*
+ * 8-point inverse identity transform: doubles each of the 8 vectors with a
+ * saturating 16-bit add. cos_bit is unused.
+ */
+static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
+                                int8_t cos_bit) {
+  (void)cos_bit;
+  int idx = 0;
+  do {
+    const __m128i src = input[idx];
+    output[idx] = _mm_adds_epi16(src, src);
+  } while (++idx < 8);
+}
+
+/*
+ * 16-point inverse identity transform: every vector is scaled by
+ * 2 * NewSqrt2 / 2^NewSqrt2Bits, evaluated as 2 * in + in * frac with
+ * saturating adds. cos_bit is unused.
+ */
+static void iidentity16_new_ssse3(const __m128i *input, __m128i *output,
+                                  int8_t cos_bit) {
+  // Fractional part of the scale beyond 2.0, pre-shifted to Q15 for
+  // _mm_mulhrs_epi16.
+  const int16_t frac = 2 * (NewSqrt2 - (1 << NewSqrt2Bits));
+  const __m128i frac_q15 = _mm_set1_epi16(frac << (15 - NewSqrt2Bits));
+  (void)cos_bit;
+  int idx = 0;
+  while (idx < 16) {
+    const __m128i scaled = _mm_mulhrs_epi16(input[idx], frac_q15);
+    const __m128i doubled = _mm_adds_epi16(input[idx], input[idx]);
+    output[idx] = _mm_adds_epi16(scaled, doubled);
+    ++idx;
+  }
+}
+
+/*
+ * Reconstructs 8 pixels: widens the low 8 bytes of `pred` to 16 bits, adds
+ * the residual `res` with signed saturation and packs back to unsigned
+ * bytes. Both 64-bit halves of the result hold the same 8 pixels.
+ */
+static INLINE __m128i lowbd_get_recon_8x8_sse2(const __m128i pred,
+                                               __m128i res) {
+  const __m128i pred16 = _mm_unpacklo_epi8(pred, _mm_setzero_si128());
+  const __m128i sum = _mm_adds_epi16(res, pred16);
+  return _mm_packus_epi16(sum, sum);
+}
+
+/*
+ * Adds `height` rows of 4 residuals (low lanes of in[]) to the 4-pixel-wide
+ * predictor in `output` and stores the clamped result back. With `flipud`
+ * set, in[] is consumed from the last row to the first.
+ *
+ * NOTE(review): each row is accessed through a uint32_t pointer cast; this
+ * depends on the target tolerating unaligned, type-punned 32-bit accesses.
+ */
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  const __m128i zero = _mm_setzero_si128();
+  const int first = flipud ? (height - 1) : 0;
+  const int dir = flipud ? -1 : 1;
+  for (int row = 0; row < height; ++row) {
+    uint32_t *const dst = (uint32_t *)(output + row * stride);
+    const __m128i pred = _mm_cvtsi32_si128(*dst);
+    const __m128i sum =
+        _mm_adds_epi16(in[first + row * dir], _mm_unpacklo_epi8(pred, zero));
+    *dst = _mm_cvtsi128_si32(_mm_packus_epi16(sum, zero));
+  }
+}
+
+/*
+ * Adds `height` rows of 8 residuals from in[] to the 8-pixel-wide predictor
+ * in `output` and stores the clamped result back. With `flipud` set, in[] is
+ * consumed from the last row to the first.
+ */
+static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  for (int row = 0; row < height; ++row) {
+    const int src_row = flipud ? (height - 1 - row) : row;
+    uint8_t *const dst = output + row * stride;
+    const __m128i pred = _mm_loadl_epi64((__m128i const *)dst);
+    _mm_storel_epi64((__m128i *)dst,
+                     lowbd_get_recon_8x8_sse2(pred, in[src_row]));
+  }
+}
+// 1D functions that process 8 pixels at one time, indexed as
+// [transform size index][ITX_TYPE_1D]; NULL means no kernel for that pair.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w8_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_new_sse2, iadst4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
+ { idct16_new_sse2, iadst16_new_sse2, iidentity16_new_ssse3 },
+ { idct32_new_sse2, NULL, NULL },
+ { idct64_low32_new_ssse3, NULL, NULL },
+ };
+// Indexed as [size index][ITX_TYPE_1D][lowbd_txfm_all_1d_zeros_idx[eobx|eoby]].
+// functions for blocks with eob at DC and within
+// topleft 8x8, 16x16, 32x32 corner; NULL means no kernel for that slot.
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {
+ { idct4_new_sse2, idct4_new_sse2, NULL, NULL },
+ { iadst4_new_sse2, iadst4_new_sse2, NULL, NULL },
+ { iidentity4_new_ssse3, iidentity4_new_ssse3, NULL, NULL },
+ },
+ { { idct8_low1_new_ssse3, idct8_new_sse2, NULL, NULL },
+ { iadst8_low1_new_ssse3, iadst8_new_sse2, NULL, NULL },
+ { iidentity8_new_sse2, iidentity8_new_sse2, NULL, NULL } },
+ {
+ { idct16_low1_new_ssse3, idct16_low8_new_ssse3, idct16_new_sse2,
+ NULL },
+ { iadst16_low1_new_ssse3, iadst16_low8_new_ssse3, iadst16_new_sse2,
+ NULL },
+ { NULL, NULL, NULL, NULL }, // no 16-point identity kernel in this table
+ },
+ { { idct32_low1_new_ssse3, idct32_low8_new_ssse3, idct32_low16_new_ssse3,
+ idct32_new_sse2 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },
+ { { idct64_low1_new_ssse3, idct64_low8_new_ssse3, idct64_low16_new_ssse3,
+ idct64_low32_new_ssse3 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+// Indexed as [size index][ITX_TYPE_1D]; NULL means no 4-pixel kernel exists.
+// 1D functions that process 4 pixels at one time.
+// used in 4x4, 4x8, 4x16, 8x4, 16x4
+static const transform_1d_ssse3
+ lowbd_txfm_all_1d_w4_arr[TX_SIZES][ITX_TYPES_1D] = {
+ { idct4_w4_new_sse2, iadst4_w4_new_sse2, iidentity4_new_ssse3 },
+ { idct8_w4_new_sse2, iadst8_w4_new_sse2, iidentity8_new_sse2 },
+ { idct16_w4_new_sse2, iadst16_w4_new_sse2, iidentity16_new_ssse3 },
+ { NULL, NULL, NULL },
+ { NULL, NULL, NULL },
+ };
+
+// Computes (src * scale + rounding) >> bits on eight 16-bit lanes, where
+// scale_rounding interleaves the 16-bit scale and rounding terms, and
+// saturates the result back to 16 bits.
+static INLINE __m128i iidentity_row_scale_8_ssse3(__m128i src,
+                                                  __m128i scale_rounding,
+                                                  int bits) {
+  const __m128i one = _mm_set1_epi16(1);
+  // Pairing each sample with 1 makes _mm_madd_epi16 yield
+  // src * scale + 1 * rounding in every 32-bit lane.
+  __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(src, one), scale_rounding);
+  __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(src, one), scale_rounding);
+  lo = _mm_srai_epi32(lo, bits);
+  hi = _mm_srai_epi32(hi, bits);
+  return _mm_packs_epi32(lo, hi);
+}
+
+/*
+ * Identity row pass for one 8-column strip: reads `height` rows of 32-bit
+ * coefficients (row pitch `stride`), scales them by NewSqrt2list[txw_idx]
+ * and folds the rounding right-shift by -shift into the same step.
+ *
+ * out        receives one vector of eight 16-bit results per row.
+ * shift      combined into the shift count NewSqrt2Bits - shift and into the
+ *            rounding term; assumes shift <= 0 - TODO confirm against
+ *            inv_txfm_shift_ls.
+ * rect_type  when +1 or -1, each sample is first multiplied by NewInvSqrt2
+ *            (Q15 rounded multiply).
+ */
+static INLINE void iidentity_row_8xn_ssse3(__m128i *out, const int32_t *input,
+                                           int stride, int shift, int height,
+                                           int txw_idx, int rect_type) {
+  const int32_t *input_row = input;
+  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txw_idx]);
+  const __m128i rounding = _mm_set1_epi16((1 << (NewSqrt2Bits - 1)) +
+                                          (1 << (NewSqrt2Bits - shift - 1)));
+  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
+  const int bits = NewSqrt2Bits - shift;
+  if (rect_type != 1 && rect_type != -1) {
+    for (int i = 0; i < height; ++i) {
+      const __m128i src = load_32bit_to_16bit(input_row);
+      input_row += stride;
+      out[i] = iidentity_row_scale_8_ssse3(src, scale_rounding, bits);
+    }
+  } else {
+    const __m128i rect_scale =
+        _mm_set1_epi16(NewInvSqrt2 << (15 - NewSqrt2Bits));
+    for (int i = 0; i < height; ++i) {
+      __m128i src = load_32bit_to_16bit(input_row);
+      src = _mm_mulhrs_epi16(src, rect_scale);
+      input_row += stride;
+      out[i] = iidentity_row_scale_8_ssse3(src, scale_rounding, bits);
+    }
+  }
+}
+
+/*
+ * Identity column pass + reconstruction for one 8-column strip: scales each
+ * row of `buf` by NewSqrt2list[txh_idx], applies a rounding right-shift by
+ * -shift, adds the predictor already in `output` and stores 8 clamped pixels
+ * per row.
+ *
+ * shift must be negative: -shift is used as the shift count and
+ * 1 << (-shift - 1) as the rounding term.
+ */
+static INLINE void iidentity_col_8xn_ssse3(uint8_t *output, int stride,
+                                           __m128i *buf, int shift, int height,
+                                           int txh_idx) {
+  const __m128i scale = _mm_set1_epi16(NewSqrt2list[txh_idx]);
+  const __m128i scale_rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m128i shift_rounding = _mm_set1_epi32(1 << (-shift - 1));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale_coeff = _mm_unpacklo_epi16(scale, scale_rounding);
+  for (int h = 0; h < height; ++h) {
+    // Pairing each sample with 1 makes _mm_madd_epi16 add the rounding term.
+    __m128i lo = _mm_unpacklo_epi16(buf[h], one);
+    __m128i hi = _mm_unpackhi_epi16(buf[h], one);
+    lo = _mm_madd_epi16(lo, scale_coeff);
+    hi = _mm_madd_epi16(hi, scale_coeff);
+    lo = _mm_srai_epi32(lo, NewSqrt2Bits);
+    hi = _mm_srai_epi32(hi, NewSqrt2Bits);
+    lo = _mm_add_epi32(lo, shift_rounding);
+    hi = _mm_add_epi32(hi, shift_rounding);
+    lo = _mm_srai_epi32(lo, -shift);
+    hi = _mm_srai_epi32(hi, -shift);
+    const __m128i res = _mm_packs_epi32(lo, hi);
+
+    // Same add-and-clamp step as the other 8-wide writers in this file.
+    const __m128i pred = _mm_loadl_epi64((__m128i const *)(output));
+    _mm_storel_epi64((__m128i *)(output), lowbd_get_recon_8x8_sse2(pred, res));
+    output += stride;
+  }
+}
+
+/*
+ * Inverse IDTX (identity in both directions) + reconstruction. The block is
+ * handled in 8-column strips; width and height are both capped at 32 when
+ * walking the coefficients.
+ */
+static INLINE void lowbd_inv_txfm2d_add_idtx_ssse3(const int32_t *input,
+                                                   uint8_t *output, int stride,
+                                                   TX_SIZE tx_size) {
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int coeff_stride = AOMMIN(32, width);
+  const int rows = AOMMIN(32, height);
+  const int rect_type = get_rect_tx_log_ratio(width, height);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  __m128i strip[32];
+
+  for (int col = 0; col + 8 <= coeff_stride; col += 8) {
+    iidentity_row_8xn_ssse3(strip, input + col, coeff_stride, shift[0], rows,
+                            txw_idx, rect_type);
+    iidentity_col_8xn_ssse3(output + col, stride, strip, shift[1], rows,
+                            txh_idx);
+  }
+}
+/*
+ * Inverse 2D transform + reconstruction for 4x4 blocks.
+ *
+ * Loads the 32-bit coefficients as 16-bit, transposes, runs the row kernel,
+ * optionally flips left-right, transposes again, runs the column kernel,
+ * rounds by shift[1] and adds the result to the predictor in `output`.
+ * tx_size_ and eob are ignored; the size is fixed to TX_4X4.
+ * NOTE(review): shift[0] is never applied here - presumably 0 for TX_4X4.
+ */
+void lowbd_inv_txfm2d_add_4x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ // Both passes use the 4-pixel-wide kernels.
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x4(buf, buf);
+ row_txfm(buf, buf, cos_bit_row);
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+/*
+ * Reconstructs 16 pixels: widens the 16 predictor bytes to two 16-bit
+ * vectors, adds res0 to the low 8 and res1 to the high 8 with signed
+ * saturation, then packs both back to unsigned bytes.
+ */
+static INLINE __m128i lowbd_get_recon_16x16_sse2(const __m128i pred,
+                                                 __m128i res0, __m128i res1) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i lo = _mm_adds_epi16(res0, _mm_unpacklo_epi8(pred, zero));
+  const __m128i hi = _mm_adds_epi16(res1, _mm_unpackhi_epi8(pred, zero));
+  return _mm_packus_epi16(lo, hi);
+}
+
+/*
+ * Adds `height` rows of 16 residuals to the 16-pixel-wide predictor in
+ * `output`. Row r takes its left 8 residuals from in[r] and its right 8 from
+ * in[r + height]; with `flipud` set, rows are consumed last to first.
+ */
+static INLINE void lowbd_write_buffer_16xn_sse2(__m128i *in, uint8_t *output,
+                                                int stride, int flipud,
+                                                int height) {
+  for (int row = 0; row < height; ++row) {
+    const int src_row = flipud ? (height - 1 - row) : row;
+    uint8_t *const dst = output + row * stride;
+    const __m128i pred = _mm_loadu_si128((__m128i const *)dst);
+    const __m128i recon =
+        lowbd_get_recon_16x16_sse2(pred, in[src_row], in[src_row + height]);
+    _mm_storeu_si128((__m128i *)dst, recon);
+  }
+}
+
+/*
+ * Multiplies `size` vectors by the constant NewInvSqrt2 * 8 using the
+ * rounded Q15 multiply _mm_mulhrs_epi16. Callers in this file use it as the
+ * "rect special code" scaling step for 2:1 rectangular blocks.
+ */
+static INLINE void round_shift_ssse3(const __m128i *input, __m128i *output,
+                                     int size) {
+  const __m128i inv_sqrt2_q15 = _mm_set1_epi16(NewInvSqrt2 * 8);
+  int idx = 0;
+  while (idx < size) {
+    output[idx] = _mm_mulhrs_epi16(input[idx], inv_sqrt2_q15);
+    ++idx;
+  }
+}
+/*
+ * Inverse 2D transform + reconstruction when neither pass is the identity.
+ *
+ * Pass 1 walks the coefficient rows in groups of 8, limited to the non-zero
+ * extent derived from eob (eobx/eoby), runs the row kernel and transposes the
+ * result into buf1 so that each 8-column strip is contiguous. Pass 2 runs the
+ * column kernel on every strip and adds the result to the predictor.
+ *
+ * NOTE(review): buf0/buf1 are not cleared, and only the non-zero region is
+ * loaded; the reduced ("lowN") kernels picked through fun_idx_x / fun_idx_y
+ * presumably read only that region - confirm in the kernels.
+ * Only widths of 8 and >= 16 are written; other widths fall through silently.
+ */
+static INLINE void lowbd_inv_txfm2d_add_no_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64 * 8];
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ // Pick the 1D kernels matching the non-zero extent in each direction.
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1 + i * 8; // rows 8*i .. 8*i+7 of every strip
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp,
+ _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+ }
+ }
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ }
+ // Add the residual to the predictor, 16 columns at a time when possible.
+ if (txfm_size_col >= 16) {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row);
+ }
+ } else if (txfm_size_col == 8) {
+ lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+ }
+}
+
+/*
+ * Inverse 2D transform + reconstruction for transform types whose row
+ * (horizontal) pass is the identity; the dispatcher in this file routes
+ * V_DCT, V_ADST and V_FLIPADST here.
+ *
+ * Works on 8-column strips: the identity row pass scales the strip, the
+ * column kernel selected from eoby transforms it, and the result is rounded
+ * by shift[1] and added to the predictor already in `output`.
+ *
+ * input    32-bit coefficients, row pitch AOMMIN(32, block width).
+ * output   8-bit destination, updated in place; `stride` is its row pitch.
+ * tx_type  selects the column kernel and the up/down flip.
+ * eob      passed to get_eobx_eoby_scan_h_identity() to bound the work.
+ */
+static INLINE void lowbd_inv_txfm2d_add_h_identity_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = (eobx + 8) >> 3;
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
+  // The kernel table's innermost dimension has 4 entries, so the largest
+  // valid index is 3.
+  assert(fun_idx < 4);
+  const transform_1d_ssse3 col_txfm =
+      lowbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx];
+
+  assert(col_txfm != NULL);
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Loop-invariant: the rounding right-shift by -shift[1] expressed as a Q15
+  // multiplier for _mm_mulhrs_epi16.
+  const __m128i mshift = _mm_set1_epi16(1 << (15 + shift[1]));
+  const int first_row = ud_flip ? (txfm_size_row - 1) : 0;
+  const int step = ud_flip ? -1 : 1;
+
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    __m128i buf0[64];
+    iidentity_row_8xn_ssse3(buf0, input + 8 * i, input_stride, shift[0],
+                            eoby + 1, txw_idx, rect_type);
+    col_txfm(buf0, buf0, cos_bit_col);
+    int k = first_row;
+    uint8_t *out = output + 8 * i;
+    for (int j = 0; j < txfm_size_row; ++j, k += step) {
+      const __m128i v = _mm_loadl_epi64((__m128i const *)(out));
+      const __m128i res = _mm_mulhrs_epi16(buf0[k], mshift);
+      const __m128i u = lowbd_get_recon_8x8_sse2(v, res);
+      _mm_storel_epi64((__m128i *)(out), u);
+      out += stride;
+    }
+  }
+}
+/*
+ * Inverse 2D transform + reconstruction for transform types whose column
+ * (vertical) pass is the identity; the dispatcher in this file routes H_DCT,
+ * H_ADST and H_FLIPADST here.
+ *
+ * Rows are handled in groups of 8, limited by eoby: load, transpose, run the
+ * row kernel, transpose back into buf1, then let iidentity_col_8xn_ssse3()
+ * scale each 8x8 tile and add it to the predictor.
+ * At most 4 groups of 8 columns (32 coefficients per row) are loaded.
+ */
+static INLINE void lowbd_inv_txfm2d_add_v_identity_ssse3(
+ const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob) {
+ __m128i buf1[64];
+ int eobx, eoby;
+ get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ // Row kernel specialized for the non-zero extent along x.
+ const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ for (int i = 0; i < buf_size_h_div8; i++) {
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < AOMMIN(4, buf_size_w_div8); ++j) {
+ __m128i *buf0_cur = buf0 + j * 8;
+ load_buffer_32bit_to_16bit(input_row + j * 8, input_stride, buf0_cur, 8);
+ transpose_16bit_8x8(buf0_cur, buf0_cur);
+ }
+ if (rect_type == 1 || rect_type == -1) {
+ round_shift_ssse3(buf0, buf0, input_stride); // rect special code
+ }
+ row_txfm(buf0, buf0, cos_bit_row);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
+ __m128i *_buf1 = buf1; // buf1 is reused for every group of 8 rows
+ if (lr_flip) {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ __m128i temp[8];
+ flip_buf_sse2(buf0 + 8 * j, temp, 8);
+ transpose_16bit_8x8(temp, _buf1 + 8 * (buf_size_w_div8 - 1 - j));
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_16bit_8x8(buf0 + 8 * j, _buf1 + 8 * j);
+ }
+ }
+ // Identity column pass + add, one 8x8 tile at a time.
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ iidentity_col_8xn_ssse3(output + i * 8 * stride + j * 8, stride,
+ buf1 + j * 8, shift[1], 8, txh_idx);
+ }
+ }
+}
+
+// Generic entry point, used for 32x32, 32x64, 64x32, 64x64, 32x8, 8x32,
+// 16x32, 32x16, 64x16 and 16x64. Routes each transform type to the matching
+// driver: IDTX, identity-row types (V_*), identity-column types (H_*), and
+// everything else (DCT_DCT included) to the no-identity driver.
+static INLINE void lowbd_inv_txfm2d_add_universe_ssse3(
+    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
+    TX_SIZE tx_size, int eob) {
+  switch (tx_type) {
+    case IDTX:
+      lowbd_inv_txfm2d_add_idtx_ssse3(input, output, stride, tx_size);
+      return;
+    case V_DCT:
+    case V_ADST:
+    case V_FLIPADST:
+      lowbd_inv_txfm2d_add_h_identity_ssse3(input, output, stride, tx_type,
+                                            tx_size, eob);
+      return;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      lowbd_inv_txfm2d_add_v_identity_ssse3(input, output, stride, tx_type,
+                                            tx_size, eob);
+      return;
+    default: break;
+  }
+  lowbd_inv_txfm2d_add_no_identity_ssse3(input, output, stride, tx_type,
+                                         tx_size, eob);
+}
+/*
+ * Inverse 2D transform + reconstruction for 4x8 blocks (4 wide, 8 high).
+ *
+ * The row pass uses the 8-pixel kernels (all 8 rows at once), the column
+ * pass the 4-pixel kernels. The rectangular scaling (round_shift_ssse3) is
+ * applied before the row pass. tx_size_ and eob are ignored.
+ */
+void lowbd_inv_txfm2d_add_4x8_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x8(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+/*
+ * Inverse 2D transform + reconstruction for 8x4 blocks (8 wide, 4 high).
+ *
+ * The row pass uses the 4-pixel kernels (only 4 rows exist), the column
+ * pass the 8-pixel kernels. The rectangular scaling (round_shift_ssse3) is
+ * applied before the row pass. tx_size_ and eob are ignored.
+ */
+void lowbd_inv_txfm2d_add_8x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_8x4(buf, buf);
+ round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+/*
+ * Inverse 2D transform + reconstruction for 4x16 blocks (4 wide, 16 high).
+ *
+ * The 16 rows are row-transformed in two batches of 8 with the 8-pixel
+ * kernels; the column pass then runs once over all 16 vectors with the
+ * 4-pixel kernels. No rectangular scaling step is applied in this function.
+ * tx_size_ and eob are ignored.
+ */
+void lowbd_inv_txfm2d_add_4x16_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w8_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w4_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ // Row pass: two batches of 8 rows each.
+ const int row_one_loop = 8;
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+ row_one_loop);
+ transpose_16bit_4x8(buf_cur, buf_cur);
+ row_txfm(buf_cur, buf_cur, cos_bit_row);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+/*
+ * Inverse 2D transform + reconstruction for 16x4 blocks (16 wide, 4 high).
+ *
+ * The two 8-column halves are loaded and transposed separately, the row pass
+ * runs once over all 16 vectors with the 4-pixel kernels, and the column
+ * pass runs per half with the 8-pixel kernels. No rectangular scaling step
+ * is applied in this function. tx_size_ and eob are ignored.
+ */
+void lowbd_inv_txfm2d_add_16x4_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size_, int eob) {
+ (void)tx_size_;
+ (void)eob;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_ssse3 row_txfm =
+ lowbd_txfm_all_1d_w4_arr[txw_idx][hitx_1d_tab[tx_type]];
+ const transform_1d_ssse3 col_txfm =
+ lowbd_txfm_all_1d_w8_arr[txh_idx][vitx_1d_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
+ for (int i = 0; i < buf_size_w_div8; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+ txfm_size_row);
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
+ if (lr_flip) {
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4); // left half
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4); // right half
+}
+
+/*
+ * Low-bitdepth inverse 2D transform + reconstruction, dispatched on tx_size:
+ * the five sizes with a 4-sample dimension have dedicated routines, every
+ * other size goes through lowbd_inv_txfm2d_add_universe_ssse3().
+ */
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+                                    int stride, TX_TYPE tx_type,
+                                    TX_SIZE tx_size, int eob) {
+  switch (tx_size) {
+    case TX_4X4:
+      lowbd_inv_txfm2d_add_4x4_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      return;
+    case TX_4X8:
+      lowbd_inv_txfm2d_add_4x8_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      return;
+    case TX_8X4:
+      lowbd_inv_txfm2d_add_8x4_ssse3(input, output, stride, tx_type, tx_size,
+                                     eob);
+      return;
+    case TX_4X16:
+      lowbd_inv_txfm2d_add_4x16_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+      return;
+    case TX_16X4:
+      lowbd_inv_txfm2d_add_16x4_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+      return;
+    default: break;
+  }
+  lowbd_inv_txfm2d_add_universe_ssse3(input, output, stride, tx_type, tx_size,
+                                      eob);
+}
+/*
+ * Inverse transform + add entry point. Lossless blocks are handed to the C
+ * implementation; everything else uses the SSSE3 low-bitdepth path with the
+ * type, size and eob taken from txfm_param.
+ */
+void av1_inv_txfm_add_ssse3(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
+                            const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  if (txfm_param->lossless) {
+    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
+    return;
+  }
+  av1_lowbd_inv_txfm2d_add_ssse3(dqcoeff, dst, stride, tx_type,
+                                 txfm_param->tx_size, txfm_param->eob);
+}
diff --git a/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
new file mode 100644
index 0000000000..66bd339d11
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+#define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
+
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h> // SSSE3
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Single-input butterfly: out0 = in * w0 and out1 = in * w1, each computed
+// with the rounded Q15 multiply _mm_mulhrs_epi16 after scaling the weight by
+// 8. `in` is copied first, so out0/out1 may alias it. Arguments are
+// parenthesized so that expression arguments expand safely.
+// NOTE(review): the factor 8 presumably lifts 12-bit cosine weights to Q15 -
+// confirm against cospi_arr().
+#define btf_16_ssse3(w0, w1, in, out0, out1)        \
+  do {                                              \
+    const __m128i _w0 = _mm_set1_epi16((w0) * 8);   \
+    const __m128i _w1 = _mm_set1_epi16((w1) * 8);   \
+    const __m128i _in = (in);                       \
+    out0 = _mm_mulhrs_epi16(_in, _w0);              \
+    out1 = _mm_mulhrs_epi16(_in, _w1);              \
+  } while (0)
+/*
+ * In-place saturating butterfly: in0 becomes in0 + in1 and in1 becomes
+ * in0 - in1, both computed from the values held on entry.
+ */
+#define btf_16_adds_subs_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+/*
+ * Same results as btf_16_adds_subs_sse2 (in0 = in0 + in1, in1 = in0 - in1
+ * from the entry values); only the order of the two stores differs.
+ */
+#define btf_16_subs_adds_sse2(in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ in1 = _mm_subs_epi16(_in0, _in1); \
+ in0 = _mm_adds_epi16(_in0, _in1); \
+ } while (0)
+/*
+ * Out-of-place saturating butterfly: out0 = in0 + in1, out1 = in0 - in1.
+ * The inputs are copied first, so the outputs may alias them.
+ */
+#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
+ do { \
+ const __m128i _in0 = in0; \
+ const __m128i _in1 = in1; \
+ out0 = _mm_adds_epi16(_in0, _in1); \
+ out1 = _mm_subs_epi16(_in0, _in1); \
+ } while (0)
+
+/*
+ * Shifts `size` vectors of 16-bit lanes in place. A negative `bit` is a
+ * rounding right-shift by -bit (done as a Q15 multiply by 2^(15 + bit)), a
+ * positive `bit` is a plain left shift, and 0 leaves the data untouched.
+ */
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  if (bit == 0) return;
+  if (bit > 0) {
+    for (int idx = 0; idx < size; ++idx) {
+      in[idx] = _mm_slli_epi16(in[idx], bit);
+    }
+    return;
+  }
+  const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
+  for (int idx = 0; idx < size; ++idx) {
+    in[idx] = _mm_mulhrs_epi16(in[idx], scale);
+  }
+}
+// 1D itx types; the value is the middle index of the 1D kernel tables.
+// IFLIPADST_1D aliases IADST_1D, so both select the same kernel.
+typedef enum ATTRIBUTE_PACKED {
+ IDCT_1D,
+ IADST_1D,
+ IFLIPADST_1D = IADST_1D,
+ IIDENTITY_1D,
+ ITX_TYPES_1D,
+} ITX_TYPE_1D;
+// 1D kernel type for the column (vertical) pass, indexed by TX_TYPE.
+static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IADST_1D, IDCT_1D, IADST_1D,
+ IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D,
+ IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D,
+ IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
+};
+// 1D kernel type for the row (horizontal) pass, indexed by TX_TYPE.
+static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
+ IDCT_1D, IDCT_1D, IADST_1D, IADST_1D,
+ IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
+ IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
+ IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D,
+};
+/*
+ * Lookup tables for get_eobx_eoby_scan_default(). Each table is indexed by
+ * (eob - 1) >> log2(width used for eob) and yields a packed bound:
+ * low byte = eobx, high byte = eoby.
+ * NOTE(review): the name suffix looks like width x height (the 8x16 table
+ * reaches eoby 0x0f while eobx stays 0x07) - confirm.
+ */
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x16_default[16]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x32_default[32]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+ 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
+ 0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_16x32_default[32]) = {
+ 0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
+ 0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+ 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+ av1_eob_to_eobxy_32x16_default[16]) = {
+ 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+ 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
+ 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
+ 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+ 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
+ 0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
+};
+/*
+ * Per-TX_SIZE pointer to the packed eobx/eoby table used by
+ * get_eobx_eoby_scan_default(), which dereferences the entry without a NULL
+ * check.
+ * NOTE(review): the NULL slots look like the sizes with a 4-sample dimension,
+ * which av1_lowbd_inv_txfm2d_add_ssse3() sends to dedicated routines -
+ * confirm against the TX_SIZE enum order.
+ */
+DECLARE_ALIGNED(16, static const int16_t *,
+ av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
+ NULL,
+ av1_eob_to_eobxy_8x8_default,
+ av1_eob_to_eobxy_16x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x16_default,
+ av1_eob_to_eobxy_16x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+ av1_eob_to_eobxy_32x32_default,
+ av1_eob_to_eobxy_32x32_default,
+ NULL,
+ NULL,
+ av1_eob_to_eobxy_8x32_default,
+ av1_eob_to_eobxy_32x8_default,
+ av1_eob_to_eobxy_16x32_default,
+ av1_eob_to_eobxy_32x16_default,
+};
+/*
+ * Maps eobx / eoby (0..31) to the last index of
+ * lowbd_txfm_all_1d_zeros_w8_arr: 0 for 0, 1 for 1..7, 2 for 8..15 and
+ * 3 for 16..31.
+ */
+static const int lowbd_txfm_all_1d_zeros_idx[32] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+};
+// Transform block width in log2 for eob, indexed by TX_SIZE
+// (a width of 64 maps to 32, i.e. the entry is 5).
+static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
+ 2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
+};
+
+/*
+ * Derives the non-zero extent (eobx, eoby) for the default scan. eob == 1
+ * yields (0, 0); otherwise the row of the last coefficient,
+ * (eob - 1) >> log2(width), selects a packed entry from the per-size table:
+ * low byte = eobx, high byte = eoby.
+ */
+static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
+                                              TX_SIZE tx_size, int eob) {
+  int packed = 0;
+  if (eob != 1) {
+    const int last_row = (eob - 1) >> tx_size_wide_log2_eob[tx_size];
+    packed = av1_eob_to_eobxy_default[tx_size][last_row];
+  }
+  *eobx = packed & 0xFF;
+  *eoby = packed >> 8;
+}
+
+// Rounds a 0-based position up to the last index of its bucket:
+// 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31. Read-only lookup table, so it
+// is const.
+static const int eob_fill[32] = {
+  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
+  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
+};
+
+/*
+ * Derives (eobx, eoby) for the scan used when the row pass is the identity.
+ * With last = eob - 1 and width capped at 32: eobx is width - 1 once last
+ * reaches it, else eob_fill[last]; eoby is eob_fill[last / width].
+ */
+static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int width = AOMMIN(32, tx_size_wide[tx_size]);
+  const int last_row = last / width;
+  if (last >= width - 1) {
+    *eobx = width - 1;
+  } else {
+    *eobx = eob_fill[last];
+  }
+  assert(last_row < 32);
+  *eoby = eob_fill[last_row];
+}
+
+/*
+ * Derives (eobx, eoby) for the scan used when the column pass is the
+ * identity. With last = eob - 1 and height capped at 32: eobx is
+ * last / height (not bucketed); eoby is height - 1 once last reaches it,
+ * else eob_fill[last].
+ */
+static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
+                                                 TX_SIZE tx_size, int eob) {
+  const int last = eob - 1;
+  const int height = AOMMIN(32, tx_size_high[tx_size]);
+  *eobx = last / height;
+  if (last >= height - 1) {
+    *eoby = height - 1;
+  } else {
+    *eoby = eob_fill[last];
+  }
+}
+// Signature shared by the 1D inverse-transform kernels held in lookup tables.
+typedef void (*transform_1d_ssse3)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+/*
+ * Low-bitdepth inverse 2D transform: transforms the 32-bit coefficients in
+ * `input` and adds the result into the 8-bit buffer `output` (row pitch
+ * `stride`), dispatching on tx_size. `eob` lets the implementation limit
+ * work to the non-zero region.
+ */
+void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse2.h b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
new file mode 100644
index 0000000000..77aeb6eb13
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse2.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Butterfly on the low four 16-bit lanes of *in0 and *in1.
+// For every lane i it forms the pair (in0[i], in1[i]), multiplies it by the
+// weight pairs in *w0 and *w1, adds __rounding, shifts right by cos_bit and
+// saturates the results back to 16 bits.
+static INLINE void btf_16_w4_sse2(
+    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
+    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
+    __m128i *const out0, __m128i *const out1) {
+  const __m128i pairs = _mm_unpacklo_epi16(*in0, *in1);
+
+  const __m128i res0 = _mm_srai_epi32(
+      _mm_add_epi32(_mm_madd_epi16(pairs, *w0), __rounding), cos_bit);
+  const __m128i res1 = _mm_srai_epi32(
+      _mm_add_epi32(_mm_madd_epi16(pairs, *w1), __rounding), cos_bit);
+
+  *out0 = _mm_packs_epi32(res0, res0);
+  // NOTE(review): the upper half of *out1 carries the w0 result, whereas
+  // btf_16_4p_sse2 duplicates the w1 result. Presumably only the low four
+  // lanes are consumed by callers -- confirm before changing.
+  *out1 = _mm_packs_epi32(res1, res0);
+}
+
+// Butterfly on the low four 16-bit lanes of in0 and in1.
+// Expands to a block that uses `__rounding` and `cos_bit` from the expanding
+// scope; both must be defined there. out0 and out1 are assigned directly, each
+// holding its four results duplicated in the low and high halves.
+// The block declares locals t0, u0, v0, a0, b0, c0 and d0, so arguments must
+// not use those names.
+#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c0); \
+ out1 = _mm_packs_epi32(d0, d0); \
+ }
+
+// Butterfly on all eight 16-bit lanes of in0 and in1.
+// The low and high halves are interleaved separately, multiplied by the weight
+// pairs w0 / w1, rounded with `__rounding`, shifted right by `cos_bit` and
+// packed (with signed saturation) into out0 and out1.
+// `__rounding` and `cos_bit` are taken from the expanding scope. The block
+// declares locals t0/t1, u0/u1, v0/v1, a0/a1, b0/b1, c0/c1 and d0/d1, so
+// arguments must not use those names.
+#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
+ { \
+ __m128i t0 = _mm_unpacklo_epi16(in0, in1); \
+ __m128i t1 = _mm_unpackhi_epi16(in0, in1); \
+ __m128i u0 = _mm_madd_epi16(t0, w0); \
+ __m128i u1 = _mm_madd_epi16(t1, w0); \
+ __m128i v0 = _mm_madd_epi16(t0, w1); \
+ __m128i v1 = _mm_madd_epi16(t1, w1); \
+ \
+ __m128i a0 = _mm_add_epi32(u0, __rounding); \
+ __m128i a1 = _mm_add_epi32(u1, __rounding); \
+ __m128i b0 = _mm_add_epi32(v0, __rounding); \
+ __m128i b1 = _mm_add_epi32(v1, __rounding); \
+ \
+ __m128i c0 = _mm_srai_epi32(a0, cos_bit); \
+ __m128i c1 = _mm_srai_epi32(a1, cos_bit); \
+ __m128i d0 = _mm_srai_epi32(b0, cos_bit); \
+ __m128i d1 = _mm_srai_epi32(b1, cos_bit); \
+ \
+ out0 = _mm_packs_epi32(c0, c1); \
+ out1 = _mm_packs_epi32(d0, d1); \
+ }
+
+// Loads eight int16 values with an aligned 128-bit load (a must be 16-byte
+// aligned).
+static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
+  const __m128i *const src = (const __m128i *)a;
+  return _mm_load_si128(src);
+}
+
+// Loads eight int32 values (two aligned 128-bit loads) and narrows them to
+// eight int16 values with signed saturation.
+static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
+  const __m128i *const src = (const __m128i *)a;
+  const __m128i first4 = _mm_load_si128(src);
+  const __m128i next4 = _mm_load_si128(src + 1);
+  return _mm_packs_epi32(first4, next4);
+}
+
+// Loads four int32 values (aligned) and narrows them to int16 with signed
+// saturation; the four results appear in both halves of the returned vector.
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+  const __m128i four = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(four, four);
+}
+
+// Sign-extends the low four 16-bit lanes of a to 32 bits and stores them to b
+// with an aligned store.
+static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
+  // Duplicating each lane into a 32-bit slot and shifting right arithmetically
+  // by 16 leaves the sign-extended value.
+  const __m128i doubled = _mm_unpacklo_epi16(a, a);
+  _mm_store_si128((__m128i *)b, _mm_srai_epi32(doubled, 16));
+}
+
+// Sign-extends all eight 16-bit lanes of a to 32 bits and stores them to
+// b[0..7] with two aligned stores.
+static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
+  // Duplicate each lane into a 32-bit slot, then shift right arithmetically by
+  // 16 to leave the sign-extended value.
+  const __m128i ext_lo = _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+  const __m128i ext_hi = _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
+  __m128i *const dst = (__m128i *)b;
+  _mm_store_si128(dst, ext_lo);
+  _mm_store_si128(dst + 1, ext_hi);
+}
+
+// Multiply-adds every 16-bit pair of a with the pair
+// (scale, 1 << (NewSqrt2Bits - 1)) and shifts the 32-bit sums right by
+// NewSqrt2Bits. The callers in this header pass (x, 1) pairs, so each lane
+// becomes (x * scale + rounding) >> NewSqrt2Bits.
+static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
+  const int rounding = 1 << (NewSqrt2Bits - 1);
+  const __m128i factors = pair_set_epi16(scale, rounding);
+  return _mm_srai_epi32(_mm_madd_epi16(a, factors), NewSqrt2Bits);
+}
+
+// Scales the low four 16-bit lanes of a by NewSqrt2 (with rounding, via
+// scale_round_sse2) and stores the four 32-bit results to b (aligned store).
+static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
+                                                int32_t *const b) {
+  // Pair every value with a constant 1 so the multiply-add can fold in the
+  // rounding term.
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i paired = _mm_unpacklo_epi16(a, ones);
+  _mm_store_si128((__m128i *)b, scale_round_sse2(paired, NewSqrt2));
+}
+
+// Scales all eight 16-bit lanes of a by NewSqrt2 (with rounding, via
+// scale_round_sse2) and stores the eight 32-bit results to b[0..7] using two
+// aligned stores.
+static INLINE void store_rect_16bit_to_32bit(const __m128i a,
+                                             int32_t *const b) {
+  // Pair every value with a constant 1 so the multiply-add can fold in the
+  // rounding term.
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i scaled_lo =
+      scale_round_sse2(_mm_unpacklo_epi16(a, ones), NewSqrt2);
+  const __m128i scaled_hi =
+      scale_round_sse2(_mm_unpackhi_epi16(a, ones), NewSqrt2);
+  __m128i *const dst = (__m128i *)b;
+  _mm_store_si128(dst, scaled_lo);
+  _mm_store_si128(dst + 1, scaled_hi);
+}
+
+// Loads out_size rows of four int16 values each (64-bit loads, rows `stride`
+// elements apart) into out[0..out_size-1]. _mm_loadl_epi64 zeroes the upper
+// half of every vector.
+static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
+                                                 const int stride,
+                                                 __m128i *const out,
+                                                 const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int16_t *const src = in + row * stride;
+    out[row] = _mm_loadl_epi64((const __m128i *)src);
+    ++row;
+  }
+}
+
+// Same as load_buffer_16bit_to_16bit_w4 but stores the rows in reverse order:
+// input row i lands in out[out_size - 1 - i].
+static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
+                                                      const int stride,
+                                                      __m128i *const out,
+                                                      const int out_size) {
+  for (int row = 0, dst = out_size - 1; row < out_size; ++row, --dst) {
+    out[dst] = _mm_loadl_epi64((const __m128i *)(in + row * stride));
+  }
+}
+
+// Loads out_size rows of eight int16 values each (rows `stride` elements
+// apart) into out[0..out_size-1] using load_16bit_to_16bit.
+static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int16_t *const src = in + row * stride;
+    out[row] = load_16bit_to_16bit(src);
+    ++row;
+  }
+}
+
+// Same as load_buffer_16bit_to_16bit but stores the rows in reverse order:
+// input row i lands in out[out_size - 1 - i].
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int row = 0, dst = out_size - 1; row < out_size; ++row, --dst) {
+    out[dst] = load_16bit_to_16bit(in + row * stride);
+  }
+}
+
+// Loads out_size rows of eight int32 values each (rows `stride` elements
+// apart), narrowing every row to int16 with load_32bit_to_16bit.
+static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
+                                              __m128i *out, int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int32_t *const src = in + row * stride;
+    out[row] = load_32bit_to_16bit(src);
+    ++row;
+  }
+}
+
+// Loads out_size rows of four int32 values each (rows `stride` elements
+// apart), narrowing every row to int16 with load_32bit_to_16bit_w4.
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+                                                 __m128i *out, int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    const int32_t *const src = in + row * stride;
+    out[row] = load_32bit_to_16bit_w4(src);
+    ++row;
+  }
+}
+
+// Same as load_buffer_32bit_to_16bit but stores the rows in reverse order:
+// input row i lands in out[out_size - 1 - i].
+static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int row = 0, dst = out_size - 1; row < out_size; ++row, --dst) {
+    out[dst] = load_32bit_to_16bit(in + row * stride);
+  }
+}
+
+// Writes out_size rows: the low four 16-bit lanes of in[i] are sign-extended
+// and stored at out + i * stride.
+static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    int32_t *const dst = out + row * stride;
+    store_16bit_to_32bit_w4(in[row], dst);
+    ++row;
+  }
+}
+
+// Writes out_size rows: all eight 16-bit lanes of in[i] are sign-extended and
+// stored at out + i * stride.
+static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                  int32_t *const out,
+                                                  const int stride,
+                                                  const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    int32_t *const dst = out + row * stride;
+    store_16bit_to_32bit(in[row], dst);
+    ++row;
+  }
+}
+
+// Writes out_size rows: the low four lanes of in[i] are scaled with
+// store_rect_16bit_to_32bit_w4 and stored at out + i * stride.
+static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    int32_t *const dst = out + row * stride;
+    store_rect_16bit_to_32bit_w4(in[row], dst);
+    ++row;
+  }
+}
+
+// Writes out_size rows: all eight lanes of in[i] are scaled with
+// store_rect_16bit_to_32bit and stored at out + i * stride.
+static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
+                                                       int32_t *const out,
+                                                       const int stride,
+                                                       const int out_size) {
+  int row = 0;
+  while (row < out_size) {
+    int32_t *const dst = out + row * stride;
+    store_rect_16bit_to_32bit(in[row], dst);
+    ++row;
+  }
+}
+
+// Stores eight vectors of eight 16-bit values as the rows of an 8x8 block.
+// Uses aligned stores, so every row address (out + i * stride) must be
+// 16-byte aligned.
+static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
+                                                   uint16_t *out,
+                                                   const int stride) {
+  int row = 0;
+  while (row < 8) {
+    uint16_t *const dst = out + row * stride;
+    _mm_store_si128((__m128i *)dst, in[row]);
+    ++row;
+  }
+}
+
+// Shifts every 16-bit lane of in[0..size-1] in place.
+//   bit > 0: left shift by bit.
+//   bit < 0: rounding arithmetic right shift by -bit (the rounding add
+//            saturates).
+//   bit == 0: no change.
+static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
+  if (bit == 0) return;
+
+  if (bit > 0) {
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);
+    }
+    return;
+  }
+
+  const int shift = -bit;
+  const __m128i half = _mm_set1_epi16(1 << (shift - 1));
+  for (int i = 0; i < size; ++i) {
+    const __m128i biased = _mm_adds_epi16(in[i], half);
+    in[i] = _mm_srai_epi16(biased, shift);
+  }
+}
+
+// Copies in[0..size-1] to out in reverse order. The loop reads in[i] after
+// earlier iterations have written out[], so in and out should not overlap.
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+  for (int src = 0, dst = size - 1; src < size; ++src, --dst) {
+    out[dst] = in[src];
+  }
+}
+
+// Declarations of the SSE2 low-bit-depth forward 2-D transforms, one per
+// block size (the WxH in each name). All share the same signature; the
+// definitions are not part of this header.
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+// Pointer to a 1-D transform kernel that reads an array of 128-bit vectors and
+// writes an array of 128-bit vectors; cos_bit is passed through to the kernel.
+typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+
+// Pairs the column (vertical) and row (horizontal) 1-D kernels of a 2-D
+// transform.
+typedef struct {
+ transform_1d_sse2 col, row; // vertical and horizontal
+} transform_2d_sse2;
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.c b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
new file mode 100644
index 0000000000..90b9879cc4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.c
@@ -0,0 +1,21 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+// In-place round-shift of an int32 array, processed four values at a time.
+// Only size / 4 whole vectors are handled; a tail of size % 4 elements, if
+// any, is left untouched. arr is accessed as __m128i.
+void av1_round_shift_array_sse4_1(int32_t *arr, int size, int bit) {
+  __m128i *const lanes = (__m128i *)arr;
+  const int num_vectors = size >> 2;
+  av1_round_shift_array_32_sse4_1(lanes, lanes, num_vectors, bit);
+}
diff --git a/third_party/aom/av1/common/x86/av1_txfm_sse4.h b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
new file mode 100644
index 0000000000..6cad821b1b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/av1_txfm_sse4.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
+
+#include <smmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Rounding arithmetic right shift of four 32-bit lanes:
+// (vec + (1 << (bit - 1))) >> bit. bit must be at least 1, otherwise the
+// rounding constant's shift count is negative.
+static INLINE __m128i av1_round_shift_32_sse4_1(__m128i vec, int bit) {
+  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
+  return _mm_srai_epi32(_mm_add_epi32(vec, half), bit);
+}
+
+// Applies a shift to `size` vectors of four 32-bit lanes.
+//   bit > 0:  rounding right shift by bit.
+//   bit <= 0: left shift by -bit (a plain copy when bit == 0).
+// input and output may be the same array.
+static INLINE void av1_round_shift_array_32_sse4_1(__m128i *input,
+                                                   __m128i *output,
+                                                   const int size,
+                                                   const int bit) {
+  for (int i = 0; i < size; i++) {
+    output[i] = (bit > 0) ? av1_round_shift_32_sse4_1(input[i], bit)
+                          : _mm_slli_epi32(input[i], -bit);
+  }
+}
+
+// Like av1_round_shift_array_32_sse4_1, followed by a multiplication of every
+// lane by `val` and a rounding right shift by NewSqrt2Bits.
+//   bit > 0:  rounding right shift by bit before scaling.
+//   bit <= 0: left shift by -bit before scaling.
+static INLINE void av1_round_shift_rect_array_32_sse4_1(__m128i *input,
+                                                        __m128i *output,
+                                                        const int size,
+                                                        const int bit,
+                                                        const int val) {
+  const __m128i factor = _mm_set1_epi32(val);
+  for (int i = 0; i < size; i++) {
+    const __m128i shifted = (bit > 0)
+                                ? av1_round_shift_32_sse4_1(input[i], bit)
+                                : _mm_slli_epi32(input[i], -bit);
+    const __m128i scaled = _mm_mullo_epi32(factor, shifted);
+    output[i] = av1_round_shift_32_sse4_1(scaled, NewSqrt2Bits);
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_COMMON_X86_AV1_TXFM_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/cfl_avx2.c b/third_party/aom/av1/common/x86/cfl_avx2.c
new file mode 100644
index 0000000000..a8bfdcce6b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_avx2.c
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
+ }
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  // maddubs against a vector of twos gives 2 * (left + right) for each
+  // horizontal pixel pair: 32 input bytes become sixteen 16-bit sums.
+  const __m256i pair_weight = _mm256_set1_epi8(2);
+  const int two_rows = input_stride << 1;
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  // One output row is produced for every two input rows.
+  const __m256i *const out_end = out_row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    const __m256i upper = _mm256_loadu_si256((__m256i *)input);
+    const __m256i lower =
+        _mm256_loadu_si256((__m256i *)(input + input_stride));
+
+    const __m256i upper_sum = _mm256_maddubs_epi16(upper, pair_weight);
+    const __m256i lower_sum = _mm256_maddubs_epi16(lower, pair_weight);
+    _mm256_storeu_si256(out_row, _mm256_add_epi16(upper_sum, lower_sum));
+
+    input += two_rows;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  // maddubs against a vector of fours gives 4 * (left + right) for each
+  // horizontal pixel pair: 32 input bytes become sixteen 16-bit sums.
+  const __m256i pair_weight = _mm256_set1_epi8(4);
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  const __m256i *const out_end = out_row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i pixels = _mm256_loadu_si256((__m256i *)input);
+    _mm256_storeu_si256(out_row, _mm256_maddubs_epi16(pixels, pair_weight));
+    input += input_stride;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3). The AVX2 subsampling is only
+ * performed on block of width 32.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_444_lbd_avx2(const uint8_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  const __m256i zero = _mm256_setzero_si256();
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  const __m256i *const out_end = out_row + height * CFL_BUF_LINE_I256;
+  do {
+    __m256i pixels = _mm256_loadu_si256((__m256i *)input);
+    // The byte unpacks below work within each 128-bit lane; swapping the two
+    // middle 64-bit quarters first keeps the widened pixels in order.
+    pixels = _mm256_permute4x64_epi64(pixels, _MM_SHUFFLE(3, 1, 2, 0));
+
+    // Zero-extend to 16 bits and multiply by 8 (Q3).
+    const __m256i first16 =
+        _mm256_slli_epi16(_mm256_unpacklo_epi8(pixels, zero), 3);
+    const __m256i last16 =
+        _mm256_slli_epi16(_mm256_unpackhi_epi8(pixels, zero), 3);
+
+    _mm256_storeu_si256(out_row, first16);
+    _mm256_storeu_si256(out_row + 1, last16);
+
+    input += input_stride;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, lbd)
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
+ */
+static void cfl_luma_subsampling_420_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  const int two_rows = input_stride << 1;
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  // One output row is produced for every two input rows.
+  const __m256i *const out_end = out_row + (height >> 1) * CFL_BUF_LINE_I256;
+  do {
+    // Vertical sums of the first and the second 16 pixels of the row pair.
+    const __m256i left_sum =
+        _mm256_add_epi16(_mm256_loadu_si256((__m256i *)input),
+                         _mm256_loadu_si256((__m256i *)(input + input_stride)));
+    const __m256i right_sum = _mm256_add_epi16(
+        _mm256_loadu_si256((__m256i *)(input + 16)),
+        _mm256_loadu_si256((__m256i *)(input + 16 + input_stride)));
+
+    // Horizontal pair sums; hadd works per 128-bit lane, so the 64-bit
+    // quarters are reordered afterwards.
+    __m256i box_sum = _mm256_hadd_epi16(left_sum, right_sum);
+    box_sum = _mm256_permute4x64_epi64(box_sum, _MM_SHUFFLE(3, 1, 2, 0));
+    // Double the 2x2 sum.
+    box_sum = _mm256_add_epi16(box_sum, box_sum);
+
+    _mm256_storeu_si256(out_row, box_sum);
+
+    input += two_rows;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, hbd)
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ */
+static void cfl_luma_subsampling_422_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  const __m256i *const out_end = out_row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i first16 = _mm256_loadu_si256((__m256i *)input);
+    const __m256i last16 = _mm256_loadu_si256((__m256i *)(input + 16));
+
+    // Horizontal pair sums; hadd works per 128-bit lane, so the 64-bit
+    // quarters are reordered afterwards.
+    __m256i pair_sum = _mm256_hadd_epi16(first16, last16);
+    pair_sum = _mm256_permute4x64_epi64(pair_sum, _MM_SHUFFLE(3, 1, 2, 0));
+    // Multiply the pair sum by 4.
+    pair_sum = _mm256_slli_epi16(pair_sum, 2);
+
+    _mm256_storeu_si256(out_row, pair_sum);
+
+    input += input_stride;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, hbd)
+
+// Multiplies 32 16-bit pixels per row by 8 (Q3 scaling) and writes them to the
+// CfL prediction buffer, one CFL_BUF_LINE_I256 stride per row.
+static void cfl_luma_subsampling_444_hbd_avx2(const uint16_t *input,
+                                              int input_stride,
+                                              uint16_t *pred_buf_q3, int width,
+                                              int height) {
+  (void)width;  // Forever 32
+  __m256i *out_row = (__m256i *)pred_buf_q3;
+  const __m256i *const out_end = out_row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i first16 = _mm256_loadu_si256((__m256i *)input);
+    const __m256i last16 = _mm256_loadu_si256((__m256i *)(input + 16));
+    _mm256_storeu_si256(out_row, _mm256_slli_epi16(first16, 3));
+    _mm256_storeu_si256(out_row + 1, _mm256_slli_epi16(last16, 3));
+    input += input_stride;
+    out_row += CFL_BUF_LINE_I256;
+  } while (out_row < out_end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(444, hbd)
+
+// Computes dc_q0 + sign(alpha * ac) * mulhrs(|ac|, alpha_q12) for sixteen
+// 16-bit AC values loaded (unaligned) from input. The result is not clamped.
+static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
+                                        __m256i alpha_sign, __m256i dc_q0) {
+  const __m256i ac_q3 = _mm256_loadu_si256(input);
+  // Sign of the product alpha * ac (zero where ac is zero).
+  const __m256i product_sign = _mm256_sign_epi16(alpha_sign, ac_q3);
+  // Scale the magnitude, then re-apply the sign.
+  const __m256i magnitude =
+      _mm256_mulhrs_epi16(_mm256_abs_epi16(ac_q3), alpha_q12);
+  const __m256i scaled_luma_q0 = _mm256_sign_epi16(magnitude, product_sign);
+  return _mm256_add_epi16(scaled_luma_q0, dc_q0);
+}
+
+// 8-bit CfL prediction for 32-pixel-wide rows. The DC value is taken from the
+// first pixel of dst; each output row is the unsigned-saturated result of
+// predict_unclipped on two 16-value halves of the AC buffer row.
+static INLINE void cfl_predict_lbd_avx2(const int16_t *pred_buf_q3,
+                                        uint8_t *dst, int dst_stride,
+                                        int alpha_q3, int width, int height) {
+  (void)width;
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  // |alpha| moved from Q3 to Q12 for the mulhrs in predict_unclipped.
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+  __m256i *ac_row = (__m256i *)pred_buf_q3;
+  const __m256i *const ac_end = ac_row + height * CFL_BUF_LINE_I256;
+
+  do {
+    const __m256i first16 =
+        predict_unclipped(ac_row, alpha_q12, alpha_sign, dc_q0);
+    const __m256i last16 =
+        predict_unclipped(ac_row + 1, alpha_q12, alpha_sign, dc_q0);
+    // packus works per 128-bit lane; reorder the 64-bit quarters so the 32
+    // bytes come out in pixel order.
+    __m256i packed = _mm256_packus_epi16(first16, last16);
+    packed = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
+    _mm256_storeu_si256((__m256i *)dst, packed);
+    dst += dst_stride;
+    ac_row += CFL_BUF_LINE_I256;
+  } while (ac_row < ac_end);
+}
+
+// Instantiate the width-32 low-bit-depth AVX2 predictors referenced by
+// get_predict_lbd_fn_avx2.
+// NOTE(review): the hbd instantiations further down omit the trailing ';' --
+// confirm whether it is needed here.
+CFL_PREDICT_X(avx2, 32, 8, lbd);
+CFL_PREDICT_X(avx2, 32, 16, lbd);
+CFL_PREDICT_X(avx2, 32, 32, lbd);
+
+// Returns the low-bit-depth CfL predictor for tx_size: AVX2 for width 32,
+// SSSE3 for narrower blocks, the null function for sizes invalid for CfL.
+cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = {
+    predict_lbd_4x4_ssse3,   // 4x4
+    predict_lbd_8x8_ssse3,   // 8x8
+    predict_lbd_16x16_ssse3, // 16x16
+    predict_lbd_32x32_avx2,  // 32x32
+    cfl_predict_lbd_null,    // 64x64 (invalid CFL size)
+    predict_lbd_4x8_ssse3,   // 4x8
+    predict_lbd_8x4_ssse3,   // 8x4
+    predict_lbd_8x16_ssse3,  // 8x16
+    predict_lbd_16x8_ssse3,  // 16x8
+    predict_lbd_16x32_ssse3, // 16x32
+    predict_lbd_32x16_avx2,  // 32x16
+    cfl_predict_lbd_null,    // 32x64 (invalid CFL size)
+    cfl_predict_lbd_null,    // 64x32 (invalid CFL size)
+    predict_lbd_4x16_ssse3,  // 4x16
+    predict_lbd_16x4_ssse3,  // 16x4
+    predict_lbd_8x32_ssse3,  // 8x32
+    predict_lbd_32x8_avx2,   // 32x8
+    cfl_predict_lbd_null,    // 16x64 (invalid CFL size)
+    cfl_predict_lbd_null,    // 64x16 (invalid CFL size)
+  };
+  // Reduce the index modulo TX_SIZES_ALL so that an attacker-controlled
+  // tx_size cannot index the function pointer table out of bounds.
+  const int index = tx_size % TX_SIZES_ALL;
+  return pred[index];
+}
+
+// Returns a vector whose sixteen 16-bit lanes all hold (1 << bd) - 1, computed
+// as (-1 << bd) ^ -1.
+static __m256i highbd_max_epi16(int bd) {
+  const __m256i all_ones = _mm256_set1_epi16(-1);
+  const __m256i high_bits = _mm256_slli_epi16(all_ones, bd);
+  return _mm256_xor_si256(high_bits, all_ones);
+}
+
+// Clamps every signed 16-bit lane of u: first capped at max, then raised to at
+// least zero.
+static __m256i highbd_clamp_epi16(__m256i u, __m256i zero, __m256i max) {
+  const __m256i capped = _mm256_min_epi16(u, max);
+  return _mm256_max_epi16(capped, zero);
+}
+
+// High-bit-depth CfL prediction for rows of 16 or 32 pixels. The DC vector is
+// loaded from the first sixteen pixels of dst; every result is clamped to
+// [0, (1 << bd) - 1] before being stored.
+static INLINE void cfl_predict_hbd_avx2(const int16_t *pred_buf_q3,
+                                        uint16_t *dst, int dst_stride,
+                                        int alpha_q3, int bd, int width,
+                                        int height) {
+  // Use SSSE3 version for smaller widths
+  assert(width == 16 || width == 32);
+  const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
+  // |alpha| moved from Q3 to Q12 for the mulhrs in predict_unclipped.
+  const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
+  const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
+  const __m256i max = highbd_max_epi16(bd);
+
+  __m256i *ac_row = (__m256i *)pred_buf_q3;
+  const __m256i *const ac_end = ac_row + height * CFL_BUF_LINE_I256;
+  do {
+    const __m256i first16 =
+        predict_unclipped(ac_row, alpha_q12, alpha_sign, dc_q0);
+    _mm256_storeu_si256(
+        (__m256i *)dst,
+        highbd_clamp_epi16(first16, _mm256_setzero_si256(), max));
+    if (width == 32) {
+      const __m256i last16 =
+          predict_unclipped(ac_row + 1, alpha_q12, alpha_sign, dc_q0);
+      _mm256_storeu_si256(
+          (__m256i *)(dst + 16),
+          highbd_clamp_epi16(last16, _mm256_setzero_si256(), max));
+    }
+    dst += dst_stride;
+    ac_row += CFL_BUF_LINE_I256;
+  } while (ac_row < ac_end);
+}
+
+// Instantiate the width-16 and width-32 high-bit-depth AVX2 predictors
+// referenced by get_predict_hbd_fn_avx2.
+CFL_PREDICT_X(avx2, 16, 4, hbd)
+CFL_PREDICT_X(avx2, 16, 8, hbd)
+CFL_PREDICT_X(avx2, 16, 16, hbd)
+CFL_PREDICT_X(avx2, 16, 32, hbd)
+CFL_PREDICT_X(avx2, 32, 8, hbd)
+CFL_PREDICT_X(avx2, 32, 16, hbd)
+CFL_PREDICT_X(avx2, 32, 32, hbd)
+
+// Returns the high-bit-depth CfL predictor for tx_size: AVX2 for widths 16 and
+// 32, SSSE3 for narrower blocks, the null function for sizes invalid for CfL.
+cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = {
+    predict_hbd_4x4_ssse3,  // 4x4
+    predict_hbd_8x8_ssse3,  // 8x8
+    predict_hbd_16x16_avx2, // 16x16
+    predict_hbd_32x32_avx2, // 32x32
+    cfl_predict_hbd_null,   // 64x64 (invalid CFL size)
+    predict_hbd_4x8_ssse3,  // 4x8
+    predict_hbd_8x4_ssse3,  // 8x4
+    predict_hbd_8x16_ssse3, // 8x16
+    predict_hbd_16x8_avx2,  // 16x8
+    predict_hbd_16x32_avx2, // 16x32
+    predict_hbd_32x16_avx2, // 32x16
+    cfl_predict_hbd_null,   // 32x64 (invalid CFL size)
+    cfl_predict_hbd_null,   // 64x32 (invalid CFL size)
+    predict_hbd_4x16_ssse3, // 4x16
+    predict_hbd_16x4_avx2,  // 16x4
+    predict_hbd_8x32_ssse3, // 8x32
+    predict_hbd_32x8_avx2,  // 32x8
+    cfl_predict_hbd_null,   // 16x64 (invalid CFL size)
+    cfl_predict_hbd_null,   // 64x16 (invalid CFL size)
+  };
+  // Reduce the index modulo TX_SIZES_ALL so that an attacker-controlled
+  // tx_size cannot index the function pointer table out of bounds.
+  const int index = tx_size % TX_SIZES_ALL;
+  return pred[index];
+}
+
+// Returns a vector in which every 32-bit element holds the sum of all eight
+// 32-bit elements of a.
+static INLINE __m256i fill_sum_epi32(__m256i a) {
+  // a == [A, B, C, D, E, F, G, H]
+  const __m256i pairs = _mm256_hadd_epi32(a, a);
+  // With A' = A + B, C' = C + D, E' = E + F, G' = G + H:
+  // pairs == [A', C', A', C', E', G', E', G']
+  const __m256i spread =
+      _mm256_permute4x64_epi64(pairs, _MM_SHUFFLE(3, 1, 2, 0));
+  // spread == [A', C', E', G', A', C', E', G']
+  const __m256i quads = _mm256_hadd_epi32(spread, spread);
+  // With A'' = A' + C' and E'' = E' + G':
+  // quads == [A'', E'', A'', E'', A'', E'', A'', E'']
+  // The final hadd leaves A'' + E'' (the full sum) in every element.
+  return _mm256_hadd_epi32(quads, quads);
+}
+
+// Zero-extends the sixteen 16-bit elements of a to 32 bits and adds the
+// unpacked low and high groups, giving eight 32-bit sums of two elements each.
+static INLINE __m256i _mm256_addl_epi16(__m256i a) {
+  const __m256i widened_lo = _mm256_unpacklo_epi16(a, _mm256_setzero_si256());
+  const __m256i widened_hi = _mm256_unpackhi_epi16(a, _mm256_setzero_si256());
+  return _mm256_add_epi32(widened_lo, widened_hi);
+}
+
+// Subtracts the rounded block average from every value of a CfL buffer.
+//   src_ptr:      input values, rows CFL_BUF_LINE_I256 vectors apart.
+//   dst_ptr:      output (value - average), same row layout as the input.
+//   width:        16 or 32 (asserted).
+//   height:       number of rows; the summing loop consumes two rows per
+//                 iteration.
+//   round_offset, num_pel_log2:
+//                 average = (sum + round_offset) >> num_pel_log2.
+static INLINE void subtract_average_avx2(const uint16_t *src_ptr,
+                                         int16_t *dst_ptr, int width,
+                                         int height, int round_offset,
+                                         int num_pel_log2) {
+  // Use SSE2 version for smaller widths
+  assert(width == 16 || width == 32);
+
+  const __m256i *src = (__m256i *)src_ptr;
+  const __m256i *const end = src + height * CFL_BUF_LINE_I256;
+  // To maximize usage of the AVX2 registers, we sum two rows per loop
+  // iteration
+  const int step = 2 * CFL_BUF_LINE_I256;
+
+  __m256i sum = _mm256_setzero_si256();
+  // For width 32, we use a second sum accumulator to reduce accumulator
+  // dependencies in the loop. It is zeroed unconditionally so that it is never
+  // seen as possibly uninitialized; for width 16 it simply stays unused.
+  __m256i sum2 = _mm256_setzero_si256();
+
+  do {
+    // Add top row to the bottom row
+    __m256i l0 = _mm256_add_epi16(_mm256_loadu_si256(src),
+                                  _mm256_loadu_si256(src + CFL_BUF_LINE_I256));
+    sum = _mm256_add_epi32(sum, _mm256_addl_epi16(l0));
+    if (width == 32) { /* Don't worry, this `if` gets optimized out. */
+      // Add the second part of the top row to the second part of the bottom row
+      __m256i l1 =
+          _mm256_add_epi16(_mm256_loadu_si256(src + 1),
+                           _mm256_loadu_si256(src + 1 + CFL_BUF_LINE_I256));
+      sum2 = _mm256_add_epi32(sum2, _mm256_addl_epi16(l1));
+    }
+    src += step;
+  } while (src < end);
+  // Combine both sum accumulators
+  if (width == 32) sum = _mm256_add_epi32(sum, sum2);
+
+  // Broadcast the total to every 32-bit element.
+  __m256i fill = fill_sum_epi32(sum);
+
+  __m256i avg_epi16 = _mm256_srli_epi32(
+      _mm256_add_epi32(fill, _mm256_set1_epi32(round_offset)), num_pel_log2);
+  // Narrow to 16 bits so the average can be subtracted lane by lane.
+  avg_epi16 = _mm256_packs_epi32(avg_epi16, avg_epi16);
+
+  // Store and subtract loop
+  src = (__m256i *)src_ptr;
+  __m256i *dst = (__m256i *)dst_ptr;
+  do {
+    _mm256_storeu_si256(dst,
+                        _mm256_sub_epi16(_mm256_loadu_si256(src), avg_epi16));
+    if (width == 32) {
+      _mm256_storeu_si256(
+          dst + 1, _mm256_sub_epi16(_mm256_loadu_si256(src + 1), avg_epi16));
+    }
+    src += CFL_BUF_LINE_I256;
+    dst += CFL_BUF_LINE_I256;
+  } while (src < end);
+}
+
+// Declare wrappers for AVX2 sizes
+// Arguments: arch, width, height, then two constants that equal
+// (width * height) / 2 and log2(width * height) -- presumably forwarded as
+// round_offset and num_pel_log2 of subtract_average_avx2; confirm against the
+// CFL_SUB_AVG_X definition.
+CFL_SUB_AVG_X(avx2, 16, 4, 32, 6)
+CFL_SUB_AVG_X(avx2, 16, 8, 64, 7)
+CFL_SUB_AVG_X(avx2, 16, 16, 128, 8)
+CFL_SUB_AVG_X(avx2, 16, 32, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 8, 128, 8)
+CFL_SUB_AVG_X(avx2, 32, 16, 256, 9)
+CFL_SUB_AVG_X(avx2, 32, 32, 512, 10)
+
+// Based on the observation that for small blocks AVX2 does not outperform
+// SSE2, we call the SSE2 code for block widths 4 and 8.
+// Returns the subtract-average kernel for tx_size: AVX2 for widths 16 and 32,
+// SSE2 for widths 4 and 8, the null function for sizes invalid for CfL.
+cfl_subtract_average_fn get_subtract_average_fn_avx2(TX_SIZE tx_size) {
+  static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
+    subtract_average_4x4_sse2,   // 4x4
+    subtract_average_8x8_sse2,   // 8x8
+    subtract_average_16x16_avx2, // 16x16
+    subtract_average_32x32_avx2, // 32x32
+    cfl_subtract_average_null,   // 64x64 (invalid CFL size)
+    subtract_average_4x8_sse2,   // 4x8
+    subtract_average_8x4_sse2,   // 8x4
+    subtract_average_8x16_sse2,  // 8x16
+    subtract_average_16x8_avx2,  // 16x8
+    subtract_average_16x32_avx2, // 16x32
+    subtract_average_32x16_avx2, // 32x16
+    cfl_subtract_average_null,   // 32x64 (invalid CFL size)
+    cfl_subtract_average_null,   // 64x32 (invalid CFL size)
+    subtract_average_4x16_sse2,  // 4x16
+    subtract_average_16x4_avx2,  // 16x4
+    subtract_average_8x32_sse2,  // 8x32
+    subtract_average_32x8_avx2,  // 32x8
+    cfl_subtract_average_null,   // 16x64 (invalid CFL size)
+    cfl_subtract_average_null,   // 64x16 (invalid CFL size)
+  };
+  // Reduce the index modulo TX_SIZES_ALL so that an attacker-controlled
+  // tx_size cannot index the function pointer table out of bounds.
+  const int index = tx_size % TX_SIZES_ALL;
+  return sub_avg[index];
+}
diff --git a/third_party/aom/av1/common/x86/cfl_simd.h b/third_party/aom/av1/common/x86/cfl_simd.h
new file mode 100644
index 0000000000..3b342cd4e4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_simd.h
@@ -0,0 +1,243 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_CFL_SIMD_H_
+#define AOM_AV1_COMMON_X86_CFL_SIMD_H_
+
+#include "av1/common/blockd.h"
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSSE3 version is faster for width == 16, we reuse it in AVX2
+void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride,
+ uint16_t *output_q3);
+
+// SSE2 version is optimal for width == 4, we reuse them in AVX2
+void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst);
+
+// SSE2 version is optimal for width == 8, we reuse them in AVX2
+void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst);
+void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst);
+
+void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, int alpha_q3);
+
+void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, int alpha_q3, int bd);
+
+#endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_
diff --git a/third_party/aom/av1/common/x86/cfl_sse2.c b/third_party/aom/av1/common/x86/cfl_sse2.c
new file mode 100644
index 0000000000..4783fe098c
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_sse2.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "av1/common/cfl.h"
+#include "config/av1_rtcd.h"
+
+// Returns a vector in which each of the four 32-bit lanes holds the sum of the
+// four 32-bit lanes of l0.
+static INLINE __m128i fill_sum_epi32(__m128i l0) {
+  // Swap the 64-bit halves and add: every lane now holds a two-lane sum.
+  const __m128i swapped_halves =
+      _mm_shuffle_epi32(l0, _MM_SHUFFLE(1, 0, 3, 2));
+  const __m128i pair_sums = _mm_add_epi32(l0, swapped_halves);
+  // Swap neighbouring lanes and add to complete the reduction in every lane.
+  const __m128i swapped_neighbours =
+      _mm_shuffle_epi32(pair_sums, _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_add_epi32(pair_sums, swapped_neighbours);
+}
+
+static INLINE void subtract_average_sse2(const uint16_t *src_ptr,
+ int16_t *dst_ptr, int width,
+ int height, int round_offset,
+ int num_pel_log2) {  // Writes src - ((sum of the block + round_offset) >> num_pel_log2) to dst for a width x height block.
+ const __m128i zeros = _mm_setzero_si128();
+ const __m128i round_offset_epi32 = _mm_set1_epi32(round_offset);
+ const __m128i *src = (__m128i *)src_ptr;
+ const __m128i *const end = src + height * CFL_BUF_LINE_I128;  // One past the last row; consecutive rows are CFL_BUF_LINE_I128 vectors apart.
+ const int step = CFL_BUF_LINE_I128 * (1 + (width == 8) + 3 * (width == 4));  // Rows summed per iteration: 4 for width 4, 2 for width 8, otherwise 1.
+
+ __m128i sum = zeros;  // Four 32-bit partial sums.
+ do {
+ __m128i l0;
+ if (width == 4) {
+ l0 = _mm_add_epi16(_mm_loadl_epi64(src),  // 64-bit loads: four 16-bit samples from each of four consecutive rows.
+ _mm_loadl_epi64(src + CFL_BUF_LINE_I128));
+ __m128i l1 = _mm_add_epi16(_mm_loadl_epi64(src + 2 * CFL_BUF_LINE_I128),
+ _mm_loadl_epi64(src + 3 * CFL_BUF_LINE_I128));
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),  // Interleaving with zeros widens the 16-bit sums to 32 bits.
+ _mm_unpacklo_epi16(l1, zeros)));
+ } else {
+ if (width == 8) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src),  // Eight samples from each of two consecutive rows.
+ _mm_loadu_si128(src + CFL_BUF_LINE_I128));
+ } else {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src), _mm_loadu_si128(src + 1));  // First sixteen samples of a single row.
+ }
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ if (width == 32) {
+ l0 = _mm_add_epi16(_mm_loadu_si128(src + 2), _mm_loadu_si128(src + 3));  // Remaining sixteen samples of the row.
+ sum = _mm_add_epi32(sum, _mm_add_epi32(_mm_unpacklo_epi16(l0, zeros),
+ _mm_unpackhi_epi16(l0, zeros)));
+ }
+ }
+ src += step;
+ } while (src < end);
+
+ sum = fill_sum_epi32(sum);  // Every 32-bit lane now holds the total of the four partial sums.
+
+ __m128i avg_epi16 =
+ _mm_srli_epi32(_mm_add_epi32(sum, round_offset_epi32), num_pel_log2);  // (total + round_offset) >> num_pel_log2, logical shift.
+ avg_epi16 = _mm_packs_epi32(avg_epi16, avg_epi16);  // Narrow to 16 bits (signed saturation) so all eight lanes hold the average.
+
+ src = (__m128i *)src_ptr;  // Second pass: rewind to the first row and subtract the average.
+ __m128i *dst = (__m128i *)dst_ptr;
+ do {
+ if (width == 4) {
+ _mm_storel_epi64(dst, _mm_sub_epi16(_mm_loadl_epi64(src), avg_epi16));  // Only 64 bits (four samples) are read and written per row.
+ } else {
+ _mm_storeu_si128(dst, _mm_sub_epi16(_mm_loadu_si128(src), avg_epi16));
+ if (width > 8) {
+ _mm_storeu_si128(dst + 1,
+ _mm_sub_epi16(_mm_loadu_si128(src + 1), avg_epi16));
+ if (width == 32) {
+ _mm_storeu_si128(dst + 2,
+ _mm_sub_epi16(_mm_loadu_si128(src + 2), avg_epi16));
+ _mm_storeu_si128(dst + 3,
+ _mm_sub_epi16(_mm_loadu_si128(src + 3), avg_epi16));
+ }
+ }
+ }
+ src += CFL_BUF_LINE_I128;  // One row per iteration here, unlike the summing pass.
+ dst += CFL_BUF_LINE_I128;
+ } while (src < end);
+}
+
+CFL_SUB_AVG_FN(sse2)  // Macro defined outside this chunk; presumably emits the subtract_average_WxH_sse2 wrappers declared in cfl_simd.h -- confirm.
diff --git a/third_party/aom/av1/common/x86/cfl_ssse3.c b/third_party/aom/av1/common/x86/cfl_ssse3.c
new file mode 100644
index 0000000000..bbf0072955
--- /dev/null
+++ b/third_party/aom/av1/common/x86/cfl_ssse3.c
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/cfl.h"
+
+#include "av1/common/x86/cfl_simd.h"
+
+// Load 32-bit integer from memory into the first element of dst.
+//
+// The four bytes are fetched with memcpy so that mem_addr may point at pixel
+// data of any type and alignment. Dereferencing it through an int pointer
+// would be undefined behavior (misaligned access, incompatible aliasing).
+static INLINE __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
+  int value;
+  memcpy(&value, mem_addr, sizeof(value));
+  return _mm_cvtsi32_si128(value);
+}
+
+// Store 32-bit integer from the first element of a into memory.
+//
+// The four bytes are written with memcpy so that mem_addr may point at pixel
+// data of any type and alignment. Assigning through an int pointer would be
+// undefined behavior (misaligned access, incompatible aliasing). The const
+// qualifier on mem_addr is kept so the signature stays unchanged for callers;
+// the pointed-to memory is written regardless.
+static INLINE void _mm_storeh_epi32(__m128i const *mem_addr, __m128i a) {
+  const int value = _mm_cvtsi128_si32(a);
+  memcpy((void *)mem_addr, &value, sizeof(value));
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * width and height are measured in input pixels: each iteration reads two
+ * input rows and writes one row of pred_buf_q3, for height >> 1 rows in total
+ * (at least one, since the loop is a do/while). Only the widths 4, 8 and 32
+ * are tested for; every other value takes the 16-wide path.
+ */
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i twos = _mm_set1_epi8(2);  // maddubs weights: each adjacent byte pair (a, b) becomes 2 * a + 2 * b.
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;  // One output row per two input rows.
+ const int luma_stride = input_stride << 1;  // Advance two input rows per iteration.
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);  // 4 input pixels (32 bits) from the upper row.
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));  // Same columns, lower row.
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeh_epi32(pred_buf_m128i, sum);  // 2 output samples (32 bits).
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);  // 8 input pixels (64 bits).
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storel_epi64(pred_buf_m128i, sum);  // 4 output samples (64 bits).
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);  // 16 input pixels.
+ top = _mm_maddubs_epi16(top, twos);
+ __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+ bot = _mm_maddubs_epi16(bot, twos);
+ const __m128i sum = _mm_add_epi16(top, bot);
+ _mm_storeu_si128(pred_buf_m128i, sum);  // 8 output samples.
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);  // Input pixels 16..31 of the same two rows.
+ __m128i bot_1 =
+ _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+ top_1 = _mm_maddubs_epi16(top_1, twos);
+ bot_1 = _mm_maddubs_epi16(bot_1, twos);
+ __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * width and height are measured in input pixels: every input row produces one
+ * row of pred_buf_q3 holding half as many samples. Only the widths 4, 8 and
+ * 32 are tested for; every other value takes the 16-wide path.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i fours = _mm_set1_epi8(4);  // maddubs weights: each adjacent byte pair (a, b) becomes 4 * a + 4 * b.
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;  // One output row per input row.
+ do {
+ if (width == 4) {
+ __m128i top = _mm_loadh_epi32((__m128i *)input);  // 4 input pixels (32 bits).
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeh_epi32(pred_buf_m128i, top);  // 2 output samples (32 bits).
+ } else if (width == 8) {
+ __m128i top = _mm_loadl_epi64((__m128i *)input);  // 8 input pixels (64 bits).
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storel_epi64(pred_buf_m128i, top);  // 4 output samples (64 bits).
+ } else {
+ __m128i top = _mm_loadu_si128((__m128i *)input);  // 16 input pixels.
+ top = _mm_maddubs_epi16(top, fours);
+ _mm_storeu_si128(pred_buf_m128i, top);  // 8 output samples.
+ if (width == 32) {
+ __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);  // Input pixels 16..31 of the same row.
+ top_1 = _mm_maddubs_epi16(top_1, fours);
+ _mm_storeu_si128(pred_buf_m128i + 1, top_1);
+ }
+ }
+ input += input_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * Every input row produces one row of pred_buf_q3 with the same number of
+ * samples, each widened from 8 to 16 bits. Only the widths 4, 8 and 32 are
+ * tested for; every other value takes the 16-wide path.
+ */
+static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i zeros = _mm_setzero_si128();  // Interleaved with the pixels to zero-extend bytes to 16 bits.
+ const int luma_stride = input_stride;  // No vertical subsampling: one input row per output row.
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
+ do {
+ if (width == 4) {
+ __m128i row = _mm_loadh_epi32((__m128i *)input);  // 4 input pixels (32 bits).
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));  // << 3 multiplies by 8; 4 output samples (64 bits).
+ } else if (width == 8) {
+ __m128i row = _mm_loadl_epi64((__m128i *)input);  // 8 input pixels (64 bits).
+ row = _mm_unpacklo_epi8(row, zeros);
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));  // 8 output samples.
+ } else {
+ __m128i row = _mm_loadu_si128((__m128i *)input);  // 16 input pixels.
+ const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);  // Pixels 0..7 widened.
+ const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);  // Pixels 8..15 widened.
+ _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
+ if (width == 32) {
+ __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);  // Input pixels 16..31 of the same row.
+ const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
+ const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
+ _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
+ _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
+ }
+ }
+ input += luma_stride;
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
+ * precise version of a box filter 4:2:0 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * width and height are measured in input pixels: each iteration reads two
+ * input rows and writes one row of pred_buf_q3, for height >> 1 rows in total
+ * (at least one, since the loop is a do/while). Only the widths 4, 8 and 32
+ * are tested for; every other value takes the 16-wide path.
+ */
+static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  // Two input rows are consumed for every output row.
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const __m128i top = _mm_loadl_epi64((__m128i *)input);
+      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+      // Vertical sum first, then a horizontal add of neighbouring columns.
+      __m128i sum = _mm_add_epi16(top, bot);
+      sum = _mm_hadd_epi16(sum, sum);
+      // Doubling the 2x2 sum gives the Q3 result. The two output samples go
+      // through the shared 32-bit store helper, matching the 4:2:2 kernel,
+      // instead of through a raw int pointer cast.
+      _mm_storeh_epi32((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+    } else {
+      const __m128i top = _mm_loadu_si128((__m128i *)input);
+      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      __m128i sum = _mm_add_epi16(top, bot);
+      if (width == 8) {
+        sum = _mm_hadd_epi16(sum, sum);
+        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+      } else {
+        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
+        const __m128i bot_1 =
+            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
+        // hadd packs the column-pair sums of both input vectors into one.
+        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
+        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
+        if (width == 32) {
+          // Input pixels 16..31 of the same two rows.
+          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
+          const __m128i bot_2 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
+          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
+          const __m128i bot_3 =
+              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
+          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
+          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
+          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
+          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
+                           _mm_add_epi16(next_sum, next_sum));
+        }
+      }
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ *
+ * width and height are measured in input pixels: every input row produces one
+ * row of pred_buf_q3 holding half as many samples. Only the widths 4, 8 and
+ * 32 are tested for; every other value takes the 16-wide path.
+ */
+static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
+ int input_stride,
+ uint16_t *pred_buf_q3,
+ int width, int height) {
+ __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
+ const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;  // One output row per input row.
+ do {
+ if (width == 4) {
+ const __m128i top = _mm_loadl_epi64((__m128i *)input);  // 4 input pixels (64 bits).
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);  // Add neighbouring columns, then << 2 multiplies by 4.
+ _mm_storeh_epi32(pred_buf_m128i, sum);  // 2 output samples (32 bits).
+ } else {
+ const __m128i top = _mm_loadu_si128((__m128i *)input);  // 8 input pixels.
+ if (width == 8) {
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
+ _mm_storel_epi64(pred_buf_m128i, sum);  // 4 output samples (64 bits).
+ } else {
+ const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);  // Input pixels 8..15.
+ const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
+ _mm_storeu_si128(pred_buf_m128i, sum);  // 8 output samples.
+ if (width == 32) {
+ const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);  // Input pixels 16..23.
+ const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);  // Input pixels 24..31.
+ const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
+ _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
+ }
+ }
+ }
+ pred_buf_m128i += CFL_BUF_LINE_I128;
+ input += input_stride;
+ } while (pred_buf_m128i < end);
+}
+
+/**
+ * Multiplies the pixels by 8 (scaling in Q3).
+ *
+ * Each row of input is copied into the matching row of pred_buf_q3 with every
+ * 16-bit sample shifted left by 3. Rows of pred_buf_q3 are CFL_BUF_LINE
+ * samples apart; rows of input are input_stride samples apart.
+ */
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      uint16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const uint16_t *const last = pred_buf_q3 + height * CFL_BUF_LINE;
+  // Full 128-bit vectors handled per row: one by default, two for widths of
+  // at least 16, four for width 32. Width 4 uses a 64-bit access instead.
+  int vectors_per_row = 1;
+  if (width >= 16) vectors_per_row = (width == 32) ? 4 : 2;
+  do {
+    __m128i *const in = (__m128i *)input;
+    __m128i *const out = (__m128i *)pred_buf_q3;
+    if (width == 4) {
+      _mm_storel_epi64(out, _mm_slli_epi16(_mm_loadl_epi64(in), 3));
+    } else {
+      for (int i = 0; i < vectors_per_row; ++i) {
+        _mm_storeu_si128(out + i, _mm_slli_epi16(_mm_loadu_si128(in + i), 3));
+      }
+    }
+    input += input_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
+  } while (pred_buf_q3 < last);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(ssse3)  // Macro defined outside this chunk; presumably emits the subsample_{lbd,hbd}_{420,422,444}_WxH_ssse3 wrappers declared in cfl_simd.h -- confirm.
+
+// Computes the unclipped prediction for the eight 16-bit values at input: the
+// magnitude of each value is scaled by alpha_q12 with a rounded high-half
+// multiply, the sign of alpha_sign times the value is re-applied, and dc_q0 is
+// added.
+static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
+                                        __m128i alpha_sign, __m128i dc_q0) {
+  const __m128i ac = _mm_loadu_si128(input);
+  // Carries the sign of alpha_sign * ac per lane (zero where ac is zero).
+  const __m128i product_sign = _mm_sign_epi16(alpha_sign, ac);
+  const __m128i magnitude = _mm_mulhrs_epi16(_mm_abs_epi16(ac), alpha_q12);
+  const __m128i scaled = _mm_sign_epi16(magnitude, product_sign);
+  return _mm_add_epi16(scaled, dc_q0);
+}
+
+static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ int alpha_q3, int width, int height) {  // Writes the 8-bit prediction for a width x height block into dst, in place.
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);  // Signed alpha in every lane; its sign is what predict_unclipped uses.
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);  // |alpha| << 9, i.e. Q3 rescaled to Q12.
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);  // The DC value is read from the first destination pixel.
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;  // Rows of pred_buf_q3 are CFL_BUF_LINE_I128 vectors apart.
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ if (width < 16) {
+ res = _mm_packus_epi16(res, res);  // Saturate to unsigned 8 bits; this is the clipping step.
+ if (width == 4)
+ _mm_storeh_epi32((__m128i *)dst, res);  // 4 pixels (32 bits).
+ else
+ _mm_storel_epi64((__m128i *)dst, res);  // 8 pixels (64 bits).
+ } else {
+ __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);  // Samples 8..15 of the row.
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)dst, res);  // 16 pixels.
+ if (width == 32) {
+ res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ res = _mm_packus_epi16(res, next);
+ _mm_storeu_si128((__m128i *)(dst + 16), res);  // Pixels 16..31.
+ }
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, lbd)  // Macro defined outside this chunk; presumably emits the predict_lbd_WxH_ssse3 wrappers declared in cfl_simd.h -- confirm.
+
+// Returns a vector holding (1 << bd) - 1 in every 16-bit lane.
+static INLINE __m128i highbd_max_epi16(int bd) {
+  const __m128i all_ones = _mm_set1_epi16(-1);
+  // Shifting all-ones left by bd clears the low bd bits; XOR with all-ones
+  // then inverts each lane, leaving only the low bd bits set.
+  const __m128i high_bits = _mm_slli_epi16(all_ones, bd);
+  return _mm_xor_si128(high_bits, all_ones);
+}
+
+// Limits each signed 16-bit lane of u: the upper bound max is applied first,
+// then the lower bound zero.
+static INLINE __m128i highbd_clamp_epi16(__m128i u, __m128i zero, __m128i max) {
+  const __m128i capped = _mm_min_epi16(u, max);
+  return _mm_max_epi16(capped, zero);
+}
+
+static INLINE void cfl_predict_hbd_ssse3(const int16_t *pred_buf_q3,
+ uint16_t *dst, int dst_stride,
+ int alpha_q3, int bd, int width,
+ int height) {  // Writes the 16-bit prediction for a width x height block into dst, clamped to [0, (1 << bd) - 1].
+ const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);  // Signed alpha in every lane; its sign is what predict_unclipped uses.
+ const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);  // |alpha| << 9, i.e. Q3 rescaled to Q12.
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);  // The DC value is read from the first destination pixel.
+ const __m128i max = highbd_max_epi16(bd);
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i *row = (__m128i *)pred_buf_q3;
+ const __m128i *row_end = row + height * CFL_BUF_LINE_I128;  // Rows of pred_buf_q3 are CFL_BUF_LINE_I128 vectors apart.
+ do {
+ __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ res = highbd_clamp_epi16(res, zeros, max);
+ if (width == 4) {
+ _mm_storel_epi64((__m128i *)dst, res);  // 4 pixels (64 bits).
+ } else {
+ _mm_storeu_si128((__m128i *)dst, res);  // 8 pixels.
+ }
+ if (width >= 16) {
+ const __m128i res_1 =
+ predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128(((__m128i *)dst) + 1,  // Pixels 8..15.
+ highbd_clamp_epi16(res_1, zeros, max));
+ }
+ if (width == 32) {
+ const __m128i res_2 =
+ predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 16),  // Pixels 16..23.
+ highbd_clamp_epi16(res_2, zeros, max));
+ const __m128i res_3 =
+ predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
+ _mm_storeu_si128((__m128i *)(dst + 24),  // Pixels 24..31.
+ highbd_clamp_epi16(res_3, zeros, max));
+ }
+ dst += dst_stride;
+ } while ((row += CFL_BUF_LINE_I128) < row_end);
+}
+
+CFL_PREDICT_FN(ssse3, hbd)  // Macro defined outside this chunk; presumably emits the predict_hbd_WxH_ssse3 wrappers declared in cfl_simd.h -- confirm.
diff --git a/third_party/aom/av1/common/x86/convolve_2d_avx2.c b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
new file mode 100644
index 0000000000..0acafd0446
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_avx2.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/convolve.h"
+
+// 2-D separable subpel convolution for 8-bit pixels (bd is fixed to 8), AVX2.
+//
+// The block is processed in 8-pixel-wide column strips (outer j loop). For
+// each strip a horizontal pass writes 16-bit intermediates into im_block
+// (8 values per row), then a vertical pass filters those intermediates and
+// stores two output rows per iteration to dst.
+//
+// prepare_coeffs_lowbd(), prepare_coeffs(), convolve_lowbd_x() and convolve()
+// are helpers defined outside this function; what they compute is not visible
+// here.
+void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ // Intermediate rows required: h output rows plus (taps - 1) extra rows.
+ int im_h = h + filter_params_y->taps - 1;
+ // One intermediate row of a strip is 8 int16 values.
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ // Back up to the top-left source sample covered by the filter footprint.
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_h);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_v);
+
+ // Horizontal stage rounding: shift by round_0 - 1 in 16-bit lanes.
+ const __m256i round_const_h = _mm256_set1_epi16(
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ // Vertical stage rounding: shift by round_1 in 32-bit lanes.
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Final rounding by `bits`; the constant also subtracts the offsets that
+ // were added by the two stages above.
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+
+ for (j = 0; j < w; j += 8) {
+ // Horizontal filter: each iteration handles source rows i (low 128-bit
+ // lane) and i + 1 (high 128-bit lane).
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j]));
+
+ // Load the next line
+ if (i + 1 < im_h)
+ data = _mm256_inserti128_si256(
+ data,
+ _mm_loadu_si128(
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]),
+ 1);
+
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt);
+
+ res =
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h);
+
+ // Rows i and i + 1 are adjacent in im_block (im_stride is 8), so one
+ // 256-bit store writes both.
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+
+ /* Vertical filter */
+ {
+ // Each 256-bit load starting at row k picks up rows k and k + 1, so the
+ // low lane works on output row i and the high lane on output row i + 1.
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ // s[0..3] hold the interleaved row pairs for the low 4 columns of each
+ // lane, s[4..7] the same for the high 4 columns.
+ __m256i s[8];
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1);
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3);
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5);
+
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1);
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3);
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ // Only the last row pair is loaded here; the first three pairs are
+ // carried over in s[] from the previous iteration.
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ __m256i res_a = convolve(s, coeffs_v);
+ __m256i res_b = convolve(s + 4, coeffs_v);
+
+ // Combine V round and 2F-H-V round into a single rounding
+ res_a =
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v);
+ res_b =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+
+ /* rounding code */
+ // 16 bit conversion
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ // 8 bit conversion and saturation to uint8
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit);
+
+ // Low lane is output row i, high lane is output row i + 1.
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+
+ // Store values into the destination buffer
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ if (w - j > 4) {
+ // More than 4 columns remain in this strip: write 8 bytes per row.
+ _mm_storel_epi64(p_0, res_0);
+ _mm_storel_epi64(p_1, res_1);
+ } else if (w == 4) {
+ // xx_storel_32 is defined elsewhere; presumably a 32-bit store.
+ xx_storel_32(p_0, res_0);
+ xx_storel_32(p_1, res_1);
+ } else {
+ // Remaining case: write only the low 16 bits (2 pixels) per row.
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+
+ // Slide the row-pair window down by two rows for the next iteration.
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+// Copies 128 bytes from src to dst: four unaligned 256-bit loads are issued
+// first, followed by four unaligned 256-bit stores.
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m256i chunk[4];
+  int k;
+  for (k = 0; k < 4; ++k) {
+    chunk[k] = _mm256_loadu_si256((__m256i *)(src + k * 32));
+  }
+  for (k = 0; k < 4; ++k) {
+    _mm256_storeu_si256((__m256i *)(dst + k * 32), chunk[k]);
+  }
+}
+
+// Plain block copy from src to dst (no filtering), AVX2 variant.
+//
+// Rows are copied two at a time, so h is consumed in steps of 2 and every
+// loop runs at least once. Widths 2, 4, 8, 16, 32 and 64 have dedicated
+// paths; any other width is copied 128 bytes per row. The filter, subpel and
+// conv_params arguments are unused.
+void av1_convolve_2d_copy_sr_avx2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  switch (w) {
+    case 2:
+      do {
+        memcpy(dst, src, 2 * sizeof(*src));
+        memcpy(dst + dst_stride, src + src_stride, 2 * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 4:
+      do {
+        memcpy(dst, src, 4 * sizeof(*src));
+        memcpy(dst + dst_stride, src + src_stride, 4 * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 8:
+      do {
+        // Both rows are loaded before either is stored.
+        const __m128i row0 = _mm_loadl_epi64((__m128i *)src);
+        const __m128i row1 = _mm_loadl_epi64((__m128i *)(src + src_stride));
+        _mm_storel_epi64((__m128i *)dst, row0);
+        _mm_storel_epi64((__m128i *)(dst + dst_stride), row1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 16:
+      do {
+        const __m128i row0 = _mm_loadu_si128((__m128i *)src);
+        const __m128i row1 = _mm_loadu_si128((__m128i *)(src + src_stride));
+        // Aligned stores: rely on the alignment asserted above.
+        _mm_store_si128((__m128i *)dst, row0);
+        _mm_store_si128((__m128i *)(dst + dst_stride), row1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 32:
+      do {
+        const __m256i row0 = _mm256_loadu_si256((__m256i *)src);
+        const __m256i row1 = _mm256_loadu_si256((__m256i *)(src + src_stride));
+        _mm256_storeu_si256((__m256i *)dst, row0);
+        _mm256_storeu_si256((__m256i *)(dst + dst_stride), row1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 64:
+      do {
+        const uint8_t *const src1 = src + src_stride;
+        uint8_t *const dst1 = dst + dst_stride;
+        const __m256i a0 = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+        const __m256i a1 = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+        const __m256i b0 = _mm256_loadu_si256((__m256i *)(src1 + 0 * 32));
+        const __m256i b1 = _mm256_loadu_si256((__m256i *)(src1 + 1 * 32));
+        _mm256_storeu_si256((__m256i *)(dst + 0 * 32), a0);
+        _mm256_storeu_si256((__m256i *)(dst + 1 * 32), a1);
+        _mm256_storeu_si256((__m256i *)(dst1 + 0 * 32), b0);
+        _mm256_storeu_si256((__m256i *)(dst1 + 1 * 32), b1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    default:
+      do {
+        copy_128(src, dst);
+        copy_128(src + src_stride, dst + dst_stride);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_2d_sse2.c b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
new file mode 100644
index 0000000000..b1a62a4f69
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_2d_sse2.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+
+// 2-D separable subpel convolution for 8-bit pixels (bd is fixed to 8), SSE2.
+//
+// A horizontal pass filters all im_h = h + taps - 1 source rows into the
+// 16-bit buffer im_block (row stride MAX_SB_SIZE); a vertical pass then
+// filters im_block column-wise, rounds, saturates to 8 bits and stores the
+// result to dst. Both passes work on 8 columns per inner iteration.
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ // Back up to the top-left source sample covered by the filter footprint.
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // One 16-byte load; bytes 0..14 feed the eight outputs below.
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ // First rounding: shift by round_1.
+ const __m128i sum_round =
+ _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ // Second rounding: shift by `bits`; the constant also subtracts the
+ // offsets added by the earlier rounding constants.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ // (the low four 16-bit entries of each im_block row, which the
+ // horizontal pass stored in column order 0, 2, 4, 6).
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ // (the high four 16-bit entries of each im_block row: columns 1, 3, 5, 7).
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ // Narrow to 16 bits, then saturate to unsigned 8 bits.
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+
+ // Store values into the destination buffer (2, 4 or 8 pixels wide)
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {
+ *(uint16_t *)p = _mm_cvtsi128_si32(res);
+ } else if (w == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res);
+ } else {
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+ }
+}
+
+// Copies 128 bytes from src to dst: eight unaligned 128-bit loads are issued
+// first, followed by eight aligned 128-bit stores (dst must therefore be
+// 16-byte aligned).
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+  __m128i chunk[8];
+  int k;
+  for (k = 0; k < 8; ++k) {
+    chunk[k] = _mm_loadu_si128((__m128i *)(src + k * 16));
+  }
+  for (k = 0; k < 8; ++k) {
+    _mm_store_si128((__m128i *)(dst + k * 16), chunk[k]);
+  }
+}
+
+// Plain block copy from src to dst (no filtering), SSE2 variant.
+//
+// Rows are copied two at a time, so h is consumed in steps of 2 and every
+// loop runs at least once. Widths 2, 4, 8, 16, 32 and 64 have dedicated
+// paths; any other width is copied 128 bytes per row. For w >= 16 the stores
+// are aligned 128-bit stores. The filter, subpel and conv_params arguments
+// are unused.
+void av1_convolve_2d_copy_sr_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_x,
+                                  const InterpFilterParams *filter_params_y,
+                                  const int subpel_x_q4, const int subpel_y_q4,
+                                  ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  switch (w) {
+    case 2:
+      do {
+        memcpy(dst, src, 2 * sizeof(*src));
+        memcpy(dst + dst_stride, src + src_stride, 2 * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 4:
+      do {
+        memcpy(dst, src, 4 * sizeof(*src));
+        memcpy(dst + dst_stride, src + src_stride, 4 * sizeof(*src));
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 8:
+      do {
+        // Both rows are loaded before either is stored.
+        const __m128i row0 = _mm_loadl_epi64((__m128i *)src);
+        const __m128i row1 = _mm_loadl_epi64((__m128i *)(src + src_stride));
+        _mm_storel_epi64((__m128i *)dst, row0);
+        _mm_storel_epi64((__m128i *)(dst + dst_stride), row1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 16:
+      do {
+        const __m128i row0 = _mm_loadu_si128((__m128i *)src);
+        const __m128i row1 = _mm_loadu_si128((__m128i *)(src + src_stride));
+        _mm_store_si128((__m128i *)dst, row0);
+        _mm_store_si128((__m128i *)(dst + dst_stride), row1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 32:
+      do {
+        const uint8_t *const src1 = src + src_stride;
+        uint8_t *const dst1 = dst + dst_stride;
+        const __m128i a0 = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+        const __m128i a1 = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+        const __m128i b0 = _mm_loadu_si128((__m128i *)(src1 + 0 * 16));
+        const __m128i b1 = _mm_loadu_si128((__m128i *)(src1 + 1 * 16));
+        _mm_store_si128((__m128i *)(dst + 0 * 16), a0);
+        _mm_store_si128((__m128i *)(dst + 1 * 16), a1);
+        _mm_store_si128((__m128i *)(dst1 + 0 * 16), b0);
+        _mm_store_si128((__m128i *)(dst1 + 1 * 16), b1);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    case 64:
+      do {
+        const uint8_t *const src1 = src + src_stride;
+        uint8_t *const dst1 = dst + dst_stride;
+        __m128i top[4], bottom[4];
+        int k;
+        // Load both rows completely before storing anything.
+        for (k = 0; k < 4; ++k) {
+          top[k] = _mm_loadu_si128((__m128i *)(src + k * 16));
+        }
+        for (k = 0; k < 4; ++k) {
+          bottom[k] = _mm_loadu_si128((__m128i *)(src1 + k * 16));
+        }
+        for (k = 0; k < 4; ++k) {
+          _mm_store_si128((__m128i *)(dst + k * 16), top[k]);
+        }
+        for (k = 0; k < 4; ++k) {
+          _mm_store_si128((__m128i *)(dst1 + k * 16), bottom[k]);
+        }
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+    default:
+      do {
+        copy_128(src, dst);
+        copy_128(src + src_stride, dst + dst_stride);
+        src += 2 * src_stride;
+        dst += 2 * dst_stride;
+        h -= 2;
+      } while (h != 0);
+      break;
+  }
+}
+
+// Compound-prediction "copy" path (no filtering), SSE2.
+//
+// Each 8-bit source pixel is widened to 16 bits, shifted left by
+// 2 * FILTER_BITS - round_0 - round_1 and biased by `offset`.
+//  - When conv_params->do_average is zero, that 16-bit value is stored into
+//    the intermediate buffer conv_params->dst.
+//  - Otherwise it is combined with the value already in conv_params->dst via
+//    comp_avg(), passed through convolve_rounding(), saturated to 8 bits and
+//    written to dst0.
+// comp_avg() and convolve_rounding() are defined outside this function; what
+// they compute is not visible here. The filter and subpel arguments are unused.
+void av1_jnt_convolve_2d_copy_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ int i, j;
+
+ // Forward/backward weights interleaved as (w0, w1) pairs for comp_avg().
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ assert((w % 4) == 0);
+
+ if (!(w % 16)) {
+ // Width is a multiple of 16: process 16 pixels per inner iteration.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i d8 = _mm_loadu_si128((__m128i *)&src[j]);
+
+ // Zero-extend the 16 bytes into two vectors of eight 16-bit values.
+ const __m128i d16_lo = _mm_unpacklo_epi8(d8, zero);
+ const __m128i d16_hi = _mm_unpackhi_epi8(d8, zero);
+
+ const __m128i res_lo = _mm_sll_epi16(d16_lo, left_shift);
+ const __m128i res_unsigned_lo = _mm_add_epi16(res_lo, offset_const);
+
+ const __m128i res_hi = _mm_sll_epi16(d16_hi, left_shift);
+ const __m128i res_unsigned_hi = _mm_add_epi16(res_hi, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0_lo = _mm_loadu_si128((__m128i *)(&dst[j]));
+ const __m128i data_ref_0_hi =
+ _mm_loadu_si128((__m128i *)(&dst[j + 8]));
+
+ const __m128i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_unsigned_lo, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_unsigned_hi, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 =
+ _mm_packus_epi16(round_result_lo, round_result_hi);
+
+ // Aligned store: &dst0[j] must be 16-byte aligned.
+ _mm_store_si128((__m128i *)(&dst0[j]), res_8);
+ } else {
+ // Aligned stores into the intermediate buffer.
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned_lo);
+ _mm_store_si128((__m128i *)(&dst[j + 8]), res_unsigned_hi);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ } else {
+ // Other widths: process 8 pixels per inner iteration.
+ // NOTE(review): when w == 4 the single iteration still loads 8 source
+ // bytes and, in the non-averaging path, stores a full 128-bit vector to
+ // dst; only the averaging path narrows the store to 4 bytes.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i d8 = _mm_loadl_epi64((__m128i *)&src[j]);
+ const __m128i d16_0 = _mm_unpacklo_epi8(d8, zero);
+
+ const __m128i res = _mm_sll_epi16(d16_0, left_shift);
+ const __m128i res_unsigned = _mm_add_epi16(res, offset_const);
+
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)(&dst[j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[j]), res_8);
+ else
+ *(uint32_t *)(&dst0[j]) = _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[j]), res_unsigned);
+ }
+ }
+ src += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_avx2.c b/third_party/aom/av1/common/x86/convolve_avx2.c
new file mode 100644
index 0000000000..0e91ea9475
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_avx2.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Vertical-only subpel convolution for 8-bit pixels, AVX2.
+//
+// The block is processed in 16-pixel-wide column strips (outer j loop) and
+// two output rows are produced per inner iteration, one per 128-bit lane.
+// filter_params_x and subpel_x_q4 are unused; conv_params is only read by the
+// asserts. prepare_coeffs_lowbd() and convolve_lowbd() are helpers defined
+// outside this function.
+void av1_convolve_y_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ // Back up to the first source row covered by the filter footprint.
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+ // right shift is F-1 because we are already dividing
+ // filter co-efficients by 2
+ const int right_shift_bits = (FILTER_BITS - 1);
+ const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+ const __m256i right_shift_const =
+ _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+ __m256i coeffs[4], s[8];
+
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+
+ for (j = 0; j < w; j += 16) {
+ const uint8_t *data = &src_ptr[j];
+ // Most recently loaded even row; carried across iterations so it is not
+ // reloaded.
+ __m256i src6;
+
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ const __m256i src_01a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+
+ const __m256i src_12a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+
+ const __m256i src_23a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+
+ const __m256i src_34a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+
+ const __m256i src_45a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ const __m256i src_56a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ // Interleave adjacent rows byte-wise. s[0..2] cover the low 8 columns of
+ // the strip, s[4..6] the high 8 columns; in each, the low lane feeds
+ // output row i and the high lane output row i + 1.
+ s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+ s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+ s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+
+ s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+ s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+ s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+ // Only rows i + 7 and i + 8 are newly loaded; row i + 6 comes from src6.
+ const __m256i src_67a = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ const __m256i res_lo = convolve_lowbd(s, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_lo = _mm256_sra_epi16(
+ _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+ if (w - j > 8) {
+ // More than 8 columns remain in this strip: also filter the high 8
+ // columns and store 16 pixels per row.
+ const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ /* rounding code */
+ // shift by F - 1
+ const __m256i res_16b_hi = _mm256_sra_epi16(
+ _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+ // 8 bit conversion and saturation to uint8
+ __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+ // Join the low-8 and high-8 column results within each lane.
+ __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_a);
+ const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else {
+ // At most 8 columns remain: store 8, 4 or 2 pixels per row.
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+ if (w - j > 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_1);
+ } else if (w - j > 2) {
+ // xx_storel_32 is defined elsewhere; presumably a 32-bit store.
+ xx_storel_32(&dst[i * dst_stride + j], res_0);
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+ } else {
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride];
+ *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+ *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+ }
+ }
+
+ // Slide the row window down by two rows for the next iteration.
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+}
+
+// Horizontal-only subpel convolution for 8-bit pixels, AVX2.
+//
+// Filters each row of the w x h block at src with the kernel selected by
+// filter_params_x / subpel_x_q4 and writes saturated 8-bit results to dst.
+// Rounding is applied in two stages: first a shift by round_0 - 1, then a
+// shift by FILTER_BITS - round_0. filter_params_y and subpel_y_q4 are unused.
+//
+// Blocks with w <= 8 are processed two rows at a time (one row per 128-bit
+// lane); wider blocks are processed one row at a time, 16 columns per step.
+// prepare_coeffs_lowbd() and convolve_lowbd_x() are helpers defined outside
+// this function.
+void av1_convolve_x_sr_avx2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_x,
+                            const InterpFilterParams *filter_params_y,
+                            const int subpel_x_q4, const int subpel_y_q4,
+                            ConvolveParams *conv_params) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  // Back up to the left-most source sample covered by the filter footprint.
+  const uint8_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_0;
+
+  __m256i filt[4], coeffs[4];
+
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  // Validate the rounding parameters before any shift count is derived from
+  // them: with round_0 == 0 the expression (1 << (round_0 - 1)) below would
+  // be a shift by a negative amount.
+  assert(bits >= 0);
+  assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+         ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+  assert(conv_params->round_0 > 0);
+
+  filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);
+  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs);
+
+  const __m256i round_0_const =
+      _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1);
+  const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+  const __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1);
+  const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+  if (w <= 8) {
+    for (i = 0; i < h; i += 2) {
+      // Row i in the low lane, row i + 1 in the high lane.
+      const __m256i data = _mm256_permute2x128_si256(
+          _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))),
+          _mm256_castsi128_si256(_mm_loadu_si128(
+              (__m128i *)(&src_ptr[i * src_stride + src_stride]))),
+          0x20);
+
+      __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+      res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                 round_0_shift);
+
+      res_16b =
+          _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), round_shift);
+
+      /* rounding code */
+      // 8 bit conversion and saturation to uint8
+      __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+      const __m128i res_0 = _mm256_castsi256_si128(res_8b);
+      const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1);
+      if (w > 4) {
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0);
+        _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1);
+      } else if (w > 2) {
+        xx_storel_32(&dst[i * dst_stride], res_0);
+        xx_storel_32(&dst[i * dst_stride + dst_stride], res_1);
+      } else {
+        __m128i *const p_0 = (__m128i *)&dst[i * dst_stride];
+        __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride];
+        *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0);
+        *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 16) {
+        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 18
+        // 19 20 21 22 23
+        // Only 16 bytes are loaded for the low lane: the high lane is
+        // overwritten by the insert, so a 32-byte load would only read past
+        // the samples the filter needs.
+        const __m256i data = _mm256_inserti128_si256(
+            _mm256_castsi128_si256(
+                _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])),
+            _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]),
+            1);
+
+        __m256i res_16b = convolve_lowbd_x(data, coeffs, filt);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const),
+                                   round_0_shift);
+
+        res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const),
+                                   round_shift);
+
+        /* rounding code */
+        // 8 bit conversion and saturation to uint8
+        __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b);
+
+        // Store values into the destination buffer
+        // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+        // (216 == 0xD8 selects 64-bit elements 0, 2, 1, 3, gathering the two
+        // lanes' 8-byte results into the low 128 bits.)
+        res_8b = _mm256_permute4x64_epi64(res_8b, 216);
+        __m128i res = _mm256_castsi256_si128(res_8b);
+        _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/convolve_sse2.c b/third_party/aom/av1/common/x86/convolve_sse2.c
new file mode 100644
index 0000000000..5016642dee
--- /dev/null
+++ b/third_party/aom/av1/common/x86/convolve_sse2.c
@@ -0,0 +1,338 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "av1/common/convolve.h"
+
+/* Fetches the interpolation kernel selected by the SUBPEL_MASK bits of
+ * subpel_q4 and fills coeffs[0..3]. coeffs[k] holds the adjacent tap pair
+ * (2k, 2k + 1) repeated four times, which is the operand layout used with
+ * _mm_madd_epi16 in convolve(). */
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+                                  const int subpel_q4,
+                                  __m128i *const coeffs /* [4] */) {
+  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
+      filter_params, subpel_q4 & SUBPEL_MASK);
+  // taps 0 1 2 3 4 5 6 7, one 16-bit lane each
+  const __m128i taps = _mm_loadu_si128((const __m128i *)kernel);
+
+  // Broadcasting 32-bit lane k replicates the tap pair (2k, 2k + 1).
+  coeffs[0] = _mm_shuffle_epi32(taps, 0x00);  // coeffs 0 1 0 1 0 1 0 1
+  coeffs[1] = _mm_shuffle_epi32(taps, 0x55);  // coeffs 2 3 2 3 2 3 2 3
+  coeffs[2] = _mm_shuffle_epi32(taps, 0xaa);  // coeffs 4 5 4 5 4 5 4 5
+  coeffs[3] = _mm_shuffle_epi32(taps, 0xff);  // coeffs 6 7 6 7 6 7 6 7
+}
+
+/* Multiply-accumulates four registers of 16-bit sample pairs (s[0..3]) with
+ * the four tap-pair registers (coeffs[0..3]) and returns the four 32-bit
+ * sums. */
+static INLINE __m128i convolve(const __m128i *const s,
+                               const __m128i *const coeffs) {
+  // Same association as a balanced tree: (p0 + p1) + (p2 + p3).
+  const __m128i sum_taps_0123 = _mm_add_epi32(_mm_madd_epi16(s[0], coeffs[0]),
+                                              _mm_madd_epi16(s[1], coeffs[1]));
+  const __m128i sum_taps_4567 = _mm_add_epi32(_mm_madd_epi16(s[2], coeffs[2]),
+                                              _mm_madd_epi16(s[3], coeffs[3]));
+  return _mm_add_epi32(sum_taps_0123, sum_taps_4567);
+}
+
+/* Zero-extends the low eight bytes of s[0], s[1], s[2] and s[3] to 16 bits
+ * and runs convolve() on the widened registers. */
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+/* Zero-extends the low eight bytes of s[0], s[2], s[4] and s[6] (every other
+ * entry of the window) to 16 bits and runs convolve() on them. */
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpacklo_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+/* Zero-extends the high eight bytes of s[0], s[2], s[4] and s[6] (every other
+ * entry of the window) to 16 bits and runs convolve() on them. */
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+                                    const __m128i *const coeffs) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i widened[4];
+  for (int k = 0; k < 4; ++k) {
+    widened[k] = _mm_unpackhi_epi8(s[2 * k], zero);
+  }
+  return convolve(widened, coeffs);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) { // Vertical-only filter: applies 8 coefficients down each column of a w x h block of 8-bit pixels.
+ const int fo_vert = filter_params_y->taps / 2 - 1; // rows of context above the first output row
+ const uint8_t *src_ptr = src - fo_vert * src_stride; // first source row that contributes to output row 0
+ const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1); // half of 1 << FILTER_BITS, for round-to-nearest
+ const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+ __m128i coeffs[4];
+ // The horizontal filter arguments are not used by this vertical-only kernel.
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+ (void)conv_params;
+ // conv_params is read only by the asserts below; the rounding in this function always uses FILTER_BITS.
+ assert(conv_params->round_0 <= FILTER_BITS);
+ assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+ ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+ // Load the kernel selected by subpel_y_q4 as four broadcast tap-pair registers.
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+ // Narrow blocks (w <= 4): one 4-byte load per source row, two output rows per loop iteration.
+ if (w <= 4) {
+ __m128i s[8], src6, res, res_round, res16;
+ uint32_t res_int;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+ // s[k] interleaves the bytes of rows k and k + 1; s[0..5] are primed above, s[6] and s[7] are filled on every iteration.
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+ // First output row: s[0], s[2], s[4], s[6] cover the row pairs (0,1) (2,3) (4,5) (6,7).
+ res = convolve_lo_y(s + 0, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+ // Store 2 bytes when w == 2, otherwise 4 bytes.
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ // Second output row: s[1], s[3], s[5], s[7] cover the row pairs (1,2) (3,4) (5,6) (7,8).
+ res = convolve_lo_y(s + 1, coeffs);
+ res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+ res16 = _mm_packs_epi32(res_round, res_round);
+ res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+
+ if (w == 2)
+ *(uint16_t *)dst = res_int;
+ else
+ *(uint32_t *)dst = res_int;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ // Slide the window down two rows. The loop exits only when h reaches 0, so h must be even and non-zero.
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0; // left column of the current 8-pixel-wide strip
+ do {
+ __m128i s[8], src6, res_lo, res_hi;
+ __m128i res_lo_round, res_hi_round, res16, res;
+ const uint8_t *data = &src_ptr[j];
+ // Prime the row-pair window for this strip with 8-byte loads; s[k] interleaves rows k and k + 1.
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+ // Each iteration of the inner loop produces two 8-pixel output rows (i and i + 1) for this strip.
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+ // The low / high halves of each window register hold columns 0-3 / 4-7 of the strip.
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ // Narrow 32 -> 16 bits with signed saturation, then 16 -> 8 bits with unsigned saturation.
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+ // Second output row of the pair, using the odd-indexed window entries s[1], s[3], s[5], s[7].
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res = _mm_packus_epi16(res16, res16);
+
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ i++;
+ // Slide the window down two rows.
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
+
+void av1_convolve_x_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) { // Horizontal-only filter: applies 8 coefficients along each row of a w x h block of 8-bit pixels.
+ const int fo_horiz = filter_params_x->taps / 2 - 1; // columns of context left of the first output pixel
+ const uint8_t *src_ptr = src - fo_horiz;
+ const int bits = FILTER_BITS - conv_params->round_0; // shift of the second rounding stage
+ const __m128i round_0_const =
+ _mm_set1_epi32((1 << conv_params->round_0) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
+ __m128i coeffs[4];
+ // The vertical filter arguments are not used by this horizontal-only kernel.
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+ // Load the kernel selected by subpel_x_q4 as four broadcast tap-pair registers.
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs);
+ // Narrow blocks (w <= 4): one row per iteration; the loop runs until --h reaches 0, so h must be positive.
+ if (w <= 4) {
+ do {
+ const __m128i data = _mm_loadu_si128((__m128i *)src_ptr);
+ __m128i s[4];
+ // s[k] interleaves the bytes at offsets 2k and 2k + 1 of four consecutive sliding positions.
+ s[0] = _mm_unpacklo_epi8(data, _mm_srli_si128(data, 1));
+ s[1] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), _mm_srli_si128(data, 3));
+ s[2] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), _mm_srli_si128(data, 5));
+ s[3] =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), _mm_srli_si128(data, 7));
+ const __m128i res_lo = convolve_lo_x(s, coeffs);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const), round_shift);
+ // Two-stage rounding is done (round_0, then bits); now narrow 32 -> 16 -> 8 bits with saturation.
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_lo_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+ // Store 2 bytes when w == 2, otherwise 4 bytes.
+ uint32_t r = _mm_cvtsi128_si32(res);
+ if (w == 2)
+ *(uint16_t *)dst = r;
+ else
+ *(uint32_t *)dst = r;
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ } while (--h);
+ } else {
+ assert(!(w % 8));
+ int i = 0; // current row
+ do {
+ int j = 0; // left column of the current group of 8 output pixels
+ do {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ __m128i s[4];
+ // One 16-byte load feeds 8 outputs; the shifts below use bytes 0..14 of it.
+ // Filter even-index pixels (outputs 0, 2, 4, 6 of this group)
+ s[0] = data;
+ s[1] = _mm_srli_si128(data, 2);
+ s[2] = _mm_srli_si128(data, 4);
+ s[3] = _mm_srli_si128(data, 6);
+ const __m128i res_even = convolve_lo_x(s, coeffs);
+
+ // Filter odd-index pixels (outputs 1, 3, 5, 7): same pattern, one byte further along
+ s[0] = _mm_srli_si128(data, 1);
+ s[1] = _mm_srli_si128(data, 3);
+ s[2] = _mm_srli_si128(data, 5);
+ s[3] = _mm_srli_si128(data, 7);
+ const __m128i res_odd = convolve_lo_x(s, coeffs);
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_0_const), round_0_shift);
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_0_const), round_0_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+ // Narrow 32 -> 16 bits with signed saturation, then 16 -> 8 bits with unsigned saturation.
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
+ // Store the 8 output pixels of this group.
+ _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+ j += 8;
+ } while (j < w);
+ } while (++i < h);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/filterintra_sse4.c b/third_party/aom/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000000..c11edc1d45
--- /dev/null
+++ b/third_party/aom/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) { // Filter-intra prediction of a bw x bh block, computed in 4x2 sub-blocks from the above/left neighbours.
+ int r, c;
+ uint8_t buffer[33][33]; // row 0 and column 0 hold the neighbours, the rest receives the prediction
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+ // buffer has room for at most a 32x32 block plus one border row and column.
+ assert(bw <= 32 && bh <= 32);
+
+ // The initialization is just for silencing Jenkins static analysis warnings
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+ // Column 0 takes the left neighbours; row 0 takes above[-1] (the top-left pixel) followed by above[0..bw-1].
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+ // Four 128-bit loads from the tap table rows 0, 2, 4 and 6 of the selected mode.
+ const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
+ const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
+ const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
+ const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
+ const __m128i filter_intra_scale_bits =
+ _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
+ // Walk the block in 4-wide, 2-high sub-blocks; later sub-blocks read pixels that earlier ones wrote into buffer.
+ for (r = 1; r < bh + 1; r += 2) {
+ for (c = 1; c < bw + 1; c += 4) {
+ DECLARE_ALIGNED(16, uint8_t, p[8]);
+ memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); // top-left pixel plus the four pixels above the sub-block
+ p[5] = buffer[r][c - 1]; // left neighbour of the sub-block's first row
+ p[6] = buffer[r + 1][c - 1]; // left neighbour of the sub-block's second row
+ p[7] = 0; // padding so the 8-byte input is fully defined
+ const __m128i p_b = xx_loadl_64(p);
+ const __m128i in = _mm_unpacklo_epi64(p_b, p_b); // the 8 input bytes duplicated in both halves
+ const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
+ const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+ const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+ const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+ const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+ const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+ const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567); // eight 16-bit weighted sums, one per output pixel
+ // Rounding: mulhrs by 1 << (15 - FILTER_INTRA_SCALE_BITS) is a rounding right shift by FILTER_INTRA_SCALE_BITS
+ const __m128i round_w =
+ _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+ const __m128i out_r = _mm_packus_epi16(round_w, round_w); // saturate to 8 bits
+ const __m128i out_r1 = _mm_srli_si128(out_r, 4);
+ // Storing: pixels 0-3 go to row r, pixels 4-7 to row r + 1 (assumes xx_storel_32 writes 4 bytes -- TODO confirm in synonyms.h)
+ xx_storel_32(&buffer[r][c], out_r);
+ xx_storel_32(&buffer[r + 1][c], out_r1);
+ }
+ }
+ // Copy the predicted pixels, without the border row and column, to dst.
+ for (r = 0; r < bh; ++r) {
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+ dst += stride;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
new file mode 100644
index 0000000000..ae68f0bbb3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4,
+ const int subpel_y_q4,
+ ConvolveParams *conv_params, int bd) { // Separable 2-D filter for 16-bit pixels: horizontal pass into im_block, then vertical pass into dst.
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); // intermediate rows for one 8-column strip
+ int im_h = h + filter_params_y->taps - 1; // intermediate rows needed by the vertical pass
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ // The horizontal rounding constant also adds an offset of 1 << (bd + FILTER_BITS - 1).
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ // NOTE(review): the term subtracted here presumably cancels the horizontal offset after vertical filtering -- confirm the taps sum to 1 << FILTER_BITS.
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ // Shift that remains after round_0 and round_1 have been applied.
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); // largest pixel value for the bit depth
+ const __m256i zero = _mm256_setzero_si256();
+ // prepare_coeffs and convolve are not defined in this file; presumably they come from aom_dsp/x86/convolve_avx2.h.
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+ // Process the block in strips of 8 columns.
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter: two source rows per iteration, one per 128-bit lane */
+ {
+ for (i = 0; i < im_h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+ // r0 = pixels 0-7 of row i (low lane) and of row i + 1 (high lane); r1 = pixels 8-15 of the same rows.
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels: windows starting at pixel offsets 0, 2, 4, 6
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels: the same windows shifted by one pixel (2 bytes)
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ // Narrow to 16 bits and interleave even/odd results back into pixel order within each lane.
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+ // One 32-byte store writes intermediate rows i and i + 1 (8 values each).
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter: each 256-bit load spans two consecutive intermediate rows */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+ // s[0..3] hold interleaved row pairs for columns 0-3; the low lane feeds output row i, the high lane row i + 1.
+ s[0] = _mm256_unpacklo_epi16(s0, s1);
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+ // s[4..7] hold the same row pairs for columns 4-7.
+ s[4] = _mm256_unpackhi_epi16(s0, s1);
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+ // Two output rows per iteration.
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+ // Complete the window with the newest row pairs.
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+ // Columns 0-3: apply round_1 (with the offset term) and then the remaining 'bits' shift.
+ const __m256i res_a = convolve(s, coeffs_y);
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_bits), round_shift_bits);
+ // More than 4 columns left in this strip: also compute columns 4-7 and store 8 pixels per row.
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+ res_b_round =
+ _mm256_sra_epi32(_mm256_add_epi32(res_b_round, round_const_bits),
+ round_shift_bits);
+ // Narrow to 16 bits and clamp to [0, clip_pixel].
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+ // Low lane goes to row i, high lane to row i + 1.
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+ // Store 4 pixels (8 bytes) per row.
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+ // Remaining widths: presumably xx_storel_32 writes 4 bytes (2 pixels) per row -- confirm in synonyms.h.
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+ // Slide both windows down by one row pair (two rows).
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+/* Copies 64 uint16_t values from src to dst with four unaligned 256-bit
+ * loads followed by four unaligned 256-bit stores. */
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+  enum { kVectors = 4, kLanes = 16 };
+  __m256i chunk[kVectors];
+  // All loads complete before the first store, as in a plain register copy.
+  for (int k = 0; k < kVectors; ++k) {
+    chunk[k] = _mm256_loadu_si256((const __m256i *)(src + k * kLanes));
+  }
+  for (int k = 0; k < kVectors; ++k) {
+    _mm256_storeu_si256((__m256i *)(dst + k * kLanes), chunk[k]);
+  }
+}
+
+/* Copies 128 uint16_t values from src to dst with eight unaligned 256-bit
+ * loads followed by eight unaligned 256-bit stores. */
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+  enum { kVectors = 8, kLanes = 16 };
+  __m256i chunk[kVectors];
+  // All loads complete before the first store, as in a plain register copy.
+  for (int k = 0; k < kVectors; ++k) {
+    chunk[k] = _mm256_loadu_si256((const __m256i *)(src + k * kLanes));
+  }
+  for (int k = 0; k < kVectors; ++k) {
+    _mm256_storeu_si256((__m256i *)(dst + k * kLanes), chunk[k]);
+  }
+}
+
+/* Copies a w x h block of 16-bit pixels from src to dst. Every path handles
+ * two rows per loop iteration and loops until h reaches zero, so h must be a
+ * positive even number. The filter, sub-pixel, conv_params and bd arguments
+ * are ignored. Widths 2, 4, 8, 16, 32 and 64 have dedicated paths; any other
+ * width is copied as 128-pixel rows. The w == 8 path uses aligned 128-bit
+ * stores to dst. */
+void av1_highbd_convolve_2d_copy_sr_avx2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  switch (w) {
+    case 2:
+      do {
+        memcpy(dst, src, 2 * sizeof(*src));
+        src += src_stride;
+        dst += dst_stride;
+        memcpy(dst, src, 2 * sizeof(*src));
+        src += src_stride;
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    case 4:
+      do {
+        // Both rows are loaded before either is stored.
+        const __m128i top = _mm_loadl_epi64((const __m128i *)src);
+        src += src_stride;
+        const __m128i bottom = _mm_loadl_epi64((const __m128i *)src);
+        src += src_stride;
+        _mm_storel_epi64((__m128i *)dst, top);
+        dst += dst_stride;
+        _mm_storel_epi64((__m128i *)dst, bottom);
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    case 8:
+      do {
+        const __m128i top = _mm_loadu_si128((const __m128i *)src);
+        src += src_stride;
+        const __m128i bottom = _mm_loadu_si128((const __m128i *)src);
+        src += src_stride;
+        _mm_store_si128((__m128i *)dst, top);
+        dst += dst_stride;
+        _mm_store_si128((__m128i *)dst, bottom);
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    case 16:
+      do {
+        const __m256i top = _mm256_loadu_si256((const __m256i *)src);
+        src += src_stride;
+        const __m256i bottom = _mm256_loadu_si256((const __m256i *)src);
+        src += src_stride;
+        _mm256_storeu_si256((__m256i *)dst, top);
+        dst += dst_stride;
+        _mm256_storeu_si256((__m256i *)dst, bottom);
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    case 32:
+      do {
+        const __m256i top_lo = _mm256_loadu_si256((const __m256i *)(src + 0));
+        const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(src + 16));
+        src += src_stride;
+        const __m256i bot_lo = _mm256_loadu_si256((const __m256i *)(src + 0));
+        const __m256i bot_hi = _mm256_loadu_si256((const __m256i *)(src + 16));
+        src += src_stride;
+        _mm256_storeu_si256((__m256i *)(dst + 0), top_lo);
+        _mm256_storeu_si256((__m256i *)(dst + 16), top_hi);
+        dst += dst_stride;
+        _mm256_storeu_si256((__m256i *)(dst + 0), bot_lo);
+        _mm256_storeu_si256((__m256i *)(dst + 16), bot_hi);
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    case 64:
+      do {
+        copy_64(src, dst);
+        src += src_stride;
+        dst += dst_stride;
+        copy_64(src, dst);
+        src += src_stride;
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+
+    default:
+      do {
+        copy_128(src, dst);
+        src += src_stride;
+        dst += dst_stride;
+        copy_128(src, dst);
+        src += src_stride;
+        dst += dst_stride;
+      } while ((h -= 2) != 0);
+      break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
new file mode 100644
index 0000000000..15f8872c18
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+
+/* Copies 64 uint16_t values from src to dst: eight unaligned 128-bit loads
+ * followed by eight aligned 128-bit stores, so dst must be 16-byte aligned. */
+static INLINE void copy_64(const uint16_t *src, uint16_t *dst) {
+  enum { kVectors = 8, kLanes = 8 };
+  __m128i chunk[kVectors];
+  // All loads complete before the first store, as in a plain register copy.
+  for (int k = 0; k < kVectors; ++k) {
+    chunk[k] = _mm_loadu_si128((const __m128i *)(src + k * kLanes));
+  }
+  for (int k = 0; k < kVectors; ++k) {
+    _mm_store_si128((__m128i *)(dst + k * kLanes), chunk[k]);
+  }
+}
+
+/* Copies 128 uint16_t values from src to dst: sixteen unaligned 128-bit loads
+ * followed by sixteen aligned 128-bit stores, so dst must be 16-byte
+ * aligned. */
+static INLINE void copy_128(const uint16_t *src, uint16_t *dst) {
+  enum { kVectors = 16, kLanes = 8 };
+  __m128i chunk[kVectors];
+  // All loads complete before the first store, as in a plain register copy.
+  for (int k = 0; k < kVectors; ++k) {
+    chunk[k] = _mm_loadu_si128((const __m128i *)(src + k * kLanes));
+  }
+  for (int k = 0; k < kVectors; ++k) {
+    _mm_store_si128((__m128i *)(dst + k * kLanes), chunk[k]);
+  }
+}
+
+/* Copies a w x h block of 16-bit pixels from src to dst. Every path handles
+ * two rows per loop iteration and loops until h reaches zero, so h must be a
+ * positive even number. The filter, sub-pixel, conv_params and bd arguments
+ * are ignored. Row loads are unaligned; for w >= 8 the stores are aligned
+ * 128-bit stores, so each dst row must start on a 16-byte boundary for those
+ * widths. Widths 2, 4, 8, 16, 32 and 64 have dedicated paths; any other
+ * width is copied as 128-pixel rows. */
+void av1_highbd_convolve_2d_copy_sr_sse2(
+    const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+    int h, const InterpFilterParams *filter_params_x,
+    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+  (void)conv_params;
+  (void)bd;
+  if (w >= 16) {
+    assert(!((intptr_t)dst % 16));
+    assert(!(dst_stride % 16));
+  }
+
+  if (w == 2) {
+    // Copy exactly two pixels per row with plain element accesses. An 8-byte
+    // vector load would read two pixels past the end of the row, and a store
+    // through a uint32_t pointer would alias the uint16_t buffer.
+    do {
+      uint16_t p0 = src[0];
+      uint16_t p1 = src[1];
+      dst[0] = p0;
+      dst[1] = p1;
+      src += src_stride;
+      dst += dst_stride;
+      p0 = src[0];
+      p1 = src[1];
+      dst[0] = p0;
+      dst[1] = p1;
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 4) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadl_epi64((__m128i *)src);
+      src += src_stride;
+      _mm_storel_epi64((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_storel_epi64((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 8) {
+    do {
+      __m128i s[2];
+      s[0] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      s[1] = _mm_loadu_si128((__m128i *)src);
+      src += src_stride;
+      _mm_store_si128((__m128i *)dst, s[0]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)dst, s[1]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 16) {
+    do {
+      __m128i s[4];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 32) {
+    do {
+      __m128i s[8];
+      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+      src += src_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+      dst += dst_stride;
+      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else if (w == 64) {
+    do {
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_64(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  } else {
+    do {
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      copy_128(src, dst);
+      src += src_stride;
+      dst += dst_stride;
+      h -= 2;
+    } while (h);
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
new file mode 100644
index 0000000000..3f8dafb4b3
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_sse4.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_sse4_1(  // Copy (no-filter) case: left-shifts src pixels, then stores them to conv_params->dst or combines them with it into dst0.
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // buffer taken from conv_params (not the dst0 argument)
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;  // the filter and subpel arguments are not used in this function
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;  // left shift applied to every source pixel
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);  // shift count operand for _mm_sll_epi16
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // constant added to the shifted pixels below
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const __m128i offset_const_16b = _mm_set1_epi16(offset);  // 16-bit copy of offset, used by the non-averaging path
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // upper clip value; any bd other than 10 or 12 selects 255
+
+ assert(bits <= 4);
+
+ if (!(w % 8)) {  // w is a multiple of 8: one row, 8 pixels per inner iteration
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_16bit =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i res = _mm_sll_epi16(src_16bit, left_shift);  // src << bits in each 16-bit lane
+ if (do_average) {  // combine with the values already stored in dst
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);  // zero-extend the low 4 values to 32 bits
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);  // zero-extend the high 4 values to 32 bits
+
+ const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_32b_lo, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);  // back to 16 bits with unsigned saturation
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);  // aligned 16-byte store into the dst0 output
+ } else {  // no averaging: write the offset result into dst
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);  // unsigned saturating add
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {  // w is a multiple of 4 but not of 8: two rows, 4 pixels each, per inner iteration
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 4) {
+ const __m128i src_row_0 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
+ const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);  // row i in the low half, row i + 1 in the high half
+
+ const __m128i res = _mm_sll_epi16(src_10, left_shift);
+
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
+ const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);
+
+ const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_32b_hi, offset_const);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_1 = _mm_srli_si128(res_clip, 8);  // move the row i + 1 results down to the low half
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m128i res_unsigned_16b =
+ _mm_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }  // widths that are not a multiple of 4 reach this point without anything being written
+}
+
+void av1_highbd_jnt_convolve_2d_sse4_1(  // 2D filter done as a horizontal pass into im_block followed by a vertical pass; the result is stored to conv_params->dst or combined with it into dst0.
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);  // intermediate rows written by the horizontal pass
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;  // number of intermediate rows the horizontal pass produces
+ int im_stride = MAX_SB_SIZE;  // row pitch of im_block
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // back up fo_vert rows and fo_horiz columns from src
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // constant added to the vertical-pass result before storing/averaging
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // upper clip value; any bd other than 10 or 12 selects 255
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);  // loads eight 16-bit coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));  // rounding term plus an extra 1 << (bd + FILTER_BITS - 1) offset
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {  // 8 outputs per iteration, read from 16 consecutive source pixels
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);  // signed saturation to 16 bits
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));  // rounding term minus 1 << (bd + 2 * FILTER_BITS - round_0 - 1)
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];  // first of the 8 intermediate rows read for output row i
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+
+ const __m128i res_unsigned_lo =
+ _mm_add_epi32(res_lo_round, offset_const);
+
+ if (w < 8) {  // narrow block: only the low 4 results are used and stored
+ if (do_average) {
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i data_ref_0 = _mm_cvtepu16_epi32(data_0);  // zero-extend 4 stored values to 32 bits
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ } else {  // 8 results: also round and offset the high 4
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+ const __m128i res_unsigned_hi =
+ _mm_add_epi32(res_hi_round, offset_const);
+
+ if (do_average) {
+ const __m128i data_lo =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_hi =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j + 4]));
+
+ const __m128i data_ref_0_lo = _mm_cvtepu16_epi32(data_lo);
+ const __m128i data_ref_0_hi = _mm_cvtepu16_epi32(data_hi);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);  // aligned 16-byte store into the dst0 output
+ } else {
+ const __m128i res_16b =
+ _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);  // aligned 16-byte store into dst
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
new file mode 100644
index 0000000000..1d029db39a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_convolve_2d_ssse3.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_convolve_2d_sr_ssse3(  // 2D filter that writes clipped pixels directly to dst; works on 8-column strips, horizontal pass into im_block then vertical pass.
+ const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);  // intermediate rows for one 8-column strip
+ int im_h = h + filter_params_y->taps - 1;  // number of intermediate rows the horizontal pass produces
+ int im_stride = 8;  // one intermediate row holds 8 values
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // back up fo_vert rows and fo_horiz columns from src
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+ __m128i coeffs_x[4], coeffs_y[4], s[16];  // s[0..7] are used for output row i and s[8..15] for output row i + 1 (see the vertical pass)
+
+ const __m128i round_const_x = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));  // rounding term plus an extra 1 << (bd + FILTER_BITS - 1) offset
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));  // rounding term minus 1 << (bd + 2 * FILTER_BITS - round_0 - 1)
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;  // final right shift applied after the vertical rounding
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // upper clip value; any bd other than 10 or 12 selects 255
+ const __m128i zero = _mm_setzero_si128();
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);  // interleave even and odd results back into pixel order
+
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], res);  // aligned store of intermediate row i
+ }
+ }
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(im_block + 0 * im_stride));  // preload intermediate rows 0..6
+ __m128i s1 = _mm_loadu_si128((__m128i *)(im_block + 1 * im_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(im_block + 2 * im_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(im_block + 3 * im_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(im_block + 4 * im_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(im_block + 5 * im_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(im_block + 6 * im_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration
+ const int16_t *data = &im_block[i * im_stride];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * im_stride));  // the two intermediate rows that are new in this iteration
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * im_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 =
+ _mm_sra_epi32(_mm_add_epi32(res_a0, round_const_y), round_shift_y);
+ res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 =
+ _mm_sra_epi32(_mm_add_epi32(res_a1, round_const_y), round_shift_y);
+ res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a_round1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {  // more than 4 columns remain: compute the high 4 results too and store 8 pixels per row
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_y), round_shift_y);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_y), round_shift_y);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);  // clamp to [0, clip_pixel] with the min/max pair
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {  // store 4 pixels per row
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {  // remaining case: store 2 pixels per row through a single 32-bit write
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((uint32_t *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+ s[0] = s[1];  // slide every row-pair window forward by one pair (two intermediate rows)
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;  // the newest row is paired with the next s7 on the following iteration
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
new file mode 100644
index 0000000000..ade2af03e4
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -0,0 +1,1349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+// Note:
+// A total of 32x4 registers represents the coefficients of a 32x32 block.
+// For high bit depth, each coefficient is 4 bytes.
+// Each __m256i register holds 8 coefficients,
+// so each "row" needs 4 registers, and there are 32 rows in total.
+// Register layout:
+// v0, v1, v2, v3,
+// v4, v5, v6, v7,
+// ... ...
+// v124, v125, v126, v127
+
+static INLINE __m256i highbd_clamp_epi16_avx2(__m256i u, int bd) {  // Clamps each 16-bit lane of u, read as a signed value, to [0, (1 << bd) - 1].
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i max = _mm256_sub_epi16(_mm256_slli_epi16(one, bd), one);  // (1 << bd) - 1 in every lane
+ __m256i clamped, mask;
+
+ mask = _mm256_cmpgt_epi16(u, max);  // signed compare: all-ones in lanes where u > max
+ clamped = _mm256_andnot_si256(mask, u);  // keep u where it is not above max
+ mask = _mm256_and_si256(mask, max);  // max in the lanes that were above max
+ clamped = _mm256_or_si256(mask, clamped);
+ mask = _mm256_cmpgt_epi16(clamped, zero);  // signed compare; NOTE(review): lanes >= 0x8000 count as negative and become 0 -- confirm callers never produce them
+ clamped = _mm256_and_si256(clamped, mask);  // zero the lanes that are not positive
+
+ return clamped;
+}
+
+static INLINE __m256i highbd_get_recon_16x8_avx2(const __m256i pred,  // Adds the 32-bit values in res0/res1 to the 16 16-bit values in pred and returns the 16 clamped 16-bit sums.
+ __m256i res0, __m256i res1,
+ const int bd) {
+ __m256i x0 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(pred));  // sign-extend the low 8 values of pred to 32 bits
+ __m256i x1 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(pred, 1));  // sign-extend the high 8 values of pred to 32 bits
+
+ x0 = _mm256_add_epi32(res0, x0);
+ x1 = _mm256_add_epi32(res1, x1);
+ x0 = _mm256_packus_epi32(x0, x1);  // unsigned-saturating pack, done separately in each 128-bit half
+ x0 = _mm256_permute4x64_epi64(x0, 0xd8);  // reorder the 64-bit quarters as 0, 2, 1, 3 to undo the per-half interleave of the pack
+ x0 = highbd_clamp_epi16_avx2(x0, bd);
+ return x0;
+}
+
+static INLINE void highbd_write_buffer_16xn_avx2(__m256i *in, uint16_t *output,  // Adds the vectors in in[] to `height` rows of 16 values in output, in place, clamping each result via highbd_get_recon_16x8_avx2.
+ int stride, int flipud,
+ int height, const int bd) {
+ int j = flipud ? (height - 1) : 0;  // index into in[]: starts at the last row when flipud is set
+ const int step = flipud ? -1 : 1;  // and then walks backwards
+ for (int i = 0; i < height; ++i, j += step) {
+ __m256i v = _mm256_loadu_si256((__m256i const *)(output + i * stride));
+ __m256i u = highbd_get_recon_16x8_avx2(v, in[j], in[j + height], bd);  // in[j] is added to the low 8 values, in[j + height] to the high 8
+
+ _mm256_storeu_si256((__m256i *)(output + i * stride), u);
+ }
+}
+
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {  // Returns (vec + (1 << (bit - 1))) >> bit for each 32-bit lane, using an arithmetic shift.
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));  // the only caller visible here passes bit > 0; bit <= 0 would make this shift count negative
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,  // Shifts `size` vectors from input into output: rounding right shift when bit > 0, left shift by -bit otherwise.
+ __m256i *output,
+ const int size,
+ const int bit) {
+ if (bit > 0) {  // positive bit: rounding arithmetic right shift
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = av1_round_shift_32_avx2(input[i], bit);
+ }
+ } else {  // zero or negative bit: left shift by -bit (a shift of 0 copies the input)
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static void transpose_8x8_avx2(const __m256i *in, __m256i *out) {  // Transposes an 8x8 matrix of 32-bit values: in[0..7] are the rows, out[0..7] receive the columns.
+ __m256i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m256i x0, x1;
+
+ u0 = _mm256_unpacklo_epi32(in[0], in[1]);  // step 1: interleave 32-bit elements of adjacent row pairs
+ u1 = _mm256_unpackhi_epi32(in[0], in[1]);
+
+ u2 = _mm256_unpacklo_epi32(in[2], in[3]);
+ u3 = _mm256_unpackhi_epi32(in[2], in[3]);
+
+ u4 = _mm256_unpacklo_epi32(in[4], in[5]);
+ u5 = _mm256_unpackhi_epi32(in[4], in[5]);
+
+ u6 = _mm256_unpacklo_epi32(in[6], in[7]);
+ u7 = _mm256_unpackhi_epi32(in[6], in[7]);
+
+ x0 = _mm256_unpacklo_epi64(u0, u2);  // step 2: gather one column's elements from rows 0..3 (x0) and rows 4..7 (x1)
+ x1 = _mm256_unpacklo_epi64(u4, u6);
+ out[0] = _mm256_permute2f128_si256(x0, x1, 0x20);  // step 3: 0x20 joins the low 128-bit halves of x0 and x1
+ out[4] = _mm256_permute2f128_si256(x0, x1, 0x31);  // 0x31 joins the high 128-bit halves of x0 and x1
+
+ x0 = _mm256_unpackhi_epi64(u0, u2);
+ x1 = _mm256_unpackhi_epi64(u4, u6);
+ out[1] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[5] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpacklo_epi64(u1, u3);
+ x1 = _mm256_unpacklo_epi64(u5, u7);
+ out[2] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[6] = _mm256_permute2f128_si256(x0, x1, 0x31);
+
+ x0 = _mm256_unpackhi_epi64(u1, u3);
+ x1 = _mm256_unpackhi_epi64(u5, u7);
+ out[3] = _mm256_permute2f128_si256(x0, x1, 0x20);
+ out[7] = _mm256_permute2f128_si256(x0, x1, 0x31);
+}
+
+static void load_buffer_32x32(const int32_t *coeff, __m256i *in,  // Loads `size` vectors of 8 int32 values from coeff into in[], one vector every input_stiride elements.
+ int input_stiride, int size) {
+ int i;
+ for (i = 0; i < size; ++i) {
+ in[i] = _mm256_loadu_si256((const __m256i *)(coeff + i * input_stiride));  // unaligned load, so coeff needs no particular alignment
+ }
+}
+
+static INLINE __m256i half_btf_0_avx2(const __m256i *w0, const __m256i *n0,  // Returns (w0 * n0 + rounding) >> bit for each 32-bit lane.
+ const __m256i *rounding, int bit) {
+ __m256i x;
+ x = _mm256_mullo_epi32(*w0, *n0);  // keeps only the low 32 bits of each product
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);  // arithmetic (sign-preserving) shift
+ return x;
+}
+
+static INLINE __m256i half_btf_avx2(const __m256i *w0, const __m256i *n0,  // Returns (w0 * n0 + w1 * n1 + rounding) >> bit for each 32-bit lane.
+ const __m256i *w1, const __m256i *n1,
+ const __m256i *rounding, int bit) {
+ __m256i x, y;
+
+ x = _mm256_mullo_epi32(*w0, *n0);  // keeps only the low 32 bits of each product
+ y = _mm256_mullo_epi32(*w1, *n1);
+ x = _mm256_add_epi32(x, y);
+ x = _mm256_add_epi32(x, *rounding);
+ x = _mm256_srai_epi32(x, bit);  // arithmetic (sign-preserving) shift
+ return x;
+}
+
+static void addsub_avx2(const __m256i in0, const __m256i in1, __m256i *out0,  // Writes in0 + in1 to *out0 and in0 - in1 to *out1, each clamped per 32-bit lane to [*clamp_lo, *clamp_hi].
+ __m256i *out1, const __m256i *clamp_lo,
+ const __m256i *clamp_hi) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);  // both results are computed from the by-value inputs, so out0/out1 may point at the caller's source slots
+ __m256i a1 = _mm256_sub_epi32(in0, in1);
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_no_clamp_avx2(const __m256i in0, const __m256i in1,  // Writes in0 + in1 to *out0 and in0 - in1 to *out1 per 32-bit lane, with no clamping.
+ __m256i *out0, __m256i *out1) {
+ __m256i a0 = _mm256_add_epi32(in0, in1);  // wrapping 32-bit add
+ __m256i a1 = _mm256_sub_epi32(in0, in1);  // wrapping 32-bit subtract
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static void addsub_shift_avx2(const __m256i in0, const __m256i in1,  // Writes clamp((in0 + in1 + offset) >> shift) to *out0 and clamp((in0 - in1 + offset) >> shift) to *out1, per 32-bit lane.
+ __m256i *out0, __m256i *out1,
+ const __m256i *clamp_lo, const __m256i *clamp_hi,
+ int shift) {
+ __m256i offset = _mm256_set1_epi32((1 << shift) >> 1);  // rounding offset: half of 1 << shift (0 when shift is 0)
+ __m256i in0_w_offset = _mm256_add_epi32(in0, offset);  // fold the offset into in0 once so both results share it
+ __m256i a0 = _mm256_add_epi32(in0_w_offset, in1);
+ __m256i a1 = _mm256_sub_epi32(in0_w_offset, in1);
+
+ a0 = _mm256_sra_epi32(a0, _mm_cvtsi32_si128(shift));  // arithmetic right shift by a run-time count
+ a1 = _mm256_sra_epi32(a1, _mm_cvtsi32_si128(shift));
+
+ a0 = _mm256_max_epi32(a0, *clamp_lo);  // clamping happens after the shift
+ a0 = _mm256_min_epi32(a0, *clamp_hi);
+ a1 = _mm256_max_epi32(a1, *clamp_lo);
+ a1 = _mm256_min_epi32(a1, *clamp_hi);
+
+ *out0 = a0;
+ *out1 = a1;
+}
+
+static INLINE void idct32_stage4_avx2(  // Updates bf1 in place: replaces the pairs (17,30), (18,29), (21,26) and (22,25) with half_btf_avx2 combinations of themselves.
+ __m256i *bf1, const __m256i *cospim8, const __m256i *cospi56,
+ const __m256i *cospi8, const __m256i *cospim56, const __m256i *cospim40,
+ const __m256i *cospi24, const __m256i *cospi40, const __m256i *cospim24,
+ const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);  // held in a temporary so the next line still reads the original bf1[17]
+ bf1[30] = half_btf_avx2(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_avx2(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);
+ bf1[29] = half_btf_avx2(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_avx2(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);
+ bf1[26] = half_btf_avx2(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 = half_btf_avx2(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] = half_btf_avx2(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_avx2(  // Updates bf1 in place: half_btf_avx2 combinations on the pairs (9,14) and (10,13), then clamped add/sub on pairs within bf1[16..31].
+ __m256i *bf1, const __m256i *cospim16, const __m256i *cospi48,
+ const __m256i *cospi16, const __m256i *cospim48, const __m256i *clamp_lo,
+ const __m256i *clamp_hi, const __m256i *rounding, int bit) {
+ __m256i temp1, temp2;
+ temp1 = half_btf_avx2(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);  // held in a temporary so the next line still reads the original bf1[9]
+ bf1[14] = half_btf_avx2(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 = half_btf_avx2(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] = half_btf_avx2(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_avx2(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);  // in-place is safe: addsub_avx2 receives the inputs by value; the first slot gets the sum, the second the difference
+ addsub_avx2(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_avx2(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+// Stage-6 work of the idct32 flow on slots 5..6, 8..15 and 18..29 of bf1, in
+// place.
+//
+// Slot pair (5, 6) and the pairs (18, 29), (19, 28), (20, 27), (21, 26) are
+// rotated with half_btf_avx2; slots 8..15 are combined pairwise with
+// addsub_avx2, which is handed clamp_lo / clamp_hi.
+static INLINE void idct32_stage6_avx2(
+    __m256i *bf1, const __m256i *cospim32, const __m256i *cospi32,
+    const __m256i *cospim16, const __m256i *cospi48, const __m256i *cospi16,
+    const __m256i *cospim48, const __m256i *clamp_lo, const __m256i *clamp_hi,
+    const __m256i *rounding, int bit) {
+  // Each rotation pair is evaluated completely before either slot is stored.
+  __m256i lo, hi;
+
+  // pair (5, 6)
+  lo = half_btf_avx2(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  hi = half_btf_avx2(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+  bf1[5] = lo;
+  bf1[6] = hi;
+
+  // Butterflies over 8..15 (first slot gets out0, second gets out1).
+  addsub_avx2(bf1[8], bf1[11], &bf1[8], &bf1[11], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[9], bf1[10], &bf1[9], &bf1[10], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[15], bf1[12], &bf1[15], &bf1[12], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[14], bf1[13], &bf1[14], &bf1[13], clamp_lo, clamp_hi);
+
+  // pair (18, 29)
+  lo = half_btf_avx2(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);
+  hi = half_btf_avx2(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+  bf1[18] = lo;
+  bf1[29] = hi;
+
+  // pair (19, 28)
+  lo = half_btf_avx2(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);
+  hi = half_btf_avx2(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+  bf1[19] = lo;
+  bf1[28] = hi;
+
+  // pair (20, 27)
+  lo = half_btf_avx2(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+  hi = half_btf_avx2(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+  bf1[20] = lo;
+  bf1[27] = hi;
+
+  // pair (21, 26)
+  lo = half_btf_avx2(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+  hi = half_btf_avx2(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+  bf1[21] = lo;
+  bf1[26] = hi;
+}
+
+// Stage-7 work of the idct32 flow, in place on bf1.
+//
+// Slots 0..7 and 16..31 are combined pairwise with addsub_avx2 (handed
+// clamp_lo / clamp_hi); the pairs (10, 13) and (11, 12) are rotated with
+// half_btf_avx2 using cospim32 / cospi32. Slots 8, 9, 14 and 15 are untouched.
+static INLINE void idct32_stage7_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  __m256i lo, hi;
+
+  // Butterflies over 0..7 (first slot gets out0, second gets out1).
+  addsub_avx2(bf1[0], bf1[7], &bf1[0], &bf1[7], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[1], bf1[6], &bf1[1], &bf1[6], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[2], bf1[5], &bf1[2], &bf1[5], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[3], bf1[4], &bf1[3], &bf1[4], clamp_lo, clamp_hi);
+
+  // pair (10, 13): evaluated completely before either slot is stored.
+  lo = half_btf_avx2(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  hi = half_btf_avx2(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+  bf1[10] = lo;
+  bf1[13] = hi;
+
+  // pair (11, 12)
+  lo = half_btf_avx2(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  hi = half_btf_avx2(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+  bf1[11] = lo;
+  bf1[12] = hi;
+
+  // Butterflies over 16..31.
+  addsub_avx2(bf1[16], bf1[23], &bf1[16], &bf1[23], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[17], bf1[22], &bf1[17], &bf1[22], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[18], bf1[21], &bf1[18], &bf1[21], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[19], bf1[20], &bf1[19], &bf1[20], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[31], bf1[24], &bf1[31], &bf1[24], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[30], bf1[25], &bf1[30], &bf1[25], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[29], bf1[26], &bf1[29], &bf1[26], clamp_lo, clamp_hi);
+  addsub_avx2(bf1[28], bf1[27], &bf1[28], &bf1[27], clamp_lo, clamp_hi);
+}
+
+// Stage-8 work of the idct32 flow, in place on bf1.
+//
+// Slots 0..15 are combined as mirrored pairs (i, 15 - i) with addsub_avx2
+// (handed clamp_lo / clamp_hi). Slots 20..27 are rotated as mirrored pairs
+// (20, 27), (21, 26), (22, 25), (23, 24) with half_btf_avx2 using
+// cospim32 / cospi32. Slots 16..19 and 28..31 are untouched.
+static INLINE void idct32_stage8_avx2(__m256i *bf1, const __m256i *cospim32,
+                                      const __m256i *cospi32,
+                                      const __m256i *clamp_lo,
+                                      const __m256i *clamp_hi,
+                                      const __m256i *rounding, int bit) {
+  // Butterflies (0, 15), (1, 14), ..., (7, 8); the lower slot gets out0.
+  for (int i = 0; i < 8; ++i) {
+    const int mirror = 15 - i;
+    addsub_avx2(bf1[i], bf1[mirror], &bf1[i], &bf1[mirror], clamp_lo,
+                clamp_hi);
+  }
+
+  // Rotations (20, 27), (21, 26), (22, 25), (23, 24). Both results are
+  // computed from the pair's current values before either slot is stored.
+  for (int i = 20; i < 24; ++i) {
+    const int mirror = 47 - i;
+    const __m256i lo =
+        half_btf_avx2(cospim32, &bf1[i], cospi32, &bf1[mirror], rounding, bit);
+    const __m256i hi =
+        half_btf_avx2(cospi32, &bf1[i], cospi32, &bf1[mirror], rounding, bit);
+    bf1[i] = lo;
+    bf1[mirror] = hi;
+  }
+}
+
+// Final (stage 9) butterflies of the idct32 flow: combines the mirrored pairs
+// (i, 31 - i) of bf1 and writes the results to out[i] and out[31 - i].
+//
+// do_cols != 0: plain add/sub via addsub_no_clamp_avx2.
+// do_cols == 0: add/sub with rounding shift by out_shift and clamping, via
+//               addsub_shift_avx2. The clamp range is the tighter of the
+//               max(16, bd + 6)-bit signed range and the bounds derived from
+//               log_range - 1 - out_shift.
+static INLINE void idct32_stage9_avx2(__m256i *bf1, __m256i *out,
+                                      const int do_cols, const int bd,
+                                      const int out_shift,
+                                      const int log_range) {
+  if (!do_cols) {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int out_limit = 1 << (log_range_out - 1);
+    const int shifted_limit = 1 << (log_range - 1 - out_shift);
+    // The upper shifted bound is used as-is (no "- 1"), unlike out_limit.
+    const __m256i clamp_lo_out =
+        _mm256_set1_epi32(AOMMAX(-out_limit, -shifted_limit));
+    const __m256i clamp_hi_out =
+        _mm256_set1_epi32(AOMMIN(out_limit - 1, shifted_limit));
+
+    for (int i = 0; i < 16; ++i) {
+      const int mirror = 31 - i;
+      addsub_shift_avx2(bf1[i], bf1[mirror], &out[i], &out[mirror],
+                        &clamp_lo_out, &clamp_hi_out, out_shift);
+    }
+    return;
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    const int mirror = 31 - i;
+    addsub_no_clamp_avx2(bf1[i], bf1[mirror], &out[i], &out[mirror]);
+  }
+}
+
+// idct32 variant that reads only in[0]: the single input is scaled by
+// cospi[32] with rounding, range-limited, and broadcast to all 32 outputs.
+//
+// in:        input vectors; only in[0] is read.
+// out:       receives 32 identical output vectors.
+// bit:       selects the cosine table via cospi_arr() and is the rounding
+//            shift applied after the multiply.
+// do_cols:   non-zero clamps to the max(16, bd + 6)-bit signed range;
+//            zero applies a rounding shift by out_shift and then clamps to
+//            the tighter of the max(16, bd + 6)-bit range and the bounds
+//            derived from log_range - 1 - out_shift.
+// bd:        feeds the range computations above.
+// out_shift: shift applied only when do_cols is zero.
+static void idct32_low1_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+  const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+
+  // Stages 0-5 collapse to one scaled term: (in[0] * cospi32 + rounding) >> bit.
+  __m256i dc = _mm256_mullo_epi32(in[0], cospi32);
+  dc = _mm256_srai_epi32(_mm256_add_epi32(dc, rounding), bit);
+
+  // Stages 6-9 only range-limit the value.
+  if (do_cols) {
+    const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+    const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+    dc = _mm256_min_epi32(_mm256_max_epi32(dc, clamp_lo), clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const int out_limit = 1 << (log_range_out - 1);
+    const int shifted_limit = 1 << (log_range - 1 - out_shift);
+    // The upper shifted bound is used as-is (no "- 1"), unlike out_limit.
+    const __m256i clamp_lo_out =
+        _mm256_set1_epi32(AOMMAX(-out_limit, -shifted_limit));
+    const __m256i clamp_hi_out =
+        _mm256_set1_epi32(AOMMIN(out_limit - 1, shifted_limit));
+    const __m256i round_bias = _mm256_set1_epi32((1 << out_shift) >> 1);
+
+    dc = _mm256_add_epi32(round_bias, dc);
+    dc = _mm256_sra_epi32(dc, _mm_cvtsi32_si128(out_shift));
+    dc = _mm256_min_epi32(_mm256_max_epi32(dc, clamp_lo_out), clamp_hi_out);
+  }
+
+  // Every output row carries the same value.
+  for (int i = 0; i < 32; ++i) {
+    out[i] = dc;
+  }
+}
+
+// idct32 variant that reads only in[0] through in[7]; every other working
+// slot of bf1 is derived from those eight inputs.
+// NOTE(review): this appears to assume in[8..31] are zero -- confirm against
+// the caller that selects this variant.
+//
+// in: input vectors; only in[0]..in[7] are read.
+// out: receives the 32 output vectors written by idct32_stage9_avx2.
+// bit: selects the cosine table via cospi_arr() and is passed as the shift
+// argument to the half_btf helpers.
+// do_cols: non-zero gives an intermediate range of max(16, bd + 6) bits,
+// zero gives max(16, bd + 8) bits; also forwarded to stage 9.
+// bd, out_shift: used for the range computation and forwarded to stage 9.
+static void idct32_low8_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ // Signed log_range-bit limits handed to the addsub_avx2 calls in stages 5-8.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ // Scatter the eight inputs into the working slots that are multiples of 4;
+ // the remaining slots are filled in by the stages below before being read.
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2
+ // Each pair of statements reads the same source slot: the derived slot is
+ // written first and the source slot is overwritten second, so the order
+ // within a pair must not be swapped.
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ // Same write-derived-then-overwrite-source ordering as stage 2.
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+ // The still-unset odd neighbours in 16..31 become plain copies of the
+ // slots computed in stage 2.
+ bf1[17] = bf1[16];
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+
+ // Slots 9, 10, 13 and 14 are copies of the stage-3 results.
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ // Shared helper: rotates pairs (17, 30), (18, 29), (21, 26), (22, 25).
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ // bf1[1] duplicates the scaled bf1[0]; slots 5 and 6 are copies of 4 and 7.
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ // Slots 3 and 2 are copies of slots 0 and 1.
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ // Final mirrored butterflies; this is the only step that writes to out.
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
+// idct32 variant that reads only in[0] through in[15]; every other working
+// slot of bf1 is derived from those sixteen inputs.
+// NOTE(review): this appears to assume in[16..31] are zero -- confirm against
+// the caller that selects this variant.
+//
+// in: input vectors; only in[0]..in[15] are read.
+// out: receives the 32 output vectors written by idct32_stage9_avx2.
+// bit: selects the cosine table via cospi_arr() and is passed as the shift
+// argument to the half_btf helpers.
+// do_cols: non-zero gives an intermediate range of max(16, bd + 6) bits,
+// zero gives max(16, bd + 8) bits; also forwarded to stage 9.
+// bd, out_shift: used for the range computation and forwarded to stage 9.
+static void idct32_low16_avx2(__m256i *in, __m256i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ // Signed log_range-bit limits handed to every addsub_avx2 call below.
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32];
+
+ {
+ // stage 0
+ // stage 1
+ // Scatter the sixteen inputs into the even working slots; the odd slots
+ // are filled in by the stages below before being read.
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2
+ // Each pair of statements reads the same source slot: the derived (odd)
+ // slot is written first and the source slot is overwritten second, so the
+ // order within a pair must not be swapped.
+ bf1[31] = half_btf_0_avx2(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_avx2(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_avx2(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_avx2(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_avx2(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_avx2(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_avx2(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_avx2(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_avx2(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_avx2(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_avx2(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_avx2(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_avx2(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_avx2(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_avx2(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_avx2(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3
+ // Same write-derived-then-overwrite-source ordering as stage 2.
+ bf1[15] = half_btf_0_avx2(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_avx2(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_avx2(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_avx2(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_avx2(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_avx2(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_avx2(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_avx2(&cospi12, &bf1[12], &rounding, bit);
+
+ // In-place butterflies on adjacent pairs of 16..31; operands are passed by
+ // value, so writing back into the same slots is safe.
+ addsub_avx2(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ // Derived slots 7 and 5 are written before their sources 4 and 6 are
+ // overwritten.
+ bf1[7] = half_btf_0_avx2(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_avx2(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_avx2(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_avx2(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ // Shared helper: rotates pairs (17, 30), (18, 29), (21, 26), (22, 25).
+ idct32_stage4_avx2(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5
+ // bf1[1] duplicates the scaled bf1[0]; bf1[3] is derived from bf1[2] before
+ // bf1[2] itself is overwritten.
+ bf1[0] = half_btf_0_avx2(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_avx2(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_avx2(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_avx2(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_avx2(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_avx2(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_avx2(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9
+ // Final mirrored butterflies; this is the only step that writes to out.
+ idct32_stage9_avx2(bf1, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
+static void idct32_avx2(__m256i *in, __m256i *out, int bit, int do_cols, int bd,
+ int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m256i cospi62 = _mm256_set1_epi32(cospi[62]);
+ const __m256i cospi30 = _mm256_set1_epi32(cospi[30]);
+ const __m256i cospi46 = _mm256_set1_epi32(cospi[46]);
+ const __m256i cospi14 = _mm256_set1_epi32(cospi[14]);
+ const __m256i cospi54 = _mm256_set1_epi32(cospi[54]);
+ const __m256i cospi22 = _mm256_set1_epi32(cospi[22]);
+ const __m256i cospi38 = _mm256_set1_epi32(cospi[38]);
+ const __m256i cospi6 = _mm256_set1_epi32(cospi[6]);
+ const __m256i cospi58 = _mm256_set1_epi32(cospi[58]);
+ const __m256i cospi26 = _mm256_set1_epi32(cospi[26]);
+ const __m256i cospi42 = _mm256_set1_epi32(cospi[42]);
+ const __m256i cospi10 = _mm256_set1_epi32(cospi[10]);
+ const __m256i cospi50 = _mm256_set1_epi32(cospi[50]);
+ const __m256i cospi18 = _mm256_set1_epi32(cospi[18]);
+ const __m256i cospi34 = _mm256_set1_epi32(cospi[34]);
+ const __m256i cospi2 = _mm256_set1_epi32(cospi[2]);
+ const __m256i cospim58 = _mm256_set1_epi32(-cospi[58]);
+ const __m256i cospim26 = _mm256_set1_epi32(-cospi[26]);
+ const __m256i cospim42 = _mm256_set1_epi32(-cospi[42]);
+ const __m256i cospim10 = _mm256_set1_epi32(-cospi[10]);
+ const __m256i cospim50 = _mm256_set1_epi32(-cospi[50]);
+ const __m256i cospim18 = _mm256_set1_epi32(-cospi[18]);
+ const __m256i cospim34 = _mm256_set1_epi32(-cospi[34]);
+ const __m256i cospim2 = _mm256_set1_epi32(-cospi[2]);
+ const __m256i cospi60 = _mm256_set1_epi32(cospi[60]);
+ const __m256i cospi28 = _mm256_set1_epi32(cospi[28]);
+ const __m256i cospi44 = _mm256_set1_epi32(cospi[44]);
+ const __m256i cospi12 = _mm256_set1_epi32(cospi[12]);
+ const __m256i cospi52 = _mm256_set1_epi32(cospi[52]);
+ const __m256i cospi20 = _mm256_set1_epi32(cospi[20]);
+ const __m256i cospi36 = _mm256_set1_epi32(cospi[36]);
+ const __m256i cospi4 = _mm256_set1_epi32(cospi[4]);
+ const __m256i cospim52 = _mm256_set1_epi32(-cospi[52]);
+ const __m256i cospim20 = _mm256_set1_epi32(-cospi[20]);
+ const __m256i cospim36 = _mm256_set1_epi32(-cospi[36]);
+ const __m256i cospim4 = _mm256_set1_epi32(-cospi[4]);
+ const __m256i cospi56 = _mm256_set1_epi32(cospi[56]);
+ const __m256i cospi24 = _mm256_set1_epi32(cospi[24]);
+ const __m256i cospi40 = _mm256_set1_epi32(cospi[40]);
+ const __m256i cospi8 = _mm256_set1_epi32(cospi[8]);
+ const __m256i cospim40 = _mm256_set1_epi32(-cospi[40]);
+ const __m256i cospim8 = _mm256_set1_epi32(-cospi[8]);
+ const __m256i cospim56 = _mm256_set1_epi32(-cospi[56]);
+ const __m256i cospim24 = _mm256_set1_epi32(-cospi[24]);
+ const __m256i cospi32 = _mm256_set1_epi32(cospi[32]);
+ const __m256i cospim32 = _mm256_set1_epi32(-cospi[32]);
+ const __m256i cospi48 = _mm256_set1_epi32(cospi[48]);
+ const __m256i cospim48 = _mm256_set1_epi32(-cospi[48]);
+ const __m256i cospi16 = _mm256_set1_epi32(cospi[16]);
+ const __m256i cospim16 = _mm256_set1_epi32(-cospi[16]);
+ const __m256i rounding = _mm256_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m256i clamp_lo = _mm256_set1_epi32(-(1 << (log_range - 1)));
+ const __m256i clamp_hi = _mm256_set1_epi32((1 << (log_range - 1)) - 1);
+ __m256i bf1[32], bf0[32];
+
+ {
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_avx2(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_avx2(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_avx2(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_avx2(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_avx2(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_avx2(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_avx2(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_avx2(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_avx2(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_avx2(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_avx2(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_avx2(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_avx2(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_avx2(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_avx2(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_avx2(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_avx2(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_avx2(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_avx2(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_avx2(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_avx2(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_avx2(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_avx2(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_avx2(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_avx2(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_avx2(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_avx2(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_avx2(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_avx2(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_avx2(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_avx2(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_avx2(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_avx2(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_avx2(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_avx2(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_avx2(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_avx2(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_avx2(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_avx2(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_avx2(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_avx2(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_avx2(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_avx2(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_avx2(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_avx2(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_avx2(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_avx2(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_avx2(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_avx2(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_avx2(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_avx2(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_avx2(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_avx2(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_avx2(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_avx2(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_avx2(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_avx2(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_avx2(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_avx2(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_avx2(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_avx2(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_avx2(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_avx2(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_avx2(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_avx2(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_avx2(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_avx2(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m256i clamp_lo_out = _mm256_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m256i clamp_hi_out = _mm256_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_avx2(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_avx2(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+typedef void (*transform_1d_avx2)(__m256i *in, __m256i *out, int bit,  // function-pointer type for 1-D inverse-transform kernels working on __m256i buffers
+ int do_cols, int bd, int out_shift);
+
+// 1-D kernel lookup table. The 2-D driver indexes it as
+// [txw_idx or txh_idx][1-D transform type][kernel variant chosen from eob].
+// Only first-dimension index 3, first 1-D type, is populated (the idct32
+// family); every other slot is NULL.
+static const transform_1d_avx2
+    highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+      // index 0
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      // index 1
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      // index 2
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      // index 3: idct32 variants
+      { { idct32_low1_avx2, idct32_low8_avx2, idct32_low16_avx2, idct32_avx2 },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } },
+      // index 4
+      { { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL },
+        { NULL, NULL, NULL, NULL } }
+    };
+
+static void highbd_inv_txfm2d_add_no_identity_avx2(const int32_t *input,  // 2-D inverse transform of 'input' added into 'output': row pass, column pass, rounding, then write-out
+ uint16_t *output, int stride,
+ TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {
+ __m256i buf1[64 * 2];  // intermediate buffer between the row pass and the column pass
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);  // eobx/eoby drive both the kernel variant and how many 8-wide groups are processed
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;  // count of 8-wide groups loaded per band
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;  // count of 8-row bands run through the row pass
+ const int input_stride = AOMMIN(32, txfm_size_col);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_avx2 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_avx2 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);  // a NULL table slot means no kernel exists for this size/type combination
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);  // only ud_flip is used below; lr_flip is fetched but never applied in this function
+
+ // 1st stage: row transform (row_txfm is applied to each band of 8 input rows)
+ for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
+ __m256i buf0[32];
+ const int32_t *input_row = input + i * input_stride * 8;
+ for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
+ __m256i *buf0_cur = buf0 + j * 8;
+ load_buffer_32x32(input_row + j * 8, buf0_cur, input_stride, 8);
+
+ transpose_8x8_avx2(&buf0_cur[0], &buf0_cur[0]);
+ }
+
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);  // do_cols = 0; the row pass is given -shift[0] as its out_shift
+
+ __m256i *_buf1 = buf1 + i * 8;
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ transpose_8x8_avx2(&buf0[j * 8], &_buf1[j * txfm_size_row]);  // transpose back while scattering into buf1 at a pitch of txfm_size_row vectors
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);  // do_cols = 1, out_shift = 0: the shift is applied separately just below
+
+ av1_round_shift_array_32_avx2(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 4); i++) {
+ highbd_write_buffer_16xn_avx2(buf1 + i * txfm_size_row * 2,  // each iteration advances 16 output pixels and two txfm_size_row-long runs of buf1
+ output + 16 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+// AVX2 entry point for the generic high-bitdepth 2-D inverse transform + add.
+// Only DCT_DCT is handled here; any other tx_type trips assert(0) and, when
+// asserts are compiled out, returns without touching 'output'.
+void av1_highbd_inv_txfm2d_add_universe_avx2(const int32_t *input,
+                                             uint8_t *output, int stride,
+                                             TX_TYPE tx_type, TX_SIZE tx_size,
+                                             int eob, const int bd) {
+  if (tx_type != DCT_DCT) {
+    assert(0);
+    return;
+  }
+
+  highbd_inv_txfm2d_add_no_identity_avx2(input, CONVERT_TO_SHORTPTR(output),
+                                         stride, tx_type, tx_size, eob, bd);
+}
+
+// 32x32 high-bitdepth inverse transform + add.
+//   DCT_DCT -> AVX2 path (av1_highbd_inv_txfm2d_add_universe_avx2)
+//   IDTX    -> C reference implementation
+//   other   -> assert(0)
+void av1_highbd_inv_txfm_add_32x32_avx2(const tran_low_t *input, uint8_t *dest,
+                                        int stride,
+                                        const TxfmParam *txfm_param) {
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  const int bd = txfm_param->bd;
+  const int32_t *src = cast_to_int32(input);
+
+  if (tx_type == DCT_DCT) {
+    av1_highbd_inv_txfm2d_add_universe_avx2(input, dest, stride, tx_type,
+                                            txfm_param->tx_size,
+                                            txfm_param->eob, bd);
+  } else if (tx_type == IDTX) {
+    // The assembly version does not support IDTX, so fall back to C.
+    av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+                               tx_type, bd);
+  } else {
+    assert(0);
+  }
+}
+
+// Top-level high-bitdepth inverse transform + add dispatcher for the AVX2
+// build. It routes on txfm_param->tx_size; only TX_32X32 reaches an AVX2
+// routine, every other size is forwarded to the routine named in its case.
+void av1_highbd_inv_txfm_add_avx2(const tran_low_t *input, uint8_t *dest,
+                                  int stride, const TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+
+  switch (txfm_param->tx_size) {
+    // Square blocks.
+    case TX_4X4:
+      av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_8X8:
+      av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X16:
+      av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_32X32:
+      av1_highbd_inv_txfm_add_32x32_avx2(input, dest, stride, txfm_param);
+      break;
+
+    // 1:2 and 2:1 rectangles.
+    case TX_4X8:
+      av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+      break;
+    case TX_8X4:
+      av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X16:
+      av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X8:
+      av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+      break;
+    case TX_16X32:
+      av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X16:
+      av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+      break;
+    case TX_32X64:
+      av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+      break;
+    case TX_64X32:
+      av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+      break;
+
+    // 1:4 and 4:1 rectangles with a dedicated routine.
+    case TX_4X16:
+      av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+      break;
+    case TX_16X4:
+      av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+      break;
+    case TX_8X32:
+      av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+      break;
+    case TX_32X8:
+      av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+      break;
+
+    // Sizes that share the generic SSE4.1 routine.
+    case TX_64X64:
+    case TX_16X64:
+    case TX_64X16:
+      av1_highbd_inv_txfm2d_add_universe_sse4_1(
+          input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+          txfm_param->eob, txfm_param->bd);
+      break;
+
+    default: assert(0 && "Invalid transform size"); break;
+  }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000000..e29e0baf50
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,5348 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_inv_txfm1d_cfg.h"
+#include "av1/common/idct.h"
+#include "av1/common/x86/av1_inv_txfm_ssse3.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+
+// Clamps each 16-bit lane of u to [0, (1 << bd) - 1].
+// Lanes are compared as signed 16-bit values, so a lane with its top bit set
+// counts as negative and comes out as 0.
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+  const __m128i ones = _mm_set1_epi16(1);
+  const __m128i pixel_max = _mm_sub_epi16(_mm_slli_epi16(ones, bd), ones);
+
+  // Cap at the maximum pixel value, then floor at zero.
+  const __m128i capped = _mm_min_epi16(u, pixel_max);
+  return _mm_max_epi16(capped, _mm_setzero_si128());
+}
+
+// Reconstructs eight pixels: widens the eight 16-bit values of pred to 32
+// bits, adds res0 to the low four and res1 to the high four, packs back to
+// 16 bits with unsigned saturation and clamps the result for bit depth bd.
+static INLINE __m128i highbd_get_recon_8x8_sse4_1(const __m128i pred,
+                                                  __m128i res0, __m128i res1,
+                                                  const int bd) {
+  const __m128i pred_lo = _mm_cvtepi16_epi32(pred);
+  const __m128i pred_hi = _mm_cvtepi16_epi32(_mm_srli_si128(pred, 8));
+
+  const __m128i sum_lo = _mm_add_epi32(res0, pred_lo);
+  const __m128i sum_hi = _mm_add_epi32(res1, pred_hi);
+
+  return highbd_clamp_epi16(_mm_packus_epi32(sum_lo, sum_hi), bd);
+}
+
+// Adds an 8-pixel-wide, 'height'-row residual to 'output' (row pitch 'stride'
+// in uint16_t units) and stores the clamped result back in place. Output row
+// i takes its two residual vectors from in[j] and in[j + height], where
+// j == i normally and j == height - 1 - i when flipud is set.
+static INLINE void highbd_write_buffer_8xn_sse4_1(__m128i *in, uint16_t *output,
+                                                  int stride, int flipud,
+                                                  int height, const int bd) {
+  for (int i = 0; i < height; ++i) {
+    const int j = flipud ? (height - 1 - i) : i;
+    uint16_t *const dst = output + i * stride;
+
+    const __m128i pred = _mm_loadu_si128((__m128i const *)dst);
+    const __m128i recon =
+        highbd_get_recon_8x8_sse4_1(pred, in[j], in[j + height], bd);
+
+    _mm_storeu_si128((__m128i *)dst, recon);
+  }
+}
+
+// Loads out_size vectors of four int32 values each; vector i is read
+// (unaligned) from in + i * stride.
+static INLINE void load_buffer_32bit_input(const int32_t *in, int stride,
+                                           __m128i *out, int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    const int32_t *const src = in + row * stride;
+    out[row] = _mm_loadu_si128((const __m128i *)src);
+  }
+}
+
+// Loads 16 contiguous int32 coefficients as four vectors in[0..3].
+// Uses aligned loads, so coeff must be 16-byte aligned.
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+  for (int row = 0; row < 4; ++row) {
+    in[row] = _mm_load_si128((const __m128i *)(coeff + 4 * row));
+  }
+}
+
+// Clamped butterfly on 32-bit lanes:
+//   *out0 = clamp(in0 + in1), *out1 = clamp(in0 - in1),
+// with both results limited to [*clamp_lo, *clamp_hi].
+static void addsub_sse4_1(const __m128i in0, const __m128i in1, __m128i *out0,
+                          __m128i *out1, const __m128i *clamp_lo,
+                          const __m128i *clamp_hi) {
+  const __m128i sum = _mm_add_epi32(in0, in1);
+  const __m128i diff = _mm_sub_epi32(in0, in1);
+
+  const __m128i sum_clamped =
+      _mm_min_epi32(_mm_max_epi32(sum, *clamp_lo), *clamp_hi);
+  const __m128i diff_clamped =
+      _mm_min_epi32(_mm_max_epi32(diff, *clamp_lo), *clamp_hi);
+
+  *out0 = sum_clamped;
+  *out1 = diff_clamped;
+}
+
+// Unclamped butterfly on 32-bit lanes: *out0 = in0 + in1, *out1 = in0 - in1.
+static void addsub_no_clamp_sse4_1(const __m128i in0, const __m128i in1,
+                                   __m128i *out0, __m128i *out1) {
+  const __m128i sum = _mm_add_epi32(in0, in1);
+  const __m128i diff = _mm_sub_epi32(in0, in1);
+
+  *out0 = sum;
+  *out1 = diff;
+}
+
+// Butterfly followed by a rounding arithmetic right shift and a clamp:
+//   *out0 = clamp((in0 + in1 + r) >> shift)
+//   *out1 = clamp((in0 - in1 + r) >> shift)
+// where r = (1 << shift) >> 1 and the clamp range is [*clamp_lo, *clamp_hi].
+static void addsub_shift_sse4_1(const __m128i in0, const __m128i in1,
+                                __m128i *out0, __m128i *out1,
+                                const __m128i *clamp_lo,
+                                const __m128i *clamp_hi, int shift) {
+  const __m128i rounding = _mm_set1_epi32((1 << shift) >> 1);
+  const __m128i count = _mm_cvtsi32_si128(shift);
+  // Add the rounding term once; both the sum and the difference inherit it.
+  const __m128i biased = _mm_add_epi32(in0, rounding);
+
+  __m128i sum = _mm_sra_epi32(_mm_add_epi32(biased, in1), count);
+  __m128i diff = _mm_sra_epi32(_mm_sub_epi32(biased, in1), count);
+
+  sum = _mm_min_epi32(_mm_max_epi32(sum, *clamp_lo), *clamp_hi);
+  diff = _mm_min_epi32(_mm_max_epi32(diff, *clamp_lo), *clamp_hi);
+
+  *out0 = sum;
+  *out1 = diff;
+}
+
+static INLINE void idct32_stage4_sse4_1(  // stage 4 of the 32-point IDCT: in-place half_btf_sse4_1 updates of bf1 pairs (17,30), (18,29), (21,26), (22,25)
+ __m128i *bf1, const __m128i *cospim8, const __m128i *cospi56,
+ const __m128i *cospi8, const __m128i *cospim56, const __m128i *cospim40,
+ const __m128i *cospi24, const __m128i *cospi40, const __m128i *cospim24,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim8, &bf1[17], cospi56, &bf1[30], rounding, bit);  // hold the new bf1[17] aside: the next line still reads the old bf1[17]
+ bf1[30] = half_btf_sse4_1(cospi56, &bf1[17], cospi8, &bf1[30], rounding, bit);
+ bf1[17] = temp1;
+
+ temp2 = half_btf_sse4_1(cospim56, &bf1[18], cospim8, &bf1[29], rounding, bit);  // same deferred-store pattern for pair (18,29)
+ bf1[29] =
+ half_btf_sse4_1(cospim8, &bf1[18], cospi56, &bf1[29], rounding, bit);
+ bf1[18] = temp2;
+
+ temp1 = half_btf_sse4_1(cospim40, &bf1[21], cospi24, &bf1[26], rounding, bit);  // pair (21,26)
+ bf1[26] =
+ half_btf_sse4_1(cospi24, &bf1[21], cospi40, &bf1[26], rounding, bit);
+ bf1[21] = temp1;
+
+ temp2 =  // pair (22,25)
+ half_btf_sse4_1(cospim24, &bf1[22], cospim40, &bf1[25], rounding, bit);
+ bf1[25] =
+ half_btf_sse4_1(cospim40, &bf1[22], cospi24, &bf1[25], rounding, bit);
+ bf1[22] = temp2;
+}
+
+static INLINE void idct32_stage5_sse4_1(  // stage 5 of the 32-point IDCT, in place: half_btf on pairs (9,14), (10,13), then clamped add/sub butterflies on bf1[16..31]
+ __m128i *bf1, const __m128i *cospim16, const __m128i *cospi48,
+ const __m128i *cospi16, const __m128i *cospim48, const __m128i *clamp_lo,
+ const __m128i *clamp_hi, const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim16, &bf1[9], cospi48, &bf1[14], rounding, bit);  // new bf1[9] is deferred so the next line still sees the old bf1[9]
+ bf1[14] = half_btf_sse4_1(cospi48, &bf1[9], cospi16, &bf1[14], rounding, bit);
+ bf1[9] = temp1;
+
+ temp2 =  // pair (10,13)
+ half_btf_sse4_1(cospim48, &bf1[10], cospim16, &bf1[13], rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(cospim16, &bf1[10], cospi48, &bf1[13], rounding, bit);
+ bf1[10] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[19], bf1 + 16, bf1 + 19, clamp_lo, clamp_hi);  // inputs are passed by value, so writing back to the same slots is safe
+ addsub_sse4_1(bf1[17], bf1[18], bf1 + 17, bf1 + 18, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[20], bf1 + 23, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[22], bf1[21], bf1 + 22, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[27], bf1 + 24, bf1 + 27, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[25], bf1[26], bf1 + 25, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[28], bf1 + 31, bf1 + 28, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[29], bf1 + 30, bf1 + 29, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage6_sse4_1(  // stage 6 of the 32-point IDCT, in place: half_btf on (5,6), clamped butterflies on bf1[8..15], half_btf on (18,29), (19,28), (20,27), (21,26)
+ __m128i *bf1, const __m128i *cospim32, const __m128i *cospi32,
+ const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+ const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[5], cospi32, &bf1[6], rounding, bit);  // new bf1[5] is deferred so the next line still sees the old bf1[5]
+ bf1[6] = half_btf_sse4_1(cospi32, &bf1[5], cospi32, &bf1[6], rounding, bit);
+ bf1[5] = temp1;
+
+ addsub_sse4_1(bf1[8], bf1[11], bf1 + 8, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf1 + 9, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf1 + 15, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf1 + 14, bf1 + 13, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim16, &bf1[18], cospi48, &bf1[29], rounding, bit);  // pair (18,29)
+ bf1[29] =
+ half_btf_sse4_1(cospi48, &bf1[18], cospi16, &bf1[29], rounding, bit);
+ bf1[18] = temp1;
+ temp2 = half_btf_sse4_1(cospim16, &bf1[19], cospi48, &bf1[28], rounding, bit);  // pair (19,28)
+ bf1[28] =
+ half_btf_sse4_1(cospi48, &bf1[19], cospi16, &bf1[28], rounding, bit);
+ bf1[19] = temp2;
+ temp1 =  // pair (20,27)
+ half_btf_sse4_1(cospim48, &bf1[20], cospim16, &bf1[27], rounding, bit);
+ bf1[27] =
+ half_btf_sse4_1(cospim16, &bf1[20], cospi48, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 =  // pair (21,26)
+ half_btf_sse4_1(cospim48, &bf1[21], cospim16, &bf1[26], rounding, bit);
+ bf1[26] =
+ half_btf_sse4_1(cospim16, &bf1[21], cospi48, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+}
+
+static INLINE void idct32_stage7_sse4_1(__m128i *bf1, const __m128i *cospim32,  // stage 7 of the 32-point IDCT, in place: butterflies on bf1[0..7], half_btf on (10,13), (11,12), butterflies on bf1[16..31]
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[7], bf1 + 0, bf1 + 7, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[6], bf1 + 1, bf1 + 6, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[5], bf1 + 2, bf1 + 5, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[4], bf1 + 3, bf1 + 4, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[10], cospi32, &bf1[13], rounding, bit);  // new bf1[10] is deferred so the next line still sees the old bf1[10]
+ bf1[13] =
+ half_btf_sse4_1(cospi32, &bf1[10], cospi32, &bf1[13], rounding, bit);
+ bf1[10] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[11], cospi32, &bf1[12], rounding, bit);  // pair (11,12)
+ bf1[12] =
+ half_btf_sse4_1(cospi32, &bf1[11], cospi32, &bf1[12], rounding, bit);
+ bf1[11] = temp2;
+
+ addsub_sse4_1(bf1[16], bf1[23], bf1 + 16, bf1 + 23, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[17], bf1[22], bf1 + 17, bf1 + 22, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[18], bf1[21], bf1 + 18, bf1 + 21, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[19], bf1[20], bf1 + 19, bf1 + 20, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[24], bf1 + 31, bf1 + 24, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[30], bf1[25], bf1 + 30, bf1 + 25, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[29], bf1[26], bf1 + 29, bf1 + 26, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[27], bf1 + 28, bf1 + 27, clamp_lo, clamp_hi);
+}
+
+static INLINE void idct32_stage8_sse4_1(__m128i *bf1, const __m128i *cospim32,  // stage 8 of the 32-point IDCT, in place: butterflies pairing bf1[k] with bf1[15-k], then half_btf on (20,27), (21,26), (22,25), (23,24)
+ const __m128i *cospi32,
+ const __m128i *clamp_lo,
+ const __m128i *clamp_hi,
+ const __m128i *rounding, int bit) {
+ __m128i temp1, temp2;
+ addsub_sse4_1(bf1[0], bf1[15], bf1 + 0, bf1 + 15, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf1 + 1, bf1 + 14, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf1 + 2, bf1 + 13, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf1 + 3, bf1 + 12, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf1 + 4, bf1 + 11, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf1 + 5, bf1 + 10, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf1 + 6, bf1 + 9, clamp_lo, clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf1 + 7, bf1 + 8, clamp_lo, clamp_hi);
+
+ temp1 = half_btf_sse4_1(cospim32, &bf1[20], cospi32, &bf1[27], rounding, bit);  // new bf1[20] is deferred so the next line still sees the old bf1[20]
+ bf1[27] =
+ half_btf_sse4_1(cospi32, &bf1[20], cospi32, &bf1[27], rounding, bit);
+ bf1[20] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[21], cospi32, &bf1[26], rounding, bit);  // pair (21,26)
+ bf1[26] =
+ half_btf_sse4_1(cospi32, &bf1[21], cospi32, &bf1[26], rounding, bit);
+ bf1[21] = temp2;
+ temp1 = half_btf_sse4_1(cospim32, &bf1[22], cospi32, &bf1[25], rounding, bit);  // pair (22,25)
+ bf1[25] =
+ half_btf_sse4_1(cospi32, &bf1[22], cospi32, &bf1[25], rounding, bit);
+ bf1[22] = temp1;
+ temp2 = half_btf_sse4_1(cospim32, &bf1[23], cospi32, &bf1[24], rounding, bit);  // pair (23,24)
+ bf1[24] =
+ half_btf_sse4_1(cospi32, &bf1[23], cospi32, &bf1[24], rounding, bit);
+ bf1[23] = temp2;
+}
+
+static INLINE void idct32_stage9_sse4_1(__m128i *bf1, __m128i *out,  // final stage of the 32-point IDCT: out[k] = bf1[k] + bf1[31-k], out[31-k] = bf1[k] - bf1[31-k] for k = 0..15
+ const int do_cols, const int bd,
+ const int out_shift,
+ const int log_range) {
+ if (do_cols) {  // column pass: plain add/sub, no clamp and no shift
+ addsub_no_clamp_sse4_1(bf1[0], bf1[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf1[1], bf1[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf1[2], bf1[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf1[3], bf1[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf1[4], bf1[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf1[5], bf1[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf1[6], bf1[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf1[7], bf1[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf1[8], bf1[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf1[9], bf1[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf1[10], bf1[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf1[11], bf1[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf1[12], bf1[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf1[13], bf1[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf1[14], bf1[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf1[15], bf1[16], out + 15, out + 16);
+ } else {  // row pass: rounding shift by out_shift, then clamp
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(  // lower bound: the tighter of the log_range_out limit and the log_range limit scaled down by out_shift
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(  // upper bound: note the second operand has no "- 1", unlike the first
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf1[0], bf1[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[1], bf1[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[2], bf1[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[3], bf1[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[4], bf1[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[5], bf1[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[6], bf1[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[7], bf1[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[8], bf1[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[9], bf1[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[10], bf1[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[11], bf1[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[12], bf1[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[13], bf1[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[14], bf1[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf1[15], bf1[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+// Rounding right shift with negation of the second input, then a clamp:
+//   *out0 = clamp((r + in0) >> shift)
+//   *out1 = clamp((r - in1) >> shift)
+// where r = (1 << shift) >> 1 and the clamp range is [*clamp_lo, *clamp_hi].
+static void neg_shift_sse4_1(const __m128i in0, const __m128i in1,
+                             __m128i *out0, __m128i *out1,
+                             const __m128i *clamp_lo, const __m128i *clamp_hi,
+                             int shift) {
+  const __m128i rounding = _mm_set1_epi32((1 << shift) >> 1);
+  const __m128i count = _mm_cvtsi32_si128(shift);
+
+  __m128i pos = _mm_sra_epi32(_mm_add_epi32(rounding, in0), count);
+  __m128i neg = _mm_sra_epi32(_mm_sub_epi32(rounding, in1), count);
+
+  pos = _mm_min_epi32(_mm_max_epi32(pos, *clamp_lo), *clamp_hi);
+  neg = _mm_min_epi32(_mm_max_epi32(neg, *clamp_lo), *clamp_hi);
+
+  *out0 = pos;
+  *out1 = neg;
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {  // 4-point inverse DCT over the 4x4 block in in[0..3], computed in place; the block is transposed first
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding term for the >> bit below; requires bit >= 1
+
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3, x, y;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);  // 32-bit then 64-bit unpacks transpose the 4x4 block into u0..u3
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ u0 = _mm_unpacklo_epi64(v0, v2);
+ u1 = _mm_unpackhi_epi64(v0, v2);
+ u2 = _mm_unpacklo_epi64(v1, v3);
+ u3 = _mm_unpackhi_epi64(v1, v3);
+
+ x = _mm_mullo_epi32(u0, cospi32);  // v0 = round((u0 + u2) * cospi32), v1 = round((u0 - u2) * cospi32)
+ y = _mm_mullo_epi32(u2, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u1, cospi48);  // v2 = round(u1 * cospi48 - u3 * cospi16)
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u1, cospi16);  // v3 = round(u1 * cospi16 + u3 * cospi48)
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ if (do_cols) {  // column pass: final butterflies are left unclamped
+ addsub_no_clamp_sse4_1(v0, v3, in + 0, in + 3);
+ addsub_no_clamp_sse4_1(v1, v2, in + 1, in + 2);
+ } else {  // row pass: clamp results to a signed log_range-bit range
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ addsub_sse4_1(v0, v3, in + 0, in + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, in + 1, in + 2, &clamp_lo, &clamp_hi);
+ }
+}
+
+static void iadst4x4_sse4_1(__m128i *in, int bit, int do_cols, int bd) {  // 4-point inverse ADST over the 4x4 block in in[0..3], computed in place; the block is transposed first
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding term for the >> bit below; requires bit >= 1
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ v0 = _mm_unpacklo_epi32(in[0], in[1]);  // 32-bit then 64-bit unpacks transpose the 4x4 block into x0..x3
+ v1 = _mm_unpackhi_epi32(in[0], in[1]);
+ v2 = _mm_unpacklo_epi32(in[2], in[3]);
+ v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ x0 = _mm_unpacklo_epi64(v0, v2);
+ x1 = _mm_unpackhi_epi64(v0, v2);
+ x2 = _mm_unpacklo_epi64(v1, v3);
+ x3 = _mm_unpackhi_epi64(v1, v3);
+
+ s0 = _mm_mullo_epi32(x0, sinpi1);  // products are kept in 32 bits (mullo), so wider results wrap
+ s1 = _mm_mullo_epi32(x0, sinpi2);
+ s2 = _mm_mullo_epi32(x1, sinpi3);
+ s3 = _mm_mullo_epi32(x2, sinpi4);
+ s4 = _mm_mullo_epi32(x2, sinpi1);
+ s5 = _mm_mullo_epi32(x3, sinpi2);
+ s6 = _mm_mullo_epi32(x3, sinpi4);
+ t = _mm_sub_epi32(x0, x2);
+ s7 = _mm_add_epi32(t, x3);  // s7 = x0 - x2 + x3
+
+ t = _mm_add_epi32(s0, s3);
+ s0 = _mm_add_epi32(t, s5);  // s0 = x0*sinpi1 + x2*sinpi4 + x3*sinpi2
+ t = _mm_sub_epi32(s1, s4);
+ s1 = _mm_sub_epi32(t, s6);  // s1 = x0*sinpi2 - x2*sinpi1 - x3*sinpi4
+ s3 = s2;  // s3 takes x1*sinpi3 before s2 is overwritten on the next line
+ s2 = _mm_mullo_epi32(s7, sinpi3);
+
+ u0 = _mm_add_epi32(s0, s3);
+ u1 = _mm_add_epi32(s1, s3);
+ u2 = s2;
+ t = _mm_add_epi32(s0, s1);
+ u3 = _mm_sub_epi32(t, s3);  // u3 = s0 + s1 - s3
+
+ u0 = _mm_add_epi32(u0, rnding);  // round and shift each output row by 'bit'
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(u1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(u2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(u3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ if (!do_cols) {  // row pass only: clamp results to a signed log_range-bit range
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ u0 = _mm_max_epi32(u0, clamp_lo);
+ u0 = _mm_min_epi32(u0, clamp_hi);
+ u1 = _mm_max_epi32(u1, clamp_lo);
+ u1 = _mm_min_epi32(u1, clamp_hi);
+ u2 = _mm_max_epi32(u2, clamp_lo);
+ u2 = _mm_min_epi32(u2, clamp_hi);
+ u3 = _mm_max_epi32(u3, clamp_lo);
+ u3 = _mm_min_epi32(u3, clamp_hi);
+ }
+
+ in[0] = u0;
+ in[1] = u1;
+ in[2] = u2;
+ in[3] = u3;
+}
+
+// Rounds and arithmetically right-shifts the four vectors in[0..3] in place:
+//   in[i] = (in[i] + ((1 << shift) >> 1)) >> shift   for each 32-bit lane.
+// The rounding term is built as (1 << shift) >> 1, the same form
+// addsub_shift_sse4_1 uses, so shift == 0 is well defined (no rounding, no
+// shift) instead of evaluating 1 << -1. Results for shift >= 1 are unchanged.
+// shift must be non-negative.
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+  const __m128i rnding = _mm_set1_epi32((1 << shift) >> 1);
+
+  for (int i = 0; i < 4; ++i) {
+    in[i] = _mm_srai_epi32(_mm_add_epi32(in[i], rnding), shift);
+  }
+}
+
+// Adds a 4x4 block of 32-bit residuals to the 16-bit samples already stored
+// in `output` and writes the result back.
+//
+//   in      four vectors, one per residual row; modified in place (rounded
+//           and shifted by `shift`, and lane-reversed when fliplr is set)
+//   output  top-left sample of the destination; rows are `stride` apart
+//   fliplr  non-zero: reverse the four lanes of every residual row
+//   flipud  non-zero: residual row (3 - r) is added to destination row r
+//   bd      forwarded to highbd_clamp_epi16 (defined elsewhere in the file)
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i pred[4], sum[4];
+  __m128i rows01, rows23;
+  int r;
+
+  round_shift_4x4(in, shift);
+
+  // Fetch four 16-bit samples per row and zero-extend them to 32 bits.
+  for (r = 0; r < 4; ++r) {
+    const __m128i px = _mm_loadl_epi64((__m128i const *)(output + r * stride));
+    pred[r] = _mm_unpacklo_epi16(px, zero);
+  }
+
+  if (fliplr) {
+    // 0x1B selects lanes 3,2,1,0, i.e. reverses the row.
+    for (r = 0; r < 4; ++r) in[r] = _mm_shuffle_epi32(in[r], 0x1B);
+  }
+
+  for (r = 0; r < 4; ++r) {
+    sum[r] = _mm_add_epi32(in[flipud ? 3 - r : r], pred[r]);
+  }
+
+  // Pack two rows per vector with unsigned saturation, then clamp.
+  rows01 = highbd_clamp_epi16(_mm_packus_epi32(sum[0], sum[1]), bd);
+  rows23 = highbd_clamp_epi16(_mm_packus_epi32(sum[2], sum[3]), bd);
+
+  // The low 64 bits hold the even row, the high 64 bits the odd row.
+  _mm_storel_epi64((__m128i *)(output + 0 * stride), rows01);
+  _mm_storel_epi64((__m128i *)(output + 1 * stride),
+                   _mm_unpackhi_epi64(rows01, rows01));
+  _mm_storel_epi64((__m128i *)(output + 2 * stride), rows23);
+  _mm_storel_epi64((__m128i *)(output + 3 * stride),
+                   _mm_unpackhi_epi64(rows23, rows23));
+}
+
+// 4x4 inverse 2D transform followed by reconstruction into `output`.
+//
+// The transform type only decides three things: which 1D kernel runs in the
+// first (row, do_cols = 0) pass, which runs in the second (column,
+// do_cols = 1) pass, and whether the result is mirrored when it is added to
+// the destination. FLIPADST variants run the ADST kernel and request the
+// mirroring from write_buffer_4x4. Transform types not listed below hit
+// assert(0) and, when asserts are compiled out, leave `output` untouched.
+void av1_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  __m128i buf[4];
+  const int8_t *shift = inv_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  int row_is_adst = 0;
+  int col_is_adst = 0;
+  int fliplr = 0;
+  int flipud = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case ADST_DCT: col_is_adst = 1; break;
+    case DCT_ADST: row_is_adst = 1; break;
+    case ADST_ADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      col_is_adst = 1;
+      flipud = 1;
+      break;
+    case DCT_FLIPADST:
+      row_is_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      fliplr = 1;
+      flipud = 1;
+      break;
+    case ADST_FLIPADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_ADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      flipud = 1;
+      break;
+    default: assert(0); return;
+  }
+
+  load_buffer_4x4(coeff, buf);
+
+  // First pass (rows).
+  if (row_is_adst) {
+    iadst4x4_sse4_1(buf, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+  } else {
+    idct4x4_sse4_1(buf, inv_cos_bit_row[txw_idx][txh_idx], 0, bd);
+  }
+
+  // Second pass (columns).
+  if (col_is_adst) {
+    iadst4x4_sse4_1(buf, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+  } else {
+    idct4x4_sse4_1(buf, inv_cos_bit_col[txw_idx][txh_idx], 1, bd);
+  }
+
+  write_buffer_4x4(buf, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// 8x8
+// Loads 64 consecutive 32-bit coefficients into in[0..15], four per vector.
+// Uses the aligned load intrinsic, so `coeff` must be 16-byte aligned.
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    in[i] = _mm_load_si128((const __m128i *)(coeff + 4 * i));
+  }
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) { /*
+ * One 8-point inverse-DCT pass over an 8x8 block held in 16 vectors.
+ *
+ * in, out   16 vectors each, addressed as [k * 2 + col] with k = 0..7 and
+ *           col = 0..1; the loop below runs the same network once per col.
+ * bit       selects the cosine table (cospi_arr(bit)); every multiply result
+ *           has 1 << (bit - 1) added and is shifted right by `bit`.
+ * do_cols   non-zero: intermediate clamp range is max(16, bd + 6) bits and
+ *           stage 5 uses addsub_no_clamp_sse4_1.
+ *           zero: intermediate clamp range is max(16, bd + 8) bits and
+ *           stage 5 uses addsub_shift_sse4_1 with out_shift and the
+ *           clamp_*_out bounds computed there.
+ * bd        only used to derive the clamp ranges.
+ * out_shift only read when do_cols is zero.
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // rounding term for >> bit
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column constructs one row (8 coeffs)
+ // total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1
+ // stage 2: inputs 0, 4, 2, 6 pass through; inputs 1/7 and 5/3 are rotated
+ u0 = in[0 * 2 + col];
+ u1 = in[4 * 2 + col];
+ u2 = in[2 * 2 + col];
+ u3 = in[6 * 2 + col];
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+ y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+ y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3: rotate the 0/4 and 2/6 pairs, clamped add/sub on the odd part
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4: clamped add/sub on the even part; v5/v6 are combined with cospi32
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5: final add/sub pairs (k, 7 - k) written to out[k * 2 + col]
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); // NOTE(review): second bound has no "- 1", unlike the first - confirm intended
+ addsub_shift_sse4_1(u0, u7, out + 0 * 2 + col, out + 7 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1 * 2 + col, out + 6 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2 * 2 + col, out + 5 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3 * 2 + col, out + 4 * 2 + col,
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+// Returns ((*a) * (*ca) + (*b) * (*cb) + *rnding) >> bit for every 32-bit
+// lane. Products keep their low 32 bits; the shift is arithmetic. Vector
+// operands are taken by pointer rather than by value.
+static INLINE __m128i iadst8x8_mul_add_sse4_1(const __m128i *a,
+                                              const __m128i *ca,
+                                              const __m128i *b,
+                                              const __m128i *cb,
+                                              const __m128i *rnding, int bit) {
+  __m128i acc =
+      _mm_add_epi32(_mm_mullo_epi32(*a, *ca), _mm_mullo_epi32(*b, *cb));
+  acc = _mm_add_epi32(acc, *rnding);
+  return _mm_srai_epi32(acc, bit);
+}
+
+// Returns ((*a) * (*ca) - (*b) * (*cb) + *rnding) >> bit for every 32-bit
+// lane; otherwise identical to iadst8x8_mul_add_sse4_1.
+static INLINE __m128i iadst8x8_mul_sub_sse4_1(const __m128i *a,
+                                              const __m128i *ca,
+                                              const __m128i *b,
+                                              const __m128i *cb,
+                                              const __m128i *rnding, int bit) {
+  __m128i acc =
+      _mm_sub_epi32(_mm_mullo_epi32(*a, *ca), _mm_mullo_epi32(*b, *cb));
+  acc = _mm_add_epi32(acc, *rnding);
+  return _mm_srai_epi32(acc, bit);
+}
+
+// One 8-point inverse-ADST pass over an 8x8 block held in 16 vectors.
+//
+//   in, out    16 vectors each, addressed as [k * 2 + col] with k = 0..7 and
+//              col = 0..1. The even-indexed vectors (col = 0) are processed
+//              and stored first, then the odd-indexed ones (col = 1).
+//   bit        selects the cosine table (cospi_arr(bit)); every multiply
+//              result has 1 << (bit - 1) added and is shifted right by `bit`.
+//   do_cols    non-zero: intermediate clamp range is max(16, bd + 6) bits and
+//              the result is stored with alternating sign only.
+//              zero: intermediate clamp range is max(16, bd + 8) bits and the
+//              result goes through neg_shift_sse4_1 with out_shift and a
+//              max(16, bd + 6)-bit clamp range.
+//   bd         only used to derive the clamp ranges.
+//   out_shift  only read when do_cols is zero.
+//
+// The even and odd halves run the same butterfly network and differ only in
+// the +1 vector index, so the network is written once and looped over col.
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                            int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  __m128i u[8], v[8];
+  int col;
+
+  // col = 0: even vectors 0, 2, ..., 14; col = 1: odd vectors 1, 3, ..., 15.
+  for (col = 0; col < 2; ++col) {
+    const __m128i *src = in + col;
+    __m128i *dst = out + col;
+
+    // stage 0
+    // stage 1
+    // stage 2
+    // (1)
+    u[0] = iadst8x8_mul_add_sse4_1(&src[14], &cospi4, &src[0], &cospi60,
+                                   &rnding, bit);
+    u[1] = iadst8x8_mul_sub_sse4_1(&src[14], &cospi60, &src[0], &cospi4,
+                                   &rnding, bit);
+    // (2)
+    u[2] = iadst8x8_mul_add_sse4_1(&src[10], &cospi20, &src[4], &cospi44,
+                                   &rnding, bit);
+    u[3] = iadst8x8_mul_sub_sse4_1(&src[10], &cospi44, &src[4], &cospi20,
+                                   &rnding, bit);
+    // (3)
+    u[4] = iadst8x8_mul_add_sse4_1(&src[6], &cospi36, &src[8], &cospi28,
+                                   &rnding, bit);
+    u[5] = iadst8x8_mul_sub_sse4_1(&src[6], &cospi28, &src[8], &cospi36,
+                                   &rnding, bit);
+    // (4)
+    u[6] = iadst8x8_mul_add_sse4_1(&src[2], &cospi52, &src[12], &cospi12,
+                                   &rnding, bit);
+    u[7] = iadst8x8_mul_sub_sse4_1(&src[2], &cospi12, &src[12], &cospi52,
+                                   &rnding, bit);
+
+    // stage 3
+    addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+    // stage 4
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = iadst8x8_mul_add_sse4_1(&v[4], &cospi16, &v[5], &cospi48, &rnding,
+                                   bit);
+    u[5] = iadst8x8_mul_sub_sse4_1(&v[4], &cospi48, &v[5], &cospi16, &rnding,
+                                   bit);
+    u[6] = iadst8x8_mul_add_sse4_1(&v[6], &cospim48, &v[7], &cospi16, &rnding,
+                                   bit);
+    u[7] = iadst8x8_mul_sub_sse4_1(&v[6], &cospi16, &v[7], &cospim48, &rnding,
+                                   bit);
+
+    // stage 5
+    addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+    addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+    // stage 6
+    u[0] = v[0];
+    u[1] = v[1];
+    u[4] = v[4];
+    u[5] = v[5];
+    u[2] = iadst8x8_mul_add_sse4_1(&v[2], &cospi32, &v[3], &cospi32, &rnding,
+                                   bit);
+    u[3] = iadst8x8_mul_sub_sse4_1(&v[2], &cospi32, &v[3], &cospi32, &rnding,
+                                   bit);
+    u[6] = iadst8x8_mul_add_sse4_1(&v[6], &cospi32, &v[7], &cospi32, &rnding,
+                                   bit);
+    u[7] = iadst8x8_mul_sub_sse4_1(&v[6], &cospi32, &v[7], &cospi32, &rnding,
+                                   bit);
+
+    // stage 7: permute the outputs and negate every second one
+    if (do_cols) {
+      dst[0] = u[0];
+      dst[2] = _mm_sub_epi32(kZero, u[4]);
+      dst[4] = u[6];
+      dst[6] = _mm_sub_epi32(kZero, u[2]);
+      dst[8] = u[3];
+      dst[10] = _mm_sub_epi32(kZero, u[7]);
+      dst[12] = u[5];
+      dst[14] = _mm_sub_epi32(kZero, u[1]);
+    } else {
+      const int log_range_out = AOMMAX(16, bd + 6);
+      const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+      const __m128i clamp_hi_out =
+          _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+      neg_shift_sse4_1(u[0], u[4], dst + 0, dst + 2, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[6], u[2], dst + 4, dst + 6, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[3], u[7], dst + 8, dst + 10, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+      neg_shift_sse4_1(u[5], u[1], dst + 12, dst + 14, &clamp_lo_out,
+                       &clamp_hi_out, out_shift);
+    }
+  }
+}
+
+// Applies round_shift_4x4 to all 16 vectors of an 8x8 block, four at a time.
+static void round_shift_8x8(__m128i *in, int shift) {
+  int i;
+
+  for (i = 0; i < 16; i += 4) {
+    round_shift_4x4(in + i, shift);
+  }
+}
+
+// Reconstructs one row of eight 16-bit samples.
+//
+//   pred            eight 16-bit prediction samples
+//   res_lo, res_hi  32-bit residuals for samples 0..3 and 4..7
+//   fliplr          non-zero: the residual row is mirrored before the add
+//                   (halves swapped and the lanes of each half reversed)
+//   bd              forwarded to highbd_clamp_epi16 (defined elsewhere)
+//
+// Returns the sums packed to 16 bits with unsigned saturation and then
+// passed through highbd_clamp_epi16.
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo, __m128i res_hi,
+                             int fliplr, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i pred_lo = _mm_unpacklo_epi16(pred, zero);
+  const __m128i pred_hi = _mm_unpackhi_epi16(pred, zero);
+  __m128i left = res_lo;
+  __m128i right = res_hi;
+
+  if (fliplr) {
+    // 0x1B selects lanes 3,2,1,0, i.e. reverses each half.
+    left = _mm_shuffle_epi32(res_hi, 0x1B);
+    right = _mm_shuffle_epi32(res_lo, 0x1B);
+  }
+
+  return highbd_clamp_epi16(
+      _mm_packus_epi32(_mm_add_epi32(left, pred_lo),
+                       _mm_add_epi32(right, pred_hi)),
+      bd);
+}
+
+// Adds an 8x8 block of 32-bit residuals to the 16-bit samples already stored
+// in `output` and writes the result back.
+//
+//   in      16 vectors, two per residual row (in[2 * r], in[2 * r + 1]);
+//           rounded and shifted in place by `shift`
+//   output  top-left sample of the destination; rows are `stride` apart and
+//           accessed with aligned 128-bit loads and stores
+//   fliplr  forwarded to get_recon_8x8 to mirror each row
+//   flipud  non-zero: residual row (7 - r) is added to destination row r
+//   bd      forwarded to get_recon_8x8
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  __m128i pred[8], recon[8];
+  int r;
+
+  round_shift_8x8(in, shift);
+
+  // All rows are read before any row is written.
+  for (r = 0; r < 8; ++r) {
+    pred[r] = _mm_load_si128((__m128i const *)(output + r * stride));
+  }
+
+  for (r = 0; r < 8; ++r) {
+    const int src = flipud ? 7 - r : r;
+    recon[r] = get_recon_8x8(pred[r], in[2 * src], in[2 * src + 1], fliplr, bd);
+  }
+
+  for (r = 0; r < 8; ++r) {
+    _mm_store_si128((__m128i *)(output + r * stride), recon[r]);
+  }
+}
+
+// 8x8 inverse 2D transform followed by reconstruction into `output`.
+//
+// Every supported transform type runs the same pipeline: load, transpose,
+// first (row, do_cols = 0) pass with out_shift = -shift[0], transpose, second
+// (column, do_cols = 1) pass with out_shift = 0, then write_buffer_8x8 with
+// -shift[1]. The type only selects the 1D kernel for each pass and the
+// mirroring done at write-out; FLIPADST variants run the ADST kernel.
+// Transform types not listed below hit assert(0) and, when asserts are
+// compiled out, leave `output` untouched.
+void av1_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  __m128i buf0[16], buf1[16];
+  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  int row_is_adst = 0;
+  int col_is_adst = 0;
+  int fliplr = 0;
+  int flipud = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case DCT_ADST: row_is_adst = 1; break;
+    case ADST_DCT: col_is_adst = 1; break;
+    case ADST_ADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      col_is_adst = 1;
+      flipud = 1;
+      break;
+    case DCT_FLIPADST:
+      row_is_adst = 1;
+      fliplr = 1;
+      break;
+    case ADST_FLIPADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      fliplr = 1;
+      flipud = 1;
+      break;
+    case FLIPADST_ADST:
+      row_is_adst = 1;
+      col_is_adst = 1;
+      flipud = 1;
+      break;
+    default: assert(0); return;
+  }
+
+  load_buffer_8x8(coeff, buf0);
+
+  // First pass (rows): buf0 -> transpose -> buf1 -> kernel -> buf0.
+  transpose_8x8(buf0, buf1);
+  if (row_is_adst) {
+    iadst8x8_sse4_1(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                    -shift[0]);
+  } else {
+    idct8x8_sse4_1(buf1, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd,
+                   -shift[0]);
+  }
+
+  // Second pass (columns): same buffer ping-pong, no output shift.
+  transpose_8x8(buf0, buf1);
+  if (col_is_adst) {
+    iadst8x8_sse4_1(buf1, buf0, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  } else {
+    idct8x8_sse4_1(buf1, buf0, inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+  }
+
+  write_buffer_8x8(buf0, output, stride, fliplr, flipud, -shift[1], bd);
+}
+
+// Inverse-DCT shortcut that reads only in[0]: the value
+// (in[0] * cospi[32] + rounding) >> bit is broadcast to out[0..7].
+//
+// When do_cols is zero the value is additionally rounded and shifted right by
+// out_shift and clamped to the row-pass output range before being stored.
+// bd is only used to derive that clamp range.
+static void idct8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                                int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i c32 = _mm_set1_epi32(cospi[32]);
+  const __m128i round_bit = _mm_set1_epi32(1 << (bit - 1));
+  __m128i dc;
+  int i;
+
+  // stage 0
+  // stage 1
+  // stage 2
+  // stage 3
+  dc = _mm_mullo_epi32(in[0], c32);
+  dc = _mm_srai_epi32(_mm_add_epi32(dc, round_bit), bit);
+
+  // stage 4
+  // stage 5
+  if (!do_cols) {
+    // do_cols is zero on this path, so the intermediate range is bd + 8 bits.
+    const int log_range = AOMMAX(16, bd + 8);
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i lo = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i hi = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    const __m128i round_out = _mm_set1_epi32((1 << out_shift) >> 1);
+
+    dc = _mm_add_epi32(dc, round_out);
+    dc = _mm_sra_epi32(dc, _mm_cvtsi32_si128(out_shift));
+    dc = _mm_min_epi32(_mm_max_epi32(dc, lo), hi);
+  }
+
+  for (i = 0; i < 8; ++i) {
+    out[i] = dc;
+  }
+}
+
+static void idct8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) { /*
+ * One 8-point inverse-DCT pass over eight vectors: in[0..7] -> out[0..7].
+ * The butterfly network matches the loop body of idct8x8_sse4_1, but the
+ * vectors are addressed directly as [k] rather than [k * 2 + col].
+ *
+ * bit       selects the cosine table (cospi_arr(bit)); every multiply result
+ *           has 1 << (bit - 1) added and is shifted right by `bit`.
+ * do_cols   non-zero: intermediate clamp range is max(16, bd + 6) bits and
+ *           stage 5 uses addsub_no_clamp_sse4_1.
+ *           zero: intermediate clamp range is max(16, bd + 8) bits and
+ *           stage 5 uses addsub_shift_sse4_1 with out_shift and the
+ *           clamp_*_out bounds computed there.
+ * bd        only used to derive the clamp ranges.
+ * out_shift only read when do_cols is zero.
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // rounding term for >> bit
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+
+ // stage 0
+ // stage 1
+ // stage 2: inputs 0, 4, 2, 6 pass through; inputs 1/7 and 5/3 are rotated
+ u0 = in[0];
+ u1 = in[4];
+ u2 = in[2];
+ u3 = in[6];
+
+ x = _mm_mullo_epi32(in[1], cospi56);
+ y = _mm_mullo_epi32(in[7], cospim8);
+ u4 = _mm_add_epi32(x, y);
+ u4 = _mm_add_epi32(u4, rnding);
+ u4 = _mm_srai_epi32(u4, bit);
+
+ x = _mm_mullo_epi32(in[1], cospi8);
+ y = _mm_mullo_epi32(in[7], cospi56);
+ u7 = _mm_add_epi32(x, y);
+ u7 = _mm_add_epi32(u7, rnding);
+ u7 = _mm_srai_epi32(u7, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi24);
+ y = _mm_mullo_epi32(in[3], cospim40);
+ u5 = _mm_add_epi32(x, y);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ x = _mm_mullo_epi32(in[5], cospi40);
+ y = _mm_mullo_epi32(in[3], cospi24);
+ u6 = _mm_add_epi32(x, y);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ // stage 3: rotate the 0/4 and 2/6 pairs, clamped add/sub on the odd part
+ x = _mm_mullo_epi32(u0, cospi32);
+ y = _mm_mullo_epi32(u1, cospi32);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ v1 = _mm_sub_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi48);
+ y = _mm_mullo_epi32(u3, cospim16);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi16);
+ y = _mm_mullo_epi32(u3, cospi48);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ addsub_sse4_1(u4, u5, &v4, &v5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u7, u6, &v7, &v6, &clamp_lo, &clamp_hi);
+
+ // stage 4: clamped add/sub on the even part; v5/v6 are combined with cospi32
+ addsub_sse4_1(v0, v3, &u0, &u3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v1, v2, &u1, &u2, &clamp_lo, &clamp_hi);
+ u4 = v4;
+ u7 = v7;
+
+ x = _mm_mullo_epi32(v5, cospi32);
+ y = _mm_mullo_epi32(v6, cospi32);
+ u6 = _mm_add_epi32(y, x);
+ u6 = _mm_add_epi32(u6, rnding);
+ u6 = _mm_srai_epi32(u6, bit);
+
+ u5 = _mm_sub_epi32(y, x);
+ u5 = _mm_add_epi32(u5, rnding);
+ u5 = _mm_srai_epi32(u5, bit);
+
+ // stage 5: final add/sub pairs (k, 7 - k) written to out[k] and out[7 - k]
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u0, u7, out + 0, out + 7);
+ addsub_no_clamp_sse4_1(u1, u6, out + 1, out + 6);
+ addsub_no_clamp_sse4_1(u2, u5, out + 2, out + 5);
+ addsub_no_clamp_sse4_1(u3, u4, out + 3, out + 4);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift)))); // NOTE(review): second bound has no "- 1", unlike the first - confirm intended
+ addsub_shift_sse4_1(u0, u7, out + 0, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u1, u6, out + 1, out + 6, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u2, u5, out + 2, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ addsub_shift_sse4_1(u3, u4, out + 3, out + 4, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+/*
+ * Inverse ADST (going by the name) over an 8-entry array of __m128i, reduced
+ * form that reads only in[0]; in[1..7] are never loaded.
+ * NOTE(review): presumably callers pick this variant only when those entries
+ * are zero - confirm at the call site.
+ *
+ * in        - input vectors; only in[0] is read.
+ * out       - receives the eight result vectors out[0..7].
+ * bit       - selects the cosine table through cospi_arr(bit) and is also the
+ *             rounding right-shift applied after every multiply.
+ * do_cols   - non-zero: results are stored directly, odd outputs negated;
+ *             zero: results are passed through neg_shift_sse4_1 together with
+ *             out_shift and a clamp range derived from bd.
+ * bd        - used only when do_cols == 0, to size the output clamp range
+ *             (AOMMAX(16, bd + 6) bits, signed).
+ * out_shift - forwarded to neg_shift_sse4_1 when do_cols == 0.
+ *
+ * No intermediate clamping is performed in this variant.
+ */
+static void iadst8x8_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ __m128i u[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2: with only in[0] present each output is a single rounded product:
+ // u[0] = in[0] * cospi[60], u[1] = -(in[0] * cospi[4]).
+
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(kZero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // stage 3
+ // stage 4: u[4]/u[5] are built directly from u[0]/u[1] using the
+ // cospi[16]/cospi[48] pair.
+ __m128i temp1, temp2;
+ temp1 = _mm_mullo_epi32(u[0], cospi16);
+ x = _mm_mullo_epi32(u[1], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+ u[4] = temp1;
+
+ temp2 = _mm_mullo_epi32(u[0], cospi48);
+ x = _mm_mullo_epi32(u[1], cospi16);
+ u[5] = _mm_sub_epi32(temp2, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // stage 5
+ // stage 6: sum/difference of the cospi[32]-scaled pairs (u[0],u[1]) and
+ // (u[4],u[5]) give u[2],u[3] and u[6],u[7].
+ temp1 = _mm_mullo_epi32(u[0], cospi32);
+ x = _mm_mullo_epi32(u[1], cospi32);
+ u[2] = _mm_add_epi32(temp1, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(temp1, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ temp1 = _mm_mullo_epi32(u[4], cospi32);
+ x = _mm_mullo_epi32(u[5], cospi32);
+ u[6] = _mm_add_epi32(temp1, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(temp1, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7: permute into output order; odd-indexed outputs are negated.
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+/*
+ * Inverse ADST (going by the name) over an 8-entry array of __m128i, general
+ * form that reads all of in[0..7]. Arithmetic is on 32-bit lanes.
+ *
+ * in        - eight input vectors.
+ * out       - receives the eight result vectors out[0..7].
+ * bit       - selects the cosine table through cospi_arr(bit) and is also the
+ *             rounding right-shift applied after every multiply.
+ * do_cols   - non-zero: intermediate clamp range is AOMMAX(16, bd + 6) bits
+ *             and results are stored directly (odd outputs negated);
+ *             zero: intermediate range is AOMMAX(16, bd + 8) bits and results
+ *             go through neg_shift_sse4_1 with out_shift.
+ * bd        - sizes the intermediate and output clamp ranges.
+ * out_shift - forwarded to neg_shift_sse4_1 when do_cols == 0.
+ *
+ * addsub_sse4_1 and neg_shift_sse4_1 are defined elsewhere in the file.
+ */
+static void iadst8x8_new_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i kZero = _mm_setzero_si128();
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[8], v[8], x;
+
+ // stage 0
+ // stage 1
+ // stage 2: four input pairs, each mixed with a cosine pair and rounded.
+ // (1) pair in[7], in[0] with cospi[4], cospi[60]
+
+ u[0] = _mm_mullo_epi32(in[7], cospi4);
+ x = _mm_mullo_epi32(in[0], cospi60);
+ u[0] = _mm_add_epi32(u[0], x);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_mullo_epi32(in[7], cospi60);
+ x = _mm_mullo_epi32(in[0], cospi4);
+ u[1] = _mm_sub_epi32(u[1], x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // (2) pair in[5], in[2] with cospi[20], cospi[44]
+ u[2] = _mm_mullo_epi32(in[5], cospi20);
+ x = _mm_mullo_epi32(in[2], cospi44);
+ u[2] = _mm_add_epi32(u[2], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_mullo_epi32(in[5], cospi44);
+ x = _mm_mullo_epi32(in[2], cospi20);
+ u[3] = _mm_sub_epi32(u[3], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ // (3) pair in[3], in[4] with cospi[36], cospi[28]
+ u[4] = _mm_mullo_epi32(in[3], cospi36);
+ x = _mm_mullo_epi32(in[4], cospi28);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(in[3], cospi28);
+ x = _mm_mullo_epi32(in[4], cospi36);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ // (4) pair in[1], in[6] with cospi[52], cospi[12]
+ u[6] = _mm_mullo_epi32(in[1], cospi52);
+ x = _mm_mullo_epi32(in[6], cospi12);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(in[1], cospi12);
+ x = _mm_mullo_epi32(in[6], cospi52);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 3: combine u[i] with u[i + 4] through addsub_sse4_1 and the clamps.
+ addsub_sse4_1(u[0], u[4], &v[0], &v[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &v[1], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &v[2], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &v[3], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 4: lower half passes through; v[4..7] are mixed with cospi[16]/[48].
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[5] = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ u[5] = _mm_sub_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[6], cospim48);
+ x = _mm_mullo_epi32(v[7], cospi16);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_mullo_epi32(v[6], cospi16);
+ x = _mm_mullo_epi32(v[7], cospim48);
+ u[7] = _mm_sub_epi32(u[7], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 5: combine u[i] with u[i + 2] inside each group of four.
+ addsub_sse4_1(u[0], u[2], &v[0], &v[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &v[1], &v[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &v[4], &v[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &v[5], &v[7], &clamp_lo, &clamp_hi);
+
+ // stage 6: v[0] is copied to u[0] first and then reused below as scratch.
+ u[0] = v[0];
+ u[1] = v[1];
+ u[4] = v[4];
+ u[5] = v[5];
+
+ v[0] = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ u[2] = _mm_add_epi32(v[0], x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(v[0], x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ v[0] = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ u[6] = _mm_add_epi32(v[0], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(v[0], x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ // stage 7: permute into output order; odd-indexed outputs are negated.
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(kZero, u[4]);
+ out[2] = u[6];
+ out[3] = _mm_sub_epi32(kZero, u[2]);
+ out[4] = u[3];
+ out[5] = _mm_sub_epi32(kZero, u[7]);
+ out[6] = u[5];
+ out[7] = _mm_sub_epi32(kZero, u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[4], out + 0, out + 1, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[6], u[2], out + 2, out + 3, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[3], u[7], out + 4, out + 5, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ neg_shift_sse4_1(u[5], u[1], out + 6, out + 7, &clamp_lo_out, &clamp_hi_out,
+ out_shift);
+ }
+}
+/*
+ * Inverse DCT (going by the name) producing 16 output vectors, reduced form
+ * that reads only in[0]. The single value is scaled by cospi[32], rounded,
+ * clamped, and then copied to every one of out[0..15].
+ * NOTE(review): presumably callers pick this variant only when in[1..15] are
+ * zero - confirm at the call site.
+ *
+ * in        - input vectors; only in[0] is read. in[0] is also OVERWRITTEN
+ *             with the final value, so the input buffer is modified.
+ * out       - receives sixteen identical result vectors out[0..15].
+ * bit       - selects the cosine table through cospi_arr(bit) and is the
+ *             rounding right-shift applied after the multiply.
+ * do_cols   - non-zero: clamp to AOMMAX(16, bd + 6) signed bits, no shift;
+ *             zero: add (1 << out_shift) >> 1, arithmetic-shift right by
+ *             out_shift, then clamp to the output range.
+ * bd        - sizes the clamp ranges.
+ * out_shift - final right-shift, used only when do_cols == 0.
+ *
+ * NOTE(review): the second AOMMIN candidate for clamp_hi_out has no "- 1",
+ * unlike the mirrored lower bound - confirm this is intended.
+ */
+static void idct16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ {
+ // stage 0
+ // stage 1
+ // stage 2
+ // stage 3
+ // stage 4: in[0] = (in[0] * cospi[32] + rnding) >> bit, done in place.
+ in[0] = _mm_mullo_epi32(in[0], cospi32);
+ in[0] = _mm_add_epi32(in[0], rnding);
+ in[0] = _mm_srai_epi32(in[0], bit);
+
+ // stage 5
+ // stage 6
+ // stage 7: clamp (and, when do_cols == 0, round-shift first).
+ if (do_cols) {
+ in[0] = _mm_max_epi32(in[0], clamp_lo);
+ in[0] = _mm_min_epi32(in[0], clamp_hi);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+ __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+ in[0] = _mm_add_epi32(in[0], offset);
+ in[0] = _mm_sra_epi32(in[0], _mm_cvtsi32_si128(out_shift));
+ in[0] = _mm_max_epi32(in[0], clamp_lo_out);
+ in[0] = _mm_min_epi32(in[0], clamp_hi_out);
+ }
+
+ out[0] = in[0];
+ out[1] = in[0];
+ out[2] = in[0];
+ out[3] = in[0];
+ out[4] = in[0];
+ out[5] = in[0];
+ out[6] = in[0];
+ out[7] = in[0];
+ out[8] = in[0];
+ out[9] = in[0];
+ out[10] = in[0];
+ out[11] = in[0];
+ out[12] = in[0];
+ out[13] = in[0];
+ out[14] = in[0];
+ out[15] = in[0];
+ }
+}
+/*
+ * Inverse DCT (going by the name) producing 16 output vectors, reduced form
+ * that reads only in[0..7]; in[8..15] are never loaded.
+ * NOTE(review): presumably callers pick this variant only when the upper
+ * eight entries are zero - confirm at the call site.
+ *
+ * in        - input vectors; in[0..7] are read and not modified.
+ * out       - receives the sixteen result vectors out[0..15].
+ * bit       - selects the cosine table through cospi_arr(bit) and is the
+ *             rounding right-shift applied after every multiply.
+ * do_cols   - non-zero: intermediate clamp range is AOMMAX(16, bd + 6) bits
+ *             and the last stage uses addsub_no_clamp_sse4_1;
+ *             zero: intermediate range is AOMMAX(16, bd + 8) bits and the
+ *             last stage uses addsub_shift_sse4_1 with out_shift.
+ * bd        - sizes the intermediate and output clamp ranges.
+ * out_shift - forwarded to addsub_shift_sse4_1 when do_cols == 0.
+ *
+ * half_btf_0_sse4_1, half_btf_sse4_1 and the addsub_* helpers are defined
+ * elsewhere in the file.
+ * NOTE(review): the second AOMMIN candidate for clamp_hi_out has no "- 1",
+ * unlike the mirrored lower bound - confirm this is intended.
+ */
+static void idct16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ {
+ // stage 0
+ // stage 1: place the eight inputs in the even slots of u, permuted.
+ u[0] = in[0];
+ u[2] = in[4];
+ u[4] = in[2];
+ u[6] = in[6];
+ u[8] = in[1];
+ u[10] = in[5];
+ u[12] = in[3];
+ u[14] = in[7];
+
+ // stage 2: fill odd slots u[9..15] from the even ones; each source slot is
+ // read for its partner before being overwritten itself.
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+
+ u[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ u[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+
+ u[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ u[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ // stage 3: same pattern for u[4..7], then add/sub pairs within u[8..15].
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &u[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &u[6], &rnding, bit);
+
+ addsub_sse4_1(u[8], u[9], &u[8], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[10], &u[11], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[13], &u[12], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[14], &u[15], &u[14], &clamp_lo, &clamp_hi);
+
+ // stage 4: u[0] and u[1] both become the rounded u[0] * cospi[32]; x and y
+ // hold the new u[9]/u[10] until the old values have been consumed.
+ x = _mm_mullo_epi32(u[0], cospi32);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+ u[1] = u[0];
+
+ u[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+ u[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &u[4], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &u[7], &u[6], &clamp_lo, &clamp_hi);
+
+ x = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = x;
+ y = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = y;
+
+ // stage 5: add/sub on u[0..3] and u[8..15]; u[5]/u[6] get the rounded
+ // cospi[32]-scaled difference/sum of the old u[6], u[5].
+ addsub_sse4_1(u[0], u[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[5], cospi32);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ // stage 6: add/sub u[i] with u[7 - i]; cospi[32] mixes for the pairs
+ // (u[10], u[13]) and (u[11], u[12]).
+ addsub_sse4_1(u[0], u[7], &u[0], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[6], &u[1], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[5], &u[2], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[4], &u[3], &u[4], &clamp_lo, &clamp_hi);
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[13], cospi32);
+ u[10] = _mm_sub_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[13] = _mm_add_epi32(x, y);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi32);
+ y = _mm_mullo_epi32(u[12], cospi32);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_add_epi32(x, y);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+ // stage 7: pair u[i] with u[15 - i], writing out[i] and out[15 - i].
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(u[0], u[15], out + 0, out + 15);
+ addsub_no_clamp_sse4_1(u[1], u[14], out + 1, out + 14);
+ addsub_no_clamp_sse4_1(u[2], u[13], out + 2, out + 13);
+ addsub_no_clamp_sse4_1(u[3], u[12], out + 3, out + 12);
+ addsub_no_clamp_sse4_1(u[4], u[11], out + 4, out + 11);
+ addsub_no_clamp_sse4_1(u[5], u[10], out + 5, out + 10);
+ addsub_no_clamp_sse4_1(u[6], u[9], out + 6, out + 9);
+ addsub_no_clamp_sse4_1(u[7], u[8], out + 7, out + 8);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(u[0], u[15], out + 0, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[1], u[14], out + 1, out + 14, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[2], u[13], out + 2, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[3], u[12], out + 3, out + 12, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[4], u[11], out + 4, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[5], u[10], out + 5, out + 10, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[6], u[9], out + 6, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(u[7], u[8], out + 7, out + 8, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+/*
+ * Inverse ADST (going by the name) producing 16 output vectors, reduced form
+ * that reads only in[0]; in[1..15] are never loaded.
+ * NOTE(review): presumably callers pick this variant only when those entries
+ * are zero - confirm at the call site.
+ *
+ * in        - input vectors; only in[0] is read.
+ * out       - receives the sixteen result vectors out[0..15].
+ * bit       - selects the cosine table through cospi_arr(bit) and is the
+ *             rounding right-shift applied after every multiply.
+ * do_cols   - non-zero: results are stored directly, odd outputs negated;
+ *             zero: results are passed through neg_shift_sse4_1 together with
+ *             out_shift and a clamp range derived from bd.
+ * bd        - used only when do_cols == 0, to size the output clamp range
+ *             (AOMMAX(16, bd + 6) bits, signed).
+ * out_shift - forwarded to neg_shift_sse4_1 when do_cols == 0.
+ *
+ * No intermediate clamping is performed in this variant; the odd-numbered
+ * stages are plain copies into further slots of v.
+ */
+static void iadst16x16_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i zero = _mm_setzero_si128();
+ __m128i v[16], x, y, temp1, temp2;
+
+ // Calculate the column 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2: v[0] = in[0] * cospi[62], v[1] = -(in[0] * cospi[2]), rounded.
+ x = _mm_mullo_epi32(in[0], cospi62);
+ v[0] = _mm_add_epi32(x, rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ v[1] = _mm_sub_epi32(zero, x);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ // stage 3: plain copy of v[0..1] into v[8..9].
+ v[8] = v[0];
+ v[9] = v[1];
+
+ // stage 4: mix v[8], v[9] with cospi[8]/cospi[56]; temporaries keep the old
+ // values alive until both results are computed.
+ temp1 = _mm_mullo_epi32(v[8], cospi8);
+ x = _mm_mullo_epi32(v[9], cospi56);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[8], cospi56);
+ x = _mm_mullo_epi32(v[9], cospi8);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[8] = temp1;
+ v[9] = temp2;
+
+ // stage 5: plain copies into v[4..5] and v[12..13].
+ v[4] = v[0];
+ v[5] = v[1];
+ v[12] = v[8];
+ v[13] = v[9];
+
+ // stage 6: mix (v[4], v[5]) and (v[12], v[13]) with cospi[16]/cospi[48].
+ temp1 = _mm_mullo_epi32(v[4], cospi16);
+ x = _mm_mullo_epi32(v[5], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[4], cospi48);
+ x = _mm_mullo_epi32(v[5], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[4] = temp1;
+ v[5] = temp2;
+
+ temp1 = _mm_mullo_epi32(v[12], cospi16);
+ x = _mm_mullo_epi32(v[13], cospi48);
+ temp1 = _mm_add_epi32(temp1, x);
+ temp1 = _mm_add_epi32(temp1, rnding);
+ temp1 = _mm_srai_epi32(temp1, bit);
+
+ temp2 = _mm_mullo_epi32(v[12], cospi48);
+ x = _mm_mullo_epi32(v[13], cospi16);
+ temp2 = _mm_sub_epi32(temp2, x);
+ temp2 = _mm_add_epi32(temp2, rnding);
+ temp2 = _mm_srai_epi32(temp2, bit);
+ v[12] = temp1;
+ v[13] = temp2;
+
+ // stage 7: plain copies filling the remaining slots of v.
+ v[2] = v[0];
+ v[3] = v[1];
+ v[6] = v[4];
+ v[7] = v[5];
+ v[10] = v[8];
+ v[11] = v[9];
+ v[14] = v[12];
+ v[15] = v[13];
+
+ // stage 8: rounded cospi[32]-scaled sum/difference for the pairs
+ // (2,3), (6,7), (10,11) and (14,15).
+ y = _mm_mullo_epi32(v[2], cospi32);
+ x = _mm_mullo_epi32(v[3], cospi32);
+ v[2] = _mm_add_epi32(y, x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(y, x);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ y = _mm_mullo_epi32(v[6], cospi32);
+ x = _mm_mullo_epi32(v[7], cospi32);
+ v[6] = _mm_add_epi32(y, x);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(y, x);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ y = _mm_mullo_epi32(v[10], cospi32);
+ x = _mm_mullo_epi32(v[11], cospi32);
+ v[10] = _mm_add_epi32(y, x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(y, x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ y = _mm_mullo_epi32(v[14], cospi32);
+ x = _mm_mullo_epi32(v[15], cospi32);
+ v[14] = _mm_add_epi32(y, x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(y, x);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 9: permute into output order; odd-indexed outputs are negated.
+ if (do_cols) {
+ out[0] = v[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), v[8]);
+ out[2] = v[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), v[4]);
+ out[4] = v[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), v[14]);
+ out[6] = v[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), v[2]);
+ out[8] = v[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), v[11]);
+ out[10] = v[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), v[7]);
+ out[12] = v[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), v[13]);
+ out[14] = v[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), v[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(v[0], v[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[12], v[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[6], v[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[10], v[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[3], v[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[15], v[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[5], v[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(v[9], v[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+/*
+ * Inverse ADST (going by the name) producing 16 output vectors, reduced form
+ * that reads only in[0..7]; in[8..15] are never loaded.
+ * NOTE(review): presumably callers pick this variant only when the upper
+ * eight entries are zero - confirm at the call site.
+ *
+ * in        - input vectors; in[0..7] are read and not modified.
+ * out       - receives the sixteen result vectors out[0..15].
+ * bit       - selects the cosine table through cospi_arr(bit) and is the
+ *             rounding right-shift applied after every multiply.
+ * do_cols   - non-zero: intermediate clamp range is AOMMAX(16, bd + 6) bits
+ *             and results are stored directly (odd outputs negated);
+ *             zero: intermediate range is AOMMAX(16, bd + 8) bits and results
+ *             go through neg_shift_sse4_1 with out_shift.
+ * bd        - sizes the intermediate and output clamp ranges.
+ * out_shift - forwarded to neg_shift_sse4_1 when do_cols == 0.
+ *
+ * All butterflies update u[] in place; x and y hold products of the old
+ * values so that each pair can be overwritten safely.
+ */
+static void iadst16x16_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i u[16], x, y;
+
+ // Calculate the column 0, 1, 2, 3
+ {
+ // stage 0
+ // stage 1
+ // stage 2: every u[k] is one rounded product of a single input. Even inputs
+ // in[0,2,4,6] feed u[0..7] (odd slots negated); in[7,5,3,1] feed u[8..15].
+ __m128i zero = _mm_setzero_si128();
+ x = _mm_mullo_epi32(in[0], cospi62);
+ u[0] = _mm_add_epi32(x, rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ x = _mm_mullo_epi32(in[0], cospi2);
+ u[1] = _mm_sub_epi32(zero, x);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi54);
+ u[2] = _mm_add_epi32(x, rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ x = _mm_mullo_epi32(in[2], cospi10);
+ u[3] = _mm_sub_epi32(zero, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi46);
+ u[4] = _mm_add_epi32(x, rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(in[4], cospi18);
+ u[5] = _mm_sub_epi32(zero, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi38);
+ u[6] = _mm_add_epi32(x, rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(in[6], cospi26);
+ u[7] = _mm_sub_epi32(zero, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[8] = _mm_mullo_epi32(in[7], cospi34);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ u[9] = _mm_mullo_epi32(in[7], cospi30);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ u[10] = _mm_mullo_epi32(in[5], cospi42);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_mullo_epi32(in[5], cospi22);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ u[12] = _mm_mullo_epi32(in[3], cospi50);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ u[13] = _mm_mullo_epi32(in[3], cospi14);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ u[14] = _mm_mullo_epi32(in[1], cospi58);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_mullo_epi32(in[1], cospi6);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 3: combine u[i] with u[i + 8] through addsub_sse4_1 and the clamps.
+ addsub_sse4_1(u[0], u[8], &u[0], &u[8], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[9], &u[1], &u[9], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[10], &u[2], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[11], &u[3], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[12], &u[4], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[13], &u[5], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[6], u[14], &u[6], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[15], &u[7], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 4: mix pairs in u[8..15]; y keeps a product of the old first
+ // element so the second result can still be formed after the overwrite.
+ y = _mm_mullo_epi32(u[8], cospi56);
+ x = _mm_mullo_epi32(u[9], cospi56);
+ u[8] = _mm_mullo_epi32(u[8], cospi8);
+ u[8] = _mm_add_epi32(u[8], x);
+ u[8] = _mm_add_epi32(u[8], rnding);
+ u[8] = _mm_srai_epi32(u[8], bit);
+
+ x = _mm_mullo_epi32(u[9], cospi8);
+ u[9] = _mm_sub_epi32(y, x);
+ u[9] = _mm_add_epi32(u[9], rnding);
+ u[9] = _mm_srai_epi32(u[9], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi24);
+ y = _mm_mullo_epi32(u[10], cospi24);
+ u[10] = _mm_mullo_epi32(u[10], cospi40);
+ u[10] = _mm_add_epi32(u[10], x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ x = _mm_mullo_epi32(u[11], cospi40);
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi8);
+ y = _mm_mullo_epi32(u[12], cospi8);
+ u[12] = _mm_mullo_epi32(u[12], cospim56);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospim56);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi40);
+ y = _mm_mullo_epi32(u[14], cospi40);
+ u[14] = _mm_mullo_epi32(u[14], cospim24);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim24);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 5: combine u[i] with u[i + 4] inside each group of eight.
+ addsub_sse4_1(u[0], u[4], &u[0], &u[4], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[5], &u[1], &u[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[2], u[6], &u[2], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[3], u[7], &u[3], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[12], &u[8], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[13], &u[9], &u[13], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[10], u[14], &u[10], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[11], u[15], &u[11], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 6: mix pairs (4,5), (6,7), (12,13), (14,15) with cospi[16]/[48].
+ x = _mm_mullo_epi32(u[5], cospi48);
+ y = _mm_mullo_epi32(u[4], cospi48);
+ u[4] = _mm_mullo_epi32(u[4], cospi16);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ x = _mm_mullo_epi32(u[5], cospi16);
+ u[5] = _mm_sub_epi32(y, x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ x = _mm_mullo_epi32(u[7], cospi16);
+ y = _mm_mullo_epi32(u[6], cospi16);
+ u[6] = _mm_mullo_epi32(u[6], cospim48);
+ u[6] = _mm_add_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ x = _mm_mullo_epi32(u[7], cospim48);
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi48);
+ y = _mm_mullo_epi32(u[12], cospi48);
+ u[12] = _mm_mullo_epi32(u[12], cospi16);
+ u[12] = _mm_add_epi32(u[12], x);
+ u[12] = _mm_add_epi32(u[12], rnding);
+ u[12] = _mm_srai_epi32(u[12], bit);
+
+ x = _mm_mullo_epi32(u[13], cospi16);
+ u[13] = _mm_sub_epi32(y, x);
+ u[13] = _mm_add_epi32(u[13], rnding);
+ u[13] = _mm_srai_epi32(u[13], bit);
+
+ x = _mm_mullo_epi32(u[15], cospi16);
+ y = _mm_mullo_epi32(u[14], cospi16);
+ u[14] = _mm_mullo_epi32(u[14], cospim48);
+ u[14] = _mm_add_epi32(u[14], x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ x = _mm_mullo_epi32(u[15], cospim48);
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 7: combine u[i] with u[i + 2] inside each group of four.
+ addsub_sse4_1(u[0], u[2], &u[0], &u[2], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[1], u[3], &u[1], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[4], u[6], &u[4], &u[6], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[5], u[7], &u[5], &u[7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[8], u[10], &u[8], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[11], &u[9], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[12], u[14], &u[12], &u[14], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[13], u[15], &u[13], &u[15], &clamp_lo, &clamp_hi);
+
+ // stage 8: rounded cospi[32]-scaled sum/difference for the pairs
+ // (2,3), (6,7), (10,11) and (14,15).
+ y = _mm_mullo_epi32(u[2], cospi32);
+ x = _mm_mullo_epi32(u[3], cospi32);
+ u[2] = _mm_add_epi32(y, x);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ u[3] = _mm_sub_epi32(y, x);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+ y = _mm_mullo_epi32(u[6], cospi32);
+ x = _mm_mullo_epi32(u[7], cospi32);
+ u[6] = _mm_add_epi32(y, x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = _mm_sub_epi32(y, x);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ y = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[11], cospi32);
+ u[10] = _mm_add_epi32(y, x);
+ u[10] = _mm_add_epi32(u[10], rnding);
+ u[10] = _mm_srai_epi32(u[10], bit);
+
+ u[11] = _mm_sub_epi32(y, x);
+ u[11] = _mm_add_epi32(u[11], rnding);
+ u[11] = _mm_srai_epi32(u[11], bit);
+
+ y = _mm_mullo_epi32(u[14], cospi32);
+ x = _mm_mullo_epi32(u[15], cospi32);
+ u[14] = _mm_add_epi32(y, x);
+ u[14] = _mm_add_epi32(u[14], rnding);
+ u[14] = _mm_srai_epi32(u[14], bit);
+
+ u[15] = _mm_sub_epi32(y, x);
+ u[15] = _mm_add_epi32(u[15], rnding);
+ u[15] = _mm_srai_epi32(u[15], bit);
+
+ // stage 9: permute into output order; odd-indexed outputs are negated.
+ if (do_cols) {
+ out[0] = u[0];
+ out[1] = _mm_sub_epi32(_mm_setzero_si128(), u[8]);
+ out[2] = u[12];
+ out[3] = _mm_sub_epi32(_mm_setzero_si128(), u[4]);
+ out[4] = u[6];
+ out[5] = _mm_sub_epi32(_mm_setzero_si128(), u[14]);
+ out[6] = u[10];
+ out[7] = _mm_sub_epi32(_mm_setzero_si128(), u[2]);
+ out[8] = u[3];
+ out[9] = _mm_sub_epi32(_mm_setzero_si128(), u[11]);
+ out[10] = u[15];
+ out[11] = _mm_sub_epi32(_mm_setzero_si128(), u[7]);
+ out[12] = u[5];
+ out[13] = _mm_sub_epi32(_mm_setzero_si128(), u[13]);
+ out[14] = u[9];
+ out[15] = _mm_sub_epi32(_mm_setzero_si128(), u[1]);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+ const __m128i clamp_hi_out =
+ _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+ neg_shift_sse4_1(u[0], u[8], out + 0, out + 1, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[12], u[4], out + 2, out + 3, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[6], u[14], out + 4, out + 5, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[10], u[2], out + 6, out + 7, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[3], u[11], out + 8, out + 9, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[15], u[7], out + 10, out + 11, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[5], u[13], out + 12, out + 13, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ neg_shift_sse4_1(u[9], u[1], out + 14, out + 15, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+ }
+}
+
+// 16-point inverse DCT butterfly network; every arithmetic step works on the
+// 32-bit lanes of the __m128i vectors.
+//
+//   in        - 16 input vectors, read as in[0..15].
+//   out       - 16 output vectors, written as out[0..15].
+//   bit       - selects the cosine table through cospi_arr(bit) and is the
+//               right-shift applied (after adding 1 << (bit - 1)) to every
+//               weighted sum.
+//   do_cols   - non-zero: the bounds handed to addsub_sse4_1 span
+//               max(16, bd + 6) bits and the last stage uses
+//               addsub_no_clamp_sse4_1. Zero: the bounds span
+//               max(16, bd + 8) bits and the last stage uses
+//               addsub_shift_sse4_1 with out_shift and separate output bounds.
+//   bd        - presumably the sample bit depth (TODO confirm); it is only
+//               used here to size the clamp bounds.
+//   out_shift - forwarded to addsub_shift_sse4_1 when do_cols is zero.
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                             int bd, int out_shift) {
+  // Input index consumed by each first-stage slot.
+  static const int kStage1Src[16] = { 0, 8, 4, 12, 2, 10, 6, 14,
+                                      1, 9, 5, 13, 3, 11, 7, 15 };
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  // a[] and b[] alternate as source and destination from stage to stage.
+  __m128i a[16], b[16], p0, p1;
+  int i;
+
+  // stage 0
+  // stage 1: gather the inputs in butterfly order.
+  for (i = 0; i < 16; ++i) a[i] = in[kStage1Src[i]];
+
+  // stage 2: slots 0..7 pass through, slots 8..15 are weighted pair sums.
+  for (i = 0; i < 8; ++i) b[i] = a[i];
+  b[8] = half_btf_sse4_1(&cospi60, &a[8], &cospim4, &a[15], &rnding, bit);
+  b[9] = half_btf_sse4_1(&cospi28, &a[9], &cospim36, &a[14], &rnding, bit);
+  b[10] = half_btf_sse4_1(&cospi44, &a[10], &cospim20, &a[13], &rnding, bit);
+  b[11] = half_btf_sse4_1(&cospi12, &a[11], &cospim52, &a[12], &rnding, bit);
+  b[12] = half_btf_sse4_1(&cospi52, &a[11], &cospi12, &a[12], &rnding, bit);
+  b[13] = half_btf_sse4_1(&cospi20, &a[10], &cospi44, &a[13], &rnding, bit);
+  b[14] = half_btf_sse4_1(&cospi36, &a[9], &cospi28, &a[14], &rnding, bit);
+  b[15] = half_btf_sse4_1(&cospi4, &a[8], &cospi60, &a[15], &rnding, bit);
+
+  // stage 3
+  for (i = 0; i < 4; ++i) a[i] = b[i];
+  a[4] = half_btf_sse4_1(&cospi56, &b[4], &cospim8, &b[7], &rnding, bit);
+  a[5] = half_btf_sse4_1(&cospi24, &b[5], &cospim40, &b[6], &rnding, bit);
+  a[6] = half_btf_sse4_1(&cospi40, &b[5], &cospi24, &b[6], &rnding, bit);
+  a[7] = half_btf_sse4_1(&cospi8, &b[4], &cospi56, &b[7], &rnding, bit);
+  addsub_sse4_1(b[8], b[9], &a[8], &a[9], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[11], b[10], &a[11], &a[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[12], b[13], &a[12], &a[13], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[15], b[14], &a[15], &a[14], &clamp_lo, &clamp_hi);
+
+  // stage 4: slots 0/1 share their two cospi32 products.
+  p0 = _mm_mullo_epi32(a[0], cospi32);
+  p1 = _mm_mullo_epi32(a[1], cospi32);
+  b[0] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p0, p1), rnding), bit);
+  b[1] = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p0, p1), rnding), bit);
+  b[2] = half_btf_sse4_1(&cospi48, &a[2], &cospim16, &a[3], &rnding, bit);
+  b[3] = half_btf_sse4_1(&cospi16, &a[2], &cospi48, &a[3], &rnding, bit);
+  addsub_sse4_1(a[4], a[5], &b[4], &b[5], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(a[7], a[6], &b[7], &b[6], &clamp_lo, &clamp_hi);
+  b[8] = a[8];
+  b[9] = half_btf_sse4_1(&cospim16, &a[9], &cospi48, &a[14], &rnding, bit);
+  b[10] = half_btf_sse4_1(&cospim48, &a[10], &cospim16, &a[13], &rnding, bit);
+  b[11] = a[11];
+  b[12] = a[12];
+  b[13] = half_btf_sse4_1(&cospim16, &a[10], &cospi48, &a[13], &rnding, bit);
+  b[14] = half_btf_sse4_1(&cospi48, &a[9], &cospi16, &a[14], &rnding, bit);
+  b[15] = a[15];
+
+  // stage 5
+  addsub_sse4_1(b[0], b[3], &a[0], &a[3], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[1], b[2], &a[1], &a[2], &clamp_lo, &clamp_hi);
+  a[4] = b[4];
+  p0 = _mm_mullo_epi32(b[5], cospi32);
+  p1 = _mm_mullo_epi32(b[6], cospi32);
+  a[5] = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p1, p0), rnding), bit);
+  a[6] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p1, p0), rnding), bit);
+  a[7] = b[7];
+  addsub_sse4_1(b[8], b[11], &a[8], &a[11], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[9], b[10], &a[9], &a[10], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[15], b[12], &a[15], &a[12], &clamp_lo, &clamp_hi);
+  addsub_sse4_1(b[14], b[13], &a[14], &a[13], &clamp_lo, &clamp_hi);
+
+  // stage 6: mirror add/sub across slots 0..7, cospi32 pairs for 10..13.
+  for (i = 0; i < 4; ++i) {
+    addsub_sse4_1(a[i], a[7 - i], &b[i], &b[7 - i], &clamp_lo, &clamp_hi);
+  }
+  b[8] = a[8];
+  b[9] = a[9];
+  p0 = _mm_mullo_epi32(a[10], cospi32);
+  p1 = _mm_mullo_epi32(a[13], cospi32);
+  b[10] = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p1, p0), rnding), bit);
+  b[13] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p0, p1), rnding), bit);
+  p0 = _mm_mullo_epi32(a[11], cospi32);
+  p1 = _mm_mullo_epi32(a[12], cospi32);
+  b[11] = _mm_srai_epi32(_mm_add_epi32(_mm_sub_epi32(p1, p0), rnding), bit);
+  b[12] = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(p0, p1), rnding), bit);
+  b[14] = a[14];
+  b[15] = a[15];
+
+  // stage 7: mirror add/sub of slot i with slot 15 - i into the output.
+  if (do_cols) {
+    for (i = 0; i < 8; ++i) {
+      addsub_no_clamp_sse4_1(b[i], b[15 - i], out + i, out + 15 - i);
+    }
+    return;
+  }
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+      -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+  const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+      (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+  for (i = 0; i < 8; ++i) {
+    addsub_shift_sse4_1(b[i], b[15 - i], out + i, out + 15 - i, &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
+
+// Weighted pair used by every multiply stage of iadst16x16_sse4_1:
+//   *out0 = (a * w0 + b * w1 + rnding) >> bit
+//   *out1 = (a * w1 - b * w0 + rnding) >> bit
+// computed per 32-bit lane. out0/out1 must not alias a or b, because *out0 is
+// stored before a and b are read again for *out1.
+static INLINE void iadst16_rot_pair_sse4_1(const __m128i *a, const __m128i *b,
+                                           const __m128i *w0,
+                                           const __m128i *w1,
+                                           const __m128i *rnding, int bit,
+                                           __m128i *out0, __m128i *out1) {
+  __m128i acc, term;
+
+  acc = _mm_mullo_epi32(*a, *w0);
+  term = _mm_mullo_epi32(*b, *w1);
+  acc = _mm_add_epi32(acc, term);
+  acc = _mm_add_epi32(acc, *rnding);
+  *out0 = _mm_srai_epi32(acc, bit);
+
+  acc = _mm_mullo_epi32(*a, *w1);
+  term = _mm_mullo_epi32(*b, *w0);
+  acc = _mm_sub_epi32(acc, term);
+  acc = _mm_add_epi32(acc, *rnding);
+  *out1 = _mm_srai_epi32(acc, bit);
+}
+
+// 16-point inverse ADST butterfly network; every arithmetic step works on the
+// 32-bit lanes of the __m128i vectors.
+//
+//   in        - 16 input vectors, read as in[0..15].
+//   out       - 16 output vectors, written as out[0..15].
+//   bit       - selects the cosine table through cospi_arr(bit) and is the
+//               right-shift applied (after adding 1 << (bit - 1)) to every
+//               weighted sum.
+//   do_cols   - non-zero: the bounds handed to addsub_sse4_1 span
+//               max(16, bd + 6) bits and the last stage only permutes and
+//               negates. Zero: the bounds span max(16, bd + 8) bits and the
+//               last stage goes through neg_shift_sse4_1 with out_shift and
+//               max(16, bd + 6)-bit output bounds.
+//   bd        - presumably the sample bit depth (TODO confirm); it is only
+//               used here to size the clamp bounds.
+//   out_shift - forwarded to neg_shift_sse4_1 when do_cols is zero.
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+                              int bd, int out_shift) {
+  // Slot of the final stage that feeds each output; odd outputs are negated.
+  static const int kStage9Src[16] = { 0, 8, 12, 4, 6, 14, 10, 2,
+                                      3, 11, 15, 7, 5, 13, 9, 1 };
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+  const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+  // s[] holds add/sub results, t[] holds multiply-stage results.
+  __m128i s[16], t[16];
+  int i, k, base;
+
+  // stage 0
+  // stage 1
+  // stage 2: pair k combines in[15 - 2k] with in[2k] using the weights
+  // cospi[8k + 2] and cospi[62 - 8k].
+  for (k = 0; k < 8; ++k) {
+    const __m128i w0 = _mm_set1_epi32(cospi[8 * k + 2]);
+    const __m128i w1 = _mm_set1_epi32(cospi[62 - 8 * k]);
+    iadst16_rot_pair_sse4_1(&in[15 - 2 * k], &in[2 * k], &w0, &w1, &rnding,
+                            bit, &t[2 * k], &t[2 * k + 1]);
+  }
+
+  // stage 3: add/sub at distance 8.
+  for (i = 0; i < 8; ++i) {
+    addsub_sse4_1(t[i], t[i + 8], &s[i], &s[i + 8], &clamp_lo, &clamp_hi);
+  }
+
+  // stage 4: lower half passes through, upper half is rotated pairwise.
+  for (i = 0; i < 8; ++i) t[i] = s[i];
+  iadst16_rot_pair_sse4_1(&s[8], &s[9], &cospi8, &cospi56, &rnding, bit,
+                          &t[8], &t[9]);
+  iadst16_rot_pair_sse4_1(&s[10], &s[11], &cospi40, &cospi24, &rnding, bit,
+                          &t[10], &t[11]);
+  iadst16_rot_pair_sse4_1(&s[12], &s[13], &cospim56, &cospi8, &rnding, bit,
+                          &t[12], &t[13]);
+  iadst16_rot_pair_sse4_1(&s[14], &s[15], &cospim24, &cospi40, &rnding, bit,
+                          &t[14], &t[15]);
+
+  // stage 5: add/sub at distance 4 inside each group of eight.
+  for (base = 0; base < 16; base += 8) {
+    for (i = base; i < base + 4; ++i) {
+      addsub_sse4_1(t[i], t[i + 4], &s[i], &s[i + 4], &clamp_lo, &clamp_hi);
+    }
+  }
+
+  // stage 6: in each group of eight, the first four slots pass through and the
+  // last four are rotated pairwise.
+  for (base = 0; base < 16; base += 8) {
+    for (i = base; i < base + 4; ++i) t[i] = s[i];
+    iadst16_rot_pair_sse4_1(&s[base + 4], &s[base + 5], &cospi16, &cospi48,
+                            &rnding, bit, &t[base + 4], &t[base + 5]);
+    iadst16_rot_pair_sse4_1(&s[base + 6], &s[base + 7], &cospim48, &cospi16,
+                            &rnding, bit, &t[base + 6], &t[base + 7]);
+  }
+
+  // stage 7: add/sub at distance 2 inside each group of four.
+  for (base = 0; base < 16; base += 4) {
+    addsub_sse4_1(t[base], t[base + 2], &s[base], &s[base + 2], &clamp_lo,
+                  &clamp_hi);
+    addsub_sse4_1(t[base + 1], t[base + 3], &s[base + 1], &s[base + 3],
+                  &clamp_lo, &clamp_hi);
+  }
+
+  // stage 8: in each group of four, two slots pass through and the other two
+  // become the cospi32-scaled sum and difference.
+  for (base = 0; base < 16; base += 4) {
+    t[base] = s[base];
+    t[base + 1] = s[base + 1];
+    iadst16_rot_pair_sse4_1(&s[base + 2], &s[base + 3], &cospi32, &cospi32,
+                            &rnding, bit, &t[base + 2], &t[base + 3]);
+  }
+
+  // stage 9: permute into output order; every odd output is negated.
+  if (do_cols) {
+    const __m128i zero = _mm_setzero_si128();
+    for (k = 0; k < 8; ++k) {
+      out[2 * k] = t[kStage9Src[2 * k]];
+      out[2 * k + 1] = _mm_sub_epi32(zero, t[kStage9Src[2 * k + 1]]);
+    }
+    return;
+  }
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const __m128i clamp_lo_out = _mm_set1_epi32(-(1 << (log_range_out - 1)));
+  const __m128i clamp_hi_out = _mm_set1_epi32((1 << (log_range_out - 1)) - 1);
+
+  for (k = 0; k < 8; ++k) {
+    neg_shift_sse4_1(t[kStage9Src[2 * k]], t[kStage9Src[2 * k + 1]],
+                     out + 2 * k, out + 2 * k + 1, &clamp_lo_out,
+                     &clamp_hi_out, out_shift);
+  }
+}
+
+// Stage 8 of the 64-point inverse DCT, applied in place to u[]. Touches
+// u[10..13], u[16..31], u[36..43] and u[52..59]; every other slot is left
+// untouched. The cosine weights, clamp bounds and rounding constant are
+// supplied by the caller as pre-broadcast vectors.
+static INLINE void idct64_stage8_sse4_1(
+    __m128i *u, const __m128i *cospim32, const __m128i *cospi32,
+    const __m128i *cospim16, const __m128i *cospi48, const __m128i *cospi16,
+    const __m128i *cospim48, const __m128i *clamp_lo, const __m128i *clamp_hi,
+    const __m128i *rnding, int bit) {
+  int lo, hi, k;
+  // New value of u[lo], held back until u[hi] has been computed from the old
+  // pair.
+  __m128i new_lo;
+
+  // Pairs (10,13) and (11,12).
+  for (lo = 10, hi = 13; lo < 12; ++lo, --hi) {
+    new_lo = half_btf_sse4_1(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Add/sub of (16+k, 23-k) and of (31-k, 24+k).
+  for (k = 0; k < 4; ++k) {
+    addsub_sse4_1(u[16 + k], u[23 - k], &u[16 + k], &u[23 - k], clamp_lo,
+                  clamp_hi);
+    addsub_sse4_1(u[31 - k], u[24 + k], &u[31 - k], &u[24 + k], clamp_lo,
+                  clamp_hi);
+  }
+
+  // Pairs (36,59) .. (39,56).
+  for (lo = 36, hi = 59; lo < 40; ++lo, --hi) {
+    new_lo = half_btf_sse4_1(cospim16, &u[lo], cospi48, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi48, &u[lo], cospi16, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Pairs (40,55) .. (43,52).
+  for (lo = 40, hi = 55; lo < 44; ++lo, --hi) {
+    new_lo = half_btf_sse4_1(cospim48, &u[lo], cospim16, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospim16, &u[lo], cospi48, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+}
+
+// Stage 9 of the 64-point inverse DCT, applied in place to u[]. Slots 0..15
+// and 32..63 go through mirrored add/sub pairs; slots 20..27 are recombined
+// with +/-cospi32 weights. Slots 16..19 and 28..31 are left untouched.
+static INLINE void idct64_stage9_sse4_1(__m128i *u, const __m128i *cospim32,
+                                        const __m128i *cospi32,
+                                        const __m128i *clamp_lo,
+                                        const __m128i *clamp_hi,
+                                        const __m128i *rnding, int bit) {
+  int lo, hi;
+  // New value of u[lo], held back until u[hi] has been computed from the old
+  // pair.
+  __m128i new_lo;
+
+  // Add/sub of (0,15) .. (7,8).
+  for (lo = 0, hi = 15; lo < 8; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // Pairs (20,27) .. (23,24).
+  for (lo = 20, hi = 27; lo < 24; ++lo, --hi) {
+    new_lo = half_btf_sse4_1(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+
+  // Add/sub of (32,47) .. (39,40): the sum lands in the lower slot.
+  for (lo = 32, hi = 47; lo < 40; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // Add/sub of (63,48) .. (56,55): the sum lands in the upper slot.
+  for (lo = 48, hi = 63; lo < 56; ++lo, --hi) {
+    addsub_sse4_1(u[hi], u[lo], &u[hi], &u[lo], clamp_lo, clamp_hi);
+  }
+}
+
+// Stage 10 of the 64-point inverse DCT, applied in place to u[]. Slots 0..31
+// go through mirrored add/sub pairs; slots 40..55 are recombined with
+// +/-cospi32 weights. Slots 32..39 and 56..63 are left untouched.
+static INLINE void idct64_stage10_sse4_1(__m128i *u, const __m128i *cospim32,
+                                         const __m128i *cospi32,
+                                         const __m128i *clamp_lo,
+                                         const __m128i *clamp_hi,
+                                         const __m128i *rnding, int bit) {
+  int lo, hi;
+  // New value of u[lo], held back until u[hi] has been computed from the old
+  // pair.
+  __m128i new_lo;
+
+  // Add/sub of (0,31) .. (15,16).
+  for (lo = 0, hi = 31; lo < 16; ++lo, --hi) {
+    addsub_sse4_1(u[lo], u[hi], &u[lo], &u[hi], clamp_lo, clamp_hi);
+  }
+
+  // Pairs (40,55) .. (47,48). Each pair only reads its own two slots, so they
+  // can be processed one after another.
+  for (lo = 40, hi = 55; lo < 48; ++lo, --hi) {
+    new_lo = half_btf_sse4_1(cospim32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[hi] = half_btf_sse4_1(cospi32, &u[lo], cospi32, &u[hi], rnding, bit);
+    u[lo] = new_lo;
+  }
+}
+
+// Stage 11 (final) of the 64-point inverse DCT: combines u[i] with u[63 - i]
+// and stores the two results in out[i] and out[63 - i].
+//
+//   do_cols   - non-zero: uses addsub_no_clamp_sse4_1. Zero: uses
+//               addsub_shift_sse4_1 with out_shift and the output bounds
+//               computed below.
+//   bd        - presumably the sample bit depth (TODO confirm); it sizes the
+//               output bounds.
+//   log_range - width in bits of the caller's intermediate range; it further
+//               tightens the output bounds.
+static INLINE void idct64_stage11_sse4_1(__m128i *u, __m128i *out, int do_cols,
+                                         int bd, int out_shift,
+                                         const int log_range) {
+  int lo, hi;
+
+  if (do_cols) {
+    for (lo = 0, hi = 63; lo < 32; ++lo, --hi) {
+      addsub_no_clamp_sse4_1(u[lo], u[hi], &out[lo], &out[hi]);
+    }
+    return;
+  }
+
+  const int log_range_out = AOMMAX(16, bd + 6);
+  const int lower_from_out = -(1 << (log_range_out - 1));
+  const int lower_from_shift = -(1 << (log_range - 1 - out_shift));
+  const int upper_from_out = (1 << (log_range_out - 1)) - 1;
+  const int upper_from_shift = (1 << (log_range - 1 - out_shift));
+  const __m128i clamp_lo_out =
+      _mm_set1_epi32(AOMMAX(lower_from_out, lower_from_shift));
+  const __m128i clamp_hi_out =
+      _mm_set1_epi32(AOMMIN(upper_from_out, upper_from_shift));
+
+  for (lo = 0, hi = 63; lo < 32; ++lo, --hi) {
+    addsub_shift_sse4_1(u[lo], u[hi], &out[lo], &out[hi], &clamp_lo_out,
+                        &clamp_hi_out, out_shift);
+  }
+}
+
+// 64-point inverse DCT variant that reads only in[0]: that vector is scaled by
+// cospi[32] through half_btf_0_sse4_1, range-limited, and the same result is
+// stored in all 64 output vectors. (The "low1" name suggests it is meant for
+// inputs whose other coefficients are zero -- TODO confirm with the caller.)
+//
+//   do_cols   - non-zero: the result is limited to max(16, bd + 6) bits.
+//               Zero: the result gets a rounding right shift by out_shift and
+//               is then limited to the output bounds computed below.
+//   bd        - presumably the sample bit depth (TODO confirm); it is only
+//               used here to size the clamp bounds.
+static void idct64x64_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i dc;
+  int i;
+
+  // stage 1 .. stage 6: the only work is the cospi32 scaling.
+  dc = half_btf_0_sse4_1(&cospi32, &in[0], &rnding, bit);
+
+  // stage 8 .. stage 11: range-limit the single value.
+  if (do_cols) {
+    const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+    const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+    dc = _mm_min_epi32(_mm_max_epi32(dc, clamp_lo), clamp_hi);
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+        -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+    const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+        (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+    const __m128i offset = _mm_set1_epi32((1 << out_shift) >> 1);
+
+    dc = _mm_sra_epi32(_mm_add_epi32(dc, offset),
+                       _mm_cvtsi32_si128(out_shift));
+    dc = _mm_min_epi32(_mm_max_epi32(dc, clamp_lo_out), clamp_hi_out);
+  }
+
+  // Every output vector receives the same value.
+  for (i = 0; i < 64; ++i) out[i] = dc;
+}
+
+static void idct64x64_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+
+ {
+ __m128i u[64];
+
+ // stage 1
+ u[0] = in[0];
+ u[8] = in[4];
+ u[16] = in[2];
+ u[24] = in[6];
+ u[32] = in[1];
+ u[40] = in[5];
+ u[48] = in[3];
+ u[56] = in[7];
+
+ // stage 2
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+
+ // stage 3
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[62] = u[63];
+
+ // stage 4
+ __m128i temp1, temp2;
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[17] = u[16];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[30] = u[31];
+
+ temp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = temp2;
+
+ temp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[46] = temp2;
+
+ // stage 5
+ u[9] = u[8];
+ u[14] = u[15];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[22] = temp2;
+
+ u[35] = u[32];
+ u[34] = u[33];
+ u[36] = u[39];
+ u[37] = u[38];
+ u[43] = u[40];
+ u[42] = u[41];
+ u[44] = u[47];
+ u[45] = u[46];
+ u[51] = u[48];
+ u[50] = u[49];
+ u[52] = u[55];
+ u[53] = u[54];
+ u[59] = u[56];
+ u[58] = u[57];
+ u[60] = u[63];
+ u[61] = u[62];
+
+ // stage 6
+ temp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = temp1;
+
+ temp2 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = temp2;
+ u[19] = u[16];
+ u[18] = u[17];
+ u[20] = u[23];
+ u[21] = u[22];
+ u[27] = u[24];
+ u[26] = u[25];
+ u[28] = u[31];
+ u[29] = u[30];
+
+ temp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = temp1;
+ temp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[35] = temp2;
+ temp1 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[36] = temp1;
+ temp2 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[37] = temp2;
+ temp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = temp1;
+ temp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[43] = temp2;
+ temp1 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[44] = temp1;
+ temp2 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[45] = temp2;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ u[11] = u[8];
+ u[10] = u[9];
+ u[12] = u[15];
+ u[13] = u[14];
+
+ temp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = temp1;
+ temp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[19] = temp2;
+ temp1 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[20] = temp1;
+ temp2 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[21] = temp2;
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ u[7] = u[0];
+ u[6] = u[1];
+ u[5] = u[2];
+ u[4] = u[3];
+ u[9] = u[9];
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
+}
+/* 64-point inverse DCT pass (per the function name) for the case where only
+ * the first 16 input vectors, in[0]..in[15], are read.
+ * NOTE(review): presumably the caller guarantees that in[16]..in[63] are
+ * zero; confirm against the call site.
+ *
+ * in        - input vectors; only indices 0..15 are accessed here.
+ * out       - handed to idct64_stage11_sse4_1(), which presumably stores the
+ *             64 results (that helper is not visible from this function).
+ * bit       - selects the cosine table via cospi_arr(bit) and is passed to
+ *             the half_btf helpers together with rnding = 1 << (bit - 1).
+ * do_cols   - nonzero uses bd + 6 for the clamp range, zero uses bd + 8
+ *             (both floored at 16); also forwarded to stage 11.
+ * bd        - feeds the clamp range and is forwarded to stage 11
+ *             (presumably the bit depth; TODO confirm).
+ * out_shift - forwarded unchanged to stage 11.
+ *
+ * Stages 1-7 and the u[0..7] part of stage 8 are written out here on a
+ * single work array u[64] that is updated in place; the remainder of stage 8
+ * and stages 9-11 are delegated to the idct64_stage8..11 helpers. The tmp
+ * variables hold one butterfly output until the partner output has been
+ * computed from the still-unmodified inputs, so statement order matters.
+ */
+static void idct64x64_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));  // rounding term handed to every half_btf call
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));  // -2^(log_range-1)
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);  // 2^(log_range-1) - 1
+ // Cosine table entries cospi[k], each replicated into all four 32-bit lanes.
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+ // Negated table entries, -cospi[k], replicated the same way.
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64];  // in-place work array; slots are first written in the stage that needs them
+ __m128i tmp1, tmp2, tmp3, tmp4;  // staging for butterfly outputs (see header comment)
+ // stage 1: scatter in[0..15] into u; every destination index is a multiple of 4
+ u[0] = in[0];
+ u[32] = in[1];
+ u[36] = in[9];
+ u[40] = in[5];
+ u[44] = in[13];
+ u[48] = in[3];
+ u[52] = in[11];
+ u[56] = in[7];
+ u[60] = in[15];
+ u[16] = in[2];
+ u[20] = in[10];
+ u[24] = in[6];
+ u[28] = in[14];
+ u[4] = in[8];
+ u[8] = in[4];
+ u[12] = in[12];
+
+ // stage 2: each loaded slot in u[32..63] produces two values through the
+ // single-input helper; the partner slot is written first, then the source
+ // slot itself is overwritten, so the two lines of a pair must stay in order.
+ u[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+ u[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ u[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ u[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ u[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ u[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ u[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ u[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ u[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ u[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ u[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ u[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ u[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ u[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ u[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ u[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+
+ // stage 3: same partner-then-source pattern for u[16..31], followed by
+ // plain copies that fill the still-empty slots of u[32..63].
+ u[31] = half_btf_0_sse4_1(&cospi2, &u[16], &rnding, bit);
+ u[16] = half_btf_0_sse4_1(&cospi62, &u[16], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &u[28], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &u[28], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &u[20], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &u[20], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &u[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &u[24], &rnding, bit);
+ u[33] = u[32];
+ u[34] = u[35];
+ u[37] = u[36];
+ u[38] = u[39];
+ u[41] = u[40];
+ u[42] = u[43];
+ u[45] = u[44];
+ u[46] = u[47];
+ u[49] = u[48];
+ u[50] = u[51];
+ u[53] = u[52];
+ u[54] = u[55];
+ u[57] = u[56];
+ u[58] = u[59];
+ u[61] = u[60];
+ u[62] = u[63];
+
+ // stage 4: single-input step for u[8..15], copies for u[16..31], and
+ // two-input half_btf updates for pairs inside u[32..63].
+ u[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+ u[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ u[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ u[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+
+ u[17] = u[16];
+ u[18] = u[19];
+ u[21] = u[20];
+ u[22] = u[23];
+ u[25] = u[24];
+ u[26] = u[27];
+ u[29] = u[28];
+ u[30] = u[31];
+ // Low-index outputs go to tmp1..tmp4 so the high-index outputs below still read the original inputs.
+ tmp1 = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ u[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ u[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+ u[33] = tmp1;
+ u[34] = tmp2;
+ u[37] = tmp3;
+ u[38] = tmp4;
+
+ tmp1 = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ u[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ u[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ u[41] = tmp1;
+ u[42] = tmp2;
+ u[45] = tmp3;
+ u[46] = tmp4;
+
+ // stage 5: single-input step for u[4]/u[7], copies for u[8..15], two-input
+ // updates inside u[16..31], then add/sub passes over u[32..63].
+ u[7] = half_btf_0_sse4_1(&cospi8, &u[4], &rnding, bit);
+ u[4] = half_btf_0_sse4_1(&cospi56, &u[4], &rnding, bit);
+
+ u[9] = u[8];
+ u[10] = u[11];
+ u[13] = u[12];
+ u[14] = u[15];
+
+ tmp1 = half_btf_sse4_1(&cospim8, &u[17], &cospi56, &u[30], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim56, &u[18], &cospim8, &u[29], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim40, &u[21], &cospi24, &u[26], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[22], &cospim40, &u[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &u[22], &cospi24, &u[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &u[21], &cospi40, &u[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &u[18], &cospi56, &u[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &u[17], &cospi8, &u[30], &rnding, bit);
+ u[17] = tmp1;
+ u[18] = tmp2;
+ u[21] = tmp3;
+ u[22] = tmp4;
+ // Per group of eight slots from u[32]: addsub on offsets (0,3), (1,2), (7,4), (6,5), written back in place.
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ // NOTE(review): the next two calls have identical arguments, so u[1] ends
+ // up equal to tmp1 (assuming half_btf_0_sse4_1 has no side effects).
+ tmp1 = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ u[0] = tmp1;
+ u[5] = u[4];
+ u[6] = u[7];
+
+ tmp1 = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ u[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+ u[9] = tmp1;
+ tmp2 = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ u[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ u[10] = tmp2;
+ // Same add/sub pairing as the stage 5 loop, now over the two groups of eight in u[16..31].
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+ // Two-input updates for u[34..37] paired with u[58..61]; tmp staging as in stage 4.
+ tmp1 = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ u[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ u[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ u[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ u[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+ u[34] = tmp1;
+ u[35] = tmp2;
+ u[36] = tmp3;
+ u[37] = tmp4;
+ // Likewise for u[42..45] paired with u[50..53].
+ tmp1 = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ u[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ u[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ u[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ u[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ u[42] = tmp1;
+ u[43] = tmp2;
+ u[44] = tmp3;
+ u[45] = tmp4;
+
+ // stage 7
+ u[3] = u[0];
+ u[2] = u[1];
+ tmp1 = half_btf_sse4_1(&cospim32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &u[5], &cospi32, &u[6], &rnding, bit);
+ u[5] = tmp1;
+ addsub_sse4_1(u[8], u[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[9], u[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[15], u[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[14], u[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+ // Two-input updates for u[18..21] paired with u[26..29].
+ tmp1 = half_btf_sse4_1(&cospim16, &u[18], &cospi48, &u[29], &rnding, bit);
+ tmp2 = half_btf_sse4_1(&cospim16, &u[19], &cospi48, &u[28], &rnding, bit);
+ tmp3 = half_btf_sse4_1(&cospim48, &u[20], &cospim16, &u[27], &rnding, bit);
+ tmp4 = half_btf_sse4_1(&cospim48, &u[21], &cospim16, &u[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &u[21], &cospi48, &u[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &u[20], &cospi48, &u[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &u[19], &cospi16, &u[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &u[18], &cospi16, &u[29], &rnding, bit);
+ u[18] = tmp1;
+ u[19] = tmp2;
+ u[20] = tmp3;
+ u[21] = tmp4;
+ // XOR indexing pairs j with j^7 and j^15 with j^8, e.g. (32,39) and (47,40) when j = 32.
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(u[j], u[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[j ^ 15], u[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8: the u[0..7] add/sub pairs (i, 7 - i) are done here; the helper
+ // below receives u plus the 16/32/48 constants for the rest of the stage.
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &u[i], &u[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ idct64_stage8_sse4_1(u, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rnding, bit);
+
+ // stage 9
+ idct64_stage9_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 10
+ idct64_stage10_sse4_1(u, &cospim32, &cospi32, &clamp_lo, &clamp_hi, &rnding,
+ bit);
+
+ // stage 11: the only call that is given `out`, together with do_cols, bd, out_shift and log_range
+ idct64_stage11_sse4_1(u, out, do_cols, bd, out_shift, log_range);
+ }
+}
+
+static void idct64x64_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ int i, j;
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+
+ const __m128i cospi1 = _mm_set1_epi32(cospi[1]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi3 = _mm_set1_epi32(cospi[3]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi5 = _mm_set1_epi32(cospi[5]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi7 = _mm_set1_epi32(cospi[7]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi9 = _mm_set1_epi32(cospi[9]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi11 = _mm_set1_epi32(cospi[11]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi13 = _mm_set1_epi32(cospi[13]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi15 = _mm_set1_epi32(cospi[15]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi17 = _mm_set1_epi32(cospi[17]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi19 = _mm_set1_epi32(cospi[19]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi21 = _mm_set1_epi32(cospi[21]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi23 = _mm_set1_epi32(cospi[23]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi25 = _mm_set1_epi32(cospi[25]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi27 = _mm_set1_epi32(cospi[27]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi29 = _mm_set1_epi32(cospi[29]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi31 = _mm_set1_epi32(cospi[31]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi35 = _mm_set1_epi32(cospi[35]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi39 = _mm_set1_epi32(cospi[39]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi43 = _mm_set1_epi32(cospi[43]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi47 = _mm_set1_epi32(cospi[47]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi51 = _mm_set1_epi32(cospi[51]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi55 = _mm_set1_epi32(cospi[55]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi59 = _mm_set1_epi32(cospi[59]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi63 = _mm_set1_epi32(cospi[63]);
+
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim12 = _mm_set1_epi32(-cospi[12]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim28 = _mm_set1_epi32(-cospi[28]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospim33 = _mm_set1_epi32(-cospi[33]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim37 = _mm_set1_epi32(-cospi[37]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim41 = _mm_set1_epi32(-cospi[41]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim44 = _mm_set1_epi32(-cospi[44]);
+ const __m128i cospim45 = _mm_set1_epi32(-cospi[45]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim49 = _mm_set1_epi32(-cospi[49]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim53 = _mm_set1_epi32(-cospi[53]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim57 = _mm_set1_epi32(-cospi[57]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim60 = _mm_set1_epi32(-cospi[60]);
+ const __m128i cospim61 = _mm_set1_epi32(-cospi[61]);
+
+ {
+ __m128i u[64], v[64];
+
+ // stage 1
+ u[32] = in[1];
+ u[34] = in[17];
+ u[36] = in[9];
+ u[38] = in[25];
+ u[40] = in[5];
+ u[42] = in[21];
+ u[44] = in[13];
+ u[46] = in[29];
+ u[48] = in[3];
+ u[50] = in[19];
+ u[52] = in[11];
+ u[54] = in[27];
+ u[56] = in[7];
+ u[58] = in[23];
+ u[60] = in[15];
+ u[62] = in[31];
+
+ v[16] = in[2];
+ v[18] = in[18];
+ v[20] = in[10];
+ v[22] = in[26];
+ v[24] = in[6];
+ v[26] = in[22];
+ v[28] = in[14];
+ v[30] = in[30];
+
+ u[8] = in[4];
+ u[10] = in[20];
+ u[12] = in[12];
+ u[14] = in[28];
+
+ v[4] = in[8];
+ v[6] = in[24];
+
+ u[0] = in[0];
+ u[2] = in[16];
+
+ // stage 2
+ v[32] = half_btf_0_sse4_1(&cospi63, &u[32], &rnding, bit);
+ v[33] = half_btf_0_sse4_1(&cospim33, &u[62], &rnding, bit);
+ v[34] = half_btf_0_sse4_1(&cospi47, &u[34], &rnding, bit);
+ v[35] = half_btf_0_sse4_1(&cospim49, &u[60], &rnding, bit);
+ v[36] = half_btf_0_sse4_1(&cospi55, &u[36], &rnding, bit);
+ v[37] = half_btf_0_sse4_1(&cospim41, &u[58], &rnding, bit);
+ v[38] = half_btf_0_sse4_1(&cospi39, &u[38], &rnding, bit);
+ v[39] = half_btf_0_sse4_1(&cospim57, &u[56], &rnding, bit);
+ v[40] = half_btf_0_sse4_1(&cospi59, &u[40], &rnding, bit);
+ v[41] = half_btf_0_sse4_1(&cospim37, &u[54], &rnding, bit);
+ v[42] = half_btf_0_sse4_1(&cospi43, &u[42], &rnding, bit);
+ v[43] = half_btf_0_sse4_1(&cospim53, &u[52], &rnding, bit);
+ v[44] = half_btf_0_sse4_1(&cospi51, &u[44], &rnding, bit);
+ v[45] = half_btf_0_sse4_1(&cospim45, &u[50], &rnding, bit);
+ v[46] = half_btf_0_sse4_1(&cospi35, &u[46], &rnding, bit);
+ v[47] = half_btf_0_sse4_1(&cospim61, &u[48], &rnding, bit);
+ v[48] = half_btf_0_sse4_1(&cospi3, &u[48], &rnding, bit);
+ v[49] = half_btf_0_sse4_1(&cospi29, &u[46], &rnding, bit);
+ v[50] = half_btf_0_sse4_1(&cospi19, &u[50], &rnding, bit);
+ v[51] = half_btf_0_sse4_1(&cospi13, &u[44], &rnding, bit);
+ v[52] = half_btf_0_sse4_1(&cospi11, &u[52], &rnding, bit);
+ v[53] = half_btf_0_sse4_1(&cospi21, &u[42], &rnding, bit);
+ v[54] = half_btf_0_sse4_1(&cospi27, &u[54], &rnding, bit);
+ v[55] = half_btf_0_sse4_1(&cospi5, &u[40], &rnding, bit);
+ v[56] = half_btf_0_sse4_1(&cospi7, &u[56], &rnding, bit);
+ v[57] = half_btf_0_sse4_1(&cospi25, &u[38], &rnding, bit);
+ v[58] = half_btf_0_sse4_1(&cospi23, &u[58], &rnding, bit);
+ v[59] = half_btf_0_sse4_1(&cospi9, &u[36], &rnding, bit);
+ v[60] = half_btf_0_sse4_1(&cospi15, &u[60], &rnding, bit);
+ v[61] = half_btf_0_sse4_1(&cospi17, &u[34], &rnding, bit);
+ v[62] = half_btf_0_sse4_1(&cospi31, &u[62], &rnding, bit);
+ v[63] = half_btf_0_sse4_1(&cospi1, &u[32], &rnding, bit);
+
+ // stage 3
+ u[16] = half_btf_0_sse4_1(&cospi62, &v[16], &rnding, bit);
+ u[17] = half_btf_0_sse4_1(&cospim34, &v[30], &rnding, bit);
+ u[18] = half_btf_0_sse4_1(&cospi46, &v[18], &rnding, bit);
+ u[19] = half_btf_0_sse4_1(&cospim50, &v[28], &rnding, bit);
+ u[20] = half_btf_0_sse4_1(&cospi54, &v[20], &rnding, bit);
+ u[21] = half_btf_0_sse4_1(&cospim42, &v[26], &rnding, bit);
+ u[22] = half_btf_0_sse4_1(&cospi38, &v[22], &rnding, bit);
+ u[23] = half_btf_0_sse4_1(&cospim58, &v[24], &rnding, bit);
+ u[24] = half_btf_0_sse4_1(&cospi6, &v[24], &rnding, bit);
+ u[25] = half_btf_0_sse4_1(&cospi26, &v[22], &rnding, bit);
+ u[26] = half_btf_0_sse4_1(&cospi22, &v[26], &rnding, bit);
+ u[27] = half_btf_0_sse4_1(&cospi10, &v[20], &rnding, bit);
+ u[28] = half_btf_0_sse4_1(&cospi14, &v[28], &rnding, bit);
+ u[29] = half_btf_0_sse4_1(&cospi18, &v[18], &rnding, bit);
+ u[30] = half_btf_0_sse4_1(&cospi30, &v[30], &rnding, bit);
+ u[31] = half_btf_0_sse4_1(&cospi2, &v[16], &rnding, bit);
+
+ for (i = 32; i < 64; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 4
+ v[8] = half_btf_0_sse4_1(&cospi60, &u[8], &rnding, bit);
+ v[9] = half_btf_0_sse4_1(&cospim36, &u[14], &rnding, bit);
+ v[10] = half_btf_0_sse4_1(&cospi44, &u[10], &rnding, bit);
+ v[11] = half_btf_0_sse4_1(&cospim52, &u[12], &rnding, bit);
+ v[12] = half_btf_0_sse4_1(&cospi12, &u[12], &rnding, bit);
+ v[13] = half_btf_0_sse4_1(&cospi20, &u[10], &rnding, bit);
+ v[14] = half_btf_0_sse4_1(&cospi28, &u[14], &rnding, bit);
+ v[15] = half_btf_0_sse4_1(&cospi4, &u[8], &rnding, bit);
+
+ for (i = 16; i < 32; i += 4) {
+ addsub_sse4_1(u[i + 0], u[i + 1], &v[i + 0], &v[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 3], u[i + 2], &v[i + 3], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[33] = half_btf_sse4_1(&cospim4, &u[33], &cospi60, &u[62], &rnding, bit);
+ v[34] = half_btf_sse4_1(&cospim60, &u[34], &cospim4, &u[61], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim36, &u[37], &cospi28, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim28, &u[38], &cospim36, &u[57], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim20, &u[41], &cospi44, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim44, &u[42], &cospim20, &u[53], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim52, &u[45], &cospi12, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim12, &u[46], &cospim52, &u[49], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospim52, &u[46], &cospi12, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi12, &u[45], &cospi52, &u[50], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim20, &u[42], &cospi44, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi44, &u[41], &cospi20, &u[54], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospim36, &u[38], &cospi28, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi28, &u[37], &cospi36, &u[58], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospim4, &u[34], &cospi60, &u[61], &rnding, bit);
+ v[62] = half_btf_sse4_1(&cospi60, &u[33], &cospi4, &u[62], &rnding, bit);
+
+ // stage 5
+ u[4] = half_btf_0_sse4_1(&cospi56, &v[4], &rnding, bit);
+ u[5] = half_btf_0_sse4_1(&cospim40, &v[6], &rnding, bit);
+ u[6] = half_btf_0_sse4_1(&cospi24, &v[6], &rnding, bit);
+ u[7] = half_btf_0_sse4_1(&cospi8, &v[4], &rnding, bit);
+
+ for (i = 8; i < 16; i += 4) {
+ addsub_sse4_1(v[i + 0], v[i + 1], &u[i + 0], &u[i + 1], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 3], v[i + 2], &u[i + 3], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 16; i < 32; i += 4) {
+ u[i + 0] = v[i + 0];
+ u[i + 3] = v[i + 3];
+ }
+
+ u[17] = half_btf_sse4_1(&cospim8, &v[17], &cospi56, &v[30], &rnding, bit);
+ u[18] = half_btf_sse4_1(&cospim56, &v[18], &cospim8, &v[29], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim40, &v[21], &cospi24, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim24, &v[22], &cospim40, &v[25], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospim40, &v[22], &cospi24, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi24, &v[21], &cospi40, &v[26], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospim8, &v[18], &cospi56, &v[29], &rnding, bit);
+ u[30] = half_btf_sse4_1(&cospi56, &v[17], &cospi8, &v[30], &rnding, bit);
+
+ for (i = 32; i < 64; i += 8) {
+ addsub_sse4_1(v[i + 0], v[i + 3], &u[i + 0], &u[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 1], v[i + 2], &u[i + 1], &u[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(v[i + 7], v[i + 4], &u[i + 7], &u[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(v[i + 6], v[i + 5], &u[i + 6], &u[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ // stage 6
+ v[0] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[1] = half_btf_0_sse4_1(&cospi32, &u[0], &rnding, bit);
+ v[2] = half_btf_0_sse4_1(&cospi48, &u[2], &rnding, bit);
+ v[3] = half_btf_0_sse4_1(&cospi16, &u[2], &rnding, bit);
+
+ addsub_sse4_1(u[4], u[5], &v[4], &v[5], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[7], u[6], &v[7], &v[6], &clamp_lo, &clamp_hi);
+
+ for (i = 8; i < 16; i += 4) {
+ v[i + 0] = u[i + 0];
+ v[i + 3] = u[i + 3];
+ }
+
+ v[9] = half_btf_sse4_1(&cospim16, &u[9], &cospi48, &u[14], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospim48, &u[10], &cospim16, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospim16, &u[10], &cospi48, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi48, &u[9], &cospi16, &u[14], &rnding, bit);
+
+ for (i = 16; i < 32; i += 8) {
+ addsub_sse4_1(u[i + 0], u[i + 3], &v[i + 0], &v[i + 3], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 1], u[i + 2], &v[i + 1], &v[i + 2], &clamp_lo,
+ &clamp_hi);
+
+ addsub_sse4_1(u[i + 7], u[i + 4], &v[i + 7], &v[i + 4], &clamp_lo,
+ &clamp_hi);
+ addsub_sse4_1(u[i + 6], u[i + 5], &v[i + 6], &v[i + 5], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 64; i += 8) {
+ v[i + 0] = u[i + 0];
+ v[i + 1] = u[i + 1];
+ v[i + 6] = u[i + 6];
+ v[i + 7] = u[i + 7];
+ }
+
+ v[34] = half_btf_sse4_1(&cospim8, &u[34], &cospi56, &u[61], &rnding, bit);
+ v[35] = half_btf_sse4_1(&cospim8, &u[35], &cospi56, &u[60], &rnding, bit);
+ v[36] = half_btf_sse4_1(&cospim56, &u[36], &cospim8, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim56, &u[37], &cospim8, &u[58], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim40, &u[42], &cospi24, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim40, &u[43], &cospi24, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim24, &u[44], &cospim40, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim24, &u[45], &cospim40, &u[50], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospim40, &u[45], &cospi24, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospim40, &u[44], &cospi24, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi24, &u[43], &cospi40, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi24, &u[42], &cospi40, &u[53], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospim8, &u[37], &cospi56, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospim8, &u[36], &cospi56, &u[59], &rnding, bit);
+ v[60] = half_btf_sse4_1(&cospi56, &u[35], &cospi8, &u[60], &rnding, bit);
+ v[61] = half_btf_sse4_1(&cospi56, &u[34], &cospi8, &u[61], &rnding, bit);
+
+ // stage 7
+ addsub_sse4_1(v[0], v[3], &u[0], &u[3], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[1], v[2], &u[1], &u[2], &clamp_lo, &clamp_hi);
+
+ u[4] = v[4];
+ u[7] = v[7];
+ u[5] = half_btf_sse4_1(&cospim32, &v[5], &cospi32, &v[6], &rnding, bit);
+ u[6] = half_btf_sse4_1(&cospi32, &v[5], &cospi32, &v[6], &rnding, bit);
+
+ addsub_sse4_1(v[8], v[11], &u[8], &u[11], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[9], v[10], &u[9], &u[10], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[15], v[12], &u[15], &u[12], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[14], v[13], &u[14], &u[13], &clamp_lo, &clamp_hi);
+
+ for (i = 16; i < 32; i += 8) {
+ u[i + 0] = v[i + 0];
+ u[i + 1] = v[i + 1];
+ u[i + 6] = v[i + 6];
+ u[i + 7] = v[i + 7];
+ }
+
+ u[18] = half_btf_sse4_1(&cospim16, &v[18], &cospi48, &v[29], &rnding, bit);
+ u[19] = half_btf_sse4_1(&cospim16, &v[19], &cospi48, &v[28], &rnding, bit);
+ u[20] = half_btf_sse4_1(&cospim48, &v[20], &cospim16, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim48, &v[21], &cospim16, &v[26], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospim16, &v[21], &cospi48, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospim16, &v[20], &cospi48, &v[27], &rnding, bit);
+ u[28] = half_btf_sse4_1(&cospi48, &v[19], &cospi16, &v[28], &rnding, bit);
+ u[29] = half_btf_sse4_1(&cospi48, &v[18], &cospi16, &v[29], &rnding, bit);
+
+ for (i = 32; i < 64; i += 16) {
+ for (j = i; j < i + 4; j++) {
+ addsub_sse4_1(v[j], v[j ^ 7], &u[j], &u[j ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(v[j ^ 15], v[j ^ 8], &u[j ^ 15], &u[j ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+ }
+
+ // stage 8
+ for (i = 0; i < 4; ++i) {
+ addsub_sse4_1(u[i], u[7 - i], &v[i], &v[7 - i], &clamp_lo, &clamp_hi);
+ }
+
+ v[8] = u[8];
+ v[9] = u[9];
+ v[14] = u[14];
+ v[15] = u[15];
+
+ v[10] = half_btf_sse4_1(&cospim32, &u[10], &cospi32, &u[13], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospim32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi32, &u[11], &cospi32, &u[12], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi32, &u[10], &cospi32, &u[13], &rnding, bit);
+
+ for (i = 16; i < 20; ++i) {
+ addsub_sse4_1(u[i], u[i ^ 7], &v[i], &v[i ^ 7], &clamp_lo, &clamp_hi);
+ addsub_sse4_1(u[i ^ 15], u[i ^ 8], &v[i ^ 15], &v[i ^ 8], &clamp_lo,
+ &clamp_hi);
+ }
+
+ for (i = 32; i < 36; ++i) {
+ v[i] = u[i];
+ v[i + 12] = u[i + 12];
+ v[i + 16] = u[i + 16];
+ v[i + 28] = u[i + 28];
+ }
+
+ v[36] = half_btf_sse4_1(&cospim16, &u[36], &cospi48, &u[59], &rnding, bit);
+ v[37] = half_btf_sse4_1(&cospim16, &u[37], &cospi48, &u[58], &rnding, bit);
+ v[38] = half_btf_sse4_1(&cospim16, &u[38], &cospi48, &u[57], &rnding, bit);
+ v[39] = half_btf_sse4_1(&cospim16, &u[39], &cospi48, &u[56], &rnding, bit);
+ v[40] = half_btf_sse4_1(&cospim48, &u[40], &cospim16, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim48, &u[41], &cospim16, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim48, &u[42], &cospim16, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim48, &u[43], &cospim16, &u[52], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospim16, &u[43], &cospi48, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospim16, &u[42], &cospi48, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospim16, &u[41], &cospi48, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospim16, &u[40], &cospi48, &u[55], &rnding, bit);
+ v[56] = half_btf_sse4_1(&cospi48, &u[39], &cospi16, &u[56], &rnding, bit);
+ v[57] = half_btf_sse4_1(&cospi48, &u[38], &cospi16, &u[57], &rnding, bit);
+ v[58] = half_btf_sse4_1(&cospi48, &u[37], &cospi16, &u[58], &rnding, bit);
+ v[59] = half_btf_sse4_1(&cospi48, &u[36], &cospi16, &u[59], &rnding, bit);
+
+ // stage 9
+ for (i = 0; i < 8; ++i) {
+ addsub_sse4_1(v[i], v[15 - i], &u[i], &u[15 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 16; i < 20; ++i) {
+ u[i] = v[i];
+ u[i + 12] = v[i + 12];
+ }
+
+ u[20] = half_btf_sse4_1(&cospim32, &v[20], &cospi32, &v[27], &rnding, bit);
+ u[21] = half_btf_sse4_1(&cospim32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[22] = half_btf_sse4_1(&cospim32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[23] = half_btf_sse4_1(&cospim32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[24] = half_btf_sse4_1(&cospi32, &v[23], &cospi32, &v[24], &rnding, bit);
+ u[25] = half_btf_sse4_1(&cospi32, &v[22], &cospi32, &v[25], &rnding, bit);
+ u[26] = half_btf_sse4_1(&cospi32, &v[21], &cospi32, &v[26], &rnding, bit);
+ u[27] = half_btf_sse4_1(&cospi32, &v[20], &cospi32, &v[27], &rnding, bit);
+
+ for (i = 32; i < 40; i++) {
+ addsub_sse4_1(v[i], v[i ^ 15], &u[i], &u[i ^ 15], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 48; i < 56; i++) {
+ addsub_sse4_1(v[i ^ 15], v[i], &u[i ^ 15], &u[i], &clamp_lo, &clamp_hi);
+ }
+
+ // stage 10
+ for (i = 0; i < 16; i++) {
+ addsub_sse4_1(u[i], u[31 - i], &v[i], &v[31 - i], &clamp_lo, &clamp_hi);
+ }
+
+ for (i = 32; i < 40; i++) v[i] = u[i];
+
+ v[40] = half_btf_sse4_1(&cospim32, &u[40], &cospi32, &u[55], &rnding, bit);
+ v[41] = half_btf_sse4_1(&cospim32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[42] = half_btf_sse4_1(&cospim32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[43] = half_btf_sse4_1(&cospim32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[44] = half_btf_sse4_1(&cospim32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[45] = half_btf_sse4_1(&cospim32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[46] = half_btf_sse4_1(&cospim32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[47] = half_btf_sse4_1(&cospim32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[48] = half_btf_sse4_1(&cospi32, &u[47], &cospi32, &u[48], &rnding, bit);
+ v[49] = half_btf_sse4_1(&cospi32, &u[46], &cospi32, &u[49], &rnding, bit);
+ v[50] = half_btf_sse4_1(&cospi32, &u[45], &cospi32, &u[50], &rnding, bit);
+ v[51] = half_btf_sse4_1(&cospi32, &u[44], &cospi32, &u[51], &rnding, bit);
+ v[52] = half_btf_sse4_1(&cospi32, &u[43], &cospi32, &u[52], &rnding, bit);
+ v[53] = half_btf_sse4_1(&cospi32, &u[42], &cospi32, &u[53], &rnding, bit);
+ v[54] = half_btf_sse4_1(&cospi32, &u[41], &cospi32, &u[54], &rnding, bit);
+ v[55] = half_btf_sse4_1(&cospi32, &u[40], &cospi32, &u[55], &rnding, bit);
+
+ for (i = 56; i < 64; i++) v[i] = u[i];
+
+ // stage 11
+ if (do_cols) {
+ for (i = 0; i < 32; i++) {
+ addsub_no_clamp_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)]);
+ }
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ for (i = 0; i < 32; i++) {
+ addsub_shift_sse4_1(v[i], v[63 - i], &out[(i)], &out[(63 - i)],
+ &clamp_lo_out, &clamp_hi_out, out_shift);
+ }
+ }
+ }
+}
+
+// Single-coefficient 32-point inverse DCT: only in[0] is read, and the one
+// resulting vector is replicated into out[0]..out[31].
+//   bit       - selects the cosine table (cospi_arr) and the rounding term
+//               1 << (bit - 1).
+//   do_cols   - nonzero: clamp to the intermediate range and skip the output
+//               shift; zero: add the rounding offset, shift right by
+//               out_shift, then clamp to bounds derived from bd and
+//               out_shift.
+//   bd        - feeds the clamp range computations.
+//   out_shift - right shift applied only when do_cols is zero.
+// NOTE(review): in the !do_cols path the second candidate for the upper
+// bound is (1 << n), not (1 << n) - 1; it is kept exactly as found.
+static void idct32x32_low1_sse4_1(__m128i *in, __m128i *out, int bit,
+                                  int do_cols, int bd, int out_shift) {
+  const int32_t *cospi = cospi_arr(bit);
+  const __m128i w32 = _mm_set1_epi32(cospi[32]);
+  const __m128i half = _mm_set1_epi32(1 << (bit - 1));
+  const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+  __m128i dc = in[0];
+  int lo, hi, k;
+
+  // Stages 1-5 reduce to a single half_btf_0_sse4_1 call with cospi[32].
+  dc = half_btf_0_sse4_1(&w32, &dc, &half, bit);
+
+  // Stages 6-9: choose the clamp bounds; when do_cols is zero also apply the
+  // output shift before clamping.
+  if (do_cols) {
+    lo = -(1 << (log_range - 1));
+    hi = (1 << (log_range - 1)) - 1;
+  } else {
+    const int log_range_out = AOMMAX(16, bd + 6);
+    lo = AOMMAX(-(1 << (log_range_out - 1)),
+                -(1 << (log_range - 1 - out_shift)));
+    hi = AOMMIN((1 << (log_range_out - 1)) - 1,
+                (1 << (log_range - 1 - out_shift)));
+
+    // Add half of 1 << out_shift, then arithmetic shift right by out_shift.
+    dc = _mm_add_epi32(dc, _mm_set1_epi32((1 << out_shift) >> 1));
+    dc = _mm_sra_epi32(dc, _mm_cvtsi32_si128(out_shift));
+  }
+  dc = _mm_min_epi32(_mm_max_epi32(dc, _mm_set1_epi32(lo)),
+                     _mm_set1_epi32(hi));
+
+  // Every one of the 32 outputs receives the same vector.
+  for (k = 0; k < 32; ++k) out[k] = dc;
+}
+
+static void idct32x32_low8_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) { /*
+ * 32-point inverse DCT pass that reads only in[0]..in[7]; no other input
+ * element is referenced in this function. Presumably callers select this
+ * variant when the remaining coefficients are zero -- confirm at the call
+ * site.
+ *
+ * The 32-entry work buffer bf1[] is updated in place, so statement order
+ * matters: each source slot is read before it is overwritten.
+ *
+ * in: input vectors; elements 0..7 are read.
+ * out: destination, passed to idct32_stage9_sse4_1.
+ * bit: selects the cosine table via cospi_arr() and the rounding constant
+ * 1 << (bit - 1).
+ * do_cols: picks the intermediate clamp width (bd + 6 when nonzero, bd + 8
+ * otherwise, never below 16) and is forwarded to stage 9.
+ * bd: used for the clamp width and forwarded to stage 9.
+ * out_shift: forwarded to idct32_stage9_sse4_1.
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); // "cospim" = negated table entry
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); // at least 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1: place the eight inputs into every fourth slot of bf1[]
+ bf1[0] = in[0];
+ bf1[4] = in[4];
+ bf1[8] = in[2];
+ bf1[12] = in[6];
+ bf1[16] = in[1];
+ bf1[20] = in[5];
+ bf1[24] = in[3];
+ bf1[28] = in[7];
+
+ // stage 2: each loaded slot in 16..28 yields two results; the partner slot is written first because the second line overwrites the source
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3: same read-then-overwrite pattern for slots 8 and 12
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+ bf1[17] = bf1[16]; // the remaining slots in 16..31 are plain copies of a neighbour
+ bf1[18] = bf1[19];
+ bf1[21] = bf1[20];
+ bf1[22] = bf1[23];
+ bf1[25] = bf1[24];
+ bf1[26] = bf1[27];
+ bf1[29] = bf1[28];
+ bf1[30] = bf1[31];
+
+ // stage 4: slot 4 yields slots 7 and 4; slots 9, 10, 13, 14 are copies
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+
+ bf1[9] = bf1[8];
+ bf1[10] = bf1[11];
+ bf1[13] = bf1[12];
+ bf1[14] = bf1[15];
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5: slot 0 is processed with cospi32 and duplicated into slot 1; slots 5 and 6 are copies
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[5] = bf1[4];
+ bf1[6] = bf1[7];
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6: slots 3 and 2 are copies of slots 0 and 1
+ bf1[3] = bf1[0];
+ bf1[2] = bf1[1];
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9: the only call in this function that receives out
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_low16_sse4_1(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift) { /*
+ * 32-point inverse DCT pass that reads only in[0]..in[15]; no other input
+ * element is referenced in this function. Presumably callers select this
+ * variant when the remaining coefficients are zero -- confirm at the call
+ * site.
+ *
+ * The 32-entry work buffer bf1[] is updated in place, so statement order
+ * matters: each source slot is read before it is overwritten, and the
+ * addsub_sse4_1 calls take their two operands by value while writing the
+ * results back to the same two slots.
+ *
+ * in: input vectors; elements 0..15 are read.
+ * out: destination, passed to idct32_stage9_sse4_1.
+ * bit: selects the cosine table via cospi_arr() and the rounding constant
+ * 1 << (bit - 1).
+ * do_cols: picks the intermediate clamp width (bd + 6 when nonzero, bd + 8
+ * otherwise, never below 16) and is forwarded to stage 9.
+ * bd: used for the clamp width and forwarded to stage 9.
+ * out_shift: forwarded to idct32_stage9_sse4_1.
+ */
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]); // "cospim" = negated table entry
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8)); // at least 16
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32];
+
+ // stage 0
+ // stage 1: place the sixteen inputs into the even slots of bf1[]
+
+ bf1[0] = in[0];
+ bf1[2] = in[8];
+ bf1[4] = in[4];
+ bf1[6] = in[12];
+ bf1[8] = in[2];
+ bf1[10] = in[10];
+ bf1[12] = in[6];
+ bf1[14] = in[14];
+ bf1[16] = in[1];
+ bf1[18] = in[9];
+ bf1[20] = in[5];
+ bf1[22] = in[13];
+ bf1[24] = in[3];
+ bf1[26] = in[11];
+ bf1[28] = in[7];
+ bf1[30] = in[15];
+
+ // stage 2: each even slot in 16..30 yields two results; the odd partner slot is written first because the second line overwrites the source
+ bf1[31] = half_btf_0_sse4_1(&cospi2, &bf1[16], &rounding, bit);
+ bf1[16] = half_btf_0_sse4_1(&cospi62, &bf1[16], &rounding, bit);
+ bf1[17] = half_btf_0_sse4_1(&cospim34, &bf1[30], &rounding, bit);
+ bf1[30] = half_btf_0_sse4_1(&cospi30, &bf1[30], &rounding, bit);
+ bf1[29] = half_btf_0_sse4_1(&cospi18, &bf1[18], &rounding, bit);
+ bf1[18] = half_btf_0_sse4_1(&cospi46, &bf1[18], &rounding, bit);
+ bf1[19] = half_btf_0_sse4_1(&cospim50, &bf1[28], &rounding, bit);
+ bf1[28] = half_btf_0_sse4_1(&cospi14, &bf1[28], &rounding, bit);
+ bf1[27] = half_btf_0_sse4_1(&cospi10, &bf1[20], &rounding, bit);
+ bf1[20] = half_btf_0_sse4_1(&cospi54, &bf1[20], &rounding, bit);
+ bf1[21] = half_btf_0_sse4_1(&cospim42, &bf1[26], &rounding, bit);
+ bf1[26] = half_btf_0_sse4_1(&cospi22, &bf1[26], &rounding, bit);
+ bf1[25] = half_btf_0_sse4_1(&cospi26, &bf1[22], &rounding, bit);
+ bf1[22] = half_btf_0_sse4_1(&cospi38, &bf1[22], &rounding, bit);
+ bf1[23] = half_btf_0_sse4_1(&cospim58, &bf1[24], &rounding, bit);
+ bf1[24] = half_btf_0_sse4_1(&cospi6, &bf1[24], &rounding, bit);
+
+ // stage 3: same read-then-overwrite pattern for the even slots in 8..14
+ bf1[15] = half_btf_0_sse4_1(&cospi4, &bf1[8], &rounding, bit);
+ bf1[8] = half_btf_0_sse4_1(&cospi60, &bf1[8], &rounding, bit);
+ bf1[9] = half_btf_0_sse4_1(&cospim36, &bf1[14], &rounding, bit);
+ bf1[14] = half_btf_0_sse4_1(&cospi28, &bf1[14], &rounding, bit);
+ bf1[13] = half_btf_0_sse4_1(&cospi20, &bf1[10], &rounding, bit);
+ bf1[10] = half_btf_0_sse4_1(&cospi44, &bf1[10], &rounding, bit);
+ bf1[11] = half_btf_0_sse4_1(&cospim52, &bf1[12], &rounding, bit);
+ bf1[12] = half_btf_0_sse4_1(&cospi12, &bf1[12], &rounding, bit);
+
+ addsub_sse4_1(bf1[16], bf1[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi); // in place on adjacent pairs of 16..31
+ addsub_sse4_1(bf1[19], bf1[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[20], bf1[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[23], bf1[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[24], bf1[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[27], bf1[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[28], bf1[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[31], bf1[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+ // stage 4: slots 4 and 6 each yield two results, then in-place addsub on pairs of 8..15
+ bf1[7] = half_btf_0_sse4_1(&cospi8, &bf1[4], &rounding, bit);
+ bf1[4] = half_btf_0_sse4_1(&cospi56, &bf1[4], &rounding, bit);
+ bf1[5] = half_btf_0_sse4_1(&cospim40, &bf1[6], &rounding, bit);
+ bf1[6] = half_btf_0_sse4_1(&cospi24, &bf1[6], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf1 + 8, bf1 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf1 + 11, bf1 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf1 + 12, bf1 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf1 + 15, bf1 + 14, &clamp_lo, &clamp_hi);
+
+ idct32_stage4_sse4_1(bf1, &cospim8, &cospi56, &cospi8, &cospim56, &cospim40,
+ &cospi24, &cospi40, &cospim24, &rounding, bit);
+
+ // stage 5: slot 0 is processed with cospi32 and duplicated into slot 1; slot 2 yields slots 3 and 2
+ bf1[0] = half_btf_0_sse4_1(&cospi32, &bf1[0], &rounding, bit);
+ bf1[1] = bf1[0];
+ bf1[3] = half_btf_0_sse4_1(&cospi16, &bf1[2], &rounding, bit);
+ bf1[2] = half_btf_0_sse4_1(&cospi48, &bf1[2], &rounding, bit);
+
+ addsub_sse4_1(bf1[4], bf1[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+
+ idct32_stage5_sse4_1(bf1, &cospim16, &cospi48, &cospi16, &cospim48, &clamp_lo,
+ &clamp_hi, &rounding, bit);
+
+ // stage 6: in-place addsub on slot pairs (0,3) and (1,2)
+ addsub_sse4_1(bf1[0], bf1[3], bf1 + 0, bf1 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf1 + 1, bf1 + 2, &clamp_lo, &clamp_hi);
+
+ idct32_stage6_sse4_1(bf1, &cospim32, &cospi32, &cospim16, &cospi48, &cospi16,
+ &cospim48, &clamp_lo, &clamp_hi, &rounding, bit);
+
+ // stage 7
+ idct32_stage7_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 8
+ idct32_stage8_sse4_1(bf1, &cospim32, &cospi32, &clamp_lo, &clamp_hi,
+ &rounding, bit);
+
+ // stage 9: the only call in this function that receives out
+ idct32_stage9_sse4_1(bf1, out, do_cols, bd, out_shift, log_range);
+}
+
+static void idct32x32_sse4_1(__m128i *in, __m128i *out, int bit, int do_cols,
+ int bd, int out_shift) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i rounding = _mm_set1_epi32(1 << (bit - 1));
+ const int log_range = AOMMAX(16, bd + (do_cols ? 6 : 8));
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ __m128i bf1[32], bf0[32];
+
+ // stage 0
+ // stage 1
+ bf1[0] = in[0];
+ bf1[1] = in[16];
+ bf1[2] = in[8];
+ bf1[3] = in[24];
+ bf1[4] = in[4];
+ bf1[5] = in[20];
+ bf1[6] = in[12];
+ bf1[7] = in[28];
+ bf1[8] = in[2];
+ bf1[9] = in[18];
+ bf1[10] = in[10];
+ bf1[11] = in[26];
+ bf1[12] = in[6];
+ bf1[13] = in[22];
+ bf1[14] = in[14];
+ bf1[15] = in[30];
+ bf1[16] = in[1];
+ bf1[17] = in[17];
+ bf1[18] = in[9];
+ bf1[19] = in[25];
+ bf1[20] = in[5];
+ bf1[21] = in[21];
+ bf1[22] = in[13];
+ bf1[23] = in[29];
+ bf1[24] = in[3];
+ bf1[25] = in[19];
+ bf1[26] = in[11];
+ bf1[27] = in[27];
+ bf1[28] = in[7];
+ bf1[29] = in[23];
+ bf1[30] = in[15];
+ bf1[31] = in[31];
+
+ // stage 2
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] = bf1[4];
+ bf0[5] = bf1[5];
+ bf0[6] = bf1[6];
+ bf0[7] = bf1[7];
+ bf0[8] = bf1[8];
+ bf0[9] = bf1[9];
+ bf0[10] = bf1[10];
+ bf0[11] = bf1[11];
+ bf0[12] = bf1[12];
+ bf0[13] = bf1[13];
+ bf0[14] = bf1[14];
+ bf0[15] = bf1[15];
+ bf0[16] =
+ half_btf_sse4_1(&cospi62, &bf1[16], &cospim2, &bf1[31], &rounding, bit);
+ bf0[17] =
+ half_btf_sse4_1(&cospi30, &bf1[17], &cospim34, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospi46, &bf1[18], &cospim18, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospi14, &bf1[19], &cospim50, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospi54, &bf1[20], &cospim10, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospi22, &bf1[21], &cospim42, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospi38, &bf1[22], &cospim26, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospi6, &bf1[23], &cospim58, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi58, &bf1[23], &cospi6, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi26, &bf1[22], &cospi38, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi42, &bf1[21], &cospi22, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi10, &bf1[20], &cospi54, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi50, &bf1[19], &cospi14, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi18, &bf1[18], &cospi46, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi34, &bf1[17], &cospi30, &bf1[30], &rounding, bit);
+ bf0[31] =
+ half_btf_sse4_1(&cospi2, &bf1[16], &cospi62, &bf1[31], &rounding, bit);
+
+ // stage 3
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] =
+ half_btf_sse4_1(&cospi60, &bf0[8], &cospim4, &bf0[15], &rounding, bit);
+ bf1[9] =
+ half_btf_sse4_1(&cospi28, &bf0[9], &cospim36, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospi44, &bf0[10], &cospim20, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospi12, &bf0[11], &cospim52, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi52, &bf0[11], &cospi12, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi20, &bf0[10], &cospi44, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi36, &bf0[9], &cospi28, &bf0[14], &rounding, bit);
+ bf1[15] =
+ half_btf_sse4_1(&cospi4, &bf0[8], &cospi60, &bf0[15], &rounding, bit);
+
+ addsub_sse4_1(bf0[16], bf0[17], bf1 + 16, bf1 + 17, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[18], bf1 + 19, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[20], bf0[21], bf1 + 20, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[22], bf1 + 23, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[25], bf1 + 24, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[27], bf0[26], bf1 + 27, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[29], bf1 + 28, bf1 + 29, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[30], bf1 + 31, bf1 + 30, &clamp_lo, &clamp_hi);
+
+ // stage 4
+ bf0[0] = bf1[0];
+ bf0[1] = bf1[1];
+ bf0[2] = bf1[2];
+ bf0[3] = bf1[3];
+ bf0[4] =
+ half_btf_sse4_1(&cospi56, &bf1[4], &cospim8, &bf1[7], &rounding, bit);
+ bf0[5] =
+ half_btf_sse4_1(&cospi24, &bf1[5], &cospim40, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi40, &bf1[5], &cospi24, &bf1[6], &rounding, bit);
+ bf0[7] = half_btf_sse4_1(&cospi8, &bf1[4], &cospi56, &bf1[7], &rounding, bit);
+
+ addsub_sse4_1(bf1[8], bf1[9], bf0 + 8, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[11], bf1[10], bf0 + 11, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[12], bf1[13], bf0 + 12, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[14], bf0 + 15, bf0 + 14, &clamp_lo, &clamp_hi);
+
+ bf0[16] = bf1[16];
+ bf0[17] =
+ half_btf_sse4_1(&cospim8, &bf1[17], &cospi56, &bf1[30], &rounding, bit);
+ bf0[18] =
+ half_btf_sse4_1(&cospim56, &bf1[18], &cospim8, &bf1[29], &rounding, bit);
+ bf0[19] = bf1[19];
+ bf0[20] = bf1[20];
+ bf0[21] =
+ half_btf_sse4_1(&cospim40, &bf1[21], &cospi24, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim24, &bf1[22], &cospim40, &bf1[25], &rounding, bit);
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] =
+ half_btf_sse4_1(&cospim40, &bf1[22], &cospi24, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi24, &bf1[21], &cospi40, &bf1[26], &rounding, bit);
+ bf0[27] = bf1[27];
+ bf0[28] = bf1[28];
+ bf0[29] =
+ half_btf_sse4_1(&cospim8, &bf1[18], &cospi56, &bf1[29], &rounding, bit);
+ bf0[30] =
+ half_btf_sse4_1(&cospi56, &bf1[17], &cospi8, &bf1[30], &rounding, bit);
+ bf0[31] = bf1[31];
+
+ // stage 5
+ bf1[0] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospi32, &bf0[1], &rounding, bit);
+ bf1[1] =
+ half_btf_sse4_1(&cospi32, &bf0[0], &cospim32, &bf0[1], &rounding, bit);
+ bf1[2] =
+ half_btf_sse4_1(&cospi48, &bf0[2], &cospim16, &bf0[3], &rounding, bit);
+ bf1[3] =
+ half_btf_sse4_1(&cospi16, &bf0[2], &cospi48, &bf0[3], &rounding, bit);
+ addsub_sse4_1(bf0[4], bf0[5], bf1 + 4, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[7], bf0[6], bf1 + 7, bf1 + 6, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] =
+ half_btf_sse4_1(&cospim16, &bf0[9], &cospi48, &bf0[14], &rounding, bit);
+ bf1[10] =
+ half_btf_sse4_1(&cospim48, &bf0[10], &cospim16, &bf0[13], &rounding, bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] =
+ half_btf_sse4_1(&cospim16, &bf0[10], &cospi48, &bf0[13], &rounding, bit);
+ bf1[14] =
+ half_btf_sse4_1(&cospi48, &bf0[9], &cospi16, &bf0[14], &rounding, bit);
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[19], bf1 + 16, bf1 + 19, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[18], bf1 + 17, bf1 + 18, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[23], bf0[20], bf1 + 23, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[22], bf0[21], bf1 + 22, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[24], bf0[27], bf1 + 24, bf1 + 27, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[25], bf0[26], bf1 + 25, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[28], bf1 + 31, bf1 + 28, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[29], bf1 + 30, bf1 + 29, &clamp_lo, &clamp_hi);
+
+ // stage 6
+ addsub_sse4_1(bf1[0], bf1[3], bf0 + 0, bf0 + 3, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[2], bf0 + 1, bf0 + 2, &clamp_lo, &clamp_hi);
+ bf0[4] = bf1[4];
+ bf0[5] =
+ half_btf_sse4_1(&cospim32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[6] =
+ half_btf_sse4_1(&cospi32, &bf1[5], &cospi32, &bf1[6], &rounding, bit);
+ bf0[7] = bf1[7];
+ addsub_sse4_1(bf1[8], bf1[11], bf0 + 8, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[9], bf1[10], bf0 + 9, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[15], bf1[12], bf0 + 15, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[14], bf1[13], bf0 + 14, bf0 + 13, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] =
+ half_btf_sse4_1(&cospim16, &bf1[18], &cospi48, &bf1[29], &rounding, bit);
+ bf0[19] =
+ half_btf_sse4_1(&cospim16, &bf1[19], &cospi48, &bf1[28], &rounding, bit);
+ bf0[20] =
+ half_btf_sse4_1(&cospim48, &bf1[20], &cospim16, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim48, &bf1[21], &cospim16, &bf1[26], &rounding, bit);
+ bf0[22] = bf1[22];
+ bf0[23] = bf1[23];
+ bf0[24] = bf1[24];
+ bf0[25] = bf1[25];
+ bf0[26] =
+ half_btf_sse4_1(&cospim16, &bf1[21], &cospi48, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospim16, &bf1[20], &cospi48, &bf1[27], &rounding, bit);
+ bf0[28] =
+ half_btf_sse4_1(&cospi48, &bf1[19], &cospi16, &bf1[28], &rounding, bit);
+ bf0[29] =
+ half_btf_sse4_1(&cospi48, &bf1[18], &cospi16, &bf1[29], &rounding, bit);
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 7
+ addsub_sse4_1(bf0[0], bf0[7], bf1 + 0, bf1 + 7, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[1], bf0[6], bf1 + 1, bf1 + 6, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[2], bf0[5], bf1 + 2, bf1 + 5, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[3], bf0[4], bf1 + 3, bf1 + 4, &clamp_lo, &clamp_hi);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] =
+ half_btf_sse4_1(&cospim32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[11] =
+ half_btf_sse4_1(&cospim32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[12] =
+ half_btf_sse4_1(&cospi32, &bf0[11], &cospi32, &bf0[12], &rounding, bit);
+ bf1[13] =
+ half_btf_sse4_1(&cospi32, &bf0[10], &cospi32, &bf0[13], &rounding, bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ addsub_sse4_1(bf0[16], bf0[23], bf1 + 16, bf1 + 23, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[17], bf0[22], bf1 + 17, bf1 + 22, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[18], bf0[21], bf1 + 18, bf1 + 21, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[19], bf0[20], bf1 + 19, bf1 + 20, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[31], bf0[24], bf1 + 31, bf1 + 24, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[30], bf0[25], bf1 + 30, bf1 + 25, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[29], bf0[26], bf1 + 29, bf1 + 26, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf0[28], bf0[27], bf1 + 28, bf1 + 27, &clamp_lo, &clamp_hi);
+
+ // stage 8
+ addsub_sse4_1(bf1[0], bf1[15], bf0 + 0, bf0 + 15, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[1], bf1[14], bf0 + 1, bf0 + 14, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[2], bf1[13], bf0 + 2, bf0 + 13, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[3], bf1[12], bf0 + 3, bf0 + 12, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[4], bf1[11], bf0 + 4, bf0 + 11, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[5], bf1[10], bf0 + 5, bf0 + 10, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[6], bf1[9], bf0 + 6, bf0 + 9, &clamp_lo, &clamp_hi);
+ addsub_sse4_1(bf1[7], bf1[8], bf0 + 7, bf0 + 8, &clamp_lo, &clamp_hi);
+ bf0[16] = bf1[16];
+ bf0[17] = bf1[17];
+ bf0[18] = bf1[18];
+ bf0[19] = bf1[19];
+ bf0[20] =
+ half_btf_sse4_1(&cospim32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[21] =
+ half_btf_sse4_1(&cospim32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[22] =
+ half_btf_sse4_1(&cospim32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[23] =
+ half_btf_sse4_1(&cospim32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[24] =
+ half_btf_sse4_1(&cospi32, &bf1[23], &cospi32, &bf1[24], &rounding, bit);
+ bf0[25] =
+ half_btf_sse4_1(&cospi32, &bf1[22], &cospi32, &bf1[25], &rounding, bit);
+ bf0[26] =
+ half_btf_sse4_1(&cospi32, &bf1[21], &cospi32, &bf1[26], &rounding, bit);
+ bf0[27] =
+ half_btf_sse4_1(&cospi32, &bf1[20], &cospi32, &bf1[27], &rounding, bit);
+ bf0[28] = bf1[28];
+ bf0[29] = bf1[29];
+ bf0[30] = bf1[30];
+ bf0[31] = bf1[31];
+
+ // stage 9
+ if (do_cols) {
+ addsub_no_clamp_sse4_1(bf0[0], bf0[31], out + 0, out + 31);
+ addsub_no_clamp_sse4_1(bf0[1], bf0[30], out + 1, out + 30);
+ addsub_no_clamp_sse4_1(bf0[2], bf0[29], out + 2, out + 29);
+ addsub_no_clamp_sse4_1(bf0[3], bf0[28], out + 3, out + 28);
+ addsub_no_clamp_sse4_1(bf0[4], bf0[27], out + 4, out + 27);
+ addsub_no_clamp_sse4_1(bf0[5], bf0[26], out + 5, out + 26);
+ addsub_no_clamp_sse4_1(bf0[6], bf0[25], out + 6, out + 25);
+ addsub_no_clamp_sse4_1(bf0[7], bf0[24], out + 7, out + 24);
+ addsub_no_clamp_sse4_1(bf0[8], bf0[23], out + 8, out + 23);
+ addsub_no_clamp_sse4_1(bf0[9], bf0[22], out + 9, out + 22);
+ addsub_no_clamp_sse4_1(bf0[10], bf0[21], out + 10, out + 21);
+ addsub_no_clamp_sse4_1(bf0[11], bf0[20], out + 11, out + 20);
+ addsub_no_clamp_sse4_1(bf0[12], bf0[19], out + 12, out + 19);
+ addsub_no_clamp_sse4_1(bf0[13], bf0[18], out + 13, out + 18);
+ addsub_no_clamp_sse4_1(bf0[14], bf0[17], out + 14, out + 17);
+ addsub_no_clamp_sse4_1(bf0[15], bf0[16], out + 15, out + 16);
+ } else {
+ const int log_range_out = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo_out = _mm_set1_epi32(AOMMAX(
+ -(1 << (log_range_out - 1)), -(1 << (log_range - 1 - out_shift))));
+ const __m128i clamp_hi_out = _mm_set1_epi32(AOMMIN(
+ (1 << (log_range_out - 1)) - 1, (1 << (log_range - 1 - out_shift))));
+
+ addsub_shift_sse4_1(bf0[0], bf0[31], out + 0, out + 31, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[1], bf0[30], out + 1, out + 30, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[2], bf0[29], out + 2, out + 29, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[3], bf0[28], out + 3, out + 28, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[4], bf0[27], out + 4, out + 27, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[5], bf0[26], out + 5, out + 26, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[6], bf0[25], out + 6, out + 25, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[7], bf0[24], out + 7, out + 24, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[8], bf0[23], out + 8, out + 23, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[9], bf0[22], out + 9, out + 22, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[10], bf0[21], out + 10, out + 21, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[11], bf0[20], out + 11, out + 20, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[12], bf0[19], out + 12, out + 19, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[13], bf0[18], out + 13, out + 18, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[14], bf0[17], out + 14, out + 17, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ addsub_shift_sse4_1(bf0[15], bf0[16], out + 15, out + 16, &clamp_lo_out,
+ &clamp_hi_out, out_shift);
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {  // 8x8: choose C or SSE4.1 by tx_type
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);
+ switch (tx_type) {
+ // The SSE4.1 version doesn't support the transform types listed below, so
+ // use the C version for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:  // every other tx_type uses the 8x8 SSE4.1 2D transform
+ av1_inv_txfm2d_add_8x8_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x8_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {  // 16x8: choose C or SSE4.1 by tx_type
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);  // only used by the C branch
+ switch (tx_type) {
+ // The SSE4.1 version doesn't support the transform types listed below, so
+ // use the C version for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+ break;
+ default:  // every other tx_type goes through the size-generic entry point
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_8x16_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {  // 8x16: choose C or SSE4.1 by tx_type
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);  // only used by the C branch
+ switch (tx_type) {
+ // The SSE4.1 version doesn't support the transform types listed below, so
+ // use the C version for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ txfm_param->tx_type, txfm_param->bd);
+ break;
+ default:  // every other tx_type goes through the size-generic entry point
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_16x16_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {  // 16x16: choose C or SSE4.1 by tx_type
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);  // only used by the C branch
+ switch (tx_type) {
+ // The SSE4.1 version doesn't support the transform types listed below, so
+ // use the C version for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default:  // every other tx_type goes through the size-generic entry point
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ }
+}
+
+void av1_highbd_inv_txfm_add_32x32_sse4_1(const tran_low_t *input,
+ uint8_t *dest, int stride,
+ const TxfmParam *txfm_param) {  // 32x32: DCT_DCT via SSE4.1, IDTX via C
+ int bd = txfm_param->bd;
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ const int32_t *src = cast_to_int32(input);  // only used by the IDTX branch
+ switch (tx_type) {
+ case DCT_DCT:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(input, dest, stride, tx_type,
+ txfm_param->tx_size,
+ txfm_param->eob, bd);
+ break;
+ // The SSE4.1 version doesn't support IDTX, so use the C version for it.
+ case IDTX:
+ av1_inv_txfm2d_add_32x32_c(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ default: assert(0);  // only DCT_DCT and IDTX are handled for this size
+ }
+}
+
+void av1_highbd_inv_txfm_add_4x4_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride,
+ const TxfmParam *txfm_param) {  // 4x4: lossless, C, or SSE4.1 path
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ int eob = txfm_param->eob;  // only consumed by the lossless path
+ int bd = txfm_param->bd;
+ int lossless = txfm_param->lossless;
+ const int32_t *src = cast_to_int32(input);
+ const TX_TYPE tx_type = txfm_param->tx_type;
+ if (lossless) {  // lossless blocks bypass the tx_type switch entirely
+ assert(tx_type == DCT_DCT);
+ av1_highbd_iwht4x4_add(input, dest, stride, eob, bd);
+ return;
+ }
+ switch (tx_type) {
+ // The SSE4.1 version doesn't support the transform types listed below, so
+ // use the C version for those.
+ case V_DCT:
+ case H_DCT:
+ case V_ADST:
+ case H_ADST:
+ case V_FLIPADST:
+ case H_FLIPADST:
+ case IDTX:
+ av1_inv_txfm2d_add_4x4_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
+ bd);
+ break;
+ default:  // every other tx_type uses the 4x4 SSE4.1 2D transform
+ av1_inv_txfm2d_add_4x4_sse4_1(src, CONVERT_TO_SHORTPTR(dest), stride,
+ tx_type, bd);
+ break;
+ }
+}
+
+static const transform_1d_sse4_1  // 1D kernels: [size idx][1D type][zeros idx]
+ highbd_txfm_all_1d_zeros_w8_arr[TX_SIZES][ITX_TYPES_1D][4] = {
+ {  // first size index: no kernels registered
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct8x8_low1_sse4_1, idct8x8_new_sse4_1, NULL, NULL },
+ { iadst8x8_low1_sse4_1, iadst8x8_new_sse4_1, NULL, NULL },
+ { NULL, NULL, NULL, NULL } },  // third 1D type has no kernel at any size
+ {
+ { idct16x16_low1_sse4_1, idct16x16_low8_sse4_1, idct16x16_sse4_1,
+ NULL },
+ { iadst16x16_low1_sse4_1, iadst16x16_low8_sse4_1, iadst16x16_sse4_1,
+ NULL },
+ { NULL, NULL, NULL, NULL },
+ },
+ { { idct32x32_low1_sse4_1, idct32x32_low8_sse4_1, idct32x32_low16_sse4_1,
+ idct32x32_sse4_1 },
+ { NULL, NULL, NULL, NULL },  // only idct kernels from this size upward
+ { NULL, NULL, NULL, NULL } },
+ { { idct64x64_low1_sse4_1, idct64x64_low8_sse4_1, idct64x64_low16_sse4_1,
+ idct64x64_sse4_1 },
+ { NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL } }
+ };
+
+static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
+ uint16_t *output,
+ int stride, TX_TYPE tx_type,
+ TX_SIZE tx_size, int eob,
+ const int bd) {  // row pass, then column pass, then write to output
+ __m128i buf1[64 * 16];  // filled by the row pass, consumed by the column pass
+ int eobx, eoby;
+ get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 2;  // despite the name: width / 4
+ const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+ const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
+ const int input_stride = AOMMIN(32, txfm_size_col);  // capped at 32
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+
+ const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
+ const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
+ const transform_1d_sse4_1 row_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
+ const transform_1d_sse4_1 col_txfm =
+ highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
+
+ assert(col_txfm != NULL);  // the kernel table contains NULL entries
+ assert(row_txfm != NULL);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // 1st stage: row transform
+ for (int i = 0; i < buf_size_nonzero_h_div8 << 1; i++) {  // 4 rows per pass
+ __m128i buf0[64];
+ const int32_t *input_row = input + i * input_stride * 4;
+ for (int j = 0; j < buf_size_nonzero_w_div8 << 1; ++j) {
+ __m128i *buf0_cur = buf0 + j * 4;
+ load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+ TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+ buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+ }
+ if (rect_type == 1 || rect_type == -1) {  // extra NewInvSqrt2 scaling
+ av1_round_shift_rect_array_32_sse4_1(
+ buf0, buf0, buf_size_nonzero_w_div8 << 3, 0, NewInvSqrt2);
+ }
+ row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+ __m128i *_buf1 = buf1 + i * 4;
+ if (lr_flip) {  // reverse the 4-wide groups and the order within each
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+ buf0[4 * j],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+ _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+ }
+ } else {
+ for (int j = 0; j < buf_size_w_div8; ++j) {
+ TRANSPOSE_4X4(
+ buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+ _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+ _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+ }
+ }
+ }
+ // 2nd stage: column transform
+ for (int i = 0; i < buf_size_w_div8; i++) {  // one txfm_size_row-long slice each
+ col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+ inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+ av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+ buf1 + i * txfm_size_row, txfm_size_row,
+ -shift[1]);
+ }
+
+ // write to buffer
+ {
+ for (int i = 0; i < (txfm_size_col >> 3); i++) {  // 8 output columns per call
+ highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+ output + 8 * i, stride, ud_flip,
+ txfm_size_row, bd);
+ }
+ }
+}
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd) {  // size-generic entry point
+ switch (tx_type) {  // all listed types share the no-identity implementation
+ case DCT_DCT:
+ case ADST_DCT:
+ case DCT_ADST:
+ case ADST_ADST:
+ case FLIPADST_DCT:
+ case DCT_FLIPADST:
+ case FLIPADST_FLIPADST:
+ case ADST_FLIPADST:
+ case FLIPADST_ADST:
+ highbd_inv_txfm2d_add_no_identity_sse41(
+ input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+ bd);
+ break;
+ default: assert(0); break;  // any other tx_type is not handled here
+ }
+}
+
+void av1_highbd_inv_txfm_add_sse4_1(const tran_low_t *input, uint8_t *dest,
+ int stride, const TxfmParam *txfm_param) {  // dispatch on tx_size
+ assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+ const TX_SIZE tx_size = txfm_param->tx_size;
+ switch (tx_size) {  // callees without an _sse4_1 suffix are defined elsewhere
+ case TX_32X32:
+ av1_highbd_inv_txfm_add_32x32_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X16:
+ av1_highbd_inv_txfm_add_16x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X8:
+ av1_highbd_inv_txfm_add_4x8(input, dest, stride, txfm_param);
+ break;
+ case TX_8X4:
+ av1_highbd_inv_txfm_add_8x4(input, dest, stride, txfm_param);
+ break;
+ case TX_8X16:
+ av1_highbd_inv_txfm_add_8x16_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X8:
+ av1_highbd_inv_txfm_add_16x8_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X32:
+ av1_highbd_inv_txfm_add_16x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X16:
+ av1_highbd_inv_txfm_add_32x16(input, dest, stride, txfm_param);
+ break;
+ case TX_32X64:
+ av1_highbd_inv_txfm_add_32x64(input, dest, stride, txfm_param);
+ break;
+ case TX_64X32:
+ av1_highbd_inv_txfm_add_64x32(input, dest, stride, txfm_param);
+ break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16(input, dest, stride, txfm_param);
+ break;
+ case TX_8X32:
+ av1_highbd_inv_txfm_add_8x32(input, dest, stride, txfm_param);
+ break;
+ case TX_32X8:
+ av1_highbd_inv_txfm_add_32x8(input, dest, stride, txfm_param);
+ break;
+ case TX_64X64:  // these three sizes call the size-generic entry point directly
+ case TX_16X64:
+ case TX_64X16:
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, txfm_param->tx_size,
+ txfm_param->eob, txfm_param->bd);
+ break;
+ default: assert(0 && "Invalid transform size"); break;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
new file mode 100644
index 0000000000..e298cf6531
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -0,0 +1,846 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+void av1_highbd_jnt_convolve_2d_copy_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;  // intermediate buffer (see do_average)
+ int dst_stride = conv_params->dst_stride;
+ (void)filter_params_x;  // no filtering in the copy path: filter args unused
+ (void)filter_params_y;
+ (void)subpel_x_q4;
+ (void)subpel_y_q4;
+
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);  // source pixels << bits
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+ int i, j;
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const __m256i offset_const_16b = _mm256_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ assert(bits <= 4);
+
+ if (!(w % 16)) {  // w multiple of 16: one row, 16 pixels per iteration
+ for (i = 0; i < h; i += 1) {
+ for (j = 0; j < w; j += 16) {
+ const __m256i src_16bit =
+ _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));
+
+ const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);
+
+ if (do_average) {  // combine with dst, write clipped pixels to dst0
+ const __m256i data_0 =
+ _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {  // no averaging: store the offset intermediate into dst
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
+ res_unsigned_16b);
+ }
+ }
+ }
+ } else if (!(w % 4)) {  // w multiple of 4: two rows, up to 8 pixels each
+ for (i = 0; i < h; i += 2) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i src_row_0 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
+ const __m128i src_row_1 =
+ _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
+ // since not all compilers yet support _mm256_set_m128i()
+ const __m256i src_10 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(src_row_0), src_row_1, 1);
+
+ const __m256i res = _mm256_sll_epi16(src_10, left_shift);
+
+ if (w - j < 8) {  // fewer than 8 columns left: 64-bit loads and stores
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b, offset_const);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full 8 columns: 128-bit loads and stores
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_32b_lo, offset_const);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
+ const __m256i res_unsigned_hi =
+ _mm256_add_epi32(res_32b_hi, offset_const);
+
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ const __m256i res_unsigned_16b =
+ _mm256_adds_epu16(res, offset_const_16b);
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+ }  // no branch for other widths: nothing is written when w % 4 != 0
+}
+
+void av1_highbd_jnt_convolve_2d_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ int im_h = h + filter_params_y->taps - 1;  // rows the vertical filter needs
+ int im_stride = 8;  // im_block holds one 8-column strip, 8 values per row
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[8], coeffs_y[4], coeffs_x[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x = _mm256_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const __m256i round_const_y = _mm256_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {  // process the block in 8-column strips
+ /* Horizontal filter */
+ {
+ for (i = 0; i < im_h; i += 2) {  // two source rows per iteration
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 = _mm256_set1_epi16(0);
+ if (i + 1 < im_h)  // when no second row exists, row1 stays zero
+ row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+ }
+ }
+
+ /* Vertical filter */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);  // s[0..3]: inputs for res_a
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);  // s[4..7]: inputs for res_b
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_y), round_shift_y);
+
+ const __m256i res_unsigned_lo =
+ _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns left: only res_a, 64-bit stores
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full 8 columns: also compute res_b from s[4..7]
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];  // shift the row-pair window ahead for the next iteration
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+/*
+ * AVX2 high-bitdepth compound convolve that applies only the horizontal
+ * filter.
+ *
+ * The block is walked in strips of 8 columns, two rows (i and i + 1) per
+ * iteration. Each convolve() result is rounded by conv_params->round_0,
+ * shifted left by FILTER_BITS - conv_params->round_1 and biased by `offset`.
+ * - do_average == 0: the biased sums are packed to 16 bits and stored in
+ *   conv_params->dst.
+ * - do_average != 0: the sums are combined with the values already held in
+ *   conv_params->dst by highbd_comp_avg(), passed through
+ *   highbd_convolve_rounding(), packed to 16 bits, limited to the largest
+ *   pixel value for `bd` and stored in dst0.
+ * When fewer than 8 columns remain (w - j < 8) only 4 values per row are
+ * read from dst and written.
+ *
+ * filter_params_y and subpel_y_q4 are unused.
+ * NOTE(review): rows are consumed in pairs, so h is presumably always even,
+ * and the 8-column path uses _mm_store_si128, which needs 16-byte aligned
+ * addresses in both dst and dst0 -- confirm against the callers.
+ */
+void av1_highbd_jnt_convolve_x_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;  // reads start fo_horiz pixels left of src
+ const int bits = FILTER_BITS - conv_params->round_1;  // left shift applied after the round_0 rounding
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m256i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i zero = _mm256_setzero_si256();
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ /* Bias added to every filtered sum; the averaging path hands the same value
+ * to highbd_convolve_rounding(). */
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // largest pixel value for bd
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip at a time
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {  // rows i and i + 1 together
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+ /* Regroup so each 128-bit lane holds one row: r0 = first 8 loaded pixels
+ * of row i (low lane) and row i + 1 (high lane), r1 = the next 8 of each. */
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ /* Apply the left shift by `bits` to both halves before interleaving. */
+ res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+ __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);  // interleave the low two even/odd sums of each lane
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns left: 4 values per row
+ if (do_average) {  // blend with the values already in dst
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);  // widen 16 -> 32 bits
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // low lane: row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // high lane: row i + 1
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {  // no averaging: keep the biased sums in dst
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full strip: 8 values per row
+ __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);  // interleave the high two even/odd sums of each lane
+ __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);
+
+ if (do_average) {  // blend with the values already in dst
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = highbd_convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m256i round_result_hi = highbd_convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // low lane: row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // high lane: row i + 1
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_1);
+ } else {  // no averaging: keep the biased sums in dst
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ }
+ }
+}
+/*
+ * AVX2 high-bitdepth compound convolve that applies only the vertical
+ * filter.
+ *
+ * For every 8-column strip, seven source rows are preloaded and interleaved;
+ * each loop iteration then loads two more rows and produces output rows i
+ * and i + 1. Each convolve() result is shifted left by
+ * FILTER_BITS - conv_params->round_0, rounded by conv_params->round_1 and
+ * biased by `offset`.
+ * - do_average == 0: the biased sums are packed to 16 bits and stored in
+ *   conv_params->dst.
+ * - do_average != 0: the sums are combined with the values already held in
+ *   conv_params->dst by highbd_comp_avg(), passed through
+ *   highbd_convolve_rounding(), packed to 16 bits, limited to the largest
+ *   pixel value for `bd` and stored in dst0.
+ * When fewer than 8 columns remain (w - j < 8) only 4 values per row are
+ * read from dst and written.
+ *
+ * filter_params_x and subpel_x_q4 are unused.
+ * NOTE(review): rows are produced in pairs, so h is presumably always even,
+ * and the 8-column path uses _mm_store_si128, which needs 16-byte aligned
+ * addresses in both dst and dst0 -- confirm against the callers.
+ */
+void av1_highbd_jnt_convolve_y_avx2(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;  // reads start fo_vert rows above src
+ const int bits = FILTER_BITS - conv_params->round_0;  // left shift applied before the round_1 rounding
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ __m256i s[8], coeffs_y[4];
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m256i wt0 = _mm256_set1_epi32(w0);
+ const __m256i wt1 = _mm256_set1_epi32(w1);
+ const __m256i round_const_y =
+ _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ /* Bias added to every filtered sum; the averaging path hands the same value
+ * to highbd_convolve_rounding(). */
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
+ const __m256i clip_pixel_to_bd =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // largest pixel value for bd
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip at a time
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;  // last row loaded; carried over into the next iteration
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+ /* sXY holds row X in the low lane and row Y in the high lane, so the low
+ * lane feeds output row i and the high lane output row i + 1. s[0..3]
+ * interleave the first 4 columns of each row pair, s[4..7] the last 4. */
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {  // output rows i and i + 1 together
+ data = &src_ptr[i * src_stride + j];
+ /* s67 = rows i + 6 | i + 7 and s78 = rows i + 7 | i + 8 (low | high). */
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+ /* NOTE(review): row i + 7 is loaded from memory a second time here. */
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);  // first 4 columns
+
+ __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
+ res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns left: 4 values per row
+ if (do_average) {  // blend with the values already in dst
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);  // widen 16 -> 32 bits
+
+ const __m256i comp_avg_res = highbd_comp_avg(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result = highbd_convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result, round_result);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // low lane: row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // high lane: row i + 1
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {  // no averaging: keep the biased sums in dst
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full strip: 8 values per row
+ const __m256i res_b = convolve(s + 4, coeffs_y);  // last 4 columns
+ __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
+ res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);
+
+ __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);
+
+ if (do_average) {  // blend with the values already in dst
+ const __m256i data_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
+ const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
+ const __m256i data_01 =
+ _mm256_permute2x128_si256(data_0, data_1, 0x20);
+
+ const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
+ const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);
+
+ const __m256i comp_avg_res_lo = highbd_comp_avg(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m256i comp_avg_res_hi = highbd_comp_avg(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m256i round_result_lo =
+ highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m256i round_result_hi =
+ highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m256i res_16b =
+ _mm256_packus_epi32(round_result_lo, round_result_hi);
+ const __m256i res_clip =
+ _mm256_min_epi16(res_16b, clip_pixel_to_bd);
+
+ const __m128i res_0 = _mm256_castsi256_si128(res_clip);  // low lane: row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);  // high lane: row i + 1
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
+ } else {  // no averaging: keep the biased sums in dst
+ __m256i res_16b =
+ _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ const __m128i res_0 = _mm256_castsi256_si128(res_16b);
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);
+
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+ s[0] = s[1];  // slide the filter window down by two rows
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
new file mode 100644
index 0000000000..1a29985b56
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_jnt_convolve_sse4.c
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+/*
+ * SSE4.1 high-bitdepth compound convolve that applies only the vertical
+ * filter.
+ *
+ * For every 8-column strip, seven source rows are preloaded and interleaved;
+ * each loop iteration then loads two more rows and produces output rows i
+ * and i + 1. Each convolve() result is shifted left by
+ * FILTER_BITS - conv_params->round_0, rounded by conv_params->round_1 and
+ * biased by `offset`.
+ * - do_average == 0: the biased sums are packed to 16 bits and stored in
+ *   conv_params->dst.
+ * - do_average != 0: the sums are combined with the values already held in
+ *   conv_params->dst by highbd_comp_avg_sse4_1(), passed through
+ *   highbd_convolve_rounding_sse2(), packed to 16 bits, limited to the
+ *   largest pixel value for `bd` and stored in dst0.
+ * When fewer than 8 columns remain (w - j < 8) only 4 values per row are
+ * read from dst and written.
+ *
+ * filter_params_x and subpel_x_q4 are unused.
+ * NOTE(review): rows are produced in pairs, so h is presumably always even,
+ * and the 8-column path uses _mm_store_si128, which needs 16-byte aligned
+ * addresses in both dst and dst0 -- confirm against the callers.
+ */
+void av1_highbd_jnt_convolve_y_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;  // reads start fo_vert rows above src
+ const int bits = FILTER_BITS - conv_params->round_0;  // left shift applied before the round_1 rounding
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ assert(bits >= 0);
+ int i, j;
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i round_const_y =
+ _mm_set1_epi32(((1 << conv_params->round_1) >> 1));
+ const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ /* Bias added to every filtered sum; the averaging path hands the same value
+ * to highbd_convolve_rounding_sse2(). */
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // largest pixel value for bd
+ const __m128i zero = _mm_setzero_si128();
+ __m128i s[16], coeffs_y[4];  // s[0..7]: window for output row i, s[8..15]: window for row i + 1
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip at a time
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ /* Row pairs (0,1) (2,3) (4,5) feed output row i: s[0..2] take the first 4
+ * columns, s[4..6] the last 4. */
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+ /* Row pairs (1,2) (3,4) (5,6) feed output row i + 1, same column split. */
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {  // output rows i and i + 1 together
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);  // row i, first 4 columns
+ __m128i res_a_round0 = _mm_sll_epi32(res_a0, round_shift_bits);
+ res_a_round0 = _mm_sra_epi32(_mm_add_epi32(res_a_round0, round_const_y),
+ round_shift_y);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);  // row i + 1, first 4 columns
+ __m128i res_a_round1 = _mm_sll_epi32(res_a1, round_shift_bits);
+ res_a_round1 = _mm_sra_epi32(_mm_add_epi32(res_a_round1, round_const_y),
+ round_shift_y);
+
+ __m128i res_unsigned_lo_0 = _mm_add_epi32(res_a_round0, offset_const);
+ __m128i res_unsigned_lo_1 = _mm_add_epi32(res_a_round1, offset_const);
+
+ if (w - j < 8) {  // fewer than 8 columns left: 4 values per row
+ if (do_average) {  // blend with the values already in dst
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadl_epi64(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);  // widen 16 -> 32 bits
+ const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_0 = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo_0, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_1 = highbd_comp_avg_sse4_1(
+ &data_ref_1, &res_unsigned_lo_1, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_0, round_result_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_1, round_result_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_storel_epi64(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+
+ } else {  // no averaging: keep the biased sums in dst
+ __m128i res_16b_0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_lo_0);
+
+ __m128i res_16b_1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_lo_1);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b_0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16b_1);
+ }
+ } else {  // full strip: 8 values per row
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);  // row i, last 4 columns
+ __m128i res_b_round0 = _mm_sll_epi32(res_b0, round_shift_bits);
+ res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round0, round_const_y), round_shift_y);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);  // row i + 1, last 4 columns
+ __m128i res_b_round1 = _mm_sll_epi32(res_b1, round_shift_bits);
+ res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b_round1, round_const_y), round_shift_y);
+
+ __m128i res_unsigned_hi_0 = _mm_add_epi32(res_b_round0, offset_const);
+ __m128i res_unsigned_hi_1 = _mm_add_epi32(res_b_round1, offset_const);
+
+ if (do_average) {  // blend with the values already in dst
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_1 = _mm_loadu_si128(
+ (__m128i *)(&dst[i * dst_stride + j + dst_stride]));
+ const __m128i data_ref_0_lo_0 = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_lo_1 = _mm_unpacklo_epi16(data_1, zero);
+
+ const __m128i data_ref_0_hi_0 = _mm_unpackhi_epi16(data_0, zero);
+ const __m128i data_ref_0_hi_1 = _mm_unpackhi_epi16(data_1, zero);
+
+ const __m128i comp_avg_res_lo_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_0, &res_unsigned_lo_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_lo_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_lo_1, &res_unsigned_lo_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_0 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_0, &res_unsigned_hi_0,
+ &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi_1 =
+ highbd_comp_avg_sse4_1(&data_ref_0_hi_1, &res_unsigned_hi_1,
+ &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_lo_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_lo_1, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_0 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_0, &offset_const,
+ &rounding_const, rounding_shift);
+ const __m128i round_result_hi_1 =
+ highbd_convolve_rounding_sse2(&comp_avg_res_hi_1, &offset_const,
+ &rounding_const, rounding_shift);
+
+ const __m128i res_16b_0 =
+ _mm_packus_epi32(round_result_lo_0, round_result_hi_0);
+ const __m128i res_clip_0 =
+ _mm_min_epi16(res_16b_0, clip_pixel_to_bd);
+
+ const __m128i res_16b_1 =
+ _mm_packus_epi32(round_result_lo_1, round_result_hi_1);
+ const __m128i res_clip_1 =
+ _mm_min_epi16(res_16b_1, clip_pixel_to_bd);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]),
+ res_clip_0);
+ _mm_store_si128(
+ (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
+ res_clip_1);
+ } else {  // no averaging: keep the biased sums in dst
+ __m128i res_16bit0 =
+ _mm_packus_epi32(res_unsigned_lo_0, res_unsigned_hi_0);
+ __m128i res_16bit1 =
+ _mm_packus_epi32(res_unsigned_lo_1, res_unsigned_hi_1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16bit0);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_16bit1);
+ }
+ }
+ s[0] = s[1];  // slide both filter windows down by two rows
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;  // row i + 8 is the "row 6" of the next iteration
+ }
+ }
+ }
+}
+/*
+ * SSE4.1 high-bitdepth compound convolve that applies only the horizontal
+ * filter.
+ *
+ * The block is walked in strips of 8 columns, one row per iteration. Each
+ * convolve() result is rounded by conv_params->round_0, shifted left by
+ * FILTER_BITS - conv_params->round_1 and biased by `offset`.
+ * - do_average == 0: the biased sums are packed to 16 bits and stored in
+ *   conv_params->dst.
+ * - do_average != 0: the sums are combined with the values already held in
+ *   conv_params->dst by highbd_comp_avg_sse4_1(), passed through
+ *   highbd_convolve_rounding_sse2(), packed to 16 bits, limited to the
+ *   largest pixel value for `bd` and stored in dst0.
+ * When fewer than 8 columns remain (w - j < 8) only 4 values are read from
+ * dst and written.
+ *
+ * filter_params_y and subpel_y_q4 are unused.
+ * NOTE(review): the 8-column path uses _mm_store_si128, which needs 16-byte
+ * aligned addresses in both dst and dst0 -- confirm against the callers.
+ */
+void av1_highbd_jnt_convolve_x_sse4_1(
+ const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
+ int h, const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y, const int subpel_x_q4,
+ const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ int dst_stride = conv_params->dst_stride;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;  // reads start fo_horiz pixels left of src
+ const int bits = FILTER_BITS - conv_params->round_1;  // left shift applied after the round_0 rounding
+ (void)filter_params_y;
+ (void)subpel_y_q4;
+
+ int i, j;
+ __m128i s[4], coeffs_x[4];
+
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ /* Bias added to every filtered sum; the averaging path hands the same value
+ * to highbd_convolve_rounding_sse2(). */
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi32(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
+ const __m128i clip_pixel_to_bd =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // largest pixel value for bd
+
+ assert(bits >= 0);
+ prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip at a time
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);  // first 8 source pixels of the window
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);  // next 8 source pixels
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x), round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+ /* Apply the left shift by `bits` to both halves before interleaving. */
+ res_even = _mm_sll_epi32(res_even, round_shift_bits);
+ res_odd = _mm_sll_epi32(res_odd, round_shift_bits);
+
+ __m128i res1 = _mm_unpacklo_epi32(res_even, res_odd);  // interleave the low two even/odd sums
+ __m128i res_unsigned_lo = _mm_add_epi32(res1, offset_const);
+ if (w - j < 8) {  // fewer than 8 columns left: 4 values
+ if (do_average) {  // blend with the values already in dst
+ const __m128i data_0 =
+ _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);  // widen 16 -> 32 bits
+
+ const __m128i comp_avg_res = highbd_comp_avg_sse4_1(
+ &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i round_result = highbd_convolve_rounding_sse2(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b = _mm_packus_epi32(round_result, round_result);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {  // no averaging: keep the biased sums in dst
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_lo);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_16b);
+ }
+ } else {  // full strip: 8 values
+ __m128i res2 = _mm_unpackhi_epi32(res_even, res_odd);  // interleave the high two even/odd sums
+ __m128i res_unsigned_hi = _mm_add_epi32(res2, offset_const);
+ if (do_average) {  // blend with the values already in dst
+ const __m128i data_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+ const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
+ const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);
+
+ const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
+ &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
+ const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
+ &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);
+
+ const __m128i round_result_lo = highbd_convolve_rounding_sse2(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+ const __m128i round_result_hi = highbd_convolve_rounding_sse2(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_16b =
+ _mm_packus_epi32(round_result_lo, round_result_hi);
+ const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
+ } else {  // no averaging: keep the biased sums in dst
+ __m128i res_16b = _mm_packus_epi32(res_unsigned_lo, res_unsigned_hi);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_16b);
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000000..6f24e5948a
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+#define AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+ do { /* 4x4 transpose of 32-bit lanes: rows x0..x3 in, rows y0..y3 out */ \
+ __m128i u0, u1, u2, u3; /* note: each x argument is expanded twice */ \
+ u0 = _mm_unpacklo_epi32(x0, x1); /* x0[0] x1[0] x0[1] x1[1] */ \
+ u1 = _mm_unpackhi_epi32(x0, x1); /* x0[2] x1[2] x0[3] x1[3] */ \
+ u2 = _mm_unpacklo_epi32(x2, x3); /* x2[0] x3[0] x2[1] x3[1] */ \
+ u3 = _mm_unpackhi_epi32(x2, x3); /* x2[2] x3[2] x2[3] x3[3] */ \
+ y0 = _mm_unpacklo_epi64(u0, u2); /* lane 0 of x0, x1, x2, x3 */ \
+ y1 = _mm_unpackhi_epi64(u0, u2); /* lane 1 of x0, x1, x2, x3 */ \
+ y2 = _mm_unpacklo_epi64(u1, u3); /* lane 2 of x0, x1, x2, x3 */ \
+ y3 = _mm_unpackhi_epi64(u1, u3); /* lane 3 of x0, x1, x2, x3 */ \
+ } while (0)
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {  // Reads in[0..15], writes out[0..15]; presumably row r of an 8x8 block of 32-bit values lives in [2r], [2r+1] - confirm with callers.
+ TRANSPOSE_4X4(in[0], in[2], in[4], in[6], out[0], out[2], out[4], out[6]);  // even regs 0..6 -> even regs 0..6
+ TRANSPOSE_4X4(in[1], in[3], in[5], in[7], out[8], out[10], out[12], out[14]);  // odd regs 1..7 -> even regs 8..14
+ TRANSPOSE_4X4(in[8], in[10], in[12], in[14], out[1], out[3], out[5], out[7]);  // reads in[8] after out[8] was written above, so in == out is not safe
+ TRANSPOSE_4X4(in[9], in[11], in[13], in[15], out[9], out[11], out[13],  // odd regs 9..15 -> odd regs 9..15
+ out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {  // Reads in[0..63], writes out[0..63]; presumably row r of a 16x16 block of 32-bit values lives in [4r]..[4r+3] - confirm with callers. Not safe for in == out.
+ // Upper left 8x8 of the input (written to the upper left 8x8 of the output)
+ TRANSPOSE_4X4(in[0], in[4], in[8], in[12], out[0], out[4], out[8], out[12]);
+ TRANSPOSE_4X4(in[1], in[5], in[9], in[13], out[16], out[20], out[24],
+ out[28]);
+ TRANSPOSE_4X4(in[16], in[20], in[24], in[28], out[1], out[5], out[9],
+ out[13]);
+ TRANSPOSE_4X4(in[17], in[21], in[25], in[29], out[17], out[21], out[25],
+ out[29]);
+
+ // Upper right 8x8 of the input (written to the lower left 8x8 of the output)
+ TRANSPOSE_4X4(in[2], in[6], in[10], in[14], out[32], out[36], out[40],
+ out[44]);
+ TRANSPOSE_4X4(in[3], in[7], in[11], in[15], out[48], out[52], out[56],
+ out[60]);
+ TRANSPOSE_4X4(in[18], in[22], in[26], in[30], out[33], out[37], out[41],
+ out[45]);
+ TRANSPOSE_4X4(in[19], in[23], in[27], in[31], out[49], out[53], out[57],
+ out[61]);
+
+ // Lower left 8x8 of the input (written to the upper right 8x8 of the output)
+ TRANSPOSE_4X4(in[32], in[36], in[40], in[44], out[2], out[6], out[10],
+ out[14]);
+ TRANSPOSE_4X4(in[33], in[37], in[41], in[45], out[18], out[22], out[26],
+ out[30]);
+ TRANSPOSE_4X4(in[48], in[52], in[56], in[60], out[3], out[7], out[11],
+ out[15]);
+ TRANSPOSE_4X4(in[49], in[53], in[57], in[61], out[19], out[23], out[27],
+ out[31]);
+ // Lower right 8x8 of the input (written to the lower right 8x8 of the output)
+ TRANSPOSE_4X4(in[34], in[38], in[42], in[46], out[34], out[38], out[42],
+ out[46]);
+ TRANSPOSE_4X4(in[35], in[39], in[43], in[47], out[50], out[54], out[58],
+ out[62]);
+ TRANSPOSE_4X4(in[50], in[54], in[58], in[62], out[35], out[39], out[43],
+ out[47]);
+ TRANSPOSE_4X4(in[51], in[55], in[59], in[63], out[51], out[55], out[59],
+ out[63]);
+}
+
+static INLINE void transpose_32x32(const __m128i *input, __m128i *output) {  // Reads input[0..255], writes output[0..255]; presumably row r of a 32x32 block of 32-bit values lives in [8r]..[8r+7] - confirm with callers.
+ for (int j = 0; j < 8; j++) {  // j selects the register within a row on the input side
+ for (int i = 0; i < 8; i++) {  // i selects a group of four rows (4 * 8 = 32 registers) on the input side
+ TRANSPOSE_4X4(input[i * 32 + j + 0], input[i * 32 + j + 8],  // four input registers spaced 8 apart
+ input[i * 32 + j + 16], input[i * 32 + j + 24],
+ output[j * 32 + i + 0], output[j * 32 + i + 8],  // roles of i and j are swapped on the output side
+ output[j * 32 + i + 16], output[j * 32 + i + 24]);
+ }
+ }
+}
+
+// Per 32-bit lane: (w0 * n0 + w1 * n1 + rounding) >> bit, arithmetic shift.
+// Note: callers are expected to pass rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(const __m128i *w0, const __m128i *n0,
+ const __m128i *w1, const __m128i *n1,
+ const __m128i *rounding, int bit) {
+ __m128i x, y;
+
+ x = _mm_mullo_epi32(*w0, *n0);  // keeps the low 32 bits of each product
+ y = _mm_mullo_epi32(*w1, *n1);
+ x = _mm_add_epi32(x, y);
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);  // sign-preserving right shift
+ return x;
+}
+
+static INLINE __m128i half_btf_0_sse4_1(const __m128i *w0, const __m128i *n0,  // single-term form: (w0 * n0 + rounding) >> bit per 32-bit lane
+ const __m128i *rounding, int bit) {
+ __m128i x;
+
+ x = _mm_mullo_epi32(*w0, *n0);  // keeps the low 32 bits of each product
+ x = _mm_add_epi32(x, *rounding);
+ x = _mm_srai_epi32(x, bit);  // sign-preserving right shift
+ return x;
+}
+
+typedef void (*transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ int do_cols, int bd, int out_shift);
+
+typedef void (*fwd_transform_1d_sse4_1)(__m128i *in, __m128i *out, int bit,
+ const int num_cols);
+
+void av1_highbd_inv_txfm2d_add_universe_sse4_1(const int32_t *input,
+ uint8_t *output, int stride,
+ TX_TYPE tx_type, TX_SIZE tx_size,
+ int eob, const int bd);
+
+#endif // AOM_AV1_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H_
diff --git a/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
new file mode 100644
index 0000000000..4bcab05645
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_warp_plane_sse4.c
@@ -0,0 +1,624 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+static const uint8_t warp_highbd_arrange_bytes[16] = {  // _mm_shuffle_epi8 mask: even-indexed bytes first, then odd-indexed bytes
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+};
+
+static const uint8_t highbd_shuffle_alpha0_mask0[16] = {  // _mm_shuffle_epi8 mask: repeat bytes 0..3 in every 32-bit lane
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
+};
+static const uint8_t highbd_shuffle_alpha0_mask1[16] = {  // repeat bytes 4..7 in every 32-bit lane
+ 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
+};
+static const uint8_t highbd_shuffle_alpha0_mask2[16] = {  // repeat bytes 8..11 in every 32-bit lane
+ 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11
+};
+static const uint8_t highbd_shuffle_alpha0_mask3[16] = {  // repeat bytes 12..15 in every 32-bit lane
+ 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
+ __m128i *coeff) {  // fills coeff[0..7]; the kernel for pixel n is selected by sx + n * alpha
+ // Filter even-index pixels: one 128-bit load from warped_filter per pixel
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
+ coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
+ coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
+ coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
+ coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ // Filter odd-index pixels: same rearrangement, for pixels 1, 3, 5, 7
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);  // coeffs 0 1 0 1 2 3 2 3 for pixels 1, 3
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);  // coeffs 0 1 0 1 2 3 2 3 for pixels 5, 7
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);  // coeffs 4 5 4 5 6 7 6 7 for pixels 1, 3
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);  // coeffs 4 5 4 5 6 7 6 7 for pixels 5, 7
+
+ coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);  // coeffs 0 1 for pixels 1, 3, 5, 7
+ coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);  // coeffs 2 3 for pixels 1, 3, 5, 7
+ coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);  // coeffs 4 5 for pixels 1, 3, 5, 7
+ coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);  // coeffs 6 7 for pixels 1, 3, 5, 7
+}
+
+static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
+ int sx, __m128i *coeff) {  // fills coeff[0..7] from a single kernel shared by all eight pixels
+ // Filter coeff: the one 128-bit kernel selected by sx
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));
+
+ coeff[0] = _mm_shuffle_epi8(  // bytes 0..3 of the kernel repeated in every 32-bit lane
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
+ coeff[2] = _mm_shuffle_epi8(  // bytes 4..7 repeated
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
+ coeff[4] = _mm_shuffle_epi8(  // bytes 8..11 repeated
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
+ coeff[6] = _mm_shuffle_epi8(  // bytes 12..15 repeated
+ tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));
+
+ coeff[1] = coeff[0];  // odd pixels reuse the even pixels' coefficients
+ coeff[3] = coeff[2];
+ coeff[5] = coeff[4];
+ coeff[7] = coeff[6];
+}
+
+static INLINE void highbd_filter_src_pixels(  // horizontally filters one row held in *src / *src2 and stores the result in tmp[k + 7]
+ const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
+ const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
+ const __m128i src_1 = *src;
+ const __m128i src2_1 = *src2;
+
+ const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +  // offset term plus half of the rounding divisor
+ ((1 << reduce_bits_horiz) >> 1));
+
+ const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);  // multiply 16-bit pairs and add adjacent products into 32-bit lanes
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);  // source window advanced by 4 bytes
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);  // advanced by 8 bytes
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);  // advanced by 12 bytes
+
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),  // add offset and rounding, then shift right by reduce_bits_horiz
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);  // odd pixels: windows advanced by 2, 6, 10, 14 bytes
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);
+
+ __m128i res_odd =
+ _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
+ _mm_cvtsi32_si128(reduce_bits_horiz));
+
+ // Combine results into one register.
+ // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+ // as this order helps with the vertical filter.
+ tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);  // narrows to 16 bits with signed saturation
+}
+
+static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,  // builds the per-pixel kernels for (sx, alpha), then filters one row into tmp[k + 7]
+ __m128i *tmp, int sx, int alpha, int k,
+ const int offset_bits_horiz,
+ const int reduce_bits_horiz) {
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
+ highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(  // horizontal pass variant that ignores alpha and beta: one kernel for every row and pixel
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;  // unused in this variant
+ (void)alpha;  // unused in this variant
+ int k;
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);  // computed once, outside the row loop
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // k is the row offset from iy4; output goes to tmp[k + 7]
+ int iy = iy4 + k;
+ if (iy < 0)  // clamp the source row to [0, height - 1]
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels: 16 pixels starting at column ix4 - 7
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_alpha0(  // horizontal pass variant that ignores alpha: one kernel per row, shared by all pixels in the row
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)alpha;  // unused in this variant
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // k is the row offset from iy4; output goes to tmp[k + 7]
+ int iy = iy4 + k;
+ if (iy < 0)  // clamp the source row to [0, height - 1]
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);  // per-row filter position
+
+ // Load source pixels: 16 pixels starting at column ix4 - 7
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);  // recomputed for every row because sx changes with k
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter_beta0(  // horizontal pass variant that ignores beta: per-pixel kernels computed once and reused for every row
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ (void)beta;  // unused in this variant
+ int k;
+ __m128i coeff[8];
+ highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);  // computed once, outside the row loop
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // k is the row offset from iy4; output goes to tmp[k + 7]
+ int iy = iy4 + k;
+ if (iy < 0)  // clamp the source row to [0, height - 1]
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+
+ // Load source pixels: 16 pixels starting at column ix4 - 7
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+ highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
+ reduce_bits_horiz, k);
+ }
+}
+
+static INLINE void highbd_warp_horizontal_filter(  // general horizontal pass: kernels depend on both the row (beta) and the pixel (alpha)
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ int k;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {  // k is the row offset from iy4; output goes to tmp[k + 7]
+ int iy = iy4 + k;
+ if (iy < 0)  // clamp the source row to [0, height - 1]
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);  // per-row filter position
+
+ // Load source pixels: 16 pixels starting at column ix4 - 7
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,  // kernels are rebuilt for every row
+ reduce_bits_horiz);
+ }
+}
+
+static INLINE void highbd_prepare_warp_horizontal_filter(  // dispatches to one of four horizontal-pass variants based on whether alpha and/or beta are zero
+ const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+ int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+ const int offset_bits_horiz, const int reduce_bits_horiz) {
+ if (alpha == 0 && beta == 0)  // one kernel for the whole block
+ highbd_warp_horizontal_filter_alpha0_beta0(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha == 0 && beta != 0)  // one kernel per row
+ highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+
+ else if (alpha != 0 && beta == 0)  // per-pixel kernels, identical for every row
+ highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ else  // general case: per-pixel kernels rebuilt for every row
+ highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+ p_height, height, i, offset_bits_horiz,
+ reduce_bits_horiz);
+}
+
+void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,  // warps 'ref' in 8x8 blocks: horizontal pass into tmp[], then vertical pass into pred or conv_params->dst
+ int width, int height, int stride,
+ uint16_t *pred, int p_col, int p_row,
+ int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y, int bd,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ __m128i tmp[15];  // horizontally filtered rows for k = -7..7, stored at tmp[k + 7]
+ int i, j, k;
+ const int reduce_bits_horiz =
+ conv_params->round_0 +
+ AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+ assert(!(bd == 12 && reduce_bits_horiz < 5));
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));  // any bd other than 10 or 12 clips to 255
+ const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const __m128i res_sub_const =
+ _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
+ (1 << (offset_bits - conv_params->round_1 - 1)));
+ __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
+ __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));
+
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi32(w0);
+ const __m128i wt1 = _mm_set1_epi32(w1);
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;  // integer part of the projected position
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);  // fractional part
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);  // clear the low WARP_PARAM_REDUCE_BITS bits
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(  // broadcast a value derived from the leftmost pixel of the row
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =  // broadcast a value derived from the rightmost pixel of the row
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {  // the 16-pixel window [ix4 - 7, ix4 + 8] crosses the left or right frame edge
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ const __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ const __m128i src2 =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
+
+ const __m128i src_01 = _mm_shuffle_epi8(  // even-indexed bytes into the low half, odd-indexed bytes into the high half
+ src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+ const __m128i src2_01 = _mm_shuffle_epi8(
+ src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
+
+ __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);  // even-indexed bytes of all 16 pixels (the low bytes on little-endian x86)
+ __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);  // odd-indexed bytes of all 16 pixels (the high bytes)
+
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);  // same byte shuffle applied to both byte planes
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
+ }
+
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
+ src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
+ }
+
+ const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);  // re-interleave the byte planes into 16-bit pixels 0..7
+ const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);  // 16-bit pixels 8..15
+
+ highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+ } else {  // window lies fully inside the frame: use the specialised unpadded paths
+ highbd_prepare_warp_horizontal_filter(
+ ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
+ int sy = sy4 + delta * (k + 4);
+
+ // Load from tmp and rearrange pairs of consecutive rows into the
+ // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+ const __m128i *src = tmp + (k + 4);  // eight consecutive rows tmp[k + 4] .. tmp[k + 11]
+ const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
+ const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
+ const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
+ const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);
+
+ // Filter even-index pixels
+ const __m128i tmp_0 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_2 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_4 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_6 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
+ const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
+ const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
+ const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);
+
+ const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
+ const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
+ const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
+ const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
+ const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
+ const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
+ const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);
+
+ const __m128i tmp_1 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_3 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_5 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
+ const __m128i tmp_7 = _mm_loadu_si128(
+ (__m128i *)(warped_filter +
+ ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));
+
+ const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
+ const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
+ const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
+ const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);
+
+ const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
+ const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
+ const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
+ const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ if (conv_params->is_compound) {
+ __m128i *const p =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo = _mm_add_epi32(res_lo, res_add_const);
+ res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+
+ if (conv_params->do_average) {  // combine with the value already in conv_params->dst and write the final pixels to pred
+ __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));
+
+ if (conv_params->use_jnt_comp_avg) {  // weighted average using fwd_offset / bck_offset
+ res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
+ _mm_mullo_epi32(res_lo, wt1));
+ res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
+ } else {  // plain average of the two values
+ res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
+ }
+
+ __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
+ res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
+ round_bits_shift);
+
+ __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
+ res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
+ _mm_storel_epi64(dst16, res16_lo);
+ } else {  // no averaging: store the intermediate value in conv_params->dst
+ res_lo = _mm_packus_epi32(res_lo, res_lo);
+ _mm_storel_epi64(p, res_lo);
+ }
+ if (p_width > 4) {  // columns 4..7 are processed only when the block is wider than 4
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+
+ res_hi = _mm_add_epi32(res_hi, res_add_const);
+ res_hi =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
+ reduce_bits_vert_shift);
+ if (conv_params->do_average) {
+ __m128i *const dst16_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));
+
+ if (conv_params->use_jnt_comp_avg) {
+ res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
+ _mm_mullo_epi32(res_hi, wt1));
+ res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
+ } else {
+ res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
+ }
+
+ __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
+ res32_hi = _mm_sra_epi32(
+ _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
+ __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
+ res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
+ _mm_storel_epi64(dst16_4, res16_hi);
+ } else {
+ res_hi = _mm_packus_epi32(res_hi, res_hi);
+ _mm_storel_epi64(p4, res_hi);
+ }
+ }
+ } else {
+ // Round and pack into 16 bits
+ const __m128i round_const =
+ _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), reduce_bits_vert);
+
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ // Clamp res_16bit to the range [0, 2^bd - 1]
+ const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i zero = _mm_setzero_si128();
+ res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);
+
+ // Store to 'pred' (this non-compound path does no blending)
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ _mm_storel_epi64(p, res_16bit);
+ } else {
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
new file mode 100644
index 0000000000..0c8a8505b0
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_avx2.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
+// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
+void av1_highbd_wiener_convolve_add_src_avx2(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {  // Two-pass 8-tap filter on 16-bit pixels: rows into 'temp', then columns into 'dst'.
+ assert(x_step_q4 == 16 && y_step_q4 == 16);  // Only step values of 16 are accepted.
+ assert(!(w & 7));  // w must be a multiple of 8.
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);  // NOTE(review): presumably keeps the horizontal output within 16-bit lanes - confirm.
+ (void)x_step_q4;  // Only referenced by the assert above.
+ (void)y_step_q4;  // Only referenced by the assert above.
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(32, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);  // Horizontal-pass output, MAX_SB_SIZE samples per row.
+ int intermediate_height = h + SUBPEL_TAPS - 1;  // The vertical pass reads SUBPEL_TAPS - 1 rows beyond the h output rows.
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;  // Start center_tap rows above and center_tap columns left of 'src'.
+
+ const __m128i zero_128 = _mm_setzero_si128();
+ const __m256i zero_256 = _mm256_setzero_si256();
+
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);  // Lane 3 holds 1 << FILTER_BITS, every other lane is zero.
+
+ const __m256i clamp_low = zero_256;
+
+ /* Horizontal filter */
+ {
+ const __m256i clamp_high_ep =
+ _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const = _mm256_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));  // Rounding term for the round_0 shift plus a (1 << (bd + FILTER_BITS - 1)) bias.
+
+ for (int i = 0; i < intermediate_height; ++i) {
+ for (int j = 0; j < w; j += 16) {  // NOTE(review): 16 columns per step though only w % 8 == 0 is asserted; an odd multiple of 8 touches 8 columns past w - confirm buffers allow it.
+ const uint16_t *src_ij = src_ptr + i * src_stride + j;
+
+ // Load 16-bit src data
+ const __m256i src_0 = yy_loadu_256(src_ij + 0);
+ const __m256i src_1 = yy_loadu_256(src_ij + 1);
+ const __m256i src_2 = yy_loadu_256(src_ij + 2);
+ const __m256i src_3 = yy_loadu_256(src_ij + 3);
+ const __m256i src_4 = yy_loadu_256(src_ij + 4);
+ const __m256i src_5 = yy_loadu_256(src_ij + 5);
+ const __m256i src_6 = yy_loadu_256(src_ij + 6);
+ const __m256i src_7 = yy_loadu_256(src_ij + 7);
+
+ // Multiply src data by filter coeffs and sum pairs
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ // Calculate scalar product for even- and odd-indices separately,
+ // increasing to 32-bit precision
+ const __m256i res_even_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+ const __m256i res_even = _mm256_srai_epi32(
+ _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+
+ const __m256i res_odd_sum = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+ const __m256i res_odd = _mm256_srai_epi32(
+ _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+ // Reduce to 16-bit precision and pack even- and odd-index results
+ // back into one register. The _mm256_packs_epi32 intrinsic returns
+ // a register with the pixels ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ const __m256i res_clamped =
+ _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);  // Clamp to [0, WIENER_CLAMP_LIMIT(round_0, bd) - 1].
+
+ // Store in a temporary array
+ yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);  // Stored in the permuted order above; the vertical pass undoes it.
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
+
+ // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+ const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+ // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+ const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+ const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+ // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+ const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+ // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+ const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+ // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+ const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+ // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+ const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));  // Rounding term minus a bias; NOTE(review): presumably cancels the bias added in the horizontal pass - confirm.
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {  // NOTE(review): same 16-column stride as the horizontal pass; the store below writes 16 pixels into 'dst' per step.
+ const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
+
+ // Load 16-bit data from the output of the horizontal filter in
+ // which the pixels are ordered as follows:
+ // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+ const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
+ const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
+ const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
+ const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
+ const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
+ const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
+ const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
+ const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
+
+ // Filter the even-indices, increasing to 32-bit precision
+ const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);  // Interleave two consecutive rows so madd pairs them with two taps.
+ const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+ const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+ const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+ const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+ const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+ const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+ const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+ const __m256i res_even = _mm256_add_epi32(
+ _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+ // Filter the odd-indices, increasing to 32-bit precision
+ const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+ const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+ const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+ const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+ const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+ const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+ const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+ const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+ const __m256i res_odd = _mm256_add_epi32(
+ _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+ // Pixels are currently in the following order:
+ // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+ // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
+ //
+ // Rearrange the pixels into the following order:
+ // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
+ // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+ const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+ const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+ const __m256i res_lo_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m256i res_hi_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ // Reduce to 16-bit precision and pack into the correct order:
+ // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+ const __m256i res_16bit =
+ _mm256_packs_epi32(res_lo_round, res_hi_round);
+ const __m256i res_16bit_clamped = _mm256_min_epi16(
+ _mm256_max_epi16(res_16bit, clamp_low), clamp_high);  // Clamp to the pixel range [0, (1 << bd) - 1].
+
+ // Store in the dst array
+ yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
new file mode 100644
index 0000000000..818b1099c9
--- /dev/null
+++ b/third_party/aom/av1/common/x86/highbd_wiener_convolve_ssse3.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+void av1_highbd_wiener_convolve_add_src_ssse3(
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w, int h,
+ const ConvolveParams *conv_params, int bd) {  // Two-pass 8-tap filter on 16-bit pixels: rows into 'temp', then columns into 'dst'.
+ assert(x_step_q4 == 16 && y_step_q4 == 16);  // Only step values of 16 are accepted.
+ assert(!(w & 7));  // w must be a multiple of 8; both passes handle 8 columns per iteration.
+ assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);  // NOTE(review): presumably keeps the horizontal output within 16-bit lanes - confirm.
+ (void)x_step_q4;  // Only referenced by the assert above.
+ (void)y_step_q4;  // Only referenced by the assert above.
+
+ const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
+
+ DECLARE_ALIGNED(16, uint16_t,
+ temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);  // Horizontal-pass output, MAX_SB_SIZE samples per row.
+ int intermediate_height = h + SUBPEL_TAPS - 1;  // The vertical pass reads SUBPEL_TAPS - 1 rows beyond the h output rows.
+ int i, j;
+ const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+ const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;  // Start center_tap rows above and center_tap columns left of 'src'.
+
+ const __m128i zero = _mm_setzero_si128();
+ // Add an offset to account for the "add_src" part of the convolve function.
+ const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);  // Lane 3 holds 1 << FILTER_BITS, every other lane is zero.
+
+ /* Horizontal filter */
+ {
+ const __m128i coeffs_x =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));  // Rounding term for the round_0 shift plus a (1 << (bd + FILTER_BITS - 1)) bias.
+
+ for (i = 0; i < intermediate_height; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);  // Source pixels j .. j + 7 of this row.
+ const __m128i data2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);  // Source pixels j + 8 .. j + 15, needed for the shifted windows below.
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
+ const __m128i res_2 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);  // alignr by 4 bytes = window shifted by 2 pixels.
+ const __m128i res_4 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
+ const __m128i res_6 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+ conv_params->round_0);
+
+ // Filter odd-index pixels
+ const __m128i res_1 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);  // Same scheme, starting one pixel later.
+ const __m128i res_3 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
+ const __m128i res_5 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
+ const __m128i res_7 =
+ _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+ conv_params->round_0);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ const __m128i maxval =
+ _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);  // Does not depend on i or j.
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);  // Clamp to [0, maxval].
+ _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);  // Stored in the permuted column order; the vertical pass undoes it.
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const __m128i coeffs_y =
+ _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const =
+ _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+ (1 << (bd + conv_params->round_1 - 1)));  // Rounding term minus a bias; NOTE(review): presumably cancels the bias added in the horizontal pass - confirm.
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const uint16_t *data = &temp[i * MAX_SB_SIZE + j];  // Rows are dereferenced directly as __m128i below; assumes each row start is 16-byte aligned - TODO confirm MAX_SB_SIZE is a multiple of 8.
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));  // Interleave two consecutive rows so madd pairs them with two taps.
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+ *(__m128i *)(data + 1 * MAX_SB_SIZE));  // The high halves hold the odd columns (see the pack order in the horizontal pass).
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+ *(__m128i *)(data + 3 * MAX_SB_SIZE));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+ *(__m128i *)(data + 5 * MAX_SB_SIZE));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+ *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+ const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);  // Does not depend on i or j.
+ __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);  // Clamp to the pixel range [0, (1 << bd) - 1].
+
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+ _mm_storeu_si128(p, res_16bit);
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/intra_edge_sse4.c b/third_party/aom/av1/common/x86/intra_edge_sse4.c
new file mode 100644
index 0000000000..0c857b5835
--- /dev/null
+++ b/third_party/aom/av1/common/x86/intra_edge_sse4.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) {  // Smooths the 8-bit edge p[1 .. sz-1] in place with a 3-tap (strength 1, 2) or 5-tap (strength 3) kernel.
+ if (!strength) return;  // Strength 0 leaves the edge untouched.
+
+ DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = {
+ { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4
+ { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5
+ { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = {  // Five rows declared, four initialised; this function reads rows 0 to 3 only.
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },  // 4-byte windows starting at bytes 0, 1, 2, 3
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 },  // 4-byte windows starting at bytes 4, 5, 6, 7
+ { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 },  // 8-byte windows starting at bytes 0 and 1
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },  // lane indices, compared against n_out to build the store mask
+ };
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];  // Writes one byte before p.
+ __m128i last = _mm_set1_epi8(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);  // Writes 16 bytes starting at p[sz]; the buffer must extend that far.
+
+ // Adjust input pointer for filter support area
+ uint8_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint8_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {  // Up to 8 output samples per iteration.
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);  // Windows for outputs 0..3.
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);  // Windows for outputs 4..7.
+ d0 = _mm_maddubs_epi16(d0, coef0);  // Two partial sums per window.
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // One 16-bit sum per output sample.
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);  // Set for lanes with index < n_out.
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // Keep the existing bytes beyond n_out.
+ _mm_storel_epi64((__m128i *)out, out0);  // Stores 8 bytes.
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));  // Loaded after the store above.
+ in0 = _mm_alignr_epi8(in1, in0, 8);  // Slide the 16-byte window forward by 8 bytes.
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i two = _mm_set1_epi8(2);
+ __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]);  // 8-byte windows starting at bytes 0 and 1.
+ __m128i shuf_b = _mm_add_epi8(shuf_a, two);  // Windows starting at bytes 2 and 3.
+ __m128i shuf_c = _mm_add_epi8(shuf_b, two);  // Windows starting at bytes 4 and 5.
+ __m128i shuf_d = _mm_add_epi8(shuf_c, two);  // Windows starting at bytes 6 and 7.
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)in);
+ while (len > 0) {  // Up to 8 output samples per iteration.
+ int n_out = (len < 8) ? len : 8;
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf_a);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf_b);
+ __m128i d2 = _mm_shuffle_epi8(in0, shuf_c);
+ __m128i d3 = _mm_shuffle_epi8(in0, shuf_d);
+ d0 = _mm_maddubs_epi16(d0, coef0);  // Four partial sums per window.
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);
+ d2 = _mm_hadd_epi16(d2, d3);
+ d0 = _mm_hadd_epi16(d0, d2);  // Two rounds of hadd leave one sum per output sample.
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4
+ d0 = _mm_packus_epi16(d0, d0);
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi8(n_out);
+ __m128i mask = _mm_cmpgt_epi8(n0, iden);  // Set for lanes with index < n_out.
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // Keep the existing bytes beyond n_out.
+ _mm_storel_epi64((__m128i *)out, out0);  // Stores 8 bytes.
+ __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16));  // Loaded after the store above.
+ in0 = _mm_alignr_epi8(in1, in0, 8);  // Slide the 16-byte window forward by 8 bytes.
+ in += 8;
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_filter_intra_edge_high_sse4_1(uint16_t *p, int sz, int strength) {  // 16-bit counterpart: smooths p[1 .. sz-1] in place with a 3-tap (strength 1, 2) or 5-tap (strength 3) kernel.
+ if (!strength) return;  // Strength 0 leaves the edge untouched.
+
+ DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = {  // Weight pairs (outer-sample sum, inner-sample sum), matching the unpack order below.
+ { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4
+ { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5
+ { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2
+ };
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };  // Lane indices, compared against n_out to build the store mask.
+
+ // Extend the first and last samples to simplify the loop for the 5-tap case
+ p[-1] = p[0];  // Writes one sample before p.
+ __m128i last = _mm_set1_epi16(p[sz - 1]);
+ _mm_storeu_si128((__m128i *)&p[sz], last);  // Writes 8 samples starting at p[sz]; the buffer must extend that far.
+
+ // Adjust input pointer for filter support area
+ uint16_t *in = (strength == 3) ? p - 1 : p;
+
+ // Avoid modifying first sample
+ uint16_t *out = p + 1;
+ int len = sz - 1;
+
+ const int use_3tap_filter = (strength < 3);
+
+ if (use_3tap_filter) {
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {  // Up to 8 output samples per iteration.
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);  // Input shifted by one sample.
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);  // Input shifted by two samples.
+ __m128i in02 = _mm_add_epi16(in0, in2);  // The outer taps share one weight, so add them first.
+ __m128i d0 = _mm_unpacklo_epi16(in02, in1);
+ __m128i d1 = _mm_unpackhi_epi16(in02, in1);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // One 16-bit sum per output sample.
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);  // (sum + 8) >> 4, logical shift.
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);  // Set for lanes with index < n_out.
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // Keep the existing samples beyond n_out.
+ _mm_storeu_si128((__m128i *)out, out0);  // Stores 8 samples.
+ in += 8;
+ in0 = in8;  // Reuse the vector loaded before this iteration's store.
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ } else { // 5-tap filter
+ __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]);
+ __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ while (len > 0) {  // Up to 8 output samples per iteration.
+ int n_out = (len < 8) ? len : 8;
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i in4 = _mm_alignr_epi8(in8, in0, 8);
+ __m128i in04 = _mm_add_epi16(in0, in4);  // Outer taps (weight 2).
+ __m128i in123 = _mm_add_epi16(in1, in2);
+ in123 = _mm_add_epi16(in123, in3);  // Inner three taps (weight 4).
+ __m128i d0 = _mm_unpacklo_epi16(in04, in123);
+ __m128i d1 = _mm_unpackhi_epi16(in04, in123);
+ d0 = _mm_mullo_epi16(d0, coef0);
+ d1 = _mm_mullo_epi16(d1, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // One 16-bit sum per output sample.
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d0 = _mm_srli_epi16(d0, 4);  // (sum + 8) >> 4, logical shift.
+ __m128i out0 = _mm_lddqu_si128((__m128i *)out);
+ __m128i n0 = _mm_set1_epi16(n_out);
+ __m128i mask = _mm_cmpgt_epi16(n0, iden);  // Set for lanes with index < n_out.
+ out0 = _mm_blendv_epi8(out0, d0, mask);  // Keep the existing samples beyond n_out.
+ _mm_storeu_si128((__m128i *)out, out0);  // Stores 8 samples.
+ in += 8;
+ in0 = in8;  // Reuse the vector loaded before this iteration's store.
+ in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ out += 8;
+ len -= n_out;
+ }
+ }
+}
+
+void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) {  // Expands the 8-bit edge in place, interleaving each sample with a filtered half-sample; output starts at p[-2].
+ // interpolate half-sample positions
+ assert(sz <= 24);  // With sz <= 24 the loop below runs at most twice.
+
+ DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = {
+ { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 }  // 4-tap weights (-1, 9, 9, -1), sum 16
+ };
+
+ DECLARE_ALIGNED(16, static const int8_t, v_const[2][16]) = {
+ { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 },  // 4-byte windows starting at bytes 0, 1, 2, 3
+ { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }  // 4-byte windows starting at bytes 4, 5, 6, 7
+ };
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint8_t *in = &p[-2];
+ uint8_t *out = &p[-2];  // Same address as 'in': the expansion is done in place.
+
+ int n = sz + 1; // Input length including upper-left sample
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);  // All 32 input bytes are loaded before anything is stored.
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);
+ __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]);
+ __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]);
+
+ while (n > 0) {  // 16 input samples in, 32 output bytes out per iteration.
+ __m128i in8 = _mm_alignr_epi8(in16, in0, 8);
+ __m128i d0 = _mm_shuffle_epi8(in0, shuf0);
+ __m128i d1 = _mm_shuffle_epi8(in0, shuf1);
+ __m128i d2 = _mm_shuffle_epi8(in8, shuf0);
+ __m128i d3 = _mm_shuffle_epi8(in8, shuf1);
+ d0 = _mm_maddubs_epi16(d0, coef0);
+ d1 = _mm_maddubs_epi16(d1, coef0);
+ d2 = _mm_maddubs_epi16(d2, coef0);
+ d3 = _mm_maddubs_epi16(d3, coef0);
+ d0 = _mm_hadd_epi16(d0, d1);  // Filter sums for windows 0..7.
+ d2 = _mm_hadd_epi16(d2, d3);  // Filter sums for windows 8..15.
+ __m128i eight = _mm_set1_epi16(8);
+ d0 = _mm_add_epi16(d0, eight);
+ d2 = _mm_add_epi16(d2, eight);
+ d0 = _mm_srai_epi16(d0, 4);  // (sum + 8) >> 4; arithmetic shift because the sum can be negative.
+ d2 = _mm_srai_epi16(d2, 4);
+ d0 = _mm_packus_epi16(d0, d2);  // Unsigned saturation clamps to [0, 255].
+ __m128i in1 = _mm_alignr_epi8(in16, in0, 1);  // Original samples, offset by one from the window start.
+ __m128i out0 = _mm_unpacklo_epi8(in1, d0);  // Interleave: original sample, then interpolated sample.
+ __m128i out1 = _mm_unpackhi_epi8(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[16], out1);
+ in0 = in16;
+ in16 = _mm_setzero_si128();  // Nothing further is loaded; a second iteration sees zeros past byte 31.
+ out += 32;
+ n -= 16;
+ }
+}
+
+void av1_upsample_intra_edge_high_sse4_1(uint16_t *p, int sz, int bd) {  // 16-bit counterpart: expands the edge in place, clamping interpolated samples to [0, (1 << bd) - 1].
+ // interpolate half-sample positions
+ assert(sz <= 24);  // With sz <= 24 the loop below runs at most four times.
+
+ DECLARE_ALIGNED(16, static const int16_t,
+ kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } };  // Weight pairs (outer-sample sum, inner-sample sum).
+
+ // Extend first/last samples (upper-left p[-1], last p[sz-1])
+ // to support 4-tap filter
+ p[-2] = p[-1];
+ p[sz] = p[sz - 1];
+
+ uint16_t *in = &p[-2];
+ uint16_t *out = in;  // The expansion is done in place.
+ int n = sz + 1;  // Input length including the sample at p[-1].
+
+ __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]);  // All 32 input samples are loaded before anything is stored.
+ __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]);
+ __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]);
+ __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]);
+
+ while (n > 0) {  // 8 input samples in, 16 output samples out per iteration.
+ __m128i in1 = _mm_alignr_epi8(in8, in0, 2);
+ __m128i in2 = _mm_alignr_epi8(in8, in0, 4);
+ __m128i in3 = _mm_alignr_epi8(in8, in0, 6);
+ __m128i sum0 = _mm_add_epi16(in0, in3);  // Outer taps (weight -1).
+ __m128i sum1 = _mm_add_epi16(in1, in2);  // Inner taps (weight 9).
+ __m128i d0 = _mm_unpacklo_epi16(sum0, sum1);
+ __m128i d1 = _mm_unpackhi_epi16(sum0, sum1);
+ __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]);  // Same value every iteration.
+ d0 = _mm_madd_epi16(d0, coef0);  // 32-bit result: -sum0 + 9 * sum1.
+ d1 = _mm_madd_epi16(d1, coef0);
+ __m128i eight = _mm_set1_epi32(8);
+ d0 = _mm_add_epi32(d0, eight);
+ d1 = _mm_add_epi32(d1, eight);
+ d0 = _mm_srai_epi32(d0, 4);  // (sum + 8) >> 4
+ d1 = _mm_srai_epi32(d1, 4);
+ d0 = _mm_packus_epi32(d0, d1);  // Unsigned saturation clamps negatives to 0.
+ __m128i max0 = _mm_set1_epi16((1 << bd) - 1);
+ d0 = _mm_min_epi16(d0, max0);  // Upper clamp to the pixel maximum.
+ __m128i out0 = _mm_unpacklo_epi16(in1, d0);  // Interleave: original sample, then interpolated sample.
+ __m128i out1 = _mm_unpackhi_epi16(in1, d0);
+ _mm_storeu_si128((__m128i *)&out[0], out0);
+ _mm_storeu_si128((__m128i *)&out[8], out1);
+ in0 = in8;
+ in8 = in16;
+ in16 = in24;
+ in24 = _mm_setzero_si128();  // Nothing further is loaded; later iterations see zeros past sample 31.
+ out += 16;
+ n -= 8;
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_avx2.c b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
new file mode 100644
index 0000000000..9f2e2b4578
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_avx2.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+#include "aom_dsp/x86/convolve_sse4_1.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "av1/common/convolve.h"
+
+// Builds a vector whose 16-bit lanes alternate conv_params->fwd_offset and
+// conv_params->bck_offset (fwd in the even lanes, bck in the odd lanes).
+static INLINE __m256i unpack_weights_avx2(ConvolveParams *conv_params) {
+  const __m256i fwd = _mm256_set1_epi16(conv_params->fwd_offset);
+  const __m256i bck = _mm256_set1_epi16(conv_params->bck_offset);
+  return _mm256_unpacklo_epi16(fwd, bck);
+}
+
+// Loads 16 unaligned bytes from each of a and b and returns them as one
+// 256-bit vector: a in the lower 128 bits, b in the upper 128 bits.
+static INLINE __m256i load_line2_avx2(const void *a, const void *b) {
+  const __m128i row_a = _mm_loadu_si128((const __m128i *)a);
+  const __m128i row_b = _mm_loadu_si128((const __m128i *)b);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(row_a), row_b, 1);
+}
+
+// AVX2 horizontal-only compound ("jnt") convolution for 8-bit pixels.
+//
+// Two rows by eight columns are filtered per iteration. Each filtered value is
+// rounded and shifted right by round_0 - 1, shifted left by
+// FILTER_BITS - round_1 and biased by a constant offset. When
+// conv_params->do_average is zero the biased 16-bit values are stored to
+// conv_params->dst. Otherwise they are combined with the values already in
+// conv_params->dst (comp_avg), rounded (convolve_rounding), packed to 8 bits
+// with unsigned saturation and stored to dst0.
+//
+// filter_params_y and subpel_y_q4 are unused.
+void av1_jnt_convolve_x_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  CONV_BUF_TYPE *const conv_buf = conv_params->dst;
+  const int conv_stride = conv_params->dst_stride;
+  const int average = conv_params->do_average;
+  const int jnt_weighted = conv_params->use_jnt_comp_avg;
+  const __m256i weights = unpack_weights_avx2(conv_params);
+
+  // Reads start taps / 2 - 1 pixels to the left of src.
+  const uint8_t *const src_origin = src - (filter_params_x->taps / 2 - 1);
+
+  const int post_shift = FILTER_BITS - round_1;
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0 - round_1;
+  const __m256i offset_vec =
+      _mm256_set1_epi16((1 << offset_bits) + (1 << (offset_bits - 1)));
+  const int final_shift = 2 * FILTER_BITS - round_0 - round_1;
+  const __m256i final_round = _mm256_set1_epi16((1 << final_shift) >> 1);
+
+  assert(post_shift >= 0);
+  assert(round_0 > 0);
+
+  // Four consecutive 32-byte entries of filt_global_avx2, passed to
+  // convolve_lowbd_x below.
+  __m256i shuffles[4], taps[4];
+  for (int k = 0; k < 4; ++k) {
+    shuffles[k] =
+        _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * k));
+  }
+  prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, taps);
+
+  // NOTE(review): the shift is round_0 - 1 rather than round_0, presumably
+  // because prepare_coeffs_lowbd halves the taps (see the "+1" comment in
+  // av1_jnt_convolve_y_avx2) -- confirm.
+  const __m256i h_round = _mm256_set1_epi16((1 << (round_0 - 1)) >> 1);
+  const __m128i h_shift = _mm_cvtsi32_si128(round_0 - 1);
+
+  for (int row = 0; row < h; row += 2) {
+    const uint8_t *const src_row = src_origin + row * src_stride;
+    CONV_BUF_TYPE *const buf_row = conv_buf + row * conv_stride;
+
+    for (int col = 0; col < w; col += 8) {
+      // Row `row` in the low lane, row `row + 1` in the high lane.
+      const __m256i pixels =
+          load_line2_avx2(src_row + col, src_row + col + src_stride);
+
+      __m256i acc = convolve_lowbd_x(pixels, taps, shuffles);
+      acc = _mm256_sra_epi16(_mm256_add_epi16(acc, h_round), h_shift);
+      acc = _mm256_slli_epi16(acc, post_shift);
+      const __m256i biased = _mm256_add_epi16(acc, offset_vec);
+
+      if (!average) {
+        // First prediction: keep the biased 16-bit values for later.
+        _mm_store_si128((__m128i *)(buf_row + col),
+                        _mm256_castsi256_si128(biased));
+        _mm_store_si128((__m128i *)(buf_row + col + conv_stride),
+                        _mm256_extracti128_si256(biased, 1));
+        continue;
+      }
+
+      // Combine with the values already in the compound buffer.
+      const __m256i prev =
+          load_line2_avx2(buf_row + col, buf_row + col + conv_stride);
+      const __m256i blended = comp_avg(&prev, &biased, &weights, jnt_weighted);
+      const __m256i rounded = convolve_rounding(&blended, &offset_vec,
+                                                &final_round, final_shift);
+
+      const __m256i packed = _mm256_packus_epi16(rounded, rounded);
+      const __m128i out0 = _mm256_castsi256_si128(packed);
+      const __m128i out1 = _mm256_extracti128_si256(packed, 1);
+
+      uint8_t *const px0 = &dst0[row * dst_stride0 + col];
+      uint8_t *const px1 = &dst0[row * dst_stride0 + col + dst_stride0];
+      if (w > 4) {
+        _mm_storel_epi64((__m128i *)px0, out0);
+        _mm_storel_epi64((__m128i *)px1, out1);
+      } else {
+        // Only four output pixels per row.
+        *(uint32_t *)px0 = _mm_cvtsi128_si32(out0);
+        *(uint32_t *)px1 = _mm_cvtsi128_si32(out1);
+      }
+    }
+  }
+}
+
+void av1_jnt_convolve_y_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,  // AVX2 vertical-only compound ("jnt") convolution for 8-bit pixels
+ int dst_stride0, int w, int h,  // dst0/dst_stride0: 8-bit output, written only when conv_params->do_average is set
+ const InterpFilterParams *filter_params_x,  // unused
+ const InterpFilterParams *filter_params_y,  // vertical filter; reads start taps / 2 - 1 rows above src
+ const int subpel_x_q4, const int subpel_y_q4,  // subpel_x_q4 is unused; subpel_y_q4 goes to prepare_coeffs_lowbd
+ ConvolveParams *conv_params) {  // supplies the CONV_BUF_TYPE buffer, rounding shifts and blend weights
+ CONV_BUF_TYPE *dst = conv_params->dst;  // compound buffer: read when averaging, written otherwise
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride;  // first row of the filter window
+ // +1 to compensate for dividing the filter coeffs by 2
+ const int left_shift = FILTER_BITS - conv_params->round_0 + 1;
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m256i wt = unpack_weights_avx2(conv_params);  // fwd_offset / bck_offset interleaved in 16-bit lanes
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));  // passed to convolve_rounding via offset_const
+ const __m256i offset_const = _mm256_set1_epi16(offset);
+ const int offset_1 = (1 << (bd + FILTER_BITS - 2));  // after << left_shift and >> round_1 this equals 1 << (offset_0 - 1)
+ const __m256i offset_const_1 = _mm256_set1_epi16(offset_1);  // bias added before widening to 32 bits
+ const __m256i offset_const_2 = _mm256_set1_epi16((1 << offset_0));  // bias added after rounding; with offset_1 it totals `offset`
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i coeffs[4], s[8];  // s[0..3]: window for columns j..j+7, s[4..7]: window for columns j+8..j+15
+
+ assert((FILTER_BITS - conv_params->round_0) >= 0);
+
+ prepare_coeffs_lowbd(filter_params_y, subpel_y_q4, coeffs);
+
+ (void)conv_params;  // has no effect: conv_params is used above
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ for (j = 0; j < w; j += 16) {  // one strip of up to 16 columns per iteration
+ const uint8_t *data = &src_ptr[j];
+ __m256i src6;  // most recently loaded row, carried between iterations of the row loop
+ // Load lines a and b. Line a to lower 128, line b to upper 128
+ {
+ __m256i src_ab[7];
+ __m256i src_a[7];
+ src_a[0] = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ for (int kk = 0; kk < 6; ++kk) {  // loads rows 1..6; src_ab[kk] = row kk (low lane) | row kk + 1 (high lane)
+ data += src_stride;
+ src_a[kk + 1] =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ src_ab[kk] = _mm256_permute2x128_si256(src_a[kk], src_a[kk + 1], 0x20);
+ }
+ src6 = src_a[6];
+ s[0] = _mm256_unpacklo_epi8(src_ab[0], src_ab[1]);  // byte-interleave row pairs: low 8 bytes of each lane
+ s[1] = _mm256_unpacklo_epi8(src_ab[2], src_ab[3]);
+ s[2] = _mm256_unpacklo_epi8(src_ab[4], src_ab[5]);
+ s[4] = _mm256_unpackhi_epi8(src_ab[0], src_ab[1]);  // same pairs: high 8 bytes of each lane
+ s[5] = _mm256_unpackhi_epi8(src_ab[2], src_ab[3]);
+ s[6] = _mm256_unpackhi_epi8(src_ab[4], src_ab[5]);
+ }
+
+ for (i = 0; i < h; i += 2) {  // two output rows per iteration
+ data = &src_ptr[(i + 7) * src_stride + j];  // window row i + 7
+ const __m256i src7 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)data));
+ const __m256i src_67a = _mm256_permute2x128_si256(src6, src7, 0x20);
+
+ src6 = _mm256_castsi128_si256(  // window row i + 8, reused by the next iteration
+ _mm_loadu_si128((__m128i *)(data + src_stride)));
+ const __m256i src_78a = _mm256_permute2x128_si256(src7, src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+ s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+ __m256i res_lo = convolve_lowbd(s, coeffs);  // columns j..j+7: low lane is stored to row i, high lane to row i + 1
+
+ res_lo = _mm256_add_epi16(res_lo, offset_const_1);  // NOTE(review): presumably makes the value non-negative so zero-extension below is valid -- confirm
+
+ const __m256i res_lo_0_32b = _mm256_unpacklo_epi16(res_lo, zero);  // widen to 32 bits by interleaving with zero
+ const __m256i res_lo_0_shift =
+ _mm256_slli_epi32(res_lo_0_32b, left_shift);
+ const __m256i res_lo_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_0_shift, round_const), round_shift);
+
+ const __m256i res_lo_1_32b = _mm256_unpackhi_epi16(res_lo, zero);
+ const __m256i res_lo_1_shift =
+ _mm256_slli_epi32(res_lo_1_32b, left_shift);
+ const __m256i res_lo_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_lo_1_shift, round_const), round_shift);
+
+ const __m256i res_lo_round =
+ _mm256_packs_epi32(res_lo_0_round, res_lo_1_round);  // back to 16 bits with signed saturation
+
+ const __m256i res_lo_unsigned =
+ _mm256_add_epi16(res_lo_round, offset_const_2);
+
+ if (w - j < 16) {  // fewer than 16 columns remain: only the low 8-column half is produced
+ if (do_average) {
+ const __m256i data_ref_0 = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 = _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ if (w - j > 4) {  // store 8 pixels per row
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {  // store 4 pixels per row
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =  // NOTE(review): 32-bit store through a cast uint8_t pointer
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+ }
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);  // aligned store of 8 CONV_BUF_TYPE values
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // full 16-column strip: also filter columns j + 8 .. j + 15
+ __m256i res_hi = convolve_lowbd(s + 4, coeffs);
+
+ res_hi = _mm256_add_epi16(res_hi, offset_const_1);
+
+ const __m256i res_hi_0_32b = _mm256_unpacklo_epi16(res_hi, zero);
+ const __m256i res_hi_0_shift =
+ _mm256_slli_epi32(res_hi_0_32b, left_shift);
+ const __m256i res_hi_0_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_0_shift, round_const), round_shift);
+
+ const __m256i res_hi_1_32b = _mm256_unpackhi_epi16(res_hi, zero);
+ const __m256i res_hi_1_shift =
+ _mm256_slli_epi32(res_hi_1_32b, left_shift);
+ const __m256i res_hi_1_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_hi_1_shift, round_const), round_shift);
+
+ const __m256i res_hi_round =
+ _mm256_packs_epi32(res_hi_0_round, res_hi_1_round);
+
+ const __m256i res_hi_unsigned =
+ _mm256_add_epi16(res_hi_round, offset_const_2);
+
+ if (do_average) {
+ const __m256i data_ref_0_lo = load_line2_avx2(
+ &dst[i * dst_stride + j], &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i data_ref_0_hi =
+ load_line2_avx2(&dst[i * dst_stride + j + 8],
+ &dst[i * dst_stride + j + 8 + dst_stride]);
+
+ const __m256i comp_avg_res_lo =
+ comp_avg(&data_ref_0_lo, &res_lo_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i comp_avg_res_hi =
+ comp_avg(&data_ref_0_hi, &res_hi_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result_lo = convolve_rounding(
+ &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i round_result_hi = convolve_rounding(
+ &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result_lo, round_result_hi);  // per lane: 8 low-half pixels then 8 high-half pixels
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);  // aligned 16-byte store to dst0
+ _mm_store_si128(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+
+ } else {
+ const __m128i res_lo_0 = _mm256_castsi256_si128(res_lo_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_lo_0);
+
+ const __m128i res_lo_1 = _mm256_extracti128_si256(res_lo_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_lo_1);
+
+ const __m128i res_hi_0 = _mm256_castsi256_si128(res_hi_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + 8]), res_hi_0);
+
+ const __m128i res_hi_1 = _mm256_extracti128_si256(res_hi_unsigned, 1);
+ _mm_store_si128(
+ (__m128i *)(&dst[i * dst_stride + j + 8 + dst_stride]), res_hi_1);
+ }
+ }
+ s[0] = s[1];  // slide both windows down by two rows for the next iteration
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+}
+
+void av1_jnt_convolve_2d_avx2(const uint8_t *src, int src_stride, uint8_t *dst0,  // AVX2 separable 2D compound ("jnt") convolution for 8-bit pixels
+ int dst_stride0, int w, int h,  // dst0/dst_stride0: 8-bit output, written only when conv_params->do_average is set
+ const InterpFilterParams *filter_params_x,  // horizontal filter (first pass)
+ const InterpFilterParams *filter_params_y,  // vertical filter (second pass)
+ const int subpel_x_q4, const int subpel_y_q4,  // passed to prepare_coeffs_lowbd / prepare_coeffs respectively
+ ConvolveParams *conv_params) {  // supplies the CONV_BUF_TYPE buffer, rounding shifts and blend weights
+ CONV_BUF_TYPE *dst = conv_params->dst;  // compound buffer: read when averaging, written otherwise
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;
+
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);  // horizontally filtered rows, 8 int16 per row
+ int im_h = h + filter_params_y->taps - 1;  // number of intermediate rows the vertical filter needs
+ int im_stride = 8;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // top-left corner of the 2D filter window
+ const __m256i wt = unpack_weights_avx2(conv_params);  // fwd_offset / bck_offset interleaved in 16-bit lanes
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m256i offset_const = _mm256_set1_epi16(offset);  // bias added to every value kept in / compared against dst
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m256i rounding_const = _mm256_set1_epi16((1 << rounding_shift) >> 1);
+ __m256i filt[4], s[8], coeffs_x[4], coeffs_y[4];  // s[0..3]: window for columns 0..3, s[4..7]: window for columns 4..7
+
+ assert(conv_params->round_0 > 0);
+
+ filt[0] = _mm256_load_si256((__m256i const *)filt_global_avx2);  // four 32-byte entries of filt_global_avx2, used by convolve_lowbd_x
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_q4, coeffs_x);
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+ const __m256i round_const_h = _mm256_set1_epi16(  // rounding term plus a 1 << (bd + FILTER_BITS - 2) bias
+ ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+
+ const __m256i round_const_v = _mm256_set1_epi32(  // rounding term minus a bias; NOTE(review): presumably cancels the horizontal bias after vertical filtering -- confirm
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+
+ for (j = 0; j < w; j += 8) {  // one 8-column strip per iteration
+ /* Horizontal filter: two source rows per iteration, results into im_block */
+ {
+ const uint8_t *src_h = src_ptr + j;
+ for (i = 0; i < im_h; i += 2) {
+ __m256i data =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h));
+ if (i + 1 < im_h)  // the second row is not loaded when row i is the last one (odd im_h)
+ data = _mm256_inserti128_si256(
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1);
+ src_h += (src_stride << 1);
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt);
+
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h),
+ round_shift_h);
+
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);  // 16 int16: the slots for rows i and i + 1
+ }
+ }
+
+ /* Vertical filter: consumes im_block, two output rows per iteration */
+ {
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));  // each 256-bit load spans two 8-wide rows: row k (low lane), row k + 1 (high lane)
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));
+
+ s[0] = _mm256_unpacklo_epi16(s0, s1);  // interleave row pairs: low four 16-bit values of each lane
+ s[1] = _mm256_unpacklo_epi16(s2, s3);
+ s[2] = _mm256_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm256_unpackhi_epi16(s0, s1);  // same pairs: high four 16-bit values of each lane
+ s[5] = _mm256_unpackhi_epi16(s2, s3);
+ s[6] = _mm256_unpackhi_epi16(s4, s5);
+
+ for (i = 0; i < h; i += 2) {
+ const int16_t *data = &im_block[i * im_stride];
+
+ const __m256i s6 =
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
+ const __m256i s7 =
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));
+
+ s[3] = _mm256_unpacklo_epi16(s6, s7);
+ s[7] = _mm256_unpackhi_epi16(s6, s7);
+
+ const __m256i res_a = convolve(s, coeffs_y);  // 32-bit sums built from s[0..3] (the low-half columns)
+ const __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v);
+
+ if (w - j > 4) {  // more than 4 columns remain: also filter the high-half columns
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ const __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v);
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round);  // 8 columns per lane, signed saturation
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);  // low lane -> output row i
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);  // high lane -> output row i + 1
+
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
+ _mm_storel_epi64(
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1);
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);  // aligned store of 8 CONV_BUF_TYPE values
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ } else {  // at most 4 columns remain: res_a is duplicated into both halves of each lane
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round);
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const);
+
+ if (do_average) {
+ const __m256i data_ref_0 =
+ load_line2_avx2(&dst[i * dst_stride + j],
+ &dst[i * dst_stride + j + dst_stride]);
+
+ const __m256i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m256i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m256i res_8 =
+ _mm256_packus_epi16(round_result, round_result);
+ const __m128i res_0 = _mm256_castsi256_si128(res_8);
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1);
+
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =  // NOTE(review): 32-bit store through a cast uint8_t pointer
+ _mm_cvtsi128_si32(res_0);
+ *(uint32_t *)(&dst0[i * dst_stride0 + j + dst_stride0]) =
+ _mm_cvtsi128_si32(res_1);
+
+ } else {
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);  // still a full 16-byte store per row
+
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1);
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
+ res_1);
+ }
+ }
+
+ s[0] = s[1];  // slide both windows down by two rows for the next iteration
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+// AVX2 compound ("jnt") copy path for 8-bit pixels: no filtering is applied.
+//
+// Each source pixel is widened to 16 bits, shifted left by
+// 2 * FILTER_BITS - round_0 - round_1 and biased by a constant offset. When
+// conv_params->do_average is zero the biased values are stored to
+// conv_params->dst. Otherwise they are combined with the values already there
+// (comp_avg), rounded (convolve_rounding), packed to 8 bits with unsigned
+// saturation and stored to dst0.
+//
+// Widths that are a multiple of 16 are processed one row at a time, 16 pixels
+// per step. Other widths that are a multiple of 4 are processed two rows at a
+// time, 8 pixels per step. Any other width does nothing. The filter and
+// sub-pixel arguments are unused.
+void av1_jnt_convolve_2d_copy_avx2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst0, int dst_stride0, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_q4, const int subpel_y_q4,
+                                   ConvolveParams *conv_params) {
+  (void)filter_params_x;
+  (void)filter_params_y;
+  (void)subpel_x_q4;
+  (void)subpel_y_q4;
+
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  CONV_BUF_TYPE *const conv_buf = conv_params->dst;
+  const int conv_stride = conv_params->dst_stride;
+  const int average = conv_params->do_average;
+  const int jnt_weighted = conv_params->use_jnt_comp_avg;
+  const __m256i weights = unpack_weights_avx2(conv_params);
+
+  const int scale_bits = FILTER_BITS * 2 - round_1 - round_0;
+  const __m128i scale_shift = _mm_cvtsi32_si128(scale_bits);
+
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0 - round_1;
+  const __m256i offset_vec =
+      _mm256_set1_epi16((1 << offset_bits) + (1 << (offset_bits - 1)));
+  const int final_shift = 2 * FILTER_BITS - round_0 - round_1;
+  const __m256i final_round = _mm256_set1_epi16((1 << final_shift) >> 1);
+
+  if (w % 16 == 0) {
+    // One row, sixteen pixels per step.
+    for (int row = 0; row < h; ++row) {
+      for (int col = 0; col < w; col += 16) {
+        const __m256i widened = _mm256_cvtepu8_epi16(_mm_loadu_si128(
+            (const __m128i *)&src[row * src_stride + col]));
+        const __m256i biased = _mm256_add_epi16(
+            _mm256_sll_epi16(widened, scale_shift), offset_vec);
+        CONV_BUF_TYPE *const buf = &conv_buf[row * conv_stride + col];
+
+        if (!average) {
+          _mm256_store_si256((__m256i *)buf, biased);
+          continue;
+        }
+
+        const __m256i prev = _mm256_loadu_si256((const __m256i *)buf);
+        const __m256i blended =
+            comp_avg(&prev, &biased, &weights, jnt_weighted);
+        const __m256i rounded = convolve_rounding(&blended, &offset_vec,
+                                                  &final_round, final_shift);
+
+        // packus works per 128-bit lane; the permute gathers the sixteen
+        // result bytes into the low 128 bits in pixel order.
+        const __m256i packed = _mm256_packus_epi16(rounded, rounded);
+        const __m256i ordered = _mm256_permute4x64_epi64(packed, 0xD8);
+        _mm_store_si128((__m128i *)&dst0[row * dst_stride0 + col],
+                        _mm256_castsi256_si128(ordered));
+      }
+    }
+    return;
+  }
+
+  if (w % 4 != 0) return;
+
+  // Two rows, eight pixels per step.
+  const __m256i zero = _mm256_setzero_si256();
+  for (int row = 0; row < h; row += 2) {
+    for (int col = 0; col < w; col += 8) {
+      const uint8_t *const px = &src[row * src_stride + col];
+      const __m128i top = _mm_loadl_epi64((const __m128i *)px);
+      const __m128i bottom =
+          _mm_loadl_epi64((const __m128i *)(px + src_stride));
+      // since not all compilers yet support _mm256_set_m128i()
+      const __m256i pair =
+          _mm256_insertf128_si256(_mm256_castsi128_si256(top), bottom, 1);
+
+      const __m256i widened = _mm256_unpacklo_epi8(pair, zero);
+      const __m256i biased = _mm256_add_epi16(
+          _mm256_sll_epi16(widened, scale_shift), offset_vec);
+      CONV_BUF_TYPE *const buf = &conv_buf[row * conv_stride + col];
+
+      if (!average) {
+        _mm_store_si128((__m128i *)buf, _mm256_castsi256_si128(biased));
+        _mm_store_si128((__m128i *)(buf + conv_stride),
+                        _mm256_extracti128_si256(biased, 1));
+        continue;
+      }
+
+      // Combine with the values already in the compound buffer.
+      const __m256i prev = load_line2_avx2(buf, buf + conv_stride);
+      const __m256i blended = comp_avg(&prev, &biased, &weights, jnt_weighted);
+      const __m256i rounded = convolve_rounding(&blended, &offset_vec,
+                                                &final_round, final_shift);
+
+      const __m256i packed = _mm256_packus_epi16(rounded, rounded);
+      const __m128i out0 = _mm256_castsi256_si128(packed);
+      const __m128i out1 = _mm256_extracti128_si256(packed, 1);
+
+      uint8_t *const out = &dst0[row * dst_stride0 + col];
+      if (w > 4) {
+        _mm_storel_epi64((__m128i *)out, out0);
+        _mm_storel_epi64((__m128i *)(out + dst_stride0), out1);
+      } else {
+        // Only four output pixels per row.
+        *(uint32_t *)out = _mm_cvtsi128_si32(out0);
+        *(uint32_t *)(out + dst_stride0) = _mm_cvtsi128_si32(out1);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_sse2.c b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
new file mode 100644
index 0000000000..87dc3242e8
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_sse2.c
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+// SSE2 horizontal-only compound ("jnt") convolution for 8-bit pixels.
+//
+// Each filtered 32-bit sum is rounded and shifted right by round_0, shifted
+// left by FILTER_BITS - round_1, packed to 16 bits with signed saturation and
+// biased by a constant offset. When conv_params->do_average is zero the biased
+// values are stored to conv_params->dst. Otherwise they are combined with the
+// values already there (comp_avg), rounded (convolve_rounding), packed to
+// 8 bits with unsigned saturation and stored to dst0.
+//
+// w == 4 produces four pixels per row; every other width is asserted to be a
+// multiple of 8 and is handled eight pixels at a time. Both paths are do-while
+// loops, so at least one row (and one 8-pixel group) is always processed.
+// filter_params_y and subpel_y_q4 are unused.
+void av1_jnt_convolve_x_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+                             int dst_stride0, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_q4, const int subpel_y_q4,
+                             ConvolveParams *conv_params) {
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+
+  const int bd = 8;
+  const int round_0 = conv_params->round_0;
+  const int round_1 = conv_params->round_1;
+  CONV_BUF_TYPE *conv_ptr = conv_params->dst;
+  const int conv_stride = conv_params->dst_stride;
+  const int average = conv_params->do_average;
+  const int jnt_weighted = conv_params->use_jnt_comp_avg;
+
+  // Reads start taps / 2 - 1 pixels to the left of src.
+  const uint8_t *src_ptr = src - (filter_params_x->taps / 2 - 1);
+
+  const __m128i h_round = _mm_set1_epi32((1 << round_0) >> 1);
+  const __m128i h_shift = _mm_cvtsi32_si128(round_0);
+  const __m128i post_shift = _mm_cvtsi32_si128(FILTER_BITS - round_1);
+
+  // fwd_offset in the even 16-bit lanes, bck_offset in the odd ones.
+  const __m128i weights =
+      _mm_unpacklo_epi16(_mm_set1_epi16(conv_params->fwd_offset),
+                         _mm_set1_epi16(conv_params->bck_offset));
+
+  const int offset_bits = bd + 2 * FILTER_BITS - round_0 - round_1;
+  const __m128i offset_vec =
+      _mm_set1_epi16((1 << offset_bits) + (1 << (offset_bits - 1)));
+  const int final_shift = 2 * FILTER_BITS - round_0 - round_1;
+  const __m128i final_round = _mm_set1_epi16((1 << final_shift) >> 1);
+
+  __m128i taps[4];
+  prepare_coeffs(filter_params_x, subpel_x_q4, taps);
+
+  if (w == 4) {
+    int rows_left = h;
+    do {
+      const __m128i px = _mm_loadu_si128((const __m128i *)src_ptr);
+
+      // Byte-interleave the row with copies of itself shifted by one byte.
+      __m128i pairs[4];
+      pairs[0] = _mm_unpacklo_epi8(px, _mm_srli_si128(px, 1));
+      pairs[1] =
+          _mm_unpacklo_epi8(_mm_srli_si128(px, 2), _mm_srli_si128(px, 3));
+      pairs[2] =
+          _mm_unpacklo_epi8(_mm_srli_si128(px, 4), _mm_srli_si128(px, 5));
+      pairs[3] =
+          _mm_unpacklo_epi8(_mm_srli_si128(px, 6), _mm_srli_si128(px, 7));
+
+      const __m128i sum = convolve_lo_x(pairs, taps);
+      const __m128i scaled = _mm_sll_epi32(
+          _mm_sra_epi32(_mm_add_epi32(sum, h_round), h_shift), post_shift);
+      const __m128i biased =
+          _mm_add_epi16(_mm_packs_epi32(scaled, scaled), offset_vec);
+
+      if (average) {
+        // Combine with the values already in the compound buffer.
+        const __m128i prev = _mm_loadu_si128((const __m128i *)conv_ptr);
+        const __m128i blended =
+            comp_avg(&prev, &biased, &weights, jnt_weighted);
+        const __m128i rounded = convolve_rounding(&blended, &offset_vec,
+                                                  &final_round, final_shift);
+        const __m128i packed = _mm_packus_epi16(rounded, rounded);
+        *(uint32_t *)dst0 = _mm_cvtsi128_si32(packed);
+      } else {
+        _mm_store_si128((__m128i *)conv_ptr, biased);
+      }
+
+      src_ptr += src_stride;
+      conv_ptr += conv_stride;
+      dst0 += dst_stride0;
+    } while (--rows_left);
+    return;
+  }
+
+  assert(!(w % 8));
+  int row = 0;
+  do {
+    const uint8_t *const in = src_ptr + row * src_stride;
+    int col = 0;
+    do {
+      const __m128i px = _mm_loadu_si128((const __m128i *)(in + col));
+      __m128i lanes[4];
+
+      // Filter even-index pixels
+      lanes[0] = px;
+      lanes[1] = _mm_srli_si128(px, 2);
+      lanes[2] = _mm_srli_si128(px, 4);
+      lanes[3] = _mm_srli_si128(px, 6);
+      const __m128i even = convolve_lo_x(lanes, taps);
+
+      // Filter odd-index pixels
+      lanes[0] = _mm_srli_si128(px, 1);
+      lanes[1] = _mm_srli_si128(px, 3);
+      lanes[2] = _mm_srli_si128(px, 5);
+      lanes[3] = _mm_srli_si128(px, 7);
+      const __m128i odd = convolve_lo_x(lanes, taps);
+
+      // Rearrange pixels back into the order 0 ... 7
+      const __m128i lo = _mm_unpacklo_epi32(even, odd);
+      const __m128i hi = _mm_unpackhi_epi32(even, odd);
+      const __m128i lo_scaled = _mm_sll_epi32(
+          _mm_sra_epi32(_mm_add_epi32(lo, h_round), h_shift), post_shift);
+      const __m128i hi_scaled = _mm_sll_epi32(
+          _mm_sra_epi32(_mm_add_epi32(hi, h_round), h_shift), post_shift);
+
+      const __m128i biased =
+          _mm_add_epi16(_mm_packs_epi32(lo_scaled, hi_scaled), offset_vec);
+      CONV_BUF_TYPE *const buf = &conv_ptr[row * conv_stride + col];
+
+      if (average) {
+        // Combine with the values already in the compound buffer.
+        const __m128i prev = _mm_loadu_si128((const __m128i *)buf);
+        const __m128i blended =
+            comp_avg(&prev, &biased, &weights, jnt_weighted);
+        const __m128i rounded = convolve_rounding(&blended, &offset_vec,
+                                                  &final_round, final_shift);
+        const __m128i packed = _mm_packus_epi16(rounded, rounded);
+        _mm_storel_epi64((__m128i *)&dst0[row * dst_stride0 + col], packed);
+      } else {
+        _mm_store_si128((__m128i *)buf, biased);
+      }
+      col += 8;
+    } while (col < w);
+  } while (++row < h);
+}
+
+void av1_jnt_convolve_y_sse2(const uint8_t *src, int src_stride, uint8_t *dst0,
+ int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ CONV_BUF_TYPE *dst = conv_params->dst;
+ const int dst_stride = conv_params->dst_stride;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint8_t *src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i left_shift = _mm_cvtsi32_si128(bits);
+ const __m128i wt0 = _mm_set1_epi16(conv_params->fwd_offset);
+ const __m128i wt1 = _mm_set1_epi16(conv_params->bck_offset);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+ const __m128i round_const = _mm_set1_epi32((1 << conv_params->round_1) >> 1);
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ __m128i coeffs[4];
+
+ (void)filter_params_x;
+ (void)subpel_x_q4;
+
+ prepare_coeffs(filter_params_y, subpel_y_q4, coeffs);
+
+ if (w == 4) {
+ __m128i s[8], src6, res, res_shift;
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+
+ do {
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+ src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+
+ res = convolve_lo_y(s + 0, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_shift, res_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ res = convolve_lo_y(s + 1, coeffs);
+ res_shift = _mm_sll_epi32(res, left_shift);
+ res_shift =
+ _mm_sra_epi32(_mm_add_epi32(res_shift, round_const), round_shift);
+
+ res_16b = _mm_packs_epi32(res_shift, res_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 = _mm_loadu_si128((__m128i *)dst);
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ *(uint32_t *)(&dst0[0]) = _mm_cvtsi128_si32(res_8);
+
+ } else {
+ _mm_store_si128((__m128i *)dst, res_unsigned);
+ }
+
+ src_ptr += src_stride;
+ dst += dst_stride;
+ dst0 += dst_stride0;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ h -= 2;
+ } while (h);
+ } else {
+ assert(!(w % 8));
+ int j = 0;
+ do {
+ __m128i s[8], src6, res_lo, res_hi, res_lo_shift, res_hi_shift;
+ const uint8_t *data = &src_ptr[j];
+
+ src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+ s[0] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+ s[1] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+ s[2] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+ s[3] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+ s[4] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+ s[5] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+ int i = 0;
+ do {
+ data = &src_ptr[i * src_stride + j];
+ s[6] = _mm_unpacklo_epi8(
+ src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+ src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+ s[7] = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+ res_lo = convolve_lo_y(s, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+
+ __m128i res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ res_lo = convolve_lo_y(s + 1, coeffs); // Filter low index pixels
+ res_hi = convolve_hi_y(s + 1, coeffs); // Filter high index pixels
+ res_lo_shift = _mm_sll_epi32(res_lo, left_shift);
+ res_hi_shift = _mm_sll_epi32(res_hi, left_shift);
+ res_lo_shift = _mm_sra_epi32(_mm_add_epi32(res_lo_shift, round_const),
+ round_shift);
+ res_hi_shift = _mm_sra_epi32(_mm_add_epi32(res_hi_shift, round_const),
+ round_shift);
+ res_16b = _mm_packs_epi32(res_lo_shift, res_hi_shift);
+ res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);
+ }
+ i++;
+
+ s[0] = s[2];
+ s[1] = s[3];
+ s[2] = s[4];
+ s[3] = s[5];
+ s[4] = s[6];
+ s[5] = s[7];
+ } while (i < h);
+ j += 8;
+ } while (j < w);
+ }
+}
diff --git a/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
new file mode 100644
index 0000000000..822772782b
--- /dev/null
+++ b/third_party/aom/av1/common/x86/jnt_convolve_ssse3.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_jnt_convolve_2d_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst0, int dst_stride0, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_q4, const int subpel_y_q4,
+ ConvolveParams *conv_params) {  // Two-pass (horizontal, then vertical) convolution of an 8-bit block using 8 coefficients per direction; stores 16-bit results in conv_params->dst, or, when conv_params->do_average is set, combines them with conv_params->dst and writes 8-bit pixels to dst0.
+ CONV_BUF_TYPE *dst = conv_params->dst;  // compound buffer: written when not averaging, read back when averaging
+ int dst_stride = conv_params->dst_stride;
+ const int bd = 8;  // bit depth used by the offset and rounding terms below
+ // Scratch for the horizontally filtered rows: im_stride (MAX_SB_SIZE) int16 values per row, 16-byte aligned.
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;  // number of rows the horizontal pass produces
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;  // rows above the block included in the source window
+ const int fo_horiz = filter_params_x->taps / 2 - 1;  // columns left of the block included in the source window
+ const int do_average = conv_params->do_average;
+ const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;  // top-left of the source window
+
+ const __m128i zero = _mm_setzero_si128();
+ // Interleave the two compound weights as 16-bit pairs (fwd, bck, fwd, bck, ...); passed to comp_avg().
+ const int w0 = conv_params->fwd_offset;
+ const int w1 = conv_params->bck_offset;
+ const __m128i wt0 = _mm_set1_epi16(w0);
+ const __m128i wt1 = _mm_set1_epi16(w1);
+ const __m128i wt = _mm_unpacklo_epi16(wt0, wt1);
+ // offset_const is added to every 16-bit result before it is stored or averaged; convolve_rounding() is given the same constant.
+ const int offset_0 =
+ bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
+ const __m128i offset_const = _mm_set1_epi16(offset);
+ const int rounding_shift =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_q4 & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);  // loads 8 16-bit coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // Rounding term for the round_0 shift plus a bias of 1 << (bd + FILTER_BITS - 1).
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+ // Each inner step reads 16 source bytes at column j and writes 8 filtered int16 values.
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ // Zero-extend the 16 bytes to 16-bit lanes (low 8 and high 8 pixels).
+ const __m128i src_lo = _mm_unpacklo_epi8(data, zero);
+ const __m128i src_hi = _mm_unpackhi_epi8(data, zero);
+
+ // Filter even-index pixels
+ const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01);
+ const __m128i src_2 = _mm_alignr_epi8(src_hi, src_lo, 4);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 = _mm_alignr_epi8(src_hi, src_lo, 8);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 = _mm_alignr_epi8(src_hi, src_lo, 12);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 = _mm_alignr_epi8(src_hi, src_lo, 2);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 = _mm_alignr_epi8(src_hi, src_lo, 6);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 = _mm_alignr_epi8(src_hi, src_lo, 10);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 = _mm_alignr_epi8(src_hi, src_lo, 14);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res);  // aligned store; the vertical pass relies on this permuted column order
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_q4 & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);  // loads 8 16-bit coefficients
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ // Rounding term for the round_1 shift minus 1 << (bd + 2 * FILTER_BITS - round_0 - 1). NOTE(review): presumably cancels the bias added in the horizontal pass - confirm.
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << conv_params->round_1) >> 1) -
+ (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1);
+ // Each inner step combines 8 consecutive im_block rows into one row of 8 outputs.
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+ // Each madd pairs two vertically adjacent rows with two adjacent coefficients.
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ const __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+ const __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+ // Saturate to 16 bits, then add offset_const (the name suggests this keeps the stored value non-negative - confirm).
+ const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const);
+
+ // Accumulate values into the destination buffer
+ if (do_average) {
+ const __m128i data_ref_0 =
+ _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));
+
+ const __m128i comp_avg_res =
+ comp_avg(&data_ref_0, &res_unsigned, &wt, use_jnt_comp_avg);
+
+ const __m128i round_result = convolve_rounding(
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift);
+
+ const __m128i res_8 = _mm_packus_epi16(round_result, round_result);
+ // Write 8 output pixels, or only 4 when the block is at most 4 wide.
+ if (w > 4)
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8);
+ else
+ *(uint32_t *)(&dst0[i * dst_stride0 + j]) =
+ _mm_cvtsi128_si32(res_8);
+ } else {
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned);  // aligned 16-byte store; NOTE(review): writes 8 values even when w == 4 - confirm dst has room and alignment
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_avx2.c b/third_party/aom/av1/common/x86/reconinter_avx2.c
new file mode 100644
index 0000000000..f645e04541
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_avx2.c
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m256i calc_mask_avx2(const __m256i mask_base, const __m256i s0,
+                                     const __m256i s1) {
+  // Lane-wise |mask_base + (|s0 - s1| >> 4)|. Not clamped to [0, 64].
+  const __m256i gap = _mm256_abs_epi16(_mm256_sub_epi16(s0, s1));
+  const __m256i step = _mm256_srli_epi16(gap, 4);  // <= 15 for 8-bit inputs
+  return _mm256_abs_epi16(_mm256_add_epi16(mask_base, step));
+}
+void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {  // Fills mask (rows stored back to back, w bytes each) from the per-pixel absolute difference of two 8-bit blocks via calc_mask_avx2().
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;  // the inverse mask type lowers the base by AOM_BLEND_A64_MAX_ALPHA
+ const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);  // base passed to calc_mask_avx2(), which takes the absolute value of base + (diff >> 4)
+ int i = 0;  // rows processed so far
+ if (4 == w) {  // 4 rows of 4 pixels per iteration -> 16 mask bytes; NOTE(review): presumably h is a multiple of 4 - confirm
+ do {
+ const __m128i s0A = xx_loadl_32(src0);
+ const __m128i s0B = xx_loadl_32(src0 + stride0);
+ const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
+ const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
+ const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
+ const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);  // zero-extend the 16 pixels to 16-bit lanes
+ // Same gathering for the second source block.
+ const __m128i s1A = xx_loadl_32(src1);
+ const __m128i s1B = xx_loadl_32(src1 + stride1);
+ const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
+ const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
+ const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
+ const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());  // per 128-bit lane: 8 mask bytes followed by 8 zero bytes
+ const __m128i x_m8 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));  // 0xd8 orders the 64-bit quarters 0,2,1,3, bringing both mask halves into the low 128 bits
+ xx_storeu_128(mask, x_m8);
+ src0 += (stride0 << 2);
+ src1 += (stride1 << 2);
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (8 == w) {  // 4 rows of 8 pixels per iteration -> 32 mask bytes; NOTE(review): presumably h is a multiple of 4 - confirm
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + stride0);
+ const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
+ const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
+ const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));  // rows A and C in one register
+ const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));  // rows B and D in the other
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + stride1);
+ const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
+ const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
+ const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));  // NOTE: despite the name, holds src1 rows A and C
+ const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));  // NOTE: despite the name, holds src1 rows B and D
+ const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w);
+ const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w);
+ const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);  // the per-lane pack yields rows A,B in the low lane and C,D in the high lane, so no permute is needed
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 2;
+ src1 += stride1 << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (16 == w) {  // 2 rows of 16 pixels per iteration -> 32 mask bytes
+ do {
+ const __m128i s0A = xx_load_128(src0);  // NOTE(review): xx_load_128 rather than the loadu variants used in the other branches - presumably an aligned load; confirm row alignment
+ const __m128i s0B = xx_load_128(src0 + stride0);
+ const __m128i s1A = xx_load_128(src1);
+ const __m128i s1B = xx_load_128(src1 + stride1);
+ const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
+ const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
+ const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
+ const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);
+ // One mask vector per row.
+ const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
+ const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);
+ // The per-lane pack interleaves the two rows in 8-byte groups; the 0xd8 permute puts each row's 16 bytes back together.
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
+ yy_storeu_256(mask, m8);
+ src0 += stride0 << 1;
+ src1 += stride1 << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else {  // remaining widths: 32 pixels per inner step, one row per outer iteration
+ do {
+ int j = 0;
+ do {
+ const __m256i s0 = yy_loadu_256(src0 + j);  // NOTE(review): 32 bytes are read and written per step, so w is presumably a multiple of 32 here - confirm
+ const __m256i s1 = yy_loadu_256(src1 + j);
+ const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
+ const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
+ const __m256i s0H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
+ const __m256i s1H =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
+ const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
+ const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
+ const __m256i m8 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
+ yy_storeu_256(mask + j, m8);
+ j += 32;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;  // mask rows are stored back to back, w bytes apart
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
+                                         const __m256i *data_src1,
+                                         const __m256i *round_const,
+                                         const __m256i *mask_base_16,
+                                         const __m256i *clip_diff, int round) {
+  // |a - b| per unsigned 16-bit lane: one saturating difference is zero and
+  // the other is the magnitude, so their maximum is the absolute difference.
+  const __m256i a_minus_b = _mm256_subs_epu16(*data_src0, *data_src1);
+  const __m256i b_minus_a = _mm256_subs_epu16(*data_src1, *data_src0);
+  const __m256i abs_diff = _mm256_max_epu16(a_minus_b, b_minus_a);
+  // Rounding shift by `round`, shift by DIFF_FACTOR_LOG2, add base, clip.
+  __m256i m = _mm256_adds_epu16(abs_diff, *round_const);
+  m = _mm256_srli_epi16(_mm256_srli_epi16(m, round), DIFF_FACTOR_LOG2);
+  return _mm256_min_epi16(_mm256_adds_epi16(m, *mask_base_16), *clip_diff);
+}
+
+static INLINE __m256i calc_mask_d16_inv_avx2(const __m256i *data_src0,
+                                             const __m256i *data_src1,
+                                             const __m256i *round_const,
+                                             const __m256i *mask_base_16,
+                                             const __m256i *clip_diff,
+                                             int round) {
+  // Lane-wise |src0 - src1| built from the two saturating differences.
+  const __m256i abs_diff =
+      _mm256_max_epu16(_mm256_subs_epu16(*data_src0, *data_src1),
+                       _mm256_subs_epu16(*data_src1, *data_src0));
+  // Rounding shift by `round`, then the DIFF_FACTOR_LOG2 shift.
+  __m256i m = _mm256_adds_epu16(abs_diff, *round_const);
+  m = _mm256_srli_epi16(_mm256_srli_epi16(m, round), DIFF_FACTOR_LOG2);
+  // Add the base, clip from above, then invert against the clip value.
+  m = _mm256_min_epi16(_mm256_adds_epi16(m, *mask_base_16), *clip_diff);
+  return _mm256_sub_epi16(*clip_diff, m);
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {  // Fills mask (rows stored back to back) from two CONV_BUF_TYPE blocks, one calc_mask_d16_avx2() result per pixel; the branch is chosen by block width.
+ const int mask_base = 38;  // base added to the scaled difference
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);  // rounding term for the shift by `shift`
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);  // upper clip applied to the mask value
+ int i = 0;  // rows processed so far
+ if (w == 4) {  // 4 rows of 4 values per iteration -> 16 mask bytes
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 = calc_mask_d16_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());  // per 128-bit lane: 8 mask bytes followed by 8 zero bytes
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));  // 0xd8 orders the 64-bit quarters 0,2,1,3, bringing both mask halves into the low 128 bits
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {  // 4 rows of 8 values per iteration -> 32 mask bytes
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);  // two rows per register, one in each 128-bit half
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);  // the pack works per 128-bit lane, so rows come out interleaved
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));  // swapping the two middle 64-bit quarters restores row order
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {  // 2 rows per iteration, one 256-bit load per row -> 32 mask bytes
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));  // permute undoes the per-lane interleave of the pack
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {  // one row per iteration, two loads 16 elements apart -> 32 mask bytes
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {  // one row per iteration, four loads -> 64 mask bytes
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {  // remaining case: eight loads per row at element offsets 0..112 -> 128 mask bytes; NOTE(review): presumably w == 128 - confirm against callers
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+static INLINE void build_compound_diffwtd_mask_d16_inv_avx2(
+ uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) {
+ const int mask_base = 38;
+ const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1);
+ const __m256i y38 = _mm256_set1_epi16(mask_base);
+ const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ int i = 0;
+ if (w == 4) {
+ do {
+ const __m128i s0A = xx_loadl_64(src0);
+ const __m128i s0B = xx_loadl_64(src0 + src0_stride);
+ const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2);
+ const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3);
+ const __m128i s1A = xx_loadl_64(src1);
+ const __m128i s1B = xx_loadl_64(src1 + src1_stride);
+ const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2);
+ const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3);
+ const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D),
+ _mm_unpacklo_epi64(s0A, s0B));
+ const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D),
+ _mm_unpacklo_epi64(s1A, s1B));
+ const __m256i m16 =
+ calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
+ xx_storeu_128(mask,
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 16;
+ i += 4;
+ } while (i < h);
+ } else if (w == 8) {
+ do {
+ const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0);
+ const __m256i s0CD =
+ yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2);
+ const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1);
+ const __m256i s1CD =
+ yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2);
+ const __m256i m16AB =
+ calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift);
+ const __m256i m16CD =
+ calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 2;
+ src1 += src1_stride << 2;
+ mask += 32;
+ i += 4;
+ } while (i < h);
+ } else if (w == 16) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + src1_stride);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride << 1;
+ src1 += src1_stride << 1;
+ mask += 32;
+ i += 2;
+ } while (i < h);
+ } else if (w == 32) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m8 = _mm256_packus_epi16(m16A, m16B);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 32;
+ i += 1;
+ } while (i < h);
+ } else if (w == 64) {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 64;
+ i += 1;
+ } while (i < h);
+ } else {
+ do {
+ const __m256i s0A = yy_loadu_256(src0);
+ const __m256i s0B = yy_loadu_256(src0 + 16);
+ const __m256i s0C = yy_loadu_256(src0 + 32);
+ const __m256i s0D = yy_loadu_256(src0 + 48);
+ const __m256i s0E = yy_loadu_256(src0 + 64);
+ const __m256i s0F = yy_loadu_256(src0 + 80);
+ const __m256i s0G = yy_loadu_256(src0 + 96);
+ const __m256i s0H = yy_loadu_256(src0 + 112);
+ const __m256i s1A = yy_loadu_256(src1);
+ const __m256i s1B = yy_loadu_256(src1 + 16);
+ const __m256i s1C = yy_loadu_256(src1 + 32);
+ const __m256i s1D = yy_loadu_256(src1 + 48);
+ const __m256i s1E = yy_loadu_256(src1 + 64);
+ const __m256i s1F = yy_loadu_256(src1 + 80);
+ const __m256i s1G = yy_loadu_256(src1 + 96);
+ const __m256i s1H = yy_loadu_256(src1 + 112);
+ const __m256i m16A =
+ calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift);
+ const __m256i m16B =
+ calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift);
+ const __m256i m16C =
+ calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift);
+ const __m256i m16D =
+ calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift);
+ const __m256i m16E =
+ calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift);
+ const __m256i m16F =
+ calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift);
+ const __m256i m16G =
+ calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift);
+ const __m256i m16H =
+ calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift);
+ const __m256i m8AB = _mm256_packus_epi16(m16A, m16B);
+ const __m256i m8CD = _mm256_packus_epi16(m16C, m16D);
+ const __m256i m8EF = _mm256_packus_epi16(m16E, m16F);
+ const __m256i m8GH = _mm256_packus_epi16(m16G, m16H);
+ yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8));
+ yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8));
+ yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8));
+ yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8));
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 128;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_avx2(  // AVX2 entry point for the diff-weighted mask over CONV_BUF_TYPE sources: derives the shift, then dispatches on mask_type.
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int shift =  // Shift handed to the kernels, built from FILTER_BITS, the two conv_params rounding amounts and bd.
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ // Adding the rounding constant can overflow, but that much precision is not
+ // required. The code should also work for other values of DIFF_FACTOR_LOG2
+ // and AOM_BLEND_A64_MAX_ALPHA, but corner-case bugs are possible there, so
+ // the values it was written for are asserted below.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+
+ if (mask_type == DIFFWTD_38) {  // DIFFWTD_38 takes the direct kernel ...
+ build_compound_diffwtd_mask_d16_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ } else {  // ... and every other mask_type takes the inverse kernel.
+ build_compound_diffwtd_mask_d16_inv_avx2(mask, src0, src0_stride, src1,
+ src1_stride, h, w, shift);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_highbd_avx2(  // Diff-weighted mask from two 16-bit sample planes, 16 pixels per AVX2 iteration; mask rows are w bytes apart.
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 16) {  // Blocks narrower than one 16-sample vector are delegated to the SSSE3 version.
+ av1_build_compound_diffwtd_mask_highbd_ssse3(
+ mask, mask_type, src0, src0_stride, src1, src1_stride, h, w, bd);
+ } else {
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ assert(bd >= 8);
+ assert((w % 16) == 0);  // The inner loops advance 16 samples at a time.
+ const __m256i y0 = _mm256_setzero_si256();
+ const __m256i yAOM_BLEND_A64_MAX_ALPHA =
+ _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m256i ymask_base = _mm256_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);  // The sources are read as 16-bit samples from here on.
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {  // bd == 8: the shift is the immediate DIFF_FACTOR_LOG2.
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(  // |s0 - s1| >> DIFF_FACTOR_LOG2
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(  // clamp(diff + mask_base, 0, AOM_BLEND_A64_MAX_ALPHA)
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);  // Inverse mask: AOM_BLEND_A64_MAX_ALPHA - m.
+ m = _mm256_packus_epi16(m, m);  // Narrow to bytes; each 128-bit half now holds its 8 results twice.
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));  // Bring 64-bit words 0 and 2 together in the low 128 bits.
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);  // Write 16 mask bytes.
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;  // Mask rows are stored back to back (stride w).
+ }
+ } else {  // DIFFWTD_38: same as above without the final inversion.
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_srai_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), DIFF_FACTOR_LOG2);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {  // bd > 8: the shift grows by (bd - 8) and is supplied as a run-time count register.
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(  // |s0 - s1| >> (bd - 8 + DIFF_FACTOR_LOG2)
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_sub_epi16(yAOM_BLEND_A64_MAX_ALPHA, m);  // Inverse mask.
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {  // DIFFWTD_38 with the run-time shift.
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ __m256i s0 = _mm256_loadu_si256((const __m256i *)&ssrc0[j]);
+ __m256i s1 = _mm256_loadu_si256((const __m256i *)&ssrc1[j]);
+ __m256i diff = _mm256_sra_epi16(
+ _mm256_abs_epi16(_mm256_sub_epi16(s0, s1)), xshift);
+ __m256i m = _mm256_min_epi16(
+ _mm256_max_epi16(y0, _mm256_add_epi16(diff, ymask_base)),
+ yAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm256_packus_epi16(m, m);
+ m = _mm256_permute4x64_epi64(m, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i m0 = _mm256_castsi256_si128(m);
+ _mm_storeu_si128((__m128i *)&mask[j], m0);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_sse4.c b/third_party/aom/av1/common/x86/reconinter_sse4.c
new file mode 100644
index 0000000000..5171ca4934
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_sse4.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "av1/common/blockd.h"
+
+static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0,  // Per 16-bit lane: |mask_base + (|s0 - s1| >> 4)|.
+ const __m128i s1) {
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(s0, s1));
+ return _mm_abs_epi16(_mm_add_epi16(mask_base, _mm_srli_epi16(diff, 4)));  // The outer abs turns a negative mask_base (inverse mask) back into a positive weight.
+ // clamp(..., 0, 64) can be skipped: with 8-bit inputs 38 + (diff >> 4) always stays within [38, 53]
+}
+
+void av1_build_compound_diffwtd_mask_sse4_1(uint8_t *mask,  // Diff-weighted mask from two 8-bit planes; mask rows are written w bytes apart.
+ DIFFWTD_MASK_TYPE mask_type,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ int h, int w) {
+ const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
+ const __m128i mask_base = _mm_set1_epi16(38 - mb);  // 38 for the direct mask, 38 - AOM_BLEND_A64_MAX_ALPHA for the inverse; calc_mask() takes the absolute value of the sum.
+ int i = 0;
+ if (4 == w) {  // w == 4: two rows are packed into one vector per iteration.
+ do {
+ const __m128i s0A = _mm_cvtsi32_si128(*(uint32_t *)src0);  // NOTE(review): 32-bit access through a cast uint8_t pointer (here and below) - confirm alignment/aliasing is acceptable.
+ const __m128i s0B = _mm_cvtsi32_si128(*(uint32_t *)(src0 + stride0));
+ const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);  // Row i in bytes 0..3, row i + 1 in bytes 4..7.
+ const __m128i s0 = _mm_cvtepu8_epi16(s0AB);
+
+ const __m128i s1A = _mm_cvtsi32_si128(*(uint32_t *)src1);
+ const __m128i s1B = _mm_cvtsi32_si128(*(uint32_t *)(src1 + stride1));
+ const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
+ const __m128i s1 = _mm_cvtepu8_epi16(s1AB);
+
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+
+ *(uint32_t *)mask = _mm_cvtsi128_si32(m8);  // First row of the pair.
+ *(uint32_t *)(mask + w) = _mm_extract_epi32(m8, 1);  // Second row, w bytes further on.
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += 8;
+ i += 2;
+ } while (i < h);
+ } else if (8 == w) {  // w == 8: one row (8 bytes in, 8 bytes out) per iteration.
+ do {
+ __m128i s0 = _mm_loadl_epi64((__m128i const *)src0);
+ __m128i s1 = _mm_loadl_epi64((__m128i const *)src1);
+ s0 = _mm_cvtepu8_epi16(s0);
+ s1 = _mm_cvtepu8_epi16(s1);
+ const __m128i m16 = calc_mask(mask_base, s0, s1);
+ const __m128i m8 = _mm_packus_epi16(m16, m16);
+ _mm_storel_epi64((__m128i *)mask, m8);
+ src0 += stride0;
+ src1 += stride1;
+ mask += 8;
+ i += 1;
+ } while (i < h);
+ } else {  // Any other width: 16 pixels per inner iteration.
+ const __m128i zero = _mm_setzero_si128();
+ do {
+ int j = 0;
+ do {
+ const __m128i s0 = _mm_load_si128((__m128i const *)(src0 + j));  // Aligned load: the address must be 16-byte aligned - TODO confirm callers guarantee this.
+ const __m128i s1 = _mm_load_si128((__m128i const *)(src1 + j));
+ const __m128i s0L = _mm_cvtepu8_epi16(s0);  // Low 8 pixels widened to 16 bits.
+ const __m128i s1L = _mm_cvtepu8_epi16(s1);
+ const __m128i s0H = _mm_unpackhi_epi8(s0, zero);  // High 8 pixels widened to 16 bits.
+ const __m128i s1H = _mm_unpackhi_epi8(s1, zero);
+
+ const __m128i m16L = calc_mask(mask_base, s0L, s1L);
+ const __m128i m16H = calc_mask(mask_base, s0H, s1H);
+
+ const __m128i m8 = _mm_packus_epi16(m16L, m16H);
+ _mm_store_si128((__m128i *)(mask + j), m8);  // Aligned store: mask + j must be 16-byte aligned as well.
+ j += 16;
+ } while (j < w);
+ src0 += stride0;
+ src1 += stride1;
+ mask += w;
+ i += 1;
+ } while (i < h);
+ }
+}
+
+void av1_build_compound_diffwtd_mask_d16_sse4_1(  // Diff-weighted mask from two CONV_BUF_TYPE planes, 8 values per iteration; the mask is indexed as mask[i * w + j].
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
+ int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
+ ConvolveParams *conv_params, int bd) {
+ const int which_inverse = (mask_type == DIFFWTD_38) ? 0 : 1;  // Anything other than DIFFWTD_38 is treated as the inverse mask.
+ const int mask_base = 38;
+ int round =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+ const __m128i round_const = _mm_set1_epi16((1 << round) >> 1);  // Half of 1 << round, added before the shift.
+ const __m128i mask_base_16 = _mm_set1_epi16(mask_base);
+ const __m128i clip_diff = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i add_const =  // Inverse: result = AOM_BLEND_A64_MAX_ALPHA - m, built as (-m) + MAX_ALPHA; direct: m + 0.
+ _mm_set1_epi16((which_inverse ? AOM_BLEND_A64_MAX_ALPHA : 0));
+ const __m128i add_sign = _mm_set1_epi16((which_inverse ? -1 : 1));
+
+ int i, j;
+ // Adding the rounding constant can overflow, but that much precision is not
+ // required. The code should also work for other values of DIFF_FACTOR_LOG2
+ // and AOM_BLEND_A64_MAX_ALPHA, but corner-case bugs are possible there, so
+ // the values it was written for are asserted below.
+ assert(DIFF_FACTOR_LOG2 == 4);
+ assert(AOM_BLEND_A64_MAX_ALPHA == 64);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {  // Always loads 8 values, even when fewer columns remain (only the store below is narrowed).
+ const __m128i data_src0 =
+ _mm_loadu_si128((__m128i *)&src0[(i * src0_stride) + j]);
+ const __m128i data_src1 =
+ _mm_loadu_si128((__m128i *)&src1[(i * src1_stride) + j]);
+
+ const __m128i diffa = _mm_subs_epu16(data_src0, data_src1);  // Unsigned saturating subtract in both directions ...
+ const __m128i diffb = _mm_subs_epu16(data_src1, data_src0);
+ const __m128i diff = _mm_max_epu16(diffa, diffb);  // ... so the max is the unsigned absolute difference.
+ const __m128i diff_round =  // (diff + round_const) >> round, with a saturating add.
+ _mm_srli_epi16(_mm_adds_epu16(diff, round_const), round);
+ const __m128i diff_factor = _mm_srli_epi16(diff_round, DIFF_FACTOR_LOG2);
+ const __m128i diff_mask = _mm_adds_epi16(diff_factor, mask_base_16);
+ __m128i diff_clamp = _mm_min_epi16(diff_mask, clip_diff);  // Upper clamp to AOM_BLEND_A64_MAX_ALPHA.
+ // clamp to 0 can be skipped since we are using add and saturate
+ // instruction
+
+ const __m128i diff_sign = _mm_sign_epi16(diff_clamp, add_sign);  // Negates every lane for the inverse mask, leaves it unchanged otherwise.
+ const __m128i diff_const_16 = _mm_add_epi16(diff_sign, add_const);
+
+ // 8 bit conversion and saturation to uint8
+ const __m128i res_8 = _mm_packus_epi16(diff_const_16, diff_const_16);
+
+ // Store values into the destination buffer
+ __m128i *const dst = (__m128i *)&mask[i * w + j];
+
+ if ((w - j) > 4) {  // More than 4 columns left: store 8 bytes.
+ _mm_storel_epi64(dst, res_8);
+ } else { // w==4
+ *(uint32_t *)dst = _mm_cvtsi128_si32(res_8);  // Otherwise store only 4 bytes.
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/reconinter_ssse3.c b/third_party/aom/av1/common/x86/reconinter_ssse3.c
new file mode 100644
index 0000000000..cf684447c5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/reconinter_ssse3.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/blockd.h"
+
+void av1_build_compound_diffwtd_mask_highbd_ssse3(  // Diff-weighted mask from two 16-bit sample planes, 8 pixels per SSSE3 iteration; mask rows are w bytes apart.
+ uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0,
+ int src0_stride, const uint8_t *src1, int src1_stride, int h, int w,
+ int bd) {
+ if (w < 8) {  // Blocks narrower than one 8-sample vector are delegated to the C version.
+ av1_build_compound_diffwtd_mask_highbd_c(mask, mask_type, src0, src0_stride,
+ src1, src1_stride, h, w, bd);
+ } else {
+ assert(bd >= 8);
+ assert((w % 8) == 0);  // The inner loops advance 8 samples at a time.
+ assert(mask_type == DIFFWTD_38 || mask_type == DIFFWTD_38_INV);
+ const __m128i x0 = _mm_setzero_si128();
+ const __m128i xAOM_BLEND_A64_MAX_ALPHA =
+ _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const int mask_base = 38;
+ const __m128i xmask_base = _mm_set1_epi16(mask_base);
+ const uint16_t *ssrc0 = CONVERT_TO_SHORTPTR(src0);  // The sources are read as 16-bit samples from here on.
+ const uint16_t *ssrc1 = CONVERT_TO_SHORTPTR(src1);
+ if (bd == 8) {  // bd == 8: the shift is the immediate DIFF_FACTOR_LOG2.
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),  // |s0 - s1| >> DIFF_FACTOR_LOG2
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(  // clamp(diff + mask_base, 0, AOM_BLEND_A64_MAX_ALPHA)
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);  // Inverse mask: AOM_BLEND_A64_MAX_ALPHA - m.
+ m = _mm_packus_epi16(m, m);  // Narrow to bytes; the low 8 bytes hold the results.
+ _mm_storel_epi64((__m128i *)&mask[j], m);  // Write 8 mask bytes.
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;  // Mask rows are stored back to back (stride w).
+ }
+ } else {  // DIFFWTD_38: same as above without the final inversion.
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff = _mm_srai_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)),
+ DIFF_FACTOR_LOG2);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ } else {  // bd > 8: the shift grows by (bd - 8) and is supplied as a run-time count register.
+ const __m128i xshift = xx_set1_64_from_32i(bd - 8 + DIFF_FACTOR_LOG2);
+ if (mask_type == DIFFWTD_38_INV) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =  // |s0 - s1| >> (bd - 8 + DIFF_FACTOR_LOG2)
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_sub_epi16(xAOM_BLEND_A64_MAX_ALPHA, m);  // Inverse mask.
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ } else {  // DIFFWTD_38 with the run-time shift.
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 8) {
+ __m128i s0 = _mm_loadu_si128((const __m128i *)&ssrc0[j]);
+ __m128i s1 = _mm_loadu_si128((const __m128i *)&ssrc1[j]);
+ __m128i diff =
+ _mm_sra_epi16(_mm_abs_epi16(_mm_sub_epi16(s0, s1)), xshift);
+ __m128i m = _mm_min_epi16(
+ _mm_max_epi16(x0, _mm_add_epi16(diff, xmask_base)),
+ xAOM_BLEND_A64_MAX_ALPHA);
+ m = _mm_packus_epi16(m, m);
+ _mm_storel_epi64((__m128i *)&mask[j], m);
+ }
+ ssrc0 += src0_stride;
+ ssrc1 += src1_stride;
+ mask += w;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/selfguided_avx2.c b/third_party/aom/av1/common/x86/selfguided_avx2.c
new file mode 100644
index 0000000000..0aaf1f4547
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_avx2.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// Load 8 bytes from the possibly-misaligned pointer p and zero-extend each
+// byte into its own 32-bit lane of the returned AVX2 register.
+static __m256i yy256_load_extend_8_32(const void *p) {
+ return _mm256_cvtepu8_epi32(xx_loadl_64(p));  // cvtepu8 treats the bytes as unsigned.
+}
+
+// Load 8 halfwords from the possibly-misaligned pointer p and zero-extend
+// each halfword into its own 32-bit lane of the returned AVX2 register.
+static __m256i yy256_load_extend_16_32(const void *p) {
+ return _mm256_cvtepu16_epi32(xx_loadu_128(p));  // cvtepu16 treats the halfwords as unsigned.
+}
+
+// Compute the inclusive prefix sum (scan) of an AVX2 register holding 8 32-bit
+// integers. If the register holds x0..x7 then the scan will hold x0, x0+x1,
+// x0+x1+x2, ..., x0+x1+...+x7
+//
+// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
+// (assumed small enough to be able to add them without overflow).
+//
+// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
+//
+// x = [h g f e][d c b a]
+// x01 = [g f e 0][c b a 0]
+// x02 = [g+h f+g e+f e][c+d b+c a+b a]
+// x03 = [e+f e 0 0][a+b a 0 0]
+// x04 = [e->h e->g e->f e][a->d a->c a->b a]
+// s = a->d
+// s01 = [a->d a->d a->d a->d]
+// s02 = [a->d a->d a->d a->d][0 0 0 0]
+// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
+static __m256i scan_32(__m256i x) {
+ const __m256i x01 = _mm256_slli_si256(x, 4);  // Byte shift by one 32-bit element, applied within each 128-bit block.
+ const __m256i x02 = _mm256_add_epi32(x, x01);
+ const __m256i x03 = _mm256_slli_si256(x02, 8);  // Byte shift by two elements, again per 128-bit block.
+ const __m256i x04 = _mm256_add_epi32(x02, x03);  // Each 128-bit block now holds its own 4-element scan.
+ const int32_t s = _mm256_extract_epi32(x04, 3);  // Total of the low block (a->d).
+ const __m128i s01 = _mm_set1_epi32(s);
+ const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);  // Low-block total placed in the high block only.
+ return _mm256_add_epi32(x04, s02);  // Carry the low-block total into the high block.
+}
+
+// integral_images() below: compute two integral images from src. B sums
+// elements; A sums their squares. The images are offset by one pixel, so have
+// width + 1 columns and height + 1 rows, with the first row and column zero.
+//
+// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
+// of 8.
+
+static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {  // Fill dest[0..count) (32-bit elements): vector stores of *zero, then a scalar tail of 0; returns dest.
+ size_t i = 0;  // Same type as count, so the index cannot wrap before reaching it.
+ for (i = 0; i < (count & ~(size_t)31); i += 32) {  // Bulk: four unaligned 256-bit stores = 32 elements per pass; the mask keeps all high bits of count.
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
+ _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
+ }
+ for (; i < (count & ~(size_t)7); i += 8) {  // Remaining whole groups of 8 elements.
+ _mm256_storeu_si256((__m256i *)(dest + i), *zero);
+ }
+ for (; i < count; i++) {  // Scalar tail: fewer than 8 elements left.
+ dest[i] = 0;
+ }
+ return dest;
+}
+
+static void integral_images(const uint8_t *src, int src_stride, int width,  // Builds the running-sum images B (samples) and A (squared samples) from 8-bit src, 8 columns per iteration.
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));  // width + 8 entries are cleared in each image.
+ memset_zero_avx(B, &zero, (width + 8));
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;  // Output column: images are offset by one from src.
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);  // Previous output row, same columns.
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);  // Squares: the upper 16 bits of each lane are zero, so madd yields x1 * x1 per lane.
+
+ const __m256i sc1 = scan_32(x1);  // Prefix sums across the 8 new samples.
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =  // new row = scan + row above + (left - above-left).
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(  // Lane 7 (rightmost column of this group) is broadcast for the next group.
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// High-bitdepth counterpart of integral_images(): compute two integral images
+// from the 16-bit samples in src. B sums elements; A sums their squares.
+// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
+static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ const __m256i zero = _mm256_setzero_si256();
+ // Write out the zero top row
+ memset_zero_avx(A, &zero, (width + 8));  // width + 8 entries are cleared in each image.
+ memset_zero_avx(B, &zero, (width + 8));
+
+ for (int i = 0; i < height; ++i) {
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the eight lanes.
+ __m256i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 8) {
+ const int ABj = 1 + j;  // Output column: images are offset by one from src.
+
+ const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);  // NOTE(review): accesses are at A/B + 1 + j, as in integral_images(); confirm whether it is A+1 and B+1 that must be 32-byte aligned.
+ const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);
+
+ const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
+ const __m256i x2 = _mm256_madd_epi16(x1, x1);  // Squares via 16-bit madd; the upper 16 bits of each lane are zero after the widening load.
+
+ const __m256i sc1 = scan_32(x1);  // Prefix sums across the 8 new samples.
+ const __m256i sc2 = scan_32(x2);
+
+ const __m256i row1 =  // new row = scan + row above + (left - above-left).
+ _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
+ const __m256i row2 =
+ _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);
+
+ yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
+ yy_store_256(A + ABj + (i + 1) * buf_stride, row2);
+
+ // Calculate the new H - D.
+ ldiff1 = _mm256_set1_epi32(  // Lane 7 (rightmost column of this group) is broadcast for the next group.
+ _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
+ ldiff2 = _mm256_set1_epi32(
+ _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
+ }
+ }
+}
+
+// Compute 8 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);  // Four corner reads of the integral image ...
+ const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
+ const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
+ const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
+ const __m256i u = _mm256_sub_epi32(tr, tl);
+ const __m256i v = _mm256_sub_epi32(br, bl);
+ return _mm256_sub_epi32(v, u);  // ... combined as (br - bl) - (tr - tl).
+}
+
+static __m256i round_for_shift(unsigned shift) {  // Rounding offset to add before a right shift by `shift`, broadcast to all eight 32-bit lanes.
+ return _mm256_set1_epi32((1 << shift) >> 1);  // Half of 1 << shift; evaluates to 0 when shift is 0.
+}
+
+static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {  // Per lane: n * sum2 - sum1^2, with both sums first scaled down when bit_depth > 8.
+ __m256i an, bb;
+ if (bit_depth > 8) {
+ const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
+ const __m256i rounding_b = round_for_shift(bit_depth - 8);
+ const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
+ const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
+ const __m256i a =  // sum2 rounded and shifted right by 2 * (bit_depth - 8).
+ _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
+ const __m256i b =  // sum1 rounded and shifted right by (bit_depth - 8).
+ _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
+ // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
+ // mullo to square it
+ bb = _mm256_madd_epi16(b, b);
+ an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);  // max() keeps the returned difference from going negative.
+ } else {
+ bb = _mm256_madd_epi16(sum1, sum1);  // NOTE(review): madd squares only the low 16 bits of each lane - presumably sum1 fits; confirm.
+ an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
+ }
+ return _mm256_sub_epi32(an, bb);
+}
+
+// Computes the A and B planes for rows -1..height and columns -1..width from
+// box sums over C and D. Assumes that C, D are integral images for the original buffer, extended with
+// SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels of padding on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);  // Number of pixels in a (2r + 1) x (2r + 1) box.
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_by_x[n - 1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[idx] has its lowest idx 32-bit lanes set to all-ones.
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));  // Leaves idx bytes of 0xff in the low 64 bits ...
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));  // ... which sign-extend to idx all-ones lanes.
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);  // Box sums taken from D ...
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);  // ... and from C.
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);  // Number of valid lanes in this group.
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(  // z = min((p * s + rnd_z) >> SGRPROJ_MTABLE_BITS, 255)
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);  // Table lookup x_by_xplus1[z] for all 8 lanes.
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),  // Rounded shift by SGRPROJ_RECIP_BITS.
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed as follows:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = ((fours + threes) << 2) - threes
+static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
+ const __m256i xtl = yy_loadu_256(buf - 1 - stride);  // Row above.
+ const __m256i xt = yy_loadu_256(buf - stride);
+ const __m256i xtr = yy_loadu_256(buf + 1 - stride);
+ const __m256i xl = yy_loadu_256(buf - 1);  // Current row.
+ const __m256i x = yy_loadu_256(buf);
+ const __m256i xr = yy_loadu_256(buf + 1);
+ const __m256i xbl = yy_loadu_256(buf - 1 + stride);  // Row below.
+ const __m256i xb = yy_loadu_256(buf + stride);
+ const __m256i xbr = yy_loadu_256(buf + 1 + stride);
+
+ const __m256i fours = _mm256_add_epi32(
+ xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
+ const __m256i threes =
+ _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
+
+ return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),  // ((fours + threes) << 2) - threes
+ threes);
+}
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =  // Byte pointer to the samples; for highbd the pointer is first passed through CONVERT_TO_SHORTPTR.
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 8) {  // 8 output values per iteration.
+ const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =  // << highbd doubles the byte offset for 16-bit samples.
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
+ const __m256i src =  // Widen 8 samples (16-bit or 8-bit) to 32-bit lanes.
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);  // a * src + b; NOTE(review): madd multiplies 16-bit halves, so this presumably relies on a and src fitting in 16 bits - confirm.
+ __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),  // Rounded arithmetic shift by SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS.
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+}
+
+// Computes the A and B planes on every second row (i = -1, 1, 3, ...) for columns -1..width from
+// box sums over C and D. Assumes that C, D are integral images for the original buffer, extended with
+// SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels of padding on the sides. A, B, C, D point at logical position (0, 0).
+static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);  // Number of pixels in a (2r + 1) x (2r + 1) box.
+ const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
+ // one_by_x[n - 1] is 2^12/n, so easily fits in an int16
+ const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);
+
+ const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[idx] has its lowest idx 32-bit lanes set to all-ones.
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m256i mask[8];
+ for (int idx = 0; idx < 8; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));  // Leaves idx bytes of 0xff in the low 64 bits ...
+ mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));  // ... which sign-extend to idx all-ones lanes.
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {  // Only every second row is computed.
+ for (int j = -1; j < width + 1; j += 8) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);  // Box sums taken from D ...
+ __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);  // ... and from C.
+
+ // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(8, width + 1 - j);  // Number of valid lanes in this group.
+ assert(idx >= 1);
+
+ if (idx < 8) {
+ sum1 = _mm256_and_si256(mask[idx], sum1);
+ sum2 = _mm256_and_si256(mask[idx], sum2);
+ }
+
+ const __m256i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m256i z = _mm256_min_epi32(  // z = min((p * s + rnd_z) >> SGRPROJ_MTABLE_BITS, 255)
+ _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm256_set1_epi32(255));
+
+ const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);  // Table lookup x_by_xplus1[z] for all 8 lanes.
+
+ yy_storeu_256(A + i * buf_stride + j, a_res);
+
+ const __m256i a_complement =
+ _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // are both less than 256, so we can multiply them first.
+ const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
+ const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
+ const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),  // Rounded shift by SGRPROJ_RECIP_BITS.
+ SGRPROJ_RECIP_BITS);
+
+ yy_storeu_256(B + i * buf_stride + j, b_res);
+ }
+ }
+}
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+ // = 5 * (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+ static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ // Only the rows directly above and below buf contribute on even rows.
+ const int32_t *const above = buf - stride;
+ const int32_t *const below = buf + stride;
+
+ // Weight-5 taps: the four diagonal neighbours.
+ const __m256i corners = _mm256_add_epi32(
+ _mm256_add_epi32(yy_loadu_256(above - 1), yy_loadu_256(above + 1)),
+ _mm256_add_epi32(yy_loadu_256(below + 1), yy_loadu_256(below - 1)));
+
+ // Weight-6 taps: straight above and straight below.
+ const __m256i middles =
+ _mm256_add_epi32(yy_loadu_256(above), yy_loadu_256(below));
+
+ // 5 * corners + 6 * middles == 4 * t + t + middles, with t = corners + middles.
+ const __m256i t = _mm256_add_epi32(corners, middles);
+ const __m256i t_times_5 = _mm256_add_epi32(_mm256_slli_epi32(t, 2), t);
+ return _mm256_add_epi32(t_times_5, middles);
+ }
+
+// Calculate 8 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+ static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
+ // Weight-6 tap: the element at buf itself.
+ const __m256i centre = yy_loadu_256(buf);
+
+ // Weight-5 taps: the left and right neighbours on the same row.
+ const __m256i sides =
+ _mm256_add_epi32(yy_loadu_256(buf - 1), yy_loadu_256(buf + 1));
+
+ // 5 * sides + 6 * centre == 4 * t + t + centre, with t = sides + centre.
+ const __m256i t = _mm256_add_epi32(sides, centre);
+ const __m256i t_times_5 = _mm256_add_epi32(_mm256_slli_epi32(t, 2), t);
+ return _mm256_add_epi32(t_times_5, centre);
+ }
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;  // even rows: tap weights 5,6,5 / 5,6,5 sum to 32 = 2^5
+ const int nb1 = 4;  // odd rows: tap weights 5,6,5 sum to 16 = 2^4
+
+ const __m256i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m256i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;  // byte pointer to the samples
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 8) {  // 8 output values per iteration
+ const __m256i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m256i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));  // byte offset doubles for 16-bit samples
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);  // zero-extend to 32 bits
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);  // v = a * src + b (src's upper 16 bits are zero)
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);  // rounded arithmetic shift
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 8) {  // 8 output values per iteration
+ const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+
+ const __m128i raw =
+ xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));  // byte offset doubles for 16-bit samples
+ const __m256i src =
+ highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);  // zero-extend to 32 bits
+
+ __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);  // v = a * src + b (src's upper 16 bits are zero)
+ __m256i w =
+ _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);  // one bit less shift than even rows
+
+ yy_storeu_256(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+ }
+
+ int av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
+ int dgd_stride, int32_t *flt0,
+ int32_t *flt1, int flt_stride,
+ int sgr_params_idx, int bit_depth,
+ int highbd) {
+ // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
+ // Ctl and Dtl is 32-byte aligned.
+ const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);
+
+ int32_t *buf = aom_memalign(
+ 32, 4 * sizeof(*buf) * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3));  // four planes of buf_elts int32s
+ if (!buf) return -1;  // allocation failure is reported to the caller as -1
+
+ const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
+ const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
+
+ // Adjusting the stride of A and B here appears to avoid bad cache effects,
+ // leading to a significant speed improvement.
+ // We also align the stride to a multiple of 32 bytes for efficiency.
+ int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);
+
+ // The "tl" pointers point at the top-left of the initialised data for the
+ // array. The offset of 7 puts column 1 (tl + 1) on an 8-element boundary.
+ int32_t *Atl = buf + 0 * buf_elts + 7;
+ int32_t *Btl = buf + 1 * buf_elts + 7;
+ int32_t *Ctl = buf + 2 * buf_elts + 7;
+ int32_t *Dtl = buf + 3 * buf_elts + 7;
+
+ // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
+ // there's a zero row and column in the integral images (written at Ctl and
+ // Dtl below), so we move down and right one for all four planes.
+ const int buf_diag_border =
+ SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;
+
+ int32_t *A0 = Atl + 1 + buf_stride;
+ int32_t *B0 = Btl + 1 + buf_stride;
+ int32_t *C0 = Ctl + 1 + buf_stride;
+ int32_t *D0 = Dtl + 1 + buf_stride;
+
+ // Finally, A, B, C, D point at position (0, 0).
+ int32_t *A = A0 + buf_diag_border;
+ int32_t *B = B0 + buf_diag_border;
+ int32_t *C = C0 + buf_diag_border;
+ int32_t *D = D0 + buf_diag_border;
+
+ const int dgd_diag_border =
+ SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
+ const uint8_t *dgd0 = dgd8 - dgd_diag_border;  // top-left of the bordered input
+
+ // Generate integral images from the input. C will contain sums of squares; D
+ // will contain just sums
+ if (highbd)
+ integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
+ height_ext, Ctl, Dtl, buf_stride);
+ else
+ integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
+ buf_stride);
+
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ // Write to flt0 and flt1
+ // If params->r == 0 we skip the corresponding filter. We only allow one of
+ // the radii to be 0, as having both equal to 0 would be equivalent to
+ // skipping SGR entirely.
+ assert(!(params->r[0] == 0 && params->r[1] == 0));
+ assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+ assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+ if (params->r[0] > 0) {  // first pass: "fast" variant, result goes to flt0
+ calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
+ sgr_params_idx, 0);
+ final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
+ width, height, highbd);
+ }
+
+ if (params->r[1] > 0) {  // second pass: full variant, reuses A and B, result goes to flt1
+ calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
+ 1);
+ final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
+ height, highbd);
+ }
+ aom_free(buf);
+ return 0;  // success
+ }
+
+ // Runs both self-guided filter passes on a width x height block of dat8 and
+ // writes the projected, clamped result to dst8.
+ //
+ // tmpbuf supplies the two intermediate planes: flt0 starts at tmpbuf and flt1
+ // starts RESTORATION_UNITPELS_MAX elements later; both use a stride of width.
+ // eps selects the sgr_params entry and xqd carries the projection coefficients
+ // that decode_xq() expands into xq[0] / xq[1].
+ //
+ // Pixels are processed 16 at a time, so loads and stores on each row may run
+ // past `width` up to the next multiple of 16.
+ void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
+ int height, int stride, int eps,
+ const int *xqd, uint8_t *dst8,
+ int dst_stride, int32_t *tmpbuf,
+ int bit_depth, int highbd) {
+ int32_t *flt0 = tmpbuf;
+ int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+ assert(width * height <= RESTORATION_UNITPELS_MAX);
+ const int ret = av1_selfguided_restoration_avx2(
+ dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+ // A failure (non-zero ret) is only caught by the assert; this function has
+ // no return value to report it through.
+ (void)ret;
+ assert(!ret);
+ const sgr_params_type *const params = &sgr_params[eps];
+ int xq[2];
+ decode_xq(xqd, xq, params);
+
+ __m256i xq0 = _mm256_set1_epi32(xq[0]);
+ __m256i xq1 = _mm256_set1_epi32(xq[1]);
+
+ // Loop-invariant rounding term for the final shift.
+ const __m256i rounding =
+ round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ for (int i = 0; i < height; ++i) {
+ // Calculate output in batches of 16 pixels
+ for (int j = 0; j < width; j += 16) {
+ const int k = i * width + j;
+ const int m = i * dst_stride + j;
+
+ const uint8_t *dat8ij = dat8 + i * stride + j;
+ __m256i ep_0, ep_1;
+ __m128i src_0, src_1;
+ if (highbd) {
+ src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+ src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
+ ep_0 = _mm256_cvtepu16_epi32(src_0);
+ ep_1 = _mm256_cvtepu16_epi32(src_1);
+ } else {
+ src_0 = xx_loadu_128(dat8ij);
+ ep_0 = _mm256_cvtepu8_epi32(src_0);
+ ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
+ }
+
+ // u = source pixel scaled by 2^SGRPROJ_RST_BITS.
+ const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
+ const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);
+
+ // v starts as u scaled by a further 2^SGRPROJ_PRJ_BITS; each enabled pass
+ // then adds xq * (flt - u).
+ __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+ __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+ if (params->r[0] > 0) {
+ const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));
+
+ const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
+ }
+
+ if (params->r[1] > 0) {
+ const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
+ v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));
+
+ const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
+ v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
+ }
+
+ const __m256i w_0 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+ const __m256i w_1 = _mm256_srai_epi32(
+ _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+ if (highbd) {
+ // Pack into 16 bits and clamp to [0, 2^bit_depth)
+ // Note that packing into 16 bits interleaves the two 128-bit halves,
+ // so we use a permute function to restore pixel order
+ const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
+ // packus yields unsigned 16-bit values, so the upper clamp must be an
+ // unsigned min: a signed min would treat values >= 0x8000 as negative
+ // and let them through unclamped.
+ const __m256i res = _mm256_min_epu16(tmp2, max);
+ yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
+ } else {
+ // Pack into 8 bits and clamp to [0, 256)
+ // Note that each pack interleaves the two 128-bit halves,
+ // so we use a permute function to restore pixel order
+ const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
+ const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
+ const __m256i res =
+ _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
+ const __m128i res2 =
+ _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
+ xx_storeu_128(dst8 + m, res2);
+ }
+ }
+ }
+ }
diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c
new file mode 100644
index 0000000000..ea3f6d9422
--- /dev/null
+++ b/third_party/aom/av1/common/x86/selfguided_sse4.c
@@ -0,0 +1,660 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/restoration.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Load 4 bytes from the possibly-misaligned pointer p, extend each byte to
+// 32-bit precision and return them in an SSE register.
+ static __m128i xx_load_extend_8_32(const void *p) {
+ // Fetch the four source bytes first, then zero-extend each to 32 bits.
+ const __m128i packed_bytes = xx_loadl_32(p);
+ return _mm_cvtepu8_epi32(packed_bytes);
+ }
+
+// Load 4 halfwords from the possibly-misaligned pointer p, extend each
+// halfword to 32-bit precision and return them in an SSE register.
+ static __m128i xx_load_extend_16_32(const void *p) {
+ // Fetch the four source halfwords first, then zero-extend each to 32 bits.
+ const __m128i packed_halfwords = xx_loadl_64(p);
+ return _mm_cvtepu16_epi32(packed_halfwords);
+ }
+
+// Compute the scan of an SSE register holding 4 32-bit integers. If the
+// register holds x0..x3 then the scan will hold x0, x0+x1, x0+x1+x2,
+// x0+x1+x2+x3
+ static __m128i scan_32(__m128i x) {
+ // Pass 1: add to every lane the lane one position below it.
+ const __m128i up_one_lane = _mm_slli_si128(x, 4);
+ const __m128i pair_sums = _mm_add_epi32(x, up_one_lane);
+ // Pass 2: add the pair sum from two lanes below, completing the prefix sum.
+ const __m128i up_two_lanes = _mm_slli_si128(pair_sums, 8);
+ return _mm_add_epi32(pair_sums, up_two_lanes);
+ }
+
+// Compute two integral images from src. B sums elements; A sums their
+// squares. The images are offset by one pixel, so will have width and height
+// equal to width + 1, height + 1 and the first row and column will be zero.
+//
+// A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple
+// of 4.
+ static void integral_images(const uint8_t *src, int src_stride, int width,
+ int height, int32_t *A, int32_t *B,
+ int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {  // source row i produces output row i + 1
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {  // four source pixels per iteration
+ const int ABj = 1 + j;  // output column: shifted right by one for the zero column
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);  // previous output row of B
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);  // previous output row of A
+
+ const __m128i x1 = xx_load_extend_8_32(src + j + i * src_stride);  // 4 pixels, zero-extended
+ const __m128i x2 = _mm_madd_epi16(x1, x1);  // per-lane squares (upper 16 bits of x1 are zero)
+
+ const __m128i sc1 = scan_32(x1);  // running sums within this group of 4
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);  // B accumulates the pixel values
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);  // A accumulates the squares
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);  // 0xff broadcasts lane 3
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+ }
+
+// Compute two integral images from src. B sums elements; A sums their squares
+//
+ // A+1 and B+1 should be aligned to 16 bytes. buf_stride should be a multiple of 4.
+ static void integral_images_highbd(const uint16_t *src, int src_stride,
+ int width, int height, int32_t *A,
+ int32_t *B, int buf_stride) {
+ // Write out the zero top row
+ memset(A, 0, sizeof(*A) * (width + 1));
+ memset(B, 0, sizeof(*B) * (width + 1));
+
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i) {  // source row i produces output row i + 1
+ // Zero the left column.
+ A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;
+
+ // ldiff is the difference H - D where H is the output sample immediately
+ // to the left and D is the output sample above it. These are scalars,
+ // replicated across the four lanes.
+ __m128i ldiff1 = zero, ldiff2 = zero;
+ for (int j = 0; j < width; j += 4) {  // four source samples per iteration
+ const int ABj = 1 + j;  // output column: shifted right by one for the zero column
+
+ const __m128i above1 = xx_load_128(B + ABj + i * buf_stride);  // previous output row of B
+ const __m128i above2 = xx_load_128(A + ABj + i * buf_stride);  // previous output row of A
+
+ const __m128i x1 = xx_load_extend_16_32(src + j + i * src_stride);  // 4 samples, zero-extended
+ const __m128i x2 = _mm_madd_epi16(x1, x1);  // per-lane squares; madd reads the low half as signed 16-bit
+
+ const __m128i sc1 = scan_32(x1);  // running sums within this group of 4
+ const __m128i sc2 = scan_32(x2);
+
+ const __m128i row1 = _mm_add_epi32(_mm_add_epi32(sc1, above1), ldiff1);
+ const __m128i row2 = _mm_add_epi32(_mm_add_epi32(sc2, above2), ldiff2);
+
+ xx_store_128(B + ABj + (i + 1) * buf_stride, row1);  // B accumulates the sample values
+ xx_store_128(A + ABj + (i + 1) * buf_stride, row2);  // A accumulates the squares
+
+ // Calculate the new H - D.
+ ldiff1 = _mm_shuffle_epi32(_mm_sub_epi32(row1, above1), 0xff);  // 0xff broadcasts lane 3
+ ldiff2 = _mm_shuffle_epi32(_mm_sub_epi32(row2, above2), 0xff);
+ }
+ }
+ }
+
+// Compute 4 values of boxsum from the given integral image. ii should point
+// at the middle of the box (for the first value). r is the box radius.
+ static INLINE __m128i boxsum_from_ii(const int32_t *ii, int stride, int r) {
+ // Integral-image rows bounding the box: r + 1 rows up, and r rows down.
+ const int32_t *const upper = ii - (r + 1) * stride;
+ const int32_t *const lower = ii + r * stride;
+
+ // On each bounding row, take (entry r to the right) - (entry r + 1 to the
+ // left); the box sum is the lower difference minus the upper one.
+ const __m128i upper_span =
+ _mm_sub_epi32(xx_loadu_128(upper + r), xx_loadu_128(upper - (r + 1)));
+ const __m128i lower_span =
+ _mm_sub_epi32(xx_loadu_128(lower + r), xx_loadu_128(lower - (r + 1)));
+ return _mm_sub_epi32(lower_span, upper_span);
+ }
+
+ static __m128i round_for_shift(unsigned shift) {
+ // Half of 2^shift: the bias added before a right shift by `shift` so that
+ // the shift rounds to nearest. Evaluates to 0 when shift is 0.
+ const int half = (1 << shift) >> 1;
+ return _mm_set1_epi32(half);
+ }
+
+ static __m128i compute_p(__m128i sum1, __m128i sum2, int bit_depth, int n) {
+ const __m128i count = _mm_set1_epi32(n);
+
+ if (bit_depth <= 8) {
+ // No rescaling needed: p = n * sum2 - sum1 * sum1.
+ const __m128i sum1_sq = _mm_madd_epi16(sum1, sum1);
+ const __m128i n_sum2 = _mm_mullo_epi32(sum2, count);
+ return _mm_sub_epi32(n_sum2, sum1_sq);
+ }
+
+ // Deeper input: round sum2 down by 2 * (bit_depth - 8) bits and sum1 by
+ // (bit_depth - 8) bits before forming the same expression.
+ const int extra_bits = bit_depth - 8;
+ const __m128i sum2_scaled =
+ _mm_srl_epi32(_mm_add_epi32(sum2, round_for_shift(2 * extra_bits)),
+ _mm_cvtsi32_si128(2 * extra_bits));
+ const __m128i sum1_scaled =
+ _mm_srl_epi32(_mm_add_epi32(sum1, round_for_shift(extra_bits)),
+ _mm_cvtsi32_si128(extra_bits));
+
+ // sum1_scaled < 2^14, so a 16-bit madd squares it without needing a 32-bit
+ // mullo.
+ const __m128i sum1_sq = _mm_madd_epi16(sum1_scaled, sum1_scaled);
+ // The max() keeps the subtraction below from going negative.
+ const __m128i n_sum2 =
+ _mm_max_epi32(_mm_mullo_epi32(sum2_scaled, count), sum1_sq);
+ return _mm_sub_epi32(n_sum2, sum1_sq);
+ }
+
+ // Computes the A and B coefficient planes for one self-guided filter pass,
+ // for every row in [-1, height] and every column in [-1, width].
+ //
+ // Assumes that C, D are integral images for the original buffer which has been
+ // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+ // on the sides. A, B, C, D point at logical position (0, 0).
+ //
+ // sgr_params_idx selects the sgr_params entry and radius_idx selects which of
+ // its r[] / s[] pairs is used. Columns are processed four at a time, so stores
+ // to A and B may run up to three columns past column `width`.
+ static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
+ int width, int height, int buf_stride, int bit_depth,
+ int sgr_params_idx, int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_by_x[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[idx] keeps the low idx 32-bit lanes and clears the rest.
+ // ones32 holds 64 one-bits in its low half, so shifting right by
+ // 8 * (8 - idx) leaves exactly idx bytes of 0xff, which cvtepi8_epi32 then
+ // widens to idx all-ones lanes. (A shift of 8 * (4 - idx) would leave at
+ // least four 0xff bytes and make every mask all-ones.)
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; ++i) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // both fit in 16 bits, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+ }
+
+// Calculate 4 values of the "cross sum" starting at buf. This is a 3x3 filter
+// where the outer four corners have weight 3 and all other pixels have weight
+// 4.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// xl x xr
+// xbl xb xbr
+//
+// buf points to x
+//
+// fours = xl + xt + xr + xb + x
+// threes = xtl + xtr + xbr + xbl
+// cross_sum = 4 * fours + 3 * threes
+// = 4 * (fours + threes) - threes
+// = (fours + threes) << 2 - threes
+ static INLINE __m128i cross_sum(const int32_t *buf, int stride) {
+ const int32_t *const above = buf - stride;
+ const int32_t *const below = buf + stride;
+
+ // Weight-3 taps: the four diagonal neighbours.
+ const __m128i diagonals = _mm_add_epi32(
+ _mm_add_epi32(xx_loadu_128(above - 1), xx_loadu_128(above + 1)),
+ _mm_add_epi32(xx_loadu_128(below + 1), xx_loadu_128(below - 1)));
+
+ // Weight-4 taps: the centre plus its left/right and up/down neighbours.
+ const __m128i left_right =
+ _mm_add_epi32(xx_loadu_128(buf - 1), xx_loadu_128(buf + 1));
+ const __m128i up_down =
+ _mm_add_epi32(xx_loadu_128(above), xx_loadu_128(below));
+ const __m128i plus_shape =
+ _mm_add_epi32(xx_loadu_128(buf), _mm_add_epi32(left_right, up_down));
+
+ // 4 * plus_shape + 3 * diagonals == ((plus_shape + diagonals) << 2) - diagonals
+ const __m128i all_nine = _mm_add_epi32(plus_shape, diagonals);
+ return _mm_sub_epi32(_mm_slli_epi32(all_nine, 2), diagonals);
+ }
+
+// The final filter for self-guided restoration. Computes a weighted average
+// across A, B with "cross sums" (see cross_sum implementation above).
+ static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride, const void *dgd8,
+ int dgd_stride, int width, int height, int highbd) {
+ const int nb = 5;  // cross_sum weights (five 4s, four 3s) sum to 32 = 2^5
+ const __m128i rounding =
+ round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;  // byte pointer to the samples
+
+ for (int i = 0; i < height; ++i) {
+ for (int j = 0; j < width; j += 4) {  // four output values per iteration
+ const __m128i a = cross_sum(A + i * buf_stride + j, buf_stride);
+ const __m128i b = cross_sum(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));  // byte offset doubles for 16-bit samples
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);  // zero-extend to 32 bits
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);  // v = a * src + b (src's upper 16 bits are zero)
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding),
+ SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);  // rounded arithmetic shift
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+
+ // Computes the A and B coefficient planes for the "fast" self-guided filter
+ // pass: only every second row is produced, starting at row -1, for columns
+ // in [-1, width].
+ //
+ // Assumes that C, D are integral images for the original buffer which has been
+ // extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
+ // on the sides. A, B, C, D point at logical position (0, 0).
+ //
+ // sgr_params_idx selects the sgr_params entry and radius_idx selects which of
+ // its r[] / s[] pairs is used. Columns are processed four at a time, so stores
+ // to A and B may run up to three columns past column `width`.
+ static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
+ const int32_t *D, int width, int height,
+ int buf_stride, int bit_depth, int sgr_params_idx,
+ int radius_idx) {
+ const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+ const int r = params->r[radius_idx];
+ const int n = (2 * r + 1) * (2 * r + 1);
+ const __m128i s = _mm_set1_epi32(params->s[radius_idx]);
+ // one_by_x[n-1] is 2^12/n, so easily fits in an int16
+ const __m128i one_over_n = _mm_set1_epi32(one_by_x[n - 1]);
+
+ const __m128i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
+ const __m128i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);
+
+ // Set up masks: mask[idx] keeps the low idx 32-bit lanes and clears the rest.
+ // ones32 holds 64 one-bits in its low half, so shifting right by
+ // 8 * (8 - idx) leaves exactly idx bytes of 0xff, which cvtepi8_epi32 then
+ // widens to idx all-ones lanes. (A shift of 8 * (4 - idx) would leave at
+ // least four 0xff bytes and make every mask all-ones.)
+ const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
+ __m128i mask[4];
+ for (int idx = 0; idx < 4; idx++) {
+ const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
+ mask[idx] = _mm_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
+ }
+
+ for (int i = -1; i < height + 1; i += 2) {
+ for (int j = -1; j < width + 1; j += 4) {
+ const int32_t *Cij = C + i * buf_stride + j;
+ const int32_t *Dij = D + i * buf_stride + j;
+
+ __m128i sum1 = boxsum_from_ii(Dij, buf_stride, r);
+ __m128i sum2 = boxsum_from_ii(Cij, buf_stride, r);
+
+ // When width + 2 isn't a multiple of 4, sum1 and sum2 will contain
+ // some uninitialised data in their upper words. We use a mask to
+ // ensure that these bits are set to 0.
+ int idx = AOMMIN(4, width + 1 - j);
+ assert(idx >= 1);
+
+ if (idx < 4) {
+ sum1 = _mm_and_si128(mask[idx], sum1);
+ sum2 = _mm_and_si128(mask[idx], sum2);
+ }
+
+ const __m128i p = compute_p(sum1, sum2, bit_depth, n);
+
+ const __m128i z = _mm_min_epi32(
+ _mm_srli_epi32(_mm_add_epi32(_mm_mullo_epi32(p, s), rnd_z),
+ SGRPROJ_MTABLE_BITS),
+ _mm_set1_epi32(255));
+
+ // 'Gather' type instructions are not available pre-AVX2, so synthesize a
+ // gather using scalar loads.
+ const __m128i a_res = _mm_set_epi32(x_by_xplus1[_mm_extract_epi32(z, 3)],
+ x_by_xplus1[_mm_extract_epi32(z, 2)],
+ x_by_xplus1[_mm_extract_epi32(z, 1)],
+ x_by_xplus1[_mm_extract_epi32(z, 0)]);
+
+ xx_storeu_128(A + i * buf_stride + j, a_res);
+
+ const __m128i a_complement =
+ _mm_sub_epi32(_mm_set1_epi32(SGRPROJ_SGR), a_res);
+
+ // sum1 might have lanes greater than 2^15, so we can't use madd to do
+ // multiplication involving sum1. However, a_complement and one_over_n
+ // both fit in 16 bits, so we can multiply them first.
+ const __m128i a_comp_over_n = _mm_madd_epi16(a_complement, one_over_n);
+ const __m128i b_int = _mm_mullo_epi32(a_comp_over_n, sum1);
+ const __m128i b_res =
+ _mm_srli_epi32(_mm_add_epi32(b_int, rnd_res), SGRPROJ_RECIP_BITS);
+
+ xx_storeu_128(B + i * buf_stride + j, b_res);
+ }
+ }
+ }
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xtl xt xtr
+// - buf -
+// xbl xb xbr
+//
+// Pixels are weighted like this:
+// 5 6 5
+// 0 0 0
+// 5 6 5
+//
+// fives = xtl + xtr + xbl + xbr
+// sixes = xt + xb
+// cross_sum = 6 * sixes + 5 * fives
+ // = 5 * (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+ static INLINE __m128i cross_sum_fast_even_row(const int32_t *buf, int stride) {
+ // Only the rows directly above and below buf contribute on even rows.
+ const int32_t *const above = buf - stride;
+ const int32_t *const below = buf + stride;
+
+ // Weight-5 taps: the four diagonal neighbours.
+ const __m128i corners = _mm_add_epi32(
+ _mm_add_epi32(xx_loadu_128(above - 1), xx_loadu_128(above + 1)),
+ _mm_add_epi32(xx_loadu_128(below + 1), xx_loadu_128(below - 1)));
+
+ // Weight-6 taps: straight above and straight below.
+ const __m128i middles =
+ _mm_add_epi32(xx_loadu_128(above), xx_loadu_128(below));
+
+ // 5 * corners + 6 * middles == 4 * t + t + middles, with t = corners + middles.
+ const __m128i t = _mm_add_epi32(corners, middles);
+ const __m128i t_times_5 = _mm_add_epi32(_mm_slli_epi32(t, 2), t);
+ return _mm_add_epi32(t_times_5, middles);
+ }
+
+// Calculate 4 values of the "cross sum" starting at buf.
+//
+// Pixels are indexed like this:
+// xl x xr
+//
+// Pixels are weighted like this:
+// 5 6 5
+//
+// buf points to x
+//
+// fives = xl + xr
+// sixes = x
+// cross_sum = 5 * fives + 6 * sixes
+// = 4 * (fives + sixes) + (fives + sixes) + sixes
+// = (fives + sixes) << 2 + (fives + sixes) + sixes
+ static INLINE __m128i cross_sum_fast_odd_row(const int32_t *buf) {
+ // Weight-6 tap: the element at buf itself.
+ const __m128i centre = xx_loadu_128(buf);
+
+ // Weight-5 taps: the left and right neighbours on the same row.
+ const __m128i sides =
+ _mm_add_epi32(xx_loadu_128(buf - 1), xx_loadu_128(buf + 1));
+
+ // 5 * sides + 6 * centre == 4 * t + t + centre, with t = sides + centre.
+ const __m128i t = _mm_add_epi32(sides, centre);
+ const __m128i t_times_5 = _mm_add_epi32(_mm_slli_epi32(t, 2), t);
+ return _mm_add_epi32(t_times_5, centre);
+ }
+
+// The final filter for the self-guided restoration. Computes a
+// weighted average across A, B with "cross sums" (see cross_sum_...
+// implementations above).
+ static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
+ const int32_t *B, int buf_stride,
+ const void *dgd8, int dgd_stride, int width,
+ int height, int highbd) {
+ const int nb0 = 5;  // even rows: tap weights 5,6,5 / 5,6,5 sum to 32 = 2^5
+ const int nb1 = 4;  // odd rows: tap weights 5,6,5 sum to 16 = 2^4
+
+ const __m128i rounding0 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
+ const __m128i rounding1 =
+ round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);
+
+ const uint8_t *dgd_real =
+ highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;  // byte pointer to the samples
+
+ for (int i = 0; i < height; ++i) {
+ if (!(i & 1)) { // even row
+ for (int j = 0; j < width; j += 4) {  // four output values per iteration
+ const __m128i a =
+ cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
+ const __m128i b =
+ cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));  // byte offset doubles for 16-bit samples
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);  // zero-extend to 32 bits
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);  // v = a * src + b (src's upper 16 bits are zero)
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding0),
+ SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);  // rounded arithmetic shift
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ } else { // odd row
+ for (int j = 0; j < width; j += 4) {  // four output values per iteration
+ const __m128i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
+ const __m128i b = cross_sum_fast_odd_row(B + i * buf_stride + j);
+ const __m128i raw =
+ xx_loadl_64(dgd_real + ((i * dgd_stride + j) << highbd));  // byte offset doubles for 16-bit samples
+ const __m128i src =
+ highbd ? _mm_cvtepu16_epi32(raw) : _mm_cvtepu8_epi32(raw);  // zero-extend to 32 bits
+
+ __m128i v = _mm_add_epi32(_mm_madd_epi16(a, src), b);  // v = a * src + b (src's upper 16 bits are zero)
+ __m128i w = _mm_srai_epi32(_mm_add_epi32(v, rounding1),
+ SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);  // one bit less shift than even rows
+
+ xx_storeu_128(dst + i * dst_stride + j, w);
+ }
+ }
+ }
+ }
+
+// Computes the self-guided filter outputs for a width x height region of
+// dgd8 using the parameter set sgr_params[sgr_params_idx]. flt0 is written
+// (with stride flt_stride) when params->r[0] > 0, and flt1 when
+// params->r[1] > 0. Returns 0 on success, or -1 if the scratch buffer cannot
+// be allocated.
+int av1_selfguided_restoration_sse4_1(const uint8_t *dgd8, int width,
+                                      int height, int dgd_stride, int32_t *flt0,
+                                      int32_t *flt1, int flt_stride,
+                                      int sgr_params_idx, int bit_depth,
+                                      int highbd) {
+  // A single zeroed, 16-byte-aligned allocation is carved into four planes
+  // (A, B, C, D) of RESTORATION_PROC_UNIT_PELS entries each.
+  const size_t scratch_bytes =
+      4 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
+  int32_t *const scratch = (int32_t *)aom_memalign(16, scratch_bytes);
+  if (!scratch) return -1;
+  memset(scratch, 0, scratch_bytes);
+
+  const int ext_w = width + 2 * SGRPROJ_BORDER_HORZ;
+  const int ext_h = height + 2 * SGRPROJ_BORDER_VERT;
+
+  // Padding the stride of the planes appears to avoid bad cache effects and
+  // gives a significant speed-up. The stride is also kept a multiple of
+  // 16 bytes for efficiency.
+  const int plane_stride = ((ext_w + 3) & ~3) + 16;
+
+  // Top-left of the initialised data in each plane. The +3 offset makes
+  // column 1 of each plane 16-byte aligned.
+  int32_t *const a_tl = scratch + 0 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const b_tl = scratch + 1 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const c_tl = scratch + 2 * RESTORATION_PROC_UNIT_PELS + 3;
+  int32_t *const d_tl = scratch + 3 * RESTORATION_PROC_UNIT_PELS + 3;
+
+  // Offset from a plane's top-left to position (0, 0): first move down and
+  // right by one to skip the zero row and column of the integral images,
+  // then step over the (SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ) border.
+  const int origin = (1 + plane_stride) +
+                     (SGRPROJ_BORDER_HORZ + plane_stride * SGRPROJ_BORDER_VERT);
+  int32_t *const A = a_tl + origin;
+  int32_t *const B = b_tl + origin;
+  int32_t *const C = c_tl + origin;
+  int32_t *const D = d_tl + origin;
+
+  // The input is read starting SGRPROJ_BORDER_VERT rows above and
+  // SGRPROJ_BORDER_HORZ columns to the left of dgd8.
+  const uint8_t *const src_tl =
+      dgd8 - (SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT);
+
+  // Generate integral images from the input: C receives sums of squares and
+  // D receives plain sums.
+  if (highbd) {
+    integral_images_highbd(CONVERT_TO_SHORTPTR(src_tl), dgd_stride, ext_w,
+                           ext_h, c_tl, d_tl, plane_stride);
+  } else {
+    integral_images(src_tl, dgd_stride, ext_w, ext_h, c_tl, d_tl,
+                    plane_stride);
+  }
+
+  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
+  // A radius of 0 disables the corresponding filter. At most one radius may
+  // be 0, since disabling both would be equivalent to skipping SGR entirely.
+  assert(!(params->r[0] == 0 && params->r[1] == 0));
+  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
+
+  // First pass (radius index 0) -> flt0.
+  if (params->r[0] > 0) {
+    calc_ab_fast(A, B, C, D, width, height, plane_stride, bit_depth,
+                 sgr_params_idx, 0);
+    final_filter_fast(flt0, flt_stride, A, B, plane_stride, dgd8, dgd_stride,
+                      width, height, highbd);
+  }
+
+  // Second pass (radius index 1) -> flt1. A and B are reused.
+  if (params->r[1] > 0) {
+    calc_ab(A, B, C, D, width, height, plane_stride, bit_depth, sgr_params_idx,
+            1);
+    final_filter(flt1, flt_stride, A, B, plane_stride, dgd8, dgd_stride, width,
+                 height, highbd);
+  }
+
+  aom_free(scratch);
+  return 0;
+}
+
+// Applies self-guided restoration to a width x height region of dat8 and
+// writes the result to dst8.
+//
+// tmpbuf provides the two intermediate filter planes: flt0 starts at tmpbuf
+// and flt1 starts RESTORATION_UNITPELS_MAX entries later; both use a stride
+// of `width`. `eps` selects the entry of sgr_params, and `xqd` holds the
+// projection coefficients decoded by decode_xq().
+//
+// Pixels are processed in batches of 8, so when width is not a multiple of 8
+// the last batch of each row reads and writes past column width - 1
+// (presumably covered by padding in the callers' buffers - confirm).
+//
+// If the filter stage fails (it returns non-zero when its scratch allocation
+// fails), nothing is written to dst8.
+void apply_selfguided_restoration_sse4_1(const uint8_t *dat8, int width,
+                                         int height, int stride, int eps,
+                                         const int *xqd, uint8_t *dst8,
+                                         int dst_stride, int32_t *tmpbuf,
+                                         int bit_depth, int highbd) {
+  int32_t *flt0 = tmpbuf;
+  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  assert(width * height <= RESTORATION_UNITPELS_MAX);
+  const int ret = av1_selfguided_restoration_sse4_1(
+      dat8, width, height, stride, flt0, flt1, width, eps, bit_depth, highbd);
+  assert(!ret);
+  // With asserts compiled out, a failed filter stage would otherwise be
+  // ignored and flt0/flt1 (never written in that case) would be blended into
+  // the output. Bail out instead.
+  if (ret != 0) return;
+
+  const sgr_params_type *const params = &sgr_params[eps];
+  int xq[2];
+  decode_xq(xqd, xq, params);
+
+  const __m128i xq0 = _mm_set1_epi32(xq[0]);
+  const __m128i xq1 = _mm_set1_epi32(xq[1]);
+
+  // The final rounding/shift is the same for every batch.
+  const __m128i rounding =
+      round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+  for (int i = 0; i < height; ++i) {
+    // Calculate output in batches of 8 pixels
+    for (int j = 0; j < width; j += 8) {
+      const int k = i * width + j;       // index into flt0 / flt1
+      const int m = i * dst_stride + j;  // index into dst8
+
+      // Load 8 source pixels as 16-bit values.
+      const uint8_t *dat8ij = dat8 + i * stride + j;
+      __m128i src;
+      if (highbd) {
+        src = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
+      } else {
+        src = _mm_cvtepu8_epi16(xx_loadl_64(dat8ij));
+      }
+
+      // u = src << SGRPROJ_RST_BITS, widened to 32 bits in two halves.
+      const __m128i u = _mm_slli_epi16(src, SGRPROJ_RST_BITS);
+      const __m128i u_0 = _mm_cvtepu16_epi32(u);
+      const __m128i u_1 = _mm_cvtepu16_epi32(_mm_srli_si128(u, 8));
+
+      // v = (u << SGRPROJ_PRJ_BITS) + xq0 * (flt0 - u) + xq1 * (flt1 - u),
+      // where each filter term is only added if its radius is non-zero.
+      __m128i v_0 = _mm_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
+      __m128i v_1 = _mm_slli_epi32(u_1, SGRPROJ_PRJ_BITS);
+
+      if (params->r[0] > 0) {
+        const __m128i f1_0 = _mm_sub_epi32(xx_loadu_128(&flt0[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq0, f1_0));
+
+        const __m128i f1_1 = _mm_sub_epi32(xx_loadu_128(&flt0[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq0, f1_1));
+      }
+
+      if (params->r[1] > 0) {
+        const __m128i f2_0 = _mm_sub_epi32(xx_loadu_128(&flt1[k]), u_0);
+        v_0 = _mm_add_epi32(v_0, _mm_mullo_epi32(xq1, f2_0));
+
+        const __m128i f2_1 = _mm_sub_epi32(xx_loadu_128(&flt1[k + 4]), u_1);
+        v_1 = _mm_add_epi32(v_1, _mm_mullo_epi32(xq1, f2_1));
+      }
+
+      // Round and shift back down to pixel precision.
+      const __m128i w_0 = _mm_srai_epi32(_mm_add_epi32(v_0, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+      const __m128i w_1 = _mm_srai_epi32(_mm_add_epi32(v_1, rounding),
+                                         SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
+
+      if (highbd) {
+        // Pack into 16 bits and clamp to [0, 2^bit_depth)
+        const __m128i tmp = _mm_packus_epi32(w_0, w_1);
+        const __m128i max = _mm_set1_epi16((1 << bit_depth) - 1);
+        const __m128i res = _mm_min_epi16(tmp, max);
+        xx_storeu_128(CONVERT_TO_SHORTPTR(dst8 + m), res);
+      } else {
+        // Pack into 8 bits and clamp to [0, 256)
+        const __m128i tmp = _mm_packs_epi32(w_0, w_1);
+        const __m128i res = _mm_packus_epi16(tmp, tmp /* "don't care" value */);
+        xx_storel_64(dst8 + m, res);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/warp_plane_sse4.c b/third_party/aom/av1/common/x86/warp_plane_sse4.c
new file mode 100644
index 0000000000..b810cea2e5
--- /dev/null
+++ b/third_party/aom/av1/common/x86/warp_plane_sse4.c
@@ -0,0 +1,942 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/warped_motion.h"
+
+/* This is a modified version of 'warped_filter' from warped_motion.c:
+ * Each coefficient is stored in 8 bits instead of 16 bits
+ * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7
+
+ This is done in order to avoid overflow: Since the tap with the largest
+ coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
+ order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
+ convolve functions.
+
+ Instead, we use the summation order
+ ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
+ The rearrangement of coefficients in this table is so that we can get the
+ coefficients into the correct order more quickly.
+*/
+/* clang-format off */
+DECLARE_ALIGNED(8, static const int8_t,
+ filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
+#if WARPEDPIXEL_PREC_BITS == 6
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
+ { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
+ { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
+ { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
+ { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
+ { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
+ { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
+ { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
+ { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
+ { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
+ { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
+ { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
+ { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
+ { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
+ { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
+ { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
+ { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
+ { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
+ {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
+ {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
+ {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
+ {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
+ {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
+ {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
+ {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
+ {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
+ {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
+ {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
+ {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
+ {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
+ {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
+ {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
+ { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
+ { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
+ { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
+ { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
+ { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
+ { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
+ { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
+ { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
+ { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
+ { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
+ { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
+ { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
+ { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
+ { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
+ { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
+ { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
+ { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
+ // dummy (replicate row index 191)
+ { 0, 0, 2, -1, 0, 0, 127, 0},
+
+#else
+ // [-1, 0)
+ { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
+ { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
+ { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
+ { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
+ { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
+ { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
+ { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
+ { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
+ { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
+ { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
+ { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
+ { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
+ { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
+ { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
+ { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
+ { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
+ // [0, 1)
+ { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
+ { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
+ {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
+ {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
+ {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
+ {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
+ {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
+ {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
+ {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
+ {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
+ {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
+ {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
+ {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
+ {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
+ {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
+ { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
+ // [1, 2)
+ { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
+ { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
+ { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
+ { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
+ { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
+ { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
+ { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
+ { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
+ { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
+ { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
+ { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
+ { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
+ { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
+ { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
+ { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
+ { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
+ // dummy (replicate row index 95)
+ { 0, 0, 4, -3, 0, -1, 127, 1},
+#endif // WARPEDPIXEL_PREC_BITS == 6
+};
+/* clang-format on */
+
+// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
+// in an SSE register into two sequences:
+// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
+// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
+static const uint8_t even_mask[16] = { 0, 2, 2, 4, 4, 6, 6, 8,
+                                       8, 10, 10, 12, 12, 14, 14, 0 };
+static const uint8_t odd_mask[16] = { 1, 3, 3, 5, 5, 7, 7, 9,
+                                      9, 11, 11, 13, 13, 15, 15, 0 };
+/* Masks used by prepare_horizontal_filter_coeff_alpha0(): each one replicates
+ * a single byte pair of a loaded filter_8bit row into all eight 16-bit lanes.
+ * This one replicates bytes 0-1. */
+static const uint8_t shuffle_alpha0_mask01[16] = { 0, 1, 0, 1, 0, 1, 0, 1,
+                                                   0, 1, 0, 1, 0, 1, 0, 1 };
+// Replicates bytes 2-3.
+static const uint8_t shuffle_alpha0_mask23[16] = { 2, 3, 2, 3, 2, 3, 2, 3,
+                                                   2, 3, 2, 3, 2, 3, 2, 3 };
+// Replicates bytes 4-5.
+static const uint8_t shuffle_alpha0_mask45[16] = { 4, 5, 4, 5, 4, 5, 4, 5,
+                                                   4, 5, 4, 5, 4, 5, 4, 5 };
+// Replicates bytes 6-7.
+static const uint8_t shuffle_alpha0_mask67[16] = { 6, 7, 6, 7, 6, 7, 6, 7,
+                                                   6, 7, 6, 7, 6, 7, 6, 7 };
+/* Masks used by prepare_vertical_filter_coeffs_gamma0(): each one replicates
+ * one 4-byte group (bytes 0-3, 4-7, 8-11 or 12-15) of a loaded warped_filter
+ * row into all four 32-bit lanes. */
+static const uint8_t shuffle_gamma0_mask0[16] = { 0, 1, 2, 3, 0, 1, 2, 3,
+                                                  0, 1, 2, 3, 0, 1, 2, 3 };
+static const uint8_t shuffle_gamma0_mask1[16] = { 4, 5, 6, 7, 4, 5, 6, 7,
+                                                  4, 5, 6, 7, 4, 5, 6, 7 };
+static const uint8_t shuffle_gamma0_mask2[16] = { 8, 9, 10, 11, 8, 9, 10, 11,
+                                                  8, 9, 10, 11, 8, 9, 10, 11 };
+static const uint8_t shuffle_gamma0_mask3[16] = {
+  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15
+};
+
+// Horizontally filters one row of 16 source bytes (src) with the four
+// coefficient vectors in coeff[0..3] and stores the eight rounded, shifted
+// 16-bit results in tmp[k + 7].
+static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz, int k) {
+  // Split the row into the byte sequences 0 2 2 4 ... and 1 3 3 5 ...
+  const __m128i even_shuf = _mm_loadu_si128((__m128i *)even_mask);
+  const __m128i odd_shuf = _mm_loadu_si128((__m128i *)odd_mask);
+  const __m128i px_even = _mm_shuffle_epi8(src, even_shuf);
+  const __m128i px_odd = _mm_shuffle_epi8(src, odd_shuf);
+
+  // Taps 0 and 2. Pixel order: 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
+  const __m128i px_02 = _mm_unpacklo_epi64(px_even, px_odd);
+  const __m128i sum_02 = _mm_maddubs_epi16(px_02, coeff[0]);
+
+  // Taps 4 and 6. Pixel order: 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
+  const __m128i px_46 = _mm_unpacklo_epi64(_mm_srli_si128(px_even, 4),
+                                           _mm_srli_si128(px_odd, 4));
+  const __m128i sum_46 = _mm_maddubs_epi16(px_46, coeff[1]);
+
+  // Taps 1 and 3. Pixel order: 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
+  const __m128i px_13 =
+      _mm_unpacklo_epi64(px_odd, _mm_srli_si128(px_even, 2));
+  const __m128i sum_13 = _mm_maddubs_epi16(px_13, coeff[2]);
+
+  // Taps 5 and 7. Pixel order: 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
+  const __m128i px_57 = _mm_unpacklo_epi64(_mm_srli_si128(px_odd, 4),
+                                           _mm_srli_si128(px_even, 6));
+  const __m128i sum_57 = _mm_maddubs_epi16(px_57, coeff[3]);
+
+  const __m128i offset_and_round = _mm_set1_epi16(
+      (1 << offset_bits_horiz) + ((1 << reduce_bits_horiz) >> 1));
+
+  // Each of (sum_02 + sum_46) and (sum_13 + sum_57) fits in an int16, but
+  // their sum may not. Once offset_and_round is added as well, the total
+  // fits in a uint16. The wrapping behaviour of _mm_add_epi16 is relied on
+  // to get the correct result across these implicit type changes.
+  const __m128i sum_even_taps = _mm_add_epi16(sum_02, sum_46);
+  const __m128i sum_odd_taps = _mm_add_epi16(sum_13, sum_57);
+  const __m128i total = _mm_add_epi16(
+      _mm_add_epi16(sum_even_taps, sum_odd_taps), offset_and_round);
+
+  // Logical (unsigned) shift, since the total is treated as a uint16.
+  tmp[k + 7] = _mm_srl_epi16(total, _mm_cvtsi32_si128(reduce_bits_horiz));
+}
+
+// Builds the four coefficient vectors used by filter_src_pixels() for one
+// row. Output pixel p (0..7) uses the filter_8bit row selected by
+// (sx + p * alpha) >> WARPEDDIFF_PREC_BITS.
+static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
+                                                   __m128i *coeff) {
+  // Load the 8-byte filter row for each of the eight output pixels.
+  __m128i row[8];
+  for (int p = 0; p < 8; ++p) {
+    row[p] = _mm_loadl_epi64(
+        (__m128i *)&filter_8bit[(sx + p * alpha) >> WARPEDDIFF_PREC_BITS]);
+  }
+
+  // Interleave 16-bit pairs.
+  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
+  const __m128i pix_02 = _mm_unpacklo_epi16(row[0], row[2]);
+  // ... for pixels 1 3
+  const __m128i pix_13 = _mm_unpacklo_epi16(row[1], row[3]);
+  // ... for pixels 4 6
+  const __m128i pix_46 = _mm_unpacklo_epi16(row[4], row[6]);
+  // ... for pixels 5 7
+  const __m128i pix_57 = _mm_unpacklo_epi16(row[5], row[7]);
+
+  // Interleave 32-bit groups.
+  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
+  const __m128i even_px_lo = _mm_unpacklo_epi32(pix_02, pix_46);
+  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
+  const __m128i even_px_hi = _mm_unpackhi_epi32(pix_02, pix_46);
+  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
+  const __m128i odd_px_lo = _mm_unpacklo_epi32(pix_13, pix_57);
+  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
+  const __m128i odd_px_hi = _mm_unpackhi_epi32(pix_13, pix_57);
+
+  // Each output holds one coefficient pair for pixels 0 2 4 6 1 3 5 7.
+  coeff[0] = _mm_unpacklo_epi64(even_px_lo, odd_px_lo);  // Coeffs 0 2
+  coeff[1] = _mm_unpackhi_epi64(even_px_lo, odd_px_lo);  // Coeffs 4 6
+  coeff[2] = _mm_unpacklo_epi64(even_px_hi, odd_px_hi);  // Coeffs 1 3
+  coeff[3] = _mm_unpackhi_epi64(even_px_hi, odd_px_hi);  // Coeffs 5 7
+}
+
+// Variant of prepare_horizontal_filter_coeff() that loads a single
+// filter_8bit row, selected by sx >> WARPEDDIFF_PREC_BITS, and uses it for
+// all eight output pixels.
+static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
+                                                          __m128i *coeff) {
+  const __m128i filter_row =
+      _mm_loadl_epi64((__m128i *)&filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);
+
+  // Mask n replicates the byte pair holding, in order, coeffs 0 2, 4 6, 1 3
+  // and 5 7 across the whole register (for pixels 0 2 4 6 1 3 5 7).
+  const uint8_t *const pair_mask[4] = { shuffle_alpha0_mask01,
+                                        shuffle_alpha0_mask23,
+                                        shuffle_alpha0_mask45,
+                                        shuffle_alpha0_mask67 };
+  for (int n = 0; n < 4; ++n) {
+    coeff[n] = _mm_shuffle_epi8(filter_row,
+                                _mm_loadu_si128((__m128i *)pair_mask[n]));
+  }
+}
+
+// Filters one source row: derives the per-pixel coefficients from (sx, alpha)
+// and writes the filtered row to tmp[k + 7] via filter_src_pixels().
+static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
+                                     int alpha, int k,
+                                     const int offset_bits_horiz,
+                                     const int reduce_bits_horiz) {
+  __m128i taps[4];
+
+  prepare_horizontal_filter_coeff(alpha, sx, taps);
+  filter_src_pixels(src, tmp, taps, offset_bits_horiz, reduce_bits_horiz, k);
+}
+
+// Horizontal pass for the general case. For each k in
+// [-7, min(8, p_height - i)), loads 16 bytes from source row iy4 + k (clamped
+// to [0, height - 1]) starting at column ix4 - 7, filters them, and stores
+// the result in tmp[k + 7].
+static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
+                                          int stride, int32_t ix4, int32_t iy4,
+                                          int32_t sx4, int alpha, int beta,
+                                          int p_height, int height, int i,
+                                          const int offset_bits_horiz,
+                                          const int reduce_bits_horiz) {
+  const int k_end = AOMMIN(8, p_height - i);
+
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to the picture.
+    const int row = iy4 + k;
+    const int iy = (row < 0) ? 0 : ((row > height - 1) ? height - 1 : row);
+
+    // The filter position advances by beta per row.
+    const int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+                      reduce_bits_horiz);
+  }
+}
+
+// Horizontal pass that ignores alpha: every pixel of a row shares one filter
+// row, chosen from sx = sx4 + beta * (k + 4). Rows are clamped to
+// [0, height - 1] and results are stored in tmp[k + 7].
+static INLINE void warp_horizontal_filter_alpha0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)alpha;
+  const int k_end = AOMMIN(8, p_height - i);
+
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to the picture.
+    const int row = iy4 + k;
+    const int iy = (row < 0) ? 0 : ((row > height - 1) ? height - 1 : row);
+    const int sx = sx4 + beta * (k + 4);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+    // The coefficients depend on the row, so rebuild them every iteration.
+    __m128i taps[4];
+    prepare_horizontal_filter_coeff_alpha0(sx, taps);
+    filter_src_pixels(src, tmp, taps, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+// Horizontal pass that ignores beta: the coefficients are computed once from
+// (alpha, sx4) and reused for every row. Rows are clamped to [0, height - 1]
+// and results are stored in tmp[k + 7].
+static INLINE void warp_horizontal_filter_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+
+  // Row-invariant coefficients.
+  __m128i taps[4];
+  prepare_horizontal_filter_coeff(alpha, sx4, taps);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to the picture.
+    const int row = iy4 + k;
+    const int iy = (row < 0) ? 0 : ((row > height - 1) ? height - 1 : row);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, taps, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+// Horizontal pass that ignores both alpha and beta: a single filter row,
+// selected by sx4, is used for every pixel of every row. Rows are clamped to
+// [0, height - 1] and results are stored in tmp[k + 7].
+static INLINE void warp_horizontal_filter_alpha0_beta0(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  (void)beta;
+  (void)alpha;
+
+  // Coefficients are invariant across the whole block.
+  __m128i taps[4];
+  prepare_horizontal_filter_coeff_alpha0(sx4, taps);
+
+  const int k_end = AOMMIN(8, p_height - i);
+  for (int k = -7; k < k_end; ++k) {
+    // Clamp the source row to the picture.
+    const int row = iy4 + k;
+    const int iy = (row < 0) ? 0 : ((row > height - 1) ? height - 1 : row);
+
+    // Load source pixels
+    const __m128i src =
+        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+    filter_src_pixels(src, tmp, taps, offset_bits_horiz, reduce_bits_horiz, k);
+  }
+}
+
+// Fills in the constants used by the compound-averaging store path:
+//   *res_sub_const    = -(1 << (offset_bits - round_1))
+//                       - (1 << (offset_bits - round_1 - 1)), in every lane
+//   *round_bits_const = (1 << round_bits) >> 1, in every lane
+//   *wt               = fwd_offset and bck_offset interleaved as 16-bit pairs
+static INLINE void unpack_weights_and_set_round_const(
+    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
+    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
+  const int fwd_weight = conv_params->fwd_offset;
+  const int bck_weight = conv_params->bck_offset;
+
+  *res_sub_const =
+      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
+                     (1 << (offset_bits - conv_params->round_1 - 1)));
+  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));
+
+  // Lanes alternate fwd, bck, fwd, bck, ...
+  *wt = _mm_unpacklo_epi16(_mm_set1_epi16(fwd_weight),
+                           _mm_set1_epi16(bck_weight));
+}
+
+// Builds the eight coefficient vectors used by filter_src_pixels_vertical().
+// Column c (0..7) uses the warped_filter row selected by
+// (sy + c * gamma) >> WARPEDDIFF_PREC_BITS. coeffs[0..3] are built from the
+// even columns and coeffs[4..7] from the odd columns.
+static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
+                                                  __m128i *coeffs) {
+  // Load the filter row for each of the eight columns.
+  __m128i row[8];
+  for (int c = 0; c < 8; ++c) {
+    row[c] = _mm_loadu_si128(
+        (__m128i *)(warped_filter +
+                    ((sy + c * gamma) >> WARPEDDIFF_PREC_BITS)));
+  }
+
+  // parity 0 transposes columns 0 2 4 6 into coeffs[0..3];
+  // parity 1 transposes columns 1 3 5 7 into coeffs[4..7].
+  for (int parity = 0; parity < 2; ++parity) {
+    const __m128i lo_a = _mm_unpacklo_epi32(row[parity + 0], row[parity + 2]);
+    const __m128i lo_b = _mm_unpacklo_epi32(row[parity + 4], row[parity + 6]);
+    const __m128i hi_a = _mm_unpackhi_epi32(row[parity + 0], row[parity + 2]);
+    const __m128i hi_b = _mm_unpackhi_epi32(row[parity + 4], row[parity + 6]);
+
+    __m128i *const out = coeffs + 4 * parity;
+    out[0] = _mm_unpacklo_epi64(lo_a, lo_b);
+    out[1] = _mm_unpackhi_epi64(lo_a, lo_b);
+    out[2] = _mm_unpacklo_epi64(hi_a, hi_b);
+    out[3] = _mm_unpackhi_epi64(hi_a, hi_b);
+  }
+}
+
+// Variant of prepare_vertical_filter_coeffs() that loads a single
+// warped_filter row, selected by sy >> WARPEDDIFF_PREC_BITS, and uses it for
+// every column, so the even and odd coefficient sets are identical.
+static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
+                                                         __m128i *coeffs) {
+  const __m128i filter_row = _mm_loadu_si128(
+      (__m128i *)(warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));
+
+  // Mask n replicates the n-th 4-byte group of the row across the register.
+  const uint8_t *const group_mask[4] = { shuffle_gamma0_mask0,
+                                         shuffle_gamma0_mask1,
+                                         shuffle_gamma0_mask2,
+                                         shuffle_gamma0_mask3 };
+  for (int n = 0; n < 4; ++n) {
+    // even coeffs
+    coeffs[n] = _mm_shuffle_epi8(filter_row,
+                                 _mm_loadu_si128((__m128i *)group_mask[n]));
+    // odd coeffs
+    coeffs[n + 4] = coeffs[n];
+  }
+}
+
+// Vertical 8-tap filter over the eight rows tmp[k + 4] .. tmp[k + 11].
+// coeffs[0..3] are applied to the even columns and coeffs[4..7] to the odd
+// columns. The 32-bit sums for columns 0-3 are returned in *res_lo and those
+// for columns 4-7 in *res_hi.
+static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
+                                              __m128i *res_lo, __m128i *res_hi,
+                                              int k) {
+  const __m128i *const rows = tmp + (k + 4);
+
+  // Interleave pairs of consecutive rows. The low halves give the column
+  // order 0 0 2 2 4 4 6 6 ...
+  const __m128i even_r01 = _mm_unpacklo_epi16(rows[0], rows[1]);
+  const __m128i even_r23 = _mm_unpacklo_epi16(rows[2], rows[3]);
+  const __m128i even_r45 = _mm_unpacklo_epi16(rows[4], rows[5]);
+  const __m128i even_r67 = _mm_unpacklo_epi16(rows[6], rows[7]);
+  // ... and the high halves give 1 1 3 3 5 5 7 7.
+  const __m128i odd_r01 = _mm_unpackhi_epi16(rows[0], rows[1]);
+  const __m128i odd_r23 = _mm_unpackhi_epi16(rows[2], rows[3]);
+  const __m128i odd_r45 = _mm_unpackhi_epi16(rows[4], rows[5]);
+  const __m128i odd_r67 = _mm_unpackhi_epi16(rows[6], rows[7]);
+
+  // Filter even-index pixels
+  const __m128i even_a = _mm_add_epi32(_mm_madd_epi16(even_r01, coeffs[0]),
+                                       _mm_madd_epi16(even_r23, coeffs[1]));
+  const __m128i even_b = _mm_add_epi32(_mm_madd_epi16(even_r45, coeffs[2]),
+                                       _mm_madd_epi16(even_r67, coeffs[3]));
+  const __m128i sum_even = _mm_add_epi32(even_a, even_b);
+
+  // Filter odd-index pixels
+  const __m128i odd_a = _mm_add_epi32(_mm_madd_epi16(odd_r01, coeffs[4]),
+                                      _mm_madd_epi16(odd_r23, coeffs[5]));
+  const __m128i odd_b = _mm_add_epi32(_mm_madd_epi16(odd_r45, coeffs[6]),
+                                      _mm_madd_epi16(odd_r67, coeffs[7]));
+  const __m128i sum_odd = _mm_add_epi32(odd_a, odd_b);
+
+  // Rearrange pixels back into the order 0 ... 7
+  *res_lo = _mm_unpacklo_epi32(sum_even, sum_odd);
+  *res_hi = _mm_unpackhi_epi32(sum_even, sum_odd);
+}
+
+// Rounds one row of vertical-filter sums and writes it to the output.
+//
+// *res_lo / *res_hi hold the 32-bit sums for columns j..j+3 and j+4..j+7 of
+// output row (i + k + 4). Each half is biased by *res_add_const and shifted
+// right by reduce_bits_vert, then:
+//  - compound, no averaging: four 16-bit values per half are stored to
+//    conv_params->dst;
+//  - compound with averaging: the values already in conv_params->dst are
+//    combined with the new ones (weighted by *wt when use_jnt_comp_avg is set,
+//    otherwise (a + b) >> 1), offset by *res_sub_const, rounded by round_bits
+//    and stored to pred as four 8-bit pixels per half;
+//  - non-compound: the row is saturated to 8 bits and stored to pred.
+// In the compound paths the upper half is only processed when p_width > 4.
+// NOTE(review): wt, res_sub_const and round_bits_const are produced by
+// unpack_weights_and_set_round_const(), which is outside this view.
+static INLINE void store_vertical_filter_output(
+ __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
+ const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
+ uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
+ const int reduce_bits_vert, int p_stride, int p_width,
+ const int round_bits) {
+ __m128i res_lo_1 = *res_lo;
+ __m128i res_hi_1 = *res_hi;
+
+ if (conv_params->is_compound) {
+ // Compound prediction: columns j..j+3 first.
+ __m128i *const p =
+ (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
+ res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
+ reduce_bits_vert);
+ // Narrow to 16 bits with unsigned saturation.
+ const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
+ __m128i res_lo_16;
+ if (conv_params->do_average) {
+ __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+ // Four 16-bit values previously written to the compound buffer.
+ const __m128i p_16 = _mm_loadl_epi64(p);
+
+ if (conv_params->use_jnt_comp_avg) {
+ // Interleave old/new values so one madd applies the weight pair in *wt.
+ const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
+ const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
+ }
+
+ res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);
+
+ res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
+ // Write exactly four 8-bit pixels.
+ *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
+ } else {
+ _mm_storel_epi64(p, temp_lo_16);
+ }
+ if (p_width > 4) {
+ // Same processing for columns j+4..j+7.
+ __m128i *const p4 =
+ (__m128i *)&conv_params
+ ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
+ res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
+ reduce_bits_vert);
+ const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
+ __m128i res_hi_16;
+
+ if (conv_params->do_average) {
+ __m128i *const dst8_4 =
+ (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
+ const __m128i p4_16 = _mm_loadl_epi64(p4);
+
+ if (conv_params->use_jnt_comp_avg) {
+ const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
+ const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
+ const __m128i shifted_32 =
+ _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+ res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
+ } else {
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
+ }
+ res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);
+
+ res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
+ round_bits);
+ __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
+ *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);
+
+ } else {
+ _mm_storel_epi64(p4, temp_hi_16);
+ }
+ }
+ } else {
+ // Non-compound prediction: round both halves and narrow straight to 8 bits.
+ const __m128i res_lo_round = _mm_srai_epi32(
+ _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
+ const __m128i res_hi_round = _mm_srai_epi32(
+ _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);
+
+ const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+ __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+ // Store, blending with 'pred' if needed
+ __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];
+
+ // Note: If we're outputting a 4x4 block, we need to be very careful
+ // to only output 4 pixels at this point, to avoid encode/decode
+ // mismatches when encoding with multiple threads.
+ if (p_width == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
+ } else {
+ _mm_storel_epi64(p, res_8bit);
+ }
+ }
+}
+
+// Vertical pass of the warp filter for the general case (selected by
+// prepare_warp_vertical_filter() when gamma != 0 and delta != 0). The filter
+// coefficients depend on gamma and on the per-row phase, so they are rebuilt
+// for every output row.
+static INLINE void warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  __m128i sub_const, rnd_const, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rnd_const, &weights);
+
+  // At most 8 rows (k = -4..3); fewer when the block ends inside this tile.
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    // Filter phase for this row.
+    const int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs(gamma, sy, coeffs);
+
+    __m128i sum_lo;
+    __m128i sum_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rnd_const, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass of the warp filter for gamma == 0, delta != 0. The
+// coefficients are rebuilt for every output row from the per-row phase alone;
+// gamma is accepted only to keep the signature uniform with the other
+// variants.
+static INLINE void warp_vertical_filter_gamma0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)gamma;
+
+  __m128i sub_const, rnd_const, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rnd_const, &weights);
+
+  // At most 8 rows (k = -4..3); fewer when the block ends inside this tile.
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    // Filter phase for this row.
+    const int sy = sy4 + delta * (k + 4);
+
+    __m128i coeffs[8];
+    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);
+
+    __m128i sum_lo;
+    __m128i sum_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rnd_const, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass of the warp filter for gamma != 0, delta == 0. The phase is
+// the same for every row, so the coefficients are built once from sy4 and
+// reused for the whole tile; delta is accepted only to keep the signature
+// uniform with the other variants.
+static INLINE void warp_vertical_filter_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;
+
+  __m128i sub_const, rnd_const, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rnd_const, &weights);
+
+  // Row-invariant coefficients.
+  __m128i coeffs[8];
+  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
+
+  // At most 8 rows (k = -4..3); fewer when the block ends inside this tile.
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    __m128i sum_lo;
+    __m128i sum_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rnd_const, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Vertical pass of the warp filter for gamma == 0 and delta == 0. A single
+// set of coefficients, built once from sy4, is applied to every row of the
+// tile; gamma and delta are accepted only to keep the signature uniform with
+// the other variants.
+static INLINE void warp_vertical_filter_gamma0_delta0(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  (void)delta;
+  (void)gamma;
+
+  __m128i sub_const, rnd_const, weights;
+  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
+                                     &sub_const, &rnd_const, &weights);
+
+  // Row-invariant coefficients.
+  __m128i coeffs[8];
+  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
+
+  // At most 8 rows (k = -4..3); fewer when the block ends inside this tile.
+  const int k_end = AOMMIN(4, p_height - i - 4);
+  for (int k = -4; k < k_end; ++k) {
+    __m128i sum_lo;
+    __m128i sum_hi;
+    filter_src_pixels_vertical(tmp, coeffs, &sum_lo, &sum_hi, k);
+
+    store_vertical_filter_output(&sum_lo, &sum_hi, res_add_const, &weights,
+                                 &sub_const, &rnd_const, pred, conv_params, i,
+                                 j, k, reduce_bits_vert, p_stride, p_width,
+                                 round_bits);
+  }
+}
+
+// Selects the vertical warp filter specialisation matching which of gamma and
+// delta are zero, and forwards every argument to it unchanged.
+static INLINE void prepare_warp_vertical_filter(
+    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
+    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
+    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
+    const int round_bits, const int offset_bits) {
+  if (gamma == 0) {
+    if (delta == 0) {
+      warp_vertical_filter_gamma0_delta0(
+          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+          j, sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
+    } else {
+      warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta,
+                                  p_height, p_stride, p_width, i, j, sy4,
+                                  reduce_bits_vert, res_add_const, round_bits,
+                                  offset_bits);
+    }
+  } else if (delta == 0) {
+    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
+                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                                res_add_const, round_bits, offset_bits);
+  } else {
+    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
+                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
+                         res_add_const, round_bits, offset_bits);
+  }
+}
+
+// Selects the horizontal warp filter specialisation matching which of alpha
+// and beta are zero, and forwards every argument to it unchanged.
+static INLINE void prepare_warp_horizontal_filter(
+    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
+    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
+    const int offset_bits_horiz, const int reduce_bits_horiz) {
+  if (alpha == 0) {
+    if (beta == 0) {
+      warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4,
+                                          alpha, beta, p_height, height, i,
+                                          offset_bits_horiz, reduce_bits_horiz);
+    } else {
+      warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
+                                    beta, p_height, height, i,
+                                    offset_bits_horiz, reduce_bits_horiz);
+    }
+  } else if (beta == 0) {
+    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                                 p_height, height, i, offset_bits_horiz,
+                                 reduce_bits_horiz);
+  } else {
+    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
+                           p_height, height, i, offset_bits_horiz,
+                           reduce_bits_horiz);
+  }
+}
+
+// SSE4.1 affine warp of an 8-bit prediction block.
+//
+// The p_width x p_height output block (top-left at column p_col, row p_row)
+// is processed in 8x8 tiles. For each tile the centre position is mapped
+// through mat[0..5] to a source position, which is split into an integer part
+// (ix4, iy4) and a fractional part (sx4, sy4). A horizontal pass then fills
+// tmp[] with up to 15 filtered source rows and a vertical pass writes the tile
+// to pred (and/or conv_params->dst for compound prediction).
+//
+// ref / width / height / stride describe the source plane; source rows are
+// clamped to [0, height - 1], and columns outside the frame are handled by the
+// special cases below. alpha/beta are forwarded to the horizontal filters and
+// gamma/delta to the vertical filters.
+void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
+ int height, int stride, uint8_t *pred, int p_col,
+ int p_row, int p_width, int p_height, int p_stride,
+ int subsampling_x, int subsampling_y,
+ ConvolveParams *conv_params, int16_t alpha,
+ int16_t beta, int16_t gamma, int16_t delta) {
+ // Horizontally filtered rows for the current tile, indexed by k + 7 with
+ // k in [-7, 7].
+ __m128i tmp[15];
+ int i, j, k;
+ const int bd = 8;
+ const int reduce_bits_horiz = conv_params->round_0;
+ const int reduce_bits_vert = conv_params->is_compound
+ ? conv_params->round_1
+ : 2 * FILTER_BITS - reduce_bits_horiz;
+ const int offset_bits_horiz = bd + FILTER_BITS - 1;
+ assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
+
+ const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
+ const __m128i reduce_bits_vert_const =
+ _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
+ const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
+
+ /* Note: For this code to work, the left/right frame borders need to be
+ extended by at least 13 pixels each. By the time we get here, other
+ code will have set up this border, but we allow an explicit check
+ for debugging purposes.
+ */
+ /*for (i = 0; i < height; ++i) {
+ for (j = 0; j < 13; ++j) {
+ assert(ref[i * stride - 13 + j] == ref[i * stride]);
+ assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+ }
+ }*/
+ // Constant added to the vertical sums before the final shift: the rounding
+ // term plus an offset of 1 << offset_bits_vert for compound prediction, or
+ // the rounding term minus 1 << (bd + reduce_bits_vert - 1) otherwise.
+ __m128i res_add_const_1;
+ if (conv_params->is_compound == 1) {
+ res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
+ } else {
+ res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
+ ((1 << reduce_bits_vert) >> 1));
+ }
+
+ for (i = 0; i < p_height; i += 8) {
+ for (j = 0; j < p_width; j += 8) {
+ // Map the centre of this 8x8 tile through the affine model.
+ const int32_t src_x = (p_col + j + 4) << subsampling_x;
+ const int32_t src_y = (p_row + i + 4) << subsampling_y;
+ const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
+ const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
+ const int32_t x4 = dst_x >> subsampling_x;
+ const int32_t y4 = dst_y >> subsampling_y;
+
+ // Integer (ix4, iy4) and fractional (sx4, sy4) parts of the position.
+ int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+ int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+ int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+ // Add in all the constant terms, including rounding and offset
+ sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+ sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
+ (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
+
+ // Clear the low WARP_PARAM_REDUCE_BITS bits of both phases.
+ sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+ sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
+
+ // Horizontal filter
+ // If the block is aligned such that, after clamping, every sample
+ // would be taken from the leftmost/rightmost column, then we can
+ // skip the expensive horizontal filter.
+ if (ix4 <= -7) {
+ // Entirely left of the frame: every tap reads column 0 of the row.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] = _mm_set1_epi16(
+ (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (ix4 >= width + 6) {
+ // Entirely right of the frame: every tap reads column width - 1.
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ tmp[k + 7] =
+ _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
+ ref[iy * stride + (width - 1)] *
+ (1 << (FILTER_BITS - reduce_bits_horiz)));
+ }
+ } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
+ // The 16-pixel load starting at column ix4 - 7 straddles a frame edge.
+ // NOTE(review): warp_pad_left / warp_pad_right are defined outside this
+ // view; presumably they are byte-shuffle masks that replicate the edge
+ // pixel over the out-of-frame lanes - confirm.
+ const int out_of_boundary_left = -(ix4 - 6);
+ const int out_of_boundary_right = (ix4 + 8) - width;
+ for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
+ int iy = iy4 + k;
+ if (iy < 0)
+ iy = 0;
+ else if (iy > height - 1)
+ iy = height - 1;
+ int sx = sx4 + beta * (k + 4);
+
+ // Load source pixels
+ __m128i src =
+ _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+ if (out_of_boundary_left >= 0) {
+ const __m128i shuffle_reg_left =
+ _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_left);
+ }
+ if (out_of_boundary_right >= 0) {
+ const __m128i shuffle_reg_right = _mm_loadu_si128(
+ (__m128i *)warp_pad_right[out_of_boundary_right]);
+ src = _mm_shuffle_epi8(src, shuffle_reg_right);
+ }
+ horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
+ reduce_bits_horiz);
+ }
+ } else {
+ // Fully inside the frame horizontally: use the specialised filters.
+ prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
+ beta, p_height, height, i,
+ offset_bits_horiz, reduce_bits_horiz);
+ }
+
+ // Vertical filter
+ prepare_warp_vertical_filter(
+ pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
+ j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
+ }
+ }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_avx2.c b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
new file mode 100644
index 0000000000..87a6e12396
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_avx2.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
+// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
+// on the left.
+// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
+// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
+// AVX2 separable Wiener convolution with "add source" for 8-bit pixels.
+//
+// A horizontal pass filters h + SUBPEL_TAPS - 2 source rows (starting
+// center_tap rows above and center_tap columns left of src) with filter_x,
+// rounds by conv_params->round_0, clamps to
+// [0, WIENER_CLAMP_LIMIT(round_0, bd) - 1] and stores 16-bit results in temp.
+// A vertical pass then filters temp with filter_y, rounds by
+// conv_params->round_1, saturates to 8 bits and writes w x h pixels to dst.
+// 1 << FILTER_BITS is added to tap 3 of both filters, which implements the
+// "add_src" term.
+//
+// x_step_q4 and y_step_q4 must be 16 and w must be a multiple of 8 (asserted).
+// NOTE(review): both passes step 16 columns at a time, so when w is not a
+// multiple of 16 the last iteration reads and writes 8 columns beyond w -
+// confirm that callers tolerate this.
+void av1_wiener_convolve_add_src_avx2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  const int bd = 8;
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  DECLARE_ALIGNED(32, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  // The vertical pass loads 8 rows per output row, so for the last output row
+  // it reads one row past those written by the horizontal pass. Zero that
+  // whole row; the size is in bytes, hence the sizeof (temp is uint16_t).
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0,
+         MAX_SB_SIZE * sizeof(*temp));
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero_128 = _mm_setzero_si128();
+  const __m256i zero_256 = _mm256_setzero_si256();
+
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
+
+  const __m256i clamp_low = zero_256;
+  const __m256i clamp_high =
+      _mm256_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1);
+
+  /* Horizontal filter */
+  {
+    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
+    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
+
+    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const = _mm256_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (int i = 0; i < intermediate_height; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint8_t *data_ij = src_ptr + i * src_stride + j;
+
+        // Load 8-bit src data
+        const __m128i data_0 = xx_loadu_128(data_ij + 0);
+        const __m128i data_1 = xx_loadu_128(data_ij + 1);
+        const __m128i data_2 = xx_loadu_128(data_ij + 2);
+        const __m128i data_3 = xx_loadu_128(data_ij + 3);
+        const __m128i data_4 = xx_loadu_128(data_ij + 4);
+        const __m128i data_5 = xx_loadu_128(data_ij + 5);
+        const __m128i data_6 = xx_loadu_128(data_ij + 6);
+        const __m128i data_7 = xx_loadu_128(data_ij + 7);
+
+        // (Zero-)Extend 8-bit data to 16-bit data
+        const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
+        const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
+        const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
+        const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
+        const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
+        const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
+        const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
+        const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
+
+        // Multiply src data by filter coeffs and sum pairs
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        // Calculate scalar product for even- and odd-indices separately,
+        // increasing to 32-bit precision
+        const __m256i res_even_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
+        const __m256i res_odd_sum = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
+
+        const __m256i res_even = _mm256_srai_epi32(
+            _mm256_add_epi32(res_even_sum, round_const), conv_params->round_0);
+        const __m256i res_odd = _mm256_srai_epi32(
+            _mm256_add_epi32(res_odd_sum, round_const), conv_params->round_0);
+
+        // Reduce to 16-bit precision and pack even- and odd-index results
+        // back into one register. The _mm256_packs_epi32 intrinsic returns
+        // a register with the pixels ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
+        const __m256i res_clamped =
+            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
+
+        // Store in a temporary array
+        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
+    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
+
+    // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
+    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
+    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
+
+    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
+    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
+    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
+    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
+    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
+    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
+    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
+    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
+
+    const __m256i round_const =
+        _mm256_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                          (1 << (bd + conv_params->round_1 - 1)));
+
+    for (int i = 0; i < h; ++i) {
+      for (int j = 0; j < w; j += 16) {
+        const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
+
+        // Load 16-bit data from the output of the horizontal filter in
+        // which the pixels are ordered as follows:
+        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
+        const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
+        const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
+        const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
+        const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
+        const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
+        const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
+        const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
+        const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
+
+        // Filter the even-indices, increasing to 32-bit precision
+        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
+        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
+        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
+        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
+
+        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
+        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
+        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
+        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
+
+        const __m256i res_even = _mm256_add_epi32(
+            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
+
+        // Filter the odd-indices, increasing to 32-bit precision
+        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
+        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
+        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
+        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
+
+        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
+        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
+        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
+        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
+
+        const __m256i res_odd = _mm256_add_epi32(
+            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
+
+        // Pixels are currently in the following order:
+        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
+        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
+        //
+        // Rearrange the pixels into the following order:
+        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
+        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
+        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
+        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
+
+        const __m256i res_lo_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m256i res_hi_round = _mm256_srai_epi32(
+            _mm256_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        // Reduce to 16-bit precision and pack into the correct order:
+        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
+        const __m256i res_16bit =
+            _mm256_packs_epi32(res_lo_round, res_hi_round);
+
+        // Reduce to 8-bit precision. This messes up the order:
+        // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
+        // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit =
+            _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
+
+        // Swap the two central 32-bit values to get the order:
+        // [ - - - - - - - - - - - - - - - - ]
+        // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
+        const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
+
+        // Store the lower 128-bit lane in the dst array
+        xx_storeu_128(dst + i * dst_stride + j,
+                      _mm256_castsi256_si128(res_8bit2));
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/common/x86/wiener_convolve_sse2.c b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
new file mode 100644
index 0000000000..f9d00b7330
--- /dev/null
+++ b/third_party/aom/av1/common/x86/wiener_convolve_sse2.c
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/convolve.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+
+// SSE2 separable Wiener convolution with "add source" for 8-bit pixels.
+//
+// A horizontal pass filters h + SUBPEL_TAPS - 2 source rows (starting
+// center_tap rows above and center_tap columns left of src) with filter_x,
+// rounds by conv_params->round_0, clamps to
+// [0, WIENER_CLAMP_LIMIT(round_0, bd) - 1] and stores 16-bit results in temp,
+// 8 columns at a time in the column order 0, 2, 4, 6, 1, 3, 5, 7.
+// A vertical pass then filters temp with filter_y, rounds by
+// conv_params->round_1, saturates to 8 bits and writes w x h pixels to dst.
+// 1 << FILTER_BITS is added to tap 3 of both filters, which implements the
+// "add_src" term.
+//
+// x_step_q4 and y_step_q4 must be 16 and w must be a multiple of 8 (asserted).
+void av1_wiener_convolve_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                      uint8_t *dst, ptrdiff_t dst_stride,
+                                      const int16_t *filter_x, int x_step_q4,
+                                      const int16_t *filter_y, int y_step_q4,
+                                      int w, int h,
+                                      const ConvolveParams *conv_params) {
+  const int bd = 8;
+  assert(x_step_q4 == 16 && y_step_q4 == 16);
+  assert(!(w & 7));
+  (void)x_step_q4;
+  (void)y_step_q4;
+
+  DECLARE_ALIGNED(16, uint16_t,
+                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
+  int intermediate_height = h + SUBPEL_TAPS - 2;
+  // The vertical pass loads 8 rows per output row, so for the last output row
+  // it reads one row past those written by the horizontal pass. Zero that
+  // whole row; the size is in bytes, hence the sizeof (temp is uint16_t).
+  memset(temp + (intermediate_height * MAX_SB_SIZE), 0,
+         MAX_SB_SIZE * sizeof(*temp));
+  int i, j;
+  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
+  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
+
+  const __m128i zero = _mm_setzero_si128();
+  // Add an offset to account for the "add_src" part of the convolve function.
+  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
+
+  /* Horizontal filter */
+  {
+    const __m128i coeffs_x =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const = _mm_set1_epi32(
+        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));
+
+    for (i = 0; i < intermediate_height; ++i) {
+      for (j = 0; j < w; j += 8) {
+        const __m128i data =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+        // Filter even-index pixels
+        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                         _mm_add_epi32(res_2, res_6));
+        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
+                                  conv_params->round_0);
+
+        // Filter odd-index pixels
+        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                        _mm_add_epi32(res_3, res_7));
+        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
+                                 conv_params->round_0);
+
+        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+        __m128i res = _mm_packs_epi32(res_even, res_odd);
+        res = _mm_min_epi16(
+            _mm_max_epi16(res, zero),
+            _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1));
+        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
+      }
+    }
+  }
+
+  /* Vertical filter */
+  {
+    const __m128i coeffs_y =
+        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
+
+    // coeffs 0 1 0 1 2 3 2 3
+    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+    // coeffs 4 5 4 5 6 7 6 7
+    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+    // coeffs 0 1 0 1 0 1 0 1
+    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+    // coeffs 2 3 2 3 2 3 2 3
+    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+    // coeffs 4 5 4 5 4 5 4 5
+    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+    // coeffs 6 7 6 7 6 7 6 7
+    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+    const __m128i round_const =
+        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
+                       (1 << (bd + conv_params->round_1 - 1)));
+
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; j += 8) {
+        // Filter even-index pixels
+        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
+        const __m128i src_0 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_2 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_4 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_6 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                               _mm_add_epi32(res_4, res_6));
+
+        // Filter odd-index pixels
+        const __m128i src_1 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
+        const __m128i src_3 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
+        const __m128i src_5 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
+        const __m128i src_7 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
+                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
+
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                              _mm_add_epi32(res_5, res_7));
+
+        // Rearrange pixels back into the order 0 ... 7
+        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+        const __m128i res_lo_round = _mm_srai_epi32(
+            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
+        const __m128i res_hi_round = _mm_srai_epi32(
+            _mm_add_epi32(res_hi, round_const), conv_params->round_1);
+
+        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+        _mm_storel_epi64(p, res_8bit);
+      }
+    }
+  }
+}
diff --git a/third_party/aom/av1/decoder/accounting.c b/third_party/aom/av1/decoder/accounting.c
new file mode 100644
index 0000000000..8d8f3dfdb4
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "av1/decoder/accounting.h"
+
+/* Maps a NUL-terminated string to a slot index below
+ * AOM_ACCOUNTING_HASH_SIZE: the bytes are summed as unsigned chars in a
+ * 32-bit accumulator and the sum is reduced modulo the table size. The hash
+ * is deliberately trivial. */
+static int aom_accounting_hash(const char *str) {
+  const unsigned char *p;
+  uint32_t sum = 0;
+  for (p = (const unsigned char *)str; *p != 0; ++p) {
+    sum += *p;
+  }
+  return sum % AOM_ACCOUNTING_HASH_SIZE;
+}
+
+/* Dictionary lookup based on an open-addressing hash table.
+ *
+ * Returns the dictionary index of str. If str is not present yet, a copy of
+ * it is appended to the dictionary, the index is stored in the first unused
+ * hash slot found by linear probing (wrapping at AOM_ACCOUNTING_HASH_SIZE),
+ * and the new index is returned. The copy is released by
+ * aom_accounting_clear(). Aborts if the dictionary is full or the copy cannot
+ * be allocated. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str) {
+  int hash;
+  size_t len;
+  char *copy;
+  AccountingDictionary *dictionary;
+  dictionary = &accounting->syms.dictionary;
+  hash = aom_accounting_hash(str);
+  while (accounting->hash_dictionary[hash] != -1) {
+    if (strcmp(dictionary->strs[accounting->hash_dictionary[hash]], str) == 0) {
+      return accounting->hash_dictionary[hash];
+    }
+    hash++;
+    if (hash == AOM_ACCOUNTING_HASH_SIZE) hash = 0;
+  }
+  /* No match found. */
+  assert(dictionary->num_strs + 1 < MAX_SYMBOL_TYPES);
+  /* The assert disappears in release builds; never write past strs[]. */
+  if (dictionary->num_strs >= MAX_SYMBOL_TYPES) abort();
+  len = strlen(str);
+  copy = malloc(len + 1);
+  /* Fail loudly instead of copying through a NULL pointer. */
+  if (copy == NULL) abort();
+  memcpy(copy, str, len + 1);
+  dictionary->strs[dictionary->num_strs] = copy;
+  accounting->hash_dictionary[hash] = dictionary->num_strs;
+  dictionary->num_strs++;
+  return dictionary->num_strs - 1;
+}
+
+/* Prepares an Accounting object for use: allocates the initial symbol array
+ * (1000 entries), empties the string dictionary, marks every hash slot as
+ * unused (-1) and resets counters and context through
+ * aom_accounting_reset(). Aborts if the symbol array cannot be allocated. */
+void aom_accounting_init(Accounting *accounting) {
+  int i;
+  accounting->num_syms_allocated = 1000;
+  accounting->syms.syms =
+      malloc(sizeof(AccountingSymbol) * accounting->num_syms_allocated);
+  /* aom_accounting_record() stores into this array without checking it. */
+  if (accounting->syms.syms == NULL) abort();
+  accounting->syms.dictionary.num_strs = 0;
+  assert(AOM_ACCOUNTING_HASH_SIZE > 2 * MAX_SYMBOL_TYPES);
+  for (i = 0; i < AOM_ACCOUNTING_HASH_SIZE; i++)
+    accounting->hash_dictionary[i] = -1;
+  aom_accounting_reset(accounting);
+}
+
+/* Discards all recorded symbols and call counters, sets the context to
+ * (-1, -1) and zeroes last_tell_frac. The symbol array and the string
+ * dictionary are left untouched. */
+void aom_accounting_reset(Accounting *accounting) {
+  AccountingSymbols *const recorded = &accounting->syms;
+
+  recorded->num_syms = 0;
+  recorded->num_binary_syms = 0;
+  recorded->num_multi_syms = 0;
+
+  accounting->context.x = -1;
+  accounting->context.y = -1;
+  accounting->last_tell_frac = 0;
+}
+
+/* Frees the symbol array and every string stored in the dictionary. The
+ * Accounting object itself is not freed and its fields are not reset. */
+void aom_accounting_clear(Accounting *accounting) {
+  AccountingDictionary *const dict = &accounting->syms.dictionary;
+  int n = 0;
+
+  free(accounting->syms.syms);
+  while (n < dict->num_strs) {
+    free(dict->strs[n]);
+    ++n;
+  }
+}
+
+/* Sets the (x, y) context that subsequently recorded symbols are tagged
+ * with. */
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y) {
+  AccountingSymbolContext *const ctx = &accounting->context;
+  ctx->y = y;
+  ctx->x = x;
+}
+
+/* Records `bits` (in units of 1/8 bit) for the syntax element named `str`
+ * under the current context.
+ *
+ * If the most recently recorded symbol has the same context and the same
+ * dictionary id, its bit count and sample count are updated in place.
+ * Otherwise a new symbol is appended, doubling the symbol array when it is
+ * full. Aborts if the array cannot be grown. */
+void aom_accounting_record(Accounting *accounting, const char *str,
+                           uint32_t bits) {
+  AccountingSymbol sym;
+  /* Resolve the id once. Every path through the original code performed this
+     lookup at least once, and repeating it returns the same id. */
+  const uint32_t id = aom_accounting_dictionary_lookup(accounting, str);
+  // Reuse previous symbol if it has the same context and symbol id.
+  if (accounting->syms.num_syms) {
+    AccountingSymbol *last_sym;
+    last_sym = &accounting->syms.syms[accounting->syms.num_syms - 1];
+    if (last_sym->id == id &&
+        memcmp(&last_sym->context, &accounting->context,
+               sizeof(AccountingSymbolContext)) == 0) {
+      last_sym->bits += bits;
+      last_sym->samples++;
+      return;
+    }
+  }
+  sym.context = accounting->context;
+  sym.samples = 1;
+  sym.bits = bits;
+  sym.id = id;
+  assert(sym.id <= 255);
+  if (accounting->syms.num_syms == accounting->num_syms_allocated) {
+    const int new_count = accounting->num_syms_allocated * 2;
+    /* Keep the old pointer until realloc is known to have succeeded. */
+    AccountingSymbol *const grown =
+        realloc(accounting->syms.syms, sizeof(AccountingSymbol) * new_count);
+    if (grown == NULL) abort();
+    accounting->syms.syms = grown;
+    accounting->num_syms_allocated = new_count;
+  }
+  accounting->syms.syms[accounting->syms.num_syms++] = sym;
+}
+
+/* Prints a summary of the recorded symbols to stdout: the number of recorded
+ * syntax elements, the total and binary symbol call counts, and one line per
+ * recorded symbol with its name, context, bits (stored value divided by 8)
+ * and sample count. */
+void aom_accounting_dump(Accounting *accounting) {
+  const AccountingSymbols *const recorded = &accounting->syms;
+  int idx;
+
+  printf("\n----- Number of recorded syntax elements = %d -----\n",
+         recorded->num_syms);
+  printf("----- Total number of symbol calls = %d (%d binary) -----\n",
+         recorded->num_multi_syms + recorded->num_binary_syms,
+         recorded->num_binary_syms);
+
+  for (idx = 0; idx < recorded->num_syms; idx++) {
+    const AccountingSymbol *const entry = &recorded->syms[idx];
+    printf("%s x: %d, y: %d bits: %f samples: %d\n",
+           recorded->dictionary.strs[entry->id], entry->context.x,
+           entry->context.y, (float)entry->bits / 8.0, entry->samples);
+  }
+}
diff --git a/third_party/aom/av1/decoder/accounting.h b/third_party/aom/av1/decoder/accounting.h
new file mode 100644
index 0000000000..288e5e63e3
--- /dev/null
+++ b/third_party/aom/av1/decoder/accounting.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_ACCOUNTING_H_
+#define AOM_AV1_DECODER_ACCOUNTING_H_
+#include <stdlib.h>
+#include "aom/aomdx.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#define AOM_ACCOUNTING_HASH_SIZE (1021)
+
+/* Max number of entries for symbol types in the dictionary (increase as
+ necessary). */
+#define MAX_SYMBOL_TYPES (256)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define AOM_ACCT_BITRES (3)
+
+/** Position a recorded symbol is attributed to. aom_accounting_reset() sets
+ both fields to -1; decode_mbmi_block() in decodeframe.c passes mi_col as x
+ and mi_row as y. */
+typedef struct {
+ int16_t x;
+ int16_t y;
+} AccountingSymbolContext;
+
+/** One recorded syntax element; consecutive records with the same context
+ and id are merged into a single entry by aom_accounting_record(). */
+typedef struct {
+ /** Context that was current when the symbol was recorded. */
+ AccountingSymbolContext context;
+ /** Index of the symbol name in the string dictionary. */
+ uint32_t id;
+ /** Number of bits in units of 1/8 bit. */
+ uint32_t bits;
+ /** Number of aom_accounting_record() calls merged into this entry. */
+ uint32_t samples;
+} AccountingSymbol;
+
+/** String table mapping a symbol id (array index) to its name. Entries
+    [0, num_strs) are heap-allocated copies made by
+    aom_accounting_dictionary_lookup(). */
+typedef struct {
+  char *strs[MAX_SYMBOL_TYPES];
+  int num_strs;
+} AccountingDictionary;
+
+/** The recorded symbols together with call counters and the name
+ dictionary. */
+typedef struct {
+ /** All recorded symbols decoded. */
+ AccountingSymbol *syms;
+ /** Number of syntax elements actually recorded, i.e. the number of entries
+ in use in syms. */
+ int num_syms;
+ /** Raw symbol decoding calls for non-binary values. */
+ int num_multi_syms;
+ /** Raw binary symbol decoding calls. */
+ int num_binary_syms;
+ /** Dictionary for translating strings into id. */
+ AccountingDictionary dictionary;
+} AccountingSymbols;
+
+/** Top-level accounting state. */
+struct Accounting {
+ /** Recorded symbols, counters and name dictionary. */
+ AccountingSymbols syms;
+ /** Size allocated for symbols (not all may be used). */
+ int num_syms_allocated;
+ /** Open-addressing hash table: each slot holds an index into
+ syms.dictionary.strs, or -1 when the slot is unused. */
+ int16_t hash_dictionary[AOM_ACCOUNTING_HASH_SIZE];
+ /** Context attached to symbols recorded from now on. */
+ AccountingSymbolContext context;
+ /** Zeroed by aom_accounting_reset(); not otherwise used in accounting.c.
+ NOTE(review): presumably the last bit-position reading taken by the
+ entropy decoder - confirm against its users. */
+ uint32_t last_tell_frac;
+};
+
+/** Allocates the symbol array, empties the dictionary and hash table, and
+ resets counters and context. */
+void aom_accounting_init(Accounting *accounting);
+/** Clears the recorded symbols, call counters and context; keeps the
+ allocated symbol array and the dictionary. */
+void aom_accounting_reset(Accounting *accounting);
+/** Frees the symbol array and all dictionary strings. */
+void aom_accounting_clear(Accounting *accounting);
+/** Sets the context attached to subsequently recorded symbols. */
+void aom_accounting_set_context(Accounting *accounting, int16_t x, int16_t y);
+/** Returns the dictionary index of str, inserting a copy of str if it is not
+ present yet. */
+int aom_accounting_dictionary_lookup(Accounting *accounting, const char *str);
+/** Records bits (in units of 1/8 bit) for the symbol named str under the
+ current context, merging with the previous record when context and name
+ match. */
+void aom_accounting_record(Accounting *accounting, const char *str,
+ uint32_t bits);
+/** Prints all recorded symbols and counters to stdout. */
+void aom_accounting_dump(Accounting *accounting);
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_ACCOUNTING_H_
diff --git a/third_party/aom/av1/decoder/decodeframe.c b/third_party/aom/av1/decoder/decodeframe.c
new file mode 100644
index 0000000000..31f14b531f
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.c
@@ -0,0 +1,5567 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/frame_buffers.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+#include "av1/common/obmc.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodetxb.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+// This is needed by ext_tile related unit tests.
+#define EXT_TILE_DEBUG 1
+#define MC_TEMP_BUF_PELS \
+ (((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2) * \
+ ((MAX_SB_SIZE)*2 + (AOM_INTERP_EXTEND)*2))
+
+// Verifies the trailing bits: the bits up to the next byte boundary must be a
+// single 1 followed only by 0s. If the reader is already byte aligned, a full
+// extra byte is read and checked.
+// Returns 0 on success. On mismatch, sets the common error code to
+// AOM_CODEC_CORRUPT_FRAME and returns -1.
+int av1_check_trailing_bits(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
+  // Evaluates to 8 when bit_offset is a multiple of 8, otherwise to the
+  // distance to the next byte boundary.
+  const int num_bits = 8 - rb->bit_offset % 8;
+  const int expected = 1 << (num_bits - 1);
+
+  if (aom_rb_read_literal(rb, num_bits) == expected) return 0;
+
+  pbi->common.error.error_code = AOM_CODEC_CORRUPT_FRAME;
+  return -1;
+}
+
+// Fills the cropped area of the frame planes with neutral grey:
+// 1 << (bit_depth - 1) for high bit depth buffers, 1 << 7 otherwise.
+// Use only_chroma = 1 to only set the chroma planes
+static void set_planes_to_neutral_grey(const SequenceHeader *const seq_params,
+                                       const YV12_BUFFER_CONFIG *const buf,
+                                       int only_chroma) {
+  if (seq_params->use_highbitdepth) {
+    const int val = 1 << (seq_params->bit_depth - 1);
+    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+      const int is_uv = plane > 0;
+      uint16_t *const base = CONVERT_TO_SHORTPTR(buf->buffers[plane]);
+      // Set the first row to neutral grey. Then copy the first row to all
+      // subsequent rows.
+      if (buf->crop_heights[is_uv] > 0) {
+        aom_memset16(base, val, buf->crop_widths[is_uv]);
+        for (int row_idx = 1; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+          memcpy(&base[row_idx * buf->strides[is_uv]], base,
+                 sizeof(*base) * buf->crop_widths[is_uv]);
+        }
+      }
+    }
+  } else {
+    for (int plane = only_chroma; plane < MAX_MB_PLANE; plane++) {
+      const int is_uv = plane > 0;
+      for (int row_idx = 0; row_idx < buf->crop_heights[is_uv]; row_idx++) {
+        // Step rows with the stride of the plane being written, as the high
+        // bit depth path does. Using the chroma stride for the luma plane
+        // puts rows at the wrong offsets whenever the two strides differ.
+        memset(&buf->buffers[plane][row_idx * buf->strides[is_uv]], 1 << 7,
+               buf->crop_widths[is_uv]);
+      }
+    }
+  }
+}
+
+static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ aom_reader *const r, int plane,
+ int runit_idx);
+
+// Fills the fixed tables of forward and backward reference frames in cm.
+static void setup_compound_reference_mode(AV1_COMMON *cm) {
+  // Backward references.
+  cm->comp_bwd_ref[0] = BWDREF_FRAME;
+  cm->comp_bwd_ref[1] = ALTREF2_FRAME;
+  cm->comp_bwd_ref[2] = ALTREF_FRAME;
+
+  // Forward references.
+  cm->comp_fwd_ref[0] = LAST_FRAME;
+  cm->comp_fwd_ref[1] = LAST2_FRAME;
+  cm->comp_fwd_ref[2] = LAST3_FRAME;
+  cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+}
+
+// Returns 1 when a read of len bytes starting at start is non-empty and fits
+// in front of end, 0 otherwise.
+static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
+  if (len == 0) return 0;
+  return len <= (size_t)(end - start);
+}
+
+// Returns ONLY_4X4 without reading anything when the frame is coded
+// lossless. Otherwise reads one bit: 1 selects TX_MODE_SELECT, 0 selects
+// TX_MODE_LARGEST.
+static TX_MODE read_tx_mode(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  if (!cm->coded_lossless) {
+    const int select_bit = aom_rb_read_bit(rb);
+    return select_bit ? TX_MODE_SELECT : TX_MODE_LARGEST;
+  }
+  return ONLY_4X4;
+}
+
+// Intra-only frames always use SINGLE_REFERENCE and consume no bits. For
+// other frames one bit is read: 1 selects REFERENCE_MODE_SELECT, 0 selects
+// SINGLE_REFERENCE.
+static REFERENCE_MODE read_frame_reference_mode(
+    const AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  if (frame_is_intra_only(cm)) return SINGLE_REFERENCE;
+
+  if (aom_rb_read_bit(rb)) return REFERENCE_MODE_SELECT;
+  return SINGLE_REFERENCE;
+}
+
+// Copies the dequantized coefficients of the current transform block (up to
+// and including max_scan_line) from the plane's coefficient store into the
+// dqcoeff scratch buffer, runs the inverse transform onto dst, then zeroes
+// the scratch entries that were used.
+static void inverse_transform_block(MACROBLOCKD *xd, int plane,
+                                    const TX_TYPE tx_type,
+                                    const TX_SIZE tx_size, uint8_t *dst,
+                                    int stride, int reduced_tx_set) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const scratch = pd->dqcoeff;
+  const eob_info *const info = pd->eob_data + xd->txb_offset[plane];
+  const uint16_t last_line = info->max_scan_line;
+  const uint16_t eob = info->eob;
+  const size_t used_bytes = (last_line + 1) * sizeof(scratch[0]);
+
+  memcpy(scratch, pd->dqcoeff_block + xd->cb_offset[plane], used_bytes);
+  av1_inverse_transform_block(xd, scratch, plane, tx_type, tx_size, dst,
+                              stride, eob, reduced_tx_set);
+  memset(scratch, 0, used_bytes);
+}
+
+// Reads the coefficients of one intra transform block unless the current
+// block is marked as skipped. With TXCOEFF_TIMER enabled, the time spent
+// reading is accumulated in cm and the transform block counter is bumped.
+static void read_coeffs_tx_intra_block(const AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd,
+                                       aom_reader *const r, const int plane,
+                                       const int row, const int col,
+                                       const TX_SIZE tx_size) {
+  if (xd->mi[0]->skip) return;
+
+#if TXCOEFF_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+  av1_read_coeffs_txb_facade(cm, xd, r, plane, row, col, tx_size);
+#if TXCOEFF_TIMER
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  cm->txcoeff_timer += elapsed_time;
+  ++cm->txb_count;
+#endif
+}
+
+// No-op with the same signature as the per-transform-block visitors in this
+// file; every argument is ignored.
+static void decode_block_void(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                              aom_reader *const r, const int plane,
+                              const int row, const int col,
+                              const TX_SIZE tx_size) {
+  // Reference each parameter once to silence unused-parameter warnings.
+  (void)tx_size;
+  (void)col;
+  (void)row;
+  (void)plane;
+  (void)r;
+  (void)xd;
+  (void)cm;
+}
+
+// No-op inter prediction function; every argument is ignored.
+static void predict_inter_block_void(AV1_COMMON *const cm,
+                                     MACROBLOCKD *const xd, int mi_row,
+                                     int mi_col, BLOCK_SIZE bsize) {
+  // Reference each parameter once to silence unused-parameter warnings.
+  (void)bsize;
+  (void)mi_col;
+  (void)mi_row;
+  (void)xd;
+  (void)cm;
+}
+
+// No-op CfL store function for inter blocks; both arguments are ignored.
+static void cfl_store_inter_block_void(AV1_COMMON *const cm,
+                                       MACROBLOCKD *const xd) {
+  // Reference each parameter once to silence unused-parameter warnings.
+  (void)xd;
+  (void)cm;
+}
+
+// Predicts one intra transform block and, unless the block is skipped or has
+// no coefficients (eob == 0), applies the inverse transform on top of the
+// prediction. For the luma plane the result is additionally stored for CfL
+// when store_cfl_required() says so. The reader `r` is unused.
+static void predict_and_reconstruct_intra_block(
+    const AV1_COMMON *const cm, MACROBLOCKD *const xd, aom_reader *const r,
+    const int plane, const int row, const int col, const TX_SIZE tx_size) {
+  MB_MODE_INFO *const mode_info = xd->mi[0];
+  const PLANE_TYPE type = get_plane_type(plane);
+  (void)r;
+
+  av1_predict_intra_block_facade(cm, xd, plane, col, row, tx_size);
+
+  if (!mode_info->skip) {
+    struct macroblockd_plane *const p = &xd->plane[plane];
+
+    // tx_type will be read out in av1_read_coeffs_txb_facade
+    const TX_TYPE tx_type =
+        av1_get_tx_type(type, xd, row, col, tx_size, cm->reduced_tx_set_used);
+    const eob_info *const info = p->eob_data + xd->txb_offset[plane];
+    if (info->eob != 0) {
+      uint8_t *const target =
+          &p->dst.buf[(row * p->dst.stride + col) << tx_size_wide_log2[0]];
+      inverse_transform_block(xd, plane, tx_type, tx_size, target,
+                              p->dst.stride, cm->reduced_tx_set_used);
+    }
+  }
+
+  if (plane == AOM_PLANE_Y && store_cfl_required(cm, xd)) {
+    cfl_store_tx(xd, row, col, tx_size, mode_info->sb_type);
+  }
+}
+
+// Looks up the transform type for the transform block at (blk_row, blk_col)
+// of `plane` and runs inverse_transform_block() on the matching position of
+// the plane's destination buffer. The reader `r` is unused.
+static void inverse_transform_inter_block(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int blk_row, const int blk_col,
+ const TX_SIZE tx_size) {
+ (void)r;
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+ // tx_type will be read out in av1_read_coeffs_txb_facade
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+
+ // Block coordinates are scaled to a buffer offset by tx_size_wide_log2[0].
+ uint8_t *dst =
+ &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+ inverse_transform_block(xd, plane, tx_type, tx_size, dst, pd->dst.stride,
+ cm->reduced_tx_set_used);
+#if CONFIG_MISMATCH_DEBUG
+ // NOTE(review): mi_col and mi_row are not parameters or locals of this
+ // function; unless they are defined elsewhere this section presumably does
+ // not compile with CONFIG_MISMATCH_DEBUG enabled - confirm.
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_check_block_tx(dst, pd->dst.stride, cm->frame_offset, plane, pixel_c,
+ pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+#endif
+}
+
+// Advances the plane's coefficient buffer offset past a transform block of
+// the given size and recomputes the transform block index from it, in units
+// of TX_SIZE_W_MIN * TX_SIZE_H_MIN coefficients.
+static void set_cb_buffer_offsets(MACROBLOCKD *const xd, TX_SIZE tx_size,
+                                  int plane) {
+  const int min_txb_area = TX_SIZE_W_MIN * TX_SIZE_H_MIN;
+
+  xd->cb_offset[plane] += tx_size_wide[tx_size] * tx_size_high[tx_size];
+  // Derived from the stored offset so both fields stay consistent.
+  xd->txb_offset[plane] = xd->cb_offset[plane] / min_txb_area;
+}
+
+// Processes the transform blocks of one inter-coded plane block, starting at
+// (blk_row, blk_col) with candidate size tx_size. If tx_size equals the
+// transform size selected for this position, or plane != 0, the
+// coefficient-read and inverse-transform visitors stored in `td` are called,
+// the block's eob is added to *eob_total and the coefficient buffer offsets
+// are advanced. Otherwise the block is split according to sub_tx_size_map and
+// every sub-block inside the visible area is handled recursively. Positions
+// outside the visible area are skipped.
+static void decode_reconstruct_tx(AV1_COMMON *cm, ThreadData *const td,
+ aom_reader *r, MB_MODE_INFO *const mbmi,
+ int plane, BLOCK_SIZE plane_bsize,
+ int blk_row, int blk_col, int block,
+ TX_SIZE tx_size, int *eob_total) {
+ MACROBLOCKD *const xd = &td->xd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ // Chroma uses the largest transform allowed for the block; luma uses the
+ // per-position size stored in the mode info.
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+ // Scale to match transform block unit.
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ if (tx_size == plane_tx_size || plane) {
+ td->read_coeffs_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
+
+ td->inverse_tx_inter_block_visit(cm, xd, r, plane, blk_row, blk_col,
+ tx_size);
+ // Read the eob before set_cb_buffer_offsets() moves txb_offset on.
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ *eob_total += eob_data->eob;
+ set_cb_buffer_offsets(xd, tx_size, plane);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ assert(IMPLIES(tx_size <= TX_4X4, sub_txs == tx_size));
+ assert(IMPLIES(tx_size > TX_4X4, sub_txs < tx_size));
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int sub_step = bsw * bsh;
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize, offsetr,
+ offsetc, block, sub_txs, eob_total);
+ block += sub_step;
+ }
+ }
+ }
+}
+
+// Points xd at the block located at (mi_row, mi_col): selects its mode info
+// entry, records bsize in it, makes every mode info grid cell covered by the
+// block (x_mis columns by y_mis rows) point at that same entry, and sets up
+// plane sizes, skip context, edge distances and destination buffers.
+static void set_offsets(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, int bw,
+ int bh, int x_mis, int y_mis) {
+ const int num_planes = av1_num_planes(cm);
+
+ const int offset = mi_row * cm->mi_stride + mi_col;
+ const TileInfo *const tile = &xd->tile;
+
+ xd->mi = cm->mi_grid_visible + offset;
+ xd->mi[0] = &cm->mi[offset];
+ // TODO(slavarnway): Generate sb_type based on bwl and bhl, instead of
+ // passing bsize from decode_partition().
+ xd->mi[0]->sb_type = bsize;
+#if CONFIG_RD_DEBUG
+ xd->mi[0]->mi_row = mi_row;
+ xd->mi[0]->mi_col = mi_col;
+#endif
+ xd->cfl.mi_row = mi_row;
+ xd->cfl.mi_col = mi_col;
+
+ assert(x_mis && y_mis);
+ // Fill the first grid row, then replicate it into the remaining rows.
+ for (int x = 1; x < x_mis; ++x) xd->mi[x] = xd->mi[0];
+ int idx = cm->mi_stride;
+ for (int y = 1; y < y_mis; ++y) {
+ memcpy(&xd->mi[idx], &xd->mi[0], x_mis * sizeof(xd->mi[0]));
+ idx += cm->mi_stride;
+ }
+
+ set_plane_n4(xd, bw, bh, num_planes);
+ set_skip_context(xd, mi_row, mi_col, num_planes);
+
+ // Distance of Mb to the various image edges. These are specified to 8th pel
+ // as they are always compared to values that are in 1/8th pel units
+ set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+}
+
+// Reads the mode info of the block at (mi_row, mi_col): sets up the block
+// offsets, stores the partition type, parses the mode info, raises
+// AOM_CODEC_CORRUPT_FRAME when the chroma block size derived from bsize is
+// invalid, and merges any reader error into xd->corrupted.
+static void decode_mbmi_block(AV1Decoder *const pbi, MACROBLOCKD *const xd,
+                              int mi_row, int mi_col, aom_reader *r,
+                              PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &pbi->common;
+  const SequenceHeader *const seq = &cm->seq_params;
+  const int width_mi = mi_size_wide[bsize];
+  const int height_mi = mi_size_high[bsize];
+  // Clip the block extent to the frame boundary.
+  const int cols_in_frame = AOMMIN(width_mi, cm->mi_cols - mi_col);
+  const int rows_in_frame = AOMMIN(height_mi, cm->mi_rows - mi_row);
+
+#if CONFIG_ACCOUNTING
+  aom_accounting_set_context(&pbi->accounting, mi_col, mi_row);
+#endif
+  set_offsets(cm, xd, bsize, mi_row, mi_col, width_mi, height_mi,
+              cols_in_frame, rows_in_frame);
+  xd->mi[0]->partition = partition;
+  av1_read_mode_info(pbi, xd, mi_row, mi_col, r, cols_in_frame,
+                     rows_in_frame);
+
+  if (bsize >= BLOCK_8X8 && (seq->subsampling_x || seq->subsampling_y) &&
+      ss_size_lookup[bsize][seq->subsampling_x][seq->subsampling_y] ==
+          BLOCK_INVALID) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Invalid block size.");
+  }
+
+  aom_merge_corrupted_flag(&xd->corrupted, aom_reader_has_error(r));
+}
+
+// Integer bounds of a reference block. dec_calc_subpel_params() stores the
+// top-left coordinate in (x0, y0) and the bottom-right coordinate plus one in
+// (x1, y1).
+typedef struct PadBlock {
+  int x0, x1;
+  int y0, y1;
+} PadBlock;
+
+// High bit depth border extension. Writes a b_w x b_h block of 16-bit pixels
+// to dst8 whose top-left corner is at (x, y) relative to a w x h reference
+// frame; positions outside the frame are filled with the nearest pixel of
+// the same (clamped) row. src8 must address position (x, y) of the frame.
+// Both buffers are passed as uint8_t pointers and converted with
+// CONVERT_TO_SHORTPTR. b_h must be at least 1 (do/while loop).
+static void highbd_build_mc_border(const uint8_t *src8, int src_stride,
+ uint8_t *dst8, int dst_stride, int x, int y,
+ int b_w, int b_h, int w, int h) {
+ // Get a pointer to the start of the real data for this row.
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const uint16_t *ref_row = src - x - y * src_stride;
+
+ // Clamp the starting row to [0, h - 1].
+ if (y >= h)
+ ref_row += (h - 1) * src_stride;
+ else if (y > 0)
+ ref_row += y * src_stride;
+
+ do {
+ // left/right: pixels to replicate from the row's first/last pixel;
+ // copy: pixels taken from the frame.
+ int right = 0, copy;
+ int left = x < 0 ? -x : 0;
+
+ if (left > b_w) left = b_w;
+
+ if (x + b_w > w) right = x + b_w - w;
+
+ if (right > b_w) right = b_w;
+
+ copy = b_w - left - right;
+
+ if (left) aom_memset16(dst, ref_row[0], left);
+
+ if (copy) memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
+
+ if (right) aom_memset16(dst + left + copy, ref_row[w - 1], right);
+
+ dst += dst_stride;
+ ++y;
+
+ // Advance the source row only while the next row is inside the frame.
+ if (y > 0 && y < h) ref_row += src_stride;
+ } while (--b_h);
+}
+
+// 8-bit border extension. Writes a b_w x b_h block to dst whose top-left
+// corner is at (x, y) relative to a w x h reference frame; positions outside
+// the frame are filled with the nearest pixel of the same (clamped) row. src
+// must address position (x, y) of the frame. b_h must be at least 1.
+static void build_mc_border(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int x, int y, int b_w, int b_h,
+                            int w, int h) {
+  // The column split is identical for every row: `left` replicated pixels,
+  // then `copy` pixels taken from the frame, then `right` replicated pixels.
+  int left = x < 0 ? -x : 0;
+  if (left > b_w) left = b_w;
+  int right = x + b_w > w ? x + b_w - w : 0;
+  if (right > b_w) right = b_w;
+  const int copy = b_w - left - right;
+
+  // Start of the frame row feeding the first output row, with the row index
+  // clamped to [0, h - 1].
+  const uint8_t *ref_row = src - x - y * src_stride;
+  if (y >= h) {
+    ref_row += (h - 1) * src_stride;
+  } else if (y > 0) {
+    ref_row += y * src_stride;
+  }
+
+  do {
+    uint8_t *out = dst;
+
+    if (left) memset(out, ref_row[0], left);
+    out += left;
+    if (copy) memcpy(out, ref_row + x + left, copy);
+    out += copy;
+    if (right) memset(out, ref_row[w - 1], right);
+
+    dst += dst_stride;
+    ++y;
+
+    // Advance the source row only while the next row is inside the frame.
+    if (y > 0 && y < h) ref_row += src_stride;
+  } while (--b_h);
+}
+
+// Decides whether the reference block needs border extension and grows
+// *block by the interpolation filter margins where sub-pixel filtering or
+// scaling applies (setting *x_pad / *y_pad to 1 for the affected direction).
+// Returns 1 when the resulting block reaches outside the reference frame,
+// 0 otherwise. Nothing is modified for intrabc or warped blocks, or when the
+// block is unscaled, has a zero motion vector and the frame dimensions are
+// multiples of 8.
+static INLINE int update_extend_mc_border_params(
+    const struct scale_factors *const sf, struct buf_2d *const pre_buf,
+    MV32 scaled_mv, PadBlock *block, int subpel_x_mv, int subpel_y_mv,
+    int do_warp, int is_intrabc, int *x_pad, int *y_pad) {
+  const int is_scaled = av1_is_scaled(sf);
+  // Get reference width and height.
+  const int ref_w = pre_buf->width;
+  const int ref_h = pre_buf->height;
+
+  if (is_intrabc || do_warp) return 0;
+
+  // Do border extension if there is motion or
+  // width/height is not a multiple of 8 pixels.
+  if (!(is_scaled || scaled_mv.col || scaled_mv.row || (ref_w & 0x7) ||
+        (ref_h & 0x7))) {
+    return 0;
+  }
+
+  if (subpel_x_mv || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+    block->x0 -= AOM_INTERP_EXTEND - 1;
+    block->x1 += AOM_INTERP_EXTEND;
+    *x_pad = 1;
+  }
+
+  if (subpel_y_mv || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+    block->y0 -= AOM_INTERP_EXTEND - 1;
+    block->y1 += AOM_INTERP_EXTEND;
+    *y_pad = 1;
+  }
+
+  // Extension is only needed when the block is not fully inside the frame.
+  return block->x0 < 0 || block->x1 > ref_w - 1 || block->y0 < 0 ||
+         block->y1 > ref_h - 1;
+}
+
+// When update_extend_mc_border_params() reports that the reference block
+// reaches outside the frame, builds a border-extended copy of it in mc_buf
+// and redirects *pre and *src_stride to that copy. Otherwise *pre and
+// *src_stride are left untouched.
+static INLINE void extend_mc_border(const struct scale_factors *const sf,
+                                    struct buf_2d *const pre_buf,
+                                    MV32 scaled_mv, PadBlock block,
+                                    int subpel_x_mv, int subpel_y_mv,
+                                    int do_warp, int is_intrabc, int highbd,
+                                    uint8_t *mc_buf, uint8_t **pre,
+                                    int *src_stride) {
+  int x_pad = 0, y_pad = 0;
+  if (!update_extend_mc_border_params(sf, pre_buf, scaled_mv, &block,
+                                      subpel_x_mv, subpel_y_mv, do_warp,
+                                      is_intrabc, &x_pad, &y_pad)) {
+    return;
+  }
+
+  // Get reference block pointer.
+  const int ref_stride = pre_buf->stride;
+  const uint8_t *const ref_ptr =
+      pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+  const int ext_w = block.x1 - block.x0;
+  const int ext_h = block.y1 - block.y0;
+
+  // Extend the border. The copy in mc_buf is tightly packed: stride == ext_w.
+  if (highbd) {
+    highbd_build_mc_border(ref_ptr, ref_stride, mc_buf, ext_w, block.x0,
+                           block.y0, ext_w, ext_h, pre_buf->width,
+                           pre_buf->height);
+  } else {
+    build_mc_border(ref_ptr, ref_stride, mc_buf, ext_w, block.x0, block.y0,
+                    ext_w, ext_h, pre_buf->width, pre_buf->height);
+  }
+
+  *src_stride = ext_w;
+  // Skip the filter margin that was added on the top and left, if any.
+  *pre = mc_buf + y_pad * (AOM_INTERP_EXTEND - 1) * ext_w +
+         x_pad * (AOM_INTERP_EXTEND - 1);
+}
+
+// Computes, for one prediction block of size bw x bh in `plane`, the
+// sub-pixel parameters (*subpel_params), the integer bounds of the reference
+// block (*block), the motion vector used for the border decision
+// (*scaled_mv) and its sub-pixel parts (*subpel_x_mv, *subpel_y_mv). Two
+// paths exist: one for scaled references (av1_is_scaled(sf)) and one for
+// unscaled references.
+static INLINE void dec_calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, SubpelParams *subpel_params, int bw, int bh,
+ PadBlock *block, int mi_x, int mi_y, MV32 *scaled_mv, int *subpel_x_mv,
+ int *subpel_y_mv) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ // Position in the current frame plus the motion vector, before scaling.
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;
+ orig_pos_y += mv.row * (1 << (1 - ssy));
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ // Clamp the scaled position to the frame extended by the margins below.
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+
+ // Get reference block top left coordinate.
+ block->x0 = pos_x >> SCALE_SUBPEL_BITS;
+ block->y0 = pos_y >> SCALE_SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 =
+ ((pos_x + (bw - 1) * subpel_params->xs) >> SCALE_SUBPEL_BITS) + 1;
+ block->y1 =
+ ((pos_y + (bh - 1) * subpel_params->ys) >> SCALE_SUBPEL_BITS) + 1;
+
+ MV temp_mv;
+ temp_mv = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh, pd->subsampling_x,
+ pd->subsampling_y);
+ *scaled_mv = av1_scale_mv(&temp_mv, (mi_x + x), (mi_y + y), sf);
+ scaled_mv->row += SCALE_EXTRA_OFF;
+ scaled_mv->col += SCALE_EXTRA_OFF;
+
+ *subpel_x_mv = scaled_mv->col & SCALE_SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SCALE_SUBPEL_MASK;
+ } else {
+ // Get block position in current frame.
+ int pos_x = (pre_x + x) << SUBPEL_BITS;
+ int pos_y = (pre_y + y) << SUBPEL_BITS;
+
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+
+ // Get reference block top left coordinate.
+ pos_x += mv_q4.col;
+ pos_y += mv_q4.row;
+ block->x0 = pos_x >> SUBPEL_BITS;
+ block->y0 = pos_y >> SUBPEL_BITS;
+
+ // Get reference block bottom right coordinate.
+ block->x1 = (pos_x >> SUBPEL_BITS) + (bw - 1) + 1;
+ block->y1 = (pos_y >> SUBPEL_BITS) + (bh - 1) + 1;
+
+ // No scaling: the clamped motion vector is used as is.
+ scaled_mv->row = mv_q4.row;
+ scaled_mv->col = mv_q4.col;
+ *subpel_x_mv = scaled_mv->col & SUBPEL_MASK;
+ *subpel_y_mv = scaled_mv->row & SUBPEL_MASK;
+ }
+}
+
+static INLINE void dec_build_inter_predictors(const AV1_COMMON *cm,
+ MACROBLOCKD *xd, int plane,
+ const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw,
+ int bh, int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+ // block size
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };
+
+ int row = row_start;
+ int src_stride;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ PadBlock block;
+ MV32 scaled_mv;
+ int subpel_x_mv, subpel_y_mv;
+ int highbd;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf,
+ &subpel_params, bw, bh, &block, mi_x, mi_y,
+ &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+ pre = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ src_stride = pre_buf->stride;
+ highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv,
+ subpel_y_mv, 0, is_intrabc, highbd, xd->mc_buf[ref],
+ &pre, &src_stride);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, src_stride, dst, dst_buf->stride, &subpel_params, sf, b4_w,
+ b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];
+ return;
+ }
+
+ {
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ uint8_t *pre[2];
+ SubpelParams subpel_params[2];
+ int src_stride[2];
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+ const MV mv = mi->mv[ref].as_mv;
+ PadBlock block;
+ MV32 scaled_mv;
+ int subpel_x_mv, subpel_y_mv;
+ int highbd;
+
+ dec_calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf,
+ &subpel_params[ref], bw, bh, &block, mi_x, mi_y,
+ &scaled_mv, &subpel_x_mv, &subpel_y_mv);
+ pre[ref] = pre_buf->buf0 + block.y0 * pre_buf->stride + block.x0;
+ src_stride[ref] = pre_buf->stride;
+ highbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ int do_warp = (bw >= 8 && bh >= 8 &&
+ av1_allow_warp(mi, &warp_types,
+ &xd->global_motion[mi->ref_frame[ref]],
+ build_for_obmc, subpel_params[ref].xs,
+ subpel_params[ref].ys, NULL));
+ do_warp = (do_warp && xd->cur_frame_force_integer_mv == 0);
+
+ extend_mc_border(sf, pre_buf, scaled_mv, block, subpel_x_mv, subpel_y_mv,
+ do_warp, is_intrabc, highbd, xd->mc_buf[ref], &pre[ref],
+ &src_stride[ref]);
+ }
+
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type))
+ av1_make_masked_inter_predictor(
+ pre[ref], src_stride[ref], dst, dst_buf->stride,
+ &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+ plane, &warp_types, mi_x >> pd->subsampling_x,
+ mi_y >> pd->subsampling_y, ref, xd, cm->allow_warped_motion);
+ else
+ av1_make_inter_predictor(
+ pre[ref], src_stride[ref], dst, dst_buf->stride,
+ &subpel_params[ref], sf, bw, bh, &conv_params, mi->interp_filters,
+ &warp_types, mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+}
+
+// Calls dec_build_inter_predictors() for each plane in the inclusive range
+// [plane_from, plane_to]. Planes for which is_chroma_reference() reports
+// false at (mi_row, mi_col, bsize) are skipped.
+static void dec_build_inter_predictors_for_planes(const AV1_COMMON *cm,
+                                                  MACROBLOCKD *xd,
+                                                  BLOCK_SIZE bsize, int mi_row,
+                                                  int mi_col, int plane_from,
+                                                  int plane_to) {
+  // Mode-info coordinates scaled by MI_SIZE.
+  const int origin_x = mi_col * MI_SIZE;
+  const int origin_y = mi_row * MI_SIZE;
+
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const struct macroblockd_plane *const plane_info = &xd->plane[p];
+    const int is_ref =
+        is_chroma_reference(mi_row, mi_col, bsize, plane_info->subsampling_x,
+                            plane_info->subsampling_y);
+    if (is_ref) {
+      dec_build_inter_predictors(cm, xd, p, xd->mi[0], 0, plane_info->width,
+                                 plane_info->height, origin_x, origin_y);
+    }
+  }
+}
+
+// Builds the inter prediction for plane 0 only. When is_interintra_pred()
+// holds for the current block, av1_build_interintra_predictors_sbp() is then
+// called for plane 0; a NULL ctx is replaced by a buffer set that points at
+// the plane-0 destination buffer.
+static void dec_build_inter_predictors_sby(const AV1_COMMON *cm,
+                                           MACROBLOCKD *xd, int mi_row,
+                                           int mi_col, BUFFER_SET *ctx,
+                                           BLOCK_SIZE bsize) {
+  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 0, 0);
+  if (!is_interintra_pred(xd->mi[0])) return;
+
+  struct buf_2d *const y_dst = &xd->plane[0].dst;
+  BUFFER_SET fallback_ctx = { { y_dst->buf, NULL, NULL },
+                              { y_dst->stride, 0, 0 } };
+  BUFFER_SET *const used_ctx = (ctx != NULL) ? ctx : &fallback_ctx;
+  av1_build_interintra_predictors_sbp(cm, xd, y_dst->buf, y_dst->stride,
+                                      used_ctx, 0, bsize);
+}
+
+// Builds the inter prediction for planes 1 .. MAX_MB_PLANE - 1. When
+// is_interintra_pred() holds for the current block,
+// av1_build_interintra_predictors_sbuv() is then called; a NULL ctx is
+// replaced by a buffer set that points at the plane-1 and plane-2
+// destination buffers.
+static void dec_build_inter_predictors_sbuv(const AV1_COMMON *cm,
+                                            MACROBLOCKD *xd, int mi_row,
+                                            int mi_col, BUFFER_SET *ctx,
+                                            BLOCK_SIZE bsize) {
+  dec_build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, 1,
+                                        MAX_MB_PLANE - 1);
+  if (!is_interintra_pred(xd->mi[0])) return;
+
+  struct buf_2d *const u_dst = &xd->plane[1].dst;
+  struct buf_2d *const v_dst = &xd->plane[2].dst;
+  BUFFER_SET fallback_ctx = { { NULL, u_dst->buf, v_dst->buf },
+                              { 0, u_dst->stride, v_dst->stride } };
+  BUFFER_SET *const used_ctx = (ctx != NULL) ? ctx : &fallback_ctx;
+  av1_build_interintra_predictors_sbuv(cm, xd, u_dst->buf, v_dst->buf,
+                                       u_dst->stride, v_dst->stride, used_ctx,
+                                       bsize);
+}
+
+// Builds the inter prediction for plane 0 and, when av1_num_planes() reports
+// more than one plane, for the remaining planes as well.
+static void dec_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          BUFFER_SET *ctx, BLOCK_SIZE bsize) {
+  const int has_extra_planes = av1_num_planes(cm) > 1;
+
+  dec_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+  if (has_extra_planes) {
+    dec_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+  }
+}
+
+// Callback handed to foreach_overlappable_nb_above(): builds the prediction
+// for one above neighbour. fun_ctxt is a struct build_prediction_ctxt. The
+// neighbour's mode info is copied, and the copy is what the setup helper and
+// the predictor receive.
+static INLINE void dec_build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *const bp_ctx =
+      (struct build_prediction_ctxt *)fun_ctxt;
+  const int nb_mi_col = bp_ctx->mi_col + rel_mi_col;
+  MB_MODE_INFO nb_mbmi = *above_mbmi;
+
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+                                           &nb_mbmi, bp_ctx, num_planes);
+
+  const int mi_x = nb_mi_col << MI_SIZE_LOG2;
+  const int mi_y = bp_ctx->mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(cur_bsize, pd, 0)) continue;
+
+    // Width follows the neighbour; height is half the current block's
+    // height, clamped to [4, half of the 64x64 height], in plane units.
+    const int half_shift_y = pd->subsampling_y + 1;
+    const int pred_w = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    const int pred_h = clamp(block_size_high[cur_bsize] >> half_shift_y, 4,
+                             block_size_high[BLOCK_64X64] >> half_shift_y);
+    dec_build_inter_predictors(bp_ctx->cm, xd, p, &nb_mbmi, 1, pred_w, pred_h,
+                               mi_x, mi_y);
+  }
+}
+
+// Builds predictions from the above neighbours into tmp_buf by running
+// dec_build_prediction_by_above_pred() over the overlappable neighbours.
+// Does nothing when xd->up_available is zero. The edge fields of xd that are
+// adjusted for the duration of the walk are reset before returning.
+static void dec_build_prediction_by_above_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->up_available) return;
+
+  // mb_to_bottom_edge must describe the OBMC prediction block, whose height
+  // is half of the current block's height, capped at 32.
+  const int blk_h = xd->n4_h * MI_SIZE;
+  const int obmc_h = AOMMIN(blk_h / 2, 32);
+  const int bottom_edge_delta = (blk_h - obmc_h) * 8;
+  xd->mb_to_bottom_edge += bottom_edge_delta;
+
+  struct build_prediction_ctxt walk_ctx = { cm,         mi_row,
+                                            mi_col,     tmp_buf,
+                                            tmp_width,  tmp_height,
+                                            tmp_stride, xd->mb_to_right_edge };
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+  const int nb_limit = max_neighbor_obmc[mi_size_wide_log2[cur_bsize]];
+  foreach_overlappable_nb_above(cm, xd, mi_col, nb_limit,
+                                dec_build_prediction_by_above_pred, &walk_ctx);
+
+  // Put the edge distances back.
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = walk_ctx.mb_to_far_edge;
+  xd->mb_to_bottom_edge -= bottom_edge_delta;
+}
+
+// Callback handed to foreach_overlappable_nb_left(): builds the prediction
+// for one left neighbour. fun_ctxt is a struct build_prediction_ctxt. The
+// neighbour's mode info is copied, and the copy is what the setup helper and
+// the predictor receive.
+static INLINE void dec_build_prediction_by_left_pred(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *const bp_ctx =
+      (struct build_prediction_ctxt *)fun_ctxt;
+  const int nb_mi_row = bp_ctx->mi_row + rel_mi_row;
+  MB_MODE_INFO nb_mbmi = *left_mbmi;
+
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+                                          &nb_mbmi, bp_ctx, num_planes);
+
+  const int mi_x = bp_ctx->mi_col << MI_SIZE_LOG2;
+  const int mi_y = nb_mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    if (av1_skip_u4x4_pred_in_obmc(cur_bsize, pd, 1)) continue;
+
+    // Height follows the neighbour; width is half the current block's
+    // width, clamped to [4, half of the 64x64 width], in plane units.
+    const int half_shift_x = pd->subsampling_x + 1;
+    const int pred_w = clamp(block_size_wide[cur_bsize] >> half_shift_x, 4,
+                             block_size_wide[BLOCK_64X64] >> half_shift_x);
+    const int pred_h = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+    dec_build_inter_predictors(bp_ctx->cm, xd, p, &nb_mbmi, 1, pred_w, pred_h,
+                               mi_x, mi_y);
+  }
+}
+
+// Builds predictions from the left neighbours into tmp_buf by running
+// dec_build_prediction_by_left_pred() over the overlappable neighbours.
+// Does nothing when xd->left_available is zero. The edge fields of xd that
+// are adjusted for the duration of the walk are reset before returning.
+static void dec_build_prediction_by_left_preds(
+    const AV1_COMMON *cm, MACROBLOCKD *xd, int mi_row, int mi_col,
+    uint8_t *tmp_buf[MAX_MB_PLANE], int tmp_width[MAX_MB_PLANE],
+    int tmp_height[MAX_MB_PLANE], int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->left_available) return;
+
+  // mb_to_right_edge must describe the OBMC prediction block, whose width
+  // is half of the current block's width, capped at 32.
+  const int blk_w = xd->n4_w * MI_SIZE;
+  const int obmc_w = AOMMIN(blk_w / 2, 32);
+  const int right_edge_delta = (blk_w - obmc_w) * 8;
+  xd->mb_to_right_edge += right_edge_delta;
+
+  struct build_prediction_ctxt walk_ctx = { cm,         mi_row,
+                                            mi_col,     tmp_buf,
+                                            tmp_width,  tmp_height,
+                                            tmp_stride, xd->mb_to_bottom_edge };
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+  const int nb_limit = max_neighbor_obmc[mi_size_high_log2[cur_bsize]];
+  foreach_overlappable_nb_left(cm, xd, mi_row, nb_limit,
+                               dec_build_prediction_by_left_pred, &walk_ctx);
+
+  // Put the edge distances back.
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_right_edge -= right_edge_delta;
+  xd->mb_to_bottom_edge = walk_ctx.mb_to_far_edge;
+}
+
+// Builds the OBMC prediction for the current block: predictions from the
+// above and left neighbours are written into the two xd->tmp_obmc_bufs
+// scratch buffers (one MAX_SB_SQUARE-sample region per plane), the
+// destination planes are set up again, and
+// av1_build_obmc_inter_prediction() is called with both sets of buffers.
+static void dec_build_obmc_inter_predictors_sb(const AV1_COMMON *cm,
+                                               MACROBLOCKD *xd, int mi_row,
+                                               int mi_col) {
+  const int num_planes = av1_num_planes(cm);
+  uint8_t *above_pred[MAX_MB_PLANE], *left_pred[MAX_MB_PLANE];
+  int above_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int above_width[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_width[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int above_height[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_height[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+  const int is_highbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    if (is_highbd) {
+      // Plane regions are sizeof(uint16_t) times larger in this mode, and
+      // the pointers go through CONVERT_TO_BYTEPTR.
+      const int sample_bytes = sizeof(uint16_t);
+      above_pred[p] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[0] +
+                                         MAX_SB_SQUARE * p * sample_bytes);
+      left_pred[p] = CONVERT_TO_BYTEPTR(xd->tmp_obmc_bufs[1] +
+                                        MAX_SB_SQUARE * p * sample_bytes);
+    } else {
+      above_pred[p] = xd->tmp_obmc_bufs[0] + MAX_SB_SQUARE * p;
+      left_pred[p] = xd->tmp_obmc_bufs[1] + MAX_SB_SQUARE * p;
+    }
+  }
+
+  dec_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, above_pred,
+                                      above_width, above_height, above_stride);
+  dec_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, left_pred,
+                                     left_width, left_height, left_stride);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+                       mi_row, mi_col, 0, num_planes);
+  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, above_pred,
+                                  above_stride, left_pred, left_stride);
+}
+
+// Calls cfl_store_block() with the current block's size and transform size,
+// but only when store_cfl_required() says so.
+static void cfl_store_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd) {
+  MB_MODE_INFO *const cur_mbmi = xd->mi[0];
+  if (!store_cfl_required(cm, xd)) return;
+  cfl_store_block(xd, cur_mbmi->sb_type, cur_mbmi->tx_size);
+}
+
+// Builds the inter prediction for the current block. For every reference
+// whose frame is at or above LAST_FRAME, the reference buffer is bound to
+// xd->block_refs[] and its pre planes are set up; references below
+// LAST_FRAME are only checked by assertions. The regular predictors are then
+// built, followed by the OBMC predictors when motion_mode is OBMC_CAUSAL.
+static void predict_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MB_MODE_INFO *const cur_mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  const int ref_count = 1 + has_second_ref(cur_mbmi);
+
+  for (int i = 0; i < ref_count; ++i) {
+    const MV_REFERENCE_FRAME ref_frame = cur_mbmi->ref_frame[i];
+    if (ref_frame >= LAST_FRAME) {
+      RefBuffer *const rb = &cm->frame_refs[ref_frame - LAST_FRAME];
+      xd->block_refs[i] = rb;
+      av1_setup_pre_planes(xd, i, rb->buf, mi_row, mi_col, &rb->sf,
+                           num_planes);
+      continue;
+    }
+    assert(is_intrabc_block(cur_mbmi));
+    assert(ref_frame == INTRA_FRAME);
+    assert(i == 0);
+  }
+
+  dec_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+  if (cur_mbmi->motion_mode == OBMC_CAUSAL) {
+    dec_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+  }
+#if CONFIG_MISMATCH_DEBUG
+  for (int p = 0; p < num_planes; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    int pixel_c, pixel_r;
+    mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0, pd->subsampling_x,
+                    pd->subsampling_y);
+    if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                            pd->subsampling_y)) {
+      mismatch_check_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+                               p, pixel_c, pixel_r, pd->width, pd->height,
+                               xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+    }
+  }
+#endif
+}
+
+// Advances xd->color_index_map_offset[plane] by the plane width times the
+// plane height reported by av1_get_block_dimensions() for the current block.
+// The reader argument is unused.
+static void set_color_index_map_offset(MACROBLOCKD *const xd, int plane,
+                                       aom_reader *r) {
+  (void)r;
+  Av1ColorMapParam dims;
+  const MB_MODE_INFO *const cur_mbmi = xd->mi[0];
+
+  av1_get_block_dimensions(cur_mbmi->sb_type, plane, xd, &dims.plane_width,
+                           &dims.plane_height, NULL, NULL);
+  const int map_size = dims.plane_width * dims.plane_height;
+  xd->color_index_map_offset[plane] += map_size;
+}
+
+/* Reads coefficients for, predicts and reconstructs one block at
+ * (mi_row, mi_col) of size bsize, using the visitor callbacks stored in td.
+ *
+ * Non-inter blocks: the block is walked in units of at most 64x64; within
+ * each unit, every plane for which this block is the chroma reference is
+ * walked in transform-block steps, calling the coefficient-read visitor,
+ * then the predict-and-reconstruct visitor, then set_cb_buffer_offsets().
+ *
+ * Inter blocks: the inter prediction visitor runs first. Unless mbmi->skip
+ * is set, the same unit/plane walk then calls decode_reconstruct_tx() in
+ * steps of the plane's maximum variable transform size. The CfL store
+ * visitor is called last.
+ *
+ * In both cases av1_visit_palette() is finally invoked with
+ * set_color_index_map_offset as its callback.
+ */ static void decode_token_recon_block(AV1Decoder *const pbi,
+ ThreadData *const td, int mi_row,
+ int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int num_planes = av1_num_planes(cm);
+
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ CFL_CTX *const cfl = &xd->cfl;
+ cfl->is_chroma_reference = is_chroma_reference(
+ mi_row, mi_col, bsize, cfl->subsampling_x, cfl->subsampling_y);
+
+ if (!is_inter_block(mbmi)) {  // non-inter path
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;  // largest unit walked at once
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ td->read_coeffs_tx_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
+ td->predict_and_recon_intra_block_visit(cm, xd, r, plane, blk_row,
+ blk_col, tx_size);
+ set_cb_buffer_offsets(xd, tx_size, plane);
+ }
+ }
+ }
+ }
+ }
+ } else {  // inter path
+ td->predict_inter_block_visit(cm, xd, mi_row, mi_col, bsize);
+ // Reconstruction
+ if (!mbmi->skip) {
+ int eobtotal = 0;
+
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ int row, col;
+
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ assert(max_unit_bsize ==
+ get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ int mu_blocks_wide =
+ block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high =
+ block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(
+ bsizec, pd->subsampling_x, pd->subsampling_y);
+
+ const TX_SIZE max_tx_size =
+ get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int bh_var_tx = tx_size_high_unit[max_tx_size];
+ const int bw_var_tx = tx_size_wide_unit[max_tx_size];
+ int block = 0;  // running index passed to decode_reconstruct_tx()
+ int step =
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ int blk_row, blk_col;
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high),
+ pd->subsampling_y);
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide),
+ pd->subsampling_x);
+
+ for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += bh_var_tx) {
+ for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += bw_var_tx) {
+ decode_reconstruct_tx(cm, td, r, mbmi, plane, plane_bsize,
+ blk_row, blk_col, block, max_tx_size,
+ &eobtotal);
+ block += step;
+ }
+ }
+ }
+ }
+ }
+ }
+ td->cfl_store_inter_block_visit(cm, xd);
+ }
+
+ av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+ set_color_index_map_offset);
+}
+
+#if LOOP_FILTER_BITMASK
+/* Forward declaration: read_tx_size_vartx() calls this function before its
+ * definition appears later in the file.
+ */ static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi);
+#endif
+
+/* Recursively reads the variable transform-size partitioning for the
+ * transform block at (blk_row, blk_col) and records the result in
+ * mbmi->inter_tx_size[] and mbmi->tx_size, updating the above/left transform
+ * partition contexts as it goes.
+ *
+ * - Positions outside the visible part of the block return immediately.
+ * - At depth MAX_VARTX_DEPTH no symbol is read and tx_size is recorded.
+ * - Otherwise a split flag is read. Without a split, tx_size is recorded.
+ *   With a split, the sub-size from sub_tx_size_map is recorded directly
+ *   when it is TX_4X4; for any other sub-size the function recurses over
+ *   each sub-block at depth + 1.
+ *
+ * When LOOP_FILTER_BITMASK is enabled the function takes the extra cm,
+ * mi_row and mi_col parameters and reports decided sizes through
+ * store_bitmask_vartx().
+ */ static void read_tx_size_vartx(MACROBLOCKD *xd, MB_MODE_INFO *mbmi,
+ TX_SIZE tx_size, int depth,
+#if LOOP_FILTER_BITMASK
+ AV1_COMMON *cm, int mi_row, int mi_col,
+#endif
+ int blk_row, int blk_col, aom_reader *r) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ int is_split = 0;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+ assert(tx_size > TX_4X4);
+
+ if (depth == MAX_VARTX_DEPTH) {  // depth limit reached: no split flag is read
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->sb_type, tx_size);
+ is_split = aom_read_symbol(r, ec_ctx->txfm_partition_cdf[ctx], 2, ACCT_STR);
+
+ if (is_split) {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ if (sub_txs == TX_4X4) {  // recorded here instead of recursing further
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = sub_txs;
+ }
+ }
+ mbmi->tx_size = sub_txs;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col, BLOCK_8X8,
+ TX_4X4, mbmi);
+#endif
+ return;
+ }
+#if LOOP_FILTER_BITMASK
+ if (depth + 1 == MAX_VARTX_DEPTH) {  // children will hit the depth limit
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], sub_txs, mbmi);
+ }
+#endif
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = blk_row + row;
+ int offsetc = blk_col + col;
+ read_tx_size_vartx(xd, mbmi, sub_txs, depth + 1,
+#if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+#endif
+ offsetr, offsetc, r);
+ }
+ }
+ } else {  // no split: tx_size applies to the whole transform block
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size;
+ }
+ }
+ mbmi->tx_size = tx_size;
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+#if LOOP_FILTER_BITMASK
+ store_bitmask_vartx(cm, mi_row + blk_row, mi_col + blk_col,
+ txsize_to_bsize[tx_size], tx_size, mbmi);
+#endif
+ }
+}
+
+// Reads the transform-size depth symbol for the current block and maps it to
+// a TX_SIZE with depth_to_tx_size(). The CDF is selected by the block's
+// transform-size category and by get_tx_size_context().
+static TX_SIZE read_selected_tx_size(MACROBLOCKD *xd, aom_reader *r) {
+  // TODO(debargha): Clean up the logic here. This function should only
+  // be called for intra.
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+  const int32_t cat = bsize_to_tx_size_cat(cur_bsize);
+  const int depth_limit = bsize_to_max_depth(cur_bsize);
+  const int cdf_ctx = get_tx_size_context(xd);
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+
+  const int depth = aom_read_symbol(r, fc->tx_size_cdf[cat][cdf_ctx],
+                                    depth_limit + 1, ACCT_STR);
+  assert(depth >= 0 && depth <= depth_limit);
+  return depth_to_tx_size(depth, cur_bsize);
+}
+
+// Returns the transform size of the current block.
+//  - Lossless segments always use TX_4X4.
+//  - Block sizes that do not signal a transform size use the largest
+//    rectangular transform for the block.
+//  - Otherwise the size is read from the bitstream when the frame uses
+//    TX_MODE_SELECT and the block is either not inter or allow_select_inter
+//    is set; in all other cases it is derived from the frame's tx_mode.
+static TX_SIZE read_tx_size(AV1_COMMON *cm, MACROBLOCKD *xd, int is_inter,
+                            int allow_select_inter, aom_reader *r) {
+  const TX_MODE frame_tx_mode = cm->tx_mode;
+  const BLOCK_SIZE cur_bsize = xd->mi[0]->sb_type;
+
+  if (xd->lossless[xd->mi[0]->segment_id]) return TX_4X4;
+
+  if (!block_signals_txsize(cur_bsize)) {
+    assert(IMPLIES(frame_tx_mode == ONLY_4X4, cur_bsize == BLOCK_4X4));
+    return max_txsize_rect_lookup[cur_bsize];
+  }
+
+  const int may_select = !is_inter || allow_select_inter;
+  if (may_select && frame_tx_mode == TX_MODE_SELECT) {
+    return read_selected_tx_size(xd, r);
+  }
+  return tx_size_from_tx_mode(cur_bsize, frame_tx_mode);
+}
+
+#if LOOP_FILTER_BITMASK
+/* Records, in the loop-filter mask covering (mi_row, mi_col), the edge bits
+ * of a region of size bsize that is coded with transform size tx_size.
+ *
+ * A mask id is chosen from tx_size (and, for some sizes, bsize): square
+ * sizes use the per-size mask_id_table_tx_* tables plus a fixed offset,
+ * 2:1 / 1:2 sizes up to TX_32X16 use 47 + 2 * (tx_size - TX_4X8) (+1 when
+ * bsize differs from the transform's own block size), TX_32X64 and TX_64X32
+ * use 59 and 60, and the remaining sizes use 61 + (tx_size - TX_4X16).
+ *
+ * The left/above masks for that id are then shifted to the position inside
+ * the 64x64 area and OR-ed into lfm->tx_size_ver / lfm->tx_size_hor for
+ * plane index 0 (indexed by the luma transform size) and plane index 1
+ * (indexed by the maximum uv transform size of mbmi->sb_type).
+ * NOTE(review): the u/v entries reuse the mask selected from the luma
+ * tx_size - confirm this is intended.
+ */ static void store_bitmask_vartx(AV1_COMMON *cm, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, TX_SIZE tx_size,
+ MB_MODE_INFO *mbmi) {
+ LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+ const TX_SIZE tx_size_y_vert = txsize_vert_map[tx_size];
+ const TX_SIZE tx_size_y_horz = txsize_horz_map[tx_size];
+ const TX_SIZE tx_size_uv_vert = txsize_vert_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const TX_SIZE tx_size_uv_horz = txsize_horz_map[av1_get_max_uv_txsize(
+ mbmi->sb_type, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y)];
+ const int is_square_transform_size = tx_size <= TX_64X64;
+ int mask_id = 0;
+ int offset = 0;
+ const int half_ratio_tx_size_max32 =
+ (tx_size > TX_64X64) & (tx_size <= TX_32X16);
+ if (is_square_transform_size) {
+ switch (tx_size) {
+ case TX_4X4: mask_id = mask_id_table_tx_4x4[bsize]; break;
+ case TX_8X8:
+ mask_id = mask_id_table_tx_8x8[bsize];
+ offset = 19;
+ break;
+ case TX_16X16:
+ mask_id = mask_id_table_tx_16x16[bsize];
+ offset = 33;
+ break;
+ case TX_32X32:
+ mask_id = mask_id_table_tx_32x32[bsize];
+ offset = 42;
+ break;
+ case TX_64X64: mask_id = 46; break;
+ default: assert(!is_square_transform_size); return;
+ }
+ mask_id += offset;
+ } else if (half_ratio_tx_size_max32) {
+ int tx_size_equal_block_size = bsize == txsize_to_bsize[tx_size];
+ mask_id = 47 + 2 * (tx_size - TX_4X8) + (tx_size_equal_block_size ? 0 : 1);
+ } else if (tx_size == TX_32X64) {
+ mask_id = 59;
+ } else if (tx_size == TX_64X32) {
+ mask_id = 60;
+ } else { // quarter ratio tx size
+ mask_id = 61 + (tx_size - TX_4X16);
+ }
+ int index = 0;
+ const int row = mi_row % MI_SIZE_64X64;
+ const int col = mi_col % MI_SIZE_64X64;
+ const int shift = get_index_shift(col, row, &index);
+ const int vert_shift = tx_size_y_vert <= TX_8X8 ? shift : col;
+ for (int i = 0; i + index < 4; ++i) {  // bits[] words from index up to 3
+ // y vertical.
+ lfm->tx_size_ver[0][tx_size_y_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // y horizontal.
+ lfm->tx_size_hor[0][tx_size_y_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ // u/v vertical.
+ lfm->tx_size_ver[1][tx_size_uv_horz].bits[i + index] |=
+ (left_mask_univariant_reordered[mask_id].bits[i] << vert_shift);
+ // u/v horizontal.
+ lfm->tx_size_hor[1][tx_size_uv_vert].bits[i + index] |=
+ (above_mask_univariant_reordered[mask_id].bits[i] << shift);
+ }
+}
+
+// Records the loop-filter transform-size bitmasks for a region of size bsize
+// at (mi_row, mi_col) whose transform size is the single size stored in
+// mbmi->tx_size.
+//
+// Parameters:
+//   cm             - frame-level state holding the loop-filter masks.
+//   mi_row, mi_col - position of the region in mode-info units.
+//   bsize          - size of the region being recorded.
+//   mbmi           - mode info of the block; supplies tx_size and sb_type.
+static void store_bitmask_univariant_tx(AV1_COMMON *cm, int mi_row, int mi_col,
+                                        BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  // Use a lookup table that provides one bitmask for a given block size and
+  // a univariant transform size. The lookup and the mask update are exactly
+  // those of store_bitmask_vartx() with tx_size taken from the block itself,
+  // so delegate instead of keeping a second copy of the table logic.
+  store_bitmask_vartx(cm, mi_row, mi_col, bsize, mbmi->tx_size, mbmi);
+}
+
+// Records the non-transform loop-filter information for a region of size
+// bsize at (mi_row, mi_col) in the loop-filter mask that covers it:
+//   - the bits of its top edge in is_horz_border,
+//   - the bits of its left edge in is_vert_border,
+//   - its skip bits, when the block is an inter block with skip set,
+//   - the per-position filter levels (y vertical, y horizontal, u, v).
+//
+// Parameters:
+//   cm             - frame-level state holding the masks and filter info.
+//   mi_row, mi_col - position of the region in mode-info units.
+//   bsize          - size of the region being recorded.
+//   mbmi           - mode info of the block the region belongs to.
+static void store_bitmask_other_info(AV1_COMMON *cm, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, MB_MODE_INFO *mbmi) {
+  int index;
+  LoopFilterMask *lfm = get_loop_filter_mask(cm, mi_row, mi_col);
+  const int row_start = mi_row % MI_SIZE_64X64;
+  const int col_start = mi_col % MI_SIZE_64X64;
+  const int shift = get_index_shift(col_start, row_start, &index);
+  const int width_mi = mi_size_wide[bsize];
+
+  // Set width_mi consecutive bits starting at bit `shift`. Building the run
+  // of ones first and shifting it afterwards avoids a shift by
+  // shift + width_mi, which would be undefined if that sum reached 64.
+  // NOTE(review): get_index_shift() is defined elsewhere; the sum is
+  // presumed to reach 64 for a region ending on the last bit of a mask
+  // word - confirm against its definition.
+  const uint64_t top_edge_mask = (((uint64_t)1 << width_mi) - 1) << shift;
+  lfm->is_horz_border.bits[index] |= top_edge_mask;
+
+  const int is_vert_border = mask_id_table_vert_border[bsize];
+  const int vert_shift = block_size_high[bsize] <= 8 ? shift : col_start;
+  for (int i = 0; i + index < 4; ++i) {
+    lfm->is_vert_border.bits[i + index] |=
+        (left_mask_univariant_reordered[is_vert_border].bits[i] << vert_shift);
+  }
+
+  const int is_skip = mbmi->skip && is_inter_block(mbmi);
+  if (is_skip) {
+    const int is_skip_mask = mask_id_table_tx_4x4[bsize];
+    for (int i = 0; i + index < 4; ++i) {
+      lfm->skip.bits[i + index] |=
+          (above_mask_univariant_reordered[is_skip_mask].bits[i] << shift);
+    }
+  }
+
+  const uint8_t level_vert_y = get_filter_level(cm, &cm->lf_info, 0, 0, mbmi);
+  const uint8_t level_horz_y = get_filter_level(cm, &cm->lf_info, 1, 0, mbmi);
+  const uint8_t level_u = get_filter_level(cm, &cm->lf_info, 0, 1, mbmi);
+  const uint8_t level_v = get_filter_level(cm, &cm->lf_info, 0, 2, mbmi);
+  // Fill one row of width_mi level entries per mode-info row of the region.
+  for (int r = mi_row; r < mi_row + mi_size_high[bsize]; r++) {
+    const int row = r % MI_SIZE_64X64;
+    memset(&lfm->lfl_y_ver[row][col_start], level_vert_y,
+           sizeof(uint8_t) * width_mi);
+    memset(&lfm->lfl_y_hor[row][col_start], level_horz_y,
+           sizeof(uint8_t) * width_mi);
+    memset(&lfm->lfl_u[row][col_start], level_u, sizeof(uint8_t) * width_mi);
+    memset(&lfm->lfl_v[row][col_start], level_v, sizeof(uint8_t) * width_mi);
+  }
+}
+#endif
+ /* Parses and reconstructs one block located at (mi_row, mi_col).
+  *
+  * Steps, in bitstream order: decode_mbmi_block() and av1_visit_palette() are
+  * called first, then the transform size(s) are read. When built with
+  * LOOP_FILTER_BITMASK the store_bitmask_*() helpers are invoked once per
+  * region of at most 64x64. If cm->delta_q_present_flag is set, the per-segment
+  * dequantizers in xd->plane[] are rebuilt from xd->current_qindex. Finally
+  * decode_token_recon_block() is called and any error reported by the reader
+  * `r` is merged into xd->corrupted.
+  */
+ static void parse_decode_block(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+ MACROBLOCKD *const xd = &td->xd;
+ decode_mbmi_block(pbi, xd, mi_row, mi_col, r, partition, bsize);
+
+ av1_visit_palette(pbi, xd, mi_row, mi_col, r, bsize,
+ av1_decode_palette_tokens);
+
+ AV1_COMMON *cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int inter_block_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);  // inter or intra-block-copy
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !mbmi->skip && inter_block_tx && !xd->lossless[mbmi->segment_id]) {
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[bsize];
+ const int bh = tx_size_high_unit[max_tx_size];  // vertical step per max-size transform
+ const int bw = tx_size_wide_unit[max_tx_size];  // horizontal step per max-size transform
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+
+ for (int idy = 0; idy < height; idy += bh)  // walk the block in max_tx_size steps
+ for (int idx = 0; idx < width; idx += bw)
+ read_tx_size_vartx(xd, mbmi, max_tx_size, 0,
+ #if LOOP_FILTER_BITMASK
+ cm, mi_row, mi_col,
+ #endif
+ idy, idx, r);
+ } else {
+ mbmi->tx_size = read_tx_size(cm, xd, inter_block_tx, !mbmi->skip, r);
+ if (inter_block_tx)  // every inter_tx_size entry gets the single block-level size
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
+ mbmi->skip && is_inter_block(mbmi), xd);
+ #if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {  // larger than 64x64: one call per 64x64 tile
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_univariant_tx(cm, mi_row + row, mi_col + col,
+ BLOCK_64X64, mbmi);
+ }
+ }
+ }
+ #endif
+ }
+ #if LOOP_FILTER_BITMASK
+ const int w = mi_size_wide[bsize];
+ const int h = mi_size_high[bsize];
+ if (w <= mi_size_wide[BLOCK_64X64] && h <= mi_size_high[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row, mi_col, bsize, mbmi);
+ } else {
+ for (int row = 0; row < h; row += mi_size_high[BLOCK_64X64]) {  // same 64x64 tiling as above
+ for (int col = 0; col < w; col += mi_size_wide[BLOCK_64X64]) {
+ store_bitmask_other_info(cm, mi_row + row, mi_col + col, BLOCK_64X64,
+ mbmi);
+ }
+ }
+ }
+ #endif
+
+ if (cm->delta_q_present_flag) {
+ for (int i = 0; i < MAX_SEGMENTS; i++) {
+ const int current_qindex =
+ av1_get_qindex(&cm->seg, i, xd->current_qindex);
+ for (int j = 0; j < num_planes; ++j) {
+ const int dc_delta_q =
+ j == 0 ? cm->y_dc_delta_q
+ : (j == 1 ? cm->u_dc_delta_q : cm->v_dc_delta_q);
+ const int ac_delta_q =
+ j == 0 ? 0 : (j == 1 ? cm->u_ac_delta_q : cm->v_ac_delta_q);  // plane 0 has no AC delta
+ xd->plane[j].seg_dequant_QTX[i][0] = av1_dc_quant_QTX(  // index 0: DC
+ current_qindex, dc_delta_q, cm->seq_params.bit_depth);
+ xd->plane[j].seg_dequant_QTX[i][1] = av1_ac_quant_QTX(  // index 1: AC
+ current_qindex, ac_delta_q, cm->seq_params.bit_depth);
+ }
+ }
+ }
+ if (mbmi->skip) av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+
+ decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+
+ int reader_corrupted_flag = aom_reader_has_error(r);
+ aom_merge_corrupted_flag(&xd->corrupted, reader_corrupted_flag);
+ }
+
+ // Prepares xd for prediction/reconstruction of the block at (mi_row, mi_col):
+ // points xd->mi at the block's entry in the visible mode-info grid, records
+ // the position for CfL, and sets up per-plane sizes, edge distances and
+ // destination buffers.
+ static void set_offsets_for_pred_and_recon(AV1Decoder *const pbi,
+                                            ThreadData *const td, int mi_row,
+                                            int mi_col, BLOCK_SIZE bsize) {
+   AV1_COMMON *const cm = &pbi->common;
+   MACROBLOCKD *const xd = &td->xd;
+   const int num_planes = av1_num_planes(cm);
+   const int mi_width = mi_size_wide[bsize];
+   const int mi_height = mi_size_high[bsize];
+
+   // Locate this block inside the frame-wide mode-info grid.
+   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+   xd->cfl.mi_row = mi_row;
+   xd->cfl.mi_col = mi_col;
+
+   set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+   // Distance of Mb to the various image edges. These are specified to 8th pel
+   // as they are always compared to values that are in 1/8th pel units
+   set_mi_row_col(xd, &xd->tile, mi_row, mi_height, mi_col, mi_width,
+                  cm->mi_rows, cm->mi_cols);
+
+   av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+                        mi_col, 0, num_planes);
+ }
+
+ // Decode-only block visitor: no symbols describing the mode are read here.
+ // It sets up the offsets for the block and then runs token reconstruction.
+ // `partition` exists only so the signature matches block_visitor_fn_t.
+ static void decode_block(AV1Decoder *const pbi, ThreadData *const td,
+                          int mi_row, int mi_col, aom_reader *r,
+                          PARTITION_TYPE partition, BLOCK_SIZE bsize) {
+   (void)partition;  // intentionally unused
+
+   set_offsets_for_pred_and_recon(pbi, td, mi_row, mi_col, bsize);
+
+   decode_token_recon_block(pbi, td, mi_row, mi_col, r, bsize);
+ }
+
+ // Reads the partition type of a block.
+ //   has_rows / has_cols: non-zero when the caller found the second half of the
+ //   block (vertically / horizontally) to start inside the frame.
+ // With both flags clear nothing is read and PARTITION_SPLIT is returned. With
+ // both set, the full partition symbol is read. With exactly one set, a binary
+ // decision is read between PARTITION_SPLIT and the only other allowed type.
+ static PARTITION_TYPE read_partition(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                      aom_reader *r, int has_rows, int has_cols,
+                                      BLOCK_SIZE bsize) {
+   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+   if (!(has_rows || has_cols)) return PARTITION_SPLIT;
+
+   assert(ctx >= 0);
+   aom_cdf_prob *partition_cdf = ec_ctx->partition_cdf[ctx];
+
+   if (has_rows && has_cols) {
+     return (PARTITION_TYPE)aom_read_symbol(
+         r, partition_cdf, partition_cdf_length(bsize), ACCT_STR);
+   }
+
+   // Exactly one of has_rows / has_cols is set from here on.
+   aom_cdf_prob cdf[2];
+   PARTITION_TYPE non_split;
+   if (has_cols) {
+     // Bottom half is outside the frame: choose between SPLIT and HORZ.
+     assert(bsize > BLOCK_8X8);
+     partition_gather_vert_alike(cdf, partition_cdf, bsize);
+     non_split = PARTITION_HORZ;
+   } else {
+     // Right half is outside the frame: choose between SPLIT and VERT.
+     assert(has_rows && !has_cols);
+     assert(bsize > BLOCK_8X8);
+     partition_gather_horz_alike(cdf, partition_cdf, bsize);
+     non_split = PARTITION_VERT;
+   }
+   assert(cdf[1] == AOM_ICDF(CDF_PROB_TOP));
+   return aom_read_cdf(r, cdf, 2, ACCT_STR) ? PARTITION_SPLIT : non_split;
+ }
+
+ /* TODO(slavarnway): eliminate bsize and subsize in future commits
+  *
+  * Recursively walks the partition tree rooted at the bsize block located at
+  * (mi_row, mi_col). Bit 0 of parse_decode_flag selects parsing: loop
+  * restoration coefficients and the partition type are then read from `r`;
+  * otherwise the partition type is obtained from get_partition(). Each leaf
+  * block is handed to the visitor chosen from block_visit[] by
+  * parse_decode_flag. Raises AOM_CODEC_CORRUPT_FRAME through
+  * aom_internal_error() when the resulting sub-block size is invalid for the
+  * chroma subsampling of plane 1.
+  */
+ static void decode_partition(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ BLOCK_SIZE bsize, int parse_decode_flag) {
+ AV1_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &td->xd;
+ const int bw = mi_size_wide[bsize];
+ const int hbs = bw >> 1;  // half of the block width
+ PARTITION_TYPE partition;
+ BLOCK_SIZE subsize;
+ const int quarter_step = bw / 4;  // stride between the strips of the *_4 partitions
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;  // bottom half starts inside the frame
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;  // right half starts inside the frame
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;  // block lies entirely outside the frame
+
+ // parse_decode_flag takes the following values :
+ // 01 - do parse only
+ // 10 - do decode only
+ // 11 - do parse and decode
+ static const block_visitor_fn_t block_visit[4] = {
+ NULL, parse_decode_block, decode_block, parse_decode_block
+ };
+
+ if (parse_decode_flag & 1) {
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;  // row-major restoration unit index
+ loop_restoration_read_sb_coeffs(cm, xd, r, plane, runit_idx);
+ }
+ }
+ }
+ }
+
+ partition = (bsize < BLOCK_8X8) ? PARTITION_NONE
+ : read_partition(xd, mi_row, mi_col, r,
+ has_rows, has_cols, bsize);
+ } else {
+ partition = get_partition(cm, mi_row, mi_col, bsize);  // decode-only: no symbol is read
+ }
+ subsize = get_partition_subsize(bsize, partition);
+
+ // Check the bitstream is conformant: if there is subsampling on the
+ // chroma planes, subsize must subsample to a valid block size.
+ const struct macroblockd_plane *const pd_u = &xd->plane[1];
+ if (get_plane_block_size(subsize, pd_u->subsampling_x, pd_u->subsampling_y) ==
+ BLOCK_INVALID) {
+ aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+ "Block size %dx%d invalid with this subsampling mode",
+ block_size_wide[subsize], block_size_high[subsize]);
+ }
+
+ #define DEC_BLOCK_STX_ARG /* expands to nothing, so DEC_BLOCK_STX_ARG(x) is just (x) */
+ #define DEC_BLOCK_EPT_ARG partition, /* prepends the partition argument for the visitor */
+ #define DEC_BLOCK(db_r, db_c, db_subsize) \
+ block_visit[parse_decode_flag](pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, \
+ DEC_BLOCK_EPT_ARG(db_subsize))
+ #define DEC_PARTITION(db_r, db_c, db_subsize) \
+ decode_partition(pbi, td, DEC_BLOCK_STX_ARG(db_r), (db_c), r, (db_subsize), \
+ parse_decode_flag)
+
+ switch (partition) {
+ case PARTITION_NONE: DEC_BLOCK(mi_row, mi_col, subsize); break;
+ case PARTITION_HORZ:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_rows) DEC_BLOCK(mi_row + hbs, mi_col, subsize);  // skipped when the bottom half is off-frame
+ break;
+ case PARTITION_VERT:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ if (has_cols) DEC_BLOCK(mi_row, mi_col + hbs, subsize);  // skipped when the right half is off-frame
+ break;
+ case PARTITION_SPLIT:
+ DEC_PARTITION(mi_row, mi_col, subsize);
+ DEC_PARTITION(mi_row, mi_col + hbs, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col, subsize);
+ DEC_PARTITION(mi_row + hbs, mi_col + hbs, subsize);
+ break;
+ case PARTITION_HORZ_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, subsize);
+ break;
+ case PARTITION_HORZ_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_VERT_A:
+ DEC_BLOCK(mi_row, mi_col, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col, bsize2);
+ DEC_BLOCK(mi_row, mi_col + hbs, subsize);
+ break;
+ case PARTITION_VERT_B:
+ DEC_BLOCK(mi_row, mi_col, subsize);
+ DEC_BLOCK(mi_row, mi_col + hbs, bsize2);
+ DEC_BLOCK(mi_row + hbs, mi_col + hbs, bsize2);
+ break;
+ case PARTITION_HORZ_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;  // remaining strips start below the frame
+ DEC_BLOCK(this_mi_row, mi_col, subsize);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (int i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;  // remaining strips start right of the frame
+ DEC_BLOCK(mi_row, this_mi_col, subsize);
+ }
+ break;
+ default: assert(0 && "Invalid partition type");
+ }
+
+ #undef DEC_PARTITION
+ #undef DEC_BLOCK
+ #undef DEC_BLOCK_EPT_ARG
+ #undef DEC_BLOCK_STX_ARG
+
+ if (parse_decode_flag & 1)  // the partition context is only updated while parsing
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+ }
+
+ // Initialises the symbol reader `r` over read_size bytes starting at `data`.
+ // Reports AOM_CODEC_CORRUPT_FRAME through error_info when that range does not
+ // fit before data_end, and AOM_CODEC_MEM_ERROR when aom_reader_init() fails.
+ // allow_update_cdf is stored on the reader.
+ static void setup_bool_decoder(const uint8_t *data, const uint8_t *data_end,
+                                const size_t read_size,
+                                struct aom_internal_error_info *error_info,
+                                aom_reader *r, uint8_t allow_update_cdf) {
+   // Validate the calculated partition length. If the buffer
+   // described by the partition can't be fully read, then restrict
+   // it to the portion that can be (for EC mode) or throw an error.
+   const int range_ok = read_is_valid(data, read_size, data_end);
+   if (!range_ok) {
+     aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                        "Truncated packet or corrupt tile length");
+   }
+
+   const int init_failed = aom_reader_init(r, data, read_size);
+   if (init_failed) {
+     aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate bool decoder %d", 1);
+   }
+
+   r->allow_update_cdf = allow_update_cdf;
+ }
+
+ // Reads the segmentation parameters of the frame header from rb into cm->seg
+ // and mirrors the resulting feature set into cm->cur_frame->seg. When
+ // segmentation is disabled, the current frame's segment map (if any) and the
+ // whole segmentation state are zeroed instead.
+ static void setup_segmentation(AV1_COMMON *const cm,
+                                struct aom_read_bit_buffer *rb) {
+   struct segmentation *const seg = &cm->seg;
+
+   seg->update_map = 0;
+   seg->update_data = 0;
+   seg->temporal_update = 0;
+
+   seg->enabled = aom_rb_read_bit(rb);
+   if (!seg->enabled) {
+     if (cm->cur_frame->seg_map) {
+       memset(cm->cur_frame->seg_map, 0, (cm->mi_rows * cm->mi_cols));
+     }
+     memset(seg, 0, sizeof(*seg));
+     segfeatures_copy(&cm->cur_frame->seg, seg);
+     return;
+   }
+
+   // The previous frame's map is only usable when it has the same dimensions.
+   const int prev_map_usable = seg->enabled && cm->prev_frame &&
+                               (cm->mi_rows == cm->prev_frame->mi_rows) &&
+                               (cm->mi_cols == cm->prev_frame->mi_cols);
+   cm->last_frame_seg_map = prev_map_usable ? cm->prev_frame->seg_map : NULL;
+
+   // Read update flags
+   if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+     seg->update_map = aom_rb_read_bit(rb);
+     seg->temporal_update = seg->update_map ? aom_rb_read_bit(rb) : 0;
+     seg->update_data = aom_rb_read_bit(rb);
+   } else {
+     // These frames can't use previous frames, so must signal map + features
+     seg->update_map = 1;
+     seg->temporal_update = 0;
+     seg->update_data = 1;
+   }
+
+   // Segmentation data update
+   if (seg->update_data) {
+     av1_clearall_segfeatures(seg);
+
+     for (int seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
+       for (int feature = 0; feature < SEG_LVL_MAX; feature++) {
+         if (!aom_rb_read_bit(rb)) {
+           // Feature not enabled for this segment: its data is stored as 0.
+           av1_set_segdata(seg, seg_id, feature, 0);
+           continue;
+         }
+         av1_enable_segfeature(seg, seg_id, feature);
+
+         const int data_max = av1_seg_feature_data_max(feature);
+         const int data_min = -data_max;
+         const int ubits = get_unsigned_bits(data_max);
+
+         int value;
+         if (av1_is_segfeature_signed(feature)) {
+           value = aom_rb_read_inv_signed_literal(rb, ubits);
+         } else {
+           value = aom_rb_read_literal(rb, ubits);
+         }
+         value = clamp(value, data_min, data_max);
+         av1_set_segdata(seg, seg_id, feature, value);
+       }
+     }
+     calculate_segdata(seg);
+   } else if (cm->prev_frame) {
+     // No new data signalled: inherit the features of the previous frame.
+     segfeatures_copy(seg, &cm->prev_frame->seg);
+   }
+   segfeatures_copy(&cm->cur_frame->seg, seg);
+ }
+
+ // Reads the frame-level loop-restoration type of every plane and the
+ // restoration unit sizes from rb into cm->rst_info[]. Nothing is read when
+ // cm->allow_intrabc is set.
+ static void decode_restoration_mode(AV1_COMMON *cm,
+                                     struct aom_read_bit_buffer *rb) {
+   assert(!cm->all_lossless);
+   const int num_planes = av1_num_planes(cm);
+   if (cm->allow_intrabc) return;
+
+   int all_none = 1;
+   int chroma_none = 1;
+   for (int plane = 0; plane < num_planes; ++plane) {
+     RestorationInfo *rsi = &cm->rst_info[plane];
+     // The type is always coded with two bits.
+     const int bit0 = aom_rb_read_bit(rb);
+     const int bit1 = aom_rb_read_bit(rb);
+     if (bit0) {
+       rsi->frame_restoration_type = bit1 ? RESTORE_SGRPROJ : RESTORE_WIENER;
+     } else {
+       rsi->frame_restoration_type = bit1 ? RESTORE_SWITCHABLE : RESTORE_NONE;
+     }
+     if (rsi->frame_restoration_type != RESTORE_NONE) {
+       all_none = 0;
+       if (plane > 0) chroma_none = 0;
+     }
+   }
+
+   if (all_none) {
+     for (int plane = 0; plane < num_planes; ++plane) {
+       cm->rst_info[plane].restoration_unit_size = RESTORATION_UNITSIZE_MAX;
+     }
+   } else {
+     assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+            cm->seq_params.sb_size == BLOCK_128X128);
+     const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+
+     for (int plane = 0; plane < num_planes; ++plane) {
+       cm->rst_info[plane].restoration_unit_size = sb_size;
+     }
+
+     // The luma unit size starts at the superblock size and may be doubled by
+     // up to two further bits.
+     int luma_unit_size = sb_size;
+     if (sb_size == 64) {
+       luma_unit_size <<= aom_rb_read_bit(rb);
+     }
+     if (luma_unit_size > 64) {
+       luma_unit_size <<= aom_rb_read_bit(rb);
+     }
+     cm->rst_info[0].restoration_unit_size = luma_unit_size;
+   }
+
+   if (num_planes > 1) {
+     const int s =
+         AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+     // A shift bit is only coded when chroma is subsampled in both directions
+     // and at least one chroma plane uses restoration.
+     int uv_shift = 0;
+     if (s && !chroma_none) {
+       uv_shift = aom_rb_read_bit(rb) * s;
+     }
+     cm->rst_info[1].restoration_unit_size =
+         cm->rst_info[0].restoration_unit_size >> uv_shift;
+     cm->rst_info[2].restoration_unit_size =
+         cm->rst_info[1].restoration_unit_size;
+   }
+ }
+
+ // Reads the coefficients of a symmetric Wiener filter from rb.
+ // Taps 0..2 of the vertical and then the horizontal kernel are each coded
+ // relative to the matching tap in ref_wiener_info and mirrored onto the other
+ // side of the kernel; the centre tap is derived from them. Tap 0 is only coded
+ // when wiener_win == WIENER_WIN and is zero otherwise. On return
+ // ref_wiener_info holds a copy of the filter just read.
+ static void read_wiener_filter(int wiener_win, WienerInfo *wiener_info,
+                                WienerInfo *ref_wiener_info, aom_reader *rb) {
+   memset(wiener_info->vfilter, 0, sizeof(wiener_info->vfilter));
+   memset(wiener_info->hfilter, 0, sizeof(wiener_info->hfilter));
+
+   // ---- vertical kernel ----
+   int v_tap0 = 0;
+   if (wiener_win == WIENER_WIN) {
+     v_tap0 = aom_read_primitive_refsubexpfin(
+                  rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+                  WIENER_FILT_TAP0_SUBEXP_K,
+                  ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+                  ACCT_STR) +
+              WIENER_FILT_TAP0_MINV;
+   }
+   wiener_info->vfilter[WIENER_WIN - 1] = v_tap0;
+   wiener_info->vfilter[0] = wiener_info->vfilter[WIENER_WIN - 1];
+
+   const int v_tap1 =
+       aom_read_primitive_refsubexpfin(
+           rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+           WIENER_FILT_TAP1_SUBEXP_K,
+           ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+       WIENER_FILT_TAP1_MINV;
+   wiener_info->vfilter[WIENER_WIN - 2] = v_tap1;
+   wiener_info->vfilter[1] = wiener_info->vfilter[WIENER_WIN - 2];
+
+   const int v_tap2 =
+       aom_read_primitive_refsubexpfin(
+           rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+           WIENER_FILT_TAP2_SUBEXP_K,
+           ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+       WIENER_FILT_TAP2_MINV;
+   wiener_info->vfilter[WIENER_WIN - 3] = v_tap2;
+   wiener_info->vfilter[2] = wiener_info->vfilter[WIENER_WIN - 3];
+
+   // The central element has an implicit +WIENER_FILT_STEP
+   wiener_info->vfilter[WIENER_HALFWIN] =
+       -2 * (wiener_info->vfilter[0] + wiener_info->vfilter[1] +
+             wiener_info->vfilter[2]);
+
+   // ---- horizontal kernel ----
+   int h_tap0 = 0;
+   if (wiener_win == WIENER_WIN) {
+     h_tap0 = aom_read_primitive_refsubexpfin(
+                  rb, WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+                  WIENER_FILT_TAP0_SUBEXP_K,
+                  ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+                  ACCT_STR) +
+              WIENER_FILT_TAP0_MINV;
+   }
+   wiener_info->hfilter[WIENER_WIN - 1] = h_tap0;
+   wiener_info->hfilter[0] = wiener_info->hfilter[WIENER_WIN - 1];
+
+   const int h_tap1 =
+       aom_read_primitive_refsubexpfin(
+           rb, WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+           WIENER_FILT_TAP1_SUBEXP_K,
+           ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV, ACCT_STR) +
+       WIENER_FILT_TAP1_MINV;
+   wiener_info->hfilter[WIENER_WIN - 2] = h_tap1;
+   wiener_info->hfilter[1] = wiener_info->hfilter[WIENER_WIN - 2];
+
+   const int h_tap2 =
+       aom_read_primitive_refsubexpfin(
+           rb, WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+           WIENER_FILT_TAP2_SUBEXP_K,
+           ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV, ACCT_STR) +
+       WIENER_FILT_TAP2_MINV;
+   wiener_info->hfilter[WIENER_WIN - 3] = h_tap2;
+   wiener_info->hfilter[2] = wiener_info->hfilter[WIENER_WIN - 3];
+
+   // The central element has an implicit +WIENER_FILT_STEP
+   wiener_info->hfilter[WIENER_HALFWIN] =
+       -2 * (wiener_info->hfilter[0] + wiener_info->hfilter[1] +
+             wiener_info->hfilter[2]);
+
+   // The filter just read becomes the reference for the next one.
+   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+ }
+
+ // Reads the self-guided restoration parameters from rb: the parameter-set
+ // index `ep` followed by the projection coefficients xqd[0] and xqd[1], each
+ // coded relative to ref_sgrproj_info. A coefficient is not coded when the
+ // selected set has radius 0 for the corresponding pass. On return
+ // ref_sgrproj_info holds a copy of the values just read.
+ static void read_sgrproj_filter(SgrprojInfo *sgrproj_info,
+                                 SgrprojInfo *ref_sgrproj_info, aom_reader *rb) {
+   sgrproj_info->ep = aom_read_literal(rb, SGRPROJ_PARAMS_BITS, ACCT_STR);
+   const sgr_params_type *params = &sgr_params[sgrproj_info->ep];
+   const int first_pass_off = params->r[0] == 0;
+   const int second_pass_off = params->r[1] == 0;
+
+   // xqd[0]: implied to be zero when the first radius is zero.
+   if (first_pass_off) {
+     sgrproj_info->xqd[0] = 0;
+   } else {
+     sgrproj_info->xqd[0] =
+         aom_read_primitive_refsubexpfin(
+             rb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+             ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0, ACCT_STR) +
+         SGRPROJ_PRJ_MIN0;
+   }
+
+   // xqd[1]: derived from xqd[0] when only the second radius is zero,
+   // otherwise read from the bitstream.
+   if (!first_pass_off && second_pass_off) {
+     sgrproj_info->xqd[1] = clamp((1 << SGRPROJ_PRJ_BITS) - sgrproj_info->xqd[0],
+                                  SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+   } else {
+     sgrproj_info->xqd[1] =
+         aom_read_primitive_refsubexpfin(
+             rb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+             ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1, ACCT_STR) +
+         SGRPROJ_PRJ_MIN1;
+   }
+
+   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+ }
+
+ // Reads the restoration type and filter coefficients of restoration unit
+ // runit_idx in `plane` from r. The filters are coded relative to the per-plane
+ // references kept in xd->wiener_info / xd->sgrproj_info. Nothing is read when
+ // the plane's frame-level restoration type is RESTORE_NONE.
+ static void loop_restoration_read_sb_coeffs(const AV1_COMMON *const cm,
+                                             MACROBLOCKD *xd,
+                                             aom_reader *const r, int plane,
+                                             int runit_idx) {
+   const RestorationInfo *rsi = &cm->rst_info[plane];
+   RestorationUnitInfo *rui = &rsi->unit_info[runit_idx];
+   if (rsi->frame_restoration_type == RESTORE_NONE) return;
+
+   assert(!cm->all_lossless);
+
+   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+   WienerInfo *ref_wiener = xd->wiener_info + plane;
+   SgrprojInfo *ref_sgrproj = xd->sgrproj_info + plane;
+
+   switch (rsi->frame_restoration_type) {
+     case RESTORE_SWITCHABLE:
+       // The unit chooses its own type.
+       rui->restoration_type =
+           aom_read_symbol(r, xd->tile_ctx->switchable_restore_cdf,
+                           RESTORE_SWITCHABLE_TYPES, ACCT_STR);
+       if (rui->restoration_type == RESTORE_WIENER) {
+         read_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener, r);
+       } else if (rui->restoration_type == RESTORE_SGRPROJ) {
+         read_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj, r);
+       } else {
+         assert(rui->restoration_type == RESTORE_NONE);
+       }
+       break;
+     case RESTORE_WIENER:
+       // One flag selects between Wiener and no restoration.
+       if (aom_read_symbol(r, xd->tile_ctx->wiener_restore_cdf, 2, ACCT_STR)) {
+         rui->restoration_type = RESTORE_WIENER;
+         read_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener, r);
+       } else {
+         rui->restoration_type = RESTORE_NONE;
+       }
+       break;
+     case RESTORE_SGRPROJ:
+       // One flag selects between self-guided and no restoration.
+       if (aom_read_symbol(r, xd->tile_ctx->sgrproj_restore_cdf, 2, ACCT_STR)) {
+         rui->restoration_type = RESTORE_SGRPROJ;
+         read_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj, r);
+       } else {
+         rui->restoration_type = RESTORE_NONE;
+       }
+       break;
+     default: break;
+   }
+ }
+
+ // Reads the loop-filter parameters of the frame header from rb into cm->lf and
+ // stores the resulting ref/mode deltas in the current frame buffer. For
+ // intra-block-copy or coded-lossless frames nothing is read and the default
+ // deltas are written to the frame buffer instead.
+ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+   const int num_planes = av1_num_planes(cm);
+   struct loopfilter *lf = &cm->lf;
+
+   if (cm->allow_intrabc || cm->coded_lossless) {
+     // write default deltas to frame buffer
+     av1_set_default_ref_deltas(cm->cur_frame->ref_deltas);
+     av1_set_default_mode_deltas(cm->cur_frame->mode_deltas);
+     return;
+   }
+   assert(!cm->coded_lossless);
+
+   // Start from the previous frame's deltas when there is one.
+   if (!cm->prev_frame) {
+     av1_set_default_ref_deltas(lf->ref_deltas);
+     av1_set_default_mode_deltas(lf->mode_deltas);
+   } else {
+     memcpy(lf->ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+     memcpy(lf->mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+   }
+
+   lf->filter_level[0] = aom_rb_read_literal(rb, 6);
+   lf->filter_level[1] = aom_rb_read_literal(rb, 6);
+   // Chroma levels are only coded when at least one luma level is non-zero.
+   if (num_planes > 1 && (lf->filter_level[0] || lf->filter_level[1])) {
+     lf->filter_level_u = aom_rb_read_literal(rb, 6);
+     lf->filter_level_v = aom_rb_read_literal(rb, 6);
+   }
+   lf->sharpness_level = aom_rb_read_literal(rb, 3);
+
+   // Read in loop filter deltas applied at the MB level based on mode or ref
+   // frame.
+   lf->mode_ref_delta_update = 0;
+   lf->mode_ref_delta_enabled = aom_rb_read_bit(rb);
+   if (lf->mode_ref_delta_enabled) {
+     lf->mode_ref_delta_update = aom_rb_read_bit(rb);
+     if (lf->mode_ref_delta_update) {
+       // Each delta is preceded by a flag telling whether it is updated.
+       for (int ref = 0; ref < REF_FRAMES; ref++) {
+         if (!aom_rb_read_bit(rb)) continue;
+         lf->ref_deltas[ref] = aom_rb_read_inv_signed_literal(rb, 6);
+       }
+       for (int mode = 0; mode < MAX_MODE_LF_DELTAS; mode++) {
+         if (!aom_rb_read_bit(rb)) continue;
+         lf->mode_deltas[mode] = aom_rb_read_inv_signed_literal(rb, 6);
+       }
+     }
+   }
+
+   // write deltas to frame buffer
+   memcpy(cm->cur_frame->ref_deltas, lf->ref_deltas, REF_FRAMES);
+   memcpy(cm->cur_frame->mode_deltas, lf->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+
+ // Reads the CDEF parameters of the frame header from rb: the damping, the
+ // number of strength bits, and one luma (plus, for multi-plane streams, one
+ // chroma) strength per entry. Nothing is read when cm->allow_intrabc is set.
+ static void setup_cdef(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+   const int num_planes = av1_num_planes(cm);
+   if (cm->allow_intrabc) return;
+
+   // A single coded value drives both the primary and secondary damping.
+   const int damping = aom_rb_read_literal(rb, 2) + 3;
+   cm->cdef_sec_damping = damping;
+   cm->cdef_pri_damping = damping;
+
+   cm->cdef_bits = aom_rb_read_literal(rb, 2);
+   cm->nb_cdef_strengths = 1 << cm->cdef_bits;
+
+   for (int idx = 0; idx < cm->nb_cdef_strengths; idx++) {
+     cm->cdef_strengths[idx] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+     if (num_planes > 1) {
+       cm->cdef_uv_strengths[idx] = aom_rb_read_literal(rb, CDEF_STRENGTH_BITS);
+     } else {
+       cm->cdef_uv_strengths[idx] = 0;
+     }
+   }
+ }
+
+ // Reads an optional quantizer delta: a presence bit followed, when set, by a
+ // signed literal read with a width argument of 6. Returns 0 when absent.
+ static INLINE int read_delta_q(struct aom_read_bit_buffer *rb) {
+   if (!aom_rb_read_bit(rb)) return 0;
+   return aom_rb_read_inv_signed_literal(rb, 6);
+ }
+
+ // Reads the quantization parameters of the frame header from rb: the base
+ // q index, the DC/AC deltas of each plane and the quantizer-matrix levels.
+ // Values that are not coded are set to 0, or copied from U to V.
+ static void setup_quantization(AV1_COMMON *const cm,
+                                struct aom_read_bit_buffer *rb) {
+   const SequenceHeader *const seq_params = &cm->seq_params;
+   const int num_planes = av1_num_planes(cm);
+
+   cm->base_qindex = aom_rb_read_literal(rb, QINDEX_BITS);
+   cm->y_dc_delta_q = read_delta_q(rb);
+
+   if (num_planes > 1) {
+     // V gets its own deltas only if the sequence allows it and the frame
+     // asks for it.
+     const int diff_uv_delta =
+         seq_params->separate_uv_delta_q ? aom_rb_read_bit(rb) : 0;
+     cm->u_dc_delta_q = read_delta_q(rb);
+     cm->u_ac_delta_q = read_delta_q(rb);
+     if (!diff_uv_delta) {
+       cm->v_dc_delta_q = cm->u_dc_delta_q;
+       cm->v_ac_delta_q = cm->u_ac_delta_q;
+     } else {
+       cm->v_dc_delta_q = read_delta_q(rb);
+       cm->v_ac_delta_q = read_delta_q(rb);
+     }
+   } else {
+     cm->u_dc_delta_q = 0;
+     cm->u_ac_delta_q = 0;
+     cm->v_dc_delta_q = 0;
+     cm->v_ac_delta_q = 0;
+   }
+
+   cm->dequant_bit_depth = seq_params->bit_depth;
+
+   cm->using_qmatrix = aom_rb_read_bit(rb);
+   if (!cm->using_qmatrix) {
+     cm->qm_y = 0;
+     cm->qm_u = 0;
+     cm->qm_v = 0;
+   } else {
+     cm->qm_y = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+     cm->qm_u = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+     if (seq_params->separate_uv_delta_q) {
+       cm->qm_v = aom_rb_read_literal(rb, QM_LEVEL_BITS);
+     } else {
+       cm->qm_v = cm->qm_u;
+     }
+   }
+ }
+
+ // Builds the per-segment Y/U/V dequantizer values (index 0 = DC, 1 = AC) and
+ // selects the inverse quantizer matrices for every transform size.
+ static void setup_segmentation_dequant(AV1_COMMON *const cm) {
+   const int bit_depth = cm->seq_params.bit_depth;
+   const int qm_enabled = cm->using_qmatrix;
+   // When segmentation is disabled, only the first value is used. The
+   // remaining are don't cares.
+   const int num_segments = cm->seg.enabled ? MAX_SEGMENTS : 1;
+
+   for (int seg_id = 0; seg_id < num_segments; ++seg_id) {
+     const int qindex = av1_get_qindex(&cm->seg, seg_id, cm->base_qindex);
+
+     cm->y_dequant_QTX[seg_id][0] =
+         av1_dc_quant_QTX(qindex, cm->y_dc_delta_q, bit_depth);
+     cm->y_dequant_QTX[seg_id][1] = av1_ac_quant_QTX(qindex, 0, bit_depth);
+     cm->u_dequant_QTX[seg_id][0] =
+         av1_dc_quant_QTX(qindex, cm->u_dc_delta_q, bit_depth);
+     cm->u_dequant_QTX[seg_id][1] =
+         av1_ac_quant_QTX(qindex, cm->u_ac_delta_q, bit_depth);
+     cm->v_dequant_QTX[seg_id][0] =
+         av1_dc_quant_QTX(qindex, cm->v_dc_delta_q, bit_depth);
+     cm->v_dequant_QTX[seg_id][1] =
+         av1_ac_quant_QTX(qindex, cm->v_ac_delta_q, bit_depth);
+
+     const int lossless = qindex == 0 && cm->y_dc_delta_q == 0 &&
+                          cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+                          cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+     // NB: depends on base index so there is only 1 set per frame
+     // No quant weighting when lossless or signalled not using QM
+     const int no_weighting = lossless || qm_enabled == 0;
+     const int qmlevel_y = no_weighting ? NUM_QM_LEVELS - 1 : cm->qm_y;
+     const int qmlevel_u = no_weighting ? NUM_QM_LEVELS - 1 : cm->qm_u;
+     const int qmlevel_v = no_weighting ? NUM_QM_LEVELS - 1 : cm->qm_v;
+
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       cm->y_iqmatrix[seg_id][tx] = av1_iqmatrix(cm, qmlevel_y, AOM_PLANE_Y, tx);
+     }
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       cm->u_iqmatrix[seg_id][tx] = av1_iqmatrix(cm, qmlevel_u, AOM_PLANE_U, tx);
+     }
+     for (int tx = 0; tx < TX_SIZES_ALL; ++tx) {
+       cm->v_iqmatrix[seg_id][tx] = av1_iqmatrix(cm, qmlevel_v, AOM_PLANE_V, tx);
+     }
+   }
+ }
+
+ // Reads the frame-level interpolation filter: a set flag means SWITCHABLE,
+ // otherwise the filter is coded as a LOG_SWITCHABLE_FILTERS-bit literal.
+ static InterpFilter read_frame_interp_filter(struct aom_read_bit_buffer *rb) {
+   if (aom_rb_read_bit(rb)) return SWITCHABLE;
+   return aom_rb_read_literal(rb, LOG_SWITCHABLE_FILTERS);
+ }
+
+ // Sets the render size to the superres-upscaled frame size, then overrides it
+ // with an explicitly coded size when the next bit in rb is set.
+ static void setup_render_size(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+   cm->render_width = cm->superres_upscaled_width;
+   cm->render_height = cm->superres_upscaled_height;
+
+   const int explicit_size = aom_rb_read_bit(rb);
+   if (!explicit_size) return;
+
+   av1_read_frame_size(rb, 16, 16, &cm->render_width, &cm->render_height);
+ }
+
+// TODO(afergs): make "struct aom_read_bit_buffer *const rb"?
+ // Records *width x *height as the upscaled frame size and, when the sequence
+ // enables superres, reads the scale denominator from rb. If scaling is
+ // signalled, *width and *height are replaced by the downscaled coded size.
+ static void setup_superres(AV1_COMMON *const cm, struct aom_read_bit_buffer *rb,
+                            int *width, int *height) {
+   cm->superres_upscaled_width = *width;
+   cm->superres_upscaled_height = *height;
+
+   const SequenceHeader *const seq_params = &cm->seq_params;
+   if (!seq_params->enable_superres) return;
+
+   const int use_superres = aom_rb_read_bit(rb);
+   if (!use_superres) {
+     // 1:1 scaling - ie. no scaling, scale not provided
+     cm->superres_scale_denominator = SCALE_NUMERATOR;
+     return;
+   }
+
+   cm->superres_scale_denominator =
+       (uint8_t)aom_rb_read_literal(rb, SUPERRES_SCALE_BITS);
+   cm->superres_scale_denominator += SUPERRES_SCALE_DENOMINATOR_MIN;
+   // Don't edit cm->width or cm->height directly, or the buffers won't get
+   // resized correctly
+   av1_calculate_scaled_superres_size(width, height,
+                                      cm->superres_scale_denominator);
+ }
+
+ // Adapts the context buffers to a (possibly new) coded frame size and stores
+ // that size in cm and in the current frame. Context buffers are reallocated
+ // only when the mode-info grid grows in either direction; otherwise just the
+ // mi dimensions are updated. Reports AOM_CODEC_MEM_ERROR on allocation
+ // failure and, with CONFIG_SIZE_LIMIT, AOM_CODEC_CORRUPT_FRAME for oversized
+ // frames.
+ static void resize_context_buffers(AV1_COMMON *cm, int width, int height) {
+ #if CONFIG_SIZE_LIMIT
+   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) {
+     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                        "Dimensions of %dx%d beyond allowed size of %dx%d.",
+                        width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
+   }
+ #endif
+   const int size_changed = cm->width != width || cm->height != height;
+   if (size_changed) {
+     const int new_mi_cols =
+         ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+     const int new_mi_rows =
+         ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2) >> MI_SIZE_LOG2;
+
+     // Allocations in av1_alloc_context_buffers() depend on individual
+     // dimensions as well as the overall size.
+     const int grid_grows =
+         new_mi_cols > cm->mi_cols || new_mi_rows > cm->mi_rows;
+     if (!grid_grows) {
+       av1_set_mb_mi(cm, width, height);
+     } else if (av1_alloc_context_buffers(cm, width, height)) {
+       // The cm->mi_* values have been cleared and any existing context
+       // buffers have been freed. Clear cm->width and cm->height to be
+       // consistent and to force a realloc next time.
+       cm->width = 0;
+       cm->height = 0;
+       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                          "Failed to allocate context buffers");
+     }
+     av1_init_context_buffers(cm);
+     cm->width = width;
+     cm->height = height;
+   }
+
+   ensure_mv_buffer(cm->cur_frame, cm);
+   cm->cur_frame->width = cm->width;
+   cm->cur_frame->height = cm->height;
+ }
+
+ // (Re)allocates the frame buffer of the new frame for the current frame size
+ // while holding the buffer-pool lock, then copies the sequence-level colour
+ // description and the render size into that buffer. Reports
+ // AOM_CODEC_MEM_ERROR when the allocation fails.
+ static void setup_buffer_pool(AV1_COMMON *cm) {
+   BufferPool *const pool = cm->buffer_pool;
+   const SequenceHeader *const seq_params = &cm->seq_params;
+
+   lock_buffer_pool(pool);
+   const int alloc_failed = aom_realloc_frame_buffer(
+       get_frame_new_buffer(cm), cm->width, cm->height,
+       seq_params->subsampling_x, seq_params->subsampling_y,
+       seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+       &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb,
+       pool->cb_priv);
+   if (alloc_failed) {
+     // Release the lock before reporting the error.
+     unlock_buffer_pool(pool);
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate frame buffer");
+   }
+   unlock_buffer_pool(pool);
+
+   YV12_BUFFER_CONFIG *const new_buf = &pool->frame_bufs[cm->new_fb_idx].buf;
+   new_buf->subsampling_x = seq_params->subsampling_x;
+   new_buf->subsampling_y = seq_params->subsampling_y;
+   new_buf->bit_depth = (unsigned int)seq_params->bit_depth;
+   new_buf->color_primaries = seq_params->color_primaries;
+   new_buf->transfer_characteristics = seq_params->transfer_characteristics;
+   new_buf->matrix_coefficients = seq_params->matrix_coefficients;
+   new_buf->monochrome = seq_params->monochrome;
+   new_buf->chroma_sample_position = seq_params->chroma_sample_position;
+   new_buf->color_range = seq_params->color_range;
+   new_buf->render_width = cm->render_width;
+   new_buf->render_height = cm->render_height;
+ }
+
+ // Determines the frame size: either read explicitly from rb (when
+ // frame_size_override_flag is set) or taken from the sequence maximum. Then
+ // applies superres, resizes the context buffers, reads the render size and
+ // sets up the frame buffer. An explicit size larger than the sequence maximum
+ // is reported as AOM_CODEC_CORRUPT_FRAME.
+ static void setup_frame_size(AV1_COMMON *cm, int frame_size_override_flag,
+                              struct aom_read_bit_buffer *rb) {
+   const SequenceHeader *const seq_params = &cm->seq_params;
+   int width = seq_params->max_frame_width;
+   int height = seq_params->max_frame_height;
+
+   if (frame_size_override_flag) {
+     av1_read_frame_size(rb, seq_params->num_bits_width,
+                         seq_params->num_bits_height, &width, &height);
+     const int too_large = width > seq_params->max_frame_width ||
+                           height > seq_params->max_frame_height;
+     if (too_large) {
+       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                          "Frame dimensions are larger than the maximum values");
+     }
+   }
+
+   setup_superres(cm, rb, &width, &height);
+   resize_context_buffers(cm, width, height);
+   setup_render_size(cm, rb);
+   setup_buffer_pool(cm);
+ }
+
+ // Reads one bit from rb selecting the superblock size (set: 128x128,
+ // clear: 64x64) and applies it to seq_params.
+ static void setup_sb_size(SequenceHeader *seq_params,
+                           struct aom_read_bit_buffer *rb) {
+   const BLOCK_SIZE sb_size =
+       aom_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64;
+   set_sb_size(seq_params, sb_size);
+ }
+
+ // Returns 1 when a reference frame has the same bit depth and the same
+ // chroma subsampling in both directions as the current frame, 0 otherwise.
+ static INLINE int valid_ref_frame_img_fmt(aom_bit_depth_t ref_bit_depth,
+                                           int ref_xss, int ref_yss,
+                                           aom_bit_depth_t this_bit_depth,
+                                           int this_xss, int this_yss) {
+   if (ref_bit_depth != this_bit_depth) return 0;
+   if (ref_xss != this_xss) return 0;
+   return ref_yss == this_yss;
+ }
+
+ // Determines the frame size of an inter frame. One flag per reference is read
+ // from rb; the first set flag makes the frame inherit that reference's size
+ // and render size. If no flag is set, the size is read explicitly. Afterwards
+ // the size is validated against the references (at least one must have a
+ // compatible size, all must share bit depth and subsampling) and the frame
+ // buffer is set up. Violations are reported as AOM_CODEC_CORRUPT_FRAME.
+ static void setup_frame_size_with_refs(AV1_COMMON *cm,
+                                        struct aom_read_bit_buffer *rb) {
+   int width, height;
+   int found = 0;
+
+   for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+     if (!aom_rb_read_bit(rb)) continue;
+
+     // Inherit the dimensions of this reference.
+     YV12_BUFFER_CONFIG *const buf = cm->frame_refs[ref].buf;
+     width = buf->y_crop_width;
+     height = buf->y_crop_height;
+     cm->render_width = buf->render_width;
+     cm->render_height = buf->render_height;
+     setup_superres(cm, rb, &width, &height);
+     resize_context_buffers(cm, width, height);
+     found = 1;
+     break;
+   }
+
+   const SequenceHeader *const seq_params = &cm->seq_params;
+   if (!found) {
+     // No reference selected: the size is coded explicitly.
+     av1_read_frame_size(rb, seq_params->num_bits_width,
+                         seq_params->num_bits_height, &width, &height);
+     setup_superres(cm, rb, &width, &height);
+     resize_context_buffers(cm, width, height);
+     setup_render_size(cm, rb);
+   }
+
+   if (width <= 0 || height <= 0) {
+     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                        "Invalid frame size");
+   }
+
+   // Check to make sure at least one of frames that this frame references
+   // has valid dimensions.
+   int has_valid_ref_frame = 0;
+   for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+     const YV12_BUFFER_CONFIG *const ref_buf = cm->frame_refs[ref].buf;
+     has_valid_ref_frame |= valid_ref_frame_size(
+         ref_buf->y_crop_width, ref_buf->y_crop_height, width, height);
+   }
+   if (!has_valid_ref_frame) {
+     aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                        "Referenced frame has invalid size");
+   }
+
+   // Every reference must match the sequence's bit depth and subsampling.
+   for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+     const YV12_BUFFER_CONFIG *const ref_buf = cm->frame_refs[ref].buf;
+     const int fmt_ok = valid_ref_frame_img_fmt(
+         ref_buf->bit_depth, ref_buf->subsampling_x, ref_buf->subsampling_y,
+         seq_params->bit_depth, seq_params->subsampling_x,
+         seq_params->subsampling_y);
+     if (!fmt_ok) {
+       aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                          "Referenced frame has incompatible color format");
+     }
+   }
+
+   setup_buffer_pool(cm);
+ }
+
+// Same function as av1_read_uniform, but reading from the uncompressed
+// header bit buffer 'rb'.
+//
+// With l = get_unsigned_bits(n) and m = (1 << l) - n, an (l - 1)-bit literal
+// v is read first. If v < m it is the result; otherwise one more bit is read
+// and the result is (v << 1) - m + bit.
+//
+// 'n' must be such that get_unsigned_bits(n) is nonzero.
+static int rb_read_uniform(struct aom_read_bit_buffer *const rb, int n) {
+  const int l = get_unsigned_bits(n);
+  // Verify the precondition before 'l' is used as the literal width (l - 1)
+  // below, so a violation is reported before any bits are consumed.
+  assert(l != 0);
+  const int m = (1 << l) - n;
+  const int v = aom_rb_read_literal(rb, l - 1);
+  if (v < m) return v;
+  return (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+// Parses the tile layout from the uncompressed header.
+//
+// After av1_get_tile_limits() and the uniform_tile_spacing_flag bit, columns
+// are handled first and rows second, each in one of two ways:
+//  - uniform spacing: the log2 tile count starts at its minimum and grows by
+//    one for every 1 bit read, stopping at a 0 bit or at the maximum;
+//  - explicit spacing: tile sizes (in superblocks) are read one at a time
+//    with rb_read_uniform() and the start position of each tile is stored.
+// av1_calculate_tile_cols()/av1_calculate_tile_rows() are called after the
+// respective dimension has been read.
+static void read_tile_info_max_tile(AV1_COMMON *const cm,
+                                    struct aom_read_bit_buffer *const rb) {
+  const int sb_log2 = cm->seq_params.mib_size_log2;
+  // Frame dimensions in mi units aligned with ALIGN_POWER_OF_TWO, then
+  // converted to superblock units. These act as "superblocks still to be
+  // assigned" counters in the explicit-spacing paths.
+  int width_sb = ALIGN_POWER_OF_TWO(cm->mi_cols, sb_log2) >> sb_log2;
+  int height_sb = ALIGN_POWER_OF_TWO(cm->mi_rows, sb_log2) >> sb_log2;
+
+  av1_get_tile_limits(cm);
+  cm->uniform_tile_spacing_flag = aom_rb_read_bit(rb);
+
+  // Read tile columns
+  if (cm->uniform_tile_spacing_flag) {
+    int log2_cols = cm->min_log2_tile_cols;
+    while (log2_cols < cm->max_log2_tile_cols && aom_rb_read_bit(rb)) {
+      ++log2_cols;
+    }
+    cm->log2_tile_cols = log2_cols;
+  } else {
+    int n = 0;
+    int start_sb = 0;
+    while (width_sb > 0 && n < MAX_TILE_COLS) {
+      const int size_sb =
+          1 + rb_read_uniform(rb, AOMMIN(width_sb, cm->max_tile_width_sb));
+      cm->tile_col_start_sb[n] = start_sb;
+      start_sb += size_sb;
+      width_sb -= size_sb;
+      ++n;
+    }
+    cm->tile_cols = n;
+    // One extra entry marks the end position of the last tile.
+    cm->tile_col_start_sb[n] = start_sb + width_sb;
+  }
+  av1_calculate_tile_cols(cm);
+
+  // Read tile rows
+  if (cm->uniform_tile_spacing_flag) {
+    int log2_rows = cm->min_log2_tile_rows;
+    while (log2_rows < cm->max_log2_tile_rows && aom_rb_read_bit(rb)) {
+      ++log2_rows;
+    }
+    cm->log2_tile_rows = log2_rows;
+  } else {
+    int n = 0;
+    int start_sb = 0;
+    while (height_sb > 0 && n < MAX_TILE_ROWS) {
+      const int size_sb =
+          1 + rb_read_uniform(rb, AOMMIN(height_sb, cm->max_tile_height_sb));
+      cm->tile_row_start_sb[n] = start_sb;
+      start_sb += size_sb;
+      height_sb -= size_sb;
+      ++n;
+    }
+    cm->tile_rows = n;
+    // One extra entry marks the end position of the last tile.
+    cm->tile_row_start_sb[n] = start_sb + height_sb;
+  }
+  av1_calculate_tile_rows(cm);
+}
+
+// Sets cm->single_tile_decoding. The flag is 1 only for large-scale-tile
+// streams in which the loop filter, CDEF and loop restoration are all
+// inactive for the frame; in every other case it is 0.
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm) {
+  cm->single_tile_decoding = 0;
+  if (!cm->large_scale_tile) return;
+
+  const struct loopfilter *const lf = &cm->lf;
+
+  // Loop filter is off when both filter_level entries are zero.
+  const int no_loopfilter = !lf->filter_level[0] && !lf->filter_level[1];
+
+  // CDEF is off when no CDEF bits are coded and the first luma and chroma
+  // strengths are zero.
+  const int no_cdef = cm->cdef_bits == 0 && cm->cdef_strengths[0] == 0 &&
+                      cm->cdef_uv_strengths[0] == 0;
+
+  // Restoration is off when all three rst_info entries are RESTORE_NONE.
+  int no_restoration = 1;
+  for (int p = 0; p < 3; ++p) {
+    if (cm->rst_info[p].frame_restoration_type != RESTORE_NONE) {
+      no_restoration = 0;
+    }
+  }
+
+  assert(IMPLIES(cm->coded_lossless, no_loopfilter && no_cdef));
+  assert(IMPLIES(cm->all_lossless, no_restoration));
+  cm->single_tile_decoding = no_loopfilter && no_cdef && no_restoration;
+}
+
+// Reads the tile layout and, when the frame has more than one tile, the id
+// of the tile used for the CDF update and the number of bytes used to code
+// each tile size. An out-of-range context_update_tile_id is reported as a
+// corrupt frame.
+static void read_tile_info(AV1Decoder *const pbi,
+                           struct aom_read_bit_buffer *const rb) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  read_tile_info_max_tile(cm, rb);
+
+  cm->context_update_tile_id = 0;
+  if (cm->tile_rows * cm->tile_cols <= 1) return;
+
+  // tile to use for cdf update
+  const int id_bits = cm->log2_tile_rows + cm->log2_tile_cols;
+  cm->context_update_tile_id = aom_rb_read_literal(rb, id_bits);
+  if (cm->context_update_tile_id >= cm->tile_rows * cm->tile_cols) {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Invalid context_update_tile_id");
+  }
+
+  // tile size magnitude: a 2-bit field holding (bytes - 1)
+  pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+}
+
+#if EXT_TILE_DEBUG
+// Reads the large-scale-tile size information, which is stored as a separate
+// byte: the reader is first advanced to the next byte boundary, and then,
+// when the frame has more than one tile, two 2-bit fields give the byte
+// widths (minus one) of the tile column size and of the tile size.
+static void read_ext_tile_info(AV1Decoder *const pbi,
+                               struct aom_read_bit_buffer *const rb) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  // Skip whatever bits remain in the current byte.
+  const int bits_into_byte = rb->bit_offset % CHAR_BIT;
+  if (bits_into_byte > 0) {
+    aom_rb_read_literal(rb, CHAR_BIT - bits_into_byte);
+  }
+  assert(rb->bit_offset % CHAR_BIT == 0);
+
+  if (cm->tile_cols * cm->tile_rows <= 1) return;
+
+  // Read the number of bytes used to store tile size
+  pbi->tile_col_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+  pbi->tile_size_bytes = aom_rb_read_literal(rb, 2) + 1;
+}
+#endif  // EXT_TILE_DEBUG
+
+// Reads an 'sz'-byte value (sz in 1..4) from 'src' using the matching
+// mem_get_le* helper (or a plain byte load for sz == 1). Any other 'sz'
+// trips an assertion and yields (size_t)-1.
+static size_t mem_get_varsize(const uint8_t *src, int sz) {
+  if (sz == 1) return src[0];
+  if (sz == 2) return mem_get_le16(src);
+  if (sz == 3) return mem_get_le24(src);
+  if (sz == 4) return mem_get_le32(src);
+
+  assert(0 && "Invalid size");
+  return (size_t)-1;
+}
+
+#if EXT_TILE_DEBUG
+// Locates one tile buffer of a large-scale-tile frame.
+//
+// Reads the 'tile_size_bytes'-wide tile header at '*data' and stores the
+// tile's data pointer and size in tile_buffers[row][col]. When
+// 'tile_copy_mode' is nonzero and the top bit of the header is set, the tile
+// has no payload of its own: the low 7 bits of the header's top byte give a
+// row offset, and the entry of the tile that many rows above in the same
+// column is reused. Otherwise the header value plus AV1_MIN_TILE_SIZE_BYTES
+// is the payload size.
+//
+// 'data_end' bounds the readable data. Truncated headers, payloads that run
+// past 'data_end', and copy offsets that point above row 0 are reported
+// through 'error_info' as AOM_CODEC_CORRUPT_FRAME. On return, '*data' is
+// updated to point to the end of the raw tile buffer in the bit stream.
+static void get_ls_tile_buffer(
+    const uint8_t *const data_end, struct aom_internal_error_info *error_info,
+    const uint8_t **data, TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+    int tile_size_bytes, int col, int row, int tile_copy_mode) {
+  size_t size;
+
+  size_t copy_size = 0;
+  const uint8_t *copy_data = NULL;
+
+  if (!read_is_valid(*data, tile_size_bytes, data_end))
+    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+  size = mem_get_varsize(*data, tile_size_bytes);
+
+  // If tile_copy_mode = 1, then the top bit of the tile header indicates copy
+  // mode.
+  if (tile_copy_mode && (size >> (tile_size_bytes * 8 - 1)) == 1) {
+    // The remaining bits in the top byte signal the row offset
+    int offset = (size >> (tile_size_bytes - 1) * 8) & 0x7f;
+
+    // The offset comes straight from the bitstream. Without this check an
+    // offset larger than 'row' would index tile_buffers[] before its start.
+    if (offset > row)
+      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid row offset in tile copy mode");
+
+    // Currently, only use tiles in same column as reference tiles.
+    copy_data = tile_buffers[row - offset][col].data;
+    copy_size = tile_buffers[row - offset][col].size;
+    size = 0;
+  } else {
+    size += AV1_MIN_TILE_SIZE_BYTES;
+  }
+
+  *data += tile_size_bytes;
+
+  if (size > (size_t)(data_end - *data))
+    aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile size");
+
+  if (size > 0) {
+    tile_buffers[row][col].data = *data;
+    tile_buffers[row][col].size = size;
+  } else {
+    // Copy mode: point this tile at the referenced tile's buffer.
+    tile_buffers[row][col].data = copy_data;
+    tile_buffers[row][col].size = copy_size;
+  }
+
+  *data += size;
+}
+
+// Locates the tile buffers of a large-scale-tile frame.
+//
+// With a single tile, the whole range [data, data_end) becomes
+// tile_buffers[0][0] and NULL is returned. Otherwise the per-column size
+// fields are walked to find where each tile column ends, and the buffers of
+// the requested tiles (pbi->dec_tile_row / pbi->dec_tile_col, or all of them
+// when those are negative) plus the whole last column are located with
+// get_ls_tile_buffer().
+//
+// Returns the end of the last tile buffer
+// (tile_buffers[cm->tile_rows - 1][cm->tile_cols - 1]).
+//
+// A tile column size field that is truncated, or that points past
+// 'data_end', is reported as AOM_CODEC_CORRUPT_FRAME.
+static const uint8_t *get_ls_tile_buffers(
+    AV1Decoder *pbi, const uint8_t *data, const uint8_t *data_end,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int have_tiles = tile_cols * tile_rows > 1;
+  const uint8_t *raw_data_end;  // The end of the last tile buffer
+
+  if (!have_tiles) {
+    const size_t tile_size = data_end - data;
+    tile_buffers[0][0].data = data;
+    tile_buffers[0][0].size = tile_size;
+    raw_data_end = NULL;
+  } else {
+    // We locate only the tile buffers that are required, which are the ones
+    // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+    // need the last (bottom right) tile buffer, as we need to know where the
+    // end of the compressed frame buffer is for proper superframe decoding.
+
+    const uint8_t *tile_col_data_end[MAX_TILE_COLS] = { NULL };
+    const uint8_t *const data_start = data;
+
+    const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+    const int single_row = pbi->dec_tile_row >= 0;
+    const int tile_rows_start = single_row ? dec_tile_row : 0;
+    const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+    const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+    const int single_col = pbi->dec_tile_col >= 0;
+    const int tile_cols_start = single_col ? dec_tile_col : 0;
+    const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+    const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+    const int tile_size_bytes = pbi->tile_size_bytes;
+    // Copy mode is only used when the larger tile dimension is at most 256
+    // (tile_width/tile_height shifted by MI_SIZE_LOG2).
+    const int tile_copy_mode =
+        ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256) ? 1
+                                                                           : 0;
+    // Read tile column sizes for all columns (we need the last tile buffer)
+    for (int c = 0; c < tile_cols; ++c) {
+      const int is_last = c == tile_cols - 1;
+      size_t tile_col_size;
+
+      if (!is_last) {
+        // The size field itself must lie inside the buffer.
+        if ((size_t)(data_end - data) < (size_t)tile_col_size_bytes)
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Not enough data to read tile_col_size");
+        tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+        data += tile_col_size_bytes;
+        // The column it describes must also end inside the buffer; without
+        // this check 'data' and tile_col_data_end[] could move past
+        // 'data_end'.
+        if (tile_col_size > (size_t)(data_end - data))
+          aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                             "Tile column size is out of bounds");
+        tile_col_data_end[c] = data + tile_col_size;
+      } else {
+        // The last column has no size field; it takes the remaining data.
+        tile_col_size = data_end - data;
+        tile_col_data_end[c] = data_end;
+      }
+      data += tile_col_size;
+    }
+
+    data = data_start;
+
+    // Read the required tile sizes.
+    for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+      const int is_last = c == tile_cols - 1;
+
+      if (c > 0) data = tile_col_data_end[c - 1];
+
+      // Skip this column's size field (the last column has none).
+      if (!is_last) data += tile_col_size_bytes;
+
+      // Get the whole of the last column, otherwise stop at the required tile.
+      for (int r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+      }
+    }
+
+    // If we have not read the last column, then read it to get the last tile.
+    if (tile_cols_end != tile_cols) {
+      const int c = tile_cols - 1;
+
+      data = tile_col_data_end[c - 1];
+
+      for (int r = 0; r < tile_rows; ++r) {
+        get_ls_tile_buffer(tile_col_data_end[c], &pbi->common.error, &data,
+                           tile_buffers, tile_size_bytes, c, r, tile_copy_mode);
+      }
+    }
+    raw_data_end = data;
+  }
+  return raw_data_end;
+}
+#endif // EXT_TILE_DEBUG
+
+// Records 'data' as the buffer of the single tile selected by
+// pbi->dec_tile_row / pbi->dec_tile_col, with pbi->coded_tile_data_size as
+// its size, and returns the address just past that tile's data. Both
+// selectors must be non-negative.
+static const uint8_t *get_ls_single_tile_buffer(
+    AV1Decoder *pbi, const uint8_t *data,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+  assert(pbi->dec_tile_row >= 0 && pbi->dec_tile_col >= 0);
+
+  TileBufferDec *const buf =
+      &tile_buffers[pbi->dec_tile_row][pbi->dec_tile_col];
+  buf->data = data;
+  buf->size = (size_t)pbi->coded_tile_data_size;
+
+  return data + pbi->coded_tile_data_size;
+}
+
+// Fills 'buf' with the location and size of the next tile and advances
+// '*data' past it. The last tile ('is_last' nonzero) has no size field and
+// takes everything up to 'data_end'. Any other tile is preceded by a
+// 'tile_size_bytes'-wide field holding its size minus
+// AV1_MIN_TILE_SIZE_BYTES. A truncated size field or a size that runs past
+// 'data_end' is reported through 'error_info' as AOM_CODEC_CORRUPT_FRAME.
+static void get_tile_buffer(const uint8_t *const data_end,
+                            const int tile_size_bytes, int is_last,
+                            struct aom_internal_error_info *error_info,
+                            const uint8_t **data, TileBufferDec *const buf) {
+  size_t size;
+
+  if (is_last) {
+    size = data_end - *data;
+  } else {
+    if (!read_is_valid(*data, tile_size_bytes, data_end)) {
+      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile length");
+    }
+
+    size = mem_get_varsize(*data, tile_size_bytes) + AV1_MIN_TILE_SIZE_BYTES;
+    *data += tile_size_bytes;
+
+    if (size > (size_t)(data_end - *data)) {
+      aom_internal_error(error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Truncated packet or corrupt tile size");
+    }
+  }
+
+  buf->data = *data;
+  buf->size = size;
+
+  *data += size;
+}
+
+// Locates the buffers of the tiles numbered start_tile..end_tile
+// (inclusive, in raster order over rows then columns) inside
+// [data, data_end) and stores them in 'tile_buffers'. Tiles outside that
+// range are left untouched. The tile numbered 'end_tile' is treated as the
+// last one, i.e. it has no size field. Running out of data before every
+// requested tile has been located is reported as AOM_CODEC_CORRUPT_FRAME.
+static void get_tile_buffers(AV1Decoder *pbi, const uint8_t *data,
+                             const uint8_t *data_end,
+                             TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+                             int start_tile, int end_tile) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  int tc = 0;  // Raster-order index of the current tile.
+
+  for (int r = 0; r < tile_rows; ++r) {
+    for (int c = 0; c < tile_cols; ++c, ++tc) {
+      TileBufferDec *const buf = &tile_buffers[r][c];
+      const int is_last = (tc == end_tile);
+
+      if (tc < start_tile || tc > end_tile) continue;
+
+      if (data >= data_end)
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Data ended before all tiles were read.");
+      get_tile_buffer(data_end, pbi->tile_size_bytes, is_last,
+                      &pbi->common.error, &data, buf);
+    }
+  }
+}
+
+// Points the per-plane coefficient, EOB and color-index-map pointers of 'xd'
+// at the CB_BUFFER belonging to the superblock that contains
+// (mi_row, mi_col), and resets the associated offsets to zero.
+//
+// The buffer is selected from 'cb_buffer_base' using a row stride of
+// (cm->mi_cols >> mib_size_log2) + 1 entries.
+static void set_cb_buffer(AV1Decoder *pbi, MACROBLOCKD *const xd,
+                          CB_BUFFER *cb_buffer_base, const int num_planes,
+                          int mi_row, int mi_col) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int sb_log2 = cm->seq_params.mib_size_log2;
+  const int sb_row = mi_row >> sb_log2;
+  const int sb_col = mi_col >> sb_log2;
+  const int stride = (cm->mi_cols >> sb_log2) + 1;
+  CB_BUFFER *const cb_buffer = &cb_buffer_base[sb_row * stride + sb_col];
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->dqcoeff_block = cb_buffer->dqcoeff[plane];
+    pd->eob_data = cb_buffer->eob_data[plane];
+    xd->cb_offset[plane] = 0;
+    xd->txb_offset[plane] = 0;
+  }
+
+  // Color index maps exist for the first two planes only.
+  for (int plane = 0; plane < 2; ++plane) {
+    xd->plane[plane].color_index_map = cb_buffer->color_index_map[plane];
+    xd->color_index_map_offset[plane] = 0;
+  }
+}
+
+// (Re)allocates pbi->tile_data to hold 'n_tiles' entries, 32-byte aligned,
+// and zeroes the row-MT sync state of every entry. Any previous array is
+// freed first; its contents are not preserved.
+static void decoder_alloc_tile_data(AV1Decoder *pbi, const int n_tiles) {
+  AV1_COMMON *const cm = &pbi->common;
+  aom_free(pbi->tile_data);
+  // The old array is gone, so drop the recorded count before allocating.
+  // NOTE(review): CHECK_MEM_ERROR presumably does not return on allocation
+  // failure; resetting the count first keeps it consistent with the freed
+  // array in that case - confirm against the macro definition.
+  pbi->allocated_tiles = 0;
+  CHECK_MEM_ERROR(cm, pbi->tile_data,
+                  aom_memalign(32, n_tiles * sizeof(*pbi->tile_data)));
+  pbi->allocated_tiles = n_tiles;
+  for (int i = 0; i < n_tiles; i++) {
+    TileDataDec *const tile_data = pbi->tile_data + i;
+    av1_zero(tile_data->dec_row_mt_sync);
+  }
+  pbi->allocated_row_mt_sync_rows = 0;
+}
+
+// Set up nsync by width.
+//
+// nsync numbers are picked by testing. A width-dependent selection
+// (1 below 640, 2 up to 1280, 4 up to 4096, 8 beyond that) is currently
+// compiled out, so 'width' is ignored and the result is always 1.
+static INLINE int get_sync_range(int width) {
+  (void)width;
+  return 1;
+}
+
+// Allocate memory for decoder row synchronization.
+//
+// Records 'rows' as the allocated superblock-row count, allocates one
+// cur_sb_col entry per row and, when CONFIG_MULTITHREAD is enabled, one
+// mutex and one condition variable per row (each initialized with default
+// attributes). Finally sets the sync range from the frame width.
+static void dec_row_mt_alloc(AV1DecRowMTSync *dec_row_mt_sync, AV1_COMMON *cm,
+                             int rows) {
+  dec_row_mt_sync->allocated_sb_rows = rows;
+#if CONFIG_MULTITHREAD
+  CHECK_MEM_ERROR(cm, dec_row_mt_sync->mutex_,
+                  aom_malloc(sizeof(*(dec_row_mt_sync->mutex_)) * rows));
+  if (dec_row_mt_sync->mutex_) {
+    for (int row = 0; row < rows; ++row) {
+      pthread_mutex_init(&dec_row_mt_sync->mutex_[row], NULL);
+    }
+  }
+
+  CHECK_MEM_ERROR(cm, dec_row_mt_sync->cond_,
+                  aom_malloc(sizeof(*(dec_row_mt_sync->cond_)) * rows));
+  if (dec_row_mt_sync->cond_) {
+    for (int row = 0; row < rows; ++row) {
+      pthread_cond_init(&dec_row_mt_sync->cond_[row], NULL);
+    }
+  }
+#endif  // CONFIG_MULTITHREAD
+
+  CHECK_MEM_ERROR(cm, dec_row_mt_sync->cur_sb_col,
+                  aom_malloc(sizeof(*(dec_row_mt_sync->cur_sb_col)) * rows));
+
+  // Set up nsync.
+  dec_row_mt_sync->sync_range = get_sync_range(cm->width);
+}
+
+// Deallocate decoder row synchronization related mutex and data.
+//
+// Destroys and frees the per-row mutexes and condition variables (when
+// CONFIG_MULTITHREAD is enabled), frees cur_sb_col, and zeroes the whole
+// structure. A NULL 'dec_row_mt_sync' is ignored.
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync) {
+  if (dec_row_mt_sync == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  const int rows = dec_row_mt_sync->allocated_sb_rows;
+
+  if (dec_row_mt_sync->mutex_ != NULL) {
+    for (int row = 0; row < rows; ++row) {
+      pthread_mutex_destroy(&dec_row_mt_sync->mutex_[row]);
+    }
+    aom_free(dec_row_mt_sync->mutex_);
+  }
+  if (dec_row_mt_sync->cond_ != NULL) {
+    for (int row = 0; row < rows; ++row) {
+      pthread_cond_destroy(&dec_row_mt_sync->cond_[row]);
+    }
+    aom_free(dec_row_mt_sync->cond_);
+  }
+#endif  // CONFIG_MULTITHREAD
+  aom_free(dec_row_mt_sync->cur_sb_col);
+
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*dec_row_mt_sync);
+}
+
+// Blocks until the superblock row above row 'r' has progressed far enough
+// for column 'c' of row 'r' to proceed.
+//
+// Nothing is done for row 0 or when 'c' is not a multiple of the sync range
+// (tested with c & (nsync - 1)). Otherwise the caller waits on the condition
+// variable of row r - 1 until c <= cur_sb_col[r - 1] - nsync. Without
+// CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_read(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+                             int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = dec_row_mt_sync->sync_range;
+  if (r == 0 || (c & (nsync - 1)) != 0) return;
+
+  const int above = r - 1;
+  pthread_mutex_t *const mutex = &dec_row_mt_sync->mutex_[above];
+
+  pthread_mutex_lock(mutex);
+  while (c > dec_row_mt_sync->cur_sb_col[above] - nsync) {
+    pthread_cond_wait(&dec_row_mt_sync->cond_[above], mutex);
+  }
+  pthread_mutex_unlock(mutex);
+#else
+  (void)dec_row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Publishes the progress of superblock row 'r' after column 'c' is done.
+//
+// For the last column of the row (c >= sb_cols - 1) the published position
+// is sb_cols + nsync. For any other column the position is 'c', and it is
+// only published when 'c' is a multiple of the sync range. Publishing
+// stores the position in cur_sb_col[r] under the row's mutex and signals
+// the row's condition variable. Without CONFIG_MULTITHREAD this is a no-op.
+static INLINE void sync_write(AV1DecRowMTSync *const dec_row_mt_sync, int r,
+                              int c, const int sb_cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = dec_row_mt_sync->sync_range;
+  const int row_finished = c >= sb_cols - 1;
+
+  // Intermediate columns only report every nsync-th column.
+  if (!row_finished && (c % nsync) != 0) return;
+
+  const int progress = row_finished ? sb_cols + nsync : c;
+
+  pthread_mutex_lock(&dec_row_mt_sync->mutex_[r]);
+  dec_row_mt_sync->cur_sb_col[r] = progress;
+  pthread_cond_signal(&dec_row_mt_sync->cond_[r]);
+  pthread_mutex_unlock(&dec_row_mt_sync->mutex_[r]);
+#else
+  (void)dec_row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)sb_cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+// Runs the decode pass (flag 0x2 of decode_partition) over one superblock
+// row of a tile. 'mi_row' is the row's position in mi units. Each
+// superblock first waits on sync_read() for the row above within the tile,
+// and reports its own completion through sync_write().
+static void decode_tile_sb_row(AV1Decoder *pbi, ThreadData *const td,
+                               TileInfo tile_info, const int mi_row) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int tile_idx =
+      tile_info.tile_row * cm->tile_cols + tile_info.tile_col;
+  AV1DecRowMTSync *const row_sync = &pbi->tile_data[tile_idx].dec_row_mt_sync;
+  const int sb_cols_in_tile = av1_get_sb_cols_in_tile(cm, tile_info);
+  const int sb_row_in_tile =
+      (mi_row - tile_info.mi_row_start) >> cm->seq_params.mib_size_log2;
+
+  int sb_col_in_tile = 0;
+  int mi_col = tile_info.mi_col_start;
+  while (mi_col < tile_info.mi_col_end) {
+    set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+                  mi_col);
+
+    sync_read(row_sync, sb_row_in_tile, sb_col_in_tile);
+
+    // Decoding of the super-block
+    decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                     cm->seq_params.sb_size, 0x2);
+
+    sync_write(row_sync, sb_row_in_tile, sb_col_in_tile, sb_cols_in_tile);
+
+    mi_col += cm->seq_params.mib_size;
+    ++sb_col_in_tile;
+  }
+}
+
+// Validates the bits that follow the symbol-coded data of a tile.
+// Returns 0 when the trailing bits are well-formed and -1 when the reader
+// has overflowed, when the byte holding the last consumed bit does not end
+// in a single 1 bit followed by zeros, or when any later padding byte is
+// nonzero.
+static int check_trailing_bits_after_symbol_coder(aom_reader *r) {
+  if (aom_reader_has_overflowed(r)) return -1;
+
+  const uint32_t nb_bits = aom_reader_tell(r);
+  const uint32_t nb_bytes = (nb_bits + 7) >> 3;
+  const uint8_t *p = aom_reader_find_begin(r) + nb_bytes;
+
+  // aom_reader_tell() returns 1 for a newly initialized decoder, and the
+  // return value only increases as values are decoded. So nb_bits > 0, and
+  // thus p > p_begin. Therefore accessing p[-1] is safe.
+  const uint8_t last_byte = p[-1];
+  // Mask with a single bit at the position of the last consumed bit.
+  const uint8_t pattern = 128 >> ((nb_bits - 1) & 7);
+  // That bit and everything below it must equal exactly the marker bit.
+  if ((last_byte & (2 * pattern - 1)) != pattern) return -1;
+
+  // Make sure that all padding bytes are zero as required by the spec.
+  for (const uint8_t *const p_end = aom_reader_find_end(r); p < p_end; ++p) {
+    if (*p != 0) return -1;
+  }
+  return 0;
+}
+
+// Installs the per-block visitor callbacks of 'td' according to
+// 'parse_decode_flag':
+//   bit 0x1 - enable the coefficient-reading callbacks;
+//   bit 0x2 - enable the prediction / reconstruction callbacks.
+// Callbacks of a stage whose bit is clear are set to the *_void
+// placeholders.
+static void set_decode_func_pointers(ThreadData *td, int parse_decode_flag) {
+  if (parse_decode_flag & 0x1) {
+    td->read_coeffs_tx_intra_block_visit = read_coeffs_tx_intra_block;
+    td->read_coeffs_tx_inter_block_visit = av1_read_coeffs_txb_facade;
+  } else {
+    td->read_coeffs_tx_intra_block_visit = decode_block_void;
+    td->read_coeffs_tx_inter_block_visit = decode_block_void;
+  }
+
+  if (parse_decode_flag & 0x2) {
+    td->predict_and_recon_intra_block_visit =
+        predict_and_reconstruct_intra_block;
+    td->inverse_tx_inter_block_visit = inverse_transform_inter_block;
+    td->predict_inter_block_visit = predict_inter_block;
+    td->cfl_store_inter_block_visit = cfl_store_inter_block;
+  } else {
+    td->predict_and_recon_intra_block_visit = decode_block_void;
+    td->inverse_tx_inter_block_visit = decode_block_void;
+    td->predict_inter_block_visit = predict_inter_block_void;
+    td->cfl_store_inter_block_visit = cfl_store_inter_block_void;
+  }
+}
+
+// Parses and reconstructs (flag 0x3) every superblock of the tile at
+// (tile_row, tile_col) using td->bit_reader.
+//
+// If the reader overflows after any superblock, td->xd.corrupted is flagged
+// and decoding of the tile stops immediately. After the last superblock the
+// trailing bits are validated and the outcome is merged into
+// td->xd.corrupted.
+static void decode_tile(AV1Decoder *pbi, ThreadData *const td, int tile_row,
+                        int tile_col) {
+  AV1_COMMON *const cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &td->xd;
+  TileInfo tile_info;
+
+  av1_tile_set_row(&tile_info, cm, tile_row);
+  av1_tile_set_col(&tile_info, cm, tile_col);
+  av1_zero_above_context(cm, xd, tile_info.mi_col_start, tile_info.mi_col_end,
+                         tile_row);
+  av1_reset_loop_filter_delta(xd, num_planes);
+  av1_reset_loop_restoration(xd, num_planes);
+
+  int mi_row = tile_info.mi_row_start;
+  while (mi_row < tile_info.mi_row_end) {
+    av1_zero_left_context(xd);
+
+    int mi_col = tile_info.mi_col_start;
+    while (mi_col < tile_info.mi_col_end) {
+      // The thread-local coefficient buffer is reused for every superblock.
+      set_cb_buffer(pbi, xd, &td->cb_buffer_base, num_planes, 0, 0);
+
+      // Bit-stream parsing and decoding of the superblock
+      decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+                       cm->seq_params.sb_size, 0x3);
+
+      if (aom_reader_has_overflowed(td->bit_reader)) {
+        aom_merge_corrupted_flag(&xd->corrupted, 1);
+        return;
+      }
+      mi_col += cm->seq_params.mib_size;
+    }
+    mi_row += cm->seq_params.mib_size;
+  }
+
+  const int bad_trailing_bits =
+      check_trailing_bits_after_symbol_coder(td->bit_reader) ? 1 : 0;
+  aom_merge_corrupted_flag(&xd->corrupted, bad_trailing_bits);
+}
+
+// Decodes, on the calling thread, the tiles numbered start_tile..end_tile
+// (inclusive, raster order) from [data, data_end).
+//
+// For large-scale-tile streams the set of rows/columns can additionally be
+// narrowed to a single row and/or column through pbi->dec_tile_row and
+// pbi->dec_tile_col, and CDF updates are disabled. pbi->inv_tile_order
+// reverses the traversal order.
+//
+// Returns 'data' unchanged when no tile falls in the requested range.
+// Otherwise returns the end of the consumed data: for large-scale-tile
+// streams the end of the single tile's reader (one tile) or the end of the
+// last tile buffer; for regular streams the end of the reader of tile
+// 'end_tile'. A corrupted tile raises AOM_CODEC_CORRUPT_FRAME.
+static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data,
+                                   const uint8_t *data_end, int start_tile,
+                                   int end_tile) {
+  AV1_COMMON *const cm = &pbi->common;
+  ThreadData *const td = &pbi->td;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int n_tiles = tile_cols * tile_rows;
+  TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+  const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+  const int single_row = pbi->dec_tile_row >= 0;
+  const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+  const int single_col = pbi->dec_tile_col >= 0;
+  const uint8_t *raw_data_end = NULL;
+
+  // Defaults describe a regular stream: every tile, CDF update allowed.
+  int tile_rows_start = 0;
+  int tile_rows_end = tile_rows;
+  int tile_cols_start = 0;
+  int tile_cols_end = tile_cols;
+  int inv_col_order = pbi->inv_tile_order;
+  int inv_row_order = pbi->inv_tile_order;
+  uint8_t allow_update_cdf = 1;
+
+  if (cm->large_scale_tile) {
+    if (single_row) {
+      tile_rows_start = dec_tile_row;
+      tile_rows_end = dec_tile_row + 1;
+    }
+    if (single_col) {
+      tile_cols_start = dec_tile_col;
+      tile_cols_end = dec_tile_col + 1;
+    }
+    inv_col_order = pbi->inv_tile_order && !single_col;
+    inv_row_order = pbi->inv_tile_order && !single_row;
+    allow_update_cdf = 0;
+  }
+
+  // No tiles to decode.
+  if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+      // First tile is larger than end_tile.
+      tile_rows_start * cm->tile_cols + tile_cols_start > end_tile ||
+      // Last tile is smaller than start_tile.
+      (tile_rows_end - 1) * cm->tile_cols + tile_cols_end - 1 < start_tile) {
+    return data;
+  }
+
+  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+  assert(tile_rows <= MAX_TILE_ROWS);
+  assert(tile_cols <= MAX_TILE_COLS);
+
+  // Locate the tile buffers. Note that the trailing 'else' below binds to
+  // the get_tile_buffers() call after the #endif.
+#if EXT_TILE_DEBUG
+  if (cm->large_scale_tile && !pbi->ext_tile_debug)
+    raw_data_end = get_ls_single_tile_buffer(pbi, data, tile_buffers);
+  else if (cm->large_scale_tile && pbi->ext_tile_debug)
+    raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+  else
+#endif  // EXT_TILE_DEBUG
+    get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);
+
+  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+    decoder_alloc_tile_data(pbi, n_tiles);
+  }
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    aom_accounting_reset(&pbi->accounting);
+  }
+#endif
+
+  set_decode_func_pointers(&pbi->td, 0x3);
+
+  // Load all tile information into thread_data.
+  td->xd = pbi->mb;
+  td->xd.corrupted = 0;
+  td->xd.mc_buf[0] = td->mc_buf[0];
+  td->xd.mc_buf[1] = td->mc_buf[1];
+  td->xd.tmp_conv_dst = td->tmp_conv_dst;
+  for (int j = 0; j < 2; ++j) {
+    td->xd.tmp_obmc_bufs[j] = td->tmp_obmc_bufs[j];
+  }
+
+  for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+
+    for (int tile_col = tile_cols_start; tile_col < tile_cols_end;
+         ++tile_col) {
+      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+      const int tile_idx = row * cm->tile_cols + col;
+      TileDataDec *const tile_data = pbi->tile_data + tile_idx;
+      const TileBufferDec *const tile_bs_buf = &tile_buffers[row][col];
+
+      // Skip tiles outside the requested range.
+      if (tile_idx < start_tile || tile_idx > end_tile) continue;
+
+      td->bit_reader = &tile_data->bit_reader;
+      av1_zero(td->dqcoeff);
+      av1_tile_init(&td->xd.tile, cm, row, col);
+      td->xd.current_qindex = cm->base_qindex;
+      setup_bool_decoder(tile_bs_buf->data, data_end, tile_bs_buf->size,
+                         &cm->error, td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+      if (pbi->acct_enabled) {
+        td->bit_reader->accounting = &pbi->accounting;
+        td->bit_reader->accounting->last_tell_frac =
+            aom_reader_tell_frac(td->bit_reader);
+      } else {
+        td->bit_reader->accounting = NULL;
+      }
+#endif
+      av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      av1_init_above_context(cm, &td->xd, row);
+
+      // Initialise the tile context from the frame context
+      tile_data->tctx = *cm->fc;
+      td->xd.tile_ctx = &tile_data->tctx;
+
+      // decode tile
+      decode_tile(pbi, td, row, col);
+      aom_merge_corrupted_flag(&pbi->mb.corrupted, td->xd.corrupted);
+      if (pbi->mb.corrupted) {
+        aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                           "Failed to decode tile data");
+      }
+    }
+  }
+
+  if (!cm->large_scale_tile) {
+    // Regular stream: the data ends where the reader of 'end_tile' ends.
+    return aom_reader_find_end(&pbi->tile_data[end_tile].bit_reader);
+  }
+  if (n_tiles == 1) {
+    // Find the end of the single tile buffer
+    return aom_reader_find_end(&pbi->tile_data->bit_reader);
+  }
+  // Return the end of the last tile buffer
+  return raw_data_end;
+}
+
+// Dequeues the next tile job from 'tile_mt_info' under its job mutex.
+// Returns NULL when every enqueued job has already been handed out, and
+// always when CONFIG_MULTITHREAD is disabled.
+static TileJobsDec *get_dec_job_info(AV1DecTileMT *tile_mt_info) {
+  TileJobsDec *job = NULL;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(tile_mt_info->job_mutex);
+
+  const int next = tile_mt_info->jobs_dequeued;
+  if (next < tile_mt_info->jobs_enqueued) {
+    job = &tile_mt_info->job_queue[next];
+    tile_mt_info->jobs_dequeued = next + 1;
+  }
+
+  pthread_mutex_unlock(tile_mt_info->job_mutex);
+#else
+  (void)tile_mt_info;
+#endif
+  return job;
+}
+
+// Prepares a worker's ThreadData for decoding one tile: attaches the tile's
+// bit reader, clears the dequantized-coefficient buffer, initializes the
+// tile bounds, the bool decoder over 'tile_buffer' and the macroblock
+// descriptor, routes errors to the worker's own error_info, and seeds the
+// tile's CDF context from the frame context.
+static void tile_worker_hook_init(AV1Decoder *const pbi,
+                                  DecWorkerData *const thread_data,
+                                  const TileBufferDec *const tile_buffer,
+                                  TileDataDec *const tile_data,
+                                  uint8_t allow_update_cdf) {
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+  MACROBLOCKD *const xd = &td->xd;
+  const int tile_row = tile_data->tile_info.tile_row;
+  const int tile_col = tile_data->tile_info.tile_col;
+
+  td->bit_reader = &tile_data->bit_reader;
+  av1_zero(td->dqcoeff);
+  av1_tile_init(&xd->tile, cm, tile_row, tile_col);
+  xd->current_qindex = cm->base_qindex;
+  setup_bool_decoder(tile_buffer->data, thread_data->data_end,
+                     tile_buffer->size, &thread_data->error_info,
+                     td->bit_reader, allow_update_cdf);
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    td->bit_reader->accounting = &pbi->accounting;
+    td->bit_reader->accounting->last_tell_frac =
+        aom_reader_tell_frac(td->bit_reader);
+  } else {
+    td->bit_reader->accounting = NULL;
+  }
+#endif
+  av1_init_macroblockd(cm, xd, td->dqcoeff);
+  // Errors raised while decoding this tile go to the worker's own record.
+  xd->error_info = &thread_data->error_info;
+  av1_init_above_context(cm, xd, tile_row);
+
+  // Initialise the tile context from the frame context
+  tile_data->tctx = *cm->fc;
+  xd->tile_ctx = &tile_data->tctx;
+#if CONFIG_ACCOUNTING
+  if (pbi->acct_enabled) {
+    tile_data->bit_reader.accounting->last_tell_frac =
+        aom_reader_tell_frac(&tile_data->bit_reader);
+  }
+#endif
+}
+
+// Worker entry point for tile-parallel decoding. 'arg1' is the worker's
+// DecWorkerData and 'arg2' the AV1Decoder. The worker keeps pulling tile
+// jobs from the shared queue and fully decodes each one, stopping when the
+// queue is empty or its own xd.corrupted flag is set.
+//
+// Returns 1 on success and 0 when a tile was corrupted or an error was
+// raised through the worker's error_info (handled via setjmp).
+static int tile_worker_hook(void *arg1, void *arg2) {
+  DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+  AV1Decoder *const pbi = (AV1Decoder *)arg2;
+  AV1_COMMON *cm = &pbi->common;
+  ThreadData *const td = thread_data->td;
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(thread_data->error_info.jmp)) {
+    thread_data->error_info.setjmp = 0;
+    thread_data->td->xd.corrupted = 1;
+    return 0;
+  }
+  thread_data->error_info.setjmp = 1;
+
+  // CDF updates are never allowed for large-scale-tile streams, and can be
+  // disabled for the frame.
+  uint8_t allow_update_cdf = cm->large_scale_tile ? 0 : 1;
+  allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+  set_decode_func_pointers(td, 0x3);
+
+  assert(cm->tile_cols > 0);
+  for (;;) {
+    TileJobsDec *const job = get_dec_job_info(&pbi->tile_mt_info);
+    if (job == NULL || td->xd.corrupted) break;
+
+    TileDataDec *const tile_data = job->tile_data;
+    tile_worker_hook_init(pbi, thread_data, job->tile_buffer, tile_data,
+                          allow_update_cdf);
+    // decode tile
+    decode_tile(pbi, td, tile_data->tile_info.tile_row,
+                tile_data->tile_info.tile_col);
+  }
+  thread_data->error_info.setjmp = 0;
+  return !td->xd.corrupted;
+}
+
+// Picks the next superblock row to decode in row-MT mode.
+//
+// '*next_job_info' is zeroed first. '*end_of_frame' is set when every mi row
+// has been handed out for decoding or when row_mt_exit is set; in that case
+// 1 is returned without choosing a job. If no parsed row is waiting to be
+// decoded, 0 is returned. Otherwise, among the tiles in range that have
+// parsed-but-undecoded rows, the one with the fewest working threads is
+// chosen (ties go to the tile with the most mi units left to decode), its
+// next row is written to '*next_job_info', the progress counters are
+// advanced by one superblock row, and 1 is returned.
+//
+// NOTE(review): this reads and updates shared counters without taking a
+// lock itself; presumably the caller holds the row-MT mutex - confirm.
+static int get_next_job_info(AV1Decoder *const pbi,
+                             AV1DecRowMTJobInfo *next_job_info,
+                             int *end_of_frame) {
+  AV1_COMMON *cm = &pbi->common;
+  AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+  const int tile_rows_start = frame_row_mt_info->tile_rows_start;
+  const int tile_rows_end = frame_row_mt_info->tile_rows_end;
+  const int tile_cols_start = frame_row_mt_info->tile_cols_start;
+  const int tile_cols_end = frame_row_mt_info->tile_cols_end;
+  const int start_tile = frame_row_mt_info->start_tile;
+  const int end_tile = frame_row_mt_info->end_tile;
+  const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];
+  int min_threads_working = INT_MAX;
+  int max_mis_to_decode = 0;
+  int best_row = 0;
+  int best_col = 0;
+
+  memset(next_job_info, 0, sizeof(*next_job_info));
+
+  // Frame decode is completed or error is encountered.
+  *end_of_frame = (frame_row_mt_info->mi_rows_decode_started ==
+                   frame_row_mt_info->mi_rows_to_decode) ||
+                  (frame_row_mt_info->row_mt_exit == 1);
+  if (*end_of_frame) return 1;
+
+  // Decoding cannot start as bit-stream parsing is not complete.
+  if (frame_row_mt_info->mi_rows_parse_done -
+          frame_row_mt_info->mi_rows_decode_started ==
+      0) {
+    return 0;
+  }
+
+  // Choose the tile to decode.
+  for (int r = tile_rows_start; r < tile_rows_end; ++r) {
+    for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+      const int tile_idx = r * cm->tile_cols + c;
+      if (tile_idx < start_tile || tile_idx > end_tile) continue;
+
+      const AV1DecRowMTSync *const sync =
+          &pbi->tile_data[tile_idx].dec_row_mt_sync;
+      const int num_threads_working = sync->num_threads_working;
+      // mi units parsed but not yet handed out for decoding.
+      const int num_mis_waiting_for_decode =
+          (sync->mi_rows_parse_done - sync->mi_rows_decode_started) *
+          sync->mi_cols;
+      // mi units of this tile not yet handed out at all.
+      const int num_mis_to_decode =
+          (sync->mi_rows - sync->mi_rows_decode_started) * sync->mi_cols;
+
+      assert(num_mis_to_decode >= num_mis_waiting_for_decode);
+
+      if (num_mis_waiting_for_decode <= 0) continue;
+
+      // Pick the tile which has minimum number of threads working on it.
+      if (num_threads_working < min_threads_working) {
+        min_threads_working = num_threads_working;
+        max_mis_to_decode = 0;
+      }
+      if (num_threads_working == min_threads_working &&
+          num_mis_to_decode > max_mis_to_decode) {
+        max_mis_to_decode = num_mis_to_decode;
+        best_row = r;
+        best_col = c;
+      }
+    }
+  }
+
+  TileDataDec *const chosen =
+      pbi->tile_data + best_row * cm->tile_cols + best_col;
+  AV1DecRowMTSync *const chosen_sync = &chosen->dec_row_mt_sync;
+
+  next_job_info->tile_row = best_row;
+  next_job_info->tile_col = best_col;
+  next_job_info->mi_row =
+      chosen_sync->mi_rows_decode_started + chosen->tile_info.mi_row_start;
+
+  // Account for the row that was just handed out.
+  chosen_sync->num_threads_working++;
+  chosen_sync->mi_rows_decode_started += sb_mi_size;
+  frame_row_mt_info->mi_rows_decode_started += sb_mi_size;
+
+  return 1;
+}
+
+// Records that one more superblock row ('sb_mi_size' mi rows) of
+// 'tile_data' has been parsed, in both the tile's and the frame's counters.
+// With CONFIG_MULTITHREAD the update is done under pbi->row_mt_mutex_ and
+// all waiters on pbi->row_mt_cond_ are woken.
+static INLINE void signal_parse_sb_row_done(AV1Decoder *const pbi,
+                                            TileDataDec *const tile_data,
+                                            const int sb_mi_size) {
+  AV1DecRowMTInfo *const frame_info = &pbi->frame_row_mt_info;
+  AV1DecRowMTSync *const tile_sync = &tile_data->dec_row_mt_sync;
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(pbi->row_mt_mutex_);
+#endif
+
+  tile_sync->mi_rows_parse_done += sb_mi_size;
+  frame_info->mi_rows_parse_done += sb_mi_size;
+
+#if CONFIG_MULTITHREAD
+  pthread_cond_broadcast(pbi->row_mt_cond_);
+  pthread_mutex_unlock(pbi->row_mt_mutex_);
+#endif
+}
+
+ /* Worker entry point for row-based multi-threaded decoding.
+  *
+  * arg1 is this worker's DecWorkerData and arg2 is the AV1Decoder. The hook
+  * runs two phases:
+  *   1. It pulls tile jobs from pbi->tile_mt_info and parses each tile's
+  *      bitstream superblock by superblock (decode_partition with 0x1),
+  *      calling signal_parse_sb_row_done() after every superblock row.
+  *   2. It repeatedly obtains a (tile, mi_row) job from get_next_job_info()
+  *      and processes that row with decode_tile_sb_row(), until
+  *      end_of_frame is reported.
+  *
+  * Returns 1 when this worker's xd.corrupted flag is clear at the end, and 0
+  * when it is set or when an error longjmp'd back to the setjmp below.
+  */ static int row_mt_worker_hook(void *arg1, void *arg2) {
+ DecWorkerData *const thread_data = (DecWorkerData *)arg1;
+ AV1Decoder *const pbi = (AV1Decoder *)arg2;
+ AV1_COMMON *cm = &pbi->common;
+ ThreadData *const td = thread_data->td;
+ uint8_t allow_update_cdf;
+ const int sb_mi_size = mi_size_wide[cm->seq_params.sb_size];  // step added to the parse counters per superblock row
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+ td->xd.corrupted = 0;
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(thread_data->error_info.jmp)) {  // nonzero only when re-entered via longjmp
+ thread_data->error_info.setjmp = 0;
+ thread_data->td->xd.corrupted = 1;
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+ #endif
+ frame_row_mt_info->row_mt_exit = 1;  // NOTE(review): presumably read by get_next_job_info() so waiting workers can stop -- confirm
+ #if CONFIG_MULTITHREAD
+ pthread_cond_broadcast(pbi->row_mt_cond_);
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+ #endif
+ return 0;
+ }
+ thread_data->error_info.setjmp = 1;
+
+ const int num_planes = av1_num_planes(cm);
+ allow_update_cdf = cm->large_scale_tile ? 0 : 1;  // never update CDFs for large-scale-tile streams
+ allow_update_cdf = allow_update_cdf && !cm->disable_cdf_update;
+
+ assert(cm->tile_cols > 0);
+ while (1) {  // phase 1: parse whole tiles taken from the tile job queue
+ TileJobsDec *cur_job_info = get_dec_job_info(&pbi->tile_mt_info);
+
+ if (cur_job_info != NULL && !td->xd.corrupted) {  // stop when the queue is empty or this worker saw corruption
+ const TileBufferDec *const tile_buffer = cur_job_info->tile_buffer;
+ TileDataDec *const tile_data = cur_job_info->tile_data;
+ tile_worker_hook_init(pbi, thread_data, tile_buffer, tile_data,
+ allow_update_cdf);
+
+ set_decode_func_pointers(td, 0x1);  // same 0x1 value is passed to decode_partition() below for the parsing pass
+
+ // decode tile
+ TileInfo tile_info = tile_data->tile_info;
+ int tile_row = tile_info.tile_row;
+
+ av1_zero_above_context(cm, &td->xd, tile_info.mi_col_start,
+ tile_info.mi_col_end, tile_row);
+ av1_reset_loop_filter_delta(&td->xd, num_planes);
+ av1_reset_loop_restoration(&td->xd, num_planes);
+
+ for (int mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size) {
+ av1_zero_left_context(&td->xd);
+
+ for (int mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ set_cb_buffer(pbi, &td->xd, pbi->cb_buffer_base, num_planes, mi_row,
+ mi_col);
+
+ // Bit-stream parsing of the superblock
+ decode_partition(pbi, td, mi_row, mi_col, td->bit_reader,
+ cm->seq_params.sb_size, 0x1);
+ }
+ signal_parse_sb_row_done(pbi, tile_data, sb_mi_size);  // publish the parsed row to all workers
+ }
+
+ int corrupted =
+ (check_trailing_bits_after_symbol_coder(td->bit_reader)) ? 1 : 0;
+ aom_merge_corrupted_flag(&td->xd.corrupted, corrupted);
+ } else {
+ break;
+ }
+ }
+
+ set_decode_func_pointers(td, 0x2);  // switch function pointers for the second pass
+
+ while (1) {  // phase 2: process superblock rows handed out by get_next_job_info()
+ AV1DecRowMTJobInfo next_job_info;
+ int end_of_frame = 0;
+
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+ #endif
+ while (!get_next_job_info(pbi, &next_job_info, &end_of_frame)) {  // zero return: nothing available yet, so wait
+ #if CONFIG_MULTITHREAD
+ pthread_cond_wait(pbi->row_mt_cond_, pbi->row_mt_mutex_);
+ #endif
+ }
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+ #endif
+
+ if (end_of_frame) break;
+
+ int tile_row = next_job_info.tile_row;
+ int tile_col = next_job_info.tile_col;
+ int mi_row = next_job_info.mi_row;
+
+ TileDataDec *tile_data =
+ pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ AV1DecRowMTSync *dec_row_mt_sync = &tile_data->dec_row_mt_sync;
+ TileInfo tile_info = tile_data->tile_info;
+
+ av1_tile_init(&td->xd.tile, cm, tile_row, tile_col);
+ av1_init_macroblockd(cm, &td->xd, td->dqcoeff);
+ td->xd.error_info = &thread_data->error_info;  // errors raised through xd jump back to this worker's setjmp
+
+ decode_tile_sb_row(pbi, td, tile_info, mi_row);
+
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_lock(pbi->row_mt_mutex_);
+ #endif
+ dec_row_mt_sync->num_threads_working--;  // this worker is done with the row it was given
+ #if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(pbi->row_mt_mutex_);
+ #endif
+ }
+ thread_data->error_info.setjmp = 0;
+ return !td->xd.corrupted;
+ }
+
+ // qsort() comparator that orders TileJobsDec entries by tile buffer size,
+ // largest first. Returns a negative value when `a` refers to the larger
+ // buffer, a positive value when `b` does, and 0 when the sizes are equal.
+ // The sizes are compared directly instead of being cast to int and
+ // subtracted, so large sizes cannot be truncated by the cast and the
+ // subtraction cannot overflow.
+ static int compare_tile_buffers(const void *a, const void *b) {
+   const TileJobsDec *const buf1 = (const TileJobsDec *)a;
+   const TileJobsDec *const buf2 = (const TileJobsDec *)b;
+   return (buf2->tile_buffer->size > buf1->tile_buffer->size) -
+          (buf2->tile_buffer->size < buf1->tile_buffer->size);
+ }
+
+ // Rebuilds the tile job queue for the current tile group. Both job counters
+ // are reset, then every tile in rows [tile_rows_start, tile_rows_end) and
+ // columns [tile_cols_start, tile_cols_end) whose raster index lies in
+ // [startTile, endTile] gets a queue entry pointing at its tile buffer and
+ // its TileDataDec. Entries are appended in raster order.
+ static void enqueue_tile_jobs(AV1Decoder *pbi, AV1_COMMON *cm,
+                               int tile_rows_start, int tile_rows_end,
+                               int tile_cols_start, int tile_cols_end,
+                               int startTile, int endTile) {
+   AV1DecTileMT *const mt_info = &pbi->tile_mt_info;
+   TileJobsDec *next_job = mt_info->job_queue;
+
+   mt_info->jobs_dequeued = 0;
+   mt_info->jobs_enqueued = 0;
+
+   for (int r = tile_rows_start; r < tile_rows_end; ++r) {
+     for (int c = tile_cols_start; c < tile_cols_end; ++c) {
+       const int tile_idx = r * cm->tile_cols + c;
+       if (tile_idx >= startTile && tile_idx <= endTile) {
+         next_job->tile_buffer = &pbi->tile_buffers[r][c];
+         next_job->tile_data = &pbi->tile_data[tile_idx];
+         ++next_job;
+         ++mt_info->jobs_enqueued;
+       }
+     }
+   }
+ }
+
+ // Sizes the tile job bookkeeping in `tile_mt_info` for a grid of
+ // tile_rows x tile_cols tiles. The grid dimensions are recorded first. When
+ // CONFIG_MULTITHREAD is set, one job mutex per tile is allocated and
+ // initialized. Finally the job queue is allocated with one slot per tile.
+ // Allocations go through CHECK_MEM_ERROR with `cm`.
+ static void alloc_dec_jobs(AV1DecTileMT *tile_mt_info, AV1_COMMON *cm,
+                            int tile_rows, int tile_cols) {
+   const int tile_count = tile_rows * tile_cols;
+   tile_mt_info->alloc_tile_rows = tile_rows;
+   tile_mt_info->alloc_tile_cols = tile_cols;
+ #if CONFIG_MULTITHREAD
+   CHECK_MEM_ERROR(cm, tile_mt_info->job_mutex,
+                   aom_malloc(sizeof(tile_mt_info->job_mutex[0]) * tile_count));
+   for (int idx = 0; idx < tile_count; ++idx) {
+     pthread_mutex_init(tile_mt_info->job_mutex + idx, NULL);
+   }
+ #endif
+   CHECK_MEM_ERROR(cm, tile_mt_info->job_queue,
+                   aom_malloc(sizeof(tile_mt_info->job_queue[0]) * tile_count));
+ }
+
+ // Releases the scratch buffers held by `thread_data`: both mc_buf entries
+ // (passed through CONVERT_TO_SHORTPTR first when mc_buf_use_highbd is set),
+ // tmp_conv_dst, and both tmp_obmc_bufs. Every freed pointer is reset to
+ // NULL, and mc_buf_size and mc_buf_use_highbd are cleared.
+ void av1_free_mc_tmp_buf(ThreadData *thread_data) {
+   const int was_highbd = thread_data->mc_buf_use_highbd;
+   for (int i = 0; i < 2; ++i) {
+     if (was_highbd) {
+       aom_free(CONVERT_TO_SHORTPTR(thread_data->mc_buf[i]));
+     } else {
+       aom_free(thread_data->mc_buf[i]);
+     }
+     thread_data->mc_buf[i] = NULL;
+   }
+   thread_data->mc_buf_use_highbd = 0;
+   thread_data->mc_buf_size = 0;
+
+   aom_free(thread_data->tmp_conv_dst);
+   thread_data->tmp_conv_dst = NULL;
+
+   aom_free(thread_data->tmp_obmc_bufs[0]);
+   thread_data->tmp_obmc_bufs[0] = NULL;
+   aom_free(thread_data->tmp_obmc_bufs[1]);
+   thread_data->tmp_obmc_bufs[1] = NULL;
+ }
+
+ // Allocates the scratch buffers of `thread_data`, in this order:
+ //   - two mc_buf entries of buf_size bytes, 16-byte aligned (stored via
+ //     CONVERT_TO_BYTEPTR when use_highbd is nonzero),
+ //   - tmp_conv_dst, 32-byte aligned, MAX_SB_SIZE * MAX_SB_SIZE elements,
+ //   - two tmp_obmc_bufs, 16-byte aligned, 2 * MAX_MB_PLANE * MAX_SB_SQUARE
+ //     elements each.
+ // buf_size and use_highbd are recorded in thread_data once both mc_buf
+ // entries exist. Allocations go through CHECK_MEM_ERROR with `cm`.
+ static void allocate_mc_tmp_buf(AV1_COMMON *const cm, ThreadData *thread_data,
+                                 int buf_size, int use_highbd) {
+   for (int i = 0; i < 2; ++i) {
+     if (!use_highbd) {
+       CHECK_MEM_ERROR(cm, thread_data->mc_buf[i],
+                       (uint8_t *)aom_memalign(16, buf_size));
+       continue;
+     }
+     uint16_t *buf16;
+     CHECK_MEM_ERROR(cm, buf16, (uint16_t *)aom_memalign(16, buf_size));
+     thread_data->mc_buf[i] = CONVERT_TO_BYTEPTR(buf16);
+   }
+   thread_data->mc_buf_size = buf_size;
+   thread_data->mc_buf_use_highbd = use_highbd;
+
+   CHECK_MEM_ERROR(cm, thread_data->tmp_conv_dst,
+                   aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+                                        sizeof(thread_data->tmp_conv_dst[0])));
+   for (int k = 0; k < 2; ++k) {
+     CHECK_MEM_ERROR(
+         cm, thread_data->tmp_obmc_bufs[k],
+         aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+                              sizeof(thread_data->tmp_obmc_bufs[k][0])));
+   }
+ }
+
+ // Prepares the first num_workers workers for a new decoding pass. For each
+ // worker, its ThreadData's xd is overwritten with a copy of pbi->mb, the
+ // corrupted flag is cleared and xd's scratch-buffer pointers are redirected
+ // to the thread's own buffers. The worker is then synced and given
+ // worker_hook with (thread_data, pbi) as its two arguments. When
+ // CONFIG_ACCOUNTING is set and accounting is enabled, the accounting state
+ // is reset as well.
+ static void reset_dec_workers(AV1Decoder *pbi, AVxWorkerHook worker_hook,
+                               int num_workers) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+   for (int i = 0; i < num_workers; ++i) {
+     DecWorkerData *const thread_data = &pbi->thread_data[i];
+     AVxWorker *const worker = pbi->tile_workers + i;
+     ThreadData *const td = thread_data->td;
+
+     td->xd = pbi->mb;
+     td->xd.corrupted = 0;
+     for (int ref = 0; ref < 2; ++ref) td->xd.mc_buf[ref] = td->mc_buf[ref];
+     td->xd.tmp_conv_dst = td->tmp_conv_dst;
+     td->xd.tmp_obmc_bufs[0] = td->tmp_obmc_bufs[0];
+     td->xd.tmp_obmc_bufs[1] = td->tmp_obmc_bufs[1];
+
+     winterface->sync(worker);
+
+     // Install the hook and its two arguments.
+     worker->hook = worker_hook;
+     worker->data1 = thread_data;
+     worker->data2 = pbi;
+   }
+ #if CONFIG_ACCOUNTING
+   if (pbi->acct_enabled) aom_accounting_reset(&pbi->accounting);
+ #endif
+ }
+
+ // Starts the first num_workers workers. Each worker's DecWorkerData (taken
+ // from worker->data1) receives data_end and its had_error flag is cleared.
+ // All workers but the last are started with launch(); the last one is
+ // started with execute().
+ // NOTE(review): execute() presumably runs the hook on the calling thread --
+ // confirm against the worker interface.
+ static void launch_dec_workers(AV1Decoder *pbi, const uint8_t *data_end,
+                                int num_workers) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   const int last_idx = num_workers - 1;
+
+   for (int i = 0; i <= last_idx; ++i) {
+     AVxWorker *const worker = pbi->tile_workers + i;
+     DecWorkerData *const thread_data = (DecWorkerData *)worker->data1;
+
+     thread_data->data_end = data_end;
+     worker->had_error = 0;
+
+     if (i != last_idx) {
+       winterface->launch(worker);
+     } else {
+       winterface->execute(worker);
+     }
+   }
+ }
+
+ // Waits for the first num_workers workers, from the highest index down to
+ // index 0, and folds their results into pbi->mb.corrupted: a worker whose
+ // sync() returns 0 marks the frame as corrupted.
+ static void sync_dec_workers(AV1Decoder *pbi, int num_workers) {
+   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+   int any_corrupted = 0;
+   int i = num_workers;
+
+   while (i-- > 0) {
+     aom_merge_corrupted_flag(&any_corrupted,
+                              !winterface->sync(&pbi->tile_workers[i]));
+   }
+
+   pbi->mb.corrupted = any_corrupted;
+ }
+
+ /* Creates the worker pool on first use and sizes each worker's scratch
+  * buffers.
+  *
+  * When pbi->num_workers is 0 this allocates pbi->max_threads AVxWorker and
+  * DecWorkerData entries. Every worker except the last is reset through the
+  * worker interface (a failure is reported as "Tile decoder thread creation
+  * failed") and gets its own 32-byte aligned, zeroed ThreadData. The last
+  * entry reuses pbi->td because the main thread acts as a worker.
+  *
+  * On every call, the temporary buffers of all workers except the last are
+  * freed and reallocated whenever the required size (MC_TEMP_BUF_PELS,
+  * shifted left by one for high bit-depth) differs from the recorded
+  * mc_buf_size.
+  */ static void decode_mt_init(AV1Decoder *pbi) {
+ AV1_COMMON *const cm = &pbi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+ int worker_idx;
+
+ // Create workers and thread_data
+ if (pbi->num_workers == 0) {
+ const int num_threads = pbi->max_threads;
+ CHECK_MEM_ERROR(cm, pbi->tile_workers,
+ aom_malloc(num_threads * sizeof(*pbi->tile_workers)));
+ CHECK_MEM_ERROR(cm, pbi->thread_data,
+ aom_malloc(num_threads * sizeof(*pbi->thread_data)));
+
+ for (worker_idx = 0; worker_idx < num_threads; ++worker_idx) {
+ AVxWorker *const worker = &pbi->tile_workers[worker_idx];
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ ++pbi->num_workers;  // counted before init, so a failure below still leaves this entry included
+
+ winterface->init(worker);
+ if (worker_idx < num_threads - 1 && !winterface->reset(worker)) {  // the last worker is never reset here
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile decoder thread creation failed");
+ }
+
+ if (worker_idx < num_threads - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+ } else {
+ // Main thread acts as a worker and uses the thread data in pbi
+ thread_data->td = &pbi->td;
+ }
+ thread_data->error_info.error_code = AOM_CODEC_OK;
+ thread_data->error_info.setjmp = 0;
+ }
+ }
+ const int use_highbd = cm->seq_params.use_highbitdepth ? 1 : 0;
+ const int buf_size = MC_TEMP_BUF_PELS << use_highbd;  // doubled when high bit-depth is in use
+ for (worker_idx = 0; worker_idx < pbi->max_threads - 1; ++worker_idx) {  // skips the last entry, which uses pbi->td
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ if (thread_data->td->mc_buf_size != buf_size) {
+ av1_free_mc_tmp_buf(thread_data->td);
+ allocate_mc_tmp_buf(cm, thread_data->td, buf_size, use_highbd);
+ }
+ }
+ }
+
+ // Prepares pbi->tile_mt_info.job_queue for the current tile group. The job
+ // storage is reallocated when the tile grid (tile_cols x tile_rows) differs
+ // from the one it was allocated for. The selected tiles are then enqueued
+ // and the queue is sorted with compare_tile_buffers.
+ static void tile_mt_queue(AV1Decoder *pbi, int tile_cols, int tile_rows,
+                           int tile_rows_start, int tile_rows_end,
+                           int tile_cols_start, int tile_cols_end,
+                           int start_tile, int end_tile) {
+   AV1_COMMON *const cm = &pbi->common;
+   AV1DecTileMT *const mt_info = &pbi->tile_mt_info;
+   const int same_layout = mt_info->alloc_tile_cols == tile_cols &&
+                           mt_info->alloc_tile_rows == tile_rows;
+
+   if (!same_layout) {
+     av1_dealloc_dec_jobs(mt_info);
+     alloc_dec_jobs(mt_info, cm, tile_rows, tile_cols);
+   }
+
+   enqueue_tile_jobs(pbi, cm, tile_rows_start, tile_rows_end, tile_cols_start,
+                     tile_cols_end, start_tile, end_tile);
+
+   qsort(mt_info->job_queue, mt_info->jobs_enqueued,
+         sizeof(mt_info->job_queue[0]), compare_tile_buffers);
+ }
+
+ /* Decodes the tiles with raster index start_tile..end_tile (inclusive)
+  * using up to min(pbi->max_threads, tile count) workers that run
+  * tile_worker_hook.
+  *
+  * data/data_end delimit the input handed to get_tile_buffers() (or
+  * get_ls_tile_buffers() for large-scale-tile streams under EXT_TILE_DEBUG).
+  *
+  * Returns `data` unchanged when the selected tile range is empty. Otherwise
+  * returns the end of the consumed data: raw_data_end for large-scale-tile
+  * streams with more than one tile, or the position reported by
+  * aom_reader_find_end() for the relevant tile's bit reader.
+  * Reports AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when any
+  * worker flagged corruption.
+  */ static const uint8_t *decode_tiles_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end, int start_tile,
+ int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;  // a non-negative dec_tile_row selects one row
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;  // a non-negative dec_tile_col selects one column
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;  // only assigned in the EXT_TILE_DEBUG large-scale-tile path below
+
+ if (cm->large_scale_tile) {  // only large-scale-tile streams honor the single row/column selection
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;
+ num_workers = AOMMIN(pbi->max_threads, tile_count_tg);  // never more workers than tiles in the group
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ decode_mt_init(pbi);
+
+ // get tile size in tile group
+ #if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+ if (cm->large_scale_tile)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+ #endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);  // also the unconditional path when EXT_TILE_DEBUG is off
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+
+ for (int row = 0; row < tile_rows; row++) {  // tile info is initialized for every tile, not only the selected range
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+ }
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ reset_dec_workers(pbi, tile_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);  // also stores the merged corruption flag in pbi->mb.corrupted
+
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;  // the last tile of the group marks the end of the data
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+ }
+
+ // Ensures pbi->cb_buffer_base holds at least one entry per superblock
+ // position of the frame. The entry count is
+ // ((mi_rows >> mib_size_log2) + 1) * ((mi_cols >> mib_size_log2) + 1).
+ // The buffer is only replaced when it is smaller than that; it is never
+ // shrunk.
+ static void dec_alloc_cb_buf(AV1Decoder *pbi) {
+   AV1_COMMON *const cm = &pbi->common;
+   const int rows = (cm->mi_rows >> cm->seq_params.mib_size_log2) + 1;
+   const int cols = (cm->mi_cols >> cm->seq_params.mib_size_log2) + 1;
+   const int size = rows * cols;
+
+   if (!(pbi->cb_buffer_alloc_size < size)) return;
+
+   av1_dec_free_cb_buf(pbi);
+   CHECK_MEM_ERROR(cm, pbi->cb_buffer_base,
+                   aom_memalign(32, sizeof(pbi->cb_buffer_base[0]) * size));
+   pbi->cb_buffer_alloc_size = size;
+ }
+
+ /* Resets the row-multithreading state before a tile group is decoded.
+  *
+  * The tile range and start/end tile are stored in pbi->frame_row_mt_info
+  * and its frame-wide counters and exit flag are cleared. For every tile in
+  * the given row/column range whose raster index lies in
+  * [start_tile, end_tile], the per-tile sync counters are cleared, mi_rows
+  * and mi_cols are set to the tile dimensions aligned up with
+  * mib_size_log2, mi_rows is added to the frame's mi_rows_to_decode, and
+  * the first max_sb_rows entries of cur_sb_col are set to -1.
+  *
+  * With CONFIG_MULTITHREAD, the row mutex and condition variable are
+  * allocated and initialized the first time they are found to be NULL.
+  */ static void row_mt_frame_init(AV1Decoder *pbi, int tile_rows_start,
+ int tile_rows_end, int tile_cols_start,
+ int tile_cols_end, int start_tile, int end_tile,
+ int max_sb_rows) {
+ AV1_COMMON *const cm = &pbi->common;
+ AV1DecRowMTInfo *frame_row_mt_info = &pbi->frame_row_mt_info;
+
+ frame_row_mt_info->tile_rows_start = tile_rows_start;
+ frame_row_mt_info->tile_rows_end = tile_rows_end;
+ frame_row_mt_info->tile_cols_start = tile_cols_start;
+ frame_row_mt_info->tile_cols_end = tile_cols_end;
+ frame_row_mt_info->start_tile = start_tile;
+ frame_row_mt_info->end_tile = end_tile;
+ frame_row_mt_info->mi_rows_to_decode = 0;  // accumulated over the selected tiles below
+ frame_row_mt_info->mi_rows_parse_done = 0;
+ frame_row_mt_info->mi_rows_decode_started = 0;
+ frame_row_mt_info->row_mt_exit = 0;
+
+ for (int tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+ for (int tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+ if (tile_row * cm->tile_cols + tile_col < start_tile ||
+ tile_row * cm->tile_cols + tile_col > end_tile)
+ continue;  // tile is outside the current tile group
+
+ TileDataDec *const tile_data =
+ pbi->tile_data + tile_row * cm->tile_cols + tile_col;
+ TileInfo tile_info = tile_data->tile_info;
+
+ tile_data->dec_row_mt_sync.mi_rows_parse_done = 0;
+ tile_data->dec_row_mt_sync.mi_rows_decode_started = 0;
+ tile_data->dec_row_mt_sync.num_threads_working = 0;
+ tile_data->dec_row_mt_sync.mi_rows =
+ ALIGN_POWER_OF_TWO(tile_info.mi_row_end - tile_info.mi_row_start,
+ cm->seq_params.mib_size_log2);
+ tile_data->dec_row_mt_sync.mi_cols =
+ ALIGN_POWER_OF_TWO(tile_info.mi_col_end - tile_info.mi_col_start,
+ cm->seq_params.mib_size_log2);
+
+ frame_row_mt_info->mi_rows_to_decode +=
+ tile_data->dec_row_mt_sync.mi_rows;
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(tile_data->dec_row_mt_sync.cur_sb_col, -1,
+ sizeof(*tile_data->dec_row_mt_sync.cur_sb_col) * max_sb_rows);
+ }
+ }
+
+ #if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ == NULL) {  // created once and reused on later calls
+ CHECK_MEM_ERROR(cm, pbi->row_mt_mutex_,
+ aom_malloc(sizeof(*(pbi->row_mt_mutex_))));
+ if (pbi->row_mt_mutex_) {
+ pthread_mutex_init(pbi->row_mt_mutex_, NULL);
+ }
+ }
+
+ if (pbi->row_mt_cond_ == NULL) {  // created once and reused on later calls
+ CHECK_MEM_ERROR(cm, pbi->row_mt_cond_,
+ aom_malloc(sizeof(*(pbi->row_mt_cond_))));
+ if (pbi->row_mt_cond_) {
+ pthread_cond_init(pbi->row_mt_cond_, NULL);
+ }
+ }
+ #endif
+ }
+
+ /* Row-multithreaded variant of tile decoding for tiles
+  * start_tile..end_tile (inclusive, raster index).
+  *
+  * Unlike decode_tiles_mt(), this always uses pbi->max_threads workers
+  * running row_mt_worker_hook, (re)allocates the per-tile row sync data for
+  * the largest superblock-row count of any tile, sizes the cb buffer with
+  * dec_alloc_cb_buf(), and resets the frame-level row-MT state with
+  * row_mt_frame_init() before launching the workers.
+  *
+  * Returns `data` unchanged when the selected tile range is empty. Otherwise
+  * returns the end of the consumed data: raw_data_end for large-scale-tile
+  * streams with more than one tile, or the position reported by
+  * aom_reader_find_end() for the relevant tile's bit reader.
+  * Reports AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when any
+  * worker flagged corruption.
+  */ static const uint8_t *decode_tiles_row_mt(AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ int start_tile, int end_tile) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ const int n_tiles = tile_cols * tile_rows;
+ TileBufferDec(*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+ const int dec_tile_row = AOMMIN(pbi->dec_tile_row, tile_rows);
+ const int single_row = pbi->dec_tile_row >= 0;  // a non-negative dec_tile_row selects one row
+ const int dec_tile_col = AOMMIN(pbi->dec_tile_col, tile_cols);
+ const int single_col = pbi->dec_tile_col >= 0;  // a non-negative dec_tile_col selects one column
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int tile_count_tg;
+ int num_workers;
+ const uint8_t *raw_data_end = NULL;  // only assigned in the EXT_TILE_DEBUG large-scale-tile path below
+ int max_sb_rows = 0;
+
+ if (cm->large_scale_tile) {  // only large-scale-tile streams honor the single row/column selection
+ tile_rows_start = single_row ? dec_tile_row : 0;
+ tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+ tile_cols_start = single_col ? dec_tile_col : 0;
+ tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+ } else {
+ tile_rows_start = 0;
+ tile_rows_end = tile_rows;
+ tile_cols_start = 0;
+ tile_cols_end = tile_cols;
+ }
+ tile_count_tg = end_tile - start_tile + 1;
+ num_workers = pbi->max_threads;  // not capped by the tile count: several workers may share one tile
+
+ // No tiles to decode.
+ if (tile_rows_end <= tile_rows_start || tile_cols_end <= tile_cols_start ||
+ // First tile is larger than end_tile.
+ tile_rows_start * tile_cols + tile_cols_start > end_tile ||
+ // Last tile is smaller than start_tile.
+ (tile_rows_end - 1) * tile_cols + tile_cols_end - 1 < start_tile)
+ return data;
+
+ assert(tile_rows <= MAX_TILE_ROWS);
+ assert(tile_cols <= MAX_TILE_COLS);
+ assert(tile_count_tg > 0);
+ assert(num_workers > 0);
+ assert(start_tile <= end_tile);
+ assert(start_tile >= 0 && end_tile < n_tiles);
+
+ (void)tile_count_tg;  // only used by the asserts above
+
+ decode_mt_init(pbi);
+
+ // get tile size in tile group
+ #if EXT_TILE_DEBUG
+ if (cm->large_scale_tile) assert(pbi->ext_tile_debug == 1);
+ if (cm->large_scale_tile)
+ raw_data_end = get_ls_tile_buffers(pbi, data, data_end, tile_buffers);
+ else
+ #endif // EXT_TILE_DEBUG
+ get_tile_buffers(pbi, data, data_end, tile_buffers, start_tile, end_tile);  // also the unconditional path when EXT_TILE_DEBUG is off
+
+ if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
+ for (int i = 0; i < pbi->allocated_tiles; i++) {  // release row sync data of the old tiles before reallocating
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ decoder_alloc_tile_data(pbi, n_tiles);
+ }
+
+ for (int row = 0; row < tile_rows; row++) {
+ for (int col = 0; col < tile_cols; col++) {
+ TileDataDec *tile_data = pbi->tile_data + row * cm->tile_cols + col;
+ av1_tile_init(&tile_data->tile_info, cm, row, col);
+
+ max_sb_rows = AOMMAX(max_sb_rows,
+ av1_get_sb_rows_in_tile(cm, tile_data->tile_info));
+ }
+ }
+
+ if (pbi->allocated_row_mt_sync_rows != max_sb_rows) {  // resize the sync data of every tile to the new maximum
+ for (int i = 0; i < n_tiles; ++i) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ dec_row_mt_alloc(&tile_data->dec_row_mt_sync, cm, max_sb_rows);
+ }
+ pbi->allocated_row_mt_sync_rows = max_sb_rows;
+ }
+
+ tile_mt_queue(pbi, tile_cols, tile_rows, tile_rows_start, tile_rows_end,
+ tile_cols_start, tile_cols_end, start_tile, end_tile);
+
+ dec_alloc_cb_buf(pbi);
+
+ row_mt_frame_init(pbi, tile_rows_start, tile_rows_end, tile_cols_start,
+ tile_cols_end, start_tile, end_tile, max_sb_rows);
+
+ reset_dec_workers(pbi, row_mt_worker_hook, num_workers);
+ launch_dec_workers(pbi, data_end, num_workers);
+ sync_dec_workers(pbi, num_workers);  // also stores the merged corruption flag in pbi->mb.corrupted
+
+ if (pbi->mb.corrupted)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Failed to decode tile data");
+
+ if (cm->large_scale_tile) {
+ if (n_tiles == 1) {
+ // Find the end of the single tile buffer
+ return aom_reader_find_end(&pbi->tile_data->bit_reader);
+ }
+ // Return the end of the last tile buffer
+ return raw_data_end;
+ }
+ TileDataDec *const tile_data = pbi->tile_data + end_tile;  // the last tile of the group marks the end of the data
+
+ return aom_reader_find_end(&tile_data->bit_reader);
+ }
+
+static void error_handler(void *data) {
+ AV1_COMMON *const cm = (AV1_COMMON *)data;
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME, "Truncated packet");
+}
+
+ // Parses the bit-depth part of color_config(): always reads high_bitdepth,
+ // and additionally reads twelve_bit when the profile is PROFILE_2 and
+ // high_bitdepth is set. seq_params->bit_depth is derived from those bits:
+ //   high_bitdepth == 0           -> AOM_BITS_8
+ //   high_bitdepth == 1           -> AOM_BITS_10
+ //   PROFILE_2 with twelve_bit    -> AOM_BITS_12
+ // Profiles above PROFILE_2 are rejected through aom_internal_error();
+ // read failures are reported by rb->error_handler().
+ static void read_bitdepth(struct aom_read_bit_buffer *rb,
+                           SequenceHeader *seq_params,
+                           struct aom_internal_error_info *error_info) {
+   const int high_bitdepth = aom_rb_read_bit(rb);
+
+   if (seq_params->profile > PROFILE_2) {
+     aom_internal_error(error_info, AOM_CODEC_UNSUP_BITSTREAM,
+                        "Unsupported profile/bit-depth combination");
+     return;
+   }
+
+   if (!high_bitdepth) {
+     seq_params->bit_depth = AOM_BITS_8;
+     return;
+   }
+
+   if (seq_params->profile == PROFILE_2) {
+     const int twelve_bit = aom_rb_read_bit(rb);
+     seq_params->bit_depth = twelve_bit ? AOM_BITS_12 : AOM_BITS_10;
+   } else {
+     seq_params->bit_depth = AOM_BITS_10;
+   }
+ }
+
+ /* Parses the film grain parameters from `rb` into cm->film_grain_params.
+  *
+  * - If apply_grain is 0, the whole structure is zeroed and parsing stops.
+  * - update_parameters is read only for INTER_FRAME frames; otherwise it is
+  *   forced to 1.
+  * - If update_parameters is 0, the parameters are copied from the frame
+  *   buffer selected by a 3-bit reference index, keeping only the random
+  *   seed that was just read.
+  * - Otherwise the scaling points, AR coefficients, multipliers/offsets and
+  *   flags are read in bitstream order.
+  *
+  * Constraint violations found while parsing are reported through
+  * aom_internal_error() with AOM_CODEC_UNSUP_BITSTREAM.
+  */ void av1_read_film_grain_params(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb) {
+ aom_film_grain_t *pars = &cm->film_grain_params;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+
+ pars->apply_grain = aom_rb_read_bit(rb);
+ if (!pars->apply_grain) {
+ memset(pars, 0, sizeof(*pars));
+ return;
+ }
+
+ pars->random_seed = aom_rb_read_literal(rb, 16);
+ if (cm->frame_type == INTER_FRAME)
+ pars->update_parameters = aom_rb_read_bit(rb);
+ else
+ pars->update_parameters = 1;
+
+ pars->bit_depth = seq_params->bit_depth;
+
+ if (!pars->update_parameters) {
+ // inherit parameters from a previous reference frame
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ int film_grain_params_ref_idx = aom_rb_read_literal(rb, 3);
+ int buf_idx = cm->ref_frame_map[film_grain_params_ref_idx];
+ if (buf_idx == INVALID_IDX) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Invalid Film grain reference idx");
+ }
+ if (!frame_bufs[buf_idx].film_grain_params_present) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Film grain reference parameters not available");
+ }
+ uint16_t random_seed = pars->random_seed;  // saved because the struct copy below overwrites it
+ *pars = frame_bufs[buf_idx].film_grain_params; // inherit parameters
+ pars->random_seed = random_seed; // with new random seed
+ return;
+ }
+
+ // Scaling functions parameters
+ pars->num_y_points = aom_rb_read_literal(rb, 4); // max 14
+ if (pars->num_y_points > 14)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain luma scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_y_points; i++) {
+ pars->scaling_points_y[i][0] = aom_rb_read_literal(rb, 8);
+ if (i && pars->scaling_points_y[i - 1][0] >= pars->scaling_points_y[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_y[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if (!seq_params->monochrome)
+ pars->chroma_scaling_from_luma = aom_rb_read_bit(rb);
+ else
+ pars->chroma_scaling_from_luma = 0;
+
+ if (seq_params->monochrome || pars->chroma_scaling_from_luma ||
+ ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (pars->num_y_points == 0))) {  // in these cases no chroma scaling points are read
+ pars->num_cb_points = 0;
+ pars->num_cr_points = 0;
+ } else {
+ pars->num_cb_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cb_points > 10)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cb scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cb_points; i++) {
+ pars->scaling_points_cb[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cb[i - 1][0] >= pars->scaling_points_cb[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cb[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ pars->num_cr_points = aom_rb_read_literal(rb, 4); // max 10
+ if (pars->num_cr_points > 10)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Number of points for film grain cr scaling function "
+ "exceeds the maximum value.");
+ for (int i = 0; i < pars->num_cr_points; i++) {
+ pars->scaling_points_cr[i][0] = aom_rb_read_literal(rb, 8);
+ if (i &&
+ pars->scaling_points_cr[i - 1][0] >= pars->scaling_points_cr[i][0])
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "First coordinate of the scaling function points "
+ "shall be increasing.");
+ pars->scaling_points_cr[i][1] = aom_rb_read_literal(rb, 8);
+ }
+
+ if ((seq_params->subsampling_x == 1) && (seq_params->subsampling_y == 1) &&
+ (((pars->num_cb_points == 0) && (pars->num_cr_points != 0)) ||
+ ((pars->num_cb_points != 0) && (pars->num_cr_points == 0))))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "In YCbCr 4:2:0, film grain shall be applied "
+ "to both chroma components or neither.");
+ }
+
+ pars->scaling_shift = aom_rb_read_literal(rb, 2) + 8; // 8 + value
+
+ // AR coefficients
+ // Only sent if the corresponding scaling function has
+ // more than 0 points
+
+ pars->ar_coeff_lag = aom_rb_read_literal(rb, 2);
+
+ int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ int num_pos_chroma = num_pos_luma;
+ if (pars->num_y_points > 0) ++num_pos_chroma;  // chroma gets one extra coefficient when luma points exist
+
+ if (pars->num_y_points)
+ for (int i = 0; i < num_pos_luma; i++)
+ pars->ar_coeffs_y[i] = aom_rb_read_literal(rb, 8) - 128;  // stored with a -128 offset
+
+ if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cb[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+ for (int i = 0; i < num_pos_chroma; i++)
+ pars->ar_coeffs_cr[i] = aom_rb_read_literal(rb, 8) - 128;
+
+ pars->ar_coeff_shift = aom_rb_read_literal(rb, 2) + 6; // 6 + value
+
+ pars->grain_scale_shift = aom_rb_read_literal(rb, 2);
+
+ if (pars->num_cb_points) {
+ pars->cb_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cb_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ if (pars->num_cr_points) {
+ pars->cr_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_luma_mult = aom_rb_read_literal(rb, 8);
+ pars->cr_offset = aom_rb_read_literal(rb, 9);
+ }
+
+ pars->overlap_flag = aom_rb_read_bit(rb);
+
+ pars->clip_to_restricted_range = aom_rb_read_bit(rb);
+ }
+
+ // Fills cm->film_grain_params for the current frame and mirrors it into
+ // cm->cur_frame. The parameters are parsed from `rb` only when the sequence
+ // header signals film grain and the frame is shown or showable; otherwise
+ // they are zeroed. In both cases bit_depth is taken from the sequence
+ // parameters before the copy.
+ static void read_film_grain(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+   const int grain_signalled = cm->seq_params.film_grain_params_present &&
+                               (cm->show_frame || cm->showable_frame);
+
+   if (!grain_signalled) {
+     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+   } else {
+     av1_read_film_grain_params(cm, rb);
+   }
+
+   cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+   memcpy(&cm->cur_frame->film_grain_params, &cm->film_grain_params,
+          sizeof(aom_film_grain_t));
+ }
+
+ /* Parses the color configuration from `rb` into `seq_params`.
+  *
+  * Reads the bit depth (read_bitdepth), derives use_highbitdepth (forced on
+  * when allow_lowbitdepth is 0), then reads the monochrome flag, the
+  * optional color description, the color range, the chroma subsampling
+  * implied or signalled for the profile, the chroma sample position (only
+  * when both subsampling flags are set) and separate_uv_delta_q.
+  *
+  * Monochrome streams return early with both subsampling flags set to 1 and
+  * separate_uv_delta_q set to 0. Unsupported combinations are reported
+  * through aom_internal_error() with AOM_CODEC_UNSUP_BITSTREAM on
+  * `error_info`.
+  */ void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info) {
+ read_bitdepth(rb, seq_params, error_info);
+
+ seq_params->use_highbitdepth =
+ seq_params->bit_depth > AOM_BITS_8 || !allow_lowbitdepth;
+ // monochrome bit (not needed for PROFILE_1)
+ const int is_monochrome =
+ seq_params->profile != PROFILE_1 ? aom_rb_read_bit(rb) : 0;
+ seq_params->monochrome = is_monochrome;
+ int color_description_present_flag = aom_rb_read_bit(rb);
+ if (color_description_present_flag) {
+ seq_params->color_primaries = aom_rb_read_literal(rb, 8);
+ seq_params->transfer_characteristics = aom_rb_read_literal(rb, 8);
+ seq_params->matrix_coefficients = aom_rb_read_literal(rb, 8);
+ } else {
+ seq_params->color_primaries = AOM_CICP_CP_UNSPECIFIED;
+ seq_params->transfer_characteristics = AOM_CICP_TC_UNSPECIFIED;
+ seq_params->matrix_coefficients = AOM_CICP_MC_UNSPECIFIED;
+ }
+ if (is_monochrome) {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ seq_params->subsampling_y = seq_params->subsampling_x = 1;
+ seq_params->chroma_sample_position = AOM_CSP_UNKNOWN;
+ seq_params->separate_uv_delta_q = 0;
+ return;  // nothing else is read for monochrome
+ }
+ if (seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+ seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+ seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ // It would be good to remove this dependency.
+ seq_params->subsampling_y = seq_params->subsampling_x = 0;
+ seq_params->color_range = 1; // assume full color-range
+ if (!(seq_params->profile == PROFILE_1 ||
+ (seq_params->profile == PROFILE_2 &&
+ seq_params->bit_depth == AOM_BITS_12))) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "sRGB colorspace not compatible with specified profile");
+ }
+ } else {
+ // [16,235] (including xvycc) vs [0,255] range
+ seq_params->color_range = aom_rb_read_bit(rb);
+ if (seq_params->profile == PROFILE_0) {
+ // 420 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 1;
+ } else if (seq_params->profile == PROFILE_1) {
+ // 444 only
+ seq_params->subsampling_x = seq_params->subsampling_y = 0;
+ } else {
+ assert(seq_params->profile == PROFILE_2);
+ if (seq_params->bit_depth == AOM_BITS_12) {  // only 12-bit PROFILE_2 signals subsampling explicitly
+ seq_params->subsampling_x = aom_rb_read_bit(rb);
+ if (seq_params->subsampling_x)
+ seq_params->subsampling_y = aom_rb_read_bit(rb); // 422 or 420
+ else
+ seq_params->subsampling_y = 0; // 444
+ } else {
+ // 422
+ seq_params->subsampling_x = 1;
+ seq_params->subsampling_y = 0;
+ }
+ }
+ if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY &&
+ (seq_params->subsampling_x || seq_params->subsampling_y)) {
+ aom_internal_error(
+ error_info, AOM_CODEC_UNSUP_BITSTREAM,
+ "Identity CICP Matrix incompatible with non 4:4:4 color sampling");
+ }
+ if (seq_params->subsampling_x && seq_params->subsampling_y) {  // chroma_sample_position is left untouched otherwise
+ seq_params->chroma_sample_position = aom_rb_read_literal(rb, 2);
+ }
+ }
+ seq_params->separate_uv_delta_q = aom_rb_read_bit(rb);
+ }
+
+ // Parses the timing info header from `rb` into cm->timing_info, in
+ // bitstream order: 32-bit num_units_in_display_tick, 32-bit time_scale, the
+ // equal_picture_interval bit and, when that bit is set, a uvlc-coded
+ // num_ticks_per_picture_minus_1 (stored with 1 added).
+ // A zero tick count or time scale, or a ticks-per-picture value that wraps
+ // to 0, is reported through aom_internal_error().
+ void av1_read_timing_info_header(AV1_COMMON *cm,
+                                  struct aom_read_bit_buffer *rb) {
+   // Number of units in a display tick
+   cm->timing_info.num_units_in_display_tick =
+       aom_rb_read_unsigned_literal(rb, 32);
+   // Time scale
+   cm->timing_info.time_scale = aom_rb_read_unsigned_literal(rb, 32);
+
+   const int has_zero_field = cm->timing_info.num_units_in_display_tick == 0 ||
+                              cm->timing_info.time_scale == 0;
+   if (has_zero_field) {
+     aom_internal_error(
+         &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+         "num_units_in_display_tick and time_scale must be greater than 0.");
+   }
+
+   // Equal picture interval bit
+   cm->timing_info.equal_picture_interval = aom_rb_read_bit(rb);
+   if (!cm->timing_info.equal_picture_interval) return;
+
+   // ticks per picture
+   cm->timing_info.num_ticks_per_picture = aom_rb_read_uvlc(rb) + 1;
+   if (cm->timing_info.num_ticks_per_picture == 0) {
+     aom_internal_error(
+         &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+         "num_ticks_per_picture_minus_1 cannot be (1 << 32) − 1.");
+   }
+ }
+
+// Reads the decoder model info fields from |rb| into cm->buffer_model. The
+// three length fields are each coded in 5 bits as "length minus 1".
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+                                 struct aom_read_bit_buffer *rb) {
+  const int buffer_delay_length_minus_1 = aom_rb_read_literal(rb, 5);
+  cm->buffer_model.encoder_decoder_buffer_delay_length =
+      buffer_delay_length_minus_1 + 1;
+
+  // Number of units in a decoding tick (raw 32-bit value).
+  cm->buffer_model.num_units_in_decoding_tick =
+      aom_rb_read_unsigned_literal(rb, 32);
+
+  const int removal_time_length_minus_1 = aom_rb_read_literal(rb, 5);
+  cm->buffer_model.buffer_removal_time_length =
+      removal_time_length_minus_1 + 1;
+
+  const int presentation_time_length_minus_1 = aom_rb_read_literal(rb, 5);
+  cm->buffer_model.frame_presentation_time_length =
+      presentation_time_length_minus_1 + 1;
+}
+
+// Reads the decoder model parameters of operating point |op_num| from |rb|
+// into cm->op_params[op_num]: decoder buffer delay, encoder buffer delay (both
+// cm->buffer_model.encoder_decoder_buffer_delay_length bits wide) and the low
+// delay mode flag.
+//
+// An out-of-range |op_num| is reported through aom_internal_error and nothing
+// is read or written.
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+                                 struct aom_read_bit_buffer *rb, int op_num) {
+  // The cm->op_params array has MAX_NUM_OPERATING_POINTS + 1 elements, so the
+  // valid indices are [0, MAX_NUM_OPERATING_POINTS].
+  if (op_num < 0 || op_num > MAX_NUM_OPERATING_POINTS) {
+    aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+                       "AV1 does not support %d decoder model operating points",
+                       op_num + 1);
+    // NOTE(review): aom_internal_error is expected not to return when a jump
+    // target is armed; bail out regardless so that an invalid index is never
+    // used on cm->op_params.
+    return;
+  }
+
+  cm->op_params[op_num].decoder_buffer_delay = aom_rb_read_unsigned_literal(
+      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  cm->op_params[op_num].encoder_buffer_delay = aom_rb_read_unsigned_literal(
+      rb, cm->buffer_model.encoder_decoder_buffer_delay_length);
+
+  cm->op_params[op_num].low_delay_mode_flag = aom_rb_read_bit(rb);
+}
+
+// Reads cm->frame_presentation_time from |rb|. The field width comes from
+// cm->buffer_model.frame_presentation_time_length.
+static void av1_read_temporal_point_info(AV1_COMMON *const cm,
+                                         struct aom_read_bit_buffer *rb) {
+  const int presentation_time_bits =
+      cm->buffer_model.frame_presentation_time_length;
+  cm->frame_presentation_time =
+      aom_rb_read_unsigned_literal(rb, presentation_time_bits);
+}
+
+// Reads the frame-size limits, frame id lengths, superblock size and the
+// sequence-level tool enable flags from |rb| into |seq_params|.
+// seq_params->reduced_still_picture_hdr must already be set by the caller: it
+// selects between implied values and values read from the bitstream.
+// An over-long frame id length is reported on cm->error.
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+                              SequenceHeader *seq_params) {
+  // Bit widths of the frame dimensions, then the maximum dimensions coded
+  // with those widths. Everything here is coded "minus 1".
+  seq_params->num_bits_width = aom_rb_read_literal(rb, 4) + 1;
+  seq_params->num_bits_height = aom_rb_read_literal(rb, 4) + 1;
+  seq_params->max_frame_width =
+      aom_rb_read_literal(rb, seq_params->num_bits_width) + 1;
+  seq_params->max_frame_height =
+      aom_rb_read_literal(rb, seq_params->num_bits_height) + 1;
+
+  // Frame ids are never present with the reduced still picture header.
+  seq_params->frame_id_numbers_present_flag =
+      seq_params->reduced_still_picture_hdr ? 0 : aom_rb_read_bit(rb);
+
+  if (seq_params->frame_id_numbers_present_flag) {
+    // delta_frame_id_length must always be smaller than frame_id_length so
+    // that a frame can be referenced with a unique delta. The coding below
+    // enforces that by construction instead of spending bits on it.
+    const int delta_id_length = aom_rb_read_literal(rb, 4) + 2;
+    const int id_length = aom_rb_read_literal(rb, 3) + delta_id_length + 1;
+    seq_params->delta_frame_id_length = delta_id_length;
+    seq_params->frame_id_length = id_length;
+    if (id_length > 16) {
+      aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid frame_id_length");
+    }
+  }
+
+  setup_sb_size(seq_params, rb);
+
+  seq_params->enable_filter_intra = aom_rb_read_bit(rb);
+  seq_params->enable_intra_edge_filter = aom_rb_read_bit(rb);
+
+  if (!seq_params->reduced_still_picture_hdr) {
+    seq_params->enable_interintra_compound = aom_rb_read_bit(rb);
+    seq_params->enable_masked_compound = aom_rb_read_bit(rb);
+    seq_params->enable_warped_motion = aom_rb_read_bit(rb);
+    seq_params->enable_dual_filter = aom_rb_read_bit(rb);
+
+    // The two order-hint dependent tools are only coded when order hints are
+    // enabled; otherwise they are off.
+    seq_params->enable_order_hint = aom_rb_read_bit(rb);
+    if (seq_params->enable_order_hint) {
+      seq_params->enable_jnt_comp = aom_rb_read_bit(rb);
+      seq_params->enable_ref_frame_mvs = aom_rb_read_bit(rb);
+    } else {
+      seq_params->enable_jnt_comp = 0;
+      seq_params->enable_ref_frame_mvs = 0;
+    }
+
+    // A leading 1 bit means "select per frame" (2); otherwise the forced
+    // value follows as one bit.
+    const int select_screen_content_tools = aom_rb_read_bit(rb);
+    if (select_screen_content_tools) {
+      // SELECT_SCREEN_CONTENT_TOOLS
+      seq_params->force_screen_content_tools = 2;
+    } else {
+      seq_params->force_screen_content_tools = aom_rb_read_bit(rb);
+    }
+
+    // force_integer_mv is only coded when screen content tools may be used,
+    // and then with the same "select bit, else value bit" scheme.
+    if (seq_params->force_screen_content_tools > 0 && !aom_rb_read_bit(rb)) {
+      seq_params->force_integer_mv = aom_rb_read_bit(rb);
+    } else {
+      seq_params->force_integer_mv = 2;  // SELECT_INTEGER_MV
+    }
+
+    if (seq_params->enable_order_hint) {
+      seq_params->order_hint_bits_minus_1 = aom_rb_read_literal(rb, 3);
+    } else {
+      seq_params->order_hint_bits_minus_1 = -1;
+    }
+  } else {
+    // Reduced still picture header: nothing is read, all inter tools are off
+    // and the screen content decisions are left to the frame header.
+    seq_params->enable_interintra_compound = 0;
+    seq_params->enable_masked_compound = 0;
+    seq_params->enable_warped_motion = 0;
+    seq_params->enable_dual_filter = 0;
+    seq_params->enable_order_hint = 0;
+    seq_params->enable_jnt_comp = 0;
+    seq_params->enable_ref_frame_mvs = 0;
+    seq_params->force_screen_content_tools = 2;  // SELECT_SCREEN_CONTENT_TOOLS
+    seq_params->force_integer_mv = 2;            // SELECT_INTEGER_MV
+    seq_params->order_hint_bits_minus_1 = -1;
+  }
+
+  seq_params->enable_superres = aom_rb_read_bit(rb);
+  seq_params->enable_cdef = aom_rb_read_bit(rb);
+  seq_params->enable_restoration = aom_rb_read_bit(rb);
+}
+
+// Reads one set of global motion parameters from |rb| into |params|. Each
+// coefficient is coded relative to the matching entry of |ref_params|.
+// |allow_hp| changes the precision used for translation-only models.
+// Returns 0 when get_shear_params() rejects the decoded model, 1 otherwise.
+static int read_global_motion_params(WarpedMotionParams *params,
+                                     const WarpedMotionParams *ref_params,
+                                     struct aom_read_bit_buffer *rb,
+                                     int allow_hp) {
+  // Model type: first bit 0 -> identity; otherwise a second bit selects
+  // ROTZOOM, and failing that a third bit picks TRANSLATION over AFFINE.
+  TransformationType type = aom_rb_read_bit(rb);
+  if (type != IDENTITY) {
+    if (aom_rb_read_bit(rb)) {
+      type = ROTZOOM;
+    } else if (aom_rb_read_bit(rb)) {
+      type = TRANSLATION;
+    } else {
+      type = AFFINE;
+    }
+  }
+
+  // Start from the default model so that entries not coded for this type
+  // keep their default values.
+  *params = default_warp_params;
+  params->wmtype = type;
+
+  if (type >= ROTZOOM) {
+    params->wmmat[2] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+                               (1 << GM_ALPHA_PREC_BITS)) *
+                           GM_ALPHA_DECODE_FACTOR +
+                       (1 << WARPEDMODEL_PREC_BITS);
+    params->wmmat[3] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF)) *
+                       GM_ALPHA_DECODE_FACTOR;
+  }
+
+  if (type >= AFFINE) {
+    params->wmmat[4] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF)) *
+                       GM_ALPHA_DECODE_FACTOR;
+    params->wmmat[5] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+                               (1 << GM_ALPHA_PREC_BITS)) *
+                           GM_ALPHA_DECODE_FACTOR +
+                       (1 << WARPEDMODEL_PREC_BITS);
+  } else {
+    // Not a full affine model: the second row is derived from the first.
+    params->wmmat[4] = -params->wmmat[3];
+    params->wmmat[5] = params->wmmat[2];
+  }
+
+  if (type >= TRANSLATION) {
+    // Translation-only models use their own bit count, decode factor and
+    // precision shift, each adjusted by one step when |allow_hp| is 0.
+    int abs_bits;
+    int decode_scale;
+    int prec_shift;
+    if (type == TRANSLATION) {
+      abs_bits = GM_ABS_TRANS_ONLY_BITS - !allow_hp;
+      decode_scale = GM_TRANS_ONLY_DECODE_FACTOR * (1 << !allow_hp);
+      prec_shift = GM_TRANS_ONLY_PREC_DIFF + !allow_hp;
+    } else {
+      abs_bits = GM_ABS_TRANS_BITS;
+      decode_scale = GM_TRANS_DECODE_FACTOR;
+      prec_shift = GM_TRANS_PREC_DIFF;
+    }
+    params->wmmat[0] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, (1 << abs_bits) + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[0] >> prec_shift)) *
+                       decode_scale;
+    params->wmmat[1] = aom_rb_read_signed_primitive_refsubexpfin(
+                           rb, (1 << abs_bits) + 1, SUBEXPFIN_K,
+                           (ref_params->wmmat[1] >> prec_shift)) *
+                       decode_scale;
+  }
+
+  // The shear parameters are derived (and validated) for every model whose
+  // type does not exceed AFFINE.
+  if (params->wmtype <= AFFINE && !get_shear_params(params)) return 0;
+
+  return 1;
+}
+
+// Reads the global motion parameters of every reference from LAST_FRAME to
+// ALTREF_FRAME into cm->global_motion, then copies the table into
+// cm->cur_frame->global_motion.
+static void read_global_motion(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
+  for (int frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+    // Parameters are coded relative to the previous frame's model for the
+    // same reference, or to the default model when there is no previous frame.
+    const WarpedMotionParams *ref_params = &default_warp_params;
+    if (cm->prev_frame) ref_params = &cm->prev_frame->global_motion[frame];
+
+    if (!read_global_motion_params(&cm->global_motion[frame], ref_params, rb,
+                                   cm->allow_high_precision_mv)) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected global motion shear params from aomenc\n");
+#endif
+      cm->global_motion[frame].invalid = 1;
+    }
+
+    // TODO(sarahparker, debargha): An optimization that only reads the
+    // parameters when the reference buffer has the same cropped size as the
+    // current frame (and uses default_warp_params otherwise) is disabled: it
+    // does not work currently and causes mismatches when resize is on. Fix it
+    // before turning the optimization back on.
+  }
+
+  memcpy(cm->cur_frame->global_motion, cm->global_motion,
+         REF_FRAMES * sizeof(WarpedMotionParams));
+}
+
+// Updates decoder state when a frame shown through show_existing_frame has to
+// be treated as a key frame: every reference slot is refreshed with the shown
+// frame, the inter references are cleared, next_ref_frame_map is generated and
+// the frame context saved for slot |existing_frame_idx| is reloaded.
+static void show_existing_frame_reset(AV1Decoder *const pbi,
+                                      int existing_frame_idx) {
+  AV1_COMMON *const cm = &pbi->common;
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+  assert(cm->show_existing_frame);
+
+  cm->frame_type = KEY_FRAME;
+
+  // Refresh all reference slots.
+  pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+  for (int ref = 0; ref < INTER_REFS_PER_FRAME; ++ref) {
+    cm->frame_refs[ref].idx = INVALID_IDX;
+    cm->frame_refs[ref].buf = NULL;
+  }
+
+  if (pbi->need_resync) {
+    memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+    pbi->need_resync = 0;
+  }
+
+  cm->cur_frame->intra_only = 1;
+
+  if (cm->seq_params.frame_id_numbers_present_flag) {
+    // For every slot selected by the refresh bitmask, record the id of the
+    // displayed frame and mark the slot as valid for referencing. The
+    // displayed frame itself must have been valid for referencing in order to
+    // have been selected.
+    const int refresh_mask = pbi->refresh_frame_flags;
+    const int shown_frame_id = cm->ref_frame_id[existing_frame_idx];
+    for (int slot = 0; slot < REF_FRAMES; slot++) {
+      if (!((refresh_mask >> slot) & 1)) continue;
+      cm->ref_frame_id[slot] = shown_frame_id;
+      cm->valid_for_referencing[slot] = 1;
+    }
+  }
+
+  cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+  // Generate next_ref_frame_map.
+  lock_buffer_pool(pool);
+  int ref_index = 0;
+  int mask = pbi->refresh_frame_flags;
+  while (mask) {
+    if (mask & 1) {
+      cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+      ++frame_bufs[cm->new_fb_idx].ref_count;
+    } else {
+      cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+    }
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0) {
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    }
+    ++ref_index;
+    mask >>= 1;
+  }
+
+  // Slots past the highest set bit of the mask keep their current mapping.
+  while (ref_index < REF_FRAMES) {
+    cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+    // Current thread holds the reference frame.
+    if (cm->ref_frame_map[ref_index] >= 0) {
+      ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+    }
+    ++ref_index;
+  }
+  unlock_buffer_pool(pool);
+  pbi->hold_ref_buf = 1;
+
+  // Reload the adapted CDFs from when we originally coded this keyframe
+  *cm->fc = cm->frame_contexts[existing_frame_idx];
+}
+
+// Clears both reference maps and resets every buffer in the pool. All buffers
+// except the one at cm->new_fb_idx get a zero ref count and are handed to the
+// pool's release callback; frame offsets are zeroed for all of them.
+static INLINE void reset_frame_buffers(AV1_COMMON *cm) {
+  BufferPool *const pool = cm->buffer_pool;
+  RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+  lock_buffer_pool(pool);
+  for (int buf = 0; buf < FRAME_BUFFERS; ++buf) {
+    if (buf == cm->new_fb_idx) {
+      // The buffer of the frame being decoded stays alive with its single
+      // reference.
+      assert(frame_bufs[buf].ref_count == 1);
+    } else {
+      frame_bufs[buf].ref_count = 0;
+      pool->release_fb_cb(pool->cb_priv, &frame_bufs[buf].raw_frame_buffer);
+    }
+    frame_bufs[buf].cur_frame_offset = 0;
+    av1_zero(frame_bufs[buf].ref_frame_offset);
+  }
+  av1_zero_unused_internal_frame_buffers(&pool->int_frame_buffers);
+  unlock_buffer_pool(pool);
+}
+
+// Parses the uncompressed frame header from rb into pbi and pbi->common.
+/* Covers, in bitstream order: show_existing_frame handling, frame type and
+ * flags, frame ids, refresh flags and reference selection, frame size,
+ * next_ref_frame_map generation, and then the tile info, quantization,
+ * segmentation, delta q/lf, loop filter, CDEF, restoration, tx mode,
+ * reference mode, global motion and film grain sections.
+ * On success, returns 0. On failure, calls aom_internal_error and does not
+ * return. */
+static int read_uncompressed_header(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ MACROBLOCKD *const xd = &pbi->mb;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = pool->frame_bufs;
+
+ if (!pbi->sequence_header_ready) { // a frame header is rejected until a sequence header is ready
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "No sequence header");
+ }
+
+ cm->last_frame_type = cm->frame_type; // keep the previous values before they are overwritten below
+ cm->last_intra_only = cm->intra_only;
+
+ // NOTE: By default all coded frames are to be used as a reference
+ cm->is_reference_frame = 1;
+
+ if (seq_params->reduced_still_picture_hdr) { // reduced header: these fields are implied, nothing is read
+ cm->show_existing_frame = 0;
+ cm->show_frame = 1;
+ cm->frame_type = KEY_FRAME;
+ cm->error_resilient_mode = 1;
+ } else {
+ cm->show_existing_frame = aom_rb_read_bit(rb);
+ cm->reset_decoder_state = 0;
+
+ if (cm->show_existing_frame) {
+ if (pbi->sequence_header_changed) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "New sequence header starts with a show_existing_frame.");
+ }
+ // Show an existing frame directly.
+ const int existing_frame_idx = aom_rb_read_literal(rb, 3); // 3-bit slot index into ref_frame_map
+ const int frame_to_show = cm->ref_frame_map[existing_frame_idx];
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0) {
+ av1_read_temporal_point_info(cm, rb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int display_frame_id = aom_rb_read_literal(rb, frame_id_length);
+ /* Compare display_frame_id with ref_frame_id and check valid for
+ * referencing */
+ if (display_frame_id != cm->ref_frame_id[existing_frame_idx] ||
+ cm->valid_for_referencing[existing_frame_idx] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ lock_buffer_pool(pool);
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { // empty slot or unreferenced buffer
+ unlock_buffer_pool(pool); // release the lock before reporting the error
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a decoded frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); // NOTE(review): presumably makes new_fb_idx refer to frame_to_show and adjusts ref counts - confirm
+ cm->reset_decoder_state =
+ frame_bufs[frame_to_show].frame_type == KEY_FRAME; // re-showing a key frame triggers the reset path below
+ unlock_buffer_pool(pool);
+
+ cm->lf.filter_level[0] = 0; // loop filter levels are zeroed for a re-shown frame
+ cm->lf.filter_level[1] = 0;
+ cm->show_frame = 1;
+
+ if (!frame_bufs[frame_to_show].showable_frame) { // showing a frame not flagged showable marks xd as corrupted
+ aom_merge_corrupted_flag(&xd->corrupted, 1);
+ }
+ if (cm->reset_decoder_state) frame_bufs[frame_to_show].showable_frame = 0;
+
+ cm->film_grain_params = frame_bufs[frame_to_show].film_grain_params;
+
+ if (cm->reset_decoder_state) {
+ show_existing_frame_reset(pbi, existing_frame_idx);
+ } else {
+ pbi->refresh_frame_flags = 0; // no reference slot is refreshed
+ }
+
+ return 0; // nothing further is parsed for show_existing_frame
+ }
+
+ cm->frame_type = (FRAME_TYPE)aom_rb_read_literal(rb, 2); // 2 bits
+ if (pbi->sequence_header_changed) {
+ if (pbi->common.frame_type == KEY_FRAME) {
+ // This is the start of a new coded video sequence.
+ pbi->sequence_header_changed = 0;
+ pbi->decoding_first_frame = 1;
+ reset_frame_buffers(&pbi->common);
+ } else {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Sequence header has changed without a keyframe.");
+ }
+ }
+
+ cm->show_frame = aom_rb_read_bit(rb);
+ if (seq_params->still_picture &&
+ (cm->frame_type != KEY_FRAME || !cm->show_frame)) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Still pictures must be coded as shown keyframes");
+ }
+ cm->showable_frame = cm->frame_type != KEY_FRAME; // default; replaced by an explicit bit below when the frame is not shown
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0)
+ av1_read_temporal_point_info(cm, rb);
+ } else {
+ // See if this frame can be used as show_existing_frame in future
+ cm->showable_frame = aom_rb_read_bit(rb);
+ }
+ cm->cur_frame->showable_frame = cm->showable_frame;
+ cm->intra_only = cm->frame_type == INTRA_ONLY_FRAME;
+ cm->error_resilient_mode =
+ frame_is_sframe(cm) || (cm->frame_type == KEY_FRAME && cm->show_frame) // implied 1 for these frames, no bit is read
+ ? 1
+ : aom_rb_read_bit(rb);
+ }
+
+ cm->disable_cdf_update = aom_rb_read_bit(rb);
+ if (seq_params->force_screen_content_tools == 2) { // 2 = selected per frame, so read the bit here
+ cm->allow_screen_content_tools = aom_rb_read_bit(rb);
+ } else {
+ cm->allow_screen_content_tools = seq_params->force_screen_content_tools;
+ }
+
+ if (cm->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) { // 2 = selected per frame, so read the bit here
+ cm->cur_frame_force_integer_mv = aom_rb_read_bit(rb);
+ } else {
+ cm->cur_frame_force_integer_mv = seq_params->force_integer_mv;
+ }
+ } else {
+ cm->cur_frame_force_integer_mv = 0;
+ }
+
+ cm->frame_refs_short_signaling = 0; // defaults for fields that are only conditionally read below
+ int frame_size_override_flag = 0;
+ cm->allow_intrabc = 0;
+ cm->primary_ref_frame = PRIMARY_REF_NONE;
+
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int prev_frame_id = 0;
+ int have_prev_frame_id = !pbi->decoding_first_frame &&
+ !(cm->frame_type == KEY_FRAME && cm->show_frame); // no previous id to compare against for the first frame or a shown key frame
+ if (have_prev_frame_id) {
+ prev_frame_id = cm->current_frame_id;
+ }
+ cm->current_frame_id = aom_rb_read_literal(rb, frame_id_length);
+
+ if (have_prev_frame_id) {
+ int diff_frame_id; // forward distance from prev_frame_id, modulo 1 << frame_id_length
+ if (cm->current_frame_id > prev_frame_id) {
+ diff_frame_id = cm->current_frame_id - prev_frame_id;
+ } else {
+ diff_frame_id =
+ (1 << frame_id_length) + cm->current_frame_id - prev_frame_id;
+ }
+ /* Check current_frame_id for conformance */
+ if (prev_frame_id == cm->current_frame_id ||
+ diff_frame_id >= (1 << (frame_id_length - 1))) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Invalid value of current_frame_id");
+ }
+ }
+ /* Check if some frames need to be marked as not valid for referencing */
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ cm->valid_for_referencing[i] = 0;
+ } else if (cm->current_frame_id - (1 << diff_len) > 0) { // the window of 1 << diff_len ids below the current id does not wrap
+ if (cm->ref_frame_id[i] > cm->current_frame_id ||
+ cm->ref_frame_id[i] < cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ } else { // the window wraps around the id space
+ if (cm->ref_frame_id[i] > cm->current_frame_id &&
+ cm->ref_frame_id[i] < (1 << frame_id_length) +
+ cm->current_frame_id - (1 << diff_len))
+ cm->valid_for_referencing[i] = 0;
+ }
+ }
+ }
+
+ frame_size_override_flag = frame_is_sframe(cm) ? 1 : aom_rb_read_bit(rb);
+
+ cm->frame_offset =
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1); // order hint; the width is 0 bits when order_hint_bits_minus_1 is -1
+ cm->current_video_frame = cm->frame_offset;
+
+ if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+ cm->primary_ref_frame = aom_rb_read_literal(rb, PRIMARY_REF_BITS);
+ }
+ }
+
+ if (seq_params->decoder_model_info_present_flag) {
+ cm->buffer_removal_time_present = aom_rb_read_bit(rb);
+ if (cm->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (cm->op_params[op_num].decoder_model_param_present_flag) {
+ if ((((seq_params->operating_point_idc[op_num] >> // idc bits 0.. are tested against the temporal layer id,
+ cm->temporal_layer_id) &
+ 0x1) &&
+ ((seq_params->operating_point_idc[op_num] >> // and bits 8.. against the spatial layer id
+ (cm->spatial_layer_id + 8)) &
+ 0x1)) ||
+ seq_params->operating_point_idc[op_num] == 0) {
+ cm->op_frame_timing[op_num].buffer_removal_time =
+ aom_rb_read_unsigned_literal(
+ rb, cm->buffer_model.buffer_removal_time_length);
+ } else {
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
+ }
+ } else {
+ cm->op_frame_timing[op_num].buffer_removal_time = 0;
+ }
+ }
+ }
+ }
+ if (cm->frame_type == KEY_FRAME) {
+ if (!cm->show_frame) // unshown keyframe (forward keyframe)
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ else // shown keyframe
+ pbi->refresh_frame_flags = (1 << REF_FRAMES) - 1;
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { // a key frame has no inter references
+ cm->frame_refs[i].idx = INVALID_IDX;
+ cm->frame_refs[i].buf = NULL;
+ }
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else {
+ if (cm->intra_only) {
+ pbi->refresh_frame_flags = aom_rb_read_literal(rb, REF_FRAMES);
+ if (pbi->refresh_frame_flags == 0xFF) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Intra only frames cannot have refresh flags 0xFF");
+ }
+ if (pbi->need_resync) {
+ memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+ pbi->need_resync = 0;
+ }
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+ pbi->refresh_frame_flags =
+ frame_is_sframe(cm) ? 0xFF : aom_rb_read_literal(rb, REF_FRAMES);
+ if (!pbi->refresh_frame_flags) {
+ // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+ }
+ }
+
+ if (!frame_is_intra_only(cm) || pbi->refresh_frame_flags != 0xFF) {
+ // Read all ref frame order hints if error_resilient_mode == 1
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ // Read order hint from bit stream
+ unsigned int frame_offset =
+ aom_rb_read_literal(rb, seq_params->order_hint_bits_minus_1 + 1);
+ // Get buffer index
+ int buf_idx = cm->ref_frame_map[ref_idx];
+ assert(buf_idx < FRAME_BUFFERS);
+ if (buf_idx == -1 ||
+ frame_offset != frame_bufs[buf_idx].cur_frame_offset) { // slot is empty or holds a frame with a different order hint
+ if (buf_idx >= 0) {
+ lock_buffer_pool(pool);
+ decrease_ref_count(buf_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
+ // If no corresponding buffer exists, allocate a new buffer with all
+ // pixels set to neutral grey.
+ buf_idx = get_free_fb(cm);
+ if (buf_idx == INVALID_IDX) {
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Unable to find free frame buffer");
+ }
+ lock_buffer_pool(pool);
+ if (aom_realloc_frame_buffer(
+ &frame_bufs[buf_idx].buf, seq_params->max_frame_width,
+ seq_params->max_frame_height, seq_params->subsampling_x,
+ seq_params->subsampling_y, seq_params->use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment,
+ &pool->frame_bufs[buf_idx].raw_frame_buffer, pool->get_fb_cb,
+ pool->cb_priv)) {
+ unlock_buffer_pool(pool); // release the lock before reporting the error
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ }
+ unlock_buffer_pool(pool);
+ set_planes_to_neutral_grey(seq_params, &frame_bufs[buf_idx].buf, 0);
+
+ cm->ref_frame_map[ref_idx] = buf_idx; // the slot now points at the placeholder buffer
+ frame_bufs[buf_idx].cur_frame_offset = frame_offset;
+ }
+ }
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME) {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm)) // intrabc bit is only coded with screen content tools and no superres scaling
+ cm->allow_intrabc = aom_rb_read_bit(rb);
+ cm->allow_ref_frame_mvs = 0;
+ cm->prev_frame = NULL;
+ } else {
+ cm->allow_ref_frame_mvs = 0;
+
+ if (cm->intra_only) {
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ cm->allow_intrabc = aom_rb_read_bit(rb);
+
+ } else if (pbi->need_resync != 1) { /* Skip if need resync */
+
+ // Frame refs short signaling is off when error resilient mode is on.
+ if (seq_params->enable_order_hint)
+ cm->frame_refs_short_signaling = aom_rb_read_bit(rb);
+
+ if (cm->frame_refs_short_signaling) { // only LAST and GOLDEN are coded; av1_set_frame_refs fills in cm->frame_refs
+ // == LAST_FRAME ==
+ const int lst_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int lst_idx = cm->ref_frame_map[lst_ref];
+
+ // == GOLDEN_FRAME ==
+ const int gld_ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int gld_idx = cm->ref_frame_map[gld_ref];
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any -1's. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (lst_idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+ if (gld_idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+
+ av1_set_frame_refs(cm, lst_ref, gld_ref);
+ }
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+ int ref = 0;
+ if (!cm->frame_refs_short_signaling) { // explicit signaling: one slot index per inter reference
+ ref = aom_rb_read_literal(rb, REF_FRAMES_LOG2);
+ const int idx = cm->ref_frame_map[ref];
+
+ // Most of the time, streams start with a keyframe. In that case,
+ // ref_frame_map will have been filled in at that point and will not
+ // contain any -1's. However, streams are explicitly allowed to start
+ // with an intra-only frame, so long as they don't then signal a
+ // reference to a slot that hasn't been set yet. That's what we are
+ // checking here.
+ if (idx == -1)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Inter frame requests nonexistent reference");
+
+ RefBuffer *const ref_frame = &cm->frame_refs[i];
+ ref_frame->idx = idx;
+ ref_frame->buf = &frame_bufs[idx].buf;
+ ref_frame->map_idx = ref;
+ } else {
+ ref = cm->frame_refs[i].map_idx; // slot already chosen for short signaling
+ }
+
+ cm->ref_frame_sign_bias[LAST_FRAME + i] = 0;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_length = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ int delta_frame_id_minus_1 = aom_rb_read_literal(rb, diff_len);
+ int ref_frame_id =
+ ((cm->current_frame_id - (delta_frame_id_minus_1 + 1) + // expected id = current id minus delta, modulo 1 << frame_id_length
+ (1 << frame_id_length)) %
+ (1 << frame_id_length));
+ // Compare values derived from delta_frame_id_minus_1 and
+ // refresh_frame_flags. Also, check valid for referencing
+ if (ref_frame_id != cm->ref_frame_id[ref] ||
+ cm->valid_for_referencing[ref] == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference buffer frame ID mismatch");
+ }
+ }
+
+ if (!cm->error_resilient_mode && frame_size_override_flag) {
+ setup_frame_size_with_refs(cm, rb);
+ } else {
+ setup_frame_size(cm, frame_size_override_flag, rb);
+ }
+
+ if (cm->cur_frame_force_integer_mv) {
+ cm->allow_high_precision_mv = 0;
+ } else {
+ cm->allow_high_precision_mv = aom_rb_read_bit(rb);
+ }
+ cm->interp_filter = read_frame_interp_filter(rb);
+ cm->switchable_motion_mode = aom_rb_read_bit(rb);
+ }
+
+ cm->prev_frame = get_prev_frame(cm);
+ if (cm->primary_ref_frame != PRIMARY_REF_NONE &&
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Reference frame containing this frame's initial "
+ "frame context is unavailable.");
+ }
+
+ if (!cm->intra_only && pbi->need_resync != 1) {
+ if (frame_might_allow_ref_frame_mvs(cm))
+ cm->allow_ref_frame_mvs = aom_rb_read_bit(rb);
+ else
+ cm->allow_ref_frame_mvs = 0;
+
+ for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { // scale factors from each reference's cropped size to the current frame size
+ RefBuffer *const ref_buf = &cm->frame_refs[i];
+ av1_setup_scale_factors_for_frame(
+ &ref_buf->sf, ref_buf->buf->y_crop_width,
+ ref_buf->buf->y_crop_height, cm->width, cm->height);
+ if ((!av1_is_valid_scale(&ref_buf->sf)))
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Reference frame has invalid dimensions");
+ }
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+
+ av1_setup_frame_sign_bias(cm);
+
+ cm->cur_frame->intra_only = cm->frame_type == KEY_FRAME || cm->intra_only;
+ cm->cur_frame->frame_type = cm->frame_type;
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* If bitmask is set, update reference frame id values and
+ mark frames as valid for reference */
+ int refresh_frame_flags = pbi->refresh_frame_flags;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ if ((refresh_frame_flags >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ cm->valid_for_referencing[i] = 1;
+ }
+ }
+ }
+
+ const int might_bwd_adapt =
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ if (might_bwd_adapt) {
+ cm->refresh_frame_context = aom_rb_read_bit(rb) // a 1 bit disables the frame context refresh
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ } else {
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+ }
+
+ get_frame_new_buffer(cm)->bit_depth = seq_params->bit_depth; // copy the sequence-level color description and render size onto the new frame buffer
+ get_frame_new_buffer(cm)->color_primaries = seq_params->color_primaries;
+ get_frame_new_buffer(cm)->transfer_characteristics =
+ seq_params->transfer_characteristics;
+ get_frame_new_buffer(cm)->matrix_coefficients =
+ seq_params->matrix_coefficients;
+ get_frame_new_buffer(cm)->monochrome = seq_params->monochrome;
+ get_frame_new_buffer(cm)->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ get_frame_new_buffer(cm)->color_range = seq_params->color_range;
+ get_frame_new_buffer(cm)->render_width = cm->render_width;
+ get_frame_new_buffer(cm)->render_height = cm->render_height;
+
+ if (pbi->need_resync) { // still set here only when this was an inter frame; key and intra-only frames cleared it above
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Keyframe / intra-only frame required to reset decoder"
+ " state");
+ }
+
+ // Generate next_ref_frame_map.
+ lock_buffer_pool(pool);
+ int ref_index = 0;
+ for (int mask = pbi->refresh_frame_flags; mask; mask >>= 1) { // one iteration per bit up to the highest set refresh bit
+ if (mask & 1) {
+ cm->next_ref_frame_map[ref_index] = cm->new_fb_idx;
+ ++frame_bufs[cm->new_fb_idx].ref_count;
+ } else {
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+ }
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ ++ref_index;
+ }
+
+ for (; ref_index < REF_FRAMES; ++ref_index) { // remaining slots keep their current mapping
+ cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
+ // Current thread holds the reference frame.
+ if (cm->ref_frame_map[ref_index] >= 0)
+ ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
+ }
+ unlock_buffer_pool(pool);
+ pbi->hold_ref_buf = 1;
+
+ if (cm->allow_intrabc) {
+ // Set parameters corresponding to no filtering.
+ struct loopfilter *lf = &cm->lf;
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+
+ read_tile_info(pbi, rb);
+ setup_quantization(cm, rb);
+ xd->bd = (int)seq_params->bit_depth;
+
+ if (cm->num_allocated_above_context_planes < av1_num_planes(cm) || // reallocate when the current above-context buffers are too small
+ cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+ cm->num_allocated_above_contexts < cm->tile_rows) {
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+ if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ av1_setup_past_independence(cm);
+ }
+
+ setup_segmentation(cm, rb);
+
+ cm->delta_q_res = 1; // defaults used when the delta q / delta lf syntax is absent
+ cm->delta_lf_res = 1;
+ cm->delta_lf_present_flag = 0;
+ cm->delta_lf_multi = 0;
+ cm->delta_q_present_flag = cm->base_qindex > 0 ? aom_rb_read_bit(rb) : 0;
+ if (cm->delta_q_present_flag) {
+ xd->current_qindex = cm->base_qindex;
+ cm->delta_q_res = 1 << aom_rb_read_literal(rb, 2);
+ if (!cm->allow_intrabc) cm->delta_lf_present_flag = aom_rb_read_bit(rb);
+ if (cm->delta_lf_present_flag) {
+ cm->delta_lf_res = 1 << aom_rb_read_literal(rb, 2);
+ cm->delta_lf_multi = aom_rb_read_bit(rb);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+
+ for (int i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 && // segment is lossless only if qindex and all dc/ac deltas are zero
+ cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+ cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+ xd->qindex[i] = qindex;
+ }
+ cm->coded_lossless = is_coded_lossless(cm, xd);
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+ setup_segmentation_dequant(cm);
+ if (cm->coded_lossless) {
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ }
+ if (cm->coded_lossless || !seq_params->enable_cdef) {
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->cdef_uv_strengths[0] = 0;
+ }
+ if (cm->all_lossless || !seq_params->enable_restoration) {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+ setup_loopfilter(cm, rb);
+
+ if (!cm->coded_lossless && seq_params->enable_cdef) {
+ setup_cdef(cm, rb);
+ }
+ if (!cm->all_lossless && seq_params->enable_restoration) {
+ decode_restoration_mode(cm, rb);
+ }
+
+ cm->tx_mode = read_tx_mode(cm, rb);
+ cm->reference_mode = read_frame_reference_mode(cm, rb);
+ if (cm->reference_mode != SINGLE_REFERENCE) setup_compound_reference_mode(cm);
+
+ av1_setup_skip_mode_allowed(cm);
+ cm->skip_mode_flag = cm->is_skip_mode_allowed ? aom_rb_read_bit(rb) : 0;
+
+ if (frame_might_allow_warped_motion(cm))
+ cm->allow_warped_motion = aom_rb_read_bit(rb);
+ else
+ cm->allow_warped_motion = 0;
+
+ cm->reduced_tx_set_used = aom_rb_read_bit(rb);
+
+ if (cm->allow_ref_frame_mvs && !frame_might_allow_ref_frame_mvs(cm)) {
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Frame wrongly requests reference frame MVs");
+ }
+
+ if (!frame_is_intra_only(cm)) read_global_motion(cm, rb);
+
+ cm->cur_frame->film_grain_params_present =
+ seq_params->film_grain_params_present;
+ read_film_grain(cm, rb);
+
+#if EXT_TILE_DEBUG
+ if (pbi->ext_tile_debug && cm->large_scale_tile) {
+ read_ext_tile_info(pbi, rb);
+ av1_set_single_tile_decoding_mode(cm);
+ }
+#endif // EXT_TILE_DEBUG
+ return 0;
+}
+
+// Prepares |rb| for reading the bytes from |data| up to |data_end|: the bit
+// offset is rewound to zero and this file's error_handler is installed with
+// &pbi->common as its data. Returns |rb| so the call can be used inline.
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+    const uint8_t *data_end) {
+  // Input window.
+  rb->bit_buffer = data;
+  rb->bit_buffer_end = data_end;
+  rb->bit_offset = 0;
+
+  // Error reporting hook.
+  rb->error_handler_data = &pbi->common;
+  rb->error_handler = error_handler;
+
+  return rb;
+}
+
+// Reads two "minus one" coded literals from |rb|: a width of |num_bits_width|
+// bits followed by a height of |num_bits_height| bits. Each value plus one is
+// stored through |width| and |height| respectively.
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+                         int num_bits_height, int *width, int *height) {
+  const int width_minus_1 = aom_rb_read_literal(rb, num_bits_width);
+  *width = width_minus_1 + 1;
+
+  const int height_minus_1 = aom_rb_read_literal(rb, num_bits_height);
+  *height = height_minus_1 + 1;
+}
+
+// Reads a PROFILE_BITS-wide literal from |rb| and returns it converted to
+// BITSTREAM_PROFILE. No range check is performed here.
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb) {
+  return (BITSTREAM_PROFILE)aom_rb_read_literal(rb, PROFILE_BITS);
+}
+
+// Upscales the decoded frame when super-resolution scaling is in effect
+// (av1_superres_scaled(cm) is nonzero); otherwise does nothing. The buffer
+// pool is locked for the duration of the upscale.
+void superres_post_decode(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  if (av1_superres_scaled(cm)) {
+    assert(!cm->all_lossless);
+
+    BufferPool *const frame_pool = cm->buffer_pool;
+    lock_buffer_pool(frame_pool);
+    av1_superres_upscale(cm, frame_pool);
+    unlock_buffer_pool(frame_pool);
+  }
+}
+
+uint32_t av1_decode_frame_headers_and_setup(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &pbi->mb;
+ /* Parses the uncompressed frame header from |rb| and prepares the per-frame
+  * state set up below. Returns the header size in bytes as reported by
+  * aom_rb_bytes_read(). |*p_data_end| is written only on the
+  * show_existing_frame path, where it becomes |data| plus that size. A
+  * nonzero |trailing_bits_present| makes the function call
+  * av1_check_trailing_bits() right after the header is read. Errors are
+  * reported through aom_internal_error(). */
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_set_frame_read(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_move_frame_idx_r();
+#endif
+ // Reset global motion to default_warp_params for LAST_FRAME..ALTREF_FRAME.
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
+ cm->global_motion[i] = default_warp_params;
+ cm->cur_frame->global_motion[i] = default_warp_params;
+ }
+ xd->global_motion = cm->global_motion;
+
+ read_uncompressed_header(pbi, rb);
+
+ if (trailing_bits_present) av1_check_trailing_bits(pbi, rb);
+
+ // If cm->single_tile_decoding = 0, the independent decoding of a single tile
+ // or a section of a frame is not allowed.
+ if (!cm->single_tile_decoding &&
+ (pbi->dec_tile_row >= 0 || pbi->dec_tile_col >= 0)) {
+ pbi->dec_tile_row = -1;  // any requested tile row/column is cleared to -1
+ pbi->dec_tile_col = -1;
+ }
+
+ const uint32_t uncomp_hdr_size =
+ (uint32_t)aom_rb_bytes_read(rb); // Size of the uncompressed header
+ YV12_BUFFER_CONFIG *new_fb = get_frame_new_buffer(cm);
+ xd->cur_buf = new_fb;
+ if (av1_allow_intrabc(cm)) {  // sf_identity gets the same cropped luma size on both sides
+ av1_setup_scale_factors_for_frame(
+ &cm->sf_identity, xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height,
+ xd->cur_buf->y_crop_width, xd->cur_buf->y_crop_height);
+ }
+
+ if (cm->show_existing_frame) {
+ // showing a frame directly
+ *p_data_end = data + uncomp_hdr_size;
+ if (cm->reset_decoder_state) {
+ // Use the default frame context values.
+ *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+ }
+ return uncomp_hdr_size;  // none of the per-frame setup below is performed
+ }
+
+ cm->setup_mi(cm);
+
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+
+ av1_setup_motion_field(cm);
+
+ av1_setup_block_planes(xd, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, num_planes);
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE) {
+ // use the default frame context values
+ *cm->fc = cm->frame_contexts[FRAME_CONTEXT_DEFAULTS];
+ } else {  // otherwise copy the context stored for the primary reference frame
+ *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
+ }
+ if (!cm->fc->initialized)
+ aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+ "Uninitialized entropy context.");
+ // Clear the corruption flag before returning.
+ xd->corrupted = 0;
+ return uncomp_hdr_size;
+}
+
+// Once-per-frame initialization. Allocates the loop-restoration buffers when
+// any of the three rst_info entries requests restoration, and re-allocates
+// the motion-compensation temporary buffer in pbi->td whenever its recorded
+// size differs from the size required by the sequence's bit-depth mode.
+static void setup_frame_info(AV1Decoder *pbi) {
+  AV1_COMMON *const cm = &pbi->common;
+
+  int restoration_used = 0;
+  for (int plane = 0; plane < 3 && !restoration_used; ++plane) {
+    restoration_used =
+        cm->rst_info[plane].frame_restoration_type != RESTORE_NONE;
+  }
+  if (restoration_used) av1_alloc_restoration_buffers(cm);
+
+  // The size is shifted left by one when high bit depth is in use.
+  const int use_highbd = cm->seq_params.use_highbitdepth != 0;
+  const int wanted_size = MC_TEMP_BUF_PELS << use_highbd;
+  if (pbi->td.mc_buf_size == wanted_size) return;
+
+  av1_free_mc_tmp_buf(&pbi->td);
+  allocate_mc_tmp_buf(cm, &pbi->td, wanted_size, use_highbd);
+}
+
+// Applies the loop-restoration filter to the frame in pbi->mb.cur_buf. The
+// multi-threaded filter is used when pbi->num_workers exceeds one and the
+// single-threaded one otherwise; |optimized_lr| is passed straight through.
+static void dec_loop_restoration_filter(AV1Decoder *pbi, int optimized_lr) {
+  AV1_COMMON *const cm = &pbi->common;
+  YV12_BUFFER_CONFIG *const frame = (YV12_BUFFER_CONFIG *)pbi->mb.cur_buf;
+
+  if (pbi->num_workers > 1) {
+    av1_loop_restoration_filter_frame_mt(frame, cm, optimized_lr,
+                                         pbi->tile_workers, pbi->num_workers,
+                                         &pbi->lr_row_sync, &pbi->lr_ctxt);
+  } else {
+    av1_loop_restoration_filter_frame(frame, cm, optimized_lr, &pbi->lr_ctxt);
+  }
+}
+
+// Decodes tiles |start_tile| through |end_tile| (inclusive) from the bytes
+// between |data| and |data_end| and stores the pointer returned by the tile
+// decoder in |*p_data_end|.
+//
+// A nonzero |initialize_flag| runs setup_frame_info() first. If |end_tile| is
+// not the last tile of the frame, the function returns right after the tiles
+// are decoded. Otherwise it runs the enabled post filters (deblocking, CDEF,
+// super-resolution, loop restoration), updates the frame context and raises
+// AOM_CODEC_CORRUPT_FRAME through aom_internal_error() when the frame was
+// flagged as corrupted.
+void av1_decode_tg_tiles_and_wrapup(AV1Decoder *pbi, const uint8_t *data,
+                                    const uint8_t *data_end,
+                                    const uint8_t **p_data_end, int start_tile,
+                                    int end_tile, int initialize_flag) {
+  AV1_COMMON *const cm = &pbi->common;
+  MACROBLOCKD *const xd = &pbi->mb;
+  const int tile_count_tg = end_tile - start_tile + 1;
+
+  if (initialize_flag) setup_frame_info(pbi);
+  const int num_planes = av1_num_planes(cm);
+#if LOOP_FILTER_BITMASK
+  av1_loop_filter_frame_init(cm, 0, num_planes);
+  av1_zero_array(cm->lf.lfm, cm->lf.lfm_num);
+#endif
+
+  // Threaded tile decoding needs more than one thread and is not used for
+  // large-scale-tile streams unless ext_tile_debug is set.
+  const int threads_usable =
+      pbi->max_threads > 1 && !(cm->large_scale_tile && !pbi->ext_tile_debug);
+  if (threads_usable && pbi->row_mt)
+    *p_data_end =
+        decode_tiles_row_mt(pbi, data, data_end, start_tile, end_tile);
+  else if (threads_usable && tile_count_tg > 1)
+    *p_data_end = decode_tiles_mt(pbi, data, data_end, start_tile, end_tile);
+  else
+    *p_data_end = decode_tiles(pbi, data, data_end, start_tile, end_tile);
+
+  // If the bit stream is monochrome, set the U and V buffers to a constant.
+  if (num_planes < 3) {
+    set_planes_to_neutral_grey(&cm->seq_params, xd->cur_buf, 1);
+  }
+
+  // Everything below is frame-level wrap-up; wait for the last tile.
+  if (end_tile != cm->tile_rows * cm->tile_cols - 1) {
+    return;
+  }
+
+  if (!cm->allow_intrabc && !cm->single_tile_decoding) {
+    if (cm->lf.filter_level[0] || cm->lf.filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+      av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 1, 0,
+                            num_planes, 0);
+#else
+      if (pbi->num_workers > 1) {
+        av1_loop_filter_frame_mt(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+                                 num_planes, 0, pbi->tile_workers,
+                                 pbi->num_workers, &pbi->lf_row_sync);
+      } else {
+        av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, 0,
+                              num_planes, 0);
+      }
+#endif
+    }
+
+    const int do_loop_restoration =
+        cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+        cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
+    const int do_cdef =
+        !cm->skip_loop_filter && !cm->coded_lossless &&
+        (cm->cdef_bits || cm->cdef_strengths[0] || cm->cdef_uv_strengths[0]);
+    const int do_superres = av1_superres_scaled(cm);
+    // With neither CDEF nor super-resolution, the boundary-line saves are
+    // skipped and the filter is told to use its optimized variant.
+    const int optimized_loop_restoration = !do_cdef && !do_superres;
+
+    if (!optimized_loop_restoration) {
+      if (do_loop_restoration)
+        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 0);
+
+      if (do_cdef) av1_cdef_frame(&pbi->cur_buf->buf, cm, &pbi->mb);
+
+      superres_post_decode(pbi);
+
+      if (do_loop_restoration)
+        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1);
+    }
+
+    if (do_loop_restoration)
+      dec_loop_restoration_filter(pbi, optimized_loop_restoration);
+  }
+
+  if (!xd->corrupted) {
+    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+      assert(cm->context_update_tile_id < pbi->allocated_tiles);
+      *cm->fc = pbi->tile_data[cm->context_update_tile_id].tctx;
+      av1_reset_cdf_symbol_counters(cm->fc);
+    }
+  } else {
+    aom_internal_error(&cm->error, AOM_CODEC_CORRUPT_FRAME,
+                       "Decode failed. Frame data is corrupted.");
+  }
+
+#if CONFIG_INSPECTION
+  if (pbi->inspect_cb != NULL) {
+    (*pbi->inspect_cb)(pbi, pbi->inspect_ctx);
+  }
+#endif
+
+  // Non frame parallel update frame context here.
+  if (!cm->large_scale_tile) {
+    cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+  }
+}
diff --git a/third_party/aom/av1/decoder/decodeframe.h b/third_party/aom/av1/decoder/decodeframe.h
new file mode 100644
index 0000000000..ddad273f18
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodeframe.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEFRAME_H_
+#define AOM_AV1_DECODER_DECODEFRAME_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Decoder;
+struct aom_read_bit_buffer;
+struct ThreadData;
+
+// Reads the middle part of the sequence header OBU (from
+// frame_width_bits_minus_1 to enable_restoration) into seq_params.
+// Reports errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_sequence_header(AV1_COMMON *cm, struct aom_read_bit_buffer *rb,
+ SequenceHeader *seq_params);
+
+void av1_read_frame_size(struct aom_read_bit_buffer *rb, int num_bits_width,
+ int num_bits_height, int *width, int *height);
+BITSTREAM_PROFILE av1_read_profile(struct aom_read_bit_buffer *rb);
+
+// Returns 0 on success. Sets pbi->common.error.error_code and returns -1 on
+// failure.
+int av1_check_trailing_bits(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb);
+
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+// TODO(wtc): Figure out and document the p_data_end parameter.
+uint32_t av1_decode_frame_headers_and_setup(struct AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t **p_data_end,
+ int trailing_bits_present);
+
+// Decodes tiles |start_tile| through |end_tile| (inclusive) of the current
+// frame from the bytes between |data| and |data_end| and stores the tile
+// decoder's returned data pointer in |*p_data_end|. A nonzero
+// |initialize_flag| requests the once-per-frame setup before decoding. When
+// |end_tile| is the last tile of the frame, the enabled post filters are run
+// as well. Calls aom_internal_error() if the frame is flagged as corrupted.
+void av1_decode_tg_tiles_and_wrapup(struct AV1Decoder *pbi, const uint8_t *data,
+                                    const uint8_t *data_end,
+                                    const uint8_t **p_data_end, int start_tile,
+                                    int end_tile, int initialize_flag);
+
+// Implements the color_config() function in the spec. Reports errors by
+// calling rb->error_handler() or aom_internal_error().
+void av1_read_color_config(struct aom_read_bit_buffer *rb,
+ int allow_lowbitdepth, SequenceHeader *seq_params,
+ struct aom_internal_error_info *error_info);
+
+// Implements the timing_info() function in the spec. Reports errors by calling
+// rb->error_handler().
+void av1_read_timing_info_header(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the decoder_model_info() function in the spec. Reports errors by
+// calling rb->error_handler().
+void av1_read_decoder_model_info(AV1_COMMON *cm,
+ struct aom_read_bit_buffer *rb);
+
+// Implements the operating_parameters_info() function in the spec. Reports
+// errors by calling rb->error_handler() or aom_internal_error().
+void av1_read_op_parameters_info(AV1_COMMON *const cm,
+ struct aom_read_bit_buffer *rb, int op_num);
+
+struct aom_read_bit_buffer *av1_init_read_bit_buffer(
+ struct AV1Decoder *pbi, struct aom_read_bit_buffer *rb, const uint8_t *data,
+ const uint8_t *data_end);
+
+void av1_free_mc_tmp_buf(struct ThreadData *thread_data);
+
+void av1_set_single_tile_decoding_mode(AV1_COMMON *const cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODEFRAME_H_
diff --git a/third_party/aom/av1/decoder/decodemv.c b/third_party/aom/av1/decoder/decodemv.c
new file mode 100644
index 0000000000..551e4d5437
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.c
@@ -0,0 +1,1560 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#define ACCT_STR __func__
+
+#define DEC_MISMATCH_DEBUG 0
+
+// Decodes one symbol out of INTRA_MODES alternatives from |r| using |cdf| and
+// returns it as a PREDICTION_MODE.
+static PREDICTION_MODE read_intra_mode(aom_reader *r, aom_cdf_prob *cdf) {
+  const int mode_symbol = aom_read_symbol(r, cdf, INTRA_MODES, ACCT_STR);
+  return (PREDICTION_MODE)mode_symbol;
+}
+
+// Reads, when needed, the CDEF strength index that applies to the block at
+// (mi_row, mi_col) and records it in the mode info at the top-left of the
+// enclosing unit of 1 << (6 - MI_SIZE_LOG2) mode-info columns/rows.
+// Nothing is read for coded-lossless frames or when intra block copy is on.
+static void read_cdef(AV1_COMMON *cm, aom_reader *r, MACROBLOCKD *const xd,
+                      int mi_col, int mi_row) {
+  if (cm->coded_lossless) return;
+  if (cm->allow_intrabc) {
+    assert(cm->cdef_bits == 0);
+    return;
+  }
+
+  // At the top-left of a superblock, forget all four cached presets.
+  const int sb_mask = cm->seq_params.mib_size - 1;
+  if (((mi_col | mi_row) & sb_mask) == 0) {
+    for (int i = 0; i < 4; ++i) xd->cdef_preset[i] = -1;
+  }
+
+  // Read CDEF param at the first non-skip coding block
+  const int unit = 1 << (6 - MI_SIZE_LOG2);
+  const int unit_align = ~(unit - 1);
+  // 128x128 superblocks keep one preset per quadrant; otherwise slot 0.
+  int index = 0;
+  if (cm->seq_params.sb_size == BLOCK_128X128) {
+    index = ((mi_col & unit) != 0) + 2 * ((mi_row & unit) != 0);
+  }
+
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  if (xd->cdef_preset[index] == -1 && !mbmi->skip) {
+    xd->cdef_preset[index] = aom_read_literal(r, cm->cdef_bits, ACCT_STR);
+  }
+
+  const int unit_pos =
+      (mi_row & unit_align) * cm->mi_stride + (mi_col & unit_align);
+  cm->mi_grid_visible[unit_pos]->cdef_strength = xd->cdef_preset[index];
+}
+
+// Reads the signed delta-qindex value coded for this block and returns it.
+// A value is only coded at the top-left of a superblock, and not when the
+// block covers the whole superblock and is skipped; 0 is returned in every
+// other case. Magnitudes of DELTA_Q_SMALL or more use an escape: a 3-bit
+// length followed by that many bits.
+static int read_delta_qindex(AV1_COMMON *cm, const MACROBLOCKD *xd,
+                             aom_reader *r, MB_MODE_INFO *const mbmi,
+                             int mi_col, int mi_row) {
+  const int sb_mask = cm->seq_params.mib_size - 1;
+  const int at_sb_origin = (mi_col & sb_mask) == 0 && (mi_row & sb_mask) == 0;
+  if (!at_sb_origin) return 0;
+
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  if (bsize == cm->seq_params.sb_size && mbmi->skip != 0) return 0;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  int magnitude =
+      aom_read_symbol(r, ec_ctx->delta_q_cdf, DELTA_Q_PROBS + 1, ACCT_STR);
+  if (magnitude >= DELTA_Q_SMALL) {
+    const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+    const int thr = (1 << rem_bits) + 1;
+    magnitude = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+  }
+
+  // No sign bit is coded for a zero magnitude.
+  if (magnitude == 0) return 0;
+
+  const int negative = aom_read_bit(r, ACCT_STR);
+  return negative ? -magnitude : magnitude;
+}
+// Reads the signed loop-filter level delta coded for this block using |cdf|
+// and returns it. A value is only coded at the top-left of a superblock, and
+// not when the block covers the whole superblock and is skipped; 0 is
+// returned otherwise. Magnitudes of DELTA_LF_SMALL or more use an escape: a
+// 3-bit length followed by that many bits.
+static int read_delta_lflevel(const AV1_COMMON *const cm, aom_reader *r,
+                              aom_cdf_prob *const cdf,
+                              const MB_MODE_INFO *const mbmi, int mi_col,
+                              int mi_row) {
+  const int sb_mask = cm->seq_params.mib_size - 1;
+  if ((mi_col & sb_mask) != 0 || (mi_row & sb_mask) != 0) return 0;
+  if (mbmi->sb_type == cm->seq_params.sb_size && mbmi->skip != 0) return 0;
+
+  int magnitude = aom_read_symbol(r, cdf, DELTA_LF_PROBS + 1, ACCT_STR);
+  if (magnitude >= DELTA_LF_SMALL) {
+    const int rem_bits = aom_read_literal(r, 3, ACCT_STR) + 1;
+    const int thr = (1 << rem_bits) + 1;
+    magnitude = aom_read_literal(r, rem_bits, ACCT_STR) + thr;
+  }
+
+  // No sign bit is coded for a zero magnitude.
+  if (magnitude == 0) return 0;
+
+  const int negative = aom_read_bit(r, ACCT_STR);
+  return negative ? -magnitude : magnitude;
+}
+
+// Decodes the chroma intra prediction mode. The CDF is selected by
+// |cfl_allowed| and |y_mode|; when |cfl_allowed| is zero the alphabet has one
+// symbol fewer than UV_INTRA_MODES.
+static UV_PREDICTION_MODE read_intra_mode_uv(FRAME_CONTEXT *ec_ctx,
+                                             aom_reader *r,
+                                             CFL_ALLOWED_TYPE cfl_allowed,
+                                             PREDICTION_MODE y_mode) {
+  aom_cdf_prob *const mode_cdf = ec_ctx->uv_mode_cdf[cfl_allowed][y_mode];
+  const int num_symbols = UV_INTRA_MODES - !cfl_allowed;
+  return (UV_PREDICTION_MODE)aom_read_symbol(r, mode_cdf, num_symbols,
+                                             ACCT_STR);
+}
+
+// Reads the joint CfL sign symbol and, for each of U and V whose sign is not
+// CFL_SIGN_ZERO, an alpha magnitude symbol. The joint sign is stored in
+// |*signs_out|; the return value packs the U magnitude in the bits above
+// CFL_ALPHABET_SIZE_LOG2 and adds the V magnitude to it.
+static int read_cfl_alphas(FRAME_CONTEXT *const ec_ctx, aom_reader *r,
+                           int *signs_out) {
+  const int joint_sign =
+      aom_read_symbol(r, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS, "cfl:signs");
+
+  // Magnitudes are only coded for nonzero values
+  int alpha_u = 0;
+  if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_u = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+    alpha_u = aom_read_symbol(r, cdf_u, CFL_ALPHABET_SIZE, "cfl:alpha_u");
+  }
+
+  int alpha_v = 0;
+  if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+    aom_cdf_prob *cdf_v = ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+    alpha_v = aom_read_symbol(r, cdf_v, CFL_ALPHABET_SIZE, "cfl:alpha_v");
+  }
+
+  *signs_out = joint_sign;
+  return (alpha_u << CFL_ALPHABET_SIZE_LOG2) + alpha_v;
+}
+
+// Decodes the inter-intra mode symbol using the tile context CDF selected by
+// |size_group|.
+static INTERINTRA_MODE read_interintra_mode(MACROBLOCKD *xd, aom_reader *r,
+                                            int size_group) {
+  aom_cdf_prob *const mode_cdf =
+      xd->tile_ctx->interintra_mode_cdf[size_group];
+  const int symbol = aom_read_symbol(r, mode_cdf, INTERINTRA_MODES, ACCT_STR);
+  return (INTERINTRA_MODE)symbol;
+}
+
+// Decodes a single-reference inter mode as a chain of up to three binary
+// symbols, each with its own context extracted from |ctx|. A zero symbol
+// ends the chain: first NEWMV, then GLOBALMV, then NEARESTMV; if all three
+// symbols are nonzero the mode is NEARMV.
+static PREDICTION_MODE read_inter_mode(FRAME_CONTEXT *ec_ctx, aom_reader *r,
+                                       int16_t ctx) {
+  const int16_t newmv_ctx = ctx & NEWMV_CTX_MASK;
+  if (aom_read_symbol(r, ec_ctx->newmv_cdf[newmv_ctx], 2, ACCT_STR) == 0)
+    return NEWMV;
+
+  const int16_t globalmv_ctx = (ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (aom_read_symbol(r, ec_ctx->zeromv_cdf[globalmv_ctx], 2, ACCT_STR) == 0)
+    return GLOBALMV;
+
+  const int16_t refmv_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  const int refmv_symbol =
+      aom_read_symbol(r, ec_ctx->refmv_cdf[refmv_ctx], 2, ACCT_STR);
+  return refmv_symbol == 0 ? NEARESTMV : NEARMV;
+}
+
+// Reads mbmi->ref_mv_idx, which starts at 0. For NEWMV / NEW_NEWMV up to two
+// binary symbols are read for positions 0 and 1; for modes containing a
+// NEARMV component up to two are read for positions 1 and 2, with the stored
+// index offset by one. A position is only coded while the reference MV
+// stack holds more than position + 1 entries, and a zero symbol stops the
+// search.
+static void read_drl_idx(FRAME_CONTEXT *ec_ctx, MACROBLOCKD *xd,
+                         MB_MODE_INFO *mbmi, aom_reader *r) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  mbmi->ref_mv_idx = 0;
+
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+    for (int pos = 0; pos < 2; ++pos) {
+      if (xd->ref_mv_count[ref_frame_type] <= pos + 1) continue;
+      const uint8_t drl_ctx =
+          av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], pos);
+      const int advance =
+          aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+      mbmi->ref_mv_idx = pos + advance;
+      if (advance == 0) return;
+    }
+  }
+
+  if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    // Offset the NEARESTMV mode.
+    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+    // mode is factored in.
+    for (int pos = 1; pos < 3; ++pos) {
+      if (xd->ref_mv_count[ref_frame_type] <= pos + 1) continue;
+      const uint8_t drl_ctx =
+          av1_drl_ctx(xd->ref_mv_stack[ref_frame_type], pos);
+      const int advance =
+          aom_read_symbol(r, ec_ctx->drl_cdf[drl_ctx], 2, ACCT_STR);
+      mbmi->ref_mv_idx = pos + advance - 1;
+      if (advance == 0) return;
+    }
+  }
+}
+
+// Decodes the motion mode of an inter block. SIMPLE_TRANSLATION is returned
+// without reading anything when motion modes are not switchable, when the
+// block uses skip mode, or when motion_mode_allowed() permits nothing else.
+// If OBMC_CAUSAL is the last allowed mode a binary symbol is read; otherwise
+// a symbol over all MOTION_MODES is read.
+static MOTION_MODE read_motion_mode(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    MB_MODE_INFO *mbmi, aom_reader *r) {
+  if (cm->switchable_motion_mode == 0 || mbmi->skip_mode)
+    return SIMPLE_TRANSLATION;
+
+  const MOTION_MODE last_allowed =
+      motion_mode_allowed(xd->global_motion, xd, mbmi, cm->allow_warped_motion);
+  if (last_allowed == SIMPLE_TRANSLATION) return SIMPLE_TRANSLATION;
+
+  int symbol;
+  if (last_allowed == OBMC_CAUSAL) {
+    symbol =
+        aom_read_symbol(r, xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2, ACCT_STR);
+  } else {
+    symbol = aom_read_symbol(r, xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+                             MOTION_MODES, ACCT_STR);
+  }
+  return (MOTION_MODE)(SIMPLE_TRANSLATION + symbol);
+}
+
+// Decodes a compound inter mode: the symbol read with the CDF for |ctx| is an
+// offset from NEAREST_NEARESTMV.
+static PREDICTION_MODE read_inter_compound_mode(MACROBLOCKD *xd, aom_reader *r,
+                                                int16_t ctx) {
+  aom_cdf_prob *const mode_cdf = xd->tile_ctx->inter_compound_mode_cdf[ctx];
+  const int mode_offset =
+      aom_read_symbol(r, mode_cdf, INTER_COMPOUND_MODES, ACCT_STR);
+  assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode_offset));
+  return NEAREST_NEARESTMV + mode_offset;
+}
+
+// Maps the coded value |diff| back to a value, given the prediction |ref| and
+// the alphabet size |max|.
+//  - ref == 0:        the result is |diff| itself.
+//  - ref >= max - 1:  the result counts down from the top: max - diff - 1.
+//  - otherwise:       small |diff| values alternate around |ref| (odd values
+//                     go above it, even values below it) for as long as both
+//                     sides fit; larger values map to |diff| when |ref| is
+//                     in the lower half and to max - (diff + 1) when not.
+int av1_neg_deinterleave(int diff, int ref, int max) {
+  if (ref == 0) return diff;
+  if (ref >= max - 1) return max - diff - 1;
+
+  const int ref_in_lower_half = 2 * ref < max;
+  const int zigzag_limit = ref_in_lower_half ? 2 * ref : 2 * (max - ref - 1);
+
+  if (diff <= zigzag_limit) {
+    return (diff & 1) ? ref + ((diff + 1) >> 1) : ref - (diff >> 1);
+  }
+  return ref_in_lower_half ? diff : max - (diff + 1);
+}
+
+// Returns the segment id for the block at (mi_row, mi_col). The spatial
+// prediction is always computed; when |skip| is nonzero it is returned
+// without reading anything. Otherwise a symbol is read and combined with the
+// prediction by av1_neg_deinterleave(). A result outside
+// [0, last_active_segid] is reported as AOM_CODEC_CORRUPT_FRAME.
+static int read_segment_id(AV1_COMMON *const cm, const MACROBLOCKD *const xd,
+                           int mi_row, int mi_col, aom_reader *r, int skip) {
+  int cdf_num;
+  const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+  if (skip) return pred;
+
+  const int num_active_ids = cm->seg.last_active_segid + 1;
+  aom_cdf_prob *const pred_cdf =
+      xd->tile_ctx->seg.spatial_pred_seg_cdf[cdf_num];
+  const int coded_id = aom_read_symbol(r, pred_cdf, MAX_SEGMENTS, ACCT_STR);
+  const int segment_id = av1_neg_deinterleave(coded_id, pred, num_active_ids);
+
+  const int in_range = segment_id >= 0 && segment_id < num_active_ids;
+  if (!in_range) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Corrupted segment_ids");
+  }
+  return segment_id;
+}
+
+// Returns the smallest id found in |segment_ids| over the x_mis-by-y_mis
+// rectangle that starts at |mi_offset|, with rows cm->mi_cols entries apart.
+static int dec_get_segment_id(const AV1_COMMON *cm, const uint8_t *segment_ids,
+                              int mi_offset, int x_mis, int y_mis) {
+  int smallest_id = INT_MAX;
+
+  for (int row = 0; row < y_mis; ++row) {
+    for (int col = 0; col < x_mis; ++col) {
+      const int id = segment_ids[mi_offset + row * cm->mi_cols + col];
+      if (id < smallest_id) smallest_id = id;
+    }
+  }
+
+  assert(smallest_id >= 0 && smallest_id < MAX_SEGMENTS);
+  return smallest_id;
+}
+
+// Writes |segment_id| into cm->current_frame_seg_map for every position of
+// the x_mis-by-y_mis rectangle that starts at |mi_offset|, with rows
+// cm->mi_cols entries apart.
+static void set_segment_id(AV1_COMMON *cm, int mi_offset, int x_mis, int y_mis,
+                           int segment_id) {
+  assert(segment_id >= 0 && segment_id < MAX_SEGMENTS);
+
+  for (int row = 0; row < y_mis; ++row) {
+    for (int col = 0; col < x_mis; ++col) {
+      const int pos = mi_offset + row * cm->mi_cols + col;
+      cm->current_frame_seg_map[pos] = segment_id;
+    }
+  }
+}
+
+// Reads the segment id of an intra-frame block, stores it in the current
+// segmentation map for the part of the block that lies inside the frame, and
+// returns it. Returns 0 without reading when segmentation is disabled.
+static int read_intra_segment_id(AV1_COMMON *const cm,
+                                 const MACROBLOCKD *const xd, int mi_row,
+                                 int mi_col, int bsize, aom_reader *r,
+                                 int skip) {
+  struct segmentation *const seg = &cm->seg;
+  if (!seg->enabled) return 0;  // Default for disabled segmentation
+
+  assert(seg->update_map && !seg->temporal_update);
+
+  // Clip the block to the frame's right and bottom edges.
+  const int cols_left = cm->mi_cols - mi_col;
+  const int rows_left = cm->mi_rows - mi_row;
+  const int block_cols = mi_size_wide[bsize];
+  const int block_rows = mi_size_high[bsize];
+  const int x_mis = AOMMIN(cols_left, block_cols);
+  const int y_mis = AOMMIN(rows_left, block_rows);
+
+  const int map_offset = mi_row * cm->mi_cols + mi_col;
+  const int segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, skip);
+  set_segment_id(cm, map_offset, x_mis, y_mis, segment_id);
+  return segment_id;
+}
+
+// Copies the x_mis-by-y_mis rectangle starting at |mi_offset| from
+// |last_segment_ids| to |current_segment_ids| (rows are cm->mi_cols entries
+// apart). When |last_segment_ids| is NULL the rectangle is filled with 0.
+static void copy_segment_id(const AV1_COMMON *cm,
+                            const uint8_t *last_segment_ids,
+                            uint8_t *current_segment_ids, int mi_offset,
+                            int x_mis, int y_mis) {
+  for (int row = 0; row < y_mis; ++row) {
+    for (int col = 0; col < x_mis; ++col) {
+      const int pos = mi_offset + row * cm->mi_cols + col;
+      uint8_t id = 0;
+      if (last_segment_ids) id = last_segment_ids[pos];
+      current_segment_ids[pos] = id;
+    }
+  }
+}
+
+// Returns the smallest segment id the previous frame's map holds over the
+// given rectangle, or 0 when there is no previous map.
+static int get_predicted_segment_id(AV1_COMMON *const cm, int mi_offset,
+                                    int x_mis, int y_mis) {
+  if (!cm->last_frame_seg_map) return 0;
+  return dec_get_segment_id(cm, cm->last_frame_seg_map, mi_offset, x_mis,
+                            y_mis);
+}
+
+static int read_inter_segment_id(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ int mi_row, int mi_col, int preskip,
+ aom_reader *r) {
+ struct segmentation *const seg = &cm->seg;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[mbmi->sb_type];
+ const int bh = mi_size_high[mbmi->sb_type];
+ /* Returns the segment id of an inter-frame block and, on every path that
+  * gets past the early "return 0" / "return mbmi->segment_id" exits, also
+  * writes ids for the block into cm->current_frame_seg_map. The function is
+  * written to be called with |preskip| nonzero and with |preskip| zero; the
+  * two cases differ only in the branch on |preskip| below. x_mis / y_mis
+  * clip the block to the frame's right and bottom edges. */
+ // TODO(slavarnway): move x_mis, y_mis into xd ?????
+ const int x_mis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int y_mis = AOMMIN(cm->mi_rows - mi_row, bh);
+
+ if (!seg->enabled) return 0; // Default for disabled segmentation
+ /* Map not updated in this frame: carry the previous frame's ids for this
+  * block over to the current map (zeros if there is no previous map) and
+  * return the id predicted from the previous map. */
+ if (!seg->update_map) {
+ copy_segment_id(cm, cm->last_frame_seg_map, cm->current_frame_seg_map,
+ mi_offset, x_mis, y_mis);
+ return get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ }
+ /* preskip != 0: return 0 without reading unless seg->segid_preskip is set.
+  * preskip == 0: return the stored mbmi->segment_id when segid_preskip is
+  * set; for a skipped block use the spatial prediction (read_segment_id with
+  * skip = 1 reads no symbol) and store it in the map. */
+ int segment_id;
+ if (preskip) {
+ if (!seg->segid_preskip) return 0;
+ } else {
+ if (seg->segid_preskip) return mbmi->segment_id;
+ if (mbmi->skip) {
+ if (seg->temporal_update) {
+ mbmi->seg_id_predicted = 0;
+ }
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 1);
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+ }
+ }
+ /* With temporal_update, a binary symbol chooses between the id predicted
+  * from the previous frame's map and an explicitly coded id. */
+ if (seg->temporal_update) {
+ const int ctx = av1_get_pred_context_seg_id(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ aom_cdf_prob *pred_cdf = segp->pred_cdf[ctx];
+ mbmi->seg_id_predicted = aom_read_symbol(r, pred_cdf, 2, ACCT_STR);
+ if (mbmi->seg_id_predicted) {
+ segment_id = get_predicted_segment_id(cm, mi_offset, x_mis, y_mis);
+ } else {
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+ }
+ } else {
+ segment_id = read_segment_id(cm, xd, mi_row, mi_col, r, 0);
+ }
+ set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
+ return segment_id;
+}
+
+// Reads the skip_mode flag of a block. Returns 0 without reading when the
+// frame does not enable skip mode, when the segment has the SEG_LVL_SKIP,
+// SEG_LVL_REF_FRAME or SEG_LVL_GLOBALMV feature active, or when the block
+// size does not allow compound references.
+static int read_skip_mode(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                          aom_reader *r) {
+  if (!cm->skip_mode_flag) return 0;
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 0;
+  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return 0;
+
+  // These features imply single-reference mode, while skip mode implies
+  // compound reference. Hence, the two are mutually exclusive.
+  // In other words, skip_mode is implicitly 0 here.
+  const int single_ref_forced =
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV);
+  if (single_ref_forced) return 0;
+
+  const int ctx = av1_get_skip_mode_context(xd);
+  aom_cdf_prob *const skip_mode_cdf = xd->tile_ctx->skip_mode_cdfs[ctx];
+  return aom_read_symbol(r, skip_mode_cdf, 2, ACCT_STR);
+}
+
+// Reads the skip flag of a block. When the segment has SEG_LVL_SKIP active
+// nothing is read and 1 is returned; otherwise a binary symbol is decoded
+// with the CDF chosen by av1_get_skip_context().
+static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
+                     aom_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) return 1;
+
+  const int ctx = av1_get_skip_context(xd);
+  aom_cdf_prob *const skip_cdf = xd->tile_ctx->skip_cdfs[ctx];
+  return aom_read_symbol(r, skip_cdf, 2, ACCT_STR);
+}
+
+// Merges two sorted lists into colors[0...n_colors-1], in place:
+//  - cached_colors[0...n_cached_colors-1], the colors taken from the cache;
+//  - colors[n_cached_colors...n_colors-1], the transmitted colors.
+// On ties the cached color is emitted first. Does nothing when there are no
+// cached colors.
+static void merge_colors(uint16_t *colors, uint16_t *cached_colors,
+                         int n_colors, int n_cached_colors) {
+  if (n_cached_colors == 0) return;
+
+  int out = 0;
+  int next_cached = 0;
+  int next_sent = n_cached_colors;
+  while (out < n_colors) {
+    const int cached_left = next_cached < n_cached_colors;
+    const int sent_left = next_sent < n_colors;
+    // Short-circuiting keeps both reads inside their lists.
+    const int take_cached =
+        cached_left &&
+        (!sent_left || cached_colors[next_cached] <= colors[next_sent]);
+    if (take_cached) {
+      colors[out] = cached_colors[next_cached];
+      ++next_cached;
+    } else {
+      assert(next_sent < n_colors);
+      colors[out] = colors[next_sent];
+      ++next_sent;
+    }
+    ++out;
+  }
+}
+
+static void read_palette_colors_y(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi, aom_reader *r) { // Reads the palette_size[0] luma colors into pmi->palette_colors[0...].
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache); // candidates for plane 0
+ const int n = pmi->palette_size[0];
+ int idx = 0;
+ for (int i = 0; i < n_cache && idx < n; ++i) // one bit per candidate; a set bit reuses it
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) { // the remaining colors are coded explicitly
+ const int n_cached_colors = idx;
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR); // first one: raw bit_depth-bit literal
+ if (idx < n) {
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR); // bit width used for the deltas
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1] - 1;
+ for (; idx < n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR) + 1; // deltas are coded minus one, so at least 1
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range)); // the width never grows; it shrinks as the range left narrows
+ }
+ }
+ merge_colors(pmi->palette_colors, cached_colors, n, n_cached_colors); // combine reused and explicit colors into one list
+ } else { // every color came from the cache
+ memcpy(pmi->palette_colors, cached_colors, n * sizeof(cached_colors[0]));
+ }
+}
+
+static void read_palette_colors_uv(MACROBLOCKD *const xd, int bit_depth,
+ PALETTE_MODE_INFO *const pmi,
+ aom_reader *r) {
+ const int n = pmi->palette_size[1];
+ /* Reads the chroma palette: n colors for U, stored from
+  * pmi->palette_colors[PALETTE_MAX_SIZE], then n colors for V, stored from
+  * pmi->palette_colors[2 * PALETTE_MAX_SIZE].
+  *
+  * U channel colors: reused from the plane-1 palette cache where flagged,
+  * the rest coded as a first literal followed by deltas. */
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ uint16_t cached_colors[PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int idx = 0;
+ for (int i = 0; i < n_cache && idx < n; ++i) // one bit per candidate; a set bit reuses it
+ if (aom_read_bit(r, ACCT_STR)) cached_colors[idx++] = color_cache[i];
+ if (idx < n) {
+ const int n_cached_colors = idx;
+ idx += PALETTE_MAX_SIZE; // from here on idx addresses the U section of palette_colors
+ pmi->palette_colors[idx++] = aom_read_literal(r, bit_depth, ACCT_STR);
+ if (idx < PALETTE_MAX_SIZE + n) {
+ const int min_bits = bit_depth - 3;
+ int bits = min_bits + aom_read_literal(r, 2, ACCT_STR);
+ int range = (1 << bit_depth) - pmi->palette_colors[idx - 1];
+ for (; idx < PALETTE_MAX_SIZE + n; ++idx) {
+ assert(range >= 0);
+ const int delta = aom_read_literal(r, bits, ACCT_STR); // no +1 here, so a zero delta is possible
+ pmi->palette_colors[idx] = clamp(pmi->palette_colors[idx - 1] + delta,
+ 0, (1 << bit_depth) - 1);
+ range -= (pmi->palette_colors[idx] - pmi->palette_colors[idx - 1]);
+ bits = AOMMIN(bits, av1_ceil_log2(range));
+ }
+ }
+ merge_colors(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors, n,
+ n_cached_colors);
+ } else { // every U color came from the cache
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE, cached_colors,
+ n * sizeof(cached_colors[0]));
+ }
+
+ // V channel colors.
+ if (aom_read_bit(r, ACCT_STR)) { // Delta encoding.
+ const int min_bits_v = bit_depth - 4;
+ const int max_val = 1 << bit_depth;
+ int bits = min_bits_v + aom_read_literal(r, 2, ACCT_STR);
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ for (int i = 1; i < n; ++i) {
+ int delta = aom_read_literal(r, bits, ACCT_STR);
+ if (delta && aom_read_bit(r, ACCT_STR)) delta = -delta; // a sign bit follows only a nonzero delta
+ int val = (int)pmi->palette_colors[2 * PALETTE_MAX_SIZE + i - 1] + delta;
+ if (val < 0) val += max_val; // wrap around instead of clamping
+ if (val >= max_val) val -= max_val;
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] = val;
+ }
+ } else { // otherwise each V color is a raw bit_depth-bit literal
+ for (int i = 0; i < n; ++i) {
+ pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+ aom_read_literal(r, bit_depth, ACCT_STR);
+ }
+ }
+}
+
+// Parses the palette signalling of the current block. For each of luma and
+// chroma a "palette used" flag is read; when set, the palette size symbol
+// (coded as size - 2) and the palette colors follow. The caller is expected
+// to have reset pmi->palette_size[] beforehand, since sizes are only written
+// here when a palette is actually used.
+static void read_palette_mode_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                   int mi_row, int mi_col, aom_reader *r) {
+  const int plane_count = av1_num_planes(cm);
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+  PALETTE_MODE_INFO *const palette = &mbmi->palette_mode_info;
+  const int size_ctx = av1_get_palette_bsize_ctx(bsize);
+
+  // Luma palette: only signalled for DC_PRED blocks.
+  if (mbmi->mode == DC_PRED) {
+    const int y_mode_ctx = av1_get_palette_mode_ctx(xd);
+    const int use_y_palette = aom_read_symbol(
+        r, xd->tile_ctx->palette_y_mode_cdf[size_ctx][y_mode_ctx], 2,
+        ACCT_STR);
+    if (use_y_palette) {
+      const int y_size_symbol =
+          aom_read_symbol(r, xd->tile_ctx->palette_y_size_cdf[size_ctx],
+                          PALETTE_SIZES, ACCT_STR);
+      palette->palette_size[0] = y_size_symbol + 2;
+      read_palette_colors_y(xd, cm->seq_params.bit_depth, palette, r);
+    }
+  }
+
+  // Chroma palette: needs chroma planes, UV_DC_PRED and a block that carries
+  // chroma information.
+  if (plane_count <= 1 || mbmi->uv_mode != UV_DC_PRED) return;
+  if (!is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                           xd->plane[1].subsampling_y))
+    return;
+
+  // The context is whether a luma palette is in use.
+  const int uv_mode_ctx = (palette->palette_size[0] > 0);
+  const int use_uv_palette = aom_read_symbol(
+      r, xd->tile_ctx->palette_uv_mode_cdf[uv_mode_ctx], 2, ACCT_STR);
+  if (!use_uv_palette) return;
+
+  const int uv_size_symbol =
+      aom_read_symbol(r, xd->tile_ctx->palette_uv_size_cdf[size_ctx],
+                      PALETTE_SIZES, ACCT_STR);
+  palette->palette_size[1] = uv_size_symbol + 2;
+  read_palette_colors_uv(xd, cm->seq_params.bit_depth, palette, r);
+}
+
+// Decodes an angle delta. The symbol is coded with 2 * MAX_ANGLE_DELTA + 1
+// possible values and is re-centered on zero before being returned.
+static int read_angle_delta(aom_reader *r, aom_cdf_prob *cdf) {
+  const int coded_value =
+      aom_read_symbol(r, cdf, 2 * MAX_ANGLE_DELTA + 1, ACCT_STR);
+  const int centered = coded_value - MAX_ANGLE_DELTA;
+  return centered;
+}
+
+// Reads the filter-intra signalling of the current block. When filter intra
+// is not allowed for this block the flag is forced to 0 and nothing is read.
+// Otherwise the use flag is read, followed by the filter mode if it is set.
+static void read_filter_intra_mode_info(const AV1_COMMON *const cm,
+                                        MACROBLOCKD *const xd, aom_reader *r) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  FILTER_INTRA_MODE_INFO *const fi_info = &mbmi->filter_intra_mode_info;
+
+  if (!av1_filter_intra_allowed(cm, mbmi)) {
+    fi_info->use_filter_intra = 0;
+    return;
+  }
+
+  fi_info->use_filter_intra = aom_read_symbol(
+      r, xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2, ACCT_STR);
+  if (!fi_info->use_filter_intra) return;
+
+  fi_info->filter_intra_mode = aom_read_symbol(
+      r, xd->tile_ctx->filter_intra_mode_cdf, FILTER_INTRA_MODES, ACCT_STR);
+}
+
+// Reads the transform type of the transform block at (blk_row, blk_col) and
+// stores it in mbmi->txk_type[]. The stored type defaults to DCT_DCT and is
+// only read from the bitstream when the block is not skipped, is not
+// lossless (qindex > 0) and more than one transform type is available for
+// tx_size.
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+                      int blk_col, TX_SIZE tx_size, aom_reader *r) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int slot = av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+  TX_TYPE *const result = &mbmi->txk_type[slot];
+  *result = DCT_DCT;
+
+  // Skipped blocks carry no transform type.
+  if (mbmi->skip) return;
+  if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return;
+
+  // Lossless mode (qindex == 0) carries no transform type either.
+  int qindex;
+  if (cm->seg.enabled) {
+    qindex = xd->qindex[mbmi->segment_id];
+  } else {
+    qindex = cm->base_qindex;
+  }
+  if (qindex <= 0) return;
+
+  const int inter_block = is_inter_block(mbmi);
+  if (get_ext_tx_types(tx_size, inter_block, cm->reduced_tx_set_used) <= 1)
+    return;
+
+  const TxSetType set_type =
+      av1_get_ext_tx_set_type(tx_size, inter_block, cm->reduced_tx_set_used);
+  const int eset =
+      get_ext_tx_set(tx_size, inter_block, cm->reduced_tx_set_used);
+  // eset == 0 would be the DCT_DCT-only set, for which nothing is coded.
+  assert(eset != 0);
+
+  const TX_SIZE sqr_size = txsize_sqr_map[tx_size];
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+  if (inter_block) {
+    const int symbol =
+        aom_read_symbol(r, ec_ctx->inter_ext_tx_cdf[eset][sqr_size],
+                        av1_num_ext_tx_set[set_type], ACCT_STR);
+    *result = av1_ext_tx_inv[set_type][symbol];
+    return;
+  }
+
+  // Intra blocks select the CDF by prediction mode; filter-intra blocks use
+  // the direction mapped from their filter mode.
+  PREDICTION_MODE intra_mode;
+  if (mbmi->filter_intra_mode_info.use_filter_intra) {
+    intra_mode =
+        fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+  } else {
+    intra_mode = mbmi->mode;
+  }
+  const int symbol =
+      aom_read_symbol(r, ec_ctx->intra_ext_tx_cdf[eset][sqr_size][intra_mode],
+                      av1_num_ext_tx_set[set_type], ACCT_STR);
+  *result = av1_ext_tx_inv[set_type][symbol];
+}
+
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+ nmv_context *ctx, MvSubpelPrecision precision);
+
+static INLINE int is_mv_valid(const MV *mv);
+
+// Reads an intra block copy displacement vector relative to ref_mv into *mv,
+// forces it to whole-pixel units and returns 1 when the result is a valid
+// displacement vector, 0 otherwise.
+static INLINE int assign_dv(AV1_COMMON *cm, MACROBLOCKD *xd, int_mv *mv,
+                            const int_mv *ref_mv, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize, aom_reader *r) {
+  MV *const dv = &mv->as_mv;
+  read_mv(r, dv, &ref_mv->as_mv, &xd->tile_ctx->ndvc, MV_SUBPEL_NONE);
+
+  // A DV never carries a sub-pel part.
+  assert((dv->col & 7) == 0);
+  assert((dv->row & 7) == 0);
+  dv->col = (dv->col >> 3) * 8;
+  dv->row = (dv->row >> 3) * 8;
+
+  if (!is_mv_valid(dv)) return 0;
+  return av1_is_dv_valid(mv->as_mv, cm, xd, mi_row, mi_col, bsize,
+                         cm->seq_params.mib_size_log2) != 0;
+}
+
+// Reads the intra block copy flag of the current block and, when it is set,
+// the displacement vector. An invalid reference or decoded vector marks the
+// frame as corrupted through xd->corrupted.
+static void read_intrabc_info(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                              int mi_row, int mi_col, aom_reader *r) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+  mbmi->use_intrabc = aom_read_symbol(r, ec_ctx->intrabc_cdf, 2, ACCT_STR);
+  if (!mbmi->use_intrabc) return;
+
+  // Fixed settings used by every intra block copy block.
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+
+  int16_t mode_contexts[MODE_CTX_REF_FRAMES];
+  int_mv candidates[INTRA_FRAME + 1][MAX_MV_REF_CANDIDATES];
+  int_mv global_mvs[REF_FRAMES];
+  av1_find_mv_refs(cm, xd, mbmi, INTRA_FRAME, xd->ref_mv_count,
+                   xd->ref_mv_stack, candidates, global_mvs, mi_row, mi_col,
+                   mode_contexts);
+
+  int_mv nearest_dv, near_dv;
+  av1_find_best_ref_mvs(0, candidates[INTRA_FRAME], &nearest_dv, &near_dv, 0);
+
+  // Prefer the nearest candidate, then the near one, then the default
+  // reference DV.
+  int_mv dv_ref;
+  if (nearest_dv.as_int == 0) {
+    dv_ref = near_dv;
+  } else {
+    dv_ref = nearest_dv;
+  }
+  if (dv_ref.as_int == 0) {
+    av1_find_ref_dv(&dv_ref, &xd->tile, cm->seq_params.mib_size, mi_row,
+                    mi_col);
+  }
+
+  // Ref DV should not have sub-pel.
+  const int ref_is_full_pel =
+      (dv_ref.as_mv.col & 7) == 0 && (dv_ref.as_mv.row & 7) == 0;
+  dv_ref.as_mv.col = (dv_ref.as_mv.col >> 3) * 8;
+  dv_ref.as_mv.row = (dv_ref.as_mv.row >> 3) * 8;
+
+  // The DV is only read from the bitstream when the reference is usable.
+  if (!ref_is_full_pel || !assign_dv(cm, xd, &mbmi->mv[0], &dv_ref, mi_row,
+                                     mi_col, bsize, r)) {
+    // Intra bc motion vectors are not valid - signal corrupt frame
+    aom_merge_corrupted_flag(&xd->corrupted, 1);
+  }
+}
+
+// Updates the running quantizer index, and (when signalled) the running
+// loop filter deltas, from the bitstream. Does nothing unless
+// cm->delta_q_present_flag is set; loop filter deltas are additionally gated
+// by cm->delta_lf_present_flag.
+static void read_delta_q_params(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                                const int mi_row, const int mi_col,
+                                aom_reader *r) {
+  if (!cm->delta_q_present_flag) return;
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  xd->current_qindex +=
+      read_delta_qindex(cm, xd, r, mbmi, mi_col, mi_row) * cm->delta_q_res;
+  /* Normative: Clamp to [1,MAXQ] to not interfere with lossless mode */
+  xd->current_qindex = clamp(xd->current_qindex, 1, MAXQ);
+
+  if (!cm->delta_lf_present_flag) return;
+
+  FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+  if (!cm->delta_lf_multi) {
+    // A single delta shared by all loop filter levels.
+    const int level = xd->delta_lf_from_base +
+                      read_delta_lflevel(cm, r, ec_ctx->delta_lf_cdf, mbmi,
+                                         mi_col, mi_row) *
+                          cm->delta_lf_res;
+    mbmi->delta_lf_from_base = xd->delta_lf_from_base =
+        clamp(level, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+    return;
+  }
+
+  // One delta per loop filter level; the last two levels are skipped when
+  // there is only one plane.
+  const int num_levels =
+      av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+  for (int k = 0; k < num_levels; ++k) {
+    const int level =
+        xd->delta_lf[k] + read_delta_lflevel(cm, r,
+                                             ec_ctx->delta_lf_multi_cdf[k],
+                                             mbmi, mi_col, mi_row) *
+                              cm->delta_lf_res;
+    mbmi->delta_lf[k] = xd->delta_lf[k] =
+        clamp(level, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+  }
+}
+
+// Reads the mode info of one block in an intra-only frame (the path taken by
+// av1_read_mode_info() when frame_is_intra_only(cm) is true). In bitstream
+// order: segment id / skip, CDEF, delta q / loop filter, optional intra
+// block copy, luma mode and angle, chroma mode (with CfL alphas and angle),
+// palette and filter-intra information.
+static void read_intra_frame_mode_info(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r) {
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO *above_mi = xd->above_mbmi;
+ const MB_MODE_INFO *left_mi = xd->left_mbmi;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ struct segmentation *const seg = &cm->seg;
+
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ // The segment id is read either before the skip flag (segid_preskip) or
+ // after it, in which case the skip flag is passed to the reader.
+ if (seg->segid_preskip)
+ mbmi->segment_id =
+ read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, 0);
+
+ mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+
+ if (!seg->segid_preskip)
+ mbmi->segment_id =
+ read_intra_segment_id(cm, xd, mi_row, mi_col, bsize, r, mbmi->skip);
+
+ read_cdef(cm, r, xd, mi_col, mi_row);
+
+ read_delta_q_params(cm, xd, mi_row, mi_col, r);
+
+ mbmi->current_qindex = xd->current_qindex;
+
+ // Reset the per-block fields to their intra defaults before parsing the
+ // prediction information.
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // An intra block copy block carries no further mode information here.
+ if (av1_allow_intrabc(cm)) {
+ read_intrabc_info(cm, xd, mi_row, mi_col, r);
+ if (is_intrabc_block(mbmi)) return;
+ }
+
+ mbmi->mode = read_intra_mode(r, get_y_mode_cdf(ec_ctx, above_mi, left_mi));
+
+ // Angle deltas are only coded for directional modes, and only when
+ // av1_use_angle_delta() allows them for this block size.
+ const int use_angle_delta = av1_use_angle_delta(bsize);
+ mbmi->angle_delta[PLANE_TYPE_Y] =
+ (use_angle_delta && av1_is_directional_mode(mbmi->mode))
+ ? read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED])
+ : 0;
+
+ if (!cm->seq_params.monochrome &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y)) {
+ xd->cfl.is_chroma_reference = 1;
+ mbmi->uv_mode =
+ read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+ if (mbmi->uv_mode == UV_CFL_PRED) {
+ mbmi->cfl_alpha_idx = read_cfl_alphas(ec_ctx, r, &mbmi->cfl_alpha_signs);
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] =
+ (use_angle_delta && av1_is_directional_mode(get_uv_mode(mbmi->uv_mode)))
+ ? read_angle_delta(r,
+ ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED])
+ : 0;
+ } else {
+ // Avoid decoding angle_info if there is no chroma prediction
+ mbmi->uv_mode = UV_DC_PRED;
+ xd->cfl.is_chroma_reference = 0;
+ }
+ xd->cfl.store_y = store_cfl_required(cm, xd);
+
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+ read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+
+ read_filter_intra_mode_info(cm, xd, r);
+}
+
+// Decodes one signed motion vector difference component. The magnitude is
+// assembled from a class-dependent base, the integer bits, the fractional
+// part and the high-precision bit. When use_subpel is 0 nothing is read for
+// the fractional and high-precision parts and the defaults 3 and 1 are used;
+// when usehp is 0 the high-precision bit defaults to 1.
+static int read_mv_component(aom_reader *r, nmv_component *mvcomp,
+                             int use_subpel, int usehp) {
+  const int is_negative = aom_read_symbol(r, mvcomp->sign_cdf, 2, ACCT_STR);
+  const int mv_class =
+      aom_read_symbol(r, mvcomp->classes_cdf, MV_CLASSES, ACCT_STR);
+  const int is_class0 = (mv_class == MV_CLASS_0);
+
+  // Integer part
+  int base = 0;
+  int int_part = 0;
+  if (is_class0) {
+    int_part = aom_read_symbol(r, mvcomp->class0_cdf, CLASS0_SIZE, ACCT_STR);
+  } else {
+    const int bit_count = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < bit_count; ++bit) {
+      int_part |= aom_read_symbol(r, mvcomp->bits_cdf[bit], 2, ACCT_STR)
+                  << bit;
+    }
+    base = CLASS0_SIZE << (mv_class + 2);
+  }
+
+  // Fractional and high precision parts, with their "not coded" defaults.
+  int frac_part = 3;
+  int hp_bit = 1;
+  if (use_subpel) {
+    frac_part = aom_read_symbol(
+        r, is_class0 ? mvcomp->class0_fp_cdf[int_part] : mvcomp->fp_cdf,
+        MV_FP_SIZE, ACCT_STR);
+    if (usehp) {
+      hp_bit = aom_read_symbol(
+          r, is_class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2, ACCT_STR);
+    }
+  }
+
+  // Result
+  const int magnitude =
+      base + ((int_part << 3) | (frac_part << 1) | hp_bit) + 1;
+  if (is_negative) return -magnitude;
+  return magnitude;
+}
+
+// Reads a motion vector difference and adds it to *ref, storing the sum in
+// *mv. The joint type tells which of the two components are coded; a
+// component that is not coded contributes a zero difference.
+static INLINE void read_mv(aom_reader *r, MV *mv, const MV *ref,
+                           nmv_context *ctx, MvSubpelPrecision precision) {
+  const MV_JOINT_TYPE joint =
+      (MV_JOINT_TYPE)aom_read_symbol(r, ctx->joints_cdf, MV_JOINTS, ACCT_STR);
+  const int use_subpel = precision > MV_SUBPEL_NONE;
+  const int use_hp = precision > MV_SUBPEL_LOW_PRECISION;
+
+  MV delta = kZeroMv;
+  if (mv_joint_vertical(joint)) {
+    delta.row = read_mv_component(r, &ctx->comps[0], use_subpel, use_hp);
+  }
+  if (mv_joint_horizontal(joint)) {
+    delta.col = read_mv_component(r, &ctx->comps[1], use_subpel, use_hp);
+  }
+
+  mv->row = ref->row + delta.row;
+  mv->col = ref->col + delta.col;
+}
+
+// Returns whether the current block uses single or compound reference
+// prediction. The choice is only read from the bitstream when the block size
+// allows compound references and the frame uses REFERENCE_MODE_SELECT.
+static REFERENCE_MODE read_block_reference_mode(AV1_COMMON *cm,
+                                                const MACROBLOCKD *xd,
+                                                aom_reader *r) {
+  if (!is_comp_ref_allowed(xd->mi[0]->sb_type)) return SINGLE_REFERENCE;
+
+  if (cm->reference_mode != REFERENCE_MODE_SELECT) {
+    assert(cm->reference_mode == SINGLE_REFERENCE);
+    return cm->reference_mode;
+  }
+
+  const int ctx = av1_get_reference_mode_context(xd);
+  // SINGLE_REFERENCE or COMPOUND_REFERENCE
+  return (REFERENCE_MODE)aom_read_symbol(r, xd->tile_ctx->comp_inter_cdf[ctx],
+                                         2, ACCT_STR);
+}
+
+// Reads one binary symbol using the CDF returned by
+// av1_get_pred_cdf_<pname>(xd). The macro expands in place, so the names `r`
+// and `xd` must be in scope wherever it is used.
+#define READ_REF_BIT(pname) \
+ aom_read_symbol(r, av1_get_pred_cdf_##pname(xd), 2, ACCT_STR)
+
+// Reads whether a compound block uses a unidirectional or a bidirectional
+// reference pair.
+static COMP_REFERENCE_TYPE read_comp_reference_type(const MACROBLOCKD *xd,
+                                                    aom_reader *r) {
+  const int type_ctx = av1_get_comp_reference_type_context(xd);
+  // UNIDIR_COMP_REFERENCE or BIDIR_COMP_REFERENCE
+  return (COMP_REFERENCE_TYPE)aom_read_symbol(
+      r, xd->tile_ctx->comp_ref_type_cdf[type_ctx], 2, ACCT_STR);
+}
+
+// Fills ref_frame[] with the two references used by skip mode, derived from
+// the frame-level indices cm->ref_frame_idx_0 and cm->ref_frame_idx_1.
+static void set_ref_frames_for_skip_mode(AV1_COMMON *const cm,
+                                         MV_REFERENCE_FRAME ref_frame[2]) {
+  const MV_REFERENCE_FRAME skip_mode_refs[2] = {
+    (MV_REFERENCE_FRAME)(LAST_FRAME + cm->ref_frame_idx_0),
+    (MV_REFERENCE_FRAME)(LAST_FRAME + cm->ref_frame_idx_1)
+  };
+  ref_frame[0] = skip_mode_refs[0];
+  ref_frame[1] = skip_mode_refs[1];
+}
+
+// Read the reference frame(s) of the current block.
+// Fills ref_frame[0..1] for the current inter block. ref_frame[1] is set to
+// NONE_FRAME for single-reference prediction. Nothing is read from the
+// bitstream when the block uses skip mode or when a segment feature
+// (reference frame, skip or global MV) already determines the reference.
+static void read_ref_frames(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *r, int segment_id,
+ MV_REFERENCE_FRAME ref_frame[2]) {
+ // Skip mode uses the frame-level reference pair.
+ if (xd->mi[0]->skip_mode) {
+ set_ref_frames_for_skip_mode(cm, ref_frame);
+ return;
+ }
+
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // The segment dictates the (single) reference frame.
+ ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id,
+ SEG_LVL_REF_FRAME);
+ ref_frame[1] = NONE_FRAME;
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // Segment-level skip / global MV blocks always predict from LAST_FRAME.
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
+
+ if (mode == COMPOUND_REFERENCE) {
+ const COMP_REFERENCE_TYPE comp_ref_type = read_comp_reference_type(xd, r);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ // Unidirectional pairs: a binary tree of up to three bits selects one
+ // of (BWDREF, ALTREF), (LAST, GOLDEN), (LAST, LAST3), (LAST, LAST2).
+ const int bit = READ_REF_BIT(uni_comp_ref_p);
+ if (bit) {
+ ref_frame[0] = BWDREF_FRAME;
+ ref_frame[1] = ALTREF_FRAME;
+ } else {
+ const int bit1 = READ_REF_BIT(uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = READ_REF_BIT(uni_comp_ref_p2);
+ if (bit2) {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = GOLDEN_FRAME;
+ } else {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = LAST3_FRAME;
+ }
+ } else {
+ ref_frame[0] = LAST_FRAME;
+ ref_frame[1] = LAST2_FRAME;
+ }
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ // Bidirectional pair: the forward reference goes to ref_frame[!idx]
+ // (slot 0) and the backward reference to ref_frame[idx] (slot 1).
+ const int idx = 1;
+ const int bit = READ_REF_BIT(comp_ref_p);
+ // Decode forward references.
+ if (!bit) {
+ const int bit1 = READ_REF_BIT(comp_ref_p1);
+ ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 1 : 0];
+ } else {
+ const int bit2 = READ_REF_BIT(comp_ref_p2);
+ ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
+ }
+
+ // Decode backward references.
+ const int bit_bwd = READ_REF_BIT(comp_bwdref_p);
+ if (!bit_bwd) {
+ const int bit1_bwd = READ_REF_BIT(comp_bwdref_p1);
+ ref_frame[idx] = cm->comp_bwd_ref[bit1_bwd];
+ } else {
+ ref_frame[idx] = cm->comp_bwd_ref[2];
+ }
+ } else if (mode == SINGLE_REFERENCE) {
+ // Single reference: the first bit separates the backward group
+ // (BWDREF / ALTREF2 / ALTREF) from the forward group
+ // (LAST / LAST2 / LAST3 / GOLDEN); further bits pick the frame.
+ const int bit0 = READ_REF_BIT(single_ref_p1);
+ if (bit0) {
+ const int bit1 = READ_REF_BIT(single_ref_p2);
+ if (!bit1) {
+ const int bit5 = READ_REF_BIT(single_ref_p6);
+ ref_frame[0] = bit5 ? ALTREF2_FRAME : BWDREF_FRAME;
+ } else {
+ ref_frame[0] = ALTREF_FRAME;
+ }
+ } else {
+ const int bit2 = READ_REF_BIT(single_ref_p3);
+ if (bit2) {
+ const int bit4 = READ_REF_BIT(single_ref_p5);
+ ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
+ } else {
+ const int bit3 = READ_REF_BIT(single_ref_p4);
+ ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
+ }
+ }
+
+ ref_frame[1] = NONE_FRAME;
+ } else {
+ assert(0 && "Invalid prediction mode.");
+ }
+ }
+}
+
+// Determines the interpolation filters of the current block. Filters are only
+// read from the bitstream when interpolation is needed and the frame-level
+// filter is SWITCHABLE. With dual filters disabled a single filter is read
+// and used for both directions.
+static INLINE void read_mb_interp_filter(AV1_COMMON *const cm,
+                                         MACROBLOCKD *const xd,
+                                         MB_MODE_INFO *const mbmi,
+                                         aom_reader *r) {
+  if (!av1_is_interp_needed(xd)) {
+    set_default_interp_filters(mbmi, cm->interp_filter);
+    return;
+  }
+
+  if (cm->interp_filter != SWITCHABLE) {
+    mbmi->interp_filters = av1_broadcast_interp_filter(cm->interp_filter);
+    return;
+  }
+
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+  const int first_ctx = av1_get_pred_context_switchable_interp(xd, 0);
+  const InterpFilter first_filter = (InterpFilter)aom_read_symbol(
+      r, ec_ctx->switchable_interp_cdf[first_ctx], SWITCHABLE_FILTERS,
+      ACCT_STR);
+
+  InterpFilter second_filter = first_filter;
+  if (cm->seq_params.enable_dual_filter != 0) {
+    const int second_ctx = av1_get_pred_context_switchable_interp(xd, 1);
+    second_filter = (InterpFilter)aom_read_symbol(
+        r, ec_ctx->switchable_interp_cdf[second_ctx], SWITCHABLE_FILTERS,
+        ACCT_STR);
+  }
+
+  // The index system works as: (0, 1) -> (vertical, horizontal) filter types
+  mbmi->interp_filters = av1_make_interp_filters(first_filter, second_filter);
+}
+
+// Reads the prediction information of an intra block inside an inter frame:
+// luma mode and angle delta, chroma mode (with CfL alphas and angle delta)
+// when the block carries chroma, then palette and filter-intra information.
+static void read_intra_block_mode_info(AV1_COMMON *const cm, const int mi_row,
+                                       const int mi_col, MACROBLOCKD *const xd,
+                                       MB_MODE_INFO *const mbmi,
+                                       aom_reader *r) {
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int angle_delta_allowed = av1_use_angle_delta(bsize);
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+
+  mbmi->mode = read_intra_mode(r, ec_ctx->y_mode_cdf[size_group_lookup[bsize]]);
+
+  // Angle deltas are only coded for directional modes.
+  if (angle_delta_allowed && av1_is_directional_mode(mbmi->mode)) {
+    mbmi->angle_delta[PLANE_TYPE_Y] =
+        read_angle_delta(r, ec_ctx->angle_delta_cdf[mbmi->mode - V_PRED]);
+  } else {
+    mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+  }
+
+  const int has_chroma =
+      is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                          xd->plane[1].subsampling_y);
+  xd->cfl.is_chroma_reference = has_chroma;
+
+  if (cm->seq_params.monochrome || !has_chroma) {
+    // Avoid decoding angle_info if there is no chroma prediction
+    mbmi->uv_mode = UV_DC_PRED;
+  } else {
+    mbmi->uv_mode =
+        read_intra_mode_uv(ec_ctx, r, is_cfl_allowed(xd), mbmi->mode);
+    if (mbmi->uv_mode == UV_CFL_PRED) {
+      mbmi->cfl_alpha_idx =
+          read_cfl_alphas(xd->tile_ctx, r, &mbmi->cfl_alpha_signs);
+    }
+    if (angle_delta_allowed &&
+        av1_is_directional_mode(get_uv_mode(mbmi->uv_mode))) {
+      mbmi->angle_delta[PLANE_TYPE_UV] = read_angle_delta(
+          r, ec_ctx->angle_delta_cdf[mbmi->uv_mode - V_PRED]);
+    } else {
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+    }
+  }
+  xd->cfl.store_y = store_cfl_required(cm, xd);
+
+  // Palette sizes default to "no palette" unless signalled below.
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+    read_palette_mode_info(cm, xd, mi_row, mi_col, r);
+  }
+
+  read_filter_intra_mode_info(cm, xd, r);
+}
+
+// Returns 1 when both components of *mv lie strictly between MV_LOW and
+// MV_UPP, and 0 otherwise.
+static INLINE int is_mv_valid(const MV *mv) {
+  if (mv->row <= MV_LOW || mv->row >= MV_UPP) return 0;
+  if (mv->col <= MV_LOW || mv->col >= MV_UPP) return 0;
+  return 1;
+}
+
+// Sets mv[0] (and mv[1] for compound modes) according to the prediction
+// mode. "NEW" components are read from the bitstream as a difference to
+// ref_mv[], "NEAREST"/"NEAR" components are copied from the supplied
+// candidates and "GLOBAL" components come from the global motion model of
+// the reference frame. Returns 1 when the resulting vector(s) are valid and
+// 0 otherwise, including for a mode that is not an inter mode.
+static INLINE int assign_mv(AV1_COMMON *cm, MACROBLOCKD *xd,
+                            PREDICTION_MODE mode,
+                            MV_REFERENCE_FRAME ref_frame[2], int_mv mv[2],
+                            int_mv ref_mv[2], int_mv nearest_mv[2],
+                            int_mv near_mv[2], int mi_row, int mi_col,
+                            int is_compound, int allow_hp, aom_reader *r) {
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  // All coded motion vectors share the same entropy context.
+  nmv_context *const nmvc = &xd->tile_ctx->nmvc;
+
+  // Frames that force integer motion vectors never code a sub-pel part.
+  if (cm->cur_frame_force_integer_mv) allow_hp = MV_SUBPEL_NONE;
+
+  switch (mode) {
+    case NEWMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+      break;
+    case NEARESTMV:
+      mv[0].as_int = nearest_mv[0].as_int;
+      break;
+    case NEARMV:
+      mv[0].as_int = near_mv[0].as_int;
+      break;
+    case GLOBALMV: {
+      const int_mv gm_mv = gm_get_motion_vector(
+          &cm->global_motion[ref_frame[0]], cm->allow_high_precision_mv, bsize,
+          mi_col, mi_row, cm->cur_frame_force_integer_mv);
+      mv[0].as_int = gm_mv.as_int;
+      break;
+    }
+    case NEW_NEWMV:
+      assert(is_compound);
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
+      break;
+    case NEAREST_NEARESTMV:
+      assert(is_compound);
+      mv[0].as_int = nearest_mv[0].as_int;
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    case NEAR_NEARMV:
+      assert(is_compound);
+      mv[0].as_int = near_mv[0].as_int;
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    case NEW_NEARESTMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+      assert(is_compound);
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    case NEAREST_NEWMV:
+      mv[0].as_int = nearest_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
+      assert(is_compound);
+      break;
+    case NEAR_NEWMV:
+      mv[0].as_int = near_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, nmvc, allow_hp);
+      assert(is_compound);
+      break;
+    case NEW_NEARMV:
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, nmvc, allow_hp);
+      assert(is_compound);
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    case GLOBAL_GLOBALMV: {
+      assert(is_compound);
+      const int_mv gm_mv0 = gm_get_motion_vector(
+          &cm->global_motion[ref_frame[0]], cm->allow_high_precision_mv, bsize,
+          mi_col, mi_row, cm->cur_frame_force_integer_mv);
+      mv[0].as_int = gm_mv0.as_int;
+      const int_mv gm_mv1 = gm_get_motion_vector(
+          &cm->global_motion[ref_frame[1]], cm->allow_high_precision_mv, bsize,
+          mi_col, mi_row, cm->cur_frame_force_integer_mv);
+      mv[1].as_int = gm_mv1.as_int;
+      break;
+    }
+    default:
+      return 0;
+  }
+
+  if (!is_mv_valid(&mv[0].as_mv)) return 0;
+  if (is_compound && !is_mv_valid(&mv[1].as_mv)) return 0;
+  return 1;
+}
+
+// Returns non-zero when the current block is inter predicted. The decision is
+// implied by the segment when the reference-frame or global-MV segment
+// feature is active; otherwise it is read from the bitstream.
+static int read_is_inter_block(AV1_COMMON *const cm, MACROBLOCKD *const xd,
+                               int segment_id, aom_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+    const int seg_ref = get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+    return seg_ref >= LAST_FRAME && seg_ref != INTRA_FRAME;
+  }
+
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) return 1;
+
+  const int intra_inter_ctx = av1_get_intra_inter_context(xd);
+  return aom_read_symbol(r, xd->tile_ctx->intra_inter_cdf[intra_inter_ctx], 2,
+                         ACCT_STR);
+}
+
+#if DEC_MISMATCH_DEBUG
+// Debug helper, only compiled when DEC_MISMATCH_DEBUG is non-zero. Prints the
+// decoded mode information of one block to stdout, but only for the shown
+// frame whose index equals FRAME_TO_CHECK.
+static void dec_dump_logs(AV1_COMMON *cm, MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, int16_t mode_ctx) {
+ // mv[1] stays zero for single-reference blocks.
+ int_mv mv[2] = { { 0 } };
+ for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref)
+ mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+ // Split mode_ctx into its sub-contexts; -1 is printed for a sub-context
+ // that is not extracted for the block's mode.
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+ int16_t zeromv_ctx = -1;
+ int16_t refmv_ctx = -1;
+ if (mbmi->mode != NEWMV) {
+ zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ if (mbmi->mode != GLOBALMV)
+ refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ }
+
+// Index of the single frame for which information is dumped.
+#define FRAME_TO_CHECK 11
+ if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+ printf(
+ "=== DECODER ===: "
+ "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+ "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+ "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+ "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+ cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+ mbmi->sb_type, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+ mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+ mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx, zeromv_ctx,
+ refmv_ctx, mbmi->tx_size);
+ }
+}
+#endif // DEC_MISMATCH_DEBUG
+
+// Reads the prediction information of an inter block and stores it in *mbmi.
+// In bitstream order: reference frames, inter prediction mode and DRL index,
+// motion vectors, inter-intra information, motion mode, compound type
+// information and interpolation filters. For WARPED_CAUSAL blocks the warp
+// model is derived at the end.
+//
+// Corrupt input is reported in two ways: an inconsistent mode / reference
+// combination raises AOM_CODEC_CORRUPT_FRAME through aom_internal_error(),
+// and invalid motion vectors are merged into xd->corrupted.
+static void read_inter_block_mode_info(AV1Decoder *const pbi,
+                                       MACROBLOCKD *const xd,
+                                       MB_MODE_INFO *const mbmi, int mi_row,
+                                       int mi_col, aom_reader *r) {
+  AV1_COMMON *const cm = &pbi->common;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int allow_hp = cm->allow_high_precision_mv;
+  // Zero-initialized: for some modes (e.g. single-reference GLOBALMV) these
+  // candidates are never filled in, yet they are still copied into ref_mv[]
+  // below. Initializing them keeps those copies well defined.
+  int_mv nearestmv[2] = { { 0 } }, nearmv[2] = { { 0 } };
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES] = { { { 0 } } };
+  int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+  int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+  FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+  // Inter blocks never use a chroma intra mode or a palette.
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
+  const int is_compound = has_second_ref(mbmi);
+
+  MV_REFERENCE_FRAME ref_frame = av1_ref_frame_type(mbmi->ref_frame);
+  int_mv global_mvs[REF_FRAMES];
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, xd->ref_mv_count, xd->ref_mv_stack,
+                   ref_mvs, global_mvs, mi_row, mi_col, inter_mode_ctx);
+
+  int mode_ctx = av1_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame);
+  mbmi->ref_mv_idx = 0;
+
+  // Prediction mode: implied by skip mode or by the segment features,
+  // otherwise read from the bitstream.
+  if (mbmi->skip_mode) {
+    assert(is_compound);
+    mbmi->mode = NEAREST_NEARESTMV;
+  } else {
+    if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) ||
+        segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_GLOBALMV)) {
+      mbmi->mode = GLOBALMV;
+    } else {
+      if (is_compound)
+        mbmi->mode = read_inter_compound_mode(xd, r, mode_ctx);
+      else
+        mbmi->mode = read_inter_mode(ec_ctx, r, mode_ctx);
+      if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV ||
+          have_nearmv_in_inter_mode(mbmi->mode))
+        read_drl_idx(ec_ctx, xd, mbmi, r);
+    }
+  }
+
+  if (is_compound != is_inter_compound_mode(mbmi->mode)) {
+    aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                       "Prediction mode %d invalid with ref frame %d %d",
+                       mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  }
+
+  // Candidate motion vectors for single-reference prediction.
+  if (!is_compound && mbmi->mode != GLOBALMV) {
+    av1_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[0]], &nearestmv[0],
+                          &nearmv[0], cm->cur_frame_force_integer_mv);
+  }
+
+  // Candidate motion vectors for compound prediction come straight from the
+  // reference MV stack and are lowered to the frame's MV precision.
+  if (is_compound && mbmi->mode != GLOBAL_GLOBALMV) {
+    int ref_mv_idx = mbmi->ref_mv_idx + 1;
+    nearestmv[0] = xd->ref_mv_stack[ref_frame][0].this_mv;
+    nearestmv[1] = xd->ref_mv_stack[ref_frame][0].comp_mv;
+    nearmv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+    nearmv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+    lower_mv_precision(&nearestmv[0].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearestmv[1].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[0].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+    lower_mv_precision(&nearmv[1].as_mv, allow_hp,
+                       cm->cur_frame_force_integer_mv);
+  } else if (mbmi->ref_mv_idx > 0 && mbmi->mode == NEARMV) {
+    int_mv cur_mv =
+        xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
+    nearmv[0] = cur_mv;
+  }
+
+  // Prediction used as the base for motion vectors coded as a difference.
+  int_mv ref_mv[2];
+  ref_mv[0] = nearestmv[0];
+  ref_mv[1] = nearestmv[1];
+
+  if (is_compound) {
+    int ref_mv_idx = mbmi->ref_mv_idx;
+    // Special case: NEAR_NEWMV and NEW_NEARMV modes use
+    // 1 + mbmi->ref_mv_idx (like NEARMV) instead of
+    // mbmi->ref_mv_idx (like NEWMV)
+    if (mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV)
+      ref_mv_idx = 1 + mbmi->ref_mv_idx;
+
+    // TODO(jingning, yunqing): Do we need a lower_mv_precision() call here?
+    if (compound_ref0_mode(mbmi->mode) == NEWMV)
+      ref_mv[0] = xd->ref_mv_stack[ref_frame][ref_mv_idx].this_mv;
+
+    if (compound_ref1_mode(mbmi->mode) == NEWMV)
+      ref_mv[1] = xd->ref_mv_stack[ref_frame][ref_mv_idx].comp_mv;
+  } else {
+    if (mbmi->mode == NEWMV) {
+      if (xd->ref_mv_count[ref_frame] > 1)
+        ref_mv[0] = xd->ref_mv_stack[ref_frame][mbmi->ref_mv_idx].this_mv;
+    }
+  }
+
+  if (mbmi->skip_mode) {
+    assert(mbmi->mode == NEAREST_NEARESTMV);
+    mbmi->mv[0].as_int = nearestmv[0].as_int;
+    mbmi->mv[1].as_int = nearestmv[1].as_int;
+  } else {
+    int mv_corrupted_flag =
+        !assign_mv(cm, xd, mbmi->mode, mbmi->ref_frame, mbmi->mv, ref_mv,
+                   nearestmv, nearmv, mi_row, mi_col, is_compound, allow_hp, r);
+    aom_merge_corrupted_flag(&xd->corrupted, mv_corrupted_flag);
+  }
+
+  // Inter-intra prediction. Selecting it turns ref_frame[1] into
+  // INTRA_FRAME.
+  mbmi->use_wedge_interintra = 0;
+  if (cm->seq_params.enable_interintra_compound && !mbmi->skip_mode &&
+      is_interintra_allowed(mbmi)) {
+    const int bsize_group = size_group_lookup[bsize];
+    const int interintra =
+        aom_read_symbol(r, ec_ctx->interintra_cdf[bsize_group], 2, ACCT_STR);
+    assert(mbmi->ref_frame[1] == NONE_FRAME);
+    if (interintra) {
+      const INTERINTRA_MODE interintra_mode =
+          read_interintra_mode(xd, r, bsize_group);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      mbmi->interintra_mode = interintra_mode;
+      mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+      mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+      mbmi->filter_intra_mode_info.use_filter_intra = 0;
+      if (is_interintra_wedge_used(bsize)) {
+        mbmi->use_wedge_interintra = aom_read_symbol(
+            r, ec_ctx->wedge_interintra_cdf[bsize], 2, ACCT_STR);
+        if (mbmi->use_wedge_interintra) {
+          mbmi->interintra_wedge_index =
+              aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
+          mbmi->interintra_wedge_sign = 0;
+        }
+      }
+    }
+  }
+
+  for (int ref = 0; ref < 1 + has_second_ref(mbmi); ++ref) {
+    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+    xd->block_refs[ref] = ref_buf;
+  }
+
+  // Motion mode. It is not read for inter-intra blocks, which keep
+  // SIMPLE_TRANSLATION.
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  if (is_motion_variation_allowed_bsize(mbmi->sb_type) && !mbmi->skip_mode &&
+      !has_second_ref(mbmi))
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+
+  if (mbmi->ref_frame[1] != INTRA_FRAME)
+    mbmi->motion_mode = read_motion_mode(cm, xd, mbmi, r);
+
+  // init
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = 1;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+
+  if (has_second_ref(mbmi) && !mbmi->skip_mode) {
+    // Read idx to indicate current compound inter prediction mode group
+    const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                     cm->seq_params.enable_masked_compound;
+
+    if (masked_compound_used) {
+      const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+      mbmi->comp_group_idx = aom_read_symbol(
+          r, ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2, ACCT_STR);
+    }
+
+    if (mbmi->comp_group_idx == 0) {
+      if (cm->seq_params.enable_jnt_comp) {
+        const int comp_index_ctx = get_comp_index_context(cm, xd);
+        mbmi->compound_idx = aom_read_symbol(
+            r, ec_ctx->compound_index_cdf[comp_index_ctx], 2, ACCT_STR);
+      } else {
+        // Distance-weighted compound is disabled, so always use average
+        mbmi->compound_idx = 1;
+      }
+    } else {
+      assert(cm->reference_mode != SINGLE_REFERENCE &&
+             is_inter_compound_mode(mbmi->mode) &&
+             mbmi->motion_mode == SIMPLE_TRANSLATION);
+      assert(masked_compound_used);
+
+      // compound_diffwtd, wedge
+      if (is_interinter_compound_used(COMPOUND_WEDGE, bsize))
+        mbmi->interinter_comp.type =
+            1 + aom_read_symbol(r, ec_ctx->compound_type_cdf[bsize],
+                                COMPOUND_TYPES - 1, ACCT_STR);
+      else
+        mbmi->interinter_comp.type = COMPOUND_DIFFWTD;
+
+      if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+        assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+        mbmi->interinter_comp.wedge_index =
+            aom_read_symbol(r, ec_ctx->wedge_idx_cdf[bsize], 16, ACCT_STR);
+        mbmi->interinter_comp.wedge_sign = aom_read_bit(r, ACCT_STR);
+      } else {
+        assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+        mbmi->interinter_comp.mask_type =
+            aom_read_literal(r, MAX_DIFFWTD_MASK_BITS, ACCT_STR);
+      }
+    }
+  }
+
+  read_mb_interp_filter(cm, xd, mbmi, r);
+
+  // Derive the local warp model from the neighbouring samples collected
+  // above; a failed fit marks the model as invalid.
+  if (mbmi->motion_mode == WARPED_CAUSAL) {
+    mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+    mbmi->wm_params.invalid = 0;
+
+    if (mbmi->num_proj_ref > 1)
+      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                         mbmi->num_proj_ref, bsize);
+
+    if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                        mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+                        &mbmi->wm_params, mi_row, mi_col)) {
+#if WARPED_MOTION_DEBUG
+      printf("Warning: unexpected warped model from aomenc\n");
+#endif
+      mbmi->wm_params.invalid = 1;
+    }
+  }
+
+  xd->cfl.is_chroma_reference =
+      is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                          cm->seq_params.subsampling_y);
+  xd->cfl.store_y = store_cfl_required(cm, xd);
+
+#if DEC_MISMATCH_DEBUG
+  // Pass the block's mode info; this function has no variable named `mi`.
+  dec_dump_logs(cm, mbmi, mi_row, mi_col, mode_ctx);
+#endif  // DEC_MISMATCH_DEBUG
+}
+
+// Reads the mode info of one block in an inter frame: segment id, skip mode
+// and skip flags, CDEF and delta q / loop filter information, followed by
+// either the inter or the intra block information. Skip-mode blocks are
+// always inter blocks with skip set.
+static void read_inter_frame_mode_info(AV1Decoder *const pbi,
+                                       MACROBLOCKD *const xd, int mi_row,
+                                       int mi_col, aom_reader *r) {
+  AV1_COMMON *const cm = &pbi->common;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  mbmi->mv[0].as_int = 0;
+  mbmi->mv[1].as_int = 0;
+
+  // The segment id reader is invoked once before and once after the skip
+  // flags (second-to-last argument 1, then 0).
+  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 1, r);
+
+  mbmi->skip_mode = read_skip_mode(cm, xd, mbmi->segment_id, r);
+  if (mbmi->skip_mode) {
+    mbmi->skip = 1;
+  } else {
+    mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+  }
+
+  mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, 0, r);
+
+  read_cdef(cm, r, xd, mi_col, mi_row);
+  read_delta_q_params(cm, xd, mi_row, mi_col, r);
+
+  int is_inter = 1;
+  if (!mbmi->skip_mode) {
+    is_inter = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+  }
+
+  mbmi->current_qindex = xd->current_qindex;
+
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  if (!is_inter) {
+    read_intra_block_mode_info(cm, mi_row, mi_col, xd, mbmi, r);
+    return;
+  }
+  read_inter_block_mode_info(pbi, xd, mbmi, mi_row, mi_col, r);
+}
+
+// Marks the motion vector buffer entries covered by the current block as
+// having no reference frame. The buffer is addressed at half the mode-info
+// resolution: positions are halved and extents are rounded up to that grid.
+static void intra_copy_frame_mvs(AV1_COMMON *const cm, int mi_row, int mi_col,
+                                 int x_mis, int y_mis) {
+  const int stride = ROUND_POWER_OF_TWO(cm->mi_cols, 1);
+  const int num_cols = ROUND_POWER_OF_TWO(x_mis, 1);
+  const int num_rows = ROUND_POWER_OF_TWO(y_mis, 1);
+  MV_REF *row_ptr =
+      cm->cur_frame->mvs + (mi_row >> 1) * stride + (mi_col >> 1);
+
+  for (int row = 0; row < num_rows; ++row) {
+    for (int col = 0; col < num_cols; ++col) {
+      row_ptr[col].ref_frame = NONE_FRAME;
+    }
+    row_ptr += stride;
+  }
+}
+
+// Entry point for reading the mode info of the block at (mi_row, mi_col).
+// Dispatches to the intra-only or inter frame reader and then updates the
+// frame's motion vector buffer for the area covered by the block
+// (x_mis by y_mis mode-info units).
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
+                        int mi_col, aom_reader *r, int x_mis, int y_mis) {
+  AV1_COMMON *const cm = &pbi->common;
+  MB_MODE_INFO *const mi = xd->mi[0];
+  mi->use_intrabc = 0;
+
+  if (!frame_is_intra_only(cm)) {
+    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
+    av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+    return;
+  }
+
+  read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+  intra_copy_frame_mvs(cm, mi_row, mi_col, x_mis, y_mis);
+}
diff --git a/third_party/aom/av1/decoder/decodemv.h b/third_party/aom/av1/decoder/decodemv.h
new file mode 100644
index 0000000000..1625e5bd23
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodemv.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODEMV_H_
+#define AOM_AV1_DECODER_DECODEMV_H_
+
+#include "aom_dsp/bitreader.h"
+
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Reads the mode info of the block at (mi_row, mi_col) from `r` and updates
+// the current frame's motion-vector buffer for the x_mis by y_mis area.
+void av1_read_mode_info(AV1Decoder *const pbi, MACROBLOCKD *xd, int mi_row,
+                        int mi_col, aom_reader *r, int x_mis, int y_mis);
+
+// Kept inside the extern "C" region so that C++ translation units including
+// this header see the same C linkage as for av1_read_mode_info().
+void av1_read_tx_type(const AV1_COMMON *const cm, MACROBLOCKD *xd, int blk_row,
+                      int blk_col, TX_SIZE tx_size, aom_reader *r);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODEMV_H_
diff --git a/third_party/aom/av1/decoder/decoder.c b/third_party/aom/av1/decoder/decoder.c
new file mode 100644
index 0000000000..a5f4fd67fa
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.c
@@ -0,0 +1,575 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_ports/aom_once.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/detokenize.h"
+#include "av1/decoder/obu.h"
+
+static void initialize_dec(void) { // Process-wide setup; av1_decoder_create() runs it through aom_once().
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_wedge_masks();
+}
+
+// Points the mode-info cursors back at the start of their backing arrays and
+// zeroes mi_stride * mi_rows entries of the mi grid. Installed as
+// cm->setup_mi by av1_decoder_create().
+static void dec_setup_mi(AV1_COMMON *cm) {
+  const size_t grid_bytes =
+      cm->mi_stride * cm->mi_rows * sizeof(*cm->mi_grid_base);
+
+  cm->mi = cm->mip;
+  cm->mi_grid_visible = cm->mi_grid_base;
+  memset(cm->mi_grid_base, 0, grid_bytes);
+}
+
+// Allocates zero-initialised storage for mi_size mode-info entries and for the
+// matching grid of mode-info pointers. Returns 0 on success and 1 as soon as
+// either allocation fails; nothing is freed here on failure.
+static int av1_dec_alloc_mi(AV1_COMMON *cm, int mi_size) {
+  cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+  if (cm->mip == NULL) return 1;
+  cm->mi_alloc_size = mi_size;
+
+  cm->mi_grid_base =
+      (MB_MODE_INFO **)aom_calloc(mi_size, sizeof(*cm->mi_grid_base));
+  return cm->mi_grid_base == NULL;
+}
+
+// Releases both mode-info allocations and resets the bookkeeping so that the
+// pointers are NULL and the recorded size is zero afterwards.
+static void dec_free_mi(AV1_COMMON *cm) {
+  aom_free(cm->mi_grid_base);
+  aom_free(cm->mip);
+
+  cm->mi_grid_base = NULL;
+  cm->mip = NULL;
+  cm->mi_alloc_size = 0;
+}
+
+// Allocates and initialises a decoder instance that uses `pool` as its frame
+// buffer pool.
+//
+// Returns the new decoder, or NULL when the decoder structure cannot be
+// allocated or when setup fails and control comes back through the setjmp()
+// handler below.
+AV1Decoder *av1_decoder_create(BufferPool *const pool) {
+  AV1Decoder *volatile const pbi = aom_memalign(32, sizeof(*pbi));
+  AV1_COMMON *volatile const cm = pbi ? &pbi->common : NULL;
+
+  if (!cm) return NULL;
+
+  av1_zero(*pbi);
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error.jmp)) {
+    cm->error.setjmp = 0;
+    // av1_decoder_remove() does not release the frame contexts, so free
+    // whichever of them had been allocated before the failure. Both pointers
+    // are NULL until assigned because *pbi was zeroed above.
+    aom_free(cm->frame_contexts);
+    cm->frame_contexts = NULL;
+    aom_free(cm->fc);
+    cm->fc = NULL;
+    av1_decoder_remove(pbi);
+    return NULL;
+  }
+
+  cm->error.setjmp = 1;
+
+  CHECK_MEM_ERROR(cm, cm->fc,
+                  (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+  CHECK_MEM_ERROR(cm, cm->frame_contexts,
+                  (FRAME_CONTEXT *)aom_memalign(
+                      32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+  memset(cm->fc, 0, sizeof(*cm->fc));
+  memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+  pbi->need_resync = 1;
+  aom_once(initialize_dec);
+
+  // Initialize the references to not point to any frame buffers.
+  memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
+  memset(&cm->next_ref_frame_map, -1, sizeof(cm->next_ref_frame_map));
+
+  cm->current_video_frame = 0;
+  pbi->decoding_first_frame = 1;
+  pbi->common.buffer_pool = pool;
+
+  cm->seq_params.bit_depth = AOM_BITS_8;
+  cm->dequant_bit_depth = AOM_BITS_8;
+
+  // Decoder-specific mode-info allocation hooks.
+  cm->alloc_mi = av1_dec_alloc_mi;
+  cm->free_mi = dec_free_mi;
+  cm->setup_mi = dec_setup_mi;
+
+  av1_loop_filter_init(cm);
+
+  av1_qm_init(cm);
+  av1_loop_restoration_precal();
+#if CONFIG_ACCOUNTING
+  pbi->acct_enabled = 1;
+  aom_accounting_init(&pbi->accounting);
+#endif
+
+  cm->error.setjmp = 0;
+
+  aom_get_worker_interface()->init(&pbi->lf_worker);
+
+  return pbi;
+}
+
+// Releases the job queue (and, with CONFIG_MULTITHREAD, the job mutex) owned
+// by `tile_mt_info`, then zeroes the whole structure. A NULL argument is
+// ignored.
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info) {
+  if (tile_mt_info == NULL) return;
+
+#if CONFIG_MULTITHREAD
+  if (tile_mt_info->job_mutex != NULL) {
+    pthread_mutex_destroy(tile_mt_info->job_mutex);
+    aom_free(tile_mt_info->job_mutex);
+  }
+#endif
+  aom_free(tile_mt_info->job_queue);
+
+  // clear the structure as the source of this call may be a resize in which
+  // case this call will be followed by an _alloc() which may fail.
+  av1_zero(*tile_mt_info);
+}
+
+// Frees pbi->cb_buffer_base and resets the pointer and its recorded
+// allocation size.
+void av1_dec_free_cb_buf(AV1Decoder *pbi) {
+  aom_free(pbi->cb_buffer_base);
+
+  pbi->cb_buffer_alloc_size = 0;
+  pbi->cb_buffer_base = NULL;
+}
+
+void av1_decoder_remove(AV1Decoder *pbi) { // Ends the decoder's workers, frees what it owns, then frees pbi itself.
+ int i;
+
+ if (!pbi) return; // A NULL decoder is accepted and ignored.
+
+ // Free the tile list output buffer.
+ if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+ pbi->tile_list_output = NULL;
+
+ aom_get_worker_interface()->end(&pbi->lf_worker);
+ aom_free(pbi->lf_worker.data1);
+
+ if (pbi->thread_data) {
+ for (int worker_idx = 0; worker_idx < pbi->max_threads - 1; worker_idx++) { // max_threads - 1 entries are torn down.
+ DecWorkerData *const thread_data = pbi->thread_data + worker_idx;
+ av1_free_mc_tmp_buf(thread_data->td);
+ aom_free(thread_data->td);
+ }
+ aom_free(pbi->thread_data);
+ }
+
+ for (i = 0; i < pbi->num_workers; ++i) {
+ AVxWorker *const worker = &pbi->tile_workers[i];
+ aom_get_worker_interface()->end(worker);
+ }
+#if CONFIG_MULTITHREAD
+ if (pbi->row_mt_mutex_ != NULL) {
+ pthread_mutex_destroy(pbi->row_mt_mutex_);
+ aom_free(pbi->row_mt_mutex_);
+ }
+ if (pbi->row_mt_cond_ != NULL) {
+ pthread_cond_destroy(pbi->row_mt_cond_);
+ aom_free(pbi->row_mt_cond_);
+ }
+#endif
+ for (i = 0; i < pbi->allocated_tiles; i++) {
+ TileDataDec *const tile_data = pbi->tile_data + i;
+ av1_dec_row_mt_dealloc(&tile_data->dec_row_mt_sync);
+ }
+ aom_free(pbi->tile_data);
+ aom_free(pbi->tile_workers);
+
+ if (pbi->num_workers > 0) { // These three are only released when tile workers exist.
+ av1_loop_filter_dealloc(&pbi->lf_row_sync);
+ av1_loop_restoration_dealloc(&pbi->lr_row_sync, pbi->num_workers);
+ av1_dealloc_dec_jobs(&pbi->tile_mt_info);
+ }
+
+ av1_dec_free_cb_buf(pbi);
+#if CONFIG_ACCOUNTING
+ aom_accounting_clear(&pbi->accounting);
+#endif
+ av1_free_mc_tmp_buf(&pbi->td);
+
+ aom_free(pbi); // NOTE(review): pbi->common.fc / frame_contexts are not freed here - confirm the caller releases them.
+}
+
+// Invokes `visit` for each of the first (at most two) planes of a non-inter
+// block that is a chroma reference for that plane and has a nonzero palette
+// size. Planes that are not a chroma reference must have palette size 0.
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+                       int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+                       palette_visitor_fn_t visit) {
+  if (is_inter_block(xd->mi[0])) return;
+
+  for (int plane = 0; plane < AOMMIN(2, av1_num_planes(&pbi->common));
+       ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int is_chroma_ref = is_chroma_reference(
+        mi_row, mi_col, bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (!is_chroma_ref) {
+      assert(xd->mi[0]->palette_mode_info.palette_size[plane] == 0);
+      continue;
+    }
+
+    if (xd->mi[0]->palette_mode_info.palette_size[plane]) {
+      visit(xd, plane, r);
+    }
+  }
+}
+
+// Returns 1 when both buffers have the same luma and chroma width and height,
+// 0 otherwise.
+static int equal_dimensions(const YV12_BUFFER_CONFIG *a,
+                            const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height || a->y_width != b->y_width) return 0;
+  if (a->uv_height != b->uv_height || a->uv_width != b->uv_width) return 0;
+  return 1;
+}
+
+// Copies reference frame `idx` into the caller's buffer `sd`.
+//
+// Reports AOM_CODEC_ERROR through aom_internal_error() when the reference
+// does not exist or when `sd` has different dimensions. Returns
+// AOM_CODEC_ERROR for a missing reference, otherwise cm->error.error_code.
+aom_codec_err_t av1_copy_reference_dec(AV1Decoder *pbi, int idx,
+                                       YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *cm = &pbi->common;
+  const int num_planes = av1_num_planes(cm);
+  const YV12_BUFFER_CONFIG *const ref = get_ref_frame(cm, idx);
+
+  if (ref == NULL) {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    return AOM_CODEC_ERROR;
+  }
+
+  if (equal_dimensions(ref, sd)) {
+    aom_yv12_copy_frame(ref, sd, num_planes);
+  } else {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+
+  return cm->error.error_code;
+}
+
+// Stricter form of equal_dimensions(): the buffers must also agree on both
+// strides, on the border, and on the YV12_FLAG_HIGHBITDEPTH flag bit.
+// Returns 1 on a full match, 0 otherwise.
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                       const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height || a->y_width != b->y_width) return 0;
+  if (a->uv_height != b->uv_height || a->uv_width != b->uv_width) return 0;
+  if (a->y_stride != b->y_stride || a->uv_stride != b->uv_stride) return 0;
+  if (a->border != b->border) return 0;
+  return (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+         (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+// Replaces reference frame `idx` with the contents of `sd`.
+//
+// With use_external_ref == 0 the pixels of `sd` are copied into the reference
+// buffer (dimensions must match). Otherwise the reference buffer's plane
+// pointers are redirected to those of `sd` (dimensions, strides, border and
+// bit-depth flag must match) and the previous pointers are saved in
+// store_buf_adr.
+//
+// Returns AOM_CODEC_ERROR for a missing reference, otherwise
+// cm->error.error_code.
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+                                      int use_external_ref,
+                                      YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+
+  // Get the destination reference buffer.
+  YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame(cm, idx);
+  if (ref_buf == NULL) {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR, "No reference frame");
+    return AOM_CODEC_ERROR;
+  }
+
+  const int compatible = use_external_ref
+                             ? equal_dimensions_and_border(ref_buf, sd)
+                             : equal_dimensions(ref_buf, sd);
+
+  if (!compatible) {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  } else if (use_external_ref) {
+    // Overwrite the reference frame buffer pointers.
+    // Once we no longer need the external reference buffer, these pointers
+    // are restored.
+    ref_buf->store_buf_adr[0] = ref_buf->y_buffer;
+    ref_buf->store_buf_adr[1] = ref_buf->u_buffer;
+    ref_buf->store_buf_adr[2] = ref_buf->v_buffer;
+    ref_buf->y_buffer = sd->y_buffer;
+    ref_buf->u_buffer = sd->u_buffer;
+    ref_buf->v_buffer = sd->v_buffer;
+    ref_buf->use_external_reference_buffers = 1;
+  } else {
+    // Overwrite the reference frame buffer.
+    aom_yv12_copy_frame(sd, ref_buf, num_planes);
+  }
+
+  return cm->error.error_code;
+}
+
+// Copies `new_frame` into `sd` when the two buffers match in dimensions,
+// strides, border and bit-depth flag; otherwise reports AOM_CODEC_ERROR
+// through aom_internal_error(). Returns cm->error.error_code.
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+
+  if (equal_dimensions_and_border(new_frame, sd)) {
+    aom_yv12_copy_frame(new_frame, sd, num_planes);
+  } else {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+
+  return cm->error.error_code;
+}
+
+/* If any buffer updating is signaled it should be done here.
+
+ frame_decoded != 0: releases the references held on the old ref_frame_map
+ entries, installs next_ref_frame_map, and either queues the new frame for
+ output (when it is shown) or releases it.
+ frame_decoded == 0: only releases the new frame buffer.
+
+ Consumes a reference to cm->new_fb_idx: the reference is either released or
+ handed over to pbi->output_frame_index[].
+*/
+static void swap_frame_buffers(AV1Decoder *pbi, int frame_decoded) {
+ int ref_index = 0, mask;
+ AV1_COMMON *const cm = &pbi->common;
+ BufferPool *const pool = cm->buffer_pool;
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+
+ if (frame_decoded) {
+ lock_buffer_pool(pool);
+
+ // In ext-tile decoding, the camera frame header is only decoded once. So,
+ // we don't release the references here.
+ if (!pbi->camera_frame_header_ready) {
+ for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { // Walks the flags from bit 0 until no set bit remains.
+ const int old_idx = cm->ref_frame_map[ref_index];
+ // Current thread releases the holding of reference frame.
+ decrease_ref_count(old_idx, frame_bufs, pool);
+
+ // Release the reference frame holding in the reference map for the
+ // decoding of the next frame.
+ if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ ++ref_index;
+ }
+
+ // Current thread releases the holding of reference frame.
+ const int check_on_show_existing_frame =
+ !cm->show_existing_frame || cm->reset_decoder_state;
+ for (; ref_index < REF_FRAMES && check_on_show_existing_frame; // Remaining slots, continuing from where the mask loop stopped.
+ ++ref_index) {
+ const int old_idx = cm->ref_frame_map[ref_index];
+ decrease_ref_count(old_idx, frame_bufs, pool);
+ cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
+ }
+ }
+
+ YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+
+ if (cm->show_existing_frame || cm->show_frame) {
+ if (pbi->output_all_layers) {
+ // Append this frame to the output queue
+ if (pbi->num_output_frames >= MAX_NUM_SPATIAL_LAYERS) {
+ // We can't store the new frame anywhere, so drop it and return an
+ // error
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ } else {
+ pbi->output_frames[pbi->num_output_frames] = cur_frame;
+ pbi->output_frame_index[pbi->num_output_frames] = cm->new_fb_idx;
+ pbi->num_output_frames++;
+ }
+ } else {
+ // Replace any existing output frame
+ assert(pbi->num_output_frames == 0 || pbi->num_output_frames == 1);
+ if (pbi->num_output_frames > 0) {
+ decrease_ref_count((int)pbi->output_frame_index[0], frame_bufs, pool);
+ }
+ pbi->output_frames[0] = cur_frame;
+ pbi->output_frame_index[0] = cm->new_fb_idx;
+ pbi->num_output_frames = 1;
+ }
+ } else {
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); // Not shown: nothing keeps the new buffer for output.
+ }
+
+ unlock_buffer_pool(pool);
+ } else {
+ // Nothing was decoded, so just drop this frame buffer
+ lock_buffer_pool(pool);
+ decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+ unlock_buffer_pool(pool);
+ }
+
+ if (!pbi->camera_frame_header_ready) {
+ pbi->hold_ref_buf = 0;
+
+ // Invalidate these references until the next frame starts.
+ for (ref_index = 0; ref_index < INTER_REFS_PER_FRAME; ref_index++) {
+ cm->frame_refs[ref_index].idx = INVALID_IDX;
+ cm->frame_refs[ref_index].buf = NULL;
+ }
+ }
+}
+
+// Decodes the compressed data in [*psource, *psource + size) into a freshly
+// acquired frame buffer and updates the reference map and the output queue.
+// `psource` is also handed to aom_decode_frame_from_obus().
+//
+// A `size` of 0 signals missing frames; the first frame reference is then
+// marked as corrupted before decoding proceeds.
+//
+// Returns 0 on success, 1 when no frame buffer is free or when decoding left
+// an error in cm->error.error_code, and -1 when an error unwound through the
+// setjmp() handler.
+int av1_receive_compressed_data(AV1Decoder *pbi, size_t size,
+                                const uint8_t **psource) {
+  AV1_COMMON *volatile const cm = &pbi->common;
+  BufferPool *volatile const pool = cm->buffer_pool;
+  RefCntBuffer *volatile const frame_bufs = cm->buffer_pool->frame_bufs;
+  const uint8_t *source = *psource;
+  cm->error.error_code = AOM_CODEC_OK;
+
+  if (size == 0) {
+    // This is used to signal that we are missing frames.
+    // We do not know if the missing frame(s) was supposed to update
+    // any of the reference buffers, but we act conservative and
+    // mark only the last buffer as corrupted.
+    //
+    // TODO(jkoleszar): Error concealment is undefined and non-normative
+    // at this point, but if it becomes so, [0] may not always be the correct
+    // thing to do here.
+    //
+    // Test the pointer that is dereferenced instead of `idx > 0`: buffer
+    // index 0 is a valid index, and unused references carry a NULL buf.
+    if (cm->frame_refs[0].buf != NULL) {
+      cm->frame_refs[0].buf->corrupted = 1;
+    }
+  }
+
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+
+  // Find a free frame buffer. Return error if can not find any.
+  cm->new_fb_idx = get_free_fb(cm);
+  if (cm->new_fb_idx == INVALID_IDX) {
+    cm->error.error_code = AOM_CODEC_MEM_ERROR;
+    return 1;
+  }
+
+  // Assign a MV array to the frame buffer.
+  cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+
+  if (!pbi->camera_frame_header_ready) pbi->hold_ref_buf = 0;
+
+  pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
+
+  // The jmp_buf is valid only for the duration of the function that calls
+  // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+  // before it returns.
+  if (setjmp(cm->error.jmp)) {
+    const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+    int i;
+
+    cm->error.setjmp = 0;
+
+    // Synchronize all threads immediately as a subsequent decode call may
+    // cause a resize invalidating some allocations.
+    winterface->sync(&pbi->lf_worker);
+    for (i = 0; i < pbi->num_workers; ++i) {
+      winterface->sync(&pbi->tile_workers[i]);
+    }
+
+    lock_buffer_pool(pool);
+    // Release all the reference buffers if worker thread is holding them.
+    if (pbi->hold_ref_buf == 1) {
+      int ref_index = 0, mask;
+      for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        // Current thread releases the holding of reference frame.
+        decrease_ref_count(old_idx, frame_bufs, pool);
+
+        // Release the reference frame holding in the reference map for the
+        // decoding of the next frame.
+        if (mask & 1) decrease_ref_count(old_idx, frame_bufs, pool);
+        ++ref_index;
+      }
+
+      // Current thread releases the holding of reference frame.
+      const int check_on_show_existing_frame =
+          !cm->show_existing_frame || cm->reset_decoder_state;
+      for (; ref_index < REF_FRAMES && check_on_show_existing_frame;
+           ++ref_index) {
+        const int old_idx = cm->ref_frame_map[ref_index];
+        decrease_ref_count(old_idx, frame_bufs, pool);
+      }
+      pbi->hold_ref_buf = 0;
+    }
+    // Release current frame.
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+
+    aom_clear_system_state();
+    return -1;
+  }
+
+  cm->error.setjmp = 1;
+
+  int frame_decoded =
+      aom_decode_frame_from_obus(pbi, source, source + size, psource);
+
+  if (cm->error.error_code != AOM_CODEC_OK) {
+    // Decoding reported an error without unwinding: drop the new buffer.
+    lock_buffer_pool(pool);
+    decrease_ref_count(cm->new_fb_idx, frame_bufs, pool);
+    unlock_buffer_pool(pool);
+    cm->error.setjmp = 0;
+    return 1;
+  }
+
+#if TXCOEFF_TIMER
+  cm->cum_txcoeff_timer += cm->txcoeff_timer;
+  fprintf(stderr,
+          "txb coeff block number: %d, frame time: %ld, cum time %ld in us\n",
+          cm->txb_count, cm->txcoeff_timer, cm->cum_txcoeff_timer);
+  cm->txcoeff_timer = 0;
+  cm->txb_count = 0;
+#endif
+
+  // Note: At this point, this function holds a reference to cm->new_fb_idx
+  // in the buffer pool. This reference is consumed by swap_frame_buffers().
+  swap_frame_buffers(pbi, frame_decoded);
+
+  if (frame_decoded) {
+    pbi->decoding_first_frame = 0;
+  }
+
+  // swap_frame_buffers() may itself have recorded an error.
+  if (cm->error.error_code != AOM_CODEC_OK) {
+    cm->error.setjmp = 0;
+    return 1;
+  }
+
+  aom_clear_system_state();
+
+  if (!cm->show_existing_frame) {
+    cm->last_show_frame = cm->show_frame;
+
+    if (cm->seg.enabled) {
+      if (cm->prev_frame && (cm->mi_rows == cm->prev_frame->mi_rows) &&
+          (cm->mi_cols == cm->prev_frame->mi_cols)) {
+        cm->last_frame_seg_map = cm->prev_frame->seg_map;
+      } else {
+        cm->last_frame_seg_map = NULL;
+      }
+    }
+  }
+
+  // Update progress in frame parallel decode.
+  cm->last_width = cm->width;
+  cm->last_height = cm->height;
+  cm->last_tile_cols = cm->tile_cols;
+  cm->last_tile_rows = cm->tile_rows;
+  cm->error.setjmp = 0;
+
+  return 0;
+}
+
+// Fetches entry `index` of the output queue.
+//
+// On success stores the frame in *sd and a pointer to the film grain
+// parameters of its pool buffer in *grain_params, and returns 0. Returns -1
+// without touching the outputs when `index` is not below the queue length.
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+                      aom_film_grain_t **grain_params) {
+  RefCntBuffer *const pool_bufs = pbi->common.buffer_pool->frame_bufs;
+
+  if (index >= pbi->num_output_frames) return -1;
+
+  const size_t buf_idx = pbi->output_frame_index[index];
+  *sd = pbi->output_frames[index];
+  *grain_params = &pool_bufs[buf_idx].film_grain_params;
+
+  aom_clear_system_state();
+  return 0;
+}
+
+// Get the highest-spatial-layer output
+// TODO(david.barker): What should this do?
+//
+// Copies the last entry of the output queue into *frame and returns 0, or
+// returns -1 when the queue is empty.
+int av1_get_frame_to_show(AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame) {
+  const size_t queued = pbi->num_output_frames;
+
+  if (queued == 0) return -1;
+
+  *frame = *pbi->output_frames[queued - 1];
+  return 0;
+}
diff --git a/third_party/aom/av1/decoder/decoder.h b/third_party/aom/av1/decoder/decoder.h
new file mode 100644
index 0000000000..5ca939c245
--- /dev/null
+++ b/third_party/aom/av1/decoder/decoder.h
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODER_H_
+#define AOM_AV1_DECODER_DECODER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/decoder/dthread.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+#if CONFIG_INSPECTION
+#include "av1/decoder/inspection.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*decode_block_visitor_fn_t)(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ aom_reader *const r, const int plane,
+ const int row, const int col,
+ const TX_SIZE tx_size); // Type of the four *_block_visit coefficient/transform hooks in ThreadData.
+
+typedef void (*predict_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int mi_row, int mi_col,
+ BLOCK_SIZE bsize); // Type of ThreadData.predict_inter_block_visit.
+
+typedef void (*cfl_store_inter_block_visitor_fn_t)(AV1_COMMON *const cm,
+ MACROBLOCKD *const xd); // Type of ThreadData.cfl_store_inter_block_visit.
+
+typedef struct ThreadData { // Decoding state; AV1Decoder embeds one as `td`.
+ aom_reader *bit_reader;
+ DECLARE_ALIGNED(32, MACROBLOCKD, xd);
+ /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+ DECLARE_ALIGNED(32, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+ CB_BUFFER cb_buffer_base;
+ uint8_t *mc_buf[2];
+ int32_t mc_buf_size; // NOTE(review): presumably the size of each mc_buf entry - confirm against the allocator.
+ int mc_buf_use_highbd; // Boolean: whether the byte pointers stored in
+ // mc_buf were converted from highbd pointers.
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+ decode_block_visitor_fn_t read_coeffs_tx_intra_block_visit; // Per-block callbacks; see the *_visitor_fn_t typedefs for signatures.
+ decode_block_visitor_fn_t predict_and_recon_intra_block_visit;
+ decode_block_visitor_fn_t read_coeffs_tx_inter_block_visit;
+ decode_block_visitor_fn_t inverse_tx_inter_block_visit;
+ predict_inter_block_visitor_fn_t predict_inter_block_visit;
+ cfl_store_inter_block_visitor_fn_t cfl_store_inter_block_visit;
+} ThreadData;
+
+typedef struct AV1DecRowMTJobInfo { // NOTE(review): appears to identify one row-MT job by tile position and mi row - confirm against the job dispatch code.
+ int tile_row;
+ int tile_col;
+ int mi_row;
+} AV1DecRowMTJobInfo;
+
+typedef struct AV1DecRowMTSyncData { // Embedded in each TileDataDec as dec_row_mt_sync; released via av1_dec_row_mt_dealloc().
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *mutex_;
+ pthread_cond_t *cond_;
+#endif
+ int allocated_sb_rows;
+ int *cur_sb_col; // NOTE(review): presumably one entry per allocated superblock row - confirm against the allocator.
+ int sync_range;
+ int mi_rows;
+ int mi_cols;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int num_threads_working;
+} AV1DecRowMTSync;
+
+typedef struct AV1DecRowMTInfo { // Type of AV1Decoder.frame_row_mt_info.
+ int tile_rows_start;
+ int tile_rows_end;
+ int tile_cols_start;
+ int tile_cols_end;
+ int start_tile;
+ int end_tile;
+ int mi_rows_parse_done;
+ int mi_rows_decode_started;
+ int mi_rows_to_decode;
+ int row_mt_exit; // NOTE(review): looks like an exit flag for row-MT workers - confirm where it is set.
+} AV1DecRowMTInfo;
+
+typedef struct TileDataDec { // Element type of AV1Decoder.tile_data (allocated_tiles entries).
+ TileInfo tile_info;
+ aom_reader bit_reader;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ AV1DecRowMTSync dec_row_mt_sync; // Released per tile in av1_decoder_remove().
+} TileDataDec;
+
+typedef struct TileBufferDec { // Element type of AV1Decoder.tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS].
+ const uint8_t *data; // Read-only bytes; NOTE(review): presumably not owned by this struct - confirm.
+ size_t size;
+} TileBufferDec;
+
+typedef struct DataBuffer { // Pointer/length pair; type of AV1Decoder.obu_size_hdr.
+ const uint8_t *data;
+ size_t size;
+} DataBuffer;
+
+typedef struct EXTERNAL_REFERENCES { // Type of AV1Decoder.ext_refs.
+ YV12_BUFFER_CONFIG refs[MAX_EXTERNAL_REFERENCES];
+ int num; // NOTE(review): presumably the number of valid entries in refs - confirm against the setter.
+} EXTERNAL_REFERENCES;
+
+typedef struct TileJobsDec { // Element type of AV1DecTileMT.job_queue: pairs a tile buffer with its tile data.
+ TileBufferDec *tile_buffer;
+ TileDataDec *tile_data;
+} TileJobsDec;
+
+typedef struct AV1DecTileMTData { // Type of AV1Decoder.tile_mt_info; torn down and zeroed by av1_dealloc_dec_jobs().
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *job_mutex; // Destroyed and freed by av1_dealloc_dec_jobs() when non-NULL.
+#endif
+ TileJobsDec *job_queue; // Freed by av1_dealloc_dec_jobs().
+ int jobs_enqueued;
+ int jobs_dequeued;
+ int alloc_tile_rows;
+ int alloc_tile_cols;
+} AV1DecTileMT;
+
+typedef struct AV1Decoder { // One decoder instance; created by av1_decoder_create(), destroyed by av1_decoder_remove().
+ DECLARE_ALIGNED(32, MACROBLOCKD, mb);
+
+ DECLARE_ALIGNED(32, AV1_COMMON, common);
+
+ int refresh_frame_flags; // Bitmask walked from bit 0 in swap_frame_buffers(); a set bit releases one extra reference on that ref_frame_map slot.
+
+ // TODO(hkuang): Combine this with cur_buf in macroblockd as they are
+ // the same.
+ RefCntBuffer *cur_buf; // Current decoding frame buffer.
+
+ AVxWorker *frame_worker_owner; // frame_worker that owns this pbi.
+ AVxWorker lf_worker;
+ AV1LfSync lf_row_sync;
+ AV1LrSync lr_row_sync;
+ AV1LrStruct lr_ctxt;
+ AVxWorker *tile_workers;
+ int num_workers; // Number of tile_workers entries ended in av1_decoder_remove().
+ DecWorkerData *thread_data; // av1_decoder_remove() tears down max_threads - 1 entries.
+ ThreadData td;
+ TileDataDec *tile_data;
+ int allocated_tiles; // Number of tile_data entries released in av1_decoder_remove().
+
+ TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+ AV1DecTileMT tile_mt_info;
+
+ // Each time the decoder is called, we expect to receive a full temporal unit.
+ // This can contain up to one shown frame per spatial layer in the current
+ // operating point (note that some layers may be entirely omitted).
+ // If the 'output_all_layers' option is true, we save all of these shown
+ // frames so that they can be returned to the application. If the
+ // 'output_all_layers' option is false, then we only output one image per
+ // temporal unit.
+ //
+ // Note: The saved buffers are released at the start of the next time the
+ // application calls aom_codec_decode().
+ int output_all_layers;
+ YV12_BUFFER_CONFIG *output_frames[MAX_NUM_SPATIAL_LAYERS];
+ size_t output_frame_index[MAX_NUM_SPATIAL_LAYERS]; // Buffer pool indices
+ size_t num_output_frames; // How many frames are queued up so far?
+
+ // In order to properly support random-access decoding, we need
+ // to behave slightly differently for the very first frame we decode.
+ // So we track whether this is the first frame or not.
+ int decoding_first_frame;
+
+ int allow_lowbitdepth;
+ int max_threads;
+ int inv_tile_order;
+ int need_resync; // wait for key/intra-only frame.
+ int hold_ref_buf; // hold the reference buffer.
+
+ int tile_size_bytes;
+ int tile_col_size_bytes;
+ int dec_tile_row, dec_tile_col; // always -1 for non-VR tile encoding
+#if CONFIG_ACCOUNTING
+ int acct_enabled;
+ Accounting accounting;
+#endif
+ int tg_size; // Number of tiles in the current tilegroup
+ int tg_start; // First tile in the current tilegroup
+ int tg_size_bit_offset;
+ int sequence_header_ready;
+ int sequence_header_changed;
+#if CONFIG_INSPECTION
+ aom_inspect_cb inspect_cb;
+ void *inspect_ctx;
+#endif
+ int operating_point;
+ int current_operating_point;
+ int seen_frame_header;
+
+ // State if the camera frame header is already decoded while
+ // large_scale_tile = 1.
+ int camera_frame_header_ready;
+ size_t frame_header_size;
+ DataBuffer obu_size_hdr;
+ int output_frame_width_in_tiles_minus_1;
+ int output_frame_height_in_tiles_minus_1;
+ int tile_count_minus_1;
+ uint32_t coded_tile_data_size;
+ unsigned int ext_tile_debug; // for ext-tile software debug & testing
+ unsigned int row_mt;
+ EXTERNAL_REFERENCES ext_refs;
+ size_t tile_list_size;
+ uint8_t *tile_list_output; // Freed in av1_decoder_remove().
+ size_t buffer_sz;
+
+ CB_BUFFER *cb_buffer_base; // Released by av1_dec_free_cb_buf().
+ int cb_buffer_alloc_size;
+
+ int allocated_row_mt_sync_rows;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t *row_mt_mutex_; // Destroyed and freed in av1_decoder_remove() when non-NULL.
+ pthread_cond_t *row_mt_cond_; // Destroyed and freed in av1_decoder_remove() when non-NULL.
+#endif
+
+ AV1DecRowMTInfo frame_row_mt_info;
+} AV1Decoder;
+
+// Decodes the compressed data in [*psource, *psource + size).
+// Returns 0 on success. Sets pbi->common.error.error_code to a nonzero error
+// code and returns a nonzero value on failure.
+int av1_receive_compressed_data(struct AV1Decoder *pbi, size_t size,
+                                const uint8_t **psource);
+
+// Get the frame at a particular index in the output queue
+// Returns 0 on success, -1 when `index` is not below the queue length.
+int av1_get_raw_frame(AV1Decoder *pbi, size_t index, YV12_BUFFER_CONFIG **sd,
+                      aom_film_grain_t **grain_params);
+
+// Copies the last queued output frame into *frame. Returns 0 on success, -1
+// when the output queue is empty.
+int av1_get_frame_to_show(struct AV1Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
+// Copies reference frame `idx` into `sd`; the dimensions must match.
+aom_codec_err_t av1_copy_reference_dec(struct AV1Decoder *pbi, int idx,
+                                       YV12_BUFFER_CONFIG *sd);
+
+// Replaces reference frame `idx` with `sd`, either by copying its pixels
+// (use_external_ref == 0) or by redirecting the plane pointers to it.
+aom_codec_err_t av1_set_reference_dec(AV1_COMMON *cm, int idx,
+                                      int use_external_ref,
+                                      YV12_BUFFER_CONFIG *sd);
+
+// Copies `new_frame` into `sd`; dimensions, strides, border and bit-depth
+// flag must match.
+aom_codec_err_t av1_copy_new_frame_dec(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd);
+
+// Returns a new decoder bound to `pool`, or NULL on failure.
+struct AV1Decoder *av1_decoder_create(BufferPool *const pool);
+
+// Destroys a decoder; a NULL `pbi` is ignored.
+void av1_decoder_remove(struct AV1Decoder *pbi);
+
+// Frees the job queue state in `tile_mt_info` and zeroes the structure; a
+// NULL argument is ignored.
+void av1_dealloc_dec_jobs(struct AV1DecTileMTData *tile_mt_info);
+
+void av1_dec_row_mt_dealloc(AV1DecRowMTSync *dec_row_mt_sync);
+
+// Frees pbi->cb_buffer_base and resets its recorded allocation size.
+void av1_dec_free_cb_buf(AV1Decoder *pbi);
+
+// Drops one reference from frame_bufs[idx]. Negative indices are ignored.
+// When the count reaches zero and the buffer has a private handle, the
+// buffer is handed back through the pool's release callback.
+static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs,
+                                      BufferPool *const pool) {
+  if (idx < 0) return;
+
+  RefCntBuffer *const buf = &frame_bufs[idx];
+  --buf->ref_count;
+
+  // A worker may only get a free framebuffer index when calling get_free_fb.
+  // But the private buffer is not set up until finish decoding header.
+  // So any error happens during decoding header, the frame_bufs will not
+  // have valid priv buffer.
+  if (buf->ref_count == 0 && buf->raw_frame_buffer.priv) {
+    pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);
+  }
+}
+
+#define ACCT_STR __func__
+// Reads a value from `r` using l - 1 bits, plus one extra bit when the first
+// part is not below m, where l = get_unsigned_bits(n) and m = (1 << l) - n.
+// Requires l != 0.
+static INLINE int av1_read_uniform(aom_reader *r, int n) {
+  const int l = get_unsigned_bits(n);
+  // Check the precondition before l - 1 is used as a bit count below.
+  assert(l != 0);
+  const int m = (1 << l) - n;
+  const int v = aom_read_literal(r, l - 1, ACCT_STR);
+  if (v < m) return v;
+  return (v << 1) - m + aom_read_literal(r, 1, ACCT_STR);
+}
+
+typedef void (*palette_visitor_fn_t)(MACROBLOCKD *const xd, int plane,
+ aom_reader *r); // Callback that av1_visit_palette() invokes per qualifying plane.
+
+void av1_visit_palette(AV1Decoder *const pbi, MACROBLOCKD *const xd, int mi_row,
+ int mi_col, aom_reader *r, BLOCK_SIZE bsize,
+ palette_visitor_fn_t visit); // Calls `visit` for up to the first two planes of a non-inter block with a nonzero palette size.
+
+typedef void (*block_visitor_fn_t)(AV1Decoder *const pbi, ThreadData *const td,
+ int mi_row, int mi_col, aom_reader *r,
+ PARTITION_TYPE partition, BLOCK_SIZE bsize); // NOTE(review): per-block callback type; its users are not visible here - confirm.
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DECODER_H_
diff --git a/third_party/aom/av1/decoder/decodetxb.c b/third_party/aom/av1/decoder/decodetxb.c
new file mode 100644
index 0000000000..f3ef2d55e4
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.c
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/decoder/decodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/idct.h"
+#include "av1/common/scan.h"
+#include "av1/common/txb_common.h"
+#include "av1/decoder/decodemv.h"
+
+#define ACCT_STR __func__
+
+static int read_golomb(MACROBLOCKD *xd, aom_reader *r) {
+  // Prefix: count the bits read up to and including the first set bit.
+  int num_bits = 0;
+  int stop = 0;
+  do {
+    stop = aom_read_bit(r, ACCT_STR);
+    ++num_bits;
+    // More than 20 prefix bits is reported as a corrupt frame.
+    if (num_bits > 20) {
+      aom_internal_error(xd->error_info, AOM_CODEC_CORRUPT_FRAME,
+                         "Invalid length in read_golomb");
+      break;
+    }
+  } while (!stop);
+
+  // Suffix: num_bits - 1 further bits are shifted in below a leading 1.
+  int value = 1;
+  for (int k = 1; k < num_bits; ++k) {
+    value = (value << 1) + aom_read_bit(r, ACCT_STR);
+  }
+  return value - 1;
+}
+
+static INLINE int rec_eob_pos(const int eob_token, const int extra) {
+  // Looks up k_eob_group_start[eob_token]; entries greater than 2 are
+  // additionally offset by the caller-supplied extra value.
+  const int group_start = k_eob_group_start[eob_token];
+  const int offset = (group_start > 2) ? extra : 0;
+  return group_start + offset;
+}
+
+static INLINE int get_dqv(const int16_t *dequant, int coeff_idx,
+                          const qm_val_t *iqmatrix) {
+  // dequant[0] applies to coefficient index 0, dequant[1] to all others.
+  const int base = dequant[coeff_idx != 0];
+  if (iqmatrix == NULL) return base;
+  const int rounding = 1 << (AOM_QM_BITS - 1);  // half of 1 << AOM_QM_BITS
+  return (iqmatrix[coeff_idx] * base + rounding) >> AOM_QM_BITS;
+}
+
+static INLINE void read_coeffs_reverse_2d(aom_reader *r, TX_SIZE tx_size,
+                                          int start_si, int end_si,
+                                          const int16_t *scan, int bwl,
+                                          uint8_t *levels,
+                                          base_cdf_arr base_cdf,
+                                          br_cdf_arr br_cdf) {
+  // Walk scan indices from end_si down to start_si, decoding one level each.
+  for (int si = end_si; si >= start_si; --si) {
+    const int pos = scan[si];
+    const int ctx = get_lower_levels_ctx_2d(levels, pos, bwl, tx_size);
+    int level = aom_read_symbol(r, base_cdf[ctx], 4, ACCT_STR);  // 4 symbols
+    if (level > NUM_BASE_LEVELS) {
+      // Keep adding increments until one is below the BR_CDF_SIZE - 1 maximum.
+      aom_cdf_prob *const cdf = br_cdf[get_br_ctx_2d(levels, pos, bwl)];
+      for (int n = 0; n < COEFF_BASE_RANGE; n += BR_CDF_SIZE - 1) {
+        const int inc = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+        level += inc;
+        if (inc < BR_CDF_SIZE - 1) break;
+      }
+    }
+    levels[get_padded_idx(pos, bwl)] = level;
+  }
+}
+
+static INLINE void read_coeffs_reverse(aom_reader *r, TX_SIZE tx_size,
+                                       TX_CLASS tx_class, int start_si,
+                                       int end_si, const int16_t *scan, int bwl,
+                                       uint8_t *levels, base_cdf_arr base_cdf,
+                                       br_cdf_arr br_cdf) {
+  // Walk scan indices from end_si down to start_si, decoding one level each.
+  // Context derivation here takes the transform class into account.
+  for (int si = end_si; si >= start_si; --si) {
+    const int pos = scan[si];
+    const int ctx = get_lower_levels_ctx(levels, pos, bwl, tx_size, tx_class);
+    int level = aom_read_symbol(r, base_cdf[ctx], 4, ACCT_STR);  // 4 symbols
+    if (level > NUM_BASE_LEVELS) {
+      // Keep adding increments until one is below the BR_CDF_SIZE - 1 maximum.
+      aom_cdf_prob *const cdf = br_cdf[get_br_ctx(levels, pos, bwl, tx_class)];
+      for (int n = 0; n < COEFF_BASE_RANGE; n += BR_CDF_SIZE - 1) {
+        const int inc = aom_read_symbol(r, cdf, BR_CDF_SIZE, ACCT_STR);
+        level += inc;
+        if (inc < BR_CDF_SIZE - 1) break;
+      }
+    }
+    levels[get_padded_idx(pos, bwl)] = level;
+  }
+}
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const TXB_CTX *const txb_ctx,
+ const TX_SIZE tx_size) {
+ FRAME_CONTEXT *const ec_ctx = xd->tile_ctx;
+ const int32_t max_value = (1 << (7 + xd->bd)) - 1; // upper bound used to clamp dequantized coefficients
+ const int32_t min_value = -(1 << (7 + xd->bd)); // lower bound used to clamp dequantized coefficients
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int16_t *const dequant = pd->seg_dequant_QTX[mbmi->segment_id];
+ tran_low_t *const tcoeffs = pd->dqcoeff_block + xd->cb_offset[plane];
+ const int shift = av1_get_tx_scale(tx_size);
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ int cul_level = 0; // running sum of decoded levels, capped before return
+ int dc_val = 0; // signed level of the coefficient at scan index 0
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ const int all_zero = aom_read_symbol(
+ r, ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2, ACCT_STR);
+ eob_info *eob_data = pd->eob_data + xd->txb_offset[plane];
+ uint16_t *const eob = &(eob_data->eob);
+ uint16_t *const max_scan_line = &(eob_data->max_scan_line);
+ *max_scan_line = 0;
+ *eob = 0;
+ if (all_zero) { // nothing coded: eob stays 0 and 0 is returned
+ *max_scan_line = 0;
+ if (plane == 0) {
+ const int txk_type_idx =
+ av1_get_txk_type_index(mbmi->sb_type, blk_row, blk_col);
+ mbmi->txk_type[txk_type_idx] = DCT_DCT;
+ }
+ return 0;
+ }
+
+ memset(levels_buf, 0,
+ sizeof(*levels_buf) *
+ ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
+ if (plane == AOM_PLANE_Y) {
+ // only y plane's tx_type is transmitted
+ av1_read_tx_type(cm, xd, blk_row, blk_col, tx_size, r);
+ }
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ int eob_extra = 0;
+ int eob_pt = 1;
+
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) { // each case reads eob_pt from the CDF sized for it
+ case 0:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx],
+ 5, ACCT_STR) +
+ 1;
+ break;
+ case 1:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx],
+ 6, ACCT_STR) +
+ 1;
+ break;
+ case 2:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx],
+ 7, ACCT_STR) +
+ 1;
+ break;
+ case 3:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx],
+ 8, ACCT_STR) +
+ 1;
+ break;
+ case 4:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx],
+ 9, ACCT_STR) +
+ 1;
+ break;
+ case 5:
+ eob_pt =
+ aom_read_symbol(r, ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx],
+ 10, ACCT_STR) +
+ 1;
+ break;
+ case 6:
+ default:
+ eob_pt = aom_read_symbol(
+ r, ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11,
+ ACCT_STR) +
+ 1;
+ break;
+ }
+
+ if (k_eob_offset_bits[eob_pt] > 0) { // this eob_pt carries extra offset bits
+ const int eob_ctx = eob_pt - 3;
+ int bit = aom_read_symbol(
+ r, ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1));
+ }
+
+ for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) { // remaining offset bits are plain bits, most significant first
+ bit = aom_read_bit(r, ACCT_STR);
+ if (bit) {
+ eob_extra += (1 << (k_eob_offset_bits[eob_pt] - 1 - i));
+ }
+ }
+ }
+ *eob = rec_eob_pos(eob_pt, eob_extra);
+
+ {
+ // Read the non-zero coefficient with scan index eob-1
+ // TODO(angiebird): Put this into a function
+ const int c = *eob - 1;
+ const int pos = scan[c];
+ const int coeff_ctx = get_lower_levels_ctx_eob(bwl, height, c);
+ const int nsymbs = 3;
+ aom_cdf_prob *cdf =
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx];
+ int level = aom_read_symbol(r, cdf, nsymbs, ACCT_STR) + 1; // + 1: this level is never 0
+ if (level > NUM_BASE_LEVELS) {
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = aom_read_symbol(
+ r,
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+ BR_CDF_SIZE, ACCT_STR);
+ level += k;
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ levels[get_padded_idx(pos, bwl)] = level;
+ }
+ if (*eob > 1) { // remaining levels, from scan index eob - 2 down to 0
+ base_cdf_arr base_cdf = ec_ctx->coeff_base_cdf[txs_ctx][plane_type];
+ br_cdf_arr br_cdf =
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type];
+ if (tx_class == TX_CLASS_2D) {
+ read_coeffs_reverse_2d(r, tx_size, 1, *eob - 1 - 1, scan, bwl, levels,
+ base_cdf, br_cdf);
+ read_coeffs_reverse(r, tx_size, tx_class, 0, 0, scan, bwl, levels,
+ base_cdf, br_cdf);
+ } else {
+ read_coeffs_reverse(r, tx_size, tx_class, 0, *eob - 1 - 1, scan, bwl,
+ levels, base_cdf, br_cdf);
+ }
+ }
+
+ int16_t num_zero_coeffs = 0; // despite the name: largest scan position below eob
+ for (int c = 0; c < *eob; ++c) {
+ const int pos = scan[c];
+ num_zero_coeffs = AOMMAX(num_zero_coeffs, pos);
+ }
+ memset(tcoeffs, 0, (num_zero_coeffs + 1) * sizeof(tcoeffs[0])); // clear tcoeffs[0 .. largest position]
+
+ for (int c = 0; c < *eob; ++c) { // second pass: signs, Golomb remainders, dequantization
+ const int pos = scan[c];
+ uint8_t sign;
+ tran_low_t level = levels[get_padded_idx(pos, bwl)];
+ if (level) {
+ *max_scan_line = AOMMAX(*max_scan_line, pos);
+ if (c == 0) {
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ sign = aom_read_symbol(r, ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx],
+ 2, ACCT_STR);
+ } else {
+ sign = aom_read_bit(r, ACCT_STR);
+ }
+ if (level >= MAX_BASE_BR_RANGE) { // level hit the coded maximum: the rest comes from read_golomb
+ level += read_golomb(xd, r);
+ }
+
+ if (c == 0) dc_val = sign ? -level : level;
+
+ // Bitmasking to clamp level to valid range:
+ // The valid range for 8/10/12 bit video is at most 14/16/18 bit
+ level &= 0xfffff;
+ cul_level += level;
+ tran_low_t dq_coeff;
+ // Bitmasking to clamp dq_coeff to valid range:
+ // The valid range for 8/10/12 bit video is at most 17/19/21 bit
+ dq_coeff = (tran_low_t)(
+ (int64_t)level * get_dqv(dequant, scan[c], iqmatrix) & 0xffffff);
+ dq_coeff = dq_coeff >> shift;
+ if (sign) {
+ dq_coeff = -dq_coeff;
+ }
+ tcoeffs[pos] = clamp(dq_coeff, min_value, max_value);
+ }
+ }
+
+ cul_level = AOMMIN(COEFF_CONTEXT_MASK, cul_level);
+
+ // DC value
+ set_dc_sign(&cul_level, dc_val);
+
+ return cul_level;
+}
+
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+                                MACROBLOCKD *const xd, aom_reader *const r,
+                                const int plane, const int row, const int col,
+                                const TX_SIZE tx_size) {
+#if TXCOEFF_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(
+      mbmi->sb_type, pd->subsampling_x, pd->subsampling_y);
+
+  // Build the entropy context from the above/left context arrays, decode the
+  // transform block, then write the returned cul_level back to the contexts.
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + col,
+              pd->left_context + row, &txb_ctx);
+  const uint8_t cul_level =
+      av1_read_coeffs_txb(cm, xd, r, row, col, plane, &txb_ctx, tx_size);
+  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, col, row);
+
+  // The transform type itself is read inside av1_read_coeffs_txb() (plane 0
+  // only); for inter blocks in plane 0 it is stored via update_txk_array().
+  if (is_inter_block(mbmi)) {
+    const TX_TYPE tx_type = av1_get_tx_type(get_plane_type(plane), xd, row, col,
+                                            tx_size, cm->reduced_tx_set_used);
+    if (plane == 0)
+      update_txk_array(mbmi->txk_type, mbmi->sb_type, row, col, tx_size,
+                       tx_type);
+  }
+
+#if TXCOEFF_TIMER
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  cm->txcoeff_timer += elapsed_time;
+  ++cm->txb_count;
+#endif
+}
diff --git a/third_party/aom/av1/decoder/decodetxb.h b/third_party/aom/av1/decoder/decodetxb.h
new file mode 100644
index 0000000000..fe04f6abdd
--- /dev/null
+++ b/third_party/aom/av1/decoder/decodetxb.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DECODETXB_H_
+#define AOM_AV1_DECODER_DECODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/bitreader.h"
+
+uint8_t av1_read_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *const xd,
+ aom_reader *const r, const int blk_row,
+ const int blk_col, const int plane,
+ const TXB_CTX *const txb_ctx,
+ const TX_SIZE tx_size);
+
+void av1_read_coeffs_txb_facade(const AV1_COMMON *const cm,
+ MACROBLOCKD *const xd, aom_reader *const r,
+ const int plane, const int row, const int col,
+ const TX_SIZE tx_size);
+#endif // AOM_AV1_DECODER_DECODETXB_H_
diff --git a/third_party/aom/av1/decoder/detokenize.c b/third_party/aom/av1/decoder/detokenize.c
new file mode 100644
index 0000000000..9d54bd13dd
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/decoder/detokenize.h"
+
+#define ACCT_STR __func__
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/idct.h"
+
+static void decode_color_map_tokens(Av1ColorMapParam *param, aom_reader *r) {
+  const int n = param->n_colors;
+  const int stride = param->plane_width;
+  const int full_height = param->plane_height;
+  const int rows = param->rows;
+  const int cols = param->cols;
+  uint8_t *const color_map = param->color_map;
+  MapCdf color_map_cdf = param->map_cdf;
+  uint8_t color_order[PALETTE_MAX_SIZE];
+
+  // The first color index is read on its own via av1_read_uniform().
+  color_map[0] = av1_read_uniform(r, n);
+  assert(color_map[0] < n);
+
+  // Wavefront decode: walk anti-diagonals (row + col == diag), high col first.
+  for (int diag = 1; diag < rows + cols - 1; ++diag) {
+    const int last_col = AOMMAX(0, diag - rows + 1);
+    for (int col = AOMMIN(diag, cols - 1); col >= last_col; --col) {
+      const int row = diag - col;
+      const int color_ctx = av1_get_palette_color_index_context(
+          color_map, stride, row, col, n, color_order, NULL);
+      const int color_idx = aom_read_symbol(
+          r, color_map_cdf[n - PALETTE_MIN_SIZE][color_ctx], n, ACCT_STR);
+      assert(color_idx >= 0 && color_idx < n);
+      color_map[row * stride + col] = color_order[color_idx];
+    }
+  }
+  // Pad on the right: replicate each row's last decoded index.
+  if (cols < stride) {
+    for (int row = 0; row < rows; ++row) {
+      uint8_t *const row_ptr = color_map + row * stride;
+      memset(row_ptr + cols, row_ptr[cols - 1], stride - cols);
+    }
+  }
+  // Pad at the bottom: replicate the last decoded row.
+  for (int row = rows; row < full_height; ++row) {
+    memcpy(color_map + row * stride, color_map + (rows - 1) * stride, stride);
+  }
+}
+
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                               aom_reader *r) {
+  assert(plane == 0 || plane == 1);  // only these two planes are accepted
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  Av1ColorMapParam params;
+  params.n_colors = mbmi->palette_mode_info.palette_size[plane];
+  params.map_cdf = plane ? xd->tile_ctx->palette_uv_color_index_cdf
+                         : xd->tile_ctx->palette_y_color_index_cdf;
+  params.color_map =
+      xd->plane[plane].color_index_map + xd->color_index_map_offset[plane];
+  av1_get_block_dimensions(mbmi->sb_type, plane, xd, &params.plane_width,
+                           &params.plane_height, &params.rows, &params.cols);
+  decode_color_map_tokens(&params, r);
+}
diff --git a/third_party/aom/av1/decoder/detokenize.h b/third_party/aom/av1/decoder/detokenize.h
new file mode 100644
index 0000000000..173b437a94
--- /dev/null
+++ b/third_party/aom/av1/decoder/detokenize.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DETOKENIZE_H_
+#define AOM_AV1_DECODER_DETOKENIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/scan.h"
+#include "av1/decoder/decoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_decode_palette_tokens(MACROBLOCKD *const xd, int plane, aom_reader *r);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_DECODER_DETOKENIZE_H_
diff --git a/third_party/aom/av1/decoder/dthread.c b/third_party/aom/av1/decoder/dthread.c
new file mode 100644
index 0000000000..3946c787a1
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+
+#include "aom_mem/aom_mem.h"
+#include "av1/common/reconinter.h"
+#include "av1/decoder/dthread.h"
+#include "av1/decoder/decoder.h"
+
+// #define DEBUG_THREAD
+
+// TODO(hkuang): Clean up all the #ifdef in this file.
+void av1_frameworker_lock_stats(AVxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // no-op when multithreading is compiled out
+#else
+  FrameWorkerData *const data = worker->data1;
+  pthread_mutex_lock(&data->stats_mutex);
+#endif
+}
+
+void av1_frameworker_unlock_stats(AVxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // no-op when multithreading is compiled out
+#else
+  FrameWorkerData *const data = worker->data1;
+  pthread_mutex_unlock(&data->stats_mutex);
+#endif
+}
+
+void av1_frameworker_signal_stats(AVxWorker *const worker) {
+#if !CONFIG_MULTITHREAD
+  (void)worker;  // no-op when multithreading is compiled out
+#else
+  FrameWorkerData *const data = worker->data1;
+
+// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper.
+#if !defined(_WIN32) || HAVE_PTHREAD_H
+  pthread_cond_broadcast(&data->stats_cond);
+#else
+  pthread_cond_signal(&data->stats_cond);
+#endif
+
+#endif
+}
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#define BUILDING_WITH_TSAN
+#endif
+#endif
+
+// TODO(hkuang): Remove worker parameter (used only for debug and error output).
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+                          int row) {
+#if CONFIG_MULTITHREAD
+  // Waits for ref_buf's decode progress to reach `row`; no-op without a buffer.
+  if (!ref_buf) return;
+#ifndef BUILDING_WITH_TSAN
+  // Unlocked fast path: tsan reports this read (harmlessly), but skipping the
+  // lock here is the key to getting the best performance.
+  if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return;
+#endif
+
+  {
+    // Worker that owns the reference frame. NOTE(review): a fully decoded frame
+    // may have no owner, yet ref_worker is dereferenced below - confirm non-NULL.
+    AVxWorker *const ref_worker = ref_buf->frame_worker_owner;
+    FrameWorkerData *const ref_worker_data =
+        (FrameWorkerData *)ref_worker->data1;
+    const AV1Decoder *const pbi = ref_worker_data->pbi;
+
+#ifdef DEBUG_THREAD
+    {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n",
+             worker_data->worker_id, worker, ref_worker_data->worker_id,
+             ref_buf->frame_worker_owner, row, ref_buf->row);
+    }
+#endif
+
+    av1_frameworker_lock_stats(ref_worker);
+    while (ref_buf->row < row && pbi->cur_buf == ref_buf &&
+           ref_buf->buf.corrupted != 1) {
+      pthread_cond_wait(&ref_worker_data->stats_cond,
+                        &ref_worker_data->stats_mutex);
+    }
+
+    // Unlock exactly once, before reporting, in case aom_internal_error returns.
+    const int ref_corrupted = (ref_buf->buf.corrupted == 1);
+    av1_frameworker_unlock_stats(ref_worker);
+    if (ref_corrupted) {
+      FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1;
+      aom_internal_error(&worker_data->pbi->common.error,
+                         AOM_CODEC_CORRUPT_FRAME,
+                         "Worker %p failed to decode frame", (void *)worker);
+    }
+  }
+#else
+  (void)worker;
+  (void)ref_buf;
+  (void)row;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row) {
+#if !CONFIG_MULTITHREAD
+  (void)buf;
+  (void)row;
+#else
+  AVxWorker *const owner = buf->frame_worker_owner;
+#ifdef DEBUG_THREAD
+  {
+    FrameWorkerData *const worker_data = (FrameWorkerData *)owner->data1;
+    printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id,
+           buf->frame_worker_owner, row);
+  }
+#endif
+
+  // Publish the new progress under the stats lock and signal the condition.
+  av1_frameworker_lock_stats(owner);
+  buf->row = row;
+  av1_frameworker_signal_stats(owner);
+  av1_frameworker_unlock_stats(owner);
+#endif  // CONFIG_MULTITHREAD
+}
+
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker) {
+#if CONFIG_MULTITHREAD
+ FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1;
+ FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1;
+ AV1_COMMON *const src_cm = &src_worker_data->pbi->common;
+ AV1_COMMON *const dst_cm = &dst_worker_data->pbi->common;
+ int i;
+
+ // Wait (holding the source worker's stats lock) until its frame context is ready.
+ av1_frameworker_lock_stats(src_worker);
+ while (!src_worker_data->frame_context_ready) {
+ pthread_cond_wait(&src_worker_data->stats_cond,
+ &src_worker_data->stats_mutex);
+ }
+
+ dst_cm->last_frame_seg_map = src_cm->seg.enabled
+ ? src_cm->current_frame_seg_map
+ : src_cm->last_frame_seg_map;
+ dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
+ av1_frameworker_unlock_stats(src_worker); // every copy below runs without the lock held
+
+ dst_cm->seq_params.bit_depth = src_cm->seq_params.bit_depth;
+ dst_cm->seq_params.use_highbitdepth = src_cm->seq_params.use_highbitdepth;
+ // TODO(zoeliu): To handle parallel decoding
+ dst_cm->prev_frame =
+ src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame;
+ dst_cm->last_width =
+ !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width;
+ dst_cm->last_height =
+ !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height;
+ dst_cm->seq_params.subsampling_x = src_cm->seq_params.subsampling_x;
+ dst_cm->seq_params.subsampling_y = src_cm->seq_params.subsampling_y;
+ dst_cm->frame_type = src_cm->frame_type;
+ dst_cm->last_show_frame = !src_cm->show_existing_frame
+ ? src_cm->show_frame
+ : src_cm->last_show_frame;
+ for (i = 0; i < REF_FRAMES; ++i)
+ dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; // dst starts from the source's *next* reference map
+
+ memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr,
+ (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh));
+ dst_cm->lf.sharpness_level = src_cm->lf.sharpness_level;
+ dst_cm->lf.filter_level[0] = src_cm->lf.filter_level[0];
+ dst_cm->lf.filter_level[1] = src_cm->lf.filter_level[1];
+ memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, REF_FRAMES); // NOTE(review): size is an element count - assumes 1-byte elements; confirm
+ memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); // NOTE(review): same 1-byte element assumption; confirm
+ dst_cm->seg = src_cm->seg;
+ memcpy(dst_cm->frame_contexts, src_cm->frame_contexts,
+ FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0]));
+#else
+ (void)dst_worker;
+ (void)src_worker;
+#endif // CONFIG_MULTITHREAD
+}
diff --git a/third_party/aom/av1/decoder/dthread.h b/third_party/aom/av1/decoder/dthread.h
new file mode 100644
index 0000000000..1d264b07eb
--- /dev/null
+++ b/third_party/aom/av1/decoder/dthread.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_DTHREAD_H_
+#define AOM_AV1_DECODER_DTHREAD_H_
+
+#include "config/aom_config.h"
+
+#include "aom_util/aom_thread.h"
+#include "aom/internal/aom_codec_internal.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1Common;
+struct AV1Decoder;
+struct ThreadData;
+
+typedef struct DecWorkerData {
+ struct ThreadData *td; // NOTE(review): presumably this worker's thread data - confirm
+ const uint8_t *data_end; // NOTE(review): presumably the end of the input this worker reads - confirm
+ struct aom_internal_error_info error_info; // NOTE(review): presumably per-worker error state - confirm
+} DecWorkerData;
+
+// WorkerData for the FrameWorker thread. It contains all the information of
+// the worker and decode structures for decoding a frame.
+typedef struct FrameWorkerData {
+ struct AV1Decoder *pbi; // decoder instance whose common state av1_frameworker_copy_context() copies
+ const uint8_t *data;
+ const uint8_t *data_end;
+ size_t data_size;
+ void *user_priv;
+ int worker_id; // printed by the DEBUG_THREAD traces in dthread.c
+ int received_frame;
+
+ // scratch_buffer is used in frame parallel mode only.
+ // It is used to make a copy of the compressed data.
+ uint8_t *scratch_buffer;
+ size_t scratch_buffer_size;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t stats_mutex; // taken/released by av1_frameworker_lock_stats()/unlock_stats()
+ pthread_cond_t stats_cond; // signalled by av1_frameworker_signal_stats()
+#endif
+
+ int frame_context_ready; // Current frame's context is ready to read.
+ int frame_decoded; // Finished decoding current frame.
+} FrameWorkerData;
+
+void av1_frameworker_lock_stats(AVxWorker *const worker);
+void av1_frameworker_unlock_stats(AVxWorker *const worker);
+void av1_frameworker_signal_stats(AVxWorker *const worker);
+
+// Wait until ref_buf has been decoded to row in real pixel unit.
+// Note: worker may already finish decoding ref_buf and release it in order to
+// start decoding next frame. So need to check whether worker is still decoding
+// ref_buf.
+void av1_frameworker_wait(AVxWorker *const worker, RefCntBuffer *const ref_buf,
+ int row);
+
+// FrameWorker broadcasts its decoding progress so other workers that are
+// waiting on it can resume decoding.
+void av1_frameworker_broadcast(RefCntBuffer *const buf, int row);
+
+// Copy necessary decoding context from src worker to dst worker.
+void av1_frameworker_copy_context(AVxWorker *const dst_worker,
+ AVxWorker *const src_worker);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_DECODER_DTHREAD_H_
diff --git a/third_party/aom/av1/decoder/inspection.c b/third_party/aom/av1/decoder/inspection.c
new file mode 100644
index 0000000000..e6c89298a4
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/inspection.h"
+#include "av1/common/enums.h"
+#include "av1/common/cdef.h"
+
+static void ifd_init_mi_rc(insp_frame_data *fd, int mi_cols, int mi_rows) {
+  const size_t grid_bytes = sizeof(insp_mi_data) * mi_rows * mi_cols;
+  fd->mi_rows = mi_rows;
+  fd->mi_cols = mi_cols;
+  fd->mi_grid = (insp_mi_data *)aom_malloc(grid_bytes);  // one entry per cell
+}
+
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height) {
+  // Align each frame dimension to 2^3, then scale down by MI_SIZE_LOG2 bits.
+  const int rows = ALIGN_POWER_OF_TWO(frame_height, 3) >> MI_SIZE_LOG2;
+  ifd_init_mi_rc(fd, ALIGN_POWER_OF_TWO(frame_width, 3) >> MI_SIZE_LOG2, rows);
+}
+
+void ifd_clear(insp_frame_data *fd) {
+ aom_free(fd->mi_grid); // release the grid allocated by ifd_init_mi_rc()
+ fd->mi_grid = NULL; // reset so fd no longer refers to the freed grid
+}
+
+/* TODO(negge) This function may be called by more than one thread when using
+ a multi-threaded decoder and this may cause a data race. */
+int ifd_inspect(insp_frame_data *fd, void *decoder) {
+  // Snapshots frame-level and per-mi-unit decoder state into fd; returns 1.
+  struct AV1Decoder *pbi = (struct AV1Decoder *)decoder;
+  AV1_COMMON *const cm = &pbi->common;
+  if (fd->mi_rows != cm->mi_rows || fd->mi_cols != cm->mi_cols) {
+    ifd_clear(fd);
+    ifd_init_mi_rc(fd, cm->mi_cols, cm->mi_rows);  // order: (mi_cols, mi_rows)
+  }
+  fd->show_frame = cm->show_frame;
+  fd->frame_type = cm->frame_type;
+  fd->base_qindex = cm->base_qindex;
+  // Set width and height of the first tile until generic support can be added
+  TileInfo tile_info;
+  av1_tile_set_row(&tile_info, cm, 0);
+  av1_tile_set_col(&tile_info, cm, 0);
+  fd->tile_mi_cols = tile_info.mi_col_end - tile_info.mi_col_start;
+  fd->tile_mi_rows = tile_info.mi_row_end - tile_info.mi_row_start;
+  fd->delta_q_present_flag = cm->delta_q_present_flag;
+  fd->delta_q_res = cm->delta_q_res;
+#if CONFIG_ACCOUNTING
+  fd->accounting = &pbi->accounting;
+#endif
+  // TODO(negge): copy per frame CDEF data
+  for (int i = 0; i < MAX_SEGMENTS; i++) {
+    for (int j = 0; j < 2; j++) {
+      fd->y_dequant[i][j] = cm->y_dequant_QTX[i][j];
+      fd->u_dequant[i][j] = cm->u_dequant_QTX[i][j];
+      fd->v_dequant[i][j] = cm->v_dequant_QTX[i][j];
+    }
+  }
+  for (int j = 0; j < cm->mi_rows; j++) {
+    for (int i = 0; i < cm->mi_cols; i++) {
+      const MB_MODE_INFO *mbmi = cm->mi_grid_visible[j * cm->mi_stride + i];
+      insp_mi_data *mi = &fd->mi_grid[j * cm->mi_cols + i];
+      // Segment
+      mi->segment_id = mbmi->segment_id;
+      // Motion Vectors
+      mi->mv[0].row = mbmi->mv[0].as_mv.row;
+      mi->mv[0].col = mbmi->mv[0].as_mv.col;
+      mi->mv[1].row = mbmi->mv[1].as_mv.row;
+      mi->mv[1].col = mbmi->mv[1].as_mv.col;
+      // Reference Frames
+      mi->ref_frame[0] = mbmi->ref_frame[0];
+      mi->ref_frame[1] = mbmi->ref_frame[1];
+      // Prediction Mode
+      mi->mode = mbmi->mode;
+      // Prediction Mode for Chromatic planes
+      if (mi->mode < INTRA_MODES) {
+        mi->uv_mode = mbmi->uv_mode;
+      } else {
+        mi->uv_mode = UV_MODE_INVALID;
+      }
+      // Block Size
+      mi->sb_type = mbmi->sb_type;
+      // Skip Flag
+      mi->skip = mbmi->skip;
+      mi->filter[0] = av1_extract_interp_filter(mbmi->interp_filters, 0);
+      mi->filter[1] = av1_extract_interp_filter(mbmi->interp_filters, 1);
+      mi->dual_filter_type = mi->filter[0] * 3 + mi->filter[1];
+      // Transform
+      // TODO(anyone): extract tx type info from mbmi->txk_type[].
+      mi->tx_type = DCT_DCT;
+      mi->tx_size = mbmi->tx_size;
+
+      mi->cdef_level =
+          cm->cdef_strengths[mbmi->cdef_strength] / CDEF_SEC_STRENGTHS;
+      mi->cdef_strength =
+          cm->cdef_strengths[mbmi->cdef_strength] % CDEF_SEC_STRENGTHS;
+      mi->cdef_strength += mi->cdef_strength == 3;  // a remainder of 3 becomes 4
+      if (mbmi->uv_mode == UV_CFL_PRED) {
+        mi->cfl_alpha_idx = mbmi->cfl_alpha_idx;
+        mi->cfl_alpha_sign = mbmi->cfl_alpha_signs;
+      } else {
+        mi->cfl_alpha_idx = 0;
+        mi->cfl_alpha_sign = 0;
+      }
+      // delta_q
+      mi->current_qindex = mbmi->current_qindex;
+    }
+  }
+  return 1;
+}
diff --git a/third_party/aom/av1/decoder/inspection.h b/third_party/aom/av1/decoder/inspection.h
new file mode 100644
index 0000000000..7214a9beda
--- /dev/null
+++ b/third_party/aom/av1/decoder/inspection.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_DECODER_INSPECTION_H_
+#define AOM_AV1_DECODER_INSPECTION_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include "av1/common/seg_common.h"
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#endif
+
+// Inspection callback type. It is only declared here when AOM_AOMDX_H_ is not
+// already defined.
+// NOTE(review): presumably aom/aomdx.h provides the same typedef under that
+// guard - confirm.
+#ifndef AOM_AOMDX_H_
+typedef void (*aom_inspect_cb)(void *decoder, void *data);
+#endif
+
+// Motion vector as exported to the inspector. ifd_inspect() copies the row and
+// col components of each block's mv[0] / mv[1] into this pair.
+typedef struct insp_mv insp_mv;
+
+struct insp_mv {
+ int16_t row;
+ int16_t col;
+};
+
+// Per-block snapshot filled in by ifd_inspect() from the decoder's
+// MB_MODE_INFO at the same grid position.
+typedef struct insp_mi_data insp_mi_data;
+
+struct insp_mi_data {
+ insp_mv mv[2];
+ int16_t ref_frame[2];
+ int16_t mode;
+ // UV_MODE_INVALID unless mode < INTRA_MODES.
+ int16_t uv_mode;
+ int16_t sb_type;
+ int16_t skip;
+ int16_t segment_id;
+ // Combined index computed as filter[0] * 3 + filter[1].
+ int16_t dual_filter_type;
+ int16_t filter[2];
+ // Currently always DCT_DCT (see the TODO in ifd_inspect()).
+ int16_t tx_type;
+ int16_t tx_size;
+ // cdef_level / cdef_strength are the quotient / remainder of the block's
+ // cm->cdef_strengths[] entry divided by CDEF_SEC_STRENGTHS; a remainder of 3
+ // is exported as 4.
+ int16_t cdef_level;
+ int16_t cdef_strength;
+ // Both CfL fields are 0 unless the block's uv_mode is UV_CFL_PRED.
+ int16_t cfl_alpha_idx;
+ int16_t cfl_alpha_sign;
+ int16_t current_qindex;
+};
+
+// Per-frame data handed to the inspector.
+typedef struct insp_frame_data insp_frame_data;
+
+struct insp_frame_data {
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+ // One entry per mode info unit; ifd_inspect() writes entry
+ // [row * cm->mi_cols + col] for every row < cm->mi_rows, col < cm->mi_cols.
+ insp_mi_data *mi_grid;
+ int show_frame;
+ int frame_type;
+ int base_qindex;
+ int mi_rows;
+ int mi_cols;
+ int tile_mi_rows;
+ int tile_mi_cols;
+ // Copied by ifd_inspect() from cm->{y,u,v}_dequant_QTX.
+ int16_t y_dequant[MAX_SEGMENTS][2];
+ int16_t u_dequant[MAX_SEGMENTS][2];
+ int16_t v_dequant[MAX_SEGMENTS][2];
+ // TODO(negge): add per frame CDEF data
+ int delta_q_present_flag;
+ int delta_q_res;
+};
+
+// NOTE(review): ifd_init() and ifd_clear() are defined outside this header;
+// presumably they allocate and release fd->mi_grid for the given frame size -
+// confirm against inspection.c.
+void ifd_init(insp_frame_data *fd, int frame_width, int frame_height);
+void ifd_clear(insp_frame_data *fd);
+// Fills 'fd' from the decoder's current frame (dequantizers and per-block mode
+// info) and returns 1.
+// NOTE(review): 'decoder' is presumably an AV1Decoder * - confirm.
+int ifd_inspect(insp_frame_data *fd, void *decoder);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AV1_DECODER_INSPECTION_H_
diff --git a/third_party/aom/av1/decoder/obu.c b/third_party/aom/av1/decoder/obu.c
new file mode 100644
index 0000000000..44ecf818e7
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.c
@@ -0,0 +1,839 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_codec.h"
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_ports/mem_ops.h"
+
+#include "av1/common/common.h"
+#include "av1/common/obu_util.h"
+#include "av1/common/timing.h"
+#include "av1/decoder/decoder.h"
+#include "av1/decoder/decodeframe.h"
+#include "av1/decoder/obu.h"
+
+// Picture prediction structures (0-13 are predefined) in scalability metadata.
+// SCALABILITY_SS (14) is the value for which read_metadata_scalability() goes
+// on to parse an explicit scalability_structure().
+typedef enum {
+ SCALABILITY_L1T2 = 0,
+ SCALABILITY_L1T3 = 1,
+ SCALABILITY_L2T1 = 2,
+ SCALABILITY_L2T2 = 3,
+ SCALABILITY_L2T3 = 4,
+ SCALABILITY_S2T1 = 5,
+ SCALABILITY_S2T2 = 6,
+ SCALABILITY_S2T3 = 7,
+ SCALABILITY_L2T1h = 8,
+ SCALABILITY_L2T2h = 9,
+ SCALABILITY_L2T3h = 10,
+ SCALABILITY_S2T1h = 11,
+ SCALABILITY_S2T2h = 12,
+ SCALABILITY_S2T3h = 13,
+ SCALABILITY_SS = 14
+} SCALABILITY_STRUCTURES;
+
+// Derives the number of spatial and temporal layers from operating_point_idc.
+// Returns AOM_CODEC_INVALID_PARAM when either output pointer is null and
+// AOM_CODEC_OK otherwise.
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+    int operating_point_idc, unsigned int *number_spatial_layers,
+    unsigned int *number_temporal_layers) {
+  if (!number_spatial_layers || !number_temporal_layers)
+    return AOM_CODEC_INVALID_PARAM;
+
+  // An idc of zero is reported as one temporal and one spatial layer.
+  if (operating_point_idc == 0) {
+    *number_temporal_layers = 1;
+    *number_spatial_layers = 1;
+    return AOM_CODEC_OK;
+  }
+
+  // Otherwise each count is the number of set bits in its field: temporal
+  // layers use the low MAX_NUM_TEMPORAL_LAYERS bits, spatial layers the
+  // MAX_NUM_SPATIAL_LAYERS bits directly above them.
+  *number_spatial_layers = 0;
+  *number_temporal_layers = 0;
+  for (int s = 0; s < MAX_NUM_SPATIAL_LAYERS; ++s) {
+    const int bit = s + MAX_NUM_TEMPORAL_LAYERS;
+    *number_spatial_layers += (operating_point_idc >> bit) & 0x1;
+  }
+  for (int t = 0; t < MAX_NUM_TEMPORAL_LAYERS; ++t) {
+    *number_temporal_layers += (operating_point_idc >> t) & 0x1;
+  }
+
+  return AOM_CODEC_OK;
+}
+
+// Returns 1 when the OBU described by 'obu_header' belongs to the decoder's
+// current operating point, 0 otherwise. A current operating point of 0 accepts
+// every OBU; otherwise both the temporal-layer bit and the spatial-layer bit
+// (offset by 8) must be set in pbi->current_operating_point.
+static int is_obu_in_current_operating_point(AV1Decoder *pbi,
+                                             ObuHeader obu_header) {
+  if (!pbi->current_operating_point) return 1;
+
+  const int temporal_bit_set =
+      (pbi->current_operating_point >> obu_header.temporal_layer_id) & 0x1;
+  const int spatial_bit_set =
+      (pbi->current_operating_point >> (obu_header.spatial_layer_id + 8)) &
+      0x1;
+  return temporal_bit_set && spatial_bit_set;
+}
+
+// Consumes bits from 'rb' up to the next byte boundary. Every skipped bit must
+// be zero: a set bit marks the frame as corrupt in cm->error.error_code and
+// yields -1. Returns 0 once the reader is byte aligned.
+static int byte_alignment(AV1_COMMON *const cm,
+                          struct aom_read_bit_buffer *const rb) {
+  for (;;) {
+    if ((rb->bit_offset & 7) == 0) return 0;
+    if (aom_rb_read_bit(rb) != 0) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+  }
+}
+
+// Nothing is parsed for a temporal delimiter OBU; the decoded payload size is
+// reported as 0. Declared with (void) so the function has a real prototype
+// (an empty parameter list in C leaves the parameters unspecified).
+static uint32_t read_temporal_delimiter_obu(void) { return 0; }
+
+// Reads a LEVEL_BITS-wide seq_level_idx from 'rb' and splits it into the
+// major/minor fields of 'bl'. Returns 1 on success, or 0 (leaving 'bl'
+// untouched) when is_valid_seq_level_idx() rejects the index.
+static int read_bitstream_level(BitstreamLevel *bl,
+                                struct aom_read_bit_buffer *rb) {
+  const uint8_t level_idx = aom_rb_read_literal(rb, LEVEL_BITS);
+  if (is_valid_seq_level_idx(level_idx)) {
+    const int minor_mask = (1 << LEVEL_MINOR_BITS) - 1;
+    bl->major = (level_idx >> LEVEL_MINOR_BITS) + LEVEL_MAJOR_MIN;
+    bl->minor = level_idx & minor_mask;
+    return 1;
+  }
+  return 0;
+}
+
+// Returns 1 when the two sequence headers compare equal and 0 otherwise.
+// The comparison is a raw memcmp() over sizeof(SequenceHeader), so every byte
+// of the structure takes part in it.
+// TODO(huisu,wtc@google.com): make sure the code matches the spec exactly.
+static int are_seq_headers_consistent(const SequenceHeader *seq_params_old,
+                                      const SequenceHeader *seq_params_new) {
+  const int differs =
+      memcmp(seq_params_old, seq_params_new, sizeof(SequenceHeader));
+  return differs == 0;
+}
+
+// On success, sets pbi->sequence_header_ready to 1 and returns the number of
+// bytes read from 'rb'.
+// On failure, sets pbi->common.error.error_code and returns 0.
+// The header is parsed into a local SequenceHeader copy that is committed to
+// cm->seq_params only at the very end; note that cm->timing_info_present and
+// cm->op_params[] are written directly on 'cm' while parsing.
+static uint32_t read_sequence_header_obu(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb) {
+ AV1_COMMON *const cm = &pbi->common;
+ const uint32_t saved_bit_offset = rb->bit_offset;
+
+ // Verify rb has been configured to report errors.
+ assert(rb->error_handler);
+
+ // Use a local variable to store the information as we decode. At the end,
+ // if no errors have occurred, cm->seq_params is updated.
+ SequenceHeader sh = cm->seq_params;
+ SequenceHeader *const seq_params = &sh;
+
+ seq_params->profile = av1_read_profile(rb);
+ if (seq_params->profile > CONFIG_MAX_DECODE_PROFILE) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ // Still picture or not
+ seq_params->still_picture = aom_rb_read_bit(rb);
+ seq_params->reduced_still_picture_hdr = aom_rb_read_bit(rb);
+ // Video must have reduced_still_picture_hdr = 0
+ if (!seq_params->still_picture && seq_params->reduced_still_picture_hdr) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+
+ if (seq_params->reduced_still_picture_hdr) {
+ // Reduced header: a single operating point with idc 0, no timing, decoder
+ // model or display model information; only the level is read.
+ cm->timing_info_present = 0;
+ seq_params->decoder_model_info_present_flag = 0;
+ seq_params->display_model_info_present_flag = 0;
+ seq_params->operating_points_cnt_minus_1 = 0;
+ seq_params->operating_point_idc[0] = 0;
+ if (!read_bitstream_level(&seq_params->level[0], rb)) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ seq_params->tier[0] = 0;
+ cm->op_params[0].decoder_model_param_present_flag = 0;
+ cm->op_params[0].display_model_param_present_flag = 0;
+ } else {
+ cm->timing_info_present = aom_rb_read_bit(rb); // timing_info_present_flag
+ if (cm->timing_info_present) {
+ av1_read_timing_info_header(cm, rb);
+
+ seq_params->decoder_model_info_present_flag = aom_rb_read_bit(rb);
+ if (seq_params->decoder_model_info_present_flag)
+ av1_read_decoder_model_info(cm, rb);
+ } else {
+ seq_params->decoder_model_info_present_flag = 0;
+ }
+ seq_params->display_model_info_present_flag = aom_rb_read_bit(rb);
+ seq_params->operating_points_cnt_minus_1 =
+ aom_rb_read_literal(rb, OP_POINTS_CNT_MINUS_1_BITS);
+ // One iteration per signalled operating point: idc, level, tier, then the
+ // optional decoder model and display model parameters.
+ for (int i = 0; i < seq_params->operating_points_cnt_minus_1 + 1; i++) {
+ seq_params->operating_point_idc[i] =
+ aom_rb_read_literal(rb, OP_POINTS_IDC_BITS);
+ if (!read_bitstream_level(&seq_params->level[i], rb)) {
+ cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+ return 0;
+ }
+ // This is the seq_level_idx[i] > 7 check in the spec. seq_level_idx 7
+ // is equivalent to level 3.3.
+ if (seq_params->level[i].major > 3)
+ seq_params->tier[i] = aom_rb_read_bit(rb);
+ else
+ seq_params->tier[i] = 0;
+ if (seq_params->decoder_model_info_present_flag) {
+ cm->op_params[i].decoder_model_param_present_flag = aom_rb_read_bit(rb);
+ if (cm->op_params[i].decoder_model_param_present_flag)
+ av1_read_op_parameters_info(cm, rb, i);
+ } else {
+ cm->op_params[i].decoder_model_param_present_flag = 0;
+ }
+ if (cm->timing_info_present &&
+ (cm->timing_info.equal_picture_interval ||
+ cm->op_params[i].decoder_model_param_present_flag)) {
+ cm->op_params[i].bitrate = max_level_bitrate(
+ seq_params->profile,
+ major_minor_to_seq_level_idx(seq_params->level[i]),
+ seq_params->tier[i]);
+ // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
+ // the check
+ if (cm->op_params[i].bitrate == 0)
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support this combination of "
+ "profile, level, and tier.");
+ // Buffer size in bits/s is bitrate in bits/s * 1 s
+ cm->op_params[i].buffer_size = cm->op_params[i].bitrate;
+ }
+ if (cm->timing_info_present && cm->timing_info.equal_picture_interval &&
+ !cm->op_params[i].decoder_model_param_present_flag) {
+ // When the decoder_model_parameters are not sent for this op, set
+ // the default ones that can be used with the resource availability mode
+ cm->op_params[i].decoder_buffer_delay = 70000;
+ cm->op_params[i].encoder_buffer_delay = 20000;
+ cm->op_params[i].low_delay_mode_flag = 0;
+ }
+
+ if (seq_params->display_model_info_present_flag) {
+ cm->op_params[i].display_model_param_present_flag = aom_rb_read_bit(rb);
+ if (cm->op_params[i].display_model_param_present_flag) {
+ cm->op_params[i].initial_display_delay =
+ aom_rb_read_literal(rb, 4) + 1;
+ if (cm->op_params[i].initial_display_delay > 10)
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "AV1 does not support more than 10 decoded frames delay");
+ } else {
+ cm->op_params[i].initial_display_delay = 10;
+ }
+ } else {
+ cm->op_params[i].display_model_param_present_flag = 0;
+ cm->op_params[i].initial_display_delay = 10;
+ }
+ }
+ }
+ // This decoder supports all levels. Choose operating point provided by
+ // external means
+ // An out-of-range pbi->operating_point falls back to operating point 0.
+ int operating_point = pbi->operating_point;
+ if (operating_point < 0 ||
+ operating_point > seq_params->operating_points_cnt_minus_1)
+ operating_point = 0;
+ pbi->current_operating_point =
+ seq_params->operating_point_idc[operating_point];
+ if (aom_get_num_layers_from_operating_point_idc(
+ pbi->current_operating_point, &cm->number_spatial_layers,
+ &cm->number_temporal_layers) != AOM_CODEC_OK) {
+ cm->error.error_code = AOM_CODEC_ERROR;
+ return 0;
+ }
+
+ av1_read_sequence_header(cm, rb, seq_params);
+
+ av1_read_color_config(rb, pbi->allow_lowbitdepth, seq_params, &cm->error);
+ // Accept only (0,0), (1,1) and (1,0) as (subsampling_x, subsampling_y).
+ if (!(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) &&
+ !(seq_params->subsampling_x == 1 && seq_params->subsampling_y == 0)) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Only 4:4:4, 4:2:2 and 4:2:0 are currently supported, "
+ "%d %d subsampling is not supported.\n",
+ seq_params->subsampling_x, seq_params->subsampling_y);
+ }
+
+ seq_params->film_grain_params_present = aom_rb_read_bit(rb);
+
+ if (av1_check_trailing_bits(pbi, rb) != 0) {
+ // cm->error.error_code is already set.
+ return 0;
+ }
+
+ // If a sequence header has been decoded before, we check if the new
+ // one is consistent with the old one.
+ if (pbi->sequence_header_ready) {
+ if (!are_seq_headers_consistent(&cm->seq_params, seq_params))
+ pbi->sequence_header_changed = 1;
+ }
+
+ cm->seq_params = *seq_params;
+ pbi->sequence_header_ready = 1;
+
+ // Bits consumed since entry, rounded up to whole bytes.
+ return ((rb->bit_offset - saved_bit_offset + 7) >> 3);
+}
+
+// Thin wrapper around av1_decode_frame_headers_and_setup().
+// On success, returns the frame header size. On failure, calls
+// aom_internal_error and does not return.
+static uint32_t read_frame_header_obu(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      const uint8_t *data,
+                                      const uint8_t **p_data_end,
+                                      int trailing_bits_present) {
+  const uint32_t header_bytes = av1_decode_frame_headers_and_setup(
+      pbi, rb, data, p_data_end, trailing_bits_present);
+  return header_bytes;
+}
+
+// Reads a tile group header from 'rb' and reports the tile range it covers
+// through *start_tile / *end_tile. Returns the number of bytes consumed
+// (bits rounded up), or -1 after raising AOM_CODEC_UNSUP_BITSTREAM when an
+// explicit tile range is signalled although 'tile_start_implicit' is set.
+static int32_t read_tile_group_header(AV1Decoder *pbi,
+                                      struct aom_read_bit_buffer *rb,
+                                      int *start_tile, int *end_tile,
+                                      int tile_start_implicit) {
+  AV1_COMMON *const cm = &pbi->common;
+  const uint32_t first_bit = rb->bit_offset;
+  const int num_tiles = cm->tile_rows * cm->tile_cols;
+
+  // The presence flag is only coded for multi-tile frames outside large scale
+  // tile mode; in every other case it is taken to be 0.
+  int range_is_coded = 0;
+  if (num_tiles > 1 && !cm->large_scale_tile) {
+    range_is_coded = aom_rb_read_bit(rb);
+  }
+
+  // No explicit range: the tile group spans the whole frame.
+  if (!range_is_coded) {
+    *start_tile = 0;
+    *end_tile = num_tiles - 1;
+    return ((rb->bit_offset - first_bit + 7) >> 3);
+  }
+
+  if (tile_start_implicit) {
+    aom_internal_error(
+        &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+        "For OBU_FRAME type obu tile_start_and_end_present_flag must be 0");
+    return -1;
+  }
+
+  const int tile_idx_bits = cm->log2_tile_rows + cm->log2_tile_cols;
+  *start_tile = aom_rb_read_literal(rb, tile_idx_bits);
+  *end_tile = aom_rb_read_literal(rb, tile_idx_bits);
+
+  return ((rb->bit_offset - first_bit + 7) >> 3);
+}
+
+// Parses one tile group: reads the tile group header, byte-aligns the reader
+// and hands the tile data to av1_decode_tg_tiles_and_wrapup().
+// Returns the payload bytes consumed (header + tile data). Returns 0 when the
+// header is rejected or the alignment bits are invalid, and only the header
+// size when start_tile > end_tile. *is_last_tg is written only when tiles
+// were actually decoded.
+static uint32_t read_one_tile_group_obu(
+    AV1Decoder *pbi, struct aom_read_bit_buffer *rb, int is_first_tg,
+    const uint8_t *data, const uint8_t *data_end, const uint8_t **p_data_end,
+    int *is_last_tg, int tile_start_implicit) {
+  AV1_COMMON *const cm = &pbi->common;
+  int first_tile, last_tile;
+
+  assert((rb->bit_offset & 7) == 0);
+  assert(rb->bit_buffer + aom_rb_bytes_read(rb) == data);
+
+  const int32_t hdr_bytes = read_tile_group_header(
+      pbi, rb, &first_tile, &last_tile, tile_start_implicit);
+  if (hdr_bytes == -1) return 0;
+  if (byte_alignment(cm, rb)) return 0;
+  if (first_tile > last_tile) return hdr_bytes;
+
+  const uint8_t *const tile_data = data + hdr_bytes;
+  av1_decode_tg_tiles_and_wrapup(pbi, tile_data, data_end, p_data_end,
+                                 first_tile, last_tile, is_first_tg);
+
+  const int32_t tile_bytes = (uint32_t)(*p_data_end - tile_data);
+
+  // TODO(shan): For now, assume all tile groups received in order
+  *is_last_tg = last_tile == cm->tile_rows * cm->tile_cols - 1;
+  return hdr_bytes + tile_bytes;
+}
+
+// Makes sure pbi->tile_list_output can hold every tile of the current tile
+// list. The required size (pbi->tile_list_size) is the per-tile byte count
+// (luma plus two chroma planes when present, doubled for high bitdepth) times
+// pbi->tile_count_minus_1 + 1. The buffer is reallocated only when the
+// required size exceeds the recorded capacity pbi->buffer_sz.
+// On allocation failure AOM_CODEC_MEM_ERROR is raised through
+// aom_internal_error() and the decoder is left with a NULL buffer and a
+// capacity of 0.
+static void alloc_tile_list_buffer(AV1Decoder *pbi) {
+  // TODO(yunqing): for now, copy each tile's decoded YUV data directly to the
+  // output buffer. This needs to be modified according to the application
+  // requirement.
+  AV1_COMMON *const cm = &pbi->common;
+  const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+  const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+  const int ssy = cm->seq_params.subsampling_y;
+  const int ssx = cm->seq_params.subsampling_x;
+  const int num_planes = av1_num_planes(cm);
+  // Widen before multiplying so the products are computed in size_t rather
+  // than int.
+  const size_t yplane_tile_size =
+      (size_t)tile_height_in_pixels * tile_width_in_pixels;
+  const size_t uvplane_tile_size =
+      (num_planes > 1) ? (size_t)(tile_height_in_pixels >> ssy) *
+                             (tile_width_in_pixels >> ssx)
+                       : 0;
+  const size_t tile_size = (cm->seq_params.use_highbitdepth ? 2 : 1) *
+                           (yplane_tile_size + 2 * uvplane_tile_size);
+  pbi->tile_list_size = tile_size * (pbi->tile_count_minus_1 + 1);
+
+  if (pbi->tile_list_size > pbi->buffer_sz) {
+    if (pbi->tile_list_output != NULL) aom_free(pbi->tile_list_output);
+    pbi->tile_list_output = NULL;
+    // The old buffer is gone, so its recorded capacity must go with it.
+    // Otherwise a failed allocation below would leave a non-zero buffer_sz
+    // paired with a NULL buffer, and a later, smaller request would skip the
+    // allocation entirely.
+    pbi->buffer_sz = 0;
+
+    pbi->tile_list_output = (uint8_t *)aom_memalign(32, pbi->tile_list_size);
+    if (pbi->tile_list_output == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Failed to allocate the tile list output buffer");
+      // Only reached if aom_internal_error() returns; never record a capacity
+      // for a buffer that does not exist.
+      return;
+    }
+    pbi->buffer_sz = pbi->tile_list_size;
+  }
+}
+
+// Copies the tile at (pbi->dec_tile_row, pbi->dec_tile_col) of the frame
+// returned by get_frame_new_buffer() into *output, plane after plane and row
+// after row, and advances *output past the bytes written.
+static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi,
+ uint8_t **output) {
+ AV1_COMMON *const cm = &pbi->common;
+ const int tile_width_in_pixels = cm->tile_width * MI_SIZE;
+ const int tile_height_in_pixels = cm->tile_height * MI_SIZE;
+ const int ssy = cm->seq_params.subsampling_y;
+ const int ssx = cm->seq_params.subsampling_x;
+ const int num_planes = av1_num_planes(cm);
+
+ // Copy decoded tile to the tile list output buffer.
+ YV12_BUFFER_CONFIG *cur_frame = get_frame_new_buffer(cm);
+ // Top-left corner of the tile, in mode info units.
+ const int mi_row = pbi->dec_tile_row * cm->tile_height;
+ const int mi_col = pbi->dec_tile_col * cm->tile_width;
+ const int is_hbd = (cur_frame->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ uint8_t *bufs[MAX_MB_PLANE] = { NULL, NULL, NULL };
+ int strides[MAX_MB_PLANE] = { 0, 0, 0 };
+ int plane;
+
+ for (plane = 0; plane < num_planes; ++plane) {
+ // Subsampling shifts apply to the chroma planes only.
+ int shift_x = plane > 0 ? ssx : 0;
+ int shift_y = plane > 0 ? ssy : 0;
+
+ bufs[plane] = cur_frame->buffers[plane];
+ // Both chroma planes use strides[1].
+ strides[plane] =
+ (plane > 0) ? cur_frame->strides[1] : cur_frame->strides[0];
+
+ // Move to the tile's first sample; the offset is applied before the
+ // high bitdepth pointer conversion below.
+ bufs[plane] += mi_row * (MI_SIZE >> shift_y) * strides[plane] +
+ mi_col * (MI_SIZE >> shift_x);
+
+ if (is_hbd) {
+ // NOTE(review): the stride is doubled here, presumably because it is
+ // expressed in 16-bit samples while the copy below advances in bytes -
+ // confirm.
+ bufs[plane] = (uint8_t *)CONVERT_TO_SHORTPTR(bufs[plane]);
+ strides[plane] *= 2;
+ }
+
+ // w is the number of bytes copied per row, h the number of rows.
+ int w, h;
+ w = (plane > 0 && shift_x > 0) ? ((tile_width_in_pixels + 1) >> shift_x)
+ : tile_width_in_pixels;
+ w *= (1 + is_hbd);
+ h = (plane > 0 && shift_y > 0) ? ((tile_height_in_pixels + 1) >> shift_y)
+ : tile_height_in_pixels;
+ int j;
+
+ for (j = 0; j < h; ++j) {
+ memcpy(*output, bufs[plane], w);
+ bufs[plane] += strides[plane];
+ *output += w;
+ }
+ }
+}
+
+// Only called while large_scale_tile = 1.
+// Parses a tile list OBU: a 4-byte list header followed by one entry per
+// tile, each made of a 5-byte tile header and the coded tile data. Every tile
+// is decoded and then copied into pbi->tile_list_output.
+// Returns the number of payload bytes consumed and sets
+// *frame_decoding_finished to 1 on success. On a validation failure, sets
+// cm->error.error_code to AOM_CODEC_CORRUPT_FRAME and returns 0.
+static uint32_t read_and_decode_one_tile_list(AV1Decoder *pbi,
+ struct aom_read_bit_buffer *rb,
+ const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end,
+ int *frame_decoding_finished) {
+ AV1_COMMON *const cm = &pbi->common;
+ uint32_t tile_list_payload_size = 0;
+ const int num_tiles = cm->tile_cols * cm->tile_rows;
+ const int start_tile = 0;
+ const int end_tile = num_tiles - 1;
+ int i = 0;
+
+ // Process the tile list info.
+ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8);
+ pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16);
+ if (pbi->tile_count_minus_1 > MAX_TILES - 1) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ // Allocate output frame buffer for the tile list.
+ alloc_tile_list_buffer(pbi);
+
+ // 8 + 8 + 16 bits of list header were read above.
+ uint32_t tile_list_info_bytes = 4;
+ tile_list_payload_size += tile_list_info_bytes;
+ data += tile_list_info_bytes;
+ uint8_t *output = pbi->tile_list_output;
+
+ for (i = 0; i <= pbi->tile_count_minus_1; i++) {
+ // Process 1 tile.
+ // Reset the bit reader.
+ rb->bit_offset = 0;
+ rb->bit_buffer = data;
+
+ // Read out the tile info.
+ // 8 (reference) + 8 (row) + 8 (column) + 16 (size) bits per tile header.
+ uint32_t tile_info_bytes = 5;
+ // Set reference for each tile.
+ int ref_idx = aom_rb_read_literal(rb, 8);
+ if (ref_idx >= MAX_EXTERNAL_REFERENCES) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+ av1_set_reference_dec(cm, 0, 1, &pbi->ext_refs.refs[ref_idx]);
+
+ pbi->dec_tile_row = aom_rb_read_literal(rb, 8);
+ pbi->dec_tile_col = aom_rb_read_literal(rb, 8);
+ if (pbi->dec_tile_row < 0 || pbi->dec_tile_col < 0 ||
+ pbi->dec_tile_row >= cm->tile_rows ||
+ pbi->dec_tile_col >= cm->tile_cols) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ // The size is coded minus one.
+ pbi->coded_tile_data_size = aom_rb_read_literal(rb, 16) + 1;
+ data += tile_info_bytes;
+ // The coded tile data must fit in what is left of the OBU payload.
+ if ((size_t)(data_end - data) < pbi->coded_tile_data_size) {
+ cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+ return 0;
+ }
+
+ av1_decode_tg_tiles_and_wrapup(pbi, data, data + pbi->coded_tile_data_size,
+ p_data_end, start_tile, end_tile, 0);
+ uint32_t tile_payload_size = (uint32_t)(*p_data_end - data);
+
+ tile_list_payload_size += tile_info_bytes + tile_payload_size;
+
+ // Update data ptr for next tile decoding.
+ data = *p_data_end;
+ assert(data <= data_end);
+
+ // Copy the decoded tile to the tile list output buffer.
+ copy_decoded_tile_to_tile_list_buffer(pbi, &output);
+ }
+
+ *frame_decoding_finished = 1;
+ return tile_list_payload_size;
+}
+
+// Walks over an ITU-T T.35 metadata payload one byte at a time. The bytes are
+// read and discarded; nothing is stored.
+static void read_metadata_itut_t35(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  size_t bytes_left = sz;
+  while (bytes_left > 0) {
+    aom_rb_read_literal(&rb, 8);
+    --bytes_left;
+  }
+}
+
+// Reads and discards the HDR CLL metadata payload.
+static void read_metadata_hdr_cll(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  // Two 16-bit fields, in order: max_cll, max_fall.
+  for (int field = 0; field < 2; ++field) {
+    aom_rb_read_literal(&rb, 16);
+  }
+}
+
+// Reads and discards the HDR MDCV metadata payload.
+static void read_metadata_hdr_mdcv(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+
+  // Eight 16-bit chromaticity coordinates: (x, y) for each of the three
+  // primaries, followed by the white point (x, y).
+  const int num_coordinates = 3 * 2 + 2;
+  for (int coord = 0; coord < num_coordinates; ++coord) {
+    aom_rb_read_literal(&rb, 16);
+  }
+
+  // luminance_max, then luminance_min: 32 bits each.
+  for (int lum = 0; lum < 2; ++lum) {
+    aom_rb_read_unsigned_literal(&rb, 32);
+  }
+}
+
+// Reads and discards an explicit scalability structure from 'rb'. Only the
+// counts and presence flags are kept, and only to know how many further fields
+// to read.
+static void scalability_structure(struct aom_read_bit_buffer *rb) {
+  // spatial_layers_cnt is coded in 2 bits; the loops below run cnt + 1 times.
+  const int num_spatial_entries = aom_rb_read_literal(rb, 2) + 1;
+  const int has_layer_dimensions = aom_rb_read_bit(rb);
+  const int has_layer_description = aom_rb_read_bit(rb);
+  const int has_temporal_group = aom_rb_read_bit(rb);
+  aom_rb_read_literal(rb, 3);  // reserved
+
+  if (has_layer_dimensions) {
+    // Two 16-bit values per spatial layer.
+    for (int layer = 0; layer < num_spatial_entries; ++layer) {
+      aom_rb_read_literal(rb, 16);
+      aom_rb_read_literal(rb, 16);
+    }
+  }
+
+  if (has_layer_description) {
+    // One 8-bit value per spatial layer.
+    for (int layer = 0; layer < num_spatial_entries; ++layer) {
+      aom_rb_read_literal(rb, 8);
+    }
+  }
+
+  if (!has_temporal_group) return;
+
+  const int temporal_group_size = aom_rb_read_literal(rb, 8);
+  for (int entry = 0; entry < temporal_group_size; ++entry) {
+    aom_rb_read_literal(rb, 3);
+    aom_rb_read_bit(rb);
+    aom_rb_read_bit(rb);
+    // A 3-bit count followed by that many 8-bit values.
+    const int temporal_group_ref_cnt = aom_rb_read_literal(rb, 3);
+    for (int ref = 0; ref < temporal_group_ref_cnt; ++ref) {
+      aom_rb_read_literal(rb, 8);
+    }
+  }
+}
+
+// Reads the 8-bit scalability_mode_idc and, only when it equals
+// SCALABILITY_SS, walks the explicit scalability structure that follows.
+static void read_metadata_scalability(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  const int mode_idc = aom_rb_read_literal(&rb, 8);
+  if (mode_idc != SCALABILITY_SS) return;
+  scalability_structure(&rb);
+}
+
+// Reads and discards a timecode metadata payload. The only values kept are the
+// flags and the length that decide which optional fields follow.
+static void read_metadata_timecode(const uint8_t *data, size_t sz) {
+  struct aom_read_bit_buffer rb = { data, data + sz, 0, NULL, NULL };
+  aom_rb_read_literal(&rb, 5);                      // counting_type f(5)
+  const int has_full_timestamp = aom_rb_read_bit(&rb);  // full_timestamp_flag f(1)
+  aom_rb_read_bit(&rb);                             // discontinuity_flag f(1)
+  aom_rb_read_bit(&rb);                             // cnt_dropped_flag f(1)
+  aom_rb_read_literal(&rb, 9);                      // n_frames f(9)
+
+  if (has_full_timestamp) {
+    aom_rb_read_literal(&rb, 6);  // seconds_value f(6)
+    aom_rb_read_literal(&rb, 6);  // minutes_value f(6)
+    aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+  } else if (aom_rb_read_bit(&rb)) {  // seconds_flag f(1)
+    aom_rb_read_literal(&rb, 6);      // seconds_value f(6)
+    if (aom_rb_read_bit(&rb)) {       // minutes_flag f(1)
+      aom_rb_read_literal(&rb, 6);    // minutes_value f(6)
+      if (aom_rb_read_bit(&rb)) {     // hours_flag f(1)
+        aom_rb_read_literal(&rb, 5);  // hours_value f(5)
+      }
+    }
+  }
+
+  // time_offset_length f(5); the offset itself is present only when the
+  // length is non-zero.
+  const int offset_bits = aom_rb_read_literal(&rb, 5);
+  if (offset_bits != 0) {
+    aom_rb_read_literal(&rb, offset_bits);  // f(time_offset_length)
+  }
+}
+
+// Decodes the leb128 metadata type at the start of 'data' and dispatches the
+// rest of the payload to the matching reader. Unknown types, and payloads
+// whose type cannot be decoded, are ignored. Always returns 'sz'.
+static size_t read_metadata(const uint8_t *data, size_t sz) {
+  size_t type_length;
+  uint64_t type_value;
+  if (aom_uleb_decode(data, sz, &type_value, &type_length) < 0) {
+    return sz;
+  }
+
+  const OBU_METADATA_TYPE metadata_type = (OBU_METADATA_TYPE)type_value;
+  switch (metadata_type) {
+    case OBU_METADATA_TYPE_ITUT_T35:
+      read_metadata_itut_t35(data + type_length, sz - type_length);
+      break;
+    case OBU_METADATA_TYPE_HDR_CLL:
+      read_metadata_hdr_cll(data + type_length, sz - type_length);
+      break;
+    case OBU_METADATA_TYPE_HDR_MDCV:
+      read_metadata_hdr_mdcv(data + type_length, sz - type_length);
+      break;
+    case OBU_METADATA_TYPE_SCALABILITY:
+      read_metadata_scalability(data + type_length, sz - type_length);
+      break;
+    case OBU_METADATA_TYPE_TIMECODE:
+      read_metadata_timecode(data + type_length, sz - type_length);
+      break;
+    default: break;
+  }
+
+  return sz;
+}
+
+// Decodes OBUs from [data, data_end) one after another until a frame is
+// complete, the input is exhausted before any frame header was seen, or an
+// error occurs.
+// On success, returns a boolean that indicates whether the decoding of the
+// current frame is finished. On failure, sets cm->error.error_code and
+// returns -1.
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+                               const uint8_t *data_end,
+                               const uint8_t **p_data_end) {
+  AV1_COMMON *const cm = &pbi->common;
+  int frame_decoding_finished = 0;
+  int is_first_tg_obu_received = 1;
+  uint32_t frame_header_size = 0;
+  ObuHeader obu_header;
+  memset(&obu_header, 0, sizeof(obu_header));
+  pbi->seen_frame_header = 0;
+
+  if (data_end < data) {
+    cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+    return -1;
+  }
+
+  // Reset pbi->camera_frame_header_ready to 0 if cm->large_scale_tile = 0.
+  if (!cm->large_scale_tile) pbi->camera_frame_header_ready = 0;
+
+  // decode frame as a series of OBUs
+  while (!frame_decoding_finished && !cm->error.error_code) {
+    struct aom_read_bit_buffer rb;
+    size_t payload_size = 0;
+    size_t decoded_payload_size = 0;
+    size_t obu_payload_offset = 0;
+    size_t bytes_read = 0;
+    const size_t bytes_available = data_end - data;
+
+    // Running out of input before any frame header is not an error.
+    if (bytes_available == 0 && !pbi->seen_frame_header) {
+      *p_data_end = data;
+      cm->error.error_code = AOM_CODEC_OK;
+      break;
+    }
+
+    aom_codec_err_t status =
+        aom_read_obu_header_and_size(data, bytes_available, cm->is_annexb,
+                                     &obu_header, &payload_size, &bytes_read);
+
+    if (status != AOM_CODEC_OK) {
+      cm->error.error_code = status;
+      return -1;
+    }
+
+    // Record obu size header information.
+    pbi->obu_size_hdr.data = data + obu_header.size;
+    pbi->obu_size_hdr.size = bytes_read - obu_header.size;
+
+    // Note: aom_read_obu_header_and_size() takes care of checking that this
+    // doesn't cause 'data' to advance past 'data_end'.
+    data += bytes_read;
+
+    if ((size_t)(data_end - data) < payload_size) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+
+    cm->temporal_layer_id = obu_header.temporal_layer_id;
+    cm->spatial_layer_id = obu_header.spatial_layer_id;
+
+    if (obu_header.type != OBU_TEMPORAL_DELIMITER &&
+        obu_header.type != OBU_SEQUENCE_HEADER &&
+        obu_header.type != OBU_PADDING) {
+      // don't decode obu if it's not in current operating mode
+      if (!is_obu_in_current_operating_point(pbi, obu_header)) {
+        data += payload_size;
+        continue;
+      }
+    }
+
+    av1_init_read_bit_buffer(pbi, &rb, data, data + payload_size);
+
+    switch (obu_header.type) {
+      case OBU_TEMPORAL_DELIMITER:
+        decoded_payload_size = read_temporal_delimiter_obu();
+        pbi->seen_frame_header = 0;
+        break;
+      case OBU_SEQUENCE_HEADER:
+        decoded_payload_size = read_sequence_header_obu(pbi, &rb);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        break;
+      case OBU_FRAME_HEADER:
+      case OBU_REDUNDANT_FRAME_HEADER:
+      case OBU_FRAME:
+        // Only decode first frame header received
+        if (!pbi->seen_frame_header ||
+            (cm->large_scale_tile && !pbi->camera_frame_header_ready)) {
+          frame_header_size = read_frame_header_obu(
+              pbi, &rb, data, p_data_end, obu_header.type != OBU_FRAME);
+          pbi->seen_frame_header = 1;
+          if (!pbi->ext_tile_debug && cm->large_scale_tile)
+            pbi->camera_frame_header_ready = 1;
+        } else {
+          // TODO(wtc): Verify that the frame_header_obu is identical to the
+          // original frame_header_obu. For now just skip frame_header_size
+          // bytes in the bit buffer.
+          if (frame_header_size > payload_size) {
+            cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+            return -1;
+          }
+          assert(rb.bit_offset == 0);
+          rb.bit_offset = 8 * frame_header_size;
+        }
+
+        decoded_payload_size = frame_header_size;
+        pbi->frame_header_size = frame_header_size;
+
+        if (cm->show_existing_frame) {
+          if (obu_header.type == OBU_FRAME) {
+            cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+            return -1;
+          }
+          frame_decoding_finished = 1;
+          pbi->seen_frame_header = 0;
+          break;
+        }
+
+        // In large scale tile coding, decode the common camera frame header
+        // before any tile list OBU.
+        if (!pbi->ext_tile_debug && pbi->camera_frame_header_ready) {
+          frame_decoding_finished = 1;
+          // Skip the rest of the frame data.
+          decoded_payload_size = payload_size;
+          // Update data_end.
+          *p_data_end = data_end;
+          break;
+        }
+
+        if (obu_header.type != OBU_FRAME) break;
+        obu_payload_offset = frame_header_size;
+        // Byte align the reader before reading the tile group.
+        if (byte_alignment(cm, &rb)) return -1;
+        AOM_FALLTHROUGH_INTENDED;  // fall through to read tile group.
+      case OBU_TILE_GROUP:
+        if (!pbi->seen_frame_header) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+        if (obu_payload_offset > payload_size) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+        decoded_payload_size += read_one_tile_group_obu(
+            pbi, &rb, is_first_tg_obu_received, data + obu_payload_offset,
+            data + payload_size, p_data_end, &frame_decoding_finished,
+            obu_header.type == OBU_FRAME);
+        // read_one_tile_group_obu() signals failure by returning 0 with
+        // cm->error.error_code set. Report it as -1 here, like the sequence
+        // header and tile list cases do, instead of carrying on to the
+        // padding check and possibly returning 0 or 1.
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        is_first_tg_obu_received = 0;
+        if (frame_decoding_finished) pbi->seen_frame_header = 0;
+        break;
+      case OBU_METADATA:
+        decoded_payload_size = read_metadata(data, payload_size);
+        break;
+      case OBU_TILE_LIST:
+        if (CONFIG_NORMAL_TILE_MODE) {
+          cm->error.error_code = AOM_CODEC_UNSUP_BITSTREAM;
+          return -1;
+        }
+
+        // This OBU type is purely for the large scale tile coding mode.
+        // The common camera frame header has to be already decoded.
+        if (!pbi->camera_frame_header_ready) {
+          cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+          return -1;
+        }
+
+        cm->large_scale_tile = 1;
+        av1_set_single_tile_decoding_mode(cm);
+        decoded_payload_size =
+            read_and_decode_one_tile_list(pbi, &rb, data, data + payload_size,
+                                          p_data_end, &frame_decoding_finished);
+        if (cm->error.error_code != AOM_CODEC_OK) return -1;
+        break;
+      case OBU_PADDING:
+      default:
+        // Skip unrecognized OBUs
+        decoded_payload_size = payload_size;
+        break;
+    }
+
+    // Check that the signalled OBU size matches the actual amount of data read
+    if (decoded_payload_size > payload_size) {
+      cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+      return -1;
+    }
+
+    // If there are extra padding bytes, they should all be zero
+    while (decoded_payload_size < payload_size) {
+      uint8_t padding_byte = data[decoded_payload_size++];
+      if (padding_byte != 0) {
+        cm->error.error_code = AOM_CODEC_CORRUPT_FRAME;
+        return -1;
+      }
+    }
+
+    data += payload_size;
+  }
+
+  return frame_decoding_finished;
+}
diff --git a/third_party/aom/av1/decoder/obu.h b/third_party/aom/av1/decoder/obu.h
new file mode 100644
index 0000000000..5ab243fc90
--- /dev/null
+++ b/third_party/aom/av1/decoder/obu.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_DECODER_OBU_H_
+#define AOM_AV1_DECODER_OBU_H_
+
+#include "aom/aom_codec.h"
+#include "av1/decoder/decoder.h"
+
+// Try to decode one frame from a buffer.
+// Returns 1 if we decoded a frame,
+// 0 if we didn't decode a frame but that's okay
+// (eg, if there was a frame but we skipped it),
+// or -1 on error
+// NOTE(review): the error returns visible at the end of the definition set
+// pbi's common error.error_code before returning -1; confirm that every
+// error path does so before relying on it.
+int aom_decode_frame_from_obus(struct AV1Decoder *pbi, const uint8_t *data,
+ const uint8_t *data_end,
+ const uint8_t **p_data_end);
+
+// Results are written through num_spatial_layers and num_temporal_layers;
+// the return value is an aom_codec_err_t status.
+// NOTE(review): the definition is not visible from this header; presumably
+// the layer counts are derived from operating_point_idc - confirm in obu.c.
+aom_codec_err_t aom_get_num_layers_from_operating_point_idc(
+ int operating_point_idc, unsigned int *num_spatial_layers,
+ unsigned int *num_temporal_layers);
+
+#endif // AOM_AV1_DECODER_OBU_H_
diff --git a/third_party/aom/av1/encoder/aq_complexity.c b/third_party/aom/av1/encoder/aq_complexity.c
new file mode 100644
index 0000000000..80f8e2e66d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+// Number of segments used by complexity AQ (columns of the tables below).
+#define AQ_C_SEGMENTS 5
+#define DEFAULT_AQ2_SEG 3 // Neutral Q segment
+// Number of strength levels (rows of the tables below); the row is chosen by
+// get_aq_c_strength().
+#define AQ_C_STRENGTHS 3
+// Per-segment factor handed to av1_compute_qdelta_by_rate() when the segment
+// q deltas are set up. Indexed [aq_strength][segment].
+static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 1.75, 1.25, 1.05, 1.00, 0.90 },
+ { 2.00, 1.50, 1.15, 1.00, 0.85 },
+ { 2.50, 1.75, 1.25, 1.00, 0.80 }
+};
+// Per-segment multiplier applied to a block's target rate; a block can only
+// be assigned segment i if its projected rate is below target_rate * [i].
+static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { 0.15, 0.30, 0.55, 2.00, 100.0 },
+ { 0.20, 0.40, 0.65, 2.00, 100.0 },
+ { 0.25, 0.50, 0.75, 2.00, 100.0 }
+};
+// Per-segment offset added to the low-variance threshold; a block can only
+// be assigned segment i if its log variance is below threshold + [i].
+static const double aq_c_var_thresholds[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {
+ { -4.0, -3.0, -2.0, 100.00, 100.0 },
+ { -3.5, -2.5, -1.5, 100.00, 100.0 },
+ { -3.0, -2.0, -1.0, 100.00, 100.0 }
+};
+
+// Maps the base q index to an AQ strength level (0, 1 or 2) that selects the
+// row of the aq_c_* tables.
+static int get_aq_c_strength(int q_index, aom_bit_depth_t bit_depth) {
+  // Approximate base quantizer (integer division truncates).
+  const int approx_quant = av1_ac_quant_Q3(q_index, 0, bit_depth) / 4;
+  int strength = 0;
+  if (approx_quant > 10) ++strength;
+  if (approx_quant > 25) ++strength;
+  return strength;
+}
+
+// Sets up the segment q deltas used by complexity AQ for the current frame.
+//
+// - On a resolution change the segmentation map is zeroed and segmentation
+// is disabled.
+// - On intra-only frames, error-resilient frames, alt-ref refreshes and
+// golden refreshes that are not sourced from the alt ref, the map is reset
+// to DEFAULT_AQ2_SEG and a q delta is computed for every other segment
+// (segmentation is disabled instead when the per-SB target rate is low).
+// - On all other frames nothing is changed.
+void av1_setup_in_frame_q_adj(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+
+ // Make SURE use of floating point in this function is safe.
+ aom_clear_system_state();
+
+ if (resolution_change) {
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_clearall_segfeatures(seg);
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cpi->refresh_alt_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+ int segment;
+ const int aq_strength =
+ get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+
+ // Clear down the segment map.
+ memset(cpi->segmentation_map, DEFAULT_AQ2_SEG, cm->mi_rows * cm->mi_cols);
+
+ av1_clearall_segfeatures(seg);
+
+ // Segmentation only makes sense if the target bits per SB is above a
+ // threshold. Below this the overheads will usually outweigh any benefit.
+ if (cpi->rc.sb64_target_rate < 256) {
+ av1_disable_segmentation(seg);
+ return;
+ }
+
+ av1_enable_segmentation(seg);
+
+ // Default segment "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, DEFAULT_AQ2_SEG, SEG_LVL_ALT_Q);
+
+ // Use some of the segments for in frame Q adjustment.
+ for (segment = 0; segment < AQ_C_SEGMENTS; ++segment) {
+ int qindex_delta;
+
+ if (segment == DEFAULT_AQ2_SEG) continue;
+
+ qindex_delta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, cm->base_qindex,
+ aq_c_q_adj_factor[aq_strength][segment], cm->seq_params.bit_depth);
+
+ // For AQ complexity mode, we don't allow Q0 in a segment if the base
+ // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment
+ // Q delta is sometimes applied without going back around the rd loop.
+ // This could lead to an illegal combination of partition size and q.
+ if ((cm->base_qindex != 0) && ((cm->base_qindex + qindex_delta) == 0)) {
+ qindex_delta = -cm->base_qindex + 1;
+ }
+ // Only segments whose resulting q index stays positive get the
+ // alternate-Q feature; the rest keep the baseline Q.
+ if ((cm->base_qindex + qindex_delta) > 0) {
+ av1_enable_segfeature(seg, segment, SEG_LVL_ALT_Q);
+ av1_set_segdata(seg, segment, SEG_LVL_ALT_Q, qindex_delta);
+ }
+ }
+ }
+}
+
+#define DEFAULT_LV_THRESH 10.0
+#define MIN_DEFAULT_LV_THRESH 8.0
+// Select a segment for the current block.
+// The choice of segment for a block depends on the ratio of the projected
+// bits for the block vs a target average and its spatial complexity.
+//
+// cpi:            encoder instance; cpi->segmentation_map is written for
+//                 every mi unit of the block that lies inside the frame.
+// mb:             macroblock context; its source planes are pointed at
+//                 (mi_row, mi_col) before the block variance is measured.
+// bs:             size of the block.
+// mi_row, mi_col: position of the block in mi units.
+// projected_rate: projected rate for the block, compared against the scaled
+//                 per-block target rate.
+void av1_caq_select_segment(const AV1_COMP *cpi, MACROBLOCK *mb, BLOCK_SIZE bs,
+                            int mi_row, int mi_col, int projected_rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  const int mi_offset = mi_row * cm->mi_cols + mi_col;
+  // Clip the block to the frame boundary.
+  const int xmis = AOMMIN(cm->mi_cols - mi_col, mi_size_wide[bs]);
+  const int ymis = AOMMIN(cm->mi_rows - mi_row, mi_size_high[bs]);
+
+  // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
+  // It is converted to bits << AV1_PROB_COST_SHIFT units.
+  // The first operand is widened to 64 bits *before* multiplying so that a
+  // large target rate cannot overflow int arithmetic.
+  const int64_t num = ((int64_t)cpi->rc.sb64_target_rate * xmis * ymis)
+                      << AV1_PROB_COST_SHIFT;
+  const int denom = cm->seq_params.mib_size * cm->seq_params.mib_size;
+  const int target_rate = (int)(num / denom);
+  const int aq_strength =
+      get_aq_c_strength(cm->base_qindex, cm->seq_params.bit_depth);
+  double logvar;
+  double low_var_thresh;
+  unsigned char segment;
+  int x, y;
+  int i;
+
+  // Make sure use of floating point below is safe.
+  aom_clear_system_state();
+  low_var_thresh =
+      (cpi->oxcf.pass == 2)
+          ? AOMMAX(exp(cpi->twopass.mb_av_energy), MIN_DEFAULT_LV_THRESH)
+          : DEFAULT_LV_THRESH;
+
+  av1_setup_src_planes(mb, cpi->source, mi_row, mi_col, num_planes);
+  logvar = av1_log_block_var(cpi, mb, bs);
+
+  segment = AQ_C_SEGMENTS - 1;  // Just in case no break out below.
+  for (i = 0; i < AQ_C_SEGMENTS; ++i) {
+    // Test rate against a threshold value and variance against a threshold.
+    // Increasing segment number (higher variance and complexity) = higher Q.
+    if ((projected_rate < target_rate * aq_c_transitions[aq_strength][i]) &&
+        (logvar < (low_var_thresh + aq_c_var_thresholds[aq_strength][i]))) {
+      segment = i;
+      break;
+    }
+  }
+
+  // Fill in the entries in the segment map corresponding to this block.
+  for (y = 0; y < ymis; y++) {
+    for (x = 0; x < xmis; x++) {
+      cpi->segmentation_map[mi_offset + y * cm->mi_cols + x] = segment;
+    }
+  }
+}
diff --git a/third_party/aom/av1/encoder/aq_complexity.h b/third_party/aom/av1/encoder/aq_complexity.h
new file mode 100644
index 0000000000..3421d74c93
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_complexity.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+#define AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/common/enums.h"
+
+struct AV1_COMP;
+struct macroblock;
+
+// Select a segment for the current Block.
+// The segment is chosen from the block's projected rate and its log
+// variance, and is written into cpi's segmentation map for every mi unit of
+// the block that lies inside the frame.
+void av1_caq_select_segment(const struct AV1_COMP *cpi, struct macroblock *,
+ BLOCK_SIZE bs, int mi_row, int mi_col,
+ int projected_rate);
+
+// This function sets up a set of segments with delta Q values around
+// the baseline frame quantizer.
+// It may also disable segmentation (on a resolution change, or when the
+// per-superblock target rate is too low).
+void av1_setup_in_frame_q_adj(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_COMPLEXITY_H_
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.c b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
new file mode 100644
index 0000000000..f532d48da5
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "av1/common/seg_common.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/segmentation.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/system_state.h"
+
+// State for cyclic refresh AQ. Instances are created zero-initialised by
+// av1_cyclic_refresh_alloc() and released with av1_cyclic_refresh_free().
+struct CYCLIC_REFRESH {
+ // Percentage of blocks per frame that are targeted as candidates
+ // for cyclic refresh.
+ int percent_refresh;
+ // Maximum q-delta as percentage of base q.
+ int max_qdelta_perc;
+ // Superblock starting index for cycling through the frame.
+ int sb_index;
+ // Controls how long block will need to wait to be refreshed again, in
+ // excess of the cycle time, i.e., in the case of all zero motion, block
+ // will be refreshed every (100/percent_refresh + time_for_refresh) frames.
+ int time_for_refresh;
+ // Target number of (8x8) blocks that are set for delta-q.
+ int target_num_seg_blocks;
+ // Actual number of (8x8) blocks that were applied delta-q.
+ int actual_num_seg1_blocks;
+ int actual_num_seg2_blocks;
+ // RD mult. parameters for segment 1.
+ int rdmult;
+ // Cyclic refresh map, one entry per mi unit:
+ // < 0: refreshed recently; incremented once per visit until it reaches 0,
+ // 0: candidate for refresh,
+ // 1: not a candidate for refresh.
+ int8_t *map;
+ // Map of the last q a block was coded at (initialised to MAXQ).
+ uint8_t *last_coded_q_map;
+ // Thresholds applied to the projected rate/distortion of the coding block,
+ // when deciding whether block should be refreshed.
+ int64_t thresh_rate_sb;
+ int64_t thresh_dist_sb;
+ // Threshold applied to the motion vector (in units of 1/8 pel) of the
+ // coding block, when deciding whether block should be refreshed.
+ int16_t motion_thresh;
+ // Rate target ratio to set q delta.
+ double rate_ratio_qdelta;
+ // Boost factor for rate target ratio, for segment CR_SEGMENT_ID_BOOST2.
+ int rate_boost_fac;
+ // Running average of the per-frame fraction of map entries below 1,
+ // maintained by av1_cyclic_refresh_check_golden_update().
+ double low_content_avg;
+ // q-index delta per segment id. Entries 1 (BOOST1) and 2 (BOOST2) are set
+ // in av1_cyclic_refresh_setup(); entry 0 (BASE) is not written in this file
+ // and so keeps its zero-initialised value.
+ int qindex_delta[3];
+};
+
+// Allocates a CYCLIC_REFRESH sized for an mi_rows x mi_cols frame.
+// The refresh map starts out zeroed and every last_coded_q_map entry starts
+// at MAXQ. Returns NULL (with nothing left allocated) if any allocation
+// fails.
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
+  CYCLIC_REFRESH *const refresh = aom_calloc(1, sizeof(*refresh));
+  size_t q_map_bytes;
+  if (!refresh) return NULL;
+
+  refresh->map = aom_calloc(mi_rows * mi_cols, sizeof(*refresh->map));
+  if (!refresh->map) {
+    av1_cyclic_refresh_free(refresh);
+    return NULL;
+  }
+
+  q_map_bytes = mi_rows * mi_cols * sizeof(*refresh->last_coded_q_map);
+  refresh->last_coded_q_map = aom_malloc(q_map_bytes);
+  if (!refresh->last_coded_q_map) {
+    av1_cyclic_refresh_free(refresh);
+    return NULL;
+  }
+
+  // memset() stores MAXQ as a single byte, so it has to fit in one.
+  assert(MAXQ <= 255);
+  memset(refresh->last_coded_q_map, MAXQ, q_map_bytes);
+
+  return refresh;
+}
+
+// Releases a CYCLIC_REFRESH together with both of its maps.
+// Passing NULL is allowed and does nothing.
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr) {
+  if (cr == NULL) return;
+
+  aom_free(cr->map);
+  aom_free(cr->last_coded_q_map);
+  aom_free(cr);
+}
+
+// Decides from the bitrate and frame size whether cyclic refresh should be
+// used. Returns 1 to apply cyclic refresh, 0 to turn it off.
+static int apply_cyclic_refresh_bitrate(const AV1_COMMON *cm,
+                                        const RATE_CONTROL *rc) {
+  // Segment map bit cost should scale with the number of seg blocks, so the
+  // bits available per frame (avg_frame_bandwidth) are compared against the
+  // number of (8x8) blocks in the frame (mi_rows * mi_cols).
+  const float factor = 0.25;
+  const int number_blocks = cm->mi_rows * cm->mi_cols;
+  // Corresponds to turning off at target bitrates (at 30fps) of roughly
+  // 12kbps for CIF, 36kbps for VGA and 100kbps for HD/720p.
+  const int too_few_bits = rc->avg_frame_bandwidth < factor * number_blocks;
+  // Very small frames (threshold is below QCIF) would refresh too large a
+  // fraction of their superblocks per frame.
+  const int frame_too_small = number_blocks / 64 < 5;
+
+  return !(too_few_bits || frame_too_small);
+}
+
+// Classifies a coding block of size bsize for refresh (lower-qp coding),
+// based on its mode, motion vector and projected rate/distortion.
+// Returns CR_SEGMENT_ID_BASE when the block is rejected, otherwise
+// CR_SEGMENT_ID_BOOST1 or CR_SEGMENT_ID_BOOST2.
+static int candidate_refresh_aq(const CYCLIC_REFRESH *cr,
+                                const MB_MODE_INFO *mbmi, int64_t rate,
+                                int64_t dist, int bsize) {
+  const MV mv = mbmi->mv[0].as_mv;
+  const int limit = cr->motion_thresh;
+  const int is_inter = is_inter_block(mbmi) != 0;
+  const int large_mv = mv.row > limit || mv.row < -limit || mv.col > limit ||
+                       mv.col < -limit;
+
+  // Reject the block if the projected distortion is above the threshold and
+  // the mode either uses a large mv or is an intra mode.
+  if (dist > cr->thresh_dist_sb && (large_mv || !is_inter)) {
+    return CR_SEGMENT_ID_BASE;
+  }
+
+  // More aggressive delta-q for bigger blocks with zero motion.
+  if (bsize >= BLOCK_16X16 && rate < cr->thresh_rate_sb && is_inter &&
+      mbmi->mv[0].as_int == 0 && cr->rate_boost_fac > 10) {
+    return CR_SEGMENT_ID_BOOST2;
+  }
+
+  return CR_SEGMENT_ID_BOOST1;
+}
+
+// Computes the q-index delta for a segment at base q index q and the given
+// rate factor, limiting how far below q it may go to max_qdelta_perc percent
+// of q.
+static int compute_deltaq(const AV1_COMP *cpi, int q, double rate_factor) {
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  // Largest allowed reduction. Integer division truncates toward zero, so
+  // negating this equals (-max_qdelta_perc * q) / 100.
+  const int max_drop = cr->max_qdelta_perc * q / 100;
+  const int raw_delta =
+      av1_compute_qdelta_by_rate(rc, cpi->common.frame_type, q, rate_factor,
+                                 cpi->common.seq_params.bit_depth);
+
+  return (-raw_delta > max_drop) ? -max_drop : raw_delta;
+}
+
+// For the just encoded frame, estimate the bits, incorporating the delta-q
+// from the non-base segments. The estimate is a weighted average of the
+// estimates at the base q index and at the BOOST1 / BOOST2 q indices, where
+// each boosted segment is weighted by its share of the frame's 8x8 blocks.
+// Note this function is called in the postencode
+// (called from rc_update_rate_correction_factors()).
+int av1_cyclic_refresh_estimate_bits_at_q(const AV1_COMP *cpi,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int estimated_bits;
+ int mbs = cm->MBs;
+ int num8x8bl = mbs << 2;
+ // Weight for non-base segments: use actual number of blocks refreshed in
+ // previous/just encoded frame. Note number of blocks here is in 8x8 units.
+ double weight_segment1 = (double)cr->actual_num_seg1_blocks / num8x8bl;
+ double weight_segment2 = (double)cr->actual_num_seg2_blocks / num8x8bl;
+ // Take segment weighted average for estimated bits.
+ estimated_bits =
+ (int)((1.0 - weight_segment1 - weight_segment2) *
+ av1_estimate_bits_at_q(cm->frame_type, cm->base_qindex, mbs,
+ correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment1 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[1], mbs,
+ correction_factor, cm->seq_params.bit_depth) +
+ weight_segment2 * av1_estimate_bits_at_q(
+ cm->frame_type,
+ cm->base_qindex + cr->qindex_delta[2], mbs,
+ correction_factor, cm->seq_params.bit_depth));
+ return estimated_bits;
+}
+
+// Prior to encoding the frame, estimate the bits per mb, for a given q = i and
+// a corresponding delta-q (computed from cr->rate_ratio_qdelta). This
+// function is called in the rc_regulate_q() to set the base qp index.
+// Note: the segment map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or
+// to 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock, prior to encoding.
+int av1_cyclic_refresh_rc_bits_per_mb(const AV1_COMP *cpi, int i,
+ double correction_factor) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ int bits_per_mb;
+ int num8x8bl = cm->MBs << 2;
+ // Weight for segment prior to encoding: take the average of the target
+ // number for the frame to be encoded and the actual from the previous frame.
+ // The "actual" count is the sum of the BOOST1 and BOOST2 block counts.
+ double weight_segment =
+ (double)((cr->target_num_seg_blocks + cr->actual_num_seg1_blocks +
+ cr->actual_num_seg2_blocks) >>
+ 1) /
+ num8x8bl;
+ // Compute delta-q corresponding to qindex i.
+ int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta);
+ // Take segment weighted average for bits per mb.
+ bits_per_mb =
+ (int)((1.0 - weight_segment) *
+ av1_rc_bits_per_mb(cm->frame_type, i, correction_factor,
+ cm->seq_params.bit_depth) +
+ weight_segment * av1_rc_bits_per_mb(cm->frame_type, i + deltaq,
+ correction_factor,
+ cm->seq_params.bit_depth));
+ return bits_per_mb;
+}
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+//
+// mbmi->segment_id may be rewritten here. rate and dist are the projected
+// rate/distortion used to classify the block; skip is non-zero when the block
+// will be skipped. The refresh map, the segmentation map and the
+// last-coded-q map are updated for every mi unit of the block that lies
+// inside the frame.
+void av1_cyclic_refresh_update_segment(const AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip) {
+ const AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ // Clip the block to the frame boundary.
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
+ const int block_index = mi_row * cm->mi_cols + mi_col;
+ // Segment id the block qualifies for; CR_SEGMENT_ID_BASE (0) means it was
+ // rejected as a refresh candidate.
+ const int refresh_this_block =
+ candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
+ // Default is to not update the refresh map.
+ int new_map_value = cr->map[block_index];
+ int x = 0;
+ int y = 0;
+
+ // If this block is labeled for refresh, check if we should reset the
+ // segment_id.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ mbmi->segment_id = refresh_this_block;
+ // Reset segment_id if will be skipped.
+ if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
+ }
+
+ // Update the cyclic refresh map, to be used for setting segmentation map
+ // for the next frame. If the block will be refreshed this frame, mark it
+ // as clean. The magnitude of the -ve influences how long before we consider
+ // it for refresh again.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
+ new_map_value = -cr->time_for_refresh;
+ } else if (refresh_this_block) {
+ // Else if it is accepted as candidate for refresh, and has not already
+ // been refreshed (marked as 1) then mark it as a candidate for cleanup
+ // for future time (marked as 0), otherwise don't update it.
+ if (cr->map[block_index] == 1) new_map_value = 0;
+ } else {
+ // Leave it marked as block that is not candidate for refresh.
+ new_map_value = 1;
+ }
+
+ // Update entries in the cyclic refresh map with new_map_value, and
+ // copy mbmi->segment_id into global segmentation map.
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ int map_offset = block_index + y * cm->mi_cols + x;
+ cr->map[map_offset] = new_map_value;
+ cpi->segmentation_map[map_offset] = mbmi->segment_id;
+ // Inter skip blocks were clearly not coded at the current qindex, so
+ // don't update the map for them. For cases where motion is non-zero or
+ // the reference frame isn't the previous frame, the previous value in
+ // the map for this spatial location is not entirely correct.
+ if ((!is_inter_block(mbmi) || !skip) &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ cr->last_coded_q_map[map_offset] = clamp(
+ cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+ } else if (is_inter_block(mbmi) && skip &&
+ mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+ // Skipped inter block: keep the lower of the stored q and the q this
+ // segment would have used.
+ cr->last_coded_q_map[map_offset] =
+ AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+ 0, MAXQ),
+ cr->last_coded_q_map[map_offset]);
+ }
+ }
+}
+
+// After encoding, recount how many segmentation-map entries ended up in
+// CR_SEGMENT_ID_BOOST1 and CR_SEGMENT_ID_BOOST2 and store the totals in
+// actual_num_seg1_blocks / actual_num_seg2_blocks.
+void av1_cyclic_refresh_postencode(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  int boost1_count = 0;
+  int boost2_count = 0;
+  int row, col;
+
+  for (row = 0; row < cm->mi_rows; row++) {
+    for (col = 0; col < cm->mi_cols; col++) {
+      const int seg_id =
+          cyclic_refresh_segment_id(seg_map[row * cm->mi_cols + col]);
+      if (seg_id == CR_SEGMENT_ID_BOOST1) {
+        boost1_count++;
+      } else if (seg_id == CR_SEGMENT_ID_BOOST2) {
+        boost2_count++;
+      }
+    }
+  }
+
+  cr->actual_num_seg1_blocks = boost1_count;
+  cr->actual_num_seg2_blocks = boost2_count;
+}
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(AV1_COMP *const cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // The interval is 4 refresh periods (a period being 100 / percent_refresh
+  // frames), or 40 frames when no refresh percentage is set. Depending on
+  // past encoding stats, GF flag may be reset and update may not occur until
+  // next baseline_gf_interval.
+  rc->baseline_gf_interval =
+      (cr->percent_refresh > 0) ? 4 * (100 / cr->percent_refresh) : 40;
+}
+
+// Update some encoding stats (from the just encoded frame). If this frame's
+// background has high motion, refresh the golden frame. Otherwise, if the
+// golden reference is to be updated check if we should NOT update the golden
+// ref.
+//
+// Reads the motion vectors of the visible mi grid and the cyclic refresh
+// map; may modify cpi->refresh_golden_frame, the golden-frame interval
+// fields of cpi->rc and cr->low_content_avg.
+void av1_cyclic_refresh_check_golden_update(AV1_COMP *const cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  int mi_row, mi_col;
+  double fraction_low = 0.0;
+  int low_content_frame = 0;
+
+  MB_MODE_INFO **mi;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int rows = cm->mi_rows, cols = cm->mi_cols;
+  // cnt1: blocks with small motion (|mv| <= 16 in both directions).
+  // cnt2: blocks with exactly zero motion.
+  int cnt1 = 0, cnt2 = 0;
+  int force_gf_refresh = 0;
+
+  for (mi_row = 0; mi_row < rows; mi_row++) {
+    mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
+    for (mi_col = 0; mi_col < cols; mi_col++) {
+      int16_t abs_mvr = mi[0]->mv[0].as_mv.row >= 0
+                            ? mi[0]->mv[0].as_mv.row
+                            : -1 * mi[0]->mv[0].as_mv.row;
+      int16_t abs_mvc = mi[0]->mv[0].as_mv.col >= 0
+                            ? mi[0]->mv[0].as_mv.col
+                            : -1 * mi[0]->mv[0].as_mv.col;
+
+      // Calculate the motion of the background.
+      if (abs_mvr <= 16 && abs_mvc <= 16) {
+        cnt1++;
+        if (abs_mvr == 0 && abs_mvc == 0) cnt2++;
+      }
+      mi++;
+
+      // Accumulate low_content_frame.
+      if (cr->map[mi_row * cols + mi_col] < 1) low_content_frame++;
+    }
+  }
+
+  // For video conference clips, if the background has high motion in current
+  // frame because of the camera movement, set this frame as the golden frame.
+  // Use 70% and 5% as the thresholds for golden frame refreshing.
+  // Both sides of the first test are scaled by 100 so it really is a 70%
+  // threshold: cnt1 can never exceed rows * cols, so comparing cnt1 * 10
+  // against 70 * rows * cols could never be true.
+  if (cnt1 * 100 > (70 * rows * cols) && cnt2 * 20 < cnt1) {
+    av1_cyclic_refresh_set_golden_update(cpi);
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+    if (rc->frames_till_gf_update_due > rc->frames_to_key)
+      rc->frames_till_gf_update_due = rc->frames_to_key;
+    cpi->refresh_golden_frame = 1;
+    force_gf_refresh = 1;
+  }
+
+  fraction_low = (double)low_content_frame / (rows * cols);
+  // Update average.
+  cr->low_content_avg = (fraction_low + 3 * cr->low_content_avg) / 4;
+  if (!force_gf_refresh && cpi->refresh_golden_frame == 1) {
+    // Don't update golden reference if the amount of low_content for the
+    // current encoded frame is small, or if the recursive average of the
+    // low_content over the update interval window falls below threshold.
+    if (fraction_low < 0.8 || cr->low_content_avg < 0.7)
+      cpi->refresh_golden_frame = 0;
+    // Reset for next interval.
+    cr->low_content_avg = fraction_low;
+  }
+}
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
+// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
+// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
+// encoding of the superblock).
+static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ unsigned char *const seg_map = cpi->segmentation_map;
+ int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
+ int xmis, ymis, x, y;
+ memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
+ // Frame size in superblocks, rounded up.
+ sb_cols =
+ (cm->mi_cols + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+ sb_rows =
+ (cm->mi_rows + cm->seq_params.mib_size - 1) / cm->seq_params.mib_size;
+ sbs_in_frame = sb_cols * sb_rows;
+ // Number of target blocks to get the q delta (segment 1).
+ block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
+ // Set the segmentation map: cycle through the superblocks, starting at
+ // cr->sb_index, and stopping when either block_count blocks have been found
+ // to be refreshed, or we have passed through whole frame.
+ if (cr->sb_index >= sbs_in_frame) cr->sb_index = 0;
+ assert(cr->sb_index < sbs_in_frame);
+ i = cr->sb_index;
+ cr->target_num_seg_blocks = 0;
+ do {
+ int sum_map = 0;
+ // Get the mi_row/mi_col corresponding to superblock index i.
+ int sb_row_index = (i / sb_cols);
+ int sb_col_index = i - sb_row_index * sb_cols;
+ int mi_row = sb_row_index * cm->seq_params.mib_size;
+ int mi_col = sb_col_index * cm->seq_params.mib_size;
+ // Blocks only count towards refresh if they were last coded above this q
+ // index; for non-screen content the threshold is 0.
+ int qindex_thresh =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN
+ ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
+ : 0;
+ assert(mi_row >= 0 && mi_row < cm->mi_rows);
+ assert(mi_col >= 0 && mi_col < cm->mi_cols);
+ bl_index = mi_row * cm->mi_cols + mi_col;
+ // Loop through all MI blocks in superblock and update map.
+ xmis = AOMMIN(cm->mi_cols - mi_col, cm->seq_params.mib_size);
+ ymis = AOMMIN(cm->mi_rows - mi_row, cm->seq_params.mib_size);
+ for (y = 0; y < ymis; y++) {
+ for (x = 0; x < xmis; x++) {
+ const int bl_index2 = bl_index + y * cm->mi_cols + x;
+ // If the block is a candidate for clean up then mark it
+ // for possible boost/refresh (segment 1). The segment id may get
+ // reset to 0 later if block gets coded anything other than GLOBALMV.
+ if (cr->map[bl_index2] == 0) {
+ if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
+ } else if (cr->map[bl_index2] < 0) {
+ // Recently refreshed block: count up towards 0 (candidate again).
+ cr->map[bl_index2]++;
+ }
+ }
+ }
+ // Enforce constant segment over superblock.
+ // If segment is at least half of superblock, set to 1.
+ if (sum_map >= xmis * ymis / 2) {
+ for (y = 0; y < ymis; y++)
+ for (x = 0; x < xmis; x++) {
+ seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
+ }
+ cr->target_num_seg_blocks += xmis * ymis;
+ }
+ i++;
+ if (i == sbs_in_frame) {
+ i = 0;
+ }
+ } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
+ // Remember where to resume on the next frame.
+ cr->sb_index = i;
+}
+
+// Set cyclic refresh parameters: refresh percentage, maximum q delta, rate
+// ratio, motion threshold and boost factor.
+void av1_cyclic_refresh_update_parameters(AV1_COMP *const cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1_COMMON *const cm = &cpi->common;
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  // Low resolution (at most 352x288) combined with a low bitrate.
+  const int low_res_low_rate = cm->width <= 352 && cm->height <= 288 &&
+                               rc->avg_frame_bandwidth < 3400;
+
+  cr->percent_refresh = 10;
+  cr->max_qdelta_perc = 50;
+  cr->time_for_refresh = 0;
+
+  // Use larger delta-qp (increase rate_ratio_qdelta) for first few (~4)
+  // periods of the refresh cycle, after a key frame.
+  cr->rate_ratio_qdelta =
+      (rc->frames_since_key < 4 * cr->percent_refresh) ? 3.0 : 2.0;
+
+  // Adjust some parameters for low resolutions at low bitrates.
+  cr->motion_thresh = low_res_low_rate ? 4 : 32;
+  cr->rate_boost_fac = low_res_low_rate ? 10 : 17;
+}
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+//
+// Segmentation is disabled and the segmentation map zeroed on a resolution
+// change, on key frames, and when apply_cyclic_refresh_bitrate() turns the
+// feature off. Otherwise the rate/distortion thresholds, the BOOST1/BOOST2 q
+// deltas and the rd multiplier are set and the refresh map is advanced.
+void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+ struct segmentation *const seg = &cm->seg;
+ const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
+ int resolution_change =
+ cm->prev_frame && (cm->width != cm->prev_frame->width ||
+ cm->height != cm->prev_frame->height);
+ if (resolution_change) {
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_clearall_segfeatures(seg);
+ aom_clear_system_state();
+ av1_disable_segmentation(seg);
+ return;
+ }
+ if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
+ // Don't apply refresh on key frames, or when the bitrate/frame-size check
+ // above has turned it off.
+ if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
+ // Set segmentation map to 0 and disable.
+ unsigned char *const seg_map = cpi->segmentation_map;
+ memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
+ av1_disable_segmentation(&cm->seg);
+ if (cm->frame_type == KEY_FRAME) {
+ // A key frame restarts the cycle: forget the last coded q values and
+ // start again from the first superblock.
+ memset(cr->last_coded_q_map, MAXQ,
+ cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
+ cr->sb_index = 0;
+ }
+ return;
+ } else {
+ int qindex_delta = 0;
+ int qindex2;
+ const double q =
+ av1_convert_qindex_to_q(cm->base_qindex, cm->seq_params.bit_depth);
+ aom_clear_system_state();
+ // Set rate threshold to a multiple of the target rate: sb64_target_rate
+ // is scaled by 256 (<< 8) and then multiplied by 4 (<< 2).
+ cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
+ // Distortion threshold, quadratic in Q, scale factor to be adjusted.
+ // q will not exceed 457, so (q * q) is within 32bit; see:
+ // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
+ cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;
+
+ // Set up segmentation.
+ // Clear down the segment map.
+ av1_enable_segmentation(&cm->seg);
+ av1_clearall_segfeatures(seg);
+
+ // Note: setting temporal_update has no effect, as the seg-map coding method
+ // (temporal or spatial) is determined in
+ // av1_choose_segmap_coding_method(),
+ // based on the coding cost of each method. For error_resilient mode on the
+ // last_frame_seg_map is set to 0, so if temporal coding is used, it is
+ // relative to 0 previous map.
+ // seg->temporal_update = 0;
+
+ // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
+ av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
+ // Use segment BOOST1 for in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
+ // Use segment BOOST2 for more aggressive in-frame Q adjustment.
+ av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);
+
+ // Set the q delta for segment BOOST1.
+ qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
+ cr->qindex_delta[1] = qindex_delta;
+
+ // Compute rd-mult for segment BOOST1.
+ qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);
+
+ cr->rdmult = av1_compute_rd_mult(cpi, qindex2);
+
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Set the q delta for segment BOOST2 from a larger rate ratio
+ // (rate_boost_fac / 10 times the BOOST1 ratio, capped at
+ // CR_MAX_RATE_TARGET_RATIO).
+ qindex_delta = compute_deltaq(
+ cpi, cm->base_qindex,
+ AOMMIN(CR_MAX_RATE_TARGET_RATIO,
+ 0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
+ cr->qindex_delta[2] = qindex_delta;
+ av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);
+
+ // Update the segmentation and refresh map.
+ cyclic_refresh_update_map(cpi);
+ }
+}
+
+// Returns the rd multiplier stored in cr (set by av1_cyclic_refresh_setup()
+// for segment BOOST1).
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr) {
+  const int segment_rdmult = cr->rdmult;
+  return segment_rdmult;
+}
+
+// Resets the cyclic refresh state after a resize: the refresh map is zeroed,
+// the cycle restarts at the first superblock, and a golden frame refresh is
+// requested.
+void av1_cyclic_refresh_reset_resize(AV1_COMP *const cpi) {
+  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_mi_units = cm->mi_rows * cm->mi_cols;
+
+  cpi->refresh_golden_frame = 1;
+  cr->sb_index = 0;
+  memset(cr->map, 0, num_mi_units);
+}
diff --git a/third_party/aom/av1/encoder/aq_cyclicrefresh.h b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
new file mode 100644
index 0000000000..b45781983d
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_cyclicrefresh.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+#define AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The segment ids used in cyclic refresh: from base (no boost) to increasing
+// boost (higher delta-qp).
+#define CR_SEGMENT_ID_BASE 0
+#define CR_SEGMENT_ID_BOOST1 1
+#define CR_SEGMENT_ID_BOOST2 2
+
+// Maximum rate target ratio for setting segment delta-qp.
+#define CR_MAX_RATE_TARGET_RATIO 4.0
+
+struct AV1_COMP;
+
+struct CYCLIC_REFRESH;
+typedef struct CYCLIC_REFRESH CYCLIC_REFRESH;
+
+CYCLIC_REFRESH *av1_cyclic_refresh_alloc(int mi_rows, int mi_cols);
+
+void av1_cyclic_refresh_free(CYCLIC_REFRESH *cr);
+
+// Estimate the bits, incorporating the delta-q from segment 1, after encoding
+// the frame.
+int av1_cyclic_refresh_estimate_bits_at_q(const struct AV1_COMP *cpi,
+ double correction_factor);
+
+// Estimate the bits per mb, for a given q = i and a corresponding delta-q
+// (for segment 1), prior to encoding the frame.
+int av1_cyclic_refresh_rc_bits_per_mb(const struct AV1_COMP *cpi, int i,
+ double correction_factor);
+
+// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
+// check if we should reset the segment_id, and update the cyclic_refresh map
+// and segmentation map.
+void av1_cyclic_refresh_update_segment(const struct AV1_COMP *cpi,
+ MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, BLOCK_SIZE bsize,
+ int64_t rate, int64_t dist, int skip);
+
+// Update the segmentation map, and related quantities: cyclic refresh map,
+// refresh sb_index, and target number of blocks to be refreshed.
+void av1_cyclic_refresh_update__map(struct AV1_COMP *const cpi);
+
+// Update the actual number of blocks to which the segment delta q was applied.
+void av1_cyclic_refresh_postencode(struct AV1_COMP *const cpi);
+
+// Set golden frame update interval, for 1 pass CBR mode.
+void av1_cyclic_refresh_set_golden_update(struct AV1_COMP *const cpi);
+
+// Check if we should not update golden reference, based on past refresh stats.
+void av1_cyclic_refresh_check_golden_update(struct AV1_COMP *const cpi);
+
+// Set/update global/frame level refresh parameters.
+void av1_cyclic_refresh_update_parameters(struct AV1_COMP *const cpi);
+
+// Setup cyclic background refresh: set delta q and segmentation map.
+void av1_cyclic_refresh_setup(struct AV1_COMP *const cpi);
+
+int av1_cyclic_refresh_get_rdmult(const CYCLIC_REFRESH *cr);
+
+void av1_cyclic_refresh_reset_resize(struct AV1_COMP *const cpi);
+
+// Returns 1 when segment_id is one of the two boosted cyclic refresh segments
+// (CR_SEGMENT_ID_BOOST1 or CR_SEGMENT_ID_BOOST2) and 0 otherwise.
+static INLINE int cyclic_refresh_segment_id_boosted(int segment_id) {
+  if (segment_id == CR_SEGMENT_ID_BOOST1) return 1;
+  return segment_id == CR_SEGMENT_ID_BOOST2;
+}
+
+// Normalises segment_id to a cyclic refresh segment id: the two boost ids are
+// returned unchanged and every other value maps to CR_SEGMENT_ID_BASE.
+static INLINE int cyclic_refresh_segment_id(int segment_id) {
+  switch (segment_id) {
+    case CR_SEGMENT_ID_BOOST1: return CR_SEGMENT_ID_BOOST1;
+    case CR_SEGMENT_ID_BOOST2: return CR_SEGMENT_ID_BOOST2;
+    default: return CR_SEGMENT_ID_BASE;
+  }
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_CYCLICREFRESH_H_
diff --git a/third_party/aom/av1/encoder/aq_variance.c b/third_party/aom/av1/encoder/aq_variance.c
new file mode 100644
index 0000000000..58f906bdc0
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/common/seg_common.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/dwt.h"
+#include "aom_ports/system_state.h"
+
+// Per-segment rate ratios used by av1_vaq_frame_setup(). Each entry is divided
+// by the entry selected from the frame's average energy before it is turned
+// into a q-index delta.
+static const double rate_ratio[MAX_SEGMENTS] = { 2.2, 1.7, 1.3, 1.0,
+ 0.9, .8, .7, .6 };
+
+// Rate ratios indexed by rate level in av1_compute_deltaq_from_energy_level().
+static const double deltaq_rate_ratio[MAX_SEGMENTS] = { 2.5, 2.0, 1.5, 1.0,
+ 0.75, 1.0, 1.0, 1.0 };
+// Inclusive range of the energy levels returned by
+// av1_block_wavelet_energy_level(), and the number of distinct levels.
+#define ENERGY_MIN (-4)
+#define ENERGY_MAX (1)
+#define ENERGY_SPAN (ENERGY_MAX - ENERGY_MIN + 1)
+// Asserts that `energy` lies inside [ENERGY_MIN, ENERGY_MAX].
+#define ENERGY_IN_BOUNDS(energy) \
+ assert((energy) >= ENERGY_MIN && (energy) <= ENERGY_MAX)
+
+// All-zero reference passed (with stride 0) to the 4x4 variance function in
+// av1_log_block_var() on the non-high-bitdepth path.
+DECLARE_ALIGNED(16, static const uint8_t, av1_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+// 16-bit counterpart of av1_all_zeros, used on the high-bitdepth path.
+DECLARE_ALIGNED(16, static const uint16_t,
+ av1_highbd_all_zeros[MAX_SB_SIZE]) = { 0 };
+
+// Lookup from (energy level - ENERGY_MIN) to the rate level used to index
+// deltaq_rate_ratio[]; access it through SEGMENT_ID().
+static const int segment_id[ENERGY_SPAN] = { 0, 1, 1, 2, 3, 4 };
+
+// Maps an energy level in [ENERGY_MIN, ENERGY_MAX] to its segment_id[] entry.
+#define SEGMENT_ID(i) segment_id[(i)-ENERGY_MIN]
+
+// Frame-level segmentation setup for the variance AQ mode.
+//
+// When the frame size differs from the previous frame, the segmentation map is
+// zeroed, all segment features are cleared and segmentation is disabled.
+// Otherwise, on intra-only frames, in error-resilient mode, on alt-ref
+// refreshes, or on golden refreshes that are not a source alt-ref frame,
+// segmentation is enabled and every segment gets an ALT_Q delta computed from
+// rate_ratio[] normalised by the ratio picked from the frame's average energy.
+// Any other frame is left untouched.
+void av1_vaq_frame_setup(AV1_COMP *cpi) {
+  AV1_COMMON *const common = &cpi->common;
+  struct segmentation *const seg = &common->seg;
+
+  const int size_changed =
+      common->prev_frame && (common->width != common->prev_frame->width ||
+                             common->height != common->prev_frame->height);
+
+  // Average energy shifted down by 2 and limited to a valid rate_ratio[] index.
+  const int avg_energy = clamp((int)(cpi->twopass.mb_av_energy - 2), 0, 7);
+  const double avg_ratio = rate_ratio[avg_energy];
+
+  if (size_changed) {
+    memset(cpi->segmentation_map, 0, common->mi_rows * common->mi_cols);
+    av1_clearall_segfeatures(seg);
+    aom_clear_system_state();
+    av1_disable_segmentation(seg);
+    return;
+  }
+
+  const int refresh_segments =
+      frame_is_intra_only(common) || common->error_resilient_mode ||
+      cpi->refresh_alt_ref_frame ||
+      (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
+  if (!refresh_segments) return;
+
+  cpi->vaq_refresh = 1;
+
+  av1_enable_segmentation(seg);
+  av1_clearall_segfeatures(seg);
+
+  aom_clear_system_state();
+
+  for (int seg_idx = 0; seg_idx < MAX_SEGMENTS; ++seg_idx) {
+    // The average segment ends up with ratio 1.0; the others are scaled
+    // around it.
+    int delta = av1_compute_qdelta_by_rate(
+        &cpi->rc, common->frame_type, common->base_qindex,
+        rate_ratio[seg_idx] / avg_ratio, common->seq_params.bit_depth);
+
+    // A segment must not land on qindex 0 unless the base qindex is 0.
+    // Qindex 0 (lossless) implies 4x4 encoding only, and in AQ mode a segment
+    // Q delta is sometimes applied without going back around the rd loop,
+    // which could yield an illegal combination of partition size and q.
+    const int lands_on_zero = (common->base_qindex + delta) == 0;
+    if (common->base_qindex != 0 && lands_on_zero) {
+      delta = 1 - common->base_qindex;
+    }
+
+    av1_set_segdata(seg, seg_idx, SEG_LVL_ALT_Q, delta);
+    av1_enable_segfeature(seg, seg_idx, SEG_LVL_ALT_Q);
+  }
+}
+
+// Returns a local-variance score for the block (x, bs): the average, over the
+// 4x4 sub-blocks inside the block, of log(1 + var4x4 / 16), capped at 7 and
+// truncated to int.
+//
+// Scoring per 4x4 sub-block avoids marking a large block with a gentle
+// gradient as high variance when each of its sub-blocks has low variance, so
+// the same kind of area gets the same score regardless of how the
+// partitioning goes.
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  double log_var_sum = 0;
+  unsigned int sse;
+
+  // A negative mb_to_*_edge means the block extends past that edge; the
+  // overhang is removed from the scanned area. (>> 3 presumably converts
+  // 1/8-pel units to pixels -- confirm against the MACROBLOCKD definition.)
+  const int cols_past_edge =
+      (xd->mb_to_right_edge < 0) ? ((-xd->mb_to_right_edge) >> 3) : 0;
+  const int rows_past_edge =
+      (xd->mb_to_bottom_edge < 0) ? ((-xd->mb_to_bottom_edge) >> 3) : 0;
+
+  const int bw = MI_SIZE * mi_size_wide[bs] - cols_past_edge;
+  const int bh = MI_SIZE * mi_size_high[bs] - rows_past_edge;
+
+  const uint8_t *const src = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+
+  // The variance function is run against an all-zero reference of the
+  // matching sample width.
+  const uint8_t *const zero_ref =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+          ? CONVERT_TO_BYTEPTR(av1_highbd_all_zeros)
+          : av1_all_zeros;
+
+  aom_clear_system_state();
+
+  for (int row = 0; row < bh; row += 4) {
+    for (int col = 0; col < bw; col += 4) {
+      const unsigned int var4x4 = cpi->fn_ptr[BLOCK_4X4].vf(
+          src + row * src_stride + col, src_stride, zero_ref, 0, &sse);
+      log_var_sum += log(1.0 + var4x4 / 16);
+    }
+  }
+
+  // Use average of 4x4 log variance. The range for 8 bit 0 - 9.704121561.
+  log_var_sum /= (bw / 4 * bh / 4);
+  if (log_var_sum > 7) log_var_sum = 7;
+
+  aom_clear_system_state();
+  return (int)(log_var_sum);
+}
+
+#define DEFAULT_E_MIDPOINT 10.0
+
+// Accumulates av1_haar_ac_sad_8x8_uint8_input() over every 8x8 tile of the
+// plane-0 source block of size `bs`, then returns the sum scaled by 256 and
+// shifted right by num_pels_log2_lookup[bs] (i.e. normalised by the block's
+// pixel count).
+unsigned int haar_ac_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  int stride = x->plane[0].src.stride;
+  uint8_t *buf = x->plane[0].src.buf;
+  const int bw = MI_SIZE * mi_size_wide[bs];
+  const int bh = MI_SIZE * mi_size_high[bs];
+  int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+
+  int var = 0;
+  for (int r = 0; r < bh; r += 8)
+    for (int c = 0; c < bw; c += 8) {
+      var += av1_haar_ac_sad_8x8_uint8_input(buf + c + r * stride, stride, hbd);
+    }
+
+  // Shift in 64 bits and narrow afterwards. A cast binds tighter than >>, so
+  // without the extra parentheses the 64-bit product would be truncated to
+  // unsigned int before the shift and the uint64_t widening would be useless.
+  return (unsigned int)(((uint64_t)var * 256) >> num_pels_log2_lookup[bs]);
+}
+
+// Returns log(1 + haar_ac_energy(x, bs)). The floating-point work is done only
+// after aom_clear_system_state() has been called.
+double av1_log_block_wavelet_energy(MACROBLOCK *x, BLOCK_SIZE bs) {
+  const unsigned int haar_energy = haar_ac_energy(x, bs);
+  aom_clear_system_state();
+  const double shifted_energy = haar_energy + 1.0;
+  return log(shifted_energy);
+}
+
+// Returns the block's log wavelet energy relative to a midpoint, rounded to
+// the nearest integer and clamped to [ENERGY_MIN, ENERGY_MAX]. The midpoint is
+// the two-pass frame average Haar energy when cpi->oxcf.pass == 2, and
+// DEFAULT_E_MIDPOINT otherwise.
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   BLOCK_SIZE bs) {
+  aom_clear_system_state();
+
+  double midpoint = DEFAULT_E_MIDPOINT;
+  if (cpi->oxcf.pass == 2) midpoint = cpi->twopass.frame_avg_haar_energy;
+
+  const double centered = av1_log_block_wavelet_energy(x, bs) - midpoint;
+  return clamp((int)round(centered), ENERGY_MIN, ENERGY_MAX);
+}
+
+// Converts a block energy/variance level into a q-index delta relative to the
+// frame's base_qindex.
+//
+// With DELTAQ_MODULATION == 1 the level is asserted to lie in
+// [ENERGY_MIN, ENERGY_MAX] and mapped through SEGMENT_ID(); otherwise it is
+// used directly as the index into deltaq_rate_ratio[]. The returned delta
+// never takes a non-zero base qindex down to exactly 0.
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+                                         int block_var_level) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  int rate_level = block_var_level;
+  if (DELTAQ_MODULATION == 1) {
+    ENERGY_IN_BOUNDS(block_var_level);
+    rate_level = SEGMENT_ID(block_var_level);
+  }
+
+  const int base_q = cm->base_qindex;
+  int delta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, base_q,
+                                         deltaq_rate_ratio[rate_level],
+                                         cm->seq_params.bit_depth);
+
+  // Keep a non-zero base qindex from landing on qindex 0.
+  if (base_q != 0 && (base_q + delta) == 0) {
+    delta = 1 - base_q;
+  }
+  return delta;
+}
diff --git a/third_party/aom/av1/encoder/aq_variance.h b/third_party/aom/av1/encoder/aq_variance.h
new file mode 100644
index 0000000000..2d22b663e5
--- /dev/null
+++ b/third_party/aom/av1/encoder/aq_variance.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AQ_VARIANCE_H_
+#define AOM_AV1_ENCODER_AQ_VARIANCE_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_vaq_frame_setup(AV1_COMP *cpi);
+
+int av1_log_block_var(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bs);
+int av1_compute_deltaq_from_energy_level(const AV1_COMP *const cpi,
+ int block_var_level);
+int av1_block_wavelet_energy_level(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bs);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AQ_VARIANCE_H_
diff --git a/third_party/aom/av1/encoder/arm/neon/quantize_neon.c b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
new file mode 100644
index 0000000000..36e7d33702
--- /dev/null
+++ b/third_party/aom/av1/encoder/arm/neon/quantize_neon.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+// NEON quantizer that handles eight 16-bit coefficients per step.
+//
+// Writes quantized values to qcoeff_ptr, dequantized values to dqcoeff_ptr and
+// the end-of-block position to *eob_ptr. Entry 0 of round_ptr / quant_ptr /
+// dequant_ptr is applied to the first coefficient only; entry 1 is applied to
+// every other coefficient. zbin_ptr, quant_shift_ptr and scan are unused.
+// When skip_block is non-zero both output buffers are zero-filled and
+// *eob_ptr is set to 0.
+//
+// NOTE(review): the first eight coefficients are always processed and the
+// loop advances by eight, so `count` is presumably a multiple of 8 and at
+// least 8 -- confirm against callers.
+void av1_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+
+ if (!skip_block) {
+ // Quantization pass. Per lane: q = ((round + |coeff|) * quant) >> 16, with
+ // the sign of coeff restored afterwards; dq = q * dequant (low 16 bits).
+ int i;
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ // Running per-lane maximum of (iscan + 1) over non-zero quantized values.
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc: lane 0 of the first vector uses the [0] parameters
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // process dc and the first seven ac coeffs
+ {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[0]);
+ // Arithmetic shift by 15: all ones for negative lanes, zero otherwise.
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ // round + |coeff - 0|
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ // Keep the upper 16 bits of each 32-bit product.
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ // Despite its name, this mask is set in lanes whose quantized value is 0.
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ // 0 where the quantized value is zero, iscan + 1 elsewhere.
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ // (x ^ sign) - sign negates the lanes whose coefficient was negative.
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[0], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[0], v_dqcoeff);
+ // From here on every lane uses the [1] parameters.
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ // now process the rest of the ac coeffs (same steps as the first vector)
+ for (i = 8; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ }
+ // Horizontal max: fold 8 lanes to 4, then 4 to 2, then 2 to 1; lane 0 of
+ // the final vector holds the largest (iscan + 1), or 0 if every quantized
+ // value was zero.
+ {
+ const int16x4_t v_eobmax_3210 = vmax_s16(
+ vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {
+ // Skipped block: emit all-zero coefficients and an empty block.
+ memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.c b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
new file mode 100644
index 0000000000..98505e0b1a
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.c
@@ -0,0 +1,1885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/common/av1_txfm.h"
+
+// 4-point 1-D forward transform (a DCT, going by the function name),
+// computed as a staged butterfly network.
+//
+// input:       4 source values (read only).
+// output:      4 results; also used as a scratch buffer between stages,
+//              alternating with the local step[] array.
+// cos_bit:     selects the cosine table via cospi_arr() and is passed on to
+//              half_btf().
+// stage_range: one entry per stage, handed to av1_range_check_buf().
+//
+// NOTE(review): stage 1 writes output[] while input[] is still being read, so
+// calling this with input == output would corrupt the result.
+// NOTE(review): half_btf(w0, a, w1, b, bit) presumably returns the rounded
+// value of (w0 * a + w1 * b) >> bit -- confirm in av1_txfm.h.
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 4;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[4];
+
+ // stage 0: range-check the input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums of mirrored input pairs in [0..1], differences in [2..3]
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[3];
+ bf1[1] = input[1] + input[2];
+ bf1[2] = -input[2] + input[1];
+ bf1[3] = -input[3] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: weighted pair combinations via half_btf, output -> step
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: reorder step into the final output positions
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[2];
+ bf1[2] = bf0[1];
+ bf1[3] = bf0[3];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+// 8-point 1-D forward transform (a DCT, going by the function name),
+// computed as a staged butterfly network.
+//
+// input:       8 source values (read only).
+// output:      8 results; also used as a scratch buffer between stages,
+//              alternating with the local step[] array.
+// cos_bit:     selects the cosine table via cospi_arr() and is passed on to
+//              half_btf().
+// stage_range: one entry per stage, handed to av1_range_check_buf().
+//
+// NOTE(review): stage 1 writes output[] while input[] is still being read, so
+// calling this with input == output would corrupt the result.
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[8];
+
+ // stage 0: range-check the input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums of mirrored input pairs in [0..3], differences in [4..7]
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[7];
+ bf1[1] = input[1] + input[6];
+ bf1[2] = input[2] + input[5];
+ bf1[3] = input[3] + input[4];
+ bf1[4] = -input[4] + input[3];
+ bf1[5] = -input[5] + input[2];
+ bf1[6] = -input[6] + input[1];
+ bf1[7] = -input[7] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: add/sub butterfly on [0..3]; 4 and 7 pass through; 5 and 6 are
+ // combined with cospi[32] weights
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: weighted combinations on [0..3]; add/sub butterfly on [4..7]
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: [0..3] pass through; weighted combinations on [4..7]
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: reorder step into the final output positions
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[4];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[6];
+ bf1[4] = bf0[1];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[3];
+ bf1[7] = bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+// 16-point 1-D forward transform (a DCT, going by the function name),
+// computed as a staged butterfly network.
+//
+// input:       16 source values (read only).
+// output:      16 results; also used as a scratch buffer between stages,
+//              alternating with the local step[] array.
+// cos_bit:     selects the cosine table via cospi_arr() and is passed on to
+//              half_btf().
+// stage_range: one entry per stage, handed to av1_range_check_buf().
+//
+// NOTE(review): stage 1 writes output[] while input[] is still being read, so
+// calling this with input == output would corrupt the result.
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[16];
+
+ // stage 0: range-check the input
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: sums of mirrored input pairs in [0..7], differences in [8..15]
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[15];
+ bf1[1] = input[1] + input[14];
+ bf1[2] = input[2] + input[13];
+ bf1[3] = input[3] + input[12];
+ bf1[4] = input[4] + input[11];
+ bf1[5] = input[5] + input[10];
+ bf1[6] = input[6] + input[9];
+ bf1[7] = input[7] + input[8];
+ bf1[8] = -input[8] + input[7];
+ bf1[9] = -input[9] + input[6];
+ bf1[10] = -input[10] + input[5];
+ bf1[11] = -input[11] + input[4];
+ bf1[12] = -input[12] + input[3];
+ bf1[13] = -input[13] + input[2];
+ bf1[14] = -input[14] + input[1];
+ bf1[15] = -input[15] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: add/sub butterfly on [0..7]; 8, 9, 14, 15 pass through;
+ // [10..13] are combined with cospi[32] weights
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: add/sub butterfly on [0..3] and on [8..15]; 4 and 7 pass
+ // through; 5 and 6 are combined with cospi[32] weights
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: weighted combinations on [0..3]; add/sub butterfly on [4..7];
+ // 9, 10, 13, 14 are combined with cospi[16]/cospi[48] weights; 8, 11, 12,
+ // 15 pass through
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: [0..3] pass through; weighted combinations on [4..7]; add/sub
+ // butterfly on [8..15]
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: [0..7] pass through; weighted combinations on [8..15]
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: reorder step into the final output positions
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[8];
+ bf1[2] = bf0[4];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[2];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[14];
+ bf1[8] = bf0[1];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[5];
+ bf1[11] = bf0[13];
+ bf1[12] = bf0[3];
+ bf1[13] = bf0[11];
+ bf1[14] = bf0[7];
+ bf1[15] = bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 32;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[32];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[31];
+ bf1[1] = input[1] + input[30];
+ bf1[2] = input[2] + input[29];
+ bf1[3] = input[3] + input[28];
+ bf1[4] = input[4] + input[27];
+ bf1[5] = input[5] + input[26];
+ bf1[6] = input[6] + input[25];
+ bf1[7] = input[7] + input[24];
+ bf1[8] = input[8] + input[23];
+ bf1[9] = input[9] + input[22];
+ bf1[10] = input[10] + input[21];
+ bf1[11] = input[11] + input[20];
+ bf1[12] = input[12] + input[19];
+ bf1[13] = input[13] + input[18];
+ bf1[14] = input[14] + input[17];
+ bf1[15] = input[15] + input[16];
+ bf1[16] = -input[16] + input[15];
+ bf1[17] = -input[17] + input[14];
+ bf1[18] = -input[18] + input[13];
+ bf1[19] = -input[19] + input[12];
+ bf1[20] = -input[20] + input[11];
+ bf1[21] = -input[21] + input[10];
+ bf1[22] = -input[22] + input[9];
+ bf1[23] = -input[23] + input[8];
+ bf1[24] = -input[24] + input[7];
+ bf1[25] = -input[25] + input[6];
+ bf1[26] = -input[26] + input[5];
+ bf1[27] = -input[27] + input[4];
+ bf1[28] = -input[28] + input[3];
+ bf1[29] = -input[29] + input[2];
+ bf1[30] = -input[30] + input[1];
+ bf1[31] = -input[31] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[16];
+ bf1[2] = bf0[8];
+ bf1[3] = bf0[24];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[20];
+ bf1[6] = bf0[12];
+ bf1[7] = bf0[28];
+ bf1[8] = bf0[2];
+ bf1[9] = bf0[18];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[26];
+ bf1[12] = bf0[6];
+ bf1[13] = bf0[22];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[30];
+ bf1[16] = bf0[1];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[9];
+ bf1[19] = bf0[25];
+ bf1[20] = bf0[5];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[13];
+ bf1[23] = bf0[29];
+ bf1[24] = bf0[3];
+ bf1[25] = bf0[19];
+ bf1[26] = bf0[11];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[7];
+ bf1[29] = bf0[23];
+ bf1[30] = bf0[15];
+ bf1[31] = bf0[31];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+// 4-point forward ADST (going by the function name), evaluated as a fixed
+// sequence of sinpi-table multiplies, adds and subtracts.
+//
+// input       - 4 source values, copied into locals before any output write.
+// output      - 4 results, each produced by round_shift(value, cos_bit).
+// cos_bit     - selects the table returned by sinpi_arr() and is the shift
+//               amount handed to round_shift().
+// stage_range - per-stage limits, indices 0..6. Values that carry a sinpi
+//               factor are checked against cos_bit + stage_range[stage].
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+                    const int8_t *stage_range) {
+  const int shift = cos_bit;
+  const int32_t *const sin_tab = sinpi_arr(shift);
+
+  // stage 0: check the raw input, then work on local copies only.
+  av1_range_check_buf(0, input, input, 4, stage_range[0]);
+  const int32_t in0 = input[0];
+  const int32_t in1 = input[1];
+  const int32_t in2 = input[2];
+  const int32_t in3 = input[3];
+
+  // An all-zero input produces an all-zero output; skip the arithmetic.
+  if ((in0 | in1 | in2 | in3) == 0) {
+    for (int k = 0; k < 4; ++k) {
+      output[k] = 0;
+    }
+    return;
+  }
+
+  // stage 1: every table product that is needed, plus the in0 + in1 partial.
+  const int lim1 = shift + stage_range[1];
+  const int32_t p1_in0 = range_check_value(sin_tab[1] * in0, lim1);
+  const int32_t p4_in0 = range_check_value(sin_tab[4] * in0, lim1);
+  const int32_t p2_in1 = range_check_value(sin_tab[2] * in1, lim1);
+  const int32_t p1_in1 = range_check_value(sin_tab[1] * in1, lim1);
+  const int32_t p3_in2 = range_check_value(sin_tab[3] * in2, lim1);
+  const int32_t p4_in3 = range_check_value(sin_tab[4] * in3, lim1);
+  const int32_t p2_in3 = range_check_value(sin_tab[2] * in3, lim1);
+  const int32_t sum01 = range_check_value(in0 + in1, stage_range[1]);
+
+  // stage 2: finish the unscaled term in0 + in1 - in3.
+  const int32_t mix = range_check_value(sum01 - in3, stage_range[2]);
+
+  // stage 3
+  const int lim3 = shift + stage_range[3];
+  const int32_t t0 = range_check_value(p1_in0 + p2_in1, lim3);
+  const int32_t t1 = range_check_value(sin_tab[3] * mix, lim3);
+  const int32_t t2 = range_check_value(p4_in0 - p1_in1, lim3);
+  const int32_t t3 = range_check_value(p3_in2, lim3);
+
+  // stage 4
+  const int lim4 = shift + stage_range[4];
+  const int32_t u0 = range_check_value(t0 + p4_in3, lim4);
+  const int32_t u2 = range_check_value(t2 + p2_in3, lim4);
+
+  // stage 5
+  const int lim5 = shift + stage_range[5];
+  const int32_t r0 = range_check_value(u0 + t3, lim5);
+  const int32_t r1 = range_check_value(t1, lim5);
+  const int32_t r2 = range_check_value(u2 - t3, lim5);
+  const int32_t r3_partial = range_check_value(u2 - u0, lim5);
+
+  // stage 6
+  const int32_t r3 =
+      range_check_value(r3_partial + t3, shift + stage_range[6]);
+
+  // 1-D transform scaling factor is sqrt(2).
+  output[0] = round_shift(r0, shift);
+  output[1] = round_shift(r1, shift);
+  output[2] = round_shift(r2, shift);
+  output[3] = round_shift(r3, shift);
+  av1_range_check_buf(6, input, output, 4, stage_range[6]);
+}
+
+/* Forward 8-point ADST, going by the function name; implemented as a fixed
+ * 7-stage network that alternates between `output` and a local scratch array.
+ *
+ * input       - 8 source values (indices 0..7 are read in stage 1). Must not
+ *               be the same pointer as output (asserted below).
+ * output      - receives the 8 results; the odd-numbered stages also use it
+ *               as working storage.
+ * cos_bit     - forwarded to cospi_arr() and to every half_btf() call.
+ * stage_range - indexed by stage number 0..7 and forwarded to
+ *               av1_range_check_buf() after each stage.
+ *
+ * NOTE(review): half_btf(w0, a, w1, b, cos_bit) is defined outside this view;
+ * it presumably returns a rounded (w0 * a + w1 * b) >> cos_bit - confirm.
+ */ void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 8;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;  // bf0 = source of the current stage, bf1 = destination
+ int32_t step[8];  // scratch; stages ping-pong between step and output
+
+ // stage 0: hand the untouched input to the range check
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: reorder the inputs, negating some, into output
+ stage++;
+ assert(output != input);  // stage 1 writes output while still reading input
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[7];
+ bf1[2] = -input[3];
+ bf1[3] = input[4];
+ bf1[4] = -input[1];
+ bf1[5] = input[6];
+ bf1[6] = input[2];
+ bf1[7] = -input[5];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: copy 0,1,4,5; half_btf with cospi[32] on pairs (2,3) and (6,7)
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: sums and differences of elements 2 apart, per group of 4
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: copy 0..3; half_btf with cospi[16]/cospi[48] on (4,5) and (6,7)
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sums and differences of elements 4 apart
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: half_btf on every adjacent pair, a different weight pair each
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: permute step into output; this is the returned ordering
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[6];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[4];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[2];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+/* Forward 16-point ADST, going by the function name; implemented as a fixed
+ * 9-stage network that alternates between `output` and a local scratch array.
+ *
+ * input       - 16 source values (indices 0..15 are read in stage 1). Must
+ *               not be the same pointer as output (asserted below).
+ * output      - receives the 16 results; the odd-numbered stages also use it
+ *               as working storage.
+ * cos_bit     - forwarded to cospi_arr() and to every half_btf() call.
+ * stage_range - indexed by stage number 0..9 and forwarded to
+ *               av1_range_check_buf() after each stage.
+ *
+ * NOTE(review): half_btf(w0, a, w1, b, cos_bit) is defined outside this view;
+ * it presumably returns a rounded (w0 * a + w1 * b) >> cos_bit - confirm.
+ */ void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 16;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;  // bf0 = source of the current stage, bf1 = destination
+ int32_t step[16];  // scratch; stages ping-pong between step and output
+
+ // stage 0: hand the untouched input to the range check
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1: reorder the inputs, negating some, into output
+ stage++;
+ assert(output != input);  // stage 1 writes output while still reading input
+ bf1 = output;
+ bf1[0] = input[0];
+ bf1[1] = -input[15];
+ bf1[2] = -input[7];
+ bf1[3] = input[8];
+ bf1[4] = -input[3];
+ bf1[5] = input[12];
+ bf1[6] = input[4];
+ bf1[7] = -input[11];
+ bf1[8] = -input[1];
+ bf1[9] = input[14];
+ bf1[10] = input[6];
+ bf1[11] = -input[9];
+ bf1[12] = input[2];
+ bf1[13] = -input[13];
+ bf1[14] = -input[5];
+ bf1[15] = input[10];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2: per group of 4, copy the first pair; half_btf cospi[32] on the second
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit);
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit);
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3: sums and differences of elements 2 apart, per group of 4
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[2];
+ bf1[1] = bf0[1] + bf0[3];
+ bf1[2] = bf0[0] - bf0[2];
+ bf1[3] = bf0[1] - bf0[3];
+ bf1[4] = bf0[4] + bf0[6];
+ bf1[5] = bf0[5] + bf0[7];
+ bf1[6] = bf0[4] - bf0[6];
+ bf1[7] = bf0[5] - bf0[7];
+ bf1[8] = bf0[8] + bf0[10];
+ bf1[9] = bf0[9] + bf0[11];
+ bf1[10] = bf0[8] - bf0[10];
+ bf1[11] = bf0[9] - bf0[11];
+ bf1[12] = bf0[12] + bf0[14];
+ bf1[13] = bf0[13] + bf0[15];
+ bf1[14] = bf0[12] - bf0[14];
+ bf1[15] = bf0[13] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4: copy 0..3 and 8..11; half_btf cospi[16]/cospi[48] on 4..7, 12..15
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit);
+ bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit);
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5: sums and differences of elements 4 apart, per group of 8
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[4];
+ bf1[1] = bf0[1] + bf0[5];
+ bf1[2] = bf0[2] + bf0[6];
+ bf1[3] = bf0[3] + bf0[7];
+ bf1[4] = bf0[0] - bf0[4];
+ bf1[5] = bf0[1] - bf0[5];
+ bf1[6] = bf0[2] - bf0[6];
+ bf1[7] = bf0[3] - bf0[7];
+ bf1[8] = bf0[8] + bf0[12];
+ bf1[9] = bf0[9] + bf0[13];
+ bf1[10] = bf0[10] + bf0[14];
+ bf1[11] = bf0[11] + bf0[15];
+ bf1[12] = bf0[8] - bf0[12];
+ bf1[13] = bf0[9] - bf0[13];
+ bf1[14] = bf0[10] - bf0[14];
+ bf1[15] = bf0[11] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6: copy 0..7; half_btf cospi[8]/cospi[56], cospi[40]/cospi[24] on 8..15
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit);
+ bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit);
+ bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7: sums and differences of elements 8 apart
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[8];
+ bf1[1] = bf0[1] + bf0[9];
+ bf1[2] = bf0[2] + bf0[10];
+ bf1[3] = bf0[3] + bf0[11];
+ bf1[4] = bf0[4] + bf0[12];
+ bf1[5] = bf0[5] + bf0[13];
+ bf1[6] = bf0[6] + bf0[14];
+ bf1[7] = bf0[7] + bf0[15];
+ bf1[8] = bf0[0] - bf0[8];
+ bf1[9] = bf0[1] - bf0[9];
+ bf1[10] = bf0[2] - bf0[10];
+ bf1[11] = bf0[3] - bf0[11];
+ bf1[12] = bf0[4] - bf0[12];
+ bf1[13] = bf0[5] - bf0[13];
+ bf1[14] = bf0[6] - bf0[14];
+ bf1[15] = bf0[7] - bf0[15];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8: half_btf on every adjacent pair, a different weight pair each
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit);
+ bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit);
+ bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit);
+ bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit);
+ bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit);
+ bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit);
+ bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit);
+ bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit);
+ bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit);
+ bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit);
+ bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit);
+ bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit);
+ bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit);
+ bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit);
+ bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9: permute step into output; this is the returned ordering
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[1];
+ bf1[1] = bf0[14];
+ bf1[2] = bf0[3];
+ bf1[3] = bf0[12];
+ bf1[4] = bf0[5];
+ bf1[5] = bf0[10];
+ bf1[6] = bf0[7];
+ bf1[7] = bf0[8];
+ bf1[8] = bf0[9];
+ bf1[9] = bf0[6];
+ bf1[10] = bf0[11];
+ bf1[11] = bf0[4];
+ bf1[12] = bf0[13];
+ bf1[13] = bf0[2];
+ bf1[14] = bf0[15];
+ bf1[15] = bf0[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
+
+// 4-point identity transform: each output element is the matching input
+// element multiplied by NewSqrt2 and passed through
+// round_shift(..., NewSqrt2Bits). The product is formed in 64 bits.
+//
+// cos_bit is unused. Only stage_range[0] is consulted: once in the assert
+// and once for the range check on the finished output.
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  enum { kLen = 4 };
+  (void)cos_bit;
+
+  int idx = 0;
+  while (idx < kLen) {
+    const int64_t scaled = (int64_t)input[idx] * NewSqrt2;
+    output[idx] = round_shift(scaled, NewSqrt2Bits);
+    ++idx;
+  }
+
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
+}
+
+// 8-point identity transform: every output element is twice the matching
+// input element. cos_bit is unused; stage_range[0] is the limit handed to
+// the range check on the finished output.
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                      const int8_t *stage_range) {
+  enum { kLen = 8 };
+  (void)cos_bit;
+
+  int idx = 0;
+  while (idx < kLen) {
+    const int32_t sample = input[idx];
+    output[idx] = sample * 2;
+    ++idx;
+  }
+
+  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
+}
+
+// 16-point identity transform: each output element is the matching input
+// element multiplied by 2 and then by NewSqrt2, passed through
+// round_shift(..., NewSqrt2Bits). The product is formed in 64 bits.
+//
+// cos_bit is unused. Only stage_range[0] is consulted: once in the assert
+// and once for the range check on the finished output.
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  enum { kLen = 16 };
+  (void)cos_bit;
+
+  int idx = 0;
+  while (idx < kLen) {
+    const int64_t doubled = (int64_t)input[idx] * 2;
+    output[idx] = round_shift(doubled * NewSqrt2, NewSqrt2Bits);
+    ++idx;
+  }
+
+  assert(stage_range[0] + NewSqrt2Bits <= 32);
+  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
+}
+
+// 32-point identity transform: every output element is four times the
+// matching input element. cos_bit is unused; stage_range[0] is the limit
+// handed to the range check on the finished output.
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+                       const int8_t *stage_range) {
+  enum { kLen = 32 };
+  (void)cos_bit;
+
+  int idx = 0;
+  while (idx < kLen) {
+    const int32_t sample = input[idx];
+    output[idx] = sample * 4;
+    ++idx;
+  }
+
+  av1_range_check_buf(0, input, output, kLen, stage_range[0]);
+}
+
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range) {
+ const int32_t size = 64;
+ const int32_t *cospi;
+
+ int32_t stage = 0;
+ int32_t *bf0, *bf1;
+ int32_t step[64];
+
+ // stage 0;
+ av1_range_check_buf(stage, input, input, size, stage_range[stage]);
+
+ // stage 1;
+ stage++;
+ bf1 = output;
+ bf1[0] = input[0] + input[63];
+ bf1[1] = input[1] + input[62];
+ bf1[2] = input[2] + input[61];
+ bf1[3] = input[3] + input[60];
+ bf1[4] = input[4] + input[59];
+ bf1[5] = input[5] + input[58];
+ bf1[6] = input[6] + input[57];
+ bf1[7] = input[7] + input[56];
+ bf1[8] = input[8] + input[55];
+ bf1[9] = input[9] + input[54];
+ bf1[10] = input[10] + input[53];
+ bf1[11] = input[11] + input[52];
+ bf1[12] = input[12] + input[51];
+ bf1[13] = input[13] + input[50];
+ bf1[14] = input[14] + input[49];
+ bf1[15] = input[15] + input[48];
+ bf1[16] = input[16] + input[47];
+ bf1[17] = input[17] + input[46];
+ bf1[18] = input[18] + input[45];
+ bf1[19] = input[19] + input[44];
+ bf1[20] = input[20] + input[43];
+ bf1[21] = input[21] + input[42];
+ bf1[22] = input[22] + input[41];
+ bf1[23] = input[23] + input[40];
+ bf1[24] = input[24] + input[39];
+ bf1[25] = input[25] + input[38];
+ bf1[26] = input[26] + input[37];
+ bf1[27] = input[27] + input[36];
+ bf1[28] = input[28] + input[35];
+ bf1[29] = input[29] + input[34];
+ bf1[30] = input[30] + input[33];
+ bf1[31] = input[31] + input[32];
+ bf1[32] = -input[32] + input[31];
+ bf1[33] = -input[33] + input[30];
+ bf1[34] = -input[34] + input[29];
+ bf1[35] = -input[35] + input[28];
+ bf1[36] = -input[36] + input[27];
+ bf1[37] = -input[37] + input[26];
+ bf1[38] = -input[38] + input[25];
+ bf1[39] = -input[39] + input[24];
+ bf1[40] = -input[40] + input[23];
+ bf1[41] = -input[41] + input[22];
+ bf1[42] = -input[42] + input[21];
+ bf1[43] = -input[43] + input[20];
+ bf1[44] = -input[44] + input[19];
+ bf1[45] = -input[45] + input[18];
+ bf1[46] = -input[46] + input[17];
+ bf1[47] = -input[47] + input[16];
+ bf1[48] = -input[48] + input[15];
+ bf1[49] = -input[49] + input[14];
+ bf1[50] = -input[50] + input[13];
+ bf1[51] = -input[51] + input[12];
+ bf1[52] = -input[52] + input[11];
+ bf1[53] = -input[53] + input[10];
+ bf1[54] = -input[54] + input[9];
+ bf1[55] = -input[55] + input[8];
+ bf1[56] = -input[56] + input[7];
+ bf1[57] = -input[57] + input[6];
+ bf1[58] = -input[58] + input[5];
+ bf1[59] = -input[59] + input[4];
+ bf1[60] = -input[60] + input[3];
+ bf1[61] = -input[61] + input[2];
+ bf1[62] = -input[62] + input[1];
+ bf1[63] = -input[63] + input[0];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 2
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[31];
+ bf1[1] = bf0[1] + bf0[30];
+ bf1[2] = bf0[2] + bf0[29];
+ bf1[3] = bf0[3] + bf0[28];
+ bf1[4] = bf0[4] + bf0[27];
+ bf1[5] = bf0[5] + bf0[26];
+ bf1[6] = bf0[6] + bf0[25];
+ bf1[7] = bf0[7] + bf0[24];
+ bf1[8] = bf0[8] + bf0[23];
+ bf1[9] = bf0[9] + bf0[22];
+ bf1[10] = bf0[10] + bf0[21];
+ bf1[11] = bf0[11] + bf0[20];
+ bf1[12] = bf0[12] + bf0[19];
+ bf1[13] = bf0[13] + bf0[18];
+ bf1[14] = bf0[14] + bf0[17];
+ bf1[15] = bf0[15] + bf0[16];
+ bf1[16] = -bf0[16] + bf0[15];
+ bf1[17] = -bf0[17] + bf0[14];
+ bf1[18] = -bf0[18] + bf0[13];
+ bf1[19] = -bf0[19] + bf0[12];
+ bf1[20] = -bf0[20] + bf0[11];
+ bf1[21] = -bf0[21] + bf0[10];
+ bf1[22] = -bf0[22] + bf0[9];
+ bf1[23] = -bf0[23] + bf0[8];
+ bf1[24] = -bf0[24] + bf0[7];
+ bf1[25] = -bf0[25] + bf0[6];
+ bf1[26] = -bf0[26] + bf0[5];
+ bf1[27] = -bf0[27] + bf0[4];
+ bf1[28] = -bf0[28] + bf0[3];
+ bf1[29] = -bf0[29] + bf0[2];
+ bf1[30] = -bf0[30] + bf0[1];
+ bf1[31] = -bf0[31] + bf0[0];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = bf0[37];
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit);
+ bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit);
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = bf0[58];
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 3
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[15];
+ bf1[1] = bf0[1] + bf0[14];
+ bf1[2] = bf0[2] + bf0[13];
+ bf1[3] = bf0[3] + bf0[12];
+ bf1[4] = bf0[4] + bf0[11];
+ bf1[5] = bf0[5] + bf0[10];
+ bf1[6] = bf0[6] + bf0[9];
+ bf1[7] = bf0[7] + bf0[8];
+ bf1[8] = -bf0[8] + bf0[7];
+ bf1[9] = -bf0[9] + bf0[6];
+ bf1[10] = -bf0[10] + bf0[5];
+ bf1[11] = -bf0[11] + bf0[4];
+ bf1[12] = -bf0[12] + bf0[3];
+ bf1[13] = -bf0[13] + bf0[2];
+ bf1[14] = -bf0[14] + bf0[1];
+ bf1[15] = -bf0[15] + bf0[0];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit);
+ bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit);
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[47];
+ bf1[33] = bf0[33] + bf0[46];
+ bf1[34] = bf0[34] + bf0[45];
+ bf1[35] = bf0[35] + bf0[44];
+ bf1[36] = bf0[36] + bf0[43];
+ bf1[37] = bf0[37] + bf0[42];
+ bf1[38] = bf0[38] + bf0[41];
+ bf1[39] = bf0[39] + bf0[40];
+ bf1[40] = -bf0[40] + bf0[39];
+ bf1[41] = -bf0[41] + bf0[38];
+ bf1[42] = -bf0[42] + bf0[37];
+ bf1[43] = -bf0[43] + bf0[36];
+ bf1[44] = -bf0[44] + bf0[35];
+ bf1[45] = -bf0[45] + bf0[34];
+ bf1[46] = -bf0[46] + bf0[33];
+ bf1[47] = -bf0[47] + bf0[32];
+ bf1[48] = -bf0[48] + bf0[63];
+ bf1[49] = -bf0[49] + bf0[62];
+ bf1[50] = -bf0[50] + bf0[61];
+ bf1[51] = -bf0[51] + bf0[60];
+ bf1[52] = -bf0[52] + bf0[59];
+ bf1[53] = -bf0[53] + bf0[58];
+ bf1[54] = -bf0[54] + bf0[57];
+ bf1[55] = -bf0[55] + bf0[56];
+ bf1[56] = bf0[56] + bf0[55];
+ bf1[57] = bf0[57] + bf0[54];
+ bf1[58] = bf0[58] + bf0[53];
+ bf1[59] = bf0[59] + bf0[52];
+ bf1[60] = bf0[60] + bf0[51];
+ bf1[61] = bf0[61] + bf0[50];
+ bf1[62] = bf0[62] + bf0[49];
+ bf1[63] = bf0[63] + bf0[48];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 4
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0] + bf0[7];
+ bf1[1] = bf0[1] + bf0[6];
+ bf1[2] = bf0[2] + bf0[5];
+ bf1[3] = bf0[3] + bf0[4];
+ bf1[4] = -bf0[4] + bf0[3];
+ bf1[5] = -bf0[5] + bf0[2];
+ bf1[6] = -bf0[6] + bf0[1];
+ bf1[7] = -bf0[7] + bf0[0];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit);
+ bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit);
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[23];
+ bf1[17] = bf0[17] + bf0[22];
+ bf1[18] = bf0[18] + bf0[21];
+ bf1[19] = bf0[19] + bf0[20];
+ bf1[20] = -bf0[20] + bf0[19];
+ bf1[21] = -bf0[21] + bf0[18];
+ bf1[22] = -bf0[22] + bf0[17];
+ bf1[23] = -bf0[23] + bf0[16];
+ bf1[24] = -bf0[24] + bf0[31];
+ bf1[25] = -bf0[25] + bf0[30];
+ bf1[26] = -bf0[26] + bf0[29];
+ bf1[27] = -bf0[27] + bf0[28];
+ bf1[28] = bf0[28] + bf0[27];
+ bf1[29] = bf0[29] + bf0[26];
+ bf1[30] = bf0[30] + bf0[25];
+ bf1[31] = bf0[31] + bf0[24];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[34];
+ bf1[35] = bf0[35];
+ bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit);
+ bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit);
+ bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit);
+ bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit);
+ bf1[44] = bf0[44];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = bf0[50];
+ bf1[51] = bf0[51];
+ bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit);
+ bf1[60] = bf0[60];
+ bf1[61] = bf0[61];
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 5
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0] + bf0[3];
+ bf1[1] = bf0[1] + bf0[2];
+ bf1[2] = -bf0[2] + bf0[1];
+ bf1[3] = -bf0[3] + bf0[0];
+ bf1[4] = bf0[4];
+ bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit);
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8] + bf0[11];
+ bf1[9] = bf0[9] + bf0[10];
+ bf1[10] = -bf0[10] + bf0[9];
+ bf1[11] = -bf0[11] + bf0[8];
+ bf1[12] = -bf0[12] + bf0[15];
+ bf1[13] = -bf0[13] + bf0[14];
+ bf1[14] = bf0[14] + bf0[13];
+ bf1[15] = bf0[15] + bf0[12];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit);
+ bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit);
+ bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit);
+ bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit);
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit);
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[39];
+ bf1[33] = bf0[33] + bf0[38];
+ bf1[34] = bf0[34] + bf0[37];
+ bf1[35] = bf0[35] + bf0[36];
+ bf1[36] = -bf0[36] + bf0[35];
+ bf1[37] = -bf0[37] + bf0[34];
+ bf1[38] = -bf0[38] + bf0[33];
+ bf1[39] = -bf0[39] + bf0[32];
+ bf1[40] = -bf0[40] + bf0[47];
+ bf1[41] = -bf0[41] + bf0[46];
+ bf1[42] = -bf0[42] + bf0[45];
+ bf1[43] = -bf0[43] + bf0[44];
+ bf1[44] = bf0[44] + bf0[43];
+ bf1[45] = bf0[45] + bf0[42];
+ bf1[46] = bf0[46] + bf0[41];
+ bf1[47] = bf0[47] + bf0[40];
+ bf1[48] = bf0[48] + bf0[55];
+ bf1[49] = bf0[49] + bf0[54];
+ bf1[50] = bf0[50] + bf0[53];
+ bf1[51] = bf0[51] + bf0[52];
+ bf1[52] = -bf0[52] + bf0[51];
+ bf1[53] = -bf0[53] + bf0[50];
+ bf1[54] = -bf0[54] + bf0[49];
+ bf1[55] = -bf0[55] + bf0[48];
+ bf1[56] = -bf0[56] + bf0[63];
+ bf1[57] = -bf0[57] + bf0[62];
+ bf1[58] = -bf0[58] + bf0[61];
+ bf1[59] = -bf0[59] + bf0[60];
+ bf1[60] = bf0[60] + bf0[59];
+ bf1[61] = bf0[61] + bf0[58];
+ bf1[62] = bf0[62] + bf0[57];
+ bf1[63] = bf0[63] + bf0[56];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 6
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit);
+ bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit);
+ bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit);
+ bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit);
+ bf1[4] = bf0[4] + bf0[5];
+ bf1[5] = -bf0[5] + bf0[4];
+ bf1[6] = -bf0[6] + bf0[7];
+ bf1[7] = bf0[7] + bf0[6];
+ bf1[8] = bf0[8];
+ bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit);
+ bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit);
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit);
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16] + bf0[19];
+ bf1[17] = bf0[17] + bf0[18];
+ bf1[18] = -bf0[18] + bf0[17];
+ bf1[19] = -bf0[19] + bf0[16];
+ bf1[20] = -bf0[20] + bf0[23];
+ bf1[21] = -bf0[21] + bf0[22];
+ bf1[22] = bf0[22] + bf0[21];
+ bf1[23] = bf0[23] + bf0[20];
+ bf1[24] = bf0[24] + bf0[27];
+ bf1[25] = bf0[25] + bf0[26];
+ bf1[26] = -bf0[26] + bf0[25];
+ bf1[27] = -bf0[27] + bf0[24];
+ bf1[28] = -bf0[28] + bf0[31];
+ bf1[29] = -bf0[29] + bf0[30];
+ bf1[30] = bf0[30] + bf0[29];
+ bf1[31] = bf0[31] + bf0[28];
+ bf1[32] = bf0[32];
+ bf1[33] = bf0[33];
+ bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit);
+ bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit);
+ bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit);
+ bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit);
+ bf1[38] = bf0[38];
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = bf0[41];
+ bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit);
+ bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit);
+ bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit);
+ bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit);
+ bf1[46] = bf0[46];
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = bf0[49];
+ bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit);
+ bf1[54] = bf0[54];
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = bf0[57];
+ bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit);
+ bf1[62] = bf0[62];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 7
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit);
+ bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit);
+ bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit);
+ bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit);
+ bf1[8] = bf0[8] + bf0[9];
+ bf1[9] = -bf0[9] + bf0[8];
+ bf1[10] = -bf0[10] + bf0[11];
+ bf1[11] = bf0[11] + bf0[10];
+ bf1[12] = bf0[12] + bf0[13];
+ bf1[13] = -bf0[13] + bf0[12];
+ bf1[14] = -bf0[14] + bf0[15];
+ bf1[15] = bf0[15] + bf0[14];
+ bf1[16] = bf0[16];
+ bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit);
+ bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit);
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit);
+ bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit);
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit);
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit);
+ bf1[31] = bf0[31];
+ bf1[32] = bf0[32] + bf0[35];
+ bf1[33] = bf0[33] + bf0[34];
+ bf1[34] = -bf0[34] + bf0[33];
+ bf1[35] = -bf0[35] + bf0[32];
+ bf1[36] = -bf0[36] + bf0[39];
+ bf1[37] = -bf0[37] + bf0[38];
+ bf1[38] = bf0[38] + bf0[37];
+ bf1[39] = bf0[39] + bf0[36];
+ bf1[40] = bf0[40] + bf0[43];
+ bf1[41] = bf0[41] + bf0[42];
+ bf1[42] = -bf0[42] + bf0[41];
+ bf1[43] = -bf0[43] + bf0[40];
+ bf1[44] = -bf0[44] + bf0[47];
+ bf1[45] = -bf0[45] + bf0[46];
+ bf1[46] = bf0[46] + bf0[45];
+ bf1[47] = bf0[47] + bf0[44];
+ bf1[48] = bf0[48] + bf0[51];
+ bf1[49] = bf0[49] + bf0[50];
+ bf1[50] = -bf0[50] + bf0[49];
+ bf1[51] = -bf0[51] + bf0[48];
+ bf1[52] = -bf0[52] + bf0[55];
+ bf1[53] = -bf0[53] + bf0[54];
+ bf1[54] = bf0[54] + bf0[53];
+ bf1[55] = bf0[55] + bf0[52];
+ bf1[56] = bf0[56] + bf0[59];
+ bf1[57] = bf0[57] + bf0[58];
+ bf1[58] = -bf0[58] + bf0[57];
+ bf1[59] = -bf0[59] + bf0[56];
+ bf1[60] = -bf0[60] + bf0[63];
+ bf1[61] = -bf0[61] + bf0[62];
+ bf1[62] = bf0[62] + bf0[61];
+ bf1[63] = bf0[63] + bf0[60];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 8
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit);
+ bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit);
+ bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit);
+ bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit);
+ bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit);
+ bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit);
+ bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit);
+ bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit);
+ bf1[16] = bf0[16] + bf0[17];
+ bf1[17] = -bf0[17] + bf0[16];
+ bf1[18] = -bf0[18] + bf0[19];
+ bf1[19] = bf0[19] + bf0[18];
+ bf1[20] = bf0[20] + bf0[21];
+ bf1[21] = -bf0[21] + bf0[20];
+ bf1[22] = -bf0[22] + bf0[23];
+ bf1[23] = bf0[23] + bf0[22];
+ bf1[24] = bf0[24] + bf0[25];
+ bf1[25] = -bf0[25] + bf0[24];
+ bf1[26] = -bf0[26] + bf0[27];
+ bf1[27] = bf0[27] + bf0[26];
+ bf1[28] = bf0[28] + bf0[29];
+ bf1[29] = -bf0[29] + bf0[28];
+ bf1[30] = -bf0[30] + bf0[31];
+ bf1[31] = bf0[31] + bf0[30];
+ bf1[32] = bf0[32];
+ bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit);
+ bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit);
+ bf1[35] = bf0[35];
+ bf1[36] = bf0[36];
+ bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit);
+ bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit);
+ bf1[39] = bf0[39];
+ bf1[40] = bf0[40];
+ bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit);
+ bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit);
+ bf1[43] = bf0[43];
+ bf1[44] = bf0[44];
+ bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit);
+ bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit);
+ bf1[47] = bf0[47];
+ bf1[48] = bf0[48];
+ bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit);
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[52];
+ bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit);
+ bf1[55] = bf0[55];
+ bf1[56] = bf0[56];
+ bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit);
+ bf1[59] = bf0[59];
+ bf1[60] = bf0[60];
+ bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit);
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 9
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit);
+ bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit);
+ bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit);
+ bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit);
+ bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit);
+ bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit);
+ bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit);
+ bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit);
+ bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit);
+ bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit);
+ bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit);
+ bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit);
+ bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit);
+ bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit);
+ bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit);
+ bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit);
+ bf1[32] = bf0[32] + bf0[33];
+ bf1[33] = -bf0[33] + bf0[32];
+ bf1[34] = -bf0[34] + bf0[35];
+ bf1[35] = bf0[35] + bf0[34];
+ bf1[36] = bf0[36] + bf0[37];
+ bf1[37] = -bf0[37] + bf0[36];
+ bf1[38] = -bf0[38] + bf0[39];
+ bf1[39] = bf0[39] + bf0[38];
+ bf1[40] = bf0[40] + bf0[41];
+ bf1[41] = -bf0[41] + bf0[40];
+ bf1[42] = -bf0[42] + bf0[43];
+ bf1[43] = bf0[43] + bf0[42];
+ bf1[44] = bf0[44] + bf0[45];
+ bf1[45] = -bf0[45] + bf0[44];
+ bf1[46] = -bf0[46] + bf0[47];
+ bf1[47] = bf0[47] + bf0[46];
+ bf1[48] = bf0[48] + bf0[49];
+ bf1[49] = -bf0[49] + bf0[48];
+ bf1[50] = -bf0[50] + bf0[51];
+ bf1[51] = bf0[51] + bf0[50];
+ bf1[52] = bf0[52] + bf0[53];
+ bf1[53] = -bf0[53] + bf0[52];
+ bf1[54] = -bf0[54] + bf0[55];
+ bf1[55] = bf0[55] + bf0[54];
+ bf1[56] = bf0[56] + bf0[57];
+ bf1[57] = -bf0[57] + bf0[56];
+ bf1[58] = -bf0[58] + bf0[59];
+ bf1[59] = bf0[59] + bf0[58];
+ bf1[60] = bf0[60] + bf0[61];
+ bf1[61] = -bf0[61] + bf0[60];
+ bf1[62] = -bf0[62] + bf0[63];
+ bf1[63] = bf0[63] + bf0[62];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 10
+ stage++;
+ cospi = cospi_arr(cos_bit);
+ bf0 = output;
+ bf1 = step;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[1];
+ bf1[2] = bf0[2];
+ bf1[3] = bf0[3];
+ bf1[4] = bf0[4];
+ bf1[5] = bf0[5];
+ bf1[6] = bf0[6];
+ bf1[7] = bf0[7];
+ bf1[8] = bf0[8];
+ bf1[9] = bf0[9];
+ bf1[10] = bf0[10];
+ bf1[11] = bf0[11];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[13];
+ bf1[14] = bf0[14];
+ bf1[15] = bf0[15];
+ bf1[16] = bf0[16];
+ bf1[17] = bf0[17];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[19];
+ bf1[20] = bf0[20];
+ bf1[21] = bf0[21];
+ bf1[22] = bf0[22];
+ bf1[23] = bf0[23];
+ bf1[24] = bf0[24];
+ bf1[25] = bf0[25];
+ bf1[26] = bf0[26];
+ bf1[27] = bf0[27];
+ bf1[28] = bf0[28];
+ bf1[29] = bf0[29];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[31];
+ bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit);
+ bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit);
+ bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit);
+ bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit);
+ bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit);
+ bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit);
+ bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit);
+ bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit);
+ bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit);
+ bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit);
+ bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit);
+ bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit);
+ bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit);
+ bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit);
+ bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit);
+ bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit);
+ bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit);
+ bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit);
+ bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit);
+ bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit);
+ bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit);
+ bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit);
+ bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit);
+ bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit);
+ bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit);
+ bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit);
+ bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit);
+ bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit);
+ bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit);
+ bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit);
+ bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit);
+ bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit);
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+
+ // stage 11
+ stage++;
+ bf0 = step;
+ bf1 = output;
+ bf1[0] = bf0[0];
+ bf1[1] = bf0[32];
+ bf1[2] = bf0[16];
+ bf1[3] = bf0[48];
+ bf1[4] = bf0[8];
+ bf1[5] = bf0[40];
+ bf1[6] = bf0[24];
+ bf1[7] = bf0[56];
+ bf1[8] = bf0[4];
+ bf1[9] = bf0[36];
+ bf1[10] = bf0[20];
+ bf1[11] = bf0[52];
+ bf1[12] = bf0[12];
+ bf1[13] = bf0[44];
+ bf1[14] = bf0[28];
+ bf1[15] = bf0[60];
+ bf1[16] = bf0[2];
+ bf1[17] = bf0[34];
+ bf1[18] = bf0[18];
+ bf1[19] = bf0[50];
+ bf1[20] = bf0[10];
+ bf1[21] = bf0[42];
+ bf1[22] = bf0[26];
+ bf1[23] = bf0[58];
+ bf1[24] = bf0[6];
+ bf1[25] = bf0[38];
+ bf1[26] = bf0[22];
+ bf1[27] = bf0[54];
+ bf1[28] = bf0[14];
+ bf1[29] = bf0[46];
+ bf1[30] = bf0[30];
+ bf1[31] = bf0[62];
+ bf1[32] = bf0[1];
+ bf1[33] = bf0[33];
+ bf1[34] = bf0[17];
+ bf1[35] = bf0[49];
+ bf1[36] = bf0[9];
+ bf1[37] = bf0[41];
+ bf1[38] = bf0[25];
+ bf1[39] = bf0[57];
+ bf1[40] = bf0[5];
+ bf1[41] = bf0[37];
+ bf1[42] = bf0[21];
+ bf1[43] = bf0[53];
+ bf1[44] = bf0[13];
+ bf1[45] = bf0[45];
+ bf1[46] = bf0[29];
+ bf1[47] = bf0[61];
+ bf1[48] = bf0[3];
+ bf1[49] = bf0[35];
+ bf1[50] = bf0[19];
+ bf1[51] = bf0[51];
+ bf1[52] = bf0[11];
+ bf1[53] = bf0[43];
+ bf1[54] = bf0[27];
+ bf1[55] = bf0[59];
+ bf1[56] = bf0[7];
+ bf1[57] = bf0[39];
+ bf1[58] = bf0[23];
+ bf1[59] = bf0[55];
+ bf1[60] = bf0[15];
+ bf1[61] = bf0[47];
+ bf1[62] = bf0[31];
+ bf1[63] = bf0[63];
+ av1_range_check_buf(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
new file mode 100644
index 0000000000..9dcf16552c
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
+
+#include "av1/common/av1_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_DCT4
+void av1_fdct8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_DCT8
+void av1_fdct16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_DCT16
+void av1_fdct32_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_DCT32
+void av1_fdct64_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_DCT64
+void av1_fadst4_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_ADST4
+void av1_fadst8_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_ADST8
+void av1_fadst16_new(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_ADST16
+void av1_fidentity4_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_IDENTITY4
+void av1_fidentity8_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_IDENTITY8
+void av1_fidentity16_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_IDENTITY16
+void av1_fidentity32_c(const int32_t *input, int32_t *output, int8_t cos_bit,
+ const int8_t *stage_range);  // kernel dispatched for TXFM_TYPE_IDENTITY32
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
new file mode 100644
index 0000000000..98b6530db1
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#define AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
+#include "av1/common/enums.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+extern const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL];  // one pointer per entry to a 3-element shift table
+extern const int8_t fwd_cos_bit_col[5][5];  // defined with [MAX_TXWH_IDX][MAX_TXWH_IDX] bounds, indexed [txw_idx][txh_idx]
+extern const int8_t fwd_cos_bit_row[5][5];  // same bounds and indexing as fwd_cos_bit_col
+#endif // AOM_AV1_ENCODER_AV1_FWD_TXFM1D_CFG_H_
diff --git a/third_party/aom/av1/encoder/av1_fwd_txfm2d.c b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
new file mode 100644
index 0000000000..f25a667cf4
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_fwd_txfm2d.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/av1_fwd_txfm1d.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {  // maps a TXFM_TYPE_* value to its forward kernel
+ switch (txfm_type) {
+ case TXFM_TYPE_DCT4: return av1_fdct4_new;
+ case TXFM_TYPE_DCT8: return av1_fdct8_new;
+ case TXFM_TYPE_DCT16: return av1_fdct16_new;
+ case TXFM_TYPE_DCT32: return av1_fdct32_new;
+ case TXFM_TYPE_DCT64: return av1_fdct64_new;
+ case TXFM_TYPE_ADST4: return av1_fadst4_new;
+ case TXFM_TYPE_ADST8: return av1_fadst8_new;
+ case TXFM_TYPE_ADST16: return av1_fadst16_new;
+ case TXFM_TYPE_IDENTITY4: return av1_fidentity4_c;
+ case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
+ case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
+ case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+ default: assert(0); return NULL;  // any other type trips the assert; NULL is returned if asserts are compiled out
+ }
+}
+
+void av1_gen_fwd_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
+ const TXFM_2D_FLIP_CFG *cfg, int bd) {
+ // Fills both output arrays, one entry per stage; the shift table is read directly from cfg.
+ const int8_t *shift = cfg->shift;
+ // The extra i < MAX_TXFM_STAGE_NUM bound only silences an array-bounds warning.
+ for (int i = 0; i < cfg->stage_num_col && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_col[i] = cfg->stage_range_col[i] + shift[0] + bd + 1;  // column stages add shift[0] only
+ }
+
+ // Same warning-silencing bound as in the column loop.
+ for (int i = 0; i < cfg->stage_num_row && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_row[i] = cfg->stage_range_row[i] + shift[0] + shift[1] + bd + 1;  // row stages add shift[0] and shift[1]
+ }
+}
+
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+ const int stride, const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *buf, int bd) {  // buf is indexed as txfm_size_row rows of txfm_size_col entries
+ int c, r;
+ // The column and row counts are looked up from cfg->tx_size through
+ // tx_size_wide[] and tx_size_high[]. Processing order: every input
+ // column is copied (optionally flipped vertically), shifted by shift[0],
+ // passed through the column kernel, shifted by shift[1] and stored in
+ // buf (optionally mirrored horizontally); then every row of buf goes
+ // through the row kernel into output and is shifted by shift[2].
+ const int txfm_size_col = tx_size_wide[cfg->tx_size];
+ const int txfm_size_row = tx_size_high[cfg->tx_size];
+ // Per-pass shift amounts; each one is negated when handed to av1_round_shift_array.
+ const int8_t *shift = cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
+ int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
+ assert(cfg->stage_num_col <= MAX_TXFM_STAGE_NUM);
+ assert(cfg->stage_num_row <= MAX_TXFM_STAGE_NUM);
+ av1_gen_fwd_stage_range(stage_range_col, stage_range_row, cfg, bd);
+
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ // Column scratch lives in output: temp_in at its start, temp_out txfm_size_row entries later.
+ int32_t *temp_in = output;
+ int32_t *temp_out = output + txfm_size_row;
+
+ // Column pass: one 1-D transform per input column, results written to buf.
+ for (c = 0; c < txfm_size_col; ++c) {
+ if (cfg->ud_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r) temp_in[r] = input[r * stride + c];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // ud_flip set: read the column bottom-to-top
+ temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
+ }
+ av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ if (cfg->lr_flip == 0) {
+ for (r = 0; r < txfm_size_row; ++r)
+ buf[r * txfm_size_col + c] = temp_out[r];
+ } else {
+ for (r = 0; r < txfm_size_row; ++r)
+ // lr_flip set: store into the mirrored column of buf
+ buf[r * txfm_size_col + (txfm_size_col - c - 1)] = temp_out[r];
+ }
+ }
+
+ // Row pass: one 1-D transform per row of buf, written to the same row of output.
+ for (r = 0; r < txfm_size_row; ++r) {
+ txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
+ cos_bit_row, stage_range_row);
+ av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
+ if (abs(rect_type) == 1) {
+ // Multiply everything by Sqrt2 if the transform is rectangular and the
+ // size difference is a factor of 2 (product widened to int64_t first).
+ for (c = 0; c < txfm_size_col; ++c) {
+ output[r * txfm_size_col + c] = round_shift(
+ (int64_t)output[r * txfm_size_col + c] * NewSqrt2, NewSqrt2Bits);
+ }
+ }
+ }
+}
+
+void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_4X8 configuration
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_8X4 configuration
+ int32_t txfm_buf[8 * 4];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X4, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_8X16 configuration
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_16X8 configuration
+ int32_t txfm_buf[16 * 8];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X8, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_16X32 configuration
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_32X16 configuration
+ int32_t txfm_buf[32 * 16];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X16, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_4X16 configuration
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_16X4 configuration
+ int32_t txfm_buf[16 * 4];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X4, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_8X32 configuration
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_32X8 configuration
+ int32_t txfm_buf[32 * 8];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X8, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_4X4 configuration
+ int32_t txfm_buf[4 * 4];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X4, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_8X8 configuration
+ int32_t txfm_buf[8 * 8];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X8, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_16X16 configuration
+ int32_t txfm_buf[16 * 16];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X16, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // forward 2-D transform using the TX_32X32 configuration
+ int32_t txfm_buf[32 * 32];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+}
+
+void av1_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // TX_64X64 transform, then keep only the top-left 32x32 coefficients
+ int32_t txfm_buf[64 * 64];  // intermediate storage passed to fwd_txfm2d_c as buf (4096 int32_t on the stack)
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out top-right 32x32 area (columns 32..63 of rows 0..31, row stride 64).
+ for (int row = 0; row < 32; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Zero out the bottom 64x32 area (rows 32..63).
+ memset(output + 32 * 64, 0, 32 * 64 * sizeof(*output));
+ // Re-pack rows 1..31 from stride 64 to stride 32; the copied-from entries are left as is.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));  // ranges never overlap for row >= 1
+ }
+}
+
+void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // TX_32X64 transform, then keep only the top 32 rows
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 32x32 area (rows 32..63, row stride 32).
+ memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
+ // Note: no repacking needed here; the row stride is already 32.
+}
+
+void av1_fwd_txfm2d_64x32_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // TX_64X32 transform, then keep only the left 32 columns
+ int32_t txfm_buf[64 * 32];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X32, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+
+ // Zero out right 32x32 area (columns 32..63 of every row, row stride 64).
+ for (int row = 0; row < 32; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack rows 1..31 from stride 64 to stride 32; the copied-from entries are left as is.
+ for (int row = 1; row < 32; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));  // ranges never overlap for row >= 1
+ }
+}
+
+void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // TX_16X64 transform, then keep only the top 32 rows
+ DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out the bottom 16x32 area (rows 32..63, row stride 16).
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+ // Note: no repacking needed here; the row stride is already 16.
+}
+
+void av1_fwd_txfm2d_64x16_c(const int16_t *input, int32_t *output, int stride,
+ TX_TYPE tx_type, int bd) {  // TX_64X16 transform, then keep only the left 32 columns
+ int32_t txfm_buf[64 * 16];  // intermediate storage passed to fwd_txfm2d_c as buf
+ TXFM_2D_FLIP_CFG cfg;
+ av1_get_fwd_txfm_cfg(tx_type, TX_64X16, &cfg);  // presumably fills cfg for this type/size; its definition is not in view
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+ // Zero out right 32x16 area (columns 32..63 of every row, row stride 64).
+ for (int row = 0; row < 16; ++row) {
+ memset(output + row * 64 + 32, 0, 32 * sizeof(*output));
+ }
+ // Re-pack rows 1..15 from stride 64 to stride 32; the copied-from entries are left as is.
+ for (int row = 1; row < 16; ++row) {
+ memcpy(output + row * 32, output + row * 64, 32 * sizeof(*output));  // ranges never overlap for row >= 1
+ }
+}
+
+// Per-transform-size shift triples, one table per block size (WxH in the
+// name). Each table holds exactly three shift amounts.
+// NOTE(review): presumably the three entries are applied at three successive
+// points of the 2D forward transform, with negative values meaning a right
+// shift -- confirm against fwd_txfm2d_c.
+static const int8_t fwd_shift_4x4[3] = { 2, 0, 0 };
+static const int8_t fwd_shift_8x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_64x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_4x8[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x16[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x32[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x16[3] = { 2, -4, 0 };
+static const int8_t fwd_shift_32x64[3] = { 0, -2, -2 };
+static const int8_t fwd_shift_64x32[3] = { 2, -4, -2 };
+static const int8_t fwd_shift_4x16[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_16x4[3] = { 2, -1, 0 };
+static const int8_t fwd_shift_8x32[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_32x8[3] = { 2, -2, 0 };
+static const int8_t fwd_shift_16x64[3] = { 0, -2, 0 };
+static const int8_t fwd_shift_64x16[3] = { 2, -4, 0 };
+
+// Shift triple for each transform size. av1_get_fwd_txfm_cfg() indexes this
+// list with the TX_SIZE value and stores the selected triple in cfg->shift.
+// NOTE(review): the entry order is assumed to follow the TX_SIZE enum order
+// -- confirm against the enum definition.
+const int8_t *fwd_txfm_shift_ls[TX_SIZES_ALL] = {
+ fwd_shift_4x4, fwd_shift_8x8, fwd_shift_16x16, fwd_shift_32x32,
+ fwd_shift_64x64, fwd_shift_4x8, fwd_shift_8x4, fwd_shift_8x16,
+ fwd_shift_16x8, fwd_shift_16x32, fwd_shift_32x16, fwd_shift_32x64,
+ fwd_shift_64x32, fwd_shift_4x16, fwd_shift_16x4, fwd_shift_8x32,
+ fwd_shift_32x8, fwd_shift_16x64, fwd_shift_64x16,
+};
+
+// cos_bit value for the column pass of the forward transform.
+// av1_get_fwd_txfm_cfg() reads it as fwd_cos_bit_col[txw_idx][txh_idx] and
+// stores the result in cfg->cos_bit_col.
+// NOTE(review): the 0 entries presumably mark width/height combinations for
+// which no transform size exists -- confirm.
+const int8_t fwd_cos_bit_col[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 13, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 13, 12, 13 },
+ { 0, 13, 13, 12, 13 },
+ { 0, 0, 13, 12, 13 }
+ };
+
+// cos_bit value for the row pass of the forward transform, read as
+// fwd_cos_bit_row[txw_idx][txh_idx] and stored in cfg->cos_bit_row.
+// NOTE(review): the 0 entries presumably mark width/height combinations for
+// which no transform size exists -- confirm.
+const int8_t fwd_cos_bit_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = {
+ { 13, 13, 12, 0, 0 },
+ { 13, 13, 13, 12, 0 },
+ { 13, 13, 12, 13, 12 },
+ { 0, 12, 13, 12, 11 },
+ { 0, 0, 12, 11, 10 }
+ };
+
+// Per-stage range tables for the 1D forward transforms, one entry per stage.
+// set_fwd_txfm_non_scale_range() converts each entry with (value + 1) >> 1,
+// so the tables appear to hold values at twice their real magnitude (hence
+// the "mult2" suffix).
+static const int8_t fdct4_range_mult2[4] = { 0, 2, 3, 3 };
+static const int8_t fdct8_range_mult2[6] = { 0, 2, 4, 5, 5, 5 };
+static const int8_t fdct16_range_mult2[8] = { 0, 2, 4, 6, 7, 7, 7, 7 };
+static const int8_t fdct32_range_mult2[10] = { 0, 2, 4, 6, 8, 9, 9, 9, 9, 9 };
+static const int8_t fdct64_range_mult2[12] = { 0, 2, 4, 6, 8, 10,
+ 11, 11, 11, 11, 11, 11 };
+
+static const int8_t fadst4_range_mult2[7] = { 0, 2, 4, 3, 3, 3, 3 };
+static const int8_t fadst8_range_mult2[8] = { 0, 0, 1, 3, 3, 5, 5, 5 };
+static const int8_t fadst16_range_mult2[10] = { 0, 0, 1, 3, 3, 5, 5, 7, 7, 7 };
+
+// Indexed by txh_idx in set_fwd_txfm_non_scale_range(), where the selected
+// value is added to every row-pass stage entry before the halving.
+static const int8_t max_fwd_range_mult2_col[5] = { 3, 5, 7, 9, 11 };
+
+// Identity transforms have a single stage, hence one entry each.
+static const int8_t fidtx4_range_mult2[1] = { 1 };
+static const int8_t fidtx8_range_mult2[1] = { 2 };
+static const int8_t fidtx16_range_mult2[1] = { 3 };
+static const int8_t fidtx32_range_mult2[1] = { 4 };
+
+// Disabled by the preprocessor; this table is not compiled.
+#if 0
+const int8_t fwd_idtx_range_row[MAX_TXWH_IDX /*txw_idx*/]
+ [MAX_TXWH_IDX /*txh_idx*/] = { { 2, 4, 5, 0, 0 },
+ { 3, 4, 5, 6, 0 },
+ { 4, 5, 6, 7, 8 },
+ { 0, 5, 6, 7, 8 },
+ { 0, 0, 7, 8,
+ 9 } };
+#endif
+
+// Range table for each 1D transform type; indexed with cfg->txfm_type_col
+// and cfg->txfm_type_row in set_fwd_txfm_non_scale_range().
+// NOTE(review): the entry order is assumed to match the transform-type enum
+// -- confirm against its definition.
+const int8_t *fwd_txfm_range_mult2_list[TXFM_TYPES] = {
+ fdct4_range_mult2, fdct8_range_mult2, fdct16_range_mult2,
+ fdct32_range_mult2, fdct64_range_mult2, fadst4_range_mult2,
+ fadst8_range_mult2, fadst16_range_mult2, fidtx4_range_mult2,
+ fidtx8_range_mult2, fidtx16_range_mult2, fidtx32_range_mult2
+};
+
+// Fills cfg->stage_range_col and cfg->stage_range_row from the per-type
+// "mult2" range tables. Both arrays are cleared first; a pass whose
+// transform type is TXFM_TYPE_INVALID is left all-zero. Row-pass entries
+// additionally include the column maximum selected by the transform height.
+static INLINE void set_fwd_txfm_non_scale_range(TXFM_2D_FLIP_CFG *cfg) {
+  const int txh_idx = get_txh_idx(cfg->tx_size);
+  int stage;
+
+  av1_zero(cfg->stage_range_col);
+  av1_zero(cfg->stage_range_row);
+
+  if (cfg->txfm_type_col != TXFM_TYPE_INVALID) {
+    const int col_stages = cfg->stage_num_col;
+    const int8_t *const col_table =
+        fwd_txfm_range_mult2_list[cfg->txfm_type_col];
+    for (stage = 0; stage < col_stages; ++stage) {
+      // Halve the doubled table value, rounding up.
+      cfg->stage_range_col[stage] = (col_table[stage] + 1) >> 1;
+    }
+  }
+
+  if (cfg->txfm_type_row != TXFM_TYPE_INVALID) {
+    const int row_stages = cfg->stage_num_row;
+    const int8_t *const row_table =
+        fwd_txfm_range_mult2_list[cfg->txfm_type_row];
+    // Column maximum plus the rounding term, shared by every row stage.
+    const int col_max_plus_one = max_fwd_range_mult2_col[txh_idx] + 1;
+    for (stage = 0; stage < row_stages; ++stage) {
+      cfg->stage_range_row[stage] =
+          (row_table[stage] + col_max_plus_one) >> 1;
+    }
+  }
+}
+
+// Populates `cfg` with everything the 2D forward transform needs for the
+// given transform type and size: flip flags, shift triple, cos_bit values,
+// the 1D transform type and stage count of each pass, and the per-stage
+// ranges. `cfg` must not be NULL.
+void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
+                          TXFM_2D_FLIP_CFG *cfg) {
+  assert(cfg != NULL);
+  cfg->tx_size = tx_size;
+  set_flip_cfg(tx_type, cfg);
+
+  // Width/height indices are log2 sizes relative to entry 0 of the tables.
+  const int w_idx = tx_size_wide_log2[tx_size] - tx_size_wide_log2[0];
+  const int h_idx = tx_size_high_log2[tx_size] - tx_size_high_log2[0];
+
+  cfg->shift = fwd_txfm_shift_ls[tx_size];
+  cfg->cos_bit_col = fwd_cos_bit_col[w_idx][h_idx];
+  cfg->cos_bit_row = fwd_cos_bit_row[w_idx][h_idx];
+
+  // Column pass: vertical 1D type, selected by the block height.
+  const TX_TYPE_1D col_1d = vtx_tab[tx_type];
+  cfg->txfm_type_col = av1_txfm_type_ls[h_idx][col_1d];
+  cfg->stage_num_col = av1_txfm_stage_num_list[cfg->txfm_type_col];
+
+  // Row pass: horizontal 1D type, selected by the block width.
+  const TX_TYPE_1D row_1d = htx_tab[tx_type];
+  cfg->txfm_type_row = av1_txfm_type_ls[w_idx][row_1d];
+  cfg->stage_num_row = av1_txfm_stage_num_list[cfg->txfm_type_row];
+
+  set_fwd_txfm_non_scale_range(cfg);
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.c b/third_party/aom/av1/encoder/av1_quantize.c
new file mode 100644
index 0000000000..a0a9260052
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.c
@@ -0,0 +1,738 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/idct.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/rd.h"
+
+// Produces an all-zero quantization result: clears n_coeffs entries of both
+// the quantized and the dequantized buffers and reports an end-of-block of 0.
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  const size_t qcoeff_bytes = n_coeffs * sizeof(*qcoeff_ptr);
+  const size_t dqcoeff_bytes = n_coeffs * sizeof(*dqcoeff_ptr);
+
+  memset(qcoeff_ptr, 0, qcoeff_bytes);
+  memset(dqcoeff_ptr, 0, dqcoeff_bytes);
+  *eob_ptr = 0;
+}
+
+// Shared C implementation behind the low-bitdepth "fp" quantizers.
+//
+// Quantizes n_coeffs coefficients from coeff_ptr into qcoeff_ptr and writes
+// the matching dequantized values to dqcoeff_ptr. Entry 0 of round_ptr,
+// quant_ptr and dequant_ptr is used for coefficient index 0 (DC) and entry 1
+// for every other coefficient. zbin_ptr and quant_shift_ptr are accepted but
+// unused. log_scale shrinks the rounding offset and the dequantized value
+// and lowers the keep threshold. *eob_ptr receives one plus the highest scan
+// position that produced a nonzero quantized value (0 when there is none).
+//
+// When qm_ptr and iqm_ptr are both NULL the flat path runs; otherwise
+// per-coefficient weights are applied, and a matrix that is NULL is replaced
+// by the neutral weight 1 << AOM_QM_BITS.
+static void quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i, eob = -1;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+
+ // Start from all-zero outputs; only coefficients that pass the threshold
+ // tests below are written.
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (qm_ptr == NULL && iqm_ptr == NULL) {
+ // Flat path: no quantization matrices.
+ const int rounding0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ { // rc == 0
+ const int coeff = coeff_ptr[0];
+ // coeff >> 31 is 0 for non-negative and -1 for negative values (relies
+ // on an arithmetic right shift of a 32-bit int), so (v ^ sign) - sign
+ // yields |v| here and re-applies the sign further down.
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ // Keep the coefficient only if |coeff| << (1 + log_scale) reaches the
+ // dequantization step.
+ if ((abs_coeff << (1 + log_scale)) >= (int32_t)(dequant_ptr[0])) {
+ abs_coeff = clamp64(abs_coeff + rounding0, INT16_MIN, INT16_MAX);
+ const int tmp32 = (int)((abs_coeff * quant_ptr[0]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[0] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[0]) >> log_scale;
+ dqcoeff_ptr[0] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ eob = 0;
+ }
+ }
+ }
+ const int rounding1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int32_t thresh1 = (int32_t)(dequant_ptr[1]);
+ // Remaining coefficients are visited by buffer index, not through scan[];
+ // iscan[i] is presumably the scan position of index i, so taking the
+ // maximum keeps eob in scan order.
+ for (i = 1; i < n_coeffs; i++) {
+ const int coeff = coeff_ptr[i];
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ if ((abs_coeff << (1 + log_scale)) >= thresh1) {
+ abs_coeff = clamp64(abs_coeff + rounding1, INT16_MIN, INT16_MAX);
+ const int tmp32 = (int)((abs_coeff * quant_ptr[1]) >> (16 - log_scale));
+ if (tmp32) {
+ qcoeff_ptr[i] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[1]) >> log_scale;
+ dqcoeff_ptr[i] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ eob = AOMMAX(iscan[i], eob);
+ }
+ }
+ }
+ } else {
+ // Weighted path: walk the coefficients in scan order so that the loop
+ // index of the last nonzero result is the end-of-block position.
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ // Dequantization step scaled by the inverse weight, rounded to nearest
+ // at AOM_QM_BITS precision.
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const int coeff_sign = (coeff >> 31);
+ int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32 = 0;
+ // Same keep threshold as the flat path, expressed in weighted units.
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_coeff = clamp64(abs_coeff, INT16_MIN, INT16_MAX);
+ tmp32 = (int)((abs_coeff * wt * quant_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (abs_dqcoeff ^ coeff_sign) - coeff_sign;
+ }
+
+ if (tmp32) eob = i;
+ }
+ }
+ // eob holds the last nonzero position or -1, so the stored value is a count.
+ *eob_ptr = eob + 1;
+}
+
+// Shared C implementation behind the high-bitdepth "fp" quantizers.
+//
+// Visits `count` coefficients in scan order, writing the quantized value to
+// qcoeff_ptr and the dequantized value to dqcoeff_ptr for every visited
+// position (zero when the coefficient falls below the keep threshold).
+// Entry 0 of round_ptr, quant_ptr and dequant_ptr is used for coefficient
+// index 0 (DC) and entry 1 for every other coefficient. zbin_ptr,
+// quant_shift_ptr and iscan are accepted but unused. Unlike the
+// low-bitdepth helper, the rounded magnitude is not clamped to the int16
+// range. *eob_ptr receives one plus the last scan position that produced a
+// nonzero quantized value (0 when there is none).
+//
+// If either qm_ptr or iqm_ptr is non-NULL the weighted path runs, and a
+// matrix that is NULL is replaced by the neutral weight 1 << AOM_QM_BITS.
+static void highbd_quantize_fp_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, int log_scale) {
+ int i;
+ int eob = -1;
+ const int shift = 16 - log_scale;
+ // TODO(jingning) Decide the need of these arguments after the
+ // quantization process is completed.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)iscan;
+
+ if (qm_ptr || iqm_ptr) {
+ // Weighted path: at least one quantization matrix was supplied.
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ // Dequantization step scaled by the inverse weight, rounded to nearest
+ // at AOM_QM_BITS precision.
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ // coeff >> 31 is 0 for non-negative and -1 for negative values (relies
+ // on an arithmetic right shift of a 32-bit int), so (v ^ sign) - sign
+ // yields |v| here and re-applies the sign further down.
+ const int coeff_sign = (coeff >> 31);
+ const int64_t abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int abs_qcoeff = 0;
+ // Keep threshold expressed in weighted units.
+ if (abs_coeff * wt >=
+ (dequant_ptr[rc != 0] << (AOM_QM_BITS - (1 + log_scale)))) {
+ const int64_t tmp =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ abs_qcoeff =
+ (int)((tmp * quant_ptr[rc != 0] * wt) >> (shift + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = i;
+ } else {
+ // Outputs are not pre-cleared in this helper, so zero explicitly.
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ } else {
+ // Flat path: rounding offsets for DC ([0]) and AC ([1]) are scaled once.
+ const int log_scaled_round_arr[2] = {
+ ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale),
+ };
+ for (i = 0; i < count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ // 0 selects the DC parameters, 1 the AC parameters.
+ const int rc01 = (rc != 0);
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int log_scaled_round = log_scaled_round_arr[rc01];
+ // Keep the coefficient only if |coeff| << (1 + log_scale) reaches the
+ // dequantization step.
+ if ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01]) {
+ const int quant = quant_ptr[rc01];
+ const int dequant = dequant_ptr[rc01];
+ const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int abs_qcoeff = (int)((tmp * quant) >> shift);
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ if (abs_qcoeff) eob = i;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ } else {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ }
+ }
+ // eob holds the last nonzero scan position or -1, so store it as a count.
+ *eob_ptr = eob + 1;
+}
+
+// C reference "fp" quantizer at log_scale 0, without quantization matrices.
+// Forwards every argument to quantize_fp_helper_c.
+void av1_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                       const int16_t *zbin_ptr, const int16_t *round_ptr,
+                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 0;
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, /*qm_ptr=*/NULL,
+                       /*iqm_ptr=*/NULL, log_scale);
+}
+
+// C reference "fp" quantizer at log_scale 1, without quantization matrices.
+// Forwards every argument to quantize_fp_helper_c.
+void av1_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 1;
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, /*qm_ptr=*/NULL,
+                       /*iqm_ptr=*/NULL, log_scale);
+}
+
+// C reference "fp" quantizer at log_scale 2, without quantization matrices.
+// Forwards every argument to quantize_fp_helper_c.
+void av1_quantize_fp_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan) {
+  const int log_scale = 2;
+  quantize_fp_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+                       quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+                       eob_ptr, scan, iscan, /*qm_ptr=*/NULL,
+                       /*iqm_ptr=*/NULL, log_scale);
+}
+
+// Dispatches "fp" quantization of one transform block.
+//
+// When both quantization matrices are present in `qparam`, the weighted C
+// helper is used. Otherwise the matrices are ignored and the quantizer is
+// chosen by qparam->log_scale (0, 1 or 2); at log_scale 0, blocks with fewer
+// than 16 coefficients go to the C helper instead of av1_quantize_fp.
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+  const qm_val_t *const qmatrix = qparam->qmatrix;
+  const qm_val_t *const iqmatrix = qparam->iqmatrix;
+
+  if (qmatrix != NULL && iqmatrix != NULL) {
+    quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                         p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                         dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                         sc->iscan, qmatrix, iqmatrix, qparam->log_scale);
+    return;
+  }
+
+  switch (qparam->log_scale) {
+    case 0:
+      if (n_coeffs >= 16) {
+        av1_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                        p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                        sc->iscan);
+      } else {
+        // TODO(jingning): Need SIMD implementation for smaller block size
+        // quantization.
+        quantize_fp_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+                             p->round_fp_QTX, p->quant_fp_QTX,
+                             p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+                             p->dequant_QTX, eob_ptr, sc->scan, sc->iscan,
+                             NULL, NULL, 0);
+      }
+      break;
+    case 1:
+      av1_quantize_fp_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                            dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                            sc->iscan);
+      break;
+    case 2:
+      av1_quantize_fp_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                            p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                            dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                            sc->iscan);
+      break;
+    default: assert(0);
+  }
+}
+
+// Dispatches "b" quantization of one transform block.
+//
+// When both quantization matrices are present in `qparam`, the weighted C
+// helper is used. Otherwise the matrices are ignored and the aom_quantize_b
+// variant is chosen by qparam->log_scale (0, 1 or 2).
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                           const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                           const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+  const qm_val_t *const qmatrix = qparam->qmatrix;
+  const qm_val_t *const iqmatrix = qparam->iqmatrix;
+
+  if (qmatrix != NULL && iqmatrix != NULL) {
+    quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                        p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                        dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                        sc->iscan, qmatrix, iqmatrix, qparam->log_scale);
+    return;
+  }
+
+  switch (qparam->log_scale) {
+    case 0:
+      aom_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                     p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                     dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                     sc->iscan);
+      break;
+    case 1:
+      aom_quantize_b_32x32(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                           p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                           dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                           sc->iscan);
+      break;
+    case 2:
+      aom_quantize_b_64x64(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                           p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                           dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                           sc->iscan);
+      break;
+    default: assert(0);
+  }
+}
+
+// Quantizes only the DC coefficient (index 0) of a block.
+//
+// All n_coeffs entries of qcoeff_ptr and dqcoeff_ptr are cleared first.
+// Unless skip_block is nonzero, coefficient 0 is then quantized with the
+// scalar `quant` and dequantized with the scalar `dequant_ptr` (a value, not
+// a pointer, despite its name), optionally weighted by entry 0 of qm_ptr and
+// iqm_ptr; a NULL matrix is replaced by the neutral weight 1 << AOM_QM_BITS.
+// No keep threshold is applied. *eob_ptr is set to 1 if the quantized DC
+// value is nonzero and to 0 otherwise.
+static void quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr,
+ uint16_t *eob_ptr, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int rc = 0;
+ const int coeff = coeff_ptr[rc];
+ // coeff >> 31 is 0 for non-negative and -1 for negative values (relies on
+ // an arithmetic right shift of a 32-bit int), so (v ^ sign) - sign yields
+ // |v| here and re-applies the sign further down.
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int64_t tmp;
+ int eob = -1;
+ int32_t tmp32;
+ int dequant;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const int wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ // rc is 0, so round_ptr[rc != 0] is round_ptr[0]. The rounded magnitude
+ // is clamped to the int16 range before the multiply.
+ tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp32 = (int32_t)((tmp * wt * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ // Dequantization step scaled by the inverse weight, rounded to nearest
+ // at AOM_QM_BITS precision.
+ dequant = (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (tmp32) eob = 0;
+ }
+ // eob is 0 or -1 here, so the stored value is a count of 1 or 0.
+ *eob_ptr = eob + 1;
+}
+
+// Facade for DC-only quantization: quantizes coefficient 0 with the plane's
+// DC parameters and the matrices/scale from `qparam`. The scan order is not
+// needed and is ignored.
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const SCAN_ORDER *sc, const QUANT_PARAM *qparam) {
+  (void)sc;
+  assert(qparam->log_scale >= 0 && qparam->log_scale < (3));
+
+  // skip_block is obsolete and always passed as 0.
+  quantize_dc(coeff_ptr, (int)n_coeffs, /*skip_block=*/0, p->round_QTX,
+              p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX[0],
+              eob_ptr, qparam->qmatrix, qparam->iqmatrix, qparam->log_scale);
+}
+
+// Dispatches high-bitdepth "fp" quantization of one transform block.
+//
+// When both quantization matrices are present in `qparam`, the weighted C
+// helper is used. Otherwise the matrices are ignored; blocks with fewer than
+// 16 coefficients use the C implementation and all others use
+// av1_highbd_quantize_fp.
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const SCAN_ORDER *sc,
+                                   const QUANT_PARAM *qparam) {
+  const qm_val_t *const qmatrix = qparam->qmatrix;
+  const qm_val_t *const iqmatrix = qparam->iqmatrix;
+
+  if (qmatrix != NULL && iqmatrix != NULL) {
+    highbd_quantize_fp_helper_c(
+        coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+        p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+        sc->scan, sc->iscan, qmatrix, iqmatrix, qparam->log_scale);
+  } else if (n_coeffs < 16) {
+    // TODO(jingning): Need SIMD implementation for smaller block size
+    // quantization.
+    av1_highbd_quantize_fp_c(
+        coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX, p->quant_fp_QTX,
+        p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX, eob_ptr,
+        sc->scan, sc->iscan, qparam->log_scale);
+  } else {
+    av1_highbd_quantize_fp(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_fp_QTX,
+                           p->quant_fp_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                           dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                           sc->iscan, qparam->log_scale);
+  }
+}
+
+// Dispatches high-bitdepth "b" quantization of one transform block.
+//
+// When both quantization matrices are present in `qparam`, the weighted C
+// helper is used. Otherwise the matrices are ignored and the quantizer is
+// chosen by qparam->log_scale (0, 1 or 2); at log_scale 0, blocks with fewer
+// than 8 coefficients use the plain C implementation.
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                  const SCAN_ORDER *sc,
+                                  const QUANT_PARAM *qparam) {
+  const qm_val_t *const qmatrix = qparam->qmatrix;
+  const qm_val_t *const iqmatrix = qparam->iqmatrix;
+
+  if (qmatrix != NULL && iqmatrix != NULL) {
+    highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                               p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                               dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                               sc->iscan, qmatrix, iqmatrix,
+                               qparam->log_scale);
+    return;
+  }
+
+  switch (qparam->log_scale) {
+    case 0:
+      if (LIKELY(n_coeffs >= 8)) {
+        aom_highbd_quantize_b(coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX,
+                              p->quant_QTX, p->quant_shift_QTX, qcoeff_ptr,
+                              dqcoeff_ptr, p->dequant_QTX, eob_ptr, sc->scan,
+                              sc->iscan);
+      } else {
+        // TODO(luoyi): Need SIMD (e.g. sse2) for smaller block size
+        // quantization
+        aom_highbd_quantize_b_c(coeff_ptr, n_coeffs, p->zbin_QTX,
+                                p->round_QTX, p->quant_QTX,
+                                p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+                                p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
+      }
+      break;
+    case 1:
+      aom_highbd_quantize_b_32x32(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+          eob_ptr, sc->scan, sc->iscan);
+      break;
+    case 2:
+      aom_highbd_quantize_b_64x64(
+          coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
+          p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
+          eob_ptr, sc->scan, sc->iscan);
+      break;
+    default: assert(0);
+  }
+}
+
+// High-bitdepth counterpart of the DC-only quantizer.
+//
+// All n_coeffs entries of qcoeff_ptr and dqcoeff_ptr are cleared first.
+// Unless skip_block is nonzero, coefficient 0 is then quantized with the
+// scalar `quant` and dequantized with the scalar `dequant_ptr` (a value, not
+// a pointer, despite its name), optionally weighted by entry 0 of qm_ptr and
+// iqm_ptr; a NULL matrix is replaced by the neutral weight 1 << AOM_QM_BITS.
+// The rounded magnitude is not clamped and no keep threshold is applied.
+// *eob_ptr is set to 1 if the quantized DC value is nonzero, else to 0.
+static INLINE void highbd_quantize_dc(
+ const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+ const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t dequant_ptr, uint16_t *eob_ptr,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr, const int log_scale) {
+ int eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ if (!skip_block) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[0] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[0] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[0];
+ // coeff >> 31 is 0 for non-negative and -1 for negative values (relies
+ // on an arithmetic right shift of a 32-bit int), so (v ^ sign) - sign
+ // yields |v| here and re-applies the sign further down.
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ const int64_t tmpw = tmp * wt;
+ const int abs_qcoeff =
+ (int)((tmpw * quant) >> (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ // Dequantization step scaled by the inverse weight, rounded to nearest
+ // at AOM_QM_BITS precision.
+ const int dequant =
+ (dequant_ptr * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[0] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = 0;
+ }
+ // eob is 0 or -1 here, so the stored value is a count of 1 or 0.
+ *eob_ptr = eob + 1;
+}
+
+// Facade for high-bitdepth DC-only quantization: quantizes coefficient 0
+// with the plane's DC parameters and the matrices/scale from `qparam`. The
+// scan order is not needed and is ignored.
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const SCAN_ORDER *sc,
+                                   const QUANT_PARAM *qparam) {
+  (void)sc;
+
+  // skip_block is obsolete and always passed as 0.
+  highbd_quantize_dc(coeff_ptr, (int)n_coeffs, /*skip_block=*/0, p->round_QTX,
+                     p->quant_fp_QTX[0], qcoeff_ptr, dqcoeff_ptr,
+                     p->dequant_QTX[0], eob_ptr, qparam->qmatrix,
+                     qparam->iqmatrix, qparam->log_scale);
+}
+
+// C reference high-bitdepth "fp" quantizer without quantization matrices.
+// Forwards every argument, including log_scale, to
+// highbd_quantize_fp_helper_c.
+void av1_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan,
+                              int log_scale) {
+  const qm_val_t *const no_matrix = NULL;
+  highbd_quantize_fp_helper_c(coeff_ptr, count, zbin_ptr, round_ptr, quant_ptr,
+                              quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr,
+                              dequant_ptr, eob_ptr, scan, iscan, no_matrix,
+                              no_matrix, log_scale);
+}
+
+// Derives the multiplier/shift pair stored in the *_quant and *_quant_shift
+// tables from the quantizer step `d`.
+//   *quant = 1 + 2^(16 + floor(log2(d))) / d - 2^16, narrowed to int16_t
+//   *shift = 2^(16 - floor(log2(d)))
+// `d` must be positive.
+static void invert_quant(int16_t *quant, int16_t *shift, int d) {
+  // Count how many times d can be halved before reaching 1.
+  int log2_d = 0;
+  uint32_t remaining = d;
+  while (remaining > 1) {
+    remaining >>= 1;
+    ++log2_d;
+  }
+
+  const int reciprocal = 1 + (1 << (16 + log2_d)) / d;
+  *quant = (int16_t)(reciprocal - (1 << 16));
+  *shift = 1 << (16 - log2_d);
+}
+
+// Returns the zero-bin factor for qindex `q` at the given bit depth: 64 when
+// q is 0, otherwise 84 if the DC quantizer step is below a bit-depth
+// specific limit and 80 if not. Returns -1 (after asserting) for an
+// unsupported bit depth.
+static int get_qzbin_factor(int q, aom_bit_depth_t bit_depth) {
+  const int quant = av1_dc_quant_Q3(q, 0, bit_depth);
+  int small_step_limit;
+
+  switch (bit_depth) {
+    case AOM_BITS_8: small_step_limit = 148; break;
+    case AOM_BITS_10: small_step_limit = 592; break;
+    case AOM_BITS_12: small_step_limit = 2368; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1;
+  }
+
+  if (q == 0) return 64;
+  return quant < small_step_limit ? 84 : 80;
+}
+
+// Fills the quantizer (`quants`) and dequantizer (`deq`) lookup tables for
+// every qindex in [0, QINDEX_RANGE) and for the Y, U and V planes.
+//
+// For each plane, entry [q][0] is derived from the DC step and entry [q][1]
+// from the AC step; entries [q][2..7] replicate the AC entry so that a row
+// can be consumed at SIMD width. The *_delta_q arguments are the per-plane
+// qindex offsets handed to the av1_{dc,ac}_quant_* lookups (the luma AC
+// lookup always uses an offset of 0).
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+                         int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+                         int v_ac_delta_q, QUANTS *const quants,
+                         Dequants *const deq) {
+  int i, q, quant_Q3, quant_QTX;
+
+  for (q = 0; q < QINDEX_RANGE; q++) {
+    const int qzbin_factor = get_qzbin_factor(q, bit_depth);
+    const int qrounding_factor = q == 0 ? 64 : 48;
+
+    // i == 0 sets up the DC entry, i == 1 the AC entry.
+    for (i = 0; i < 2; ++i) {
+      int qrounding_factor_fp = 64;
+      // y quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, y_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, 0, bit_depth);
+      // y quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, y_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, 0, bit_depth);
+      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i],
+                   quant_QTX);
+      quants->y_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->y_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->y_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->y_dequant_QTX[q][i] = quant_QTX;
+      deq->y_dequant_Q3[q][i] = quant_Q3;
+
+      // u quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, u_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, u_ac_delta_q, bit_depth);
+      // u quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, u_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, u_ac_delta_q, bit_depth);
+      invert_quant(&quants->u_quant[q][i], &quants->u_quant_shift[q][i],
+                   quant_QTX);
+      quants->u_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->u_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->u_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->u_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->u_dequant_QTX[q][i] = quant_QTX;
+      deq->u_dequant_Q3[q][i] = quant_Q3;
+
+      // v quantizer setup with original coeff shift of Q3
+      quant_Q3 = i == 0 ? av1_dc_quant_Q3(q, v_dc_delta_q, bit_depth)
+                        : av1_ac_quant_Q3(q, v_ac_delta_q, bit_depth);
+      // v quantizer with TX scale
+      quant_QTX = i == 0 ? av1_dc_quant_QTX(q, v_dc_delta_q, bit_depth)
+                         : av1_ac_quant_QTX(q, v_ac_delta_q, bit_depth);
+      invert_quant(&quants->v_quant[q][i], &quants->v_quant_shift[q][i],
+                   quant_QTX);
+      quants->v_quant_fp[q][i] = (1 << 16) / quant_QTX;
+      quants->v_round_fp[q][i] = (qrounding_factor_fp * quant_QTX) >> 7;
+      quants->v_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant_QTX, 7);
+      quants->v_round[q][i] = (qrounding_factor * quant_QTX) >> 7;
+      deq->v_dequant_QTX[q][i] = quant_QTX;
+      deq->v_dequant_Q3[q][i] = quant_Q3;
+    }
+
+    // Replicate the AC entry into lanes 2..7 of every table row.
+    for (i = 2; i < 8; i++) {  // 8: SIMD width
+      quants->y_quant[q][i] = quants->y_quant[q][1];
+      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
+      quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
+      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
+      quants->y_zbin[q][i] = quants->y_zbin[q][1];
+      quants->y_round[q][i] = quants->y_round[q][1];
+      deq->y_dequant_QTX[q][i] = deq->y_dequant_QTX[q][1];
+      deq->y_dequant_Q3[q][i] = deq->y_dequant_Q3[q][1];
+
+      quants->u_quant[q][i] = quants->u_quant[q][1];
+      quants->u_quant_fp[q][i] = quants->u_quant_fp[q][1];
+      quants->u_round_fp[q][i] = quants->u_round_fp[q][1];
+      quants->u_quant_shift[q][i] = quants->u_quant_shift[q][1];
+      quants->u_zbin[q][i] = quants->u_zbin[q][1];
+      quants->u_round[q][i] = quants->u_round[q][1];
+      deq->u_dequant_QTX[q][i] = deq->u_dequant_QTX[q][1];
+      deq->u_dequant_Q3[q][i] = deq->u_dequant_Q3[q][1];
+
+      // The V row must replicate the V-plane AC entry. Copying the U-plane
+      // value here would be wrong whenever the U and V AC delta-q differ.
+      quants->v_quant[q][i] = quants->v_quant[q][1];
+      quants->v_quant_fp[q][i] = quants->v_quant_fp[q][1];
+      quants->v_round_fp[q][i] = quants->v_round_fp[q][1];
+      quants->v_quant_shift[q][i] = quants->v_quant_shift[q][1];
+      quants->v_zbin[q][i] = quants->v_zbin[q][1];
+      quants->v_round[q][i] = quants->v_round[q][1];
+      deq->v_dequant_QTX[q][i] = deq->v_dequant_QTX[q][1];
+      deq->v_dequant_Q3[q][i] = deq->v_dequant_Q3[q][1];
+    }
+  }
+}
+
+// Rebuilds the encoder's quantizer and dequantizer tables from the bit depth
+// and per-plane delta-q values currently stored in cpi->common.
+void av1_init_quantizer(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  av1_build_quantizer(cm->seq_params.bit_depth, cm->y_dc_delta_q,
+                      cm->u_dc_delta_q, cm->u_ac_delta_q, cm->v_dc_delta_q,
+                      cm->v_ac_delta_q, &cpi->quants, &cpi->dequants);
+}
+
+// Selects the quantizer state of macroblock `x` for segment `segment_id`.
+//
+// The effective qindex is the frame base qindex (plus the block delta when
+// delta-q is enabled), clamped to the valid range and then mapped through
+// the segment. For each of the three planes the quantizer table rows for
+// that qindex are attached, the segment's quantization and inverse
+// quantization matrix entries are copied, and the dequantizer rows are set.
+// The skip flag, qindex, error-per-bit and motion-estimation constants of
+// `x` are updated last.
+void av1_init_plane_quantizers(const AV1_COMP *cpi, MACROBLOCK *x,
+                               int segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const QUANTS *const quants = &cpi->quants;
+
+  const int unclamped_qindex = cpi->oxcf.deltaq_mode != NO_DELTA_Q
+                                   ? cm->base_qindex + xd->delta_qindex
+                                   : cm->base_qindex;
+  const int current_qindex =
+      AOMMAX(0, AOMMIN(QINDEX_RANGE - 1, unclamped_qindex));
+  const int qindex = av1_get_qindex(&cm->seg, segment_id, current_qindex);
+  const int rdmult = av1_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
+
+  // Lossless segments, and frames that do not use quantization matrices,
+  // always take the last QM level instead of the per-plane level.
+  const int force_last_qmlevel =
+      xd->lossless[segment_id] || cm->using_qmatrix == 0;
+  int qmlevel;
+
+  // Y
+  qmlevel = force_last_qmlevel ? NUM_QM_LEVELS - 1 : cm->qm_y;
+  x->plane[0].quant_QTX = quants->y_quant[qindex];
+  x->plane[0].quant_fp_QTX = quants->y_quant_fp[qindex];
+  x->plane[0].round_fp_QTX = quants->y_round_fp[qindex];
+  x->plane[0].quant_shift_QTX = quants->y_quant_shift[qindex];
+  x->plane[0].zbin_QTX = quants->y_zbin[qindex];
+  x->plane[0].round_QTX = quants->y_round[qindex];
+  x->plane[0].dequant_QTX = cpi->dequants.y_dequant_QTX[qindex];
+  memcpy(&xd->plane[0].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][0],
+         sizeof(cm->gqmatrix[qmlevel][0]));
+  memcpy(&xd->plane[0].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][0],
+         sizeof(cm->giqmatrix[qmlevel][0]));
+  xd->plane[0].dequant_Q3 = cpi->dequants.y_dequant_Q3[qindex];
+
+  // U
+  qmlevel = force_last_qmlevel ? NUM_QM_LEVELS - 1 : cm->qm_u;
+  x->plane[1].quant_QTX = quants->u_quant[qindex];
+  x->plane[1].quant_fp_QTX = quants->u_quant_fp[qindex];
+  x->plane[1].round_fp_QTX = quants->u_round_fp[qindex];
+  x->plane[1].quant_shift_QTX = quants->u_quant_shift[qindex];
+  x->plane[1].zbin_QTX = quants->u_zbin[qindex];
+  x->plane[1].round_QTX = quants->u_round[qindex];
+  x->plane[1].dequant_QTX = cpi->dequants.u_dequant_QTX[qindex];
+  memcpy(&xd->plane[1].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][1],
+         sizeof(cm->gqmatrix[qmlevel][1]));
+  memcpy(&xd->plane[1].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][1],
+         sizeof(cm->giqmatrix[qmlevel][1]));
+  xd->plane[1].dequant_Q3 = cpi->dequants.u_dequant_Q3[qindex];
+
+  // V
+  qmlevel = force_last_qmlevel ? NUM_QM_LEVELS - 1 : cm->qm_v;
+  x->plane[2].quant_QTX = quants->v_quant[qindex];
+  x->plane[2].quant_fp_QTX = quants->v_quant_fp[qindex];
+  x->plane[2].round_fp_QTX = quants->v_round_fp[qindex];
+  x->plane[2].quant_shift_QTX = quants->v_quant_shift[qindex];
+  x->plane[2].zbin_QTX = quants->v_zbin[qindex];
+  x->plane[2].round_QTX = quants->v_round[qindex];
+  x->plane[2].dequant_QTX = cpi->dequants.v_dequant_QTX[qindex];
+  memcpy(&xd->plane[2].seg_qmatrix[segment_id], cm->gqmatrix[qmlevel][2],
+         sizeof(cm->gqmatrix[qmlevel][2]));
+  memcpy(&xd->plane[2].seg_iqmatrix[segment_id], cm->giqmatrix[qmlevel][2],
+         sizeof(cm->giqmatrix[qmlevel][2]));
+  xd->plane[2].dequant_Q3 = cpi->dequants.v_dequant_Q3[qindex];
+
+  x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
+  x->qindex = qindex;
+
+  set_error_per_bit(x, rdmult);
+
+  av1_initialize_me_consts(cpi, x, qindex);
+}
+
+// Initializes the plane quantizers of the encoder's main-thread macroblock,
+// using the segment id of that macroblock's first mode-info entry.
+void av1_frame_init_quantizer(AV1_COMP *cpi) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  const int segment_id = x->e_mbd.mi[0]->segment_id;
+
+  av1_init_plane_quantizers(cpi, x, segment_id);
+}
+
+// Sets the frame base qindex to `q` (but never below delta_q_present_flag),
+// resets every per-plane delta-q to 0 and recomputes the quantization matrix
+// levels for the three planes.
+//
+// The quantizer tables have to be rebuilt with av1_init_quantizer() if any
+// delta_q changes.
+void av1_set_quantizer(AV1_COMMON *cm, int q) {
+  cm->base_qindex = AOMMAX(cm->delta_q_present_flag, q);
+
+  cm->y_dc_delta_q = 0;
+  cm->u_dc_delta_q = 0;
+  cm->u_ac_delta_q = 0;
+  cm->v_dc_delta_q = 0;
+  cm->v_ac_delta_q = 0;
+
+  cm->qm_y = aom_get_qmlevel(cm->base_qindex, cm->min_qmlevel, cm->max_qmlevel);
+  cm->qm_u = aom_get_qmlevel(cm->base_qindex + cm->u_ac_delta_q,
+                             cm->min_qmlevel, cm->max_qmlevel);
+
+  // V shares the U level unless the sequence signals separate U/V delta-q.
+  if (cm->seq_params.separate_uv_delta_q) {
+    cm->qm_v = aom_get_qmlevel(cm->base_qindex + cm->v_ac_delta_q,
+                               cm->min_qmlevel, cm->max_qmlevel);
+  } else {
+    cm->qm_v = cm->qm_u;
+  }
+}
+
+// Table that converts 0-63 Q-range values passed in outside to the Qindex
+// range used internally.
+static const int quantizer_to_qindex[] = {
+  0,   4,   8,   12,  16,  20,  24,  28,  32,  36,  40,  44,  48,
+  52,  56,  60,  64,  68,  72,  76,  80,  84,  88,  92,  96,  100,
+  104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152,
+  156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 204,
+  208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 249, 255,
+};
+
+// Maps an external quantizer value to the internal qindex.
+// `quantizer` must lie in [0, 63]; this precondition is checked with an
+// assert because the value is used directly as a table index.
+int av1_quantizer_to_qindex(int quantizer) {
+  assert(quantizer >= 0 &&
+         quantizer < (int)(sizeof(quantizer_to_qindex) /
+                           sizeof(quantizer_to_qindex[0])));
+  return quantizer_to_qindex[quantizer];
+}
+
+// Maps an internal qindex back to the external 0-63 quantizer scale: returns
+// the first quantizer whose table qindex is at least `qindex`, or 63 when no
+// table entry is that large.
+int av1_qindex_to_quantizer(int qindex) {
+  int quantizer = 0;
+
+  while (quantizer < 64) {
+    if (quantizer_to_qindex[quantizer] >= qindex) return quantizer;
+    ++quantizer;
+  }
+  return 63;
+}
diff --git a/third_party/aom/av1/encoder/av1_quantize.h b/third_party/aom/av1/encoder/av1_quantize.h
new file mode 100644
index 0000000000..35af9a67ae
--- /dev/null
+++ b/third_party/aom/av1/encoder/av1_quantize.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+#define AOM_AV1_ENCODER_AV1_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/quant_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct QUANT_PARAM {  // Bundle handed to the quantize facade functions as `qparam`.
+ int log_scale;
+ TX_SIZE tx_size;
+ const qm_val_t *qmatrix;  // NOTE(review): presumably the forward quant matrix -- confirm
+ const qm_val_t *iqmatrix;  // NOTE(review): presumably the inverse (dequant) matrix -- confirm
+} QUANT_PARAM;
+
+typedef void (*AV1_QUANT_FACADE)(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);  // same parameter list as the av1_*_facade prototypes in this header
+
+// The QUANTS structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// All of its fields use the same coefficient shift/scaling at TX.
+typedef struct {
+ // 0: dc 1: ac 2-7: ac repeated to SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);
+
+ // TODO(jingning): in progress of re-working the quantization. will decide
+ // if we want to deprecate the current use of y_quant.
+ DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round_fp[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round_fp[QINDEX_RANGE][8]);
+
+ DECLARE_ALIGNED(16, int16_t, u_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_quant_shift[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_zbin[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, u_round[QINDEX_RANGE][8]);
+ DECLARE_ALIGNED(16, int16_t, v_round[QINDEX_RANGE][8]);
+} QUANTS;
+
+// The Dequants structure is used only for internal quantizer setup in
+// av1_quantize.c.
+// Fields are suffixed according to whether they are expressed in the same
+// coefficient shift/precision as TX (_QTX) or in a fixed Q3 format (_Q3).
+typedef struct {
+ DECLARE_ALIGNED(16, int16_t,
+ y_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ u_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t,
+ v_dequant_QTX[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, y_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, u_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+ DECLARE_ALIGNED(16, int16_t, v_dequant_Q3[QINDEX_RANGE][8]); // 8: SIMD width
+} Dequants;
+
+struct AV1_COMP;
+struct AV1Common;
+
+void av1_frame_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_init_plane_quantizers(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int segment_id);
+
+void av1_build_quantizer(aom_bit_depth_t bit_depth, int y_dc_delta_q,
+ int u_dc_delta_q, int u_ac_delta_q, int v_dc_delta_q,
+ int v_ac_delta_q, QUANTS *const quants,
+ Dequants *const deq);
+
+void av1_init_quantizer(struct AV1_COMP *cpi);
+
+void av1_set_quantizer(struct AV1Common *cm, int q);
+
+int av1_quantizer_to_qindex(int quantizer);
+
+int av1_qindex_to_quantizer(int qindex);
+
+void av1_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void av1_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc, const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_fp_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+void av1_highbd_quantize_dc_facade(const tran_low_t *coeff_ptr,
+ intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+ const SCAN_ORDER *sc,
+ const QUANT_PARAM *qparam);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_AV1_QUANTIZE_H_
diff --git a/third_party/aom/av1/encoder/bitstream.c b/third_party/aom/av1/encoder/bitstream.c
new file mode 100644
index 0000000000..2c4acdb021
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.c
@@ -0,0 +1,3999 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdio.h>
+
+#include "aom/aom_encoder.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem_ops.h"
+#include "aom_ports/system_state.h"
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#include "av1/common/cdef.h"
+#include "av1/common/cfl.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+#define ENC_MISMATCH_DEBUG 0
+
+static INLINE void write_uniform(aom_writer *w, int n, int v) {  // Writes v using l - 1 or l literal bits, l = get_unsigned_bits(n).
+ const int l = get_unsigned_bits(n);
+ const int m = (1 << l) - n;  // values below m get the short (l - 1 bit) code
+ if (l == 0) return;  // nothing is written in this case
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);  // l - 1 bit prefix, offset past the short codes
+ aom_write_literal(w, (v - m) & 1, 1);  // plus one extra bit: the low bit of (v - m)
+ }
+}
+
+static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+ MACROBLOCKD *xd,
+ const RestorationUnitInfo *rui,
+ aom_writer *const w, int plane,
+ FRAME_COUNTS *counts);
+
+static void write_intra_y_mode_kf(FRAME_CONTEXT *frame_ctx,  // Codes `mode` as one symbol out of INTRA_MODES.
+ const MB_MODE_INFO *mi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi,
+ PREDICTION_MODE mode, aom_writer *w) {
+ assert(!is_intrabc_block(mi));
+ (void)mi;  // mi is only read by the assert above
+ aom_write_symbol(w, mode, get_y_mode_cdf(frame_ctx, above_mi, left_mi),  // cdf is chosen from above_mi/left_mi
+ INTRA_MODES);
+}
+
+static void write_inter_mode(aom_writer *w, PREDICTION_MODE mode,  // Codes `mode` as a cascade of up to three binary symbols.
+ FRAME_CONTEXT *ec_ctx, const int16_t mode_ctx) {  // mode_ctx packs the newmv/globalmv/refmv contexts
+ const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+
+ aom_write_symbol(w, mode != NEWMV, ec_ctx->newmv_cdf[newmv_ctx], 2);  // 1st symbol: 0 means NEWMV
+
+ if (mode != NEWMV) {
+ const int16_t zeromv_ctx =
+ (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+ aom_write_symbol(w, mode != GLOBALMV, ec_ctx->zeromv_cdf[zeromv_ctx], 2);  // 2nd symbol: 0 means GLOBALMV
+
+ if (mode != GLOBALMV) {
+ int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+ aom_write_symbol(w, mode != NEARESTMV, ec_ctx->refmv_cdf[refmv_ctx], 2);  // 3rd symbol: 0 means NEARESTMV
+ }
+ }
+}
+
+static void write_drl_idx(FRAME_CONTEXT *ec_ctx, const MB_MODE_INFO *mbmi,  // Signals mbmi->ref_mv_idx as a run of binary symbols.
+ const MB_MODE_INFO_EXT *mbmi_ext, aom_writer *w) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+
+ assert(mbmi->ref_mv_idx < 3);
+
+ const int new_mv = mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV;
+ if (new_mv) {
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {  // candidate slots 0 and 1
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {  // a symbol is written only if a later candidate exists
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+
+ aom_write_symbol(w, mbmi->ref_mv_idx != idx, ec_ctx->drl_cdf[drl_ctx],
+ 2);
+ if (mbmi->ref_mv_idx == idx) return;  // 0 was written: this slot is the chosen one
+ }
+ }
+ return;
+ }
+
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ int idx;
+ // TODO(jingning): Temporary solution to compensate the NEARESTMV offset.
+ for (idx = 1; idx < 3; ++idx) {  // slots 1 and 2; ref_mv_idx is compared against idx - 1
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ aom_write_symbol(w, mbmi->ref_mv_idx != (idx - 1),
+ ec_ctx->drl_cdf[drl_ctx], 2);
+ if (mbmi->ref_mv_idx == (idx - 1)) return;
+ }
+ }
+ return;
+ }
+}
+
+static void write_inter_compound_mode(MACROBLOCKD *xd, aom_writer *w,  // Codes a compound inter mode as one symbol.
+ PREDICTION_MODE mode,
+ const int16_t mode_ctx) {
+ assert(is_inter_compound_mode(mode));
+ aom_write_symbol(w, INTER_COMPOUND_OFFSET(mode),  // symbol value is the mode rebased by INTER_COMPOUND_OFFSET
+ xd->tile_ctx->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_MODES);
+}
+
+static void write_tx_size_vartx(MACROBLOCKD *xd, const MB_MODE_INFO *mbmi,  // Recursively codes the transform split tree of one block.
+ TX_SIZE tx_size, int depth, int blk_row,  // blk_row/blk_col locate the current transform block
+ int blk_col, aom_writer *w) {
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const int max_blocks_high = max_block_high(xd, mbmi->sb_type, 0);
+ const int max_blocks_wide = max_block_wide(xd, mbmi->sb_type, 0);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;  // outside the limits returned above: nothing coded
+
+ if (depth == MAX_VARTX_DEPTH) {  // depth limit: no symbol is written, only the contexts are updated
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ return;
+ }
+
+ const int ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row,
+ mbmi->sb_type, tx_size);
+ const int txb_size_index =
+ av1_get_txb_size_index(mbmi->sb_type, blk_row, blk_col);
+ const int write_txfm_partition =  // true when tx_size already equals the stored size, i.e. no split
+ tx_size == mbmi->inter_tx_size[txb_size_index];
+ if (write_txfm_partition) {
+ aom_write_symbol(w, 0, ec_ctx->txfm_partition_cdf[ctx], 2);  // 0 = not split
+
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, tx_size, tx_size);
+ // TODO(yuec): set correct txfm partition update for qttx
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+
+ aom_write_symbol(w, 1, ec_ctx->txfm_partition_cdf[ctx], 2);  // 1 = split into sub_txs blocks
+
+ if (sub_txs == TX_4X4) {  // TX_4X4 children are not recursed into
+ txfm_partition_update(xd->above_txfm_context + blk_col,
+ xd->left_txfm_context + blk_row, sub_txs, tx_size);
+ return;
+ }
+
+ assert(bsw > 0 && bsh > 0);
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh)
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ int offsetr = blk_row + row;
+ int offsetc = blk_col + col;
+ write_tx_size_vartx(xd, mbmi, sub_txs, depth + 1, offsetr, offsetc, w);
+ }
+ }
+}
+
+static void write_selected_tx_size(const MACROBLOCKD *xd, aom_writer *w) {  // Codes the tx size of xd->mi[0] as a depth symbol.
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ if (block_signals_txsize(bsize)) {  // otherwise nothing is written
+ const TX_SIZE tx_size = mbmi->tx_size;
+ const int tx_size_ctx = get_tx_size_context(xd);
+ const int depth = tx_size_to_depth(tx_size, bsize);
+ const int max_depths = bsize_to_max_depth(bsize);
+ const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+
+ assert(depth >= 0 && depth <= max_depths);
+ assert(!is_inter_block(mbmi));
+ assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed(xd, mbmi)));
+
+ aom_write_symbol(w, depth, ec_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+ max_depths + 1);  // alphabet size is max_depths + 1 (depths 0..max_depths)
+ }
+}
+
+static int write_skip(const AV1_COMMON *cm, const MACROBLOCKD *xd,  // Codes mi->skip; returns the effective skip value.
+ int segment_id, const MB_MODE_INFO *mi, aom_writer *w) {
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 1;  // segment forces skip: nothing is written
+ } else {
+ const int skip = mi->skip;
+ const int ctx = av1_get_skip_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, skip, ec_ctx->skip_cdfs[ctx], 2);
+ return skip;
+ }
+}
+
+static int write_skip_mode(const AV1_COMMON *cm, const MACROBLOCKD *xd,  // Codes mi->skip_mode when it is not implied; returns its value.
+ int segment_id, const MB_MODE_INFO *mi,
+ aom_writer *w) {
+ if (!cm->skip_mode_flag) return 0;  // skip mode is off for the frame: nothing written
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+ return 0;
+ }
+ const int skip_mode = mi->skip_mode;
+ if (!is_comp_ref_allowed(mi->sb_type)) {
+ assert(!skip_mode);
+ return 0;
+ }
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ // These features imply single-reference mode, while skip mode implies
+ // compound reference. Hence, the two are mutually exclusive.
+ // In other words, skip_mode is implicitly 0 here.
+ assert(!skip_mode);
+ return 0;
+ }
+ const int ctx = av1_get_skip_mode_context(xd);
+ aom_write_symbol(w, skip_mode, xd->tile_ctx->skip_mode_cdfs[ctx], 2);  // the only path that writes a symbol
+ return skip_mode;
+}
+
+static void write_is_inter(const AV1_COMMON *cm, const MACROBLOCKD *xd,  // Codes the intra/inter flag unless the segment implies it.
+ int segment_id, aom_writer *w, const int is_inter) {
+ if (!segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {  // with SEG_LVL_REF_FRAME nothing is written
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(is_inter);  // GLOBALMV segments are expected to be inter: nothing written
+ return;
+ }
+ const int ctx = av1_get_intra_inter_context(xd);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, is_inter, ec_ctx->intra_inter_cdf[ctx], 2);
+ }
+}
+
+static void write_motion_mode(const AV1_COMMON *cm, MACROBLOCKD *xd,  // Codes mbmi->motion_mode with an alphabet sized by what is allowed.
+ const MB_MODE_INFO *mbmi, aom_writer *w) {
+ MOTION_MODE last_motion_mode_allowed =
+ cm->switchable_motion_mode
+ ? motion_mode_allowed(cm->global_motion, xd, mbmi,
+ cm->allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ assert(mbmi->motion_mode <= last_motion_mode_allowed);
+ switch (last_motion_mode_allowed) {
+ case SIMPLE_TRANSLATION: break;  // only one choice: nothing written
+ case OBMC_CAUSAL:
+ aom_write_symbol(w, mbmi->motion_mode == OBMC_CAUSAL,  // binary: OBMC or not
+ xd->tile_ctx->obmc_cdf[mbmi->sb_type], 2);
+ break;
+ default:
+ aom_write_symbol(w, mbmi->motion_mode,  // full MOTION_MODES alphabet
+ xd->tile_ctx->motion_mode_cdf[mbmi->sb_type],
+ MOTION_MODES);
+ }
+}
+
+static void write_delta_qindex(const MACROBLOCKD *xd, int delta_qindex,  // Codes a signed delta: magnitude symbol, optional escape, sign.
+ aom_writer *w) {
+ int sign = delta_qindex < 0;
+ int abs = sign ? -delta_qindex : delta_qindex;  // local named abs: shadows abs() inside this function
+ int rem_bits, thr;
+ int smallval = abs < DELTA_Q_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ aom_write_symbol(w, AOMMIN(abs, DELTA_Q_SMALL), ec_ctx->delta_q_cdf,  // magnitude, capped at DELTA_Q_SMALL (the escape value)
+ DELTA_Q_PROBS + 1);
+
+ if (!smallval) {  // escape: bit count first, then the remainder
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);  // rem_bits - 1 in 3 bits
+ aom_write_literal(w, abs - thr, rem_bits);  // abs - thr in rem_bits bits
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);  // sign is sent only for non-zero deltas
+ }
+}
+
+static void write_delta_lflevel(const AV1_COMMON *cm, const MACROBLOCKD *xd,  // Codes a signed loop-filter delta: magnitude, escape, sign.
+ int lf_id, int delta_lflevel, aom_writer *w) {  // lf_id selects the cdf only when cm->delta_lf_multi is set
+ int sign = delta_lflevel < 0;
+ int abs = sign ? -delta_lflevel : delta_lflevel;  // local named abs: shadows abs() inside this function
+ int rem_bits, thr;
+ int smallval = abs < DELTA_LF_SMALL ? 1 : 0;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (cm->delta_lf_multi) {
+ assert(lf_id >= 0 && lf_id < (av1_num_planes(cm) > 1 ? FRAME_LF_COUNT
+ : FRAME_LF_COUNT - 2));
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL),
+ ec_ctx->delta_lf_multi_cdf[lf_id], DELTA_LF_PROBS + 1);
+ } else {
+ aom_write_symbol(w, AOMMIN(abs, DELTA_LF_SMALL), ec_ctx->delta_lf_cdf,  // single shared cdf
+ DELTA_LF_PROBS + 1);
+ }
+
+ if (!smallval) {  // escape: bit count first, then the remainder
+ rem_bits = get_msb(abs - 1);
+ thr = (1 << rem_bits) + 1;
+ aom_write_literal(w, rem_bits - 1, 3);
+ aom_write_literal(w, abs - thr, rem_bits);
+ }
+ if (abs > 0) {
+ aom_write_bit(w, sign);  // sign is sent only for non-zero deltas
+ }
+}
+
+static void pack_map_tokens(aom_writer *w, const TOKENEXTRA **tp, int n,  // Writes `num` tokens from *tp over an alphabet of n values.
+ int num) {
+ const TOKENEXTRA *p = *tp;
+ write_uniform(w, n, p->token); // The first color index.
+ ++p;
+ --num;  // the first token was handled above
+ for (int i = 0; i < num; ++i) {
+ aom_write_symbol(w, p->token, p->color_map_cdf, n);  // each remaining token carries its own cdf
+ ++p;
+ }
+ *tp = p;  // hand the advanced cursor back to the caller
+}
+
+static void pack_txb_tokens(aom_writer *w, AV1_COMMON *cm, MACROBLOCK *const x,  // Writes the coefficients of one transform block, recursing on splits.
+ const TOKENEXTRA **tp,  // tp, tok_end, bit_depth: only forwarded to the recursive call here
+ const TOKENEXTRA *const tok_end, MACROBLOCKD *xd,
+ MB_MODE_INFO *mbmi, int plane,
+ BLOCK_SIZE plane_bsize, aom_bit_depth_t bit_depth,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size, TOKEN_STATS *token_stats) {
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;
+
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const TX_SIZE plane_tx_size =
+ plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+ pd->subsampling_y)
+ : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+ blk_col)];
+
+ if (tx_size == plane_tx_size || plane) {  // non-zero planes never take the split branch
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const uint16_t eob = x->mbmi_ext->eobs[plane][block];
+ TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+ x->mbmi_ext->dc_sign_ctx[plane][block] };
+ av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff,
+ eob, &txb_ctx);
+#if CONFIG_RD_DEBUG
+ TOKEN_STATS tmp_token_stats;  // NOTE(review): not passed to any writer in this block; cost looks like the init value -- confirm
+ init_token_stats(&tmp_token_stats);
+ token_stats->txb_coeff_cost_map[blk_row][blk_col] = tmp_token_stats.cost;
+ token_stats->cost += tmp_token_stats.cost;
+#endif
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;  // `block` advance per sub-block
+
+ assert(bsw > 0 && bsh > 0);
+
+ for (int r = 0; r < tx_size_high_unit[tx_size]; r += bsh) {
+ for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw) {
+ const int offsetr = blk_row + r;
+ const int offsetc = blk_col + c;
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;  // skipped sub-blocks do not advance `block`
+ pack_txb_tokens(w, cm, x, tp, tok_end, xd, mbmi, plane, plane_bsize,
+ bit_depth, block, offsetr, offsetc, sub_txs,
+ token_stats);
+ block += step;
+ }
+ }
+ }
+}
+
+static INLINE void set_spatial_segment_id(const AV1_COMMON *const cm,  // Fills the block's area of segment_ids with segment_id.
+ uint8_t *segment_ids,  // map with a row stride of cm->mi_cols
+ BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int segment_id) {
+ const int mi_offset = mi_row * cm->mi_cols + mi_col;
+ const int bw = mi_size_wide[bsize];
+ const int bh = mi_size_high[bsize];
+ const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);  // clip the block to the right frame edge
+ const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);  // clip the block to the bottom frame edge
+ int x, y;
+
+ for (y = 0; y < ymis; ++y)
+ for (x = 0; x < xmis; ++x)
+ segment_ids[mi_offset + y * cm->mi_cols + x] = segment_id;
+}
+
+int av1_neg_interleave(int x, int ref, int max) {  // Maps x to a code index relative to ref; x must be < max.
+ assert(x < max);
+ const int diff = x - ref;
+ if (!ref) return x;  // ref == 0: identity
+ if (ref >= (max - 1)) return -x + max - 1;  // ref at the top of the range: reversed order
+ if (2 * ref < max) {  // ref in the lower half of the range
+ if (abs(diff) <= ref) {  // within reach of ref on both sides: interleave +/- diffs
+ if (diff > 0)
+ return (diff << 1) - 1;  // positive diffs take the odd codes 1, 3, 5, ...
+ else
+ return ((-diff) << 1);  // zero and negative diffs take the even codes 0, 2, 4, ...
+ }
+ return x;
+ } else {  // ref in the upper half of the range
+ if (abs(diff) < (max - ref)) {
+ if (diff > 0)
+ return (diff << 1) - 1;
+ else
+ return ((-diff) << 1);
+ }
+ return (max - x) - 1;
+ }
+}
+
+static void write_segment_id(AV1_COMP *cpi, const MB_MODE_INFO *const mbmi,  // Codes the block's segment id against its spatial prediction.
+ aom_writer *w, const struct segmentation *seg,
+ struct segmentation_probs *segp, int mi_row,
+ int mi_col, int skip) {
+ if (!seg->enabled || !seg->update_map) return;  // no map update: nothing written
+
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int cdf_num;
+ const int pred = av1_get_spatial_seg_pred(cm, xd, mi_row, mi_col, &cdf_num);
+
+ if (skip) {  // skipped block: the id is not coded, the prediction is stored instead
+ // Still need to transmit tx size for intra blocks even if skip is
+ // true. Changing segment_id may make the tx size become invalid, e.g.
+ // changing from lossless to lossy.
+ assert(is_inter_block(mbmi) || !cpi->has_lossless_segment);
+
+ set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,
+ mi_col, pred);
+ set_spatial_segment_id(cm, cpi->segmentation_map, mbmi->sb_type, mi_row,
+ mi_col, pred);
+ /* mbmi is read only but we need to update segment_id */
+ ((MB_MODE_INFO *)mbmi)->segment_id = pred;  // deliberately casts away const
+ return;
+ }
+
+ const int coded_id =
+ av1_neg_interleave(mbmi->segment_id, pred, seg->last_active_segid + 1);
+ aom_cdf_prob *pred_cdf = segp->spatial_pred_seg_cdf[cdf_num];
+ aom_write_symbol(w, coded_id, pred_cdf, MAX_SEGMENTS);
+ set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type, mi_row,  // this path updates only current_frame_seg_map
+ mi_col, mbmi->segment_id);
+}
+
+#define WRITE_REF_BIT(bname, pname) \
+ aom_write_symbol(w, bname, av1_get_pred_cdf_##pname(xd), 2)  // expects `w` and `xd` to be in scope at the call site
+
+// This function encodes the reference frame(s) of xd->mi[0].
+static void write_ref_frames(const AV1_COMMON *cm, const MACROBLOCKD *xd,
+ aom_writer *w) {
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int is_compound = has_second_ref(mbmi);
+ const int segment_id = mbmi->segment_id;
+
+ // When the segment fixes the reference frame (SEG_LVL_REF_FRAME), nothing
+ // is written; the asserts only check that mbmi agrees with the segment data.
+ if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] ==
+ get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME));
+ } else if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) ||
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_GLOBALMV)) {
+ assert(!is_compound);
+ assert(mbmi->ref_frame[0] == LAST_FRAME);  // nothing is written on this path either
+ } else {
+ // does the feature use compound prediction or not
+ // (if not specified at the frame/segment level)
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ if (is_comp_ref_allowed(mbmi->sb_type))
+ aom_write_symbol(w, is_compound, av1_get_reference_mode_cdf(xd), 2);
+ } else {
+ assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
+ }
+
+ if (is_compound) {
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ aom_write_symbol(w, comp_ref_type, av1_get_comp_reference_type_cdf(xd),
+ 2);
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = mbmi->ref_frame[0] == BWDREF_FRAME;  // 1: pair starts at BWDREF, 0: pair starts at LAST
+ WRITE_REF_BIT(bit, uni_comp_ref_p);
+
+ if (!bit) {
+ assert(mbmi->ref_frame[0] == LAST_FRAME);
+ const int bit1 = mbmi->ref_frame[1] == LAST3_FRAME ||
+ mbmi->ref_frame[1] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit1, uni_comp_ref_p1);
+ if (bit1) {
+ const int bit2 = mbmi->ref_frame[1] == GOLDEN_FRAME;  // GOLDEN vs LAST3
+ WRITE_REF_BIT(bit2, uni_comp_ref_p2);
+ }
+ } else {
+ assert(mbmi->ref_frame[1] == ALTREF_FRAME);
+ }
+
+ return;
+ }
+
+ assert(comp_ref_type == BIDIR_COMP_REFERENCE);
+
+ const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||  // first reference: {LAST3, GOLDEN} vs the rest
+ mbmi->ref_frame[0] == LAST3_FRAME);
+ WRITE_REF_BIT(bit, comp_ref_p);
+
+ if (!bit) {
+ const int bit1 = mbmi->ref_frame[0] == LAST2_FRAME;
+ WRITE_REF_BIT(bit1, comp_ref_p1);
+ } else {
+ const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+ WRITE_REF_BIT(bit2, comp_ref_p2);
+ }
+
+ const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;  // second reference: ALTREF vs the rest
+ WRITE_REF_BIT(bit_bwd, comp_bwdref_p);
+
+ if (!bit_bwd) {
+ WRITE_REF_BIT(mbmi->ref_frame[1] == ALTREF2_FRAME, comp_bwdref_p1);
+ }
+
+ } else {
+ const int bit0 = (mbmi->ref_frame[0] <= ALTREF_FRAME &&  // single reference: BWDREF..ALTREF range vs the rest
+ mbmi->ref_frame[0] >= BWDREF_FRAME);
+ WRITE_REF_BIT(bit0, single_ref_p1);
+
+ if (bit0) {
+ const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+ WRITE_REF_BIT(bit1, single_ref_p2);
+
+ if (!bit1) {
+ WRITE_REF_BIT(mbmi->ref_frame[0] == ALTREF2_FRAME, single_ref_p6);
+ }
+ } else {
+ const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+ mbmi->ref_frame[0] == GOLDEN_FRAME);
+ WRITE_REF_BIT(bit2, single_ref_p3);
+
+ if (!bit2) {
+ const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;  // 0: LAST, 1: anything else left in this branch
+ WRITE_REF_BIT(bit3, single_ref_p4);
+ } else {
+ const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;  // 0: LAST3, 1: GOLDEN
+ WRITE_REF_BIT(bit4, single_ref_p5);
+ }
+ }
+ }
+ }
+}
+
+static void write_filter_intra_mode_info(const AV1_COMMON *cm,  // Codes the use_filter_intra flag and, if set, the filter mode.
+ const MACROBLOCKD *xd,
+ const MB_MODE_INFO *const mbmi,
+ aom_writer *w) {
+ if (av1_filter_intra_allowed(cm, mbmi)) {  // otherwise nothing is written
+ aom_write_symbol(w, mbmi->filter_intra_mode_info.use_filter_intra,
+ xd->tile_ctx->filter_intra_cdfs[mbmi->sb_type], 2);
+ if (mbmi->filter_intra_mode_info.use_filter_intra) {
+ const FILTER_INTRA_MODE mode =
+ mbmi->filter_intra_mode_info.filter_intra_mode;
+ aom_write_symbol(w, mode, xd->tile_ctx->filter_intra_mode_cdf,
+ FILTER_INTRA_MODES);
+ }
+ }
+}
+
+static void write_angle_delta(aom_writer *w, int angle_delta,  // Codes angle_delta shifted by MAX_ANGLE_DELTA as one symbol.
+ aom_cdf_prob *cdf) {
+ aom_write_symbol(w, angle_delta + MAX_ANGLE_DELTA, cdf,  // alphabet size is 2 * MAX_ANGLE_DELTA + 1
+ 2 * MAX_ANGLE_DELTA + 1);
+}
+
+static void write_mb_interp_filter(AV1_COMP *cpi, const MACROBLOCKD *xd,  // Codes the block's interpolation filter(s) when they are switchable.
+ aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+ if (!av1_is_interp_needed(xd)) {  // nothing is written; only checks the stored filters are the expected value
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter)));
+ return;
+ }
+ if (cm->interp_filter == SWITCHABLE) {
+ int dir;
+ for (dir = 0; dir < 2; ++dir) {
+ const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+ InterpFilter filter =
+ av1_extract_interp_filter(mbmi->interp_filters, dir);
+ aom_write_symbol(w, filter, ec_ctx->switchable_interp_cdf[ctx],
+ SWITCHABLE_FILTERS);
+ ++cpi->interp_filter_selected[0][filter];  // side effect: counts the coded filter in cpi
+ if (cm->seq_params.enable_dual_filter == 0) return;  // without dual filter only dir 0 is coded
+ }
+ }
+}
+
+// Transmit color values with delta encoding. Write the first value as
+// literal, and the deltas between each value and the previous one. "min_val" is
+// the smallest possible value of the deltas.
+static void delta_encode_palette_colors(const int *colors, int num,
+ int bit_depth, int min_val,
+ aom_writer *w) {
+ if (num <= 0) return;  // empty list: nothing written
+ assert(colors[0] < (1 << bit_depth));
+ aom_write_literal(w, colors[0], bit_depth);
+ if (num == 1) return;  // a single color has no deltas and no bit-count field
+ int max_delta = 0;
+ int deltas[PALETTE_MAX_SIZE];
+ memset(deltas, 0, sizeof(deltas));
+ for (int i = 1; i < num; ++i) {
+ assert(colors[i] < (1 << bit_depth));
+ const int delta = colors[i] - colors[i - 1];
+ deltas[i - 1] = delta;
+ assert(delta >= min_val);
+ if (delta > max_delta) max_delta = delta;
+ }
+ const int min_bits = bit_depth - 3;
+ int bits = AOMMAX(av1_ceil_log2(max_delta + 1 - min_val), min_bits);
+ assert(bits <= bit_depth);
+ int range = (1 << bit_depth) - colors[0] - min_val;
+ aom_write_literal(w, bits - min_bits, 2);  // delta width is sent as an offset from min_bits, in 2 bits
+ for (int i = 0; i < num - 1; ++i) {
+ aom_write_literal(w, deltas[i] - min_val, bits);
+ range -= deltas[i];  // range shrinks by each delta already sent
+ bits = AOMMIN(bits, av1_ceil_log2(range));  // later deltas may be sent with fewer bits
+ }
+}
+
+// Transmit luma palette color values. First signal if each color in the color
+// cache is used. Those colors that are not in the cache are transmitted with
+// delta encoding.
+static void write_palette_colors_y(const MACROBLOCKD *const xd,
+ const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[0];
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache =
+ av1_index_color_cache(color_cache, n_cache, pmi->palette_colors, n,
+ cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {  // stops early once n cache hits have been signalled
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);  // one flag per cache entry visited
+ n_in_cache += found;
+ }
+ assert(n_in_cache + n_out_cache == n);
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 1, w);  // min delta of 1 is used for luma
+}
+
+// Write chroma palette color values. U channel is handled similarly to the luma
+// channel. For v channel, either use delta encoding or transmit raw values
+// directly, whichever costs less.
+static void write_palette_colors_uv(const MACROBLOCKD *const xd,
+ const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, aom_writer *w) {
+ const int n = pmi->palette_size[1];
+ const uint16_t *colors_u = pmi->palette_colors + PALETTE_MAX_SIZE;  // U colors start PALETTE_MAX_SIZE entries in
+ const uint16_t *colors_v = pmi->palette_colors + 2 * PALETTE_MAX_SIZE;  // V colors start 2 * PALETTE_MAX_SIZE entries in
+ // U channel colors.
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+ int out_cache_colors[PALETTE_MAX_SIZE];
+ uint8_t cache_color_found[2 * PALETTE_MAX_SIZE];
+ const int n_out_cache = av1_index_color_cache(
+ color_cache, n_cache, colors_u, n, cache_color_found, out_cache_colors);
+ int n_in_cache = 0;
+ for (int i = 0; i < n_cache && n_in_cache < n; ++i) {
+ const int found = cache_color_found[i];
+ aom_write_bit(w, found);
+ n_in_cache += found;
+ }
+ delta_encode_palette_colors(out_cache_colors, n_out_cache, bit_depth, 0, w);  // min delta of 0 is used for U
+
+ // V channel colors. Don't use color cache as the colors are not sorted.
+ const int max_val = 1 << bit_depth;
+ int zero_count = 0, min_bits_v = 0;
+ int bits_v =
+ av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+ const int rate_using_delta =
+ 2 + bit_depth + (bits_v + 1) * (n - 1) - zero_count;  // width field + first color + (delta, sign) pairs, no sign for zero deltas
+ const int rate_using_raw = bit_depth * n;
+ if (rate_using_delta < rate_using_raw) { // delta encoding
+ assert(colors_v[0] < (1 << bit_depth));
+ aom_write_bit(w, 1);  // flag: 1 = delta coded
+ aom_write_literal(w, bits_v - min_bits_v, 2);
+ aom_write_literal(w, colors_v[0], bit_depth);
+ for (int i = 1; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ if (colors_v[i] == colors_v[i - 1]) { // No need to signal sign bit.
+ aom_write_literal(w, 0, bits_v);
+ continue;
+ }
+ const int delta = abs((int)colors_v[i] - colors_v[i - 1]);
+ const int sign_bit = colors_v[i] < colors_v[i - 1];
+ if (delta <= max_val - delta) {
+ aom_write_literal(w, delta, bits_v);
+ aom_write_bit(w, sign_bit);
+ } else {  // the complement max_val - delta is smaller: send it with the sign flipped
+ aom_write_literal(w, max_val - delta, bits_v);
+ aom_write_bit(w, !sign_bit);
+ }
+ }
+ } else { // Transmit raw values.
+ aom_write_bit(w, 0);  // flag: 0 = raw values
+ for (int i = 0; i < n; ++i) {
+ assert(colors_v[i] < (1 << bit_depth));
+ aom_write_literal(w, colors_v[i], bit_depth);
+ }
+ }
+}
+
+static void write_palette_mode_info(const AV1_COMMON *cm, const MACROBLOCKD *xd,  // Codes palette use, size and colors for luma, then chroma.
+ const MB_MODE_INFO *const mbmi, int mi_row,
+ int mi_col, aom_writer *w) {
+ const int num_planes = av1_num_planes(cm);
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ assert(av1_allow_palette(cm->allow_screen_content_tools, bsize));
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+
+ if (mbmi->mode == DC_PRED) {  // luma palette info is coded only for DC_PRED blocks
+ const int n = pmi->palette_size[0];
+ const int palette_y_mode_ctx = av1_get_palette_mode_ctx(xd);
+ aom_write_symbol(
+ w, n > 0,
+ xd->tile_ctx->palette_y_mode_cdf[bsize_ctx][palette_y_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,  // size is coded relative to PALETTE_MIN_SIZE
+ xd->tile_ctx->palette_y_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_y(xd, pmi, cm->seq_params.bit_depth, w);
+ }
+ }
+
+ const int uv_dc_pred =  // chroma needs >1 plane, UV_DC_PRED, and a chroma-reference block
+ num_planes > 1 && mbmi->uv_mode == UV_DC_PRED &&
+ is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+ if (uv_dc_pred) {
+ const int n = pmi->palette_size[1];
+ const int palette_uv_mode_ctx = (pmi->palette_size[0] > 0);  // context: whether luma uses a palette
+ aom_write_symbol(w, n > 0,
+ xd->tile_ctx->palette_uv_mode_cdf[palette_uv_mode_ctx], 2);
+ if (n > 0) {
+ aom_write_symbol(w, n - PALETTE_MIN_SIZE,
+ xd->tile_ctx->palette_uv_size_cdf[bsize_ctx],
+ PALETTE_SIZES);
+ write_palette_colors_uv(xd, pmi, cm->seq_params.bit_depth, w);
+ }
+ }
+}
+
+ // Writes the transform type of a luma transform block. Nothing is written
+ // for chroma planes, when the applicable transform set holds a single type,
+ // when the effective qindex is zero, or when the block is skipped (either
+ // through its own skip flag or through an active segment-level skip).
+ void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+                        int blk_row, int blk_col, int plane, TX_SIZE tx_size,
+                        aom_writer *w) {
+   // Only y plane's tx_type is transmitted
+   if (plane > 0) return;
+
+   MB_MODE_INFO *mbmi = xd->mi[0];
+   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+   const int is_inter = is_inter_block(mbmi);
+   const TX_TYPE tx_type =
+       av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
+                       cm->reduced_tx_set_used);
+
+   if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) <= 1)
+     return;
+
+   // With segmentation enabled the per-segment qindex decides, otherwise the
+   // frame base qindex does.
+   const int nonzero_q = cm->seg.enabled
+                             ? (xd->qindex[mbmi->segment_id] > 0)
+                             : (cm->base_qindex > 0);
+   if (!nonzero_q) return;
+   if (mbmi->skip) return;
+   if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) return;
+
+   const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+   const TxSetType tx_set_type =
+       av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+   const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+   // eset == 0 should correspond to a set with only DCT_DCT and there
+   // is no need to send the tx_type
+   assert(eset > 0);
+   assert(av1_ext_tx_used[tx_set_type][tx_type]);
+
+   const int symbol = av1_ext_tx_ind[tx_set_type][tx_type];
+   const int num_symbols = av1_num_ext_tx_set[tx_set_type];
+   if (is_inter) {
+     aom_write_symbol(w, symbol,
+                      ec_ctx->inter_ext_tx_cdf[eset][square_tx_size],
+                      num_symbols);
+     return;
+   }
+
+   // Intra blocks condition the CDF on the prediction direction; filter-intra
+   // blocks map their filter mode to a direction first.
+   const PREDICTION_MODE intra_dir =
+       mbmi->filter_intra_mode_info.use_filter_intra
+           ? fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode]
+           : mbmi->mode;
+   aom_write_symbol(w, symbol,
+                    ec_ctx->intra_ext_tx_cdf[eset][square_tx_size][intra_dir],
+                    num_symbols);
+ }
+
+ // Writes the luma intra prediction mode with the y_mode CDF selected by the
+ // size group of `bsize`.
+ static void write_intra_y_mode_nonkf(FRAME_CONTEXT *frame_ctx, BLOCK_SIZE bsize,
+                                      PREDICTION_MODE mode, aom_writer *w) {
+   const int size_group = size_group_lookup[bsize];
+   aom_cdf_prob *const y_cdf = frame_ctx->y_mode_cdf[size_group];
+   aom_write_symbol(w, mode, y_cdf, INTRA_MODES);
+ }
+
+ // Writes the chroma intra prediction mode. The CDF is selected by whether
+ // CfL is allowed and by the luma mode; the alphabet has one symbol fewer
+ // when CfL is not allowed.
+ static void write_intra_uv_mode(FRAME_CONTEXT *frame_ctx,
+                                 UV_PREDICTION_MODE uv_mode,
+                                 PREDICTION_MODE y_mode,
+                                 CFL_ALLOWED_TYPE cfl_allowed, aom_writer *w) {
+   const int num_symbols = cfl_allowed ? UV_INTRA_MODES : UV_INTRA_MODES - 1;
+   aom_cdf_prob *const uv_cdf = frame_ctx->uv_mode_cdf[cfl_allowed][y_mode];
+   aom_write_symbol(w, uv_mode, uv_cdf, num_symbols);
+ }
+
+ // Writes the CfL joint sign symbol, then the U and V alpha magnitudes. A
+ // magnitude is written only for a plane whose sign is not CFL_SIGN_ZERO.
+ static void write_cfl_alphas(FRAME_CONTEXT *const ec_ctx, int idx,
+                              int joint_sign, aom_writer *w) {
+   const int u_is_nonzero = CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO;
+   const int v_is_nonzero = CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO;
+
+   aom_write_symbol(w, joint_sign, ec_ctx->cfl_sign_cdf, CFL_JOINT_SIGNS);
+
+   if (u_is_nonzero) {
+     aom_write_symbol(w, CFL_IDX_U(idx),
+                      ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)],
+                      CFL_ALPHABET_SIZE);
+   }
+   if (v_is_nonzero) {
+     aom_write_symbol(w, CFL_IDX_V(idx),
+                      ec_ctx->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)],
+                      CFL_ALPHABET_SIZE);
+   }
+ }
+
+ // Writes the CDEF strength index for the 64x64 unit containing the block,
+ // at most once per unit and only at the first non-skip block of that unit.
+ // When the frame is coded lossless or allows intrabc, nothing is written and
+ // the frame-level CDEF parameters in `cm` are reset instead.
+ static void write_cdef(AV1_COMMON *cm, MACROBLOCKD *const xd, aom_writer *w,
+                        int skip, int mi_col, int mi_row) {
+   if (cm->coded_lossless || cm->allow_intrabc) {
+     // Initialize to indicate no CDEF for safety.
+     cm->nb_cdef_strengths = 1;
+     cm->cdef_bits = 0;
+     cm->cdef_strengths[0] = 0;
+     cm->cdef_uv_strengths[0] = 0;
+     return;
+   }
+
+   // Forget all four presets when starting a new superblock.
+   const int sb_mask = cm->seq_params.mib_size - 1;
+   if ((mi_row & sb_mask) == 0 && (mi_col & sb_mask) == 0) {
+     for (int i = 0; i < 4; ++i) xd->cdef_preset[i] = -1;
+   }
+
+   // Number of mode-info units spanned by 64 pixels.
+   const int unit_mi = 1 << (6 - MI_SIZE_LOG2);
+   // A 128x128 superblock holds four 64x64 units; otherwise there is one.
+   const int index = cm->seq_params.sb_size == BLOCK_128X128
+                         ? !!(mi_col & unit_mi) + 2 * !!(mi_row & unit_mi)
+                         : 0;
+
+   // Emit CDEF param at first non-skip coding block
+   if (xd->cdef_preset[index] != -1 || skip) return;
+
+   // The strength is stored on the mode info at the unit's top-left corner.
+   const int unit_row = mi_row & ~(unit_mi - 1);
+   const int unit_col = mi_col & ~(unit_mi - 1);
+   const MB_MODE_INFO *const unit_mbmi =
+       cm->mi_grid_visible[unit_row * cm->mi_stride + unit_col];
+   aom_write_literal(w, unit_mbmi->cdef_strength, cm->cdef_bits);
+   xd->cdef_preset[index] = unit_mbmi->cdef_strength;
+ }
+
+ // Writes the segment id of a block in an inter frame. The function is
+ // called twice per block, once with preskip == 1 and once with preskip == 0;
+ // it only acts in the call whose `preskip` matches seg->segid_preskip, and
+ // only when the segmentation map is being updated.
+ static void write_inter_segment_id(AV1_COMP *cpi, aom_writer *w,
+                                    const struct segmentation *const seg,
+                                    struct segmentation_probs *const segp,
+                                    int mi_row, int mi_col, int skip,
+                                    int preskip) {
+   if (!seg->update_map) return;
+
+   // Act only in the pass that matches the stream's preskip setting.
+   if (!preskip != !seg->segid_preskip) return;
+
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+   if (!preskip && skip) {
+     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 1);
+     if (seg->temporal_update) ((MB_MODE_INFO *)mbmi)->seg_id_predicted = 0;
+     return;
+   }
+
+   if (!seg->temporal_update) {
+     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+     return;
+   }
+
+   // Temporal prediction: signal whether the predicted id is used, and code
+   // the id explicitly only when it is not.
+   const int pred_flag = mbmi->seg_id_predicted;
+   aom_write_symbol(w, pred_flag, av1_get_pred_cdf_seg_id(segp, xd), 2);
+   if (pred_flag) {
+     set_spatial_segment_id(cm, cm->current_frame_seg_map, mbmi->sb_type,
+                            mi_row, mi_col, mbmi->segment_id);
+   } else {
+     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+   }
+ }
+
+ // Writes the delta-q index when delta q is enabled for the frame, followed
+ // by the delta loop-filter level(s) when those are enabled too. Values are
+ // only written for the block at the top-left corner of a superblock, and not
+ // for a skipped block that covers the whole superblock. The running values
+ // kept in `xd` are updated to the block's values after writing.
+ static void write_delta_q_params(AV1_COMP *cpi, const int mi_row,
+                                  const int mi_col, int skip, aom_writer *w) {
+   AV1_COMMON *const cm = &cpi->common;
+   if (!cm->delta_q_present_flag) return;
+
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+   const int sb_mask = cm->seq_params.mib_size - 1;
+   if (((mi_row | mi_col) & sb_mask) != 0) return;
+   if (mbmi->sb_type == cm->seq_params.sb_size && skip != 0) return;
+
+   assert(mbmi->current_qindex > 0);
+   const int q_step =
+       (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+   write_delta_qindex(xd, q_step, w);
+   xd->current_qindex = mbmi->current_qindex;
+
+   if (!cm->delta_lf_present_flag) return;
+
+   if (!cm->delta_lf_multi) {
+     // A single delta applied relative to the base filter level.
+     const int lf_step =
+         (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+         cm->delta_lf_res;
+     write_delta_lflevel(cm, xd, -1, lf_step, w);
+     xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+     return;
+   }
+
+   // One delta per loop-filter id; two fewer ids when only one plane is coded.
+   const int frame_lf_count =
+       av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+   for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+     const int lf_step =
+         (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res;
+     write_delta_lflevel(cm, xd, lf_id, lf_step, w);
+     xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+   }
+ }
+
+ // Writes the intra prediction syntax of a block: luma mode (key-frame or
+ // non-key-frame coding, chosen by `is_keyframe`), luma angle delta, chroma
+ // mode with optional CfL alphas and chroma angle delta, palette info and
+ // filter-intra info, in that order.
+ static void write_intra_prediction_modes(AV1_COMP *cpi, const int mi_row,
+                                          const int mi_col, int is_keyframe,
+                                          aom_writer *w) {
+   const AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   const PREDICTION_MODE y_mode = mbmi->mode;
+   const BLOCK_SIZE bsize = mbmi->sb_type;
+   const int angle_delta_allowed = av1_use_angle_delta(bsize);
+
+   // Y mode.
+   if (is_keyframe) {
+     write_intra_y_mode_kf(ec_ctx, mbmi, xd->above_mbmi, xd->left_mbmi, y_mode,
+                           w);
+   } else {
+     write_intra_y_mode_nonkf(ec_ctx, bsize, y_mode, w);
+   }
+
+   // Y angle delta.
+   if (angle_delta_allowed && av1_is_directional_mode(y_mode)) {
+     write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_Y],
+                       ec_ctx->angle_delta_cdf[y_mode - V_PRED]);
+   }
+
+   // UV mode and UV angle delta.
+   const int codes_chroma =
+       !cm->seq_params.monochrome &&
+       is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+                           xd->plane[1].subsampling_y);
+   if (codes_chroma) {
+     const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+     write_intra_uv_mode(ec_ctx, uv_mode, y_mode, is_cfl_allowed(xd), w);
+     if (uv_mode == UV_CFL_PRED) {
+       write_cfl_alphas(ec_ctx, mbmi->cfl_alpha_idx, mbmi->cfl_alpha_signs, w);
+     }
+     if (angle_delta_allowed &&
+         av1_is_directional_mode(get_uv_mode(uv_mode))) {
+       write_angle_delta(w, mbmi->angle_delta[PLANE_TYPE_UV],
+                         ec_ctx->angle_delta_cdf[uv_mode - V_PRED]);
+     }
+   }
+
+   // Palette.
+   if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+     write_palette_mode_info(cm, xd, mbmi, mi_row, mi_col, w);
+   }
+
+   // Filter intra.
+   write_filter_intra_mode_info(cm, xd, mbmi, w);
+ }
+
+ /* Writes the mode info of one block for a frame that is not intra-only.
+  * In order, it calls write_inter_segment_id (preskip pass), write_skip_mode,
+  * write_skip (not called when skip_mode is set), write_inter_segment_id
+  * (post-skip pass), write_cdef, write_delta_q_params and, unless skip_mode is
+  * set, write_is_inter. A skip_mode block ends there. Intra blocks continue
+  * with write_intra_prediction_modes. Inter blocks continue with reference
+  * frames, prediction mode and DRL index, motion vectors for the NEW* modes,
+  * inter-intra info, motion mode, compound-type info and the interpolation
+  * filter.
+  *
+  * cpi:            encoder instance; the block is cpi->td.mb.e_mbd.mi[0].
+  * mi_row, mi_col: block position, forwarded to the helper writers.
+  * w:              writer that receives the symbols.
+  */ static void pack_inter_mode_mvs(AV1_COMP *cpi, const int mi_row,
+ const int mi_col, aom_writer *w) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ const struct segmentation *const seg = &cm->seg;
+ struct segmentation_probs *const segp = &ec_ctx->seg;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const PREDICTION_MODE mode = mbmi->mode;
+ const int segment_id = mbmi->segment_id;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const int allow_hp = cm->allow_high_precision_mv;
+ const int is_inter = is_inter_block(mbmi);
+ const int is_compound = has_second_ref(mbmi);
+ int ref;
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, 0, 1); // preskip pass
+
+ write_skip_mode(cm, xd, segment_id, mbmi, w);
+
+ assert(IMPLIES(mbmi->skip_mode, mbmi->skip));
+ const int skip = // skip_mode implies skip; no skip flag is written then
+ mbmi->skip_mode ? 1 : write_skip(cm, xd, segment_id, mbmi, w);
+
+ write_inter_segment_id(cpi, w, seg, segp, mi_row, mi_col, skip, 0); // post-skip pass
+
+ write_cdef(cm, xd, w, skip, mi_col, mi_row);
+
+ write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+ if (!mbmi->skip_mode) write_is_inter(cm, xd, mbmi->segment_id, w, is_inter);
+
+ if (mbmi->skip_mode) return; // nothing else is written for skip_mode blocks
+
+ if (!is_inter) {
+ write_intra_prediction_modes(cpi, mi_row, mi_col, 0, w);
+ } else {
+ int16_t mode_ctx;
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ write_ref_frames(cm, xd, w);
+
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+ // If segment skip is not enabled code the mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
+ if (is_inter_compound_mode(mode))
+ write_inter_compound_mode(xd, w, mode, mode_ctx);
+ else if (is_inter_singleref_mode(mode))
+ write_inter_mode(w, mode, ec_ctx, mode_ctx);
+
+ if (mode == NEWMV || mode == NEW_NEWMV || have_nearmv_in_inter_mode(mode))
+ write_drl_idx(ec_ctx, mbmi, mbmi_ext, w);
+ else
+ assert(mbmi->ref_mv_idx == 0);
+ }
+
+ if (mode == NEWMV || mode == NEW_NEWMV) { // every used reference has a new MV
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, ref);
+ av1_encode_mv(cpi, w, &mbmi->mv[ref].as_mv, &ref_mv.as_mv, nmvc,
+ allow_hp);
+ }
+ } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) { // only mv[1] is coded
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, 1);
+ av1_encode_mv(cpi, w, &mbmi->mv[1].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+ } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) { // only mv[0] is coded
+ nmv_context *nmvc = &ec_ctx->nmvc;
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ av1_encode_mv(cpi, w, &mbmi->mv[0].as_mv, &ref_mv.as_mv, nmvc, allow_hp);
+ }
+
+ if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+ cpi->common.seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+ const int bsize_group = size_group_lookup[bsize];
+ aom_write_symbol(w, interintra, ec_ctx->interintra_cdf[bsize_group], 2);
+ if (interintra) {
+ aom_write_symbol(w, mbmi->interintra_mode,
+ ec_ctx->interintra_mode_cdf[bsize_group],
+ INTERINTRA_MODES);
+ if (is_interintra_wedge_used(bsize)) {
+ aom_write_symbol(w, mbmi->use_wedge_interintra,
+ ec_ctx->wedge_interintra_cdf[bsize], 2);
+ if (mbmi->use_wedge_interintra) {
+ aom_write_symbol(w, mbmi->interintra_wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], 16);
+ assert(mbmi->interintra_wedge_sign == 0);
+ }
+ }
+ }
+ }
+
+ if (mbmi->ref_frame[1] != INTRA_FRAME) write_motion_mode(cm, xd, mbmi, w); // not for inter-intra blocks
+
+ // First write idx to indicate current compound inter prediction mode group
+ // Group A (0): jnt_comp, compound_average
+ // Group B (1): interintra, compound_diffwtd, wedge
+ if (has_second_ref(mbmi)) {
+ const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+
+ if (masked_compound_used) {
+ const int ctx_comp_group_idx = get_comp_group_idx_context(xd);
+ aom_write_symbol(w, mbmi->comp_group_idx,
+ ec_ctx->comp_group_idx_cdf[ctx_comp_group_idx], 2);
+ } else {
+ assert(mbmi->comp_group_idx == 0);
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ if (mbmi->compound_idx)
+ assert(mbmi->interinter_comp.type == COMPOUND_AVERAGE);
+
+ if (cm->seq_params.enable_jnt_comp) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ aom_write_symbol(w, mbmi->compound_idx,
+ ec_ctx->compound_index_cdf[comp_index_ctx], 2);
+ } else {
+ assert(mbmi->compound_idx == 1);
+ }
+ } else {
+ assert(cpi->common.reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+ assert(masked_compound_used);
+ // compound_diffwtd, wedge
+ assert(mbmi->interinter_comp.type == COMPOUND_WEDGE ||
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) // type is implied when wedge is unavailable
+ aom_write_symbol(w, mbmi->interinter_comp.type - 1,
+ ec_ctx->compound_type_cdf[bsize],
+ COMPOUND_TYPES - 1);
+
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+ aom_write_symbol(w, mbmi->interinter_comp.wedge_index,
+ ec_ctx->wedge_idx_cdf[bsize], 16);
+ aom_write_bit(w, mbmi->interinter_comp.wedge_sign);
+ } else {
+ assert(mbmi->interinter_comp.type == COMPOUND_DIFFWTD);
+ aom_write_literal(w, mbmi->interinter_comp.mask_type,
+ MAX_DIFFWTD_MASK_BITS);
+ }
+ }
+ }
+
+ write_mb_interp_filter(cpi, xd, w);
+ }
+ }
+
+ // Writes the use_intrabc flag of the current block and, when the block uses
+ // intra block copy, its displacement vector coded against the first entry
+ // of the INTRA_FRAME reference MV stack.
+ static void write_intrabc_info(MACROBLOCKD *xd,
+                                const MB_MODE_INFO_EXT *mbmi_ext,
+                                aom_writer *w) {
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+   const int uses_intrabc = is_intrabc_block(mbmi);
+
+   aom_write_symbol(w, uses_intrabc, ec_ctx->intrabc_cdf, 2);
+   if (!uses_intrabc) return;
+
+   assert(mbmi->mode == DC_PRED);
+   assert(mbmi->uv_mode == UV_DC_PRED);
+   assert(mbmi->motion_mode == SIMPLE_TRANSLATION);
+   int_mv predicted_dv = mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv;
+   av1_encode_dv(w, &mbmi->mv[0].as_mv, &predicted_dv.as_mv, &ec_ctx->ndvc);
+ }
+
+ // Writes the mode info of one block in an intra-only frame: segment id
+ // (before or after the skip flag, depending on seg->segid_preskip), skip
+ // flag, CDEF strength, delta-q parameters, intrabc info when intrabc is
+ // allowed, and finally the intra prediction modes unless the block uses
+ // intrabc.
+ static void write_mb_modes_kf(AV1_COMP *cpi, MACROBLOCKD *xd,
+                               const MB_MODE_INFO_EXT *mbmi_ext,
+                               const int mi_row, const int mi_col,
+                               aom_writer *w) {
+   AV1_COMMON *const cm = &cpi->common;
+   const struct segmentation *const seg = &cm->seg;
+   struct segmentation_probs *const segp = &xd->tile_ctx->seg;
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+   if (seg->update_map && seg->segid_preskip) {
+     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, 0);
+   }
+
+   const int skip = write_skip(cm, xd, mbmi->segment_id, mbmi, w);
+
+   if (seg->update_map && !seg->segid_preskip) {
+     write_segment_id(cpi, mbmi, w, seg, segp, mi_row, mi_col, skip);
+   }
+
+   write_cdef(cm, xd, w, skip, mi_col, mi_row);
+   write_delta_q_params(cpi, mi_row, mi_col, skip, w);
+
+   if (av1_allow_intrabc(cm)) {
+     write_intrabc_info(xd, mbmi_ext, w);
+     // An intrabc block carries no intra prediction modes.
+     if (is_intrabc_block(mbmi)) return;
+   }
+
+   write_intra_prediction_modes(cpi, mi_row, mi_col, 1, w);
+ }
+
+#if CONFIG_RD_DEBUG
+ // Debug helper (CONFIG_RD_DEBUG only): prints the position, block size,
+ // transform size and prediction mode of `mi` to stdout, one field per line.
+ static void dump_mode_info(MODE_INFO *mi) {
+   printf("\nmi->mi_row == %d\n"
+          "&& mi->mi_col == %d\n"
+          "&& mi->sb_type == %d\n"
+          "&& mi->tx_size == %d\n"
+          "&& mi->mode == %d\n",
+          mi->mi_row, mi->mi_col, mi->sb_type, mi->tx_size, mi->mode);
+ }
+ // Debug helper (CONFIG_RD_DEBUG only): compares the coefficient cost
+ // recorded for `plane` in rd_stats with the cost accumulated in token_stats.
+ // Returns 0 when they agree. On a mismatch it prints both costs and both
+ // cost maps to stdout and returns 1.
+ static int rd_token_stats_mismatch(RD_STATS *rd_stats, TOKEN_STATS *token_stats,
+                                    int plane) {
+   if (rd_stats->txb_coeff_cost[plane] == token_stats->cost) return 0;
+
+   printf("\nplane %d rd_stats->txb_coeff_cost %d token_stats->cost %d\n",
+          plane, rd_stats->txb_coeff_cost[plane], token_stats->cost);
+
+   printf("rd txb_coeff_cost_map\n");
+   for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+     for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+       printf("%d ", rd_stats->txb_coeff_cost_map[plane][row][col]);
+     }
+     printf("\n");
+   }
+
+   printf("pack txb_coeff_cost_map\n");
+   for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+     for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+       printf("%d ", token_stats->txb_coeff_cost_map[row][col]);
+     }
+     printf("\n");
+   }
+   return 1;
+ }
+#endif
+
+#if ENC_MISMATCH_DEBUG
+ // Debug helper (ENC_MISMATCH_DEBUG only): for an inter block of the shown
+ // frame whose index is FRAME_TO_CHECK, prints the block's mode, motion
+ // vectors, reference frames and mode contexts to stdout.
+ //
+ // Side effect: repoints xd->mi at the mode info for (mi_row, mi_col).
+ static void enc_dump_logs(AV1_COMP *cpi, int mi_row, int mi_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+   // xd->mi[0] is a pointer to the block's mode info. The previous
+   // declaration added an extra level of indirection, which made every
+   // `mbmi->field` access below ill-formed.
+   const MB_MODE_INFO *const mbmi = xd->mi[0];
+   if (is_inter_block(mbmi)) {
+#define FRAME_TO_CHECK 11
+     if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+       const BLOCK_SIZE bsize = mbmi->sb_type;
+
+       int_mv mv[2];
+       int is_comp_ref = has_second_ref(mbmi);
+       int ref;
+
+       for (ref = 0; ref < 1 + is_comp_ref; ++ref)
+         mv[ref].as_mv = mbmi->mv[ref].as_mv;
+
+       // Keep the second MV printable for single-reference blocks.
+       if (!is_comp_ref) {
+         mv[1].as_int = 0;
+       }
+
+       MACROBLOCK *const x = &cpi->td.mb;
+       const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+       const int16_t mode_ctx =
+           is_comp_ref ? mbmi_ext->compound_mode_context[mbmi->ref_frame[0]]
+                       : av1_mode_context_analyzer(mbmi_ext->mode_context,
+                                                   mbmi->ref_frame);
+
+       const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+       // -1 marks a context that does not apply to the block's mode.
+       int16_t zeromv_ctx = -1;
+       int16_t refmv_ctx = -1;
+
+       if (mbmi->mode != NEWMV) {
+         zeromv_ctx = (mode_ctx >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+         if (mbmi->mode != GLOBALMV)
+           refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+       }
+
+       printf(
+           "=== ENCODER ===: "
+           "Frame=%d, (mi_row,mi_col)=(%d,%d), skip_mode=%d, mode=%d, bsize=%d, "
+           "show_frame=%d, mv[0]=(%d,%d), mv[1]=(%d,%d), ref[0]=%d, "
+           "ref[1]=%d, motion_mode=%d, mode_ctx=%d, "
+           "newmv_ctx=%d, zeromv_ctx=%d, refmv_ctx=%d, tx_size=%d\n",
+           cm->current_video_frame, mi_row, mi_col, mbmi->skip_mode, mbmi->mode,
+           bsize, cm->show_frame, mv[0].as_mv.row, mv[0].as_mv.col,
+           mv[1].as_mv.row, mv[1].as_mv.col, mbmi->ref_frame[0],
+           mbmi->ref_frame[1], mbmi->motion_mode, mode_ctx, newmv_ctx,
+           zeromv_ctx, refmv_ctx, mbmi->tx_size);
+     }
+   }
+ }
+#endif // ENC_MISMATCH_DEBUG
+
+ // Positions the encoder's block state (xd->mi, mbmi_ext, row/column limits
+ // and transform contexts) on the block at (mi_row, mi_col), then writes its
+ // mode info: write_mb_modes_kf for intra-only frames, pack_inter_mode_mvs
+ // otherwise.
+ static void write_mbmi_b(AV1_COMP *cpi, const TileInfo *const tile,
+                          aom_writer *w, int mi_row, int mi_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
+   MB_MODE_INFO *const block_mi = xd->mi[0];
+
+   assert(block_mi->sb_type <= cm->seq_params.sb_size ||
+          (block_mi->sb_type >= BLOCK_SIZES &&
+           block_mi->sb_type < BLOCK_SIZES_ALL));
+
+   const int mi_high = mi_size_high[block_mi->sb_type];
+   const int mi_wide = mi_size_wide[block_mi->sb_type];
+
+   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+   set_mi_row_col(xd, tile, mi_row, mi_high, mi_col, mi_wide, cm->mi_rows,
+                  cm->mi_cols);
+
+   xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+   xd->left_txfm_context =
+       xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+   if (frame_is_intra_only(cm)) {
+     write_mb_modes_kf(cpi, xd, cpi->td.mb.mbmi_ext, mi_row, mi_col, w);
+     return;
+   }
+
+   // has_subpel_mv_component needs the ref frame buffers set up to look
+   // up if they are scaled. has_subpel_mv_component is in turn needed by
+   // write_switchable_interp_filter, which is called by pack_inter_mode_mvs.
+   set_ref_ptrs(cm, xd, block_mi->ref_frame[0], block_mi->ref_frame[1]);
+
+#if ENC_MISMATCH_DEBUG
+   enc_dump_logs(cpi, mi_row, mi_col);
+#endif  // ENC_MISMATCH_DEBUG
+
+   pack_inter_mode_mvs(cpi, mi_row, mi_col, w);
+ }
+
+ /* Packs the coefficient tokens of one plane for the part of an inter block
+  * that starts at (row, col). The region is walked in steps of the plane's
+  * largest transform size and pack_txb_tokens is called once per step.
+  *
+  * row, col: start of the region; both are shifted right by the plane's
+  *           subsampling before use.
+  * block:    in/out running block index for this plane; advanced by `step`
+  *           after every pack_txb_tokens call.
+  * tok, tok_end, token_stats: forwarded unchanged to pack_txb_tokens.
+  */ static void write_inter_txb_coeff(AV1_COMMON *const cm, MACROBLOCK *const x,
+ MB_MODE_INFO *const mbmi, aom_writer *w,
+ const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end,
+ TOKEN_STATS *token_stats, const int row,
+ const int col, int *block, const int plane) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const BLOCK_SIZE bsizec =
+ scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int step = // block-index advance per max-size transform block
+ tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ const int bkw = tx_size_wide_unit[max_tx_size];
+ const int bkh = tx_size_high_unit[max_tx_size];
+
+ const BLOCK_SIZE max_unit_bsize = // plane-level size of a 64x64 luma unit
+ get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+
+ int blk_row, blk_col;
+
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+
+ const int unit_height = // region end, clamped to the plane block height
+ AOMMIN(mu_blocks_high + (row >> pd->subsampling_y), num_4x4_h);
+ const int unit_width = // region end, clamped to the plane block width
+ AOMMIN(mu_blocks_wide + (col >> pd->subsampling_x), num_4x4_w);
+ for (blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += bkh) {
+ for (blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += bkw) {
+ pack_txb_tokens(w, cm, x, tok, tok_end, xd, mbmi, plane, plane_bsize,
+ cm->seq_params.bit_depth, *block, blk_row, blk_col,
+ max_tx_size, token_stats);
+ *block += step;
+ }
+ }
+ }
+
+ // Writes the residual coefficients of the block at (mi_row, mi_col).
+ // Nothing is written for a skipped block. Intra blocks go through
+ // av1_write_coeffs_mb. Inter blocks are walked in units of at most 64x64
+ // luma pixels, and within each unit write_inter_txb_coeff is called for
+ // every plane for which this block is a chroma reference.
+ //
+ // Side effects: repoints xd->mi and cpi->td.mb.mbmi_ext at the block and
+ // calls set_mi_row_col for it.
+ static void write_tokens_b(AV1_COMP *cpi, const TileInfo *const tile,
+                            aom_writer *w, const TOKENEXTRA **tok,
+                            const TOKENEXTRA *const tok_end, int mi_row,
+                            int mi_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_planes = av1_num_planes(cm);
+   MACROBLOCK *const x = &cpi->td.mb;
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const int mi_offset = mi_row * cm->mi_stride + mi_col;
+   MB_MODE_INFO *const mbmi = *(cm->mi_grid_visible + mi_offset);
+   xd->mi = cm->mi_grid_visible + mi_offset;
+
+   assert(mbmi->sb_type <= cm->seq_params.sb_size ||
+          (mbmi->sb_type >= BLOCK_SIZES && mbmi->sb_type < BLOCK_SIZES_ALL));
+
+   const int bh = mi_size_high[mbmi->sb_type];
+   const int bw = mi_size_wide[mbmi->sb_type];
+   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
+
+   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+   if (mbmi->skip) return;
+
+   if (!is_inter_block(mbmi)) {
+     av1_write_coeffs_mb(cm, x, mi_row, mi_col, w, mbmi->sb_type);
+     return;
+   }
+
+   // Per-plane running block index handed to write_inter_txb_coeff.
+   int block[MAX_MB_PLANE] = { 0 };
+   const BLOCK_SIZE plane_bsize = mbmi->sb_type;
+   assert(plane_bsize == get_plane_block_size(mbmi->sb_type,
+                                              xd->plane[0].subsampling_x,
+                                              xd->plane[0].subsampling_y));
+   const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+   const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+   TOKEN_STATS token_stats;
+   init_token_stats(&token_stats);
+
+   const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+   assert(max_unit_bsize ==
+          get_plane_block_size(BLOCK_64X64, xd->plane[0].subsampling_x,
+                               xd->plane[0].subsampling_y));
+   // Walk the block in units no larger than 64x64 and no larger than itself.
+   const int mu_blocks_wide = AOMMIN(
+       num_4x4_w, block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]);
+   const int mu_blocks_high = AOMMIN(
+       num_4x4_h, block_size_high[max_unit_bsize] >> tx_size_high_log2[0]);
+
+   for (int row = 0; row < num_4x4_h; row += mu_blocks_high) {
+     for (int col = 0; col < num_4x4_w; col += mu_blocks_wide) {
+       for (int plane = 0; plane < num_planes; ++plane) {
+         const struct macroblockd_plane *const pd = &xd->plane[plane];
+         if (!is_chroma_reference(mi_row, mi_col, mbmi->sb_type,
+                                  pd->subsampling_x, pd->subsampling_y)) {
+           continue;
+         }
+         write_inter_txb_coeff(cm, x, mbmi, w, tok, tok_end, &token_stats, row,
+                               col, &block[plane], plane);
+       }
+     }
+#if CONFIG_RD_DEBUG
+     // Check every coded plane. The previous code indexed the stats with the
+     // plane counter after its loop had finished (plane == num_planes), which
+     // is out of range, and dumped an undeclared variable.
+     // NOTE(review): dump_mode_info() is declared with a MODE_INFO * parameter;
+     // confirm that type is compatible with MB_MODE_INFO in this tree.
+     for (int plane = 0; plane < num_planes; ++plane) {
+       if (mbmi->sb_type >= BLOCK_8X8 &&
+           rd_token_stats_mismatch(&mbmi->rd_stats, &token_stats, plane)) {
+         dump_mode_info(mbmi);
+         assert(0);
+       }
+     }
+#endif  // CONFIG_RD_DEBUG
+   }
+ }
+
+ /* Writes everything for one coding block at (mi_row, mi_col), in order:
+  *   1. mode info, via write_mbmi_b;
+  *   2. the palette color-index map of each of the first two planes that
+  *      uses a palette, via pack_map_tokens;
+  *   3. transform-size syntax when the frame uses TX_MODE_SELECT and the
+  *      block signals its size; otherwise only the transform contexts are
+  *      updated;
+  *   4. residual coefficients, via write_tokens_b.
+  *
+  * tok, tok_end: token cursor and its end; passed to pack_map_tokens and
+  *               write_tokens_b.
+  */ static void write_modes_b(AV1_COMP *cpi, const TileInfo *const tile,
+ aom_writer *w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col) {
+ write_mbmi_b(cpi, tile, w, mi_row, mi_col); // also points xd->mi at this block
+
+ AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ for (int plane = 0; plane < AOMMIN(2, av1_num_planes(cm)); ++plane) {
+ const uint8_t palette_size_plane =
+ mbmi->palette_mode_info.palette_size[plane];
+ assert(!mbmi->skip_mode || !palette_size_plane);
+ if (palette_size_plane > 0) {
+ assert(mbmi->use_intrabc == 0);
+ assert(av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type));
+ int rows, cols;
+ av1_get_block_dimensions(mbmi->sb_type, plane, xd, NULL, NULL, &rows,
+ &cols);
+ assert(*tok < tok_end);
+ pack_map_tokens(w, tok, palette_size_plane, rows * cols); // one index per pixel
+ }
+ }
+
+ BLOCK_SIZE bsize = mbmi->sb_type;
+ int is_inter_tx = is_inter_block(mbmi) || is_intrabc_block(mbmi);
+ int skip = mbmi->skip;
+ int segment_id = mbmi->segment_id;
+ if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(bsize) &&
+ !(is_inter_tx && skip) && !xd->lossless[segment_id]) {
+ if (is_inter_tx) { // This implies skip flag is 0.
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, bsize, 0);
+ const int txbh = tx_size_high_unit[max_tx_size];
+ const int txbw = tx_size_wide_unit[max_tx_size];
+ const int width = block_size_wide[bsize] >> tx_size_wide_log2[0];
+ const int height = block_size_high[bsize] >> tx_size_high_log2[0];
+ int idx, idy;
+ for (idy = 0; idy < height; idy += txbh) // one call per max-size transform block
+ for (idx = 0; idx < width; idx += txbw)
+ write_tx_size_vartx(xd, mbmi, max_tx_size, 0, idy, idx, w);
+ } else {
+ write_selected_tx_size(xd, w);
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h, 0, xd);
+ }
+ } else { // no transform size is written; only the contexts are updated
+ set_txfm_ctxs(mbmi->tx_size, xd->n4_w, xd->n4_h,
+ skip && is_inter_block(mbmi), xd);
+ }
+
+ write_tokens_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ }
+
+ // Writes the partition type `p` of the block at (mi_row, mi_col).
+ // Blocks smaller than 8x8 carry no partition symbol. When both halves of
+ // the block fit inside the frame the full partition symbol is written. When
+ // only one dimension fits, a binary "is split" decision is written with a
+ // CDF gathered from the full partition CDF. When neither fits, the
+ // partition is implied and nothing is written.
+ static void write_partition(const AV1_COMMON *const cm,
+                             const MACROBLOCKD *const xd, int hbs, int mi_row,
+                             int mi_col, PARTITION_TYPE p, BLOCK_SIZE bsize,
+                             aom_writer *w) {
+   if (bsize < BLOCK_8X8) return;
+
+   const int rows_fit = (mi_row + hbs) < cm->mi_rows;
+   const int cols_fit = (mi_col + hbs) < cm->mi_cols;
+   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+   FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+
+   if (rows_fit && cols_fit) {
+     aom_write_symbol(w, p, ec_ctx->partition_cdf[ctx],
+                      partition_cdf_length(bsize));
+     return;
+   }
+
+   if (!rows_fit && !cols_fit) {
+     assert(p == PARTITION_SPLIT);
+     return;
+   }
+
+   // Exactly one dimension fits: code split vs. the only other legal type.
+   aom_cdf_prob cdf[2];
+   if (cols_fit) {
+     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
+     assert(bsize > BLOCK_8X8);
+     partition_gather_vert_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+   } else {
+     assert(rows_fit && !cols_fit);
+     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
+     assert(bsize > BLOCK_8X8);
+     partition_gather_horz_alike(cdf, ec_ctx->partition_cdf[ctx], bsize);
+   }
+   aom_write_cdf(w, p == PARTITION_SPLIT, cdf, 2);
+ }
+
+ /* Recursively writes the partition tree rooted at (mi_row, mi_col) with
+  * size bsize. Returns immediately when the position lies outside the frame.
+  * Otherwise it writes, in order: loop-restoration coefficients for every
+  * restoration unit that av1_loop_restoration_corners_in_sb reports for this
+  * block (per plane), the partition symbol, and then each sub-block of the
+  * partition -- write_modes_b for leaf blocks, a recursive call for each
+  * quadrant of PARTITION_SPLIT. It finishes by updating the partition
+  * context.
+  *
+  * tok, tok_end: token cursor and its end, forwarded to every sub-block.
+  */ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
+ aom_writer *const w, const TOKENEXTRA **tok,
+ const TOKENEXTRA *const tok_end, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ const int hbs = mi_size_wide[bsize] / 2; // half the block width, in mi units
+ const int quarter_step = mi_size_wide[bsize] / 4; // stride of the 4-way partitions
+ int i;
+ const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ const int num_planes = av1_num_planes(cm);
+ for (int plane = 0; plane < num_planes; ++plane) {
+ int rcol0, rcol1, rrow0, rrow1;
+ if (av1_loop_restoration_corners_in_sb(cm, plane, mi_row, mi_col, bsize,
+ &rcol0, &rcol1, &rrow0, &rrow1)) {
+ const int rstride = cm->rst_info[plane].horz_units_per_tile;
+ for (int rrow = rrow0; rrow < rrow1; ++rrow) {
+ for (int rcol = rcol0; rcol < rcol1; ++rcol) {
+ const int runit_idx = rcol + rrow * rstride;
+ const RestorationUnitInfo *rui =
+ &cm->rst_info[plane].unit_info[runit_idx];
+ loop_restoration_write_sb_coeffs(cm, xd, rui, w, plane,
+ cpi->td.counts);
+ }
+ }
+ }
+ }
+
+ write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+ switch (partition) {
+ case PARTITION_NONE:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ break;
+ case PARTITION_HORZ:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_row + hbs < cm->mi_rows) // second half only if it starts inside the frame
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_VERT:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ if (mi_col + hbs < cm->mi_cols) // second half only if it starts inside the frame
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_SPLIT:
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col, subsize);
+ write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs,
+ subsize);
+ break;
+ case PARTITION_HORZ_A:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ break;
+ case PARTITION_HORZ_B:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_VERT_A:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ break;
+ case PARTITION_VERT_B:
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + hbs);
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row + hbs, mi_col + hbs);
+ break;
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break; // remaining strips start outside the frame
+
+ write_modes_b(cpi, tile, w, tok, tok_end, this_mi_row, mi_col);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break; // remaining strips start outside the frame
+
+ write_modes_b(cpi, tile, w, tok, tok_end, mi_row, this_mi_col);
+ }
+ break;
+ default: assert(0);
+ }
+
+ // update partition context
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+ }
+
+ // Writes all superblocks of one tile in raster order. The above contexts
+ // are reset once per tile and the left context once per superblock row.
+ // When delta q is enabled, the running qindex (and, if enabled, the
+ // loop-filter deltas) are reset at the start of the tile. Each superblock
+ // row consumes its own token list from cpi->tplist.
+ static void write_modes(AV1_COMP *const cpi, const TileInfo *const tile,
+                         aom_writer *const w, int tile_row, int tile_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+   const int row_begin = tile->mi_row_start;
+   const int row_end = tile->mi_row_end;
+   const int col_begin = tile->mi_col_start;
+   const int col_end = tile->mi_col_end;
+
+   av1_zero_above_context(cm, xd, col_begin, col_end, tile->tile_row);
+   av1_init_above_context(cm, xd, tile->tile_row);
+
+   if (cpi->common.delta_q_present_flag) {
+     xd->current_qindex = cpi->common.base_qindex;
+     if (cpi->common.delta_lf_present_flag) {
+       av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+     }
+   }
+
+   for (int mi_row = row_begin; mi_row < row_end;
+        mi_row += cm->seq_params.mib_size) {
+     const int sb_row_in_tile =
+         (mi_row - tile->mi_row_start) >> cm->seq_params.mib_size_log2;
+     // Token range recorded for this superblock row of the tile.
+     const TOKENEXTRA *tok =
+         cpi->tplist[tile_row][tile_col][sb_row_in_tile].start;
+     const TOKENEXTRA *tok_end =
+         tok + cpi->tplist[tile_row][tile_col][sb_row_in_tile].count;
+
+     av1_zero_left_context(xd);
+
+     for (int mi_col = col_begin; mi_col < col_end;
+          mi_col += cm->seq_params.mib_size) {
+       write_modes_sb(cpi, tile, w, &tok, tok_end, mi_row, mi_col,
+                      cm->seq_params.sb_size);
+     }
+     // Every token of the row must have been consumed.
+     assert(tok == cpi->tplist[tile_row][tile_col][sb_row_in_tile].stop);
+   }
+ }
+
+ // Writes the frame-level loop-restoration syntax: a two-bit restoration type
+ // for every plane, then the restoration-unit size bits.  Nothing is written
+ // when restoration is disabled in the sequence parameters or when intra
+ // block copy is allowed.
+ static void encode_restoration_mode(AV1_COMMON *cm,
+                                     struct aom_write_bit_buffer *wb) {
+   assert(!cm->all_lossless);
+   if (!cm->seq_params.enable_restoration || cm->allow_intrabc) return;
+
+   const int num_planes = av1_num_planes(cm);
+   int any_plane_restored = 0;
+   int any_chroma_restored = 0;
+
+   for (int plane = 0; plane < num_planes; ++plane) {
+     const RestorationType type = cm->rst_info[plane].frame_restoration_type;
+     if (type != RESTORE_NONE) {
+       any_plane_restored = 1;
+       if (plane > 0) any_chroma_restored = 1;
+     }
+
+     // Two-bit code: NONE = 0,0  WIENER = 1,0  SGRPROJ = 1,1  SWITCHABLE = 0,1
+     int first_bit, second_bit;
+     switch (type) {
+       case RESTORE_NONE:
+         first_bit = 0;
+         second_bit = 0;
+         break;
+       case RESTORE_WIENER:
+         first_bit = 1;
+         second_bit = 0;
+         break;
+       case RESTORE_SGRPROJ:
+         first_bit = 1;
+         second_bit = 1;
+         break;
+       case RESTORE_SWITCHABLE:
+         first_bit = 0;
+         second_bit = 1;
+         break;
+       default:
+         // Unknown type: nothing is written for this plane.
+         assert(0);
+         continue;
+     }
+     aom_wb_write_bit(wb, first_bit);
+     aom_wb_write_bit(wb, second_bit);
+   }
+
+   if (any_plane_restored) {
+     assert(cm->seq_params.sb_size == BLOCK_64X64 ||
+            cm->seq_params.sb_size == BLOCK_128X128);
+     const int sb_size = cm->seq_params.sb_size == BLOCK_128X128 ? 128 : 64;
+     const RestorationInfo *const luma = &cm->rst_info[0];
+
+     assert(luma->restoration_unit_size >= sb_size);
+     assert(RESTORATION_UNITSIZE_MAX == 256);
+
+     // With 128x128 superblocks the "> 64" bit is not written.
+     const int unit_above_64 = luma->restoration_unit_size > 64;
+     if (sb_size == 64) aom_wb_write_bit(wb, unit_above_64);
+     if (unit_above_64)
+       aom_wb_write_bit(wb, luma->restoration_unit_size > 128);
+   }
+
+   if (num_planes <= 1) return;
+
+   const int ss =
+       AOMMIN(cm->seq_params.subsampling_x, cm->seq_params.subsampling_y);
+   if (ss == 0) {
+     // Without subsampling in both directions all planes share one unit size.
+     assert(cm->rst_info[1].restoration_unit_size ==
+            cm->rst_info[0].restoration_unit_size);
+     assert(cm->rst_info[2].restoration_unit_size ==
+            cm->rst_info[1].restoration_unit_size);
+   } else if (any_chroma_restored) {
+     // One bit tells whether the chroma unit size differs from luma.
+     aom_wb_write_bit(wb, cm->rst_info[1].restoration_unit_size !=
+                              cm->rst_info[0].restoration_unit_size);
+     assert(cm->rst_info[1].restoration_unit_size ==
+                cm->rst_info[0].restoration_unit_size ||
+            cm->rst_info[1].restoration_unit_size ==
+                (cm->rst_info[0].restoration_unit_size >> ss));
+     assert(cm->rst_info[2].restoration_unit_size ==
+            cm->rst_info[1].restoration_unit_size);
+   }
+ }
+
+ // Codes the Wiener filter taps of one restoration unit relative to the
+ // running reference in |ref_wiener_info|: vertical taps 0..2 first, then
+ // horizontal taps 0..2.  Tap 0 is only coded when the full WIENER_WIN window
+ // is in use; otherwise it is asserted to be zero.  The reference is
+ // overwritten with the values just coded.
+ static void write_wiener_filter(int wiener_win, const WienerInfo *wiener_info,
+                                 WienerInfo *ref_wiener_info, aom_writer *wb) {
+   const int codes_tap0 = (wiener_win == WIENER_WIN);
+   const int tap0_range = WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1;
+   const int tap1_range = WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1;
+   const int tap2_range = WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1;
+
+   // Vertical filter.
+   if (codes_tap0) {
+     aom_write_primitive_refsubexpfin(
+         wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+         ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+         wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+   } else {
+     assert(wiener_info->vfilter[0] == 0 &&
+            wiener_info->vfilter[WIENER_WIN - 1] == 0);
+   }
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // Horizontal filter.
+   if (codes_tap0) {
+     aom_write_primitive_refsubexpfin(
+         wb, tap0_range, WIENER_FILT_TAP0_SUBEXP_K,
+         ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+         wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+   } else {
+     assert(wiener_info->hfilter[0] == 0 &&
+            wiener_info->hfilter[WIENER_WIN - 1] == 0);
+   }
+   aom_write_primitive_refsubexpfin(
+       wb, tap1_range, WIENER_FILT_TAP1_SUBEXP_K,
+       ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+       wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+   aom_write_primitive_refsubexpfin(
+       wb, tap2_range, WIENER_FILT_TAP2_SUBEXP_K,
+       ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+       wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+
+   // The filter just written becomes the reference for the next one.
+   memcpy(ref_wiener_info, wiener_info, sizeof(*wiener_info));
+ }
+
+ // Codes the self-guided projection parameters of one restoration unit: the
+ // parameter-set index |ep| as a literal, then xqd[0] and/or xqd[1] relative
+ // to |ref_sgrproj_info|, depending on which radii of the selected set are
+ // zero.  The reference is overwritten with the values just coded.
+ static void write_sgrproj_filter(const SgrprojInfo *sgrproj_info,
+                                  SgrprojInfo *ref_sgrproj_info,
+                                  aom_writer *wb) {
+   aom_write_literal(wb, sgrproj_info->ep, SGRPROJ_PARAMS_BITS);
+
+   const sgr_params_type *const params = &sgr_params[sgrproj_info->ep];
+   const int r0_is_zero = (params->r[0] == 0);
+   const int r1_is_zero = (params->r[1] == 0);
+
+   if (r0_is_zero) {
+     // xqd[0] is not coded in this case and must already be zero.
+     assert(sgrproj_info->xqd[0] == 0);
+   } else {
+     aom_write_primitive_refsubexpfin(
+         wb, SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+         ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+         sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+   }
+
+   // xqd[1] is skipped only when r[0] is non-zero and r[1] is zero.
+   if (r0_is_zero || !r1_is_zero) {
+     aom_write_primitive_refsubexpfin(
+         wb, SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+         ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+         sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+   }
+
+   memcpy(ref_sgrproj_info, sgrproj_info, sizeof(*sgrproj_info));
+ }
+
+ // Writes the restoration syntax of one restoration unit of |plane|.  What is
+ // written depends on the plane's frame-level restoration type: a switchable
+ // symbol plus the chosen filter, or an on/off symbol followed by the Wiener
+ // or self-guided filter.  Filters are coded against the per-plane reference
+ // kept in |xd|.  |counts| is only touched when CONFIG_ENTROPY_STATS is set.
+ static void loop_restoration_write_sb_coeffs(const AV1_COMMON *const cm,
+                                              MACROBLOCKD *xd,
+                                              const RestorationUnitInfo *rui,
+                                              aom_writer *const w, int plane,
+                                              FRAME_COUNTS *counts) {
+   const RestorationType frame_rtype =
+       cm->rst_info[plane].frame_restoration_type;
+   if (frame_rtype == RESTORE_NONE) return;
+
+   (void)counts;
+   assert(!cm->all_lossless);
+
+   const int wiener_win = (plane > 0) ? WIENER_WIN_CHROMA : WIENER_WIN;
+   WienerInfo *const ref_wiener = xd->wiener_info + plane;
+   SgrprojInfo *const ref_sgrproj = xd->sgrproj_info + plane;
+   const RestorationType unit_rtype = rui->restoration_type;
+   const int unit_enabled = (unit_rtype != RESTORE_NONE);
+
+   switch (frame_rtype) {
+     case RESTORE_SWITCHABLE:
+       aom_write_symbol(w, unit_rtype, xd->tile_ctx->switchable_restore_cdf,
+                        RESTORE_SWITCHABLE_TYPES);
+ #if CONFIG_ENTROPY_STATS
+       ++counts->switchable_restore[unit_rtype];
+ #endif
+       if (unit_rtype == RESTORE_WIENER) {
+         write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener, w);
+       } else if (unit_rtype == RESTORE_SGRPROJ) {
+         write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj, w);
+       } else {
+         assert(unit_rtype == RESTORE_NONE);
+       }
+       break;
+     case RESTORE_WIENER:
+       aom_write_symbol(w, unit_enabled, xd->tile_ctx->wiener_restore_cdf, 2);
+ #if CONFIG_ENTROPY_STATS
+       ++counts->wiener_restore[unit_enabled];
+ #endif
+       if (unit_enabled)
+         write_wiener_filter(wiener_win, &rui->wiener_info, ref_wiener, w);
+       break;
+     case RESTORE_SGRPROJ:
+       aom_write_symbol(w, unit_enabled, xd->tile_ctx->sgrproj_restore_cdf, 2);
+ #if CONFIG_ENTROPY_STATS
+       ++counts->sgrproj_restore[unit_enabled];
+ #endif
+       if (unit_enabled)
+         write_sgrproj_filter(&rui->sgrproj_info, ref_sgrproj, w);
+       break;
+     default: break;
+   }
+ }
+
+ // Writes the frame-level deblocking filter parameters: the filter levels,
+ // the sharpness and, when enabled and updated, the per-reference and
+ // per-mode deltas.  Each delta is coded as a "changed" bit against the
+ // deltas of the primary reference frame's buffer (or the defaults when no
+ // such buffer is available).  Nothing is written when intra block copy is
+ // allowed.
+ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+   assert(!cm->coded_lossless);
+   if (cm->allow_intrabc) return;
+
+   const int num_planes = av1_num_planes(cm);
+   struct loopfilter *const lf = &cm->lf;
+
+   // Filter levels and sharpness.
+   aom_wb_write_literal(wb, lf->filter_level[0], 6);
+   aom_wb_write_literal(wb, lf->filter_level[1], 6);
+   if (num_planes > 1 && (lf->filter_level[0] || lf->filter_level[1])) {
+     aom_wb_write_literal(wb, lf->filter_level_u, 6);
+     aom_wb_write_literal(wb, lf->filter_level_v, 6);
+   }
+   aom_wb_write_literal(wb, lf->sharpness_level, 3);
+
+   // Deltas applied per block based on mode or reference frame.
+   aom_wb_write_bit(wb, lf->mode_ref_delta_enabled);
+   if (!lf->mode_ref_delta_enabled) return;
+
+   aom_wb_write_bit(wb, lf->mode_ref_delta_update);
+   if (!lf->mode_ref_delta_update) return;
+
+   const int prime_idx = cm->primary_ref_frame;
+   const int buf_idx =
+       prime_idx == PRIMARY_REF_NONE ? -1 : cm->frame_refs[prime_idx].idx;
+   // buf_idx is -1 when there is no primary reference, so one test covers
+   // both "no primary reference" and "no valid buffer".
+   const int use_defaults = (buf_idx < 0);
+
+   int8_t prev_ref_deltas[REF_FRAMES];
+   if (use_defaults) {
+     av1_set_default_ref_deltas(prev_ref_deltas);
+   } else {
+     memcpy(prev_ref_deltas, cm->buffer_pool->frame_bufs[buf_idx].ref_deltas,
+            REF_FRAMES);
+   }
+   for (int ref = 0; ref < REF_FRAMES; ++ref) {
+     const int delta = lf->ref_deltas[ref];
+     const int changed = (delta != prev_ref_deltas[ref]);
+     aom_wb_write_bit(wb, changed);
+     if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+   }
+
+   int8_t prev_mode_deltas[MAX_MODE_LF_DELTAS];
+   if (use_defaults) {
+     av1_set_default_mode_deltas(prev_mode_deltas);
+   } else {
+     memcpy(prev_mode_deltas, cm->buffer_pool->frame_bufs[buf_idx].mode_deltas,
+            MAX_MODE_LF_DELTAS);
+   }
+   for (int mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+     const int delta = lf->mode_deltas[mode];
+     const int changed = (delta != prev_mode_deltas[mode]);
+     aom_wb_write_bit(wb, changed);
+     if (changed) aom_wb_write_inv_signed_literal(wb, delta, 6);
+   }
+ }
+
+ // Writes the frame-level CDEF parameters: damping (minus 3), the number of
+ // bits in cdef_bits, and one luma strength (plus one chroma strength when the
+ // frame has more than one plane) per entry.  Nothing is written when CDEF is
+ // disabled in the sequence parameters or intra block copy is allowed.
+ static void encode_cdef(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
+   assert(!cm->coded_lossless);
+   if (!cm->seq_params.enable_cdef || cm->allow_intrabc) return;
+
+   const int has_chroma = av1_num_planes(cm) > 1;
+
+   aom_wb_write_literal(wb, cm->cdef_pri_damping - 3, 2);
+   // Only the primary damping is written; the secondary must match it.
+   assert(cm->cdef_pri_damping == cm->cdef_sec_damping);
+   aom_wb_write_literal(wb, cm->cdef_bits, 2);
+
+   for (int idx = 0; idx < cm->nb_cdef_strengths; ++idx) {
+     aom_wb_write_literal(wb, cm->cdef_strengths[idx], CDEF_STRENGTH_BITS);
+     if (has_chroma) {
+       aom_wb_write_literal(wb, cm->cdef_uv_strengths[idx],
+                            CDEF_STRENGTH_BITS);
+     }
+   }
+ }
+
+ // Writes a quantizer delta: a single 0 bit when |delta_q| is zero, otherwise
+ // a 1 bit followed by the value as a signed literal with 6 magnitude bits.
+ static void write_delta_q(struct aom_write_bit_buffer *wb, int delta_q) {
+   if (delta_q == 0) {
+     aom_wb_write_bit(wb, 0);
+     return;
+   }
+   aom_wb_write_bit(wb, 1);
+   aom_wb_write_inv_signed_literal(wb, delta_q, 6);
+ }
+
+ // Writes the frame quantization parameters: the base qindex, the luma DC
+ // delta, the chroma deltas (when the frame has chroma planes) and, when
+ // quantization matrices are in use, the matrix levels.
+ static void encode_quantization(const AV1_COMMON *const cm,
+                                 struct aom_write_bit_buffer *wb) {
+   const int has_chroma = av1_num_planes(cm) > 1;
+   const int separate_uv = cm->seq_params.separate_uv_delta_q;
+
+   aom_wb_write_literal(wb, cm->base_qindex, QINDEX_BITS);
+   write_delta_q(wb, cm->y_dc_delta_q);
+
+   if (has_chroma) {
+     const int uv_deltas_differ = (cm->u_dc_delta_q != cm->v_dc_delta_q) ||
+                                  (cm->u_ac_delta_q != cm->v_ac_delta_q);
+     // The "differ" flag is only signalled when separate U/V deltas are
+     // enabled in the sequence parameters.
+     if (separate_uv) aom_wb_write_bit(wb, uv_deltas_differ);
+     write_delta_q(wb, cm->u_dc_delta_q);
+     write_delta_q(wb, cm->u_ac_delta_q);
+     if (uv_deltas_differ) {
+       write_delta_q(wb, cm->v_dc_delta_q);
+       write_delta_q(wb, cm->v_ac_delta_q);
+     }
+   }
+
+   aom_wb_write_bit(wb, cm->using_qmatrix);
+   if (!cm->using_qmatrix) return;
+
+   aom_wb_write_literal(wb, cm->qm_y, QM_LEVEL_BITS);
+   aom_wb_write_literal(wb, cm->qm_u, QM_LEVEL_BITS);
+   if (separate_uv) {
+     aom_wb_write_literal(wb, cm->qm_v, QM_LEVEL_BITS);
+   } else {
+     // V is not coded, so it has to equal U.
+     assert(cm->qm_u == cm->qm_v);
+   }
+ }
+
+ // Writes the segmentation syntax: the enable bit, the update flags (only
+ // when a primary reference frame exists) and, when the data is updated, an
+ // "active" bit plus clamped value for every segment/feature pair.
+ static void encode_segmentation(AV1_COMMON *cm, MACROBLOCKD *xd,
+                                 struct aom_write_bit_buffer *wb) {
+   struct segmentation *const seg = &cm->seg;
+
+   aom_wb_write_bit(wb, seg->enabled);
+   if (!seg->enabled) return;
+
+   // Update flags.
+   if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+     aom_wb_write_bit(wb, seg->update_map);
+     if (seg->update_map) {
+       // Pick temporal or spatial coding of the map before signalling it.
+       av1_choose_segmap_coding_method(cm, xd);
+       aom_wb_write_bit(wb, seg->temporal_update);
+     }
+     aom_wb_write_bit(wb, seg->update_data);
+   } else {
+     // No flags are written without a primary reference; map and data
+     // updates are asserted on and temporal update is forced off.
+     assert(seg->update_map == 1);
+     seg->temporal_update = 0;
+     assert(seg->update_data == 1);
+   }
+
+   if (!seg->update_data) return;
+
+   // Segmentation data.
+   for (int seg_id = 0; seg_id < MAX_SEGMENTS; ++seg_id) {
+     for (int feature = 0; feature < SEG_LVL_MAX; ++feature) {
+       const int active = segfeature_active(seg, seg_id, feature);
+       aom_wb_write_bit(wb, active);
+       if (!active) continue;
+
+       const int data_max = av1_seg_feature_data_max(feature);
+       const int num_bits = get_unsigned_bits(data_max);
+       const int value =
+           clamp(get_segdata(seg, seg_id, feature), -data_max, data_max);
+
+       if (av1_is_segfeature_signed(feature))
+         aom_wb_write_inv_signed_literal(wb, value, num_bits);
+       else
+         aom_wb_write_literal(wb, value, num_bits);
+     }
+   }
+ }
+
+ // Writes the one-bit "tx mode select" flag.  For coded-lossless frames no bit
+ // is written and |*mode| is forced to ONLY_4X4 instead.
+ static void write_tx_mode(AV1_COMMON *cm, TX_MODE *mode,
+                           struct aom_write_bit_buffer *wb) {
+   if (!cm->coded_lossless) {
+     aom_wb_write_bit(wb, *mode == TX_MODE_SELECT);
+     return;
+   }
+   *mode = ONLY_4X4;
+ }
+
+ // Writes the frame-level interpolation filter: a "switchable" bit, followed
+ // by the filter itself as a literal when it is not switchable.
+ static void write_frame_interp_filter(InterpFilter filter,
+                                       struct aom_write_bit_buffer *wb) {
+   const int is_switchable = (filter == SWITCHABLE);
+   aom_wb_write_bit(wb, is_switchable);
+   if (is_switchable) return;
+   aom_wb_write_literal(wb, filter, LOG_SWITCHABLE_FILTERS);
+ }
+
+ // When the frame filter is SWITCHABLE but the counts show that exactly one
+ // filter was actually used, promotes that filter to the frame level.  Note
+ // that the promotion only happens when the single used filter is
+ // EIGHTTAP_REGULAR; any other filter leaves the frame filter SWITCHABLE.
+ static void fix_interp_filter(AV1_COMMON *cm, FRAME_COUNTS *counts) {
+   if (cm->interp_filter != SWITCHABLE) return;
+
+   // Total usage of each filter across all contexts.
+   int usage[SWITCHABLE_FILTERS];
+   int num_used = 0;
+   for (int f = 0; f < SWITCHABLE_FILTERS; ++f) {
+     int total = 0;
+     for (int ctx = 0; ctx < SWITCHABLE_FILTER_CONTEXTS; ++ctx)
+       total += counts->switchable_interp[ctx][f];
+     usage[f] = total;
+     if (total > 0) ++num_used;
+   }
+   if (num_used != 1) return;
+
+   // Locate the first filter with a non-zero count.
+   int first_used = 0;
+   while (first_used < SWITCHABLE_FILTERS && !usage[first_used]) ++first_used;
+
+   if (first_used < SWITCHABLE_FILTERS && first_used == EIGHTTAP_REGULAR)
+     cm->interp_filter = first_used;
+ }
+
+ // Same coding as write_uniform, but writing to the uncompressed header |wb|.
+ // A value |v| out of |n| possibilities is coded in l - 1 bits when it is
+ // below m = (1 << l) - n, and in l bits otherwise, where
+ // l = get_unsigned_bits(n).  Nothing is written when l is zero.
+ static void wb_write_uniform(struct aom_write_bit_buffer *wb, int n, int v) {
+   const int num_bits = get_unsigned_bits(n);
+   if (num_bits == 0) return;
+
+   const int short_code_count = (1 << num_bits) - n;
+   if (v < short_code_count) {
+     aom_wb_write_literal(wb, v, num_bits - 1);
+     return;
+   }
+
+   // Long code: num_bits - 1 prefix bits plus one extra low bit.
+   const int excess = v - short_code_count;
+   aom_wb_write_literal(wb, short_code_count + (excess >> 1), num_bits - 1);
+   aom_wb_write_literal(wb, excess & 1, 1);
+ }
+
+ // Writes the tile layout.  With uniform spacing the log2 tile column and row
+ // counts are written as unary increments over their minimum (terminated by a
+ // 0 bit unless the maximum is reached).  Otherwise each tile's width and
+ // height in superblocks (minus 1) is written with wb_write_uniform().
+ static void write_tile_info_max_tile(const AV1_COMMON *const cm,
+                                      struct aom_write_bit_buffer *wb) {
+   const int sb_log2 = cm->seq_params.mib_size_log2;
+   // Frame size in superblocks, rounded up; counted down as tiles are coded.
+   int remaining_cols_sb = ALIGN_POWER_OF_TWO(cm->mi_cols, sb_log2) >> sb_log2;
+   int remaining_rows_sb = ALIGN_POWER_OF_TWO(cm->mi_rows, sb_log2) >> sb_log2;
+
+   aom_wb_write_bit(wb, cm->uniform_tile_spacing_flag);
+
+   if (!cm->uniform_tile_spacing_flag) {
+     // Explicit tile widths.
+     for (int col = 0; col < cm->tile_cols; ++col) {
+       const int tile_width_sb =
+           cm->tile_col_start_sb[col + 1] - cm->tile_col_start_sb[col];
+       wb_write_uniform(wb, AOMMIN(remaining_cols_sb, cm->max_tile_width_sb),
+                        tile_width_sb - 1);
+       remaining_cols_sb -= tile_width_sb;
+     }
+     assert(remaining_cols_sb == 0);
+
+     // Explicit tile heights.
+     for (int row = 0; row < cm->tile_rows; ++row) {
+       const int tile_height_sb =
+           cm->tile_row_start_sb[row + 1] - cm->tile_row_start_sb[row];
+       wb_write_uniform(wb, AOMMIN(remaining_rows_sb, cm->max_tile_height_sb),
+                        tile_height_sb - 1);
+       remaining_rows_sb -= tile_height_sb;
+     }
+     assert(remaining_rows_sb == 0);
+     return;
+   }
+
+   // Uniformly spaced tiles: unary-coded log2 column count ...
+   for (int ones = cm->log2_tile_cols - cm->min_log2_tile_cols; ones != 0;
+        --ones) {
+     aom_wb_write_bit(wb, 1);
+   }
+   if (cm->log2_tile_cols < cm->max_log2_tile_cols) aom_wb_write_bit(wb, 0);
+
+   // ... followed by the unary-coded log2 row count.
+   for (int ones = cm->log2_tile_rows - cm->min_log2_tile_rows; ones != 0;
+        --ones) {
+     aom_wb_write_bit(wb, 1);
+   }
+   if (cm->log2_tile_rows < cm->max_log2_tile_rows) aom_wb_write_bit(wb, 0);
+ }
+
+ // Writes the tile layout and, when the frame has more than one tile, the
+ // tile id used for the CDF update (written as 0) and the tile-size byte
+ // count minus 1 (written as 3).  |*saved_wb| receives a copy of the write
+ // buffer state taken just before those two fields.
+ static void write_tile_info(const AV1_COMMON *const cm,
+                             struct aom_write_bit_buffer *saved_wb,
+                             struct aom_write_bit_buffer *wb) {
+   write_tile_info_max_tile(cm, wb);
+
+   *saved_wb = *wb;
+   if (cm->tile_rows * cm->tile_cols <= 1) return;
+
+   // Tile id used for cdf update.
+   aom_wb_write_literal(wb, 0, cm->log2_tile_cols + cm->log2_tile_rows);
+   // Number of bytes in tile size - 1.
+   aom_wb_write_literal(wb, 3, 2);
+ }
+
+ // Pads the header to a byte boundary with zero bits, snapshots the write
+ // buffer into |*saved_wb| and, for multi-tile frames, writes two 2-bit
+ // fields (both as 0): the byte counts minus 1 of the tile column size and of
+ // the tile size.
+ static void write_ext_tile_info(const AV1_COMMON *const cm,
+                                 struct aom_write_bit_buffer *saved_wb,
+                                 struct aom_write_bit_buffer *wb) {
+   // This information is stored as a separate byte.
+   const int bits_into_byte = wb->bit_offset % CHAR_BIT;
+   if (bits_into_byte > 0)
+     aom_wb_write_literal(wb, 0, CHAR_BIT - bits_into_byte);
+   assert(aom_wb_is_byte_aligned(wb));
+
+   *saved_wb = *wb;
+   if (cm->tile_rows * cm->tile_cols <= 1) return;
+
+   // The tile configuration data is the last item in the uncompressed header.
+   // Number of bytes in tile column size - 1.
+   aom_wb_write_literal(wb, 0, 2);
+   // Number of bytes in tile size - 1.
+   aom_wb_write_literal(wb, 0, 2);
+ }
+
+ // Builds the bit mask of reference buffer slots refreshed by the current
+ // frame.  Shown key frames and S-frames refresh all eight slots (0xFF);
+ // otherwise each refresh flag is shifted to the slot its reference currently
+ // maps to.
+ static int get_refresh_mask(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm))
+     return 0xFF;
+
+   // NOTE(zoeliu): When LAST_FRAME is to get refreshed, the decoder will be
+   // notified to get LAST3_FRAME refreshed and then the virtual indexes for
+   // all the 3 LAST reference frames will be updated accordingly, i.e.:
+   // (1) The original virtual index for LAST3_FRAME will become the new
+   //     virtual index for LAST_FRAME; and
+   // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
+   //     shifted and become the new virtual indexes for LAST2_FRAME and
+   //     LAST3_FRAME.
+   int mask = cpi->refresh_last_frame << cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+
+ #if USE_SYMM_MULTI_LAYER
+   // Under the new backward-reference update rule the refresh goes to the
+   // EXTREF slot instead of the BWDREF slot.
+   const int bwd_slot = (cpi->new_bwdref_update_rule == 1) ? EXTREF_FRAME - 1
+                                                           : BWDREF_FRAME - 1;
+   mask |= cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[bwd_slot];
+ #else
+   mask |= cpi->refresh_bwd_ref_frame << cpi->ref_fb_idx[BWDREF_FRAME - 1];
+ #endif
+   mask |= cpi->refresh_alt2_ref_frame << cpi->ref_fb_idx[ALTREF2_FRAME - 1];
+
+   if (av1_preserve_existing_gf(cpi)) {
+     // We have decided to preserve the previously existing golden frame as
+     // our new ARF frame. However, in the short term we leave it in the GF
+     // slot and, if we're updating the GF with the current decoded frame, we
+     // save it instead to the ARF slot.
+     // Later, in the function av1_encoder.c:av1_update_reference_frames() we
+     // will swap gld_fb_idx and alt_fb_idx to achieve our objective. We do it
+     // there so that it can be done outside of the recode loop.
+     // Note: This is highly specific to the use of ARF as a forward
+     // reference, and this needs to be generalized as other uses are
+     // implemented (like RTC/temporal scalability).
+     if (!cpi->preserve_arf_as_gld) {
+       mask |= cpi->refresh_golden_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1];
+     }
+     return mask;
+   }
+
+   mask |= cpi->refresh_golden_frame << cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+   mask |= cpi->refresh_alt_ref_frame << cpi->ref_fb_idx[ALTREF_FRAME - 1];
+   return mask;
+ }
+
+ // Looks for an earlier tile whose payload is byte-identical to the tile at
+ // (tile_row, tile_col).  Returns the row offset (> 0) of that tile, or 0
+ // when none is found.  Payloads are compared starting 4 bytes into each
+ // buffer; the first byte of a buffer is read as a header whose top bit marks
+ // a copy tile and whose low 7 bits hold that tile's own row offset.
+ static INLINE int find_identical_tile(
+     const int tile_row, const int tile_col,
+     TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS]) {
+   // (TODO: yunqingwang) For now, only above tile is checked and used.
+   // More candidates such as left tile can be added later.
+   const MV32 above = { 1, 0 };
+
+   if (tile_row == 0) return 0;
+
+   const uint8_t *const cur_tile_data =
+       tile_buffers[tile_row][tile_col].data + 4;
+   const size_t cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+   int row_offset = above.row;
+   int row = tile_row - row_offset;
+   const int col = tile_col - above.col;
+   if (row < 0 || col < 0) return 0;
+
+   // Read out tcm bit.
+   const uint8_t tile_hdr = *(tile_buffers[row][col].data);
+   if ((tile_hdr >> 7) == 1) {
+     // The candidate is a copy tile itself: follow it to its source row.
+     row_offset += tile_hdr & 0x7f;
+     row = tile_row - row_offset;
+   }
+
+   const TileBufferEnc *const candidate = &tile_buffers[row][col];
+
+   // The offset is rejected once it reaches 128; sizes must match as well.
+   if (row_offset >= 128 || candidate->size != cur_tile_size) return 0;
+   if (memcmp(candidate->data + 4, cur_tile_data, cur_tile_size) != 0)
+     return 0;
+
+   // Identical tile found.
+   assert(row_offset > 0);
+   return row_offset;
+ }
+
+ // Writes the "render size differs" bit and, when it is set, the render width
+ // and height (each minus 1) as 16-bit literals.
+ static void write_render_size(const AV1_COMMON *cm,
+                               struct aom_write_bit_buffer *wb) {
+   const int scaling_active = av1_resize_scaled(cm);
+   aom_wb_write_bit(wb, scaling_active);
+   if (!scaling_active) return;
+
+   aom_wb_write_literal(wb, cm->render_width - 1, 16);
+   aom_wb_write_literal(wb, cm->render_height - 1, 16);
+ }
+
+ // Writes the super-resolution scale.  Nothing is written when superres is
+ // disabled in the sequence parameters.  Otherwise one bit tells whether
+ // scaling is used, followed by the denominator (offset by its minimum) when
+ // it is.
+ static void write_superres_scale(const AV1_COMMON *const cm,
+                                  struct aom_write_bit_buffer *wb) {
+   if (!cm->seq_params.enable_superres) {
+     assert(cm->superres_scale_denominator == SCALE_NUMERATOR);
+     return;
+   }
+
+   // First bit is whether to scale or not.
+   const int is_scaled = (cm->superres_scale_denominator != SCALE_NUMERATOR);
+   aom_wb_write_bit(wb, is_scaled);
+   if (!is_scaled) return;
+
+   // Scaling: write the scale factor.
+   assert(cm->superres_scale_denominator >= SUPERRES_SCALE_DENOMINATOR_MIN);
+   assert(cm->superres_scale_denominator <
+          SUPERRES_SCALE_DENOMINATOR_MIN + (1 << SUPERRES_SCALE_BITS));
+   aom_wb_write_literal(
+       wb, cm->superres_scale_denominator - SUPERRES_SCALE_DENOMINATOR_MIN,
+       SUPERRES_SCALE_BITS);
+ }
+
+ // Writes the frame size fields: when |frame_size_override| is set, the
+ // upscaled width and height (each minus 1) using the bit counts from the
+ // sequence parameters; then, always, the superres scale and render size.
+ static void write_frame_size(const AV1_COMMON *cm, int frame_size_override,
+                              struct aom_write_bit_buffer *wb) {
+   if (frame_size_override) {
+     const SequenceHeader *const seq_params = &cm->seq_params;
+     aom_wb_write_literal(wb, cm->superres_upscaled_width - 1,
+                          seq_params->num_bits_width);
+     aom_wb_write_literal(wb, cm->superres_upscaled_height - 1,
+                          seq_params->num_bits_height);
+   }
+
+   write_superres_scale(cm, wb);
+   write_render_size(cm, wb);
+ }
+
+ // Signals the frame size relative to the reference frames.  For each
+ // reference from LAST_FRAME to ALTREF_FRAME one "found" bit is written; the
+ // first reference whose upscaled and render dimensions all match ends the
+ // search and is followed by the superres scale.  When no reference matches,
+ // the size is written explicitly.
+ static void write_frame_size_with_refs(AV1_COMP *cpi,
+                                        struct aom_write_bit_buffer *wb) {
+   AV1_COMMON *const cm = &cpi->common;
+   int found = 0;
+
+   for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME;
+        ++ref_frame) {
+     YV12_BUFFER_CONFIG *const cfg = get_ref_frame_buffer(cpi, ref_frame);
+
+     if (cfg != NULL) {
+       found = cm->superres_upscaled_width == cfg->y_crop_width &&
+               cm->superres_upscaled_height == cfg->y_crop_height &&
+               cm->render_width == cfg->render_width &&
+               cm->render_height == cfg->render_height;
+     }
+     aom_wb_write_bit(wb, found);
+     if (found) {
+       write_superres_scale(cm, wb);
+       return;
+     }
+   }
+
+   // No matching reference: the override flag is always 1 on this path.
+   write_frame_size(cm, 1, wb);
+ }
+
+ // Writes the bitstream profile as a PROFILE_BITS-wide literal.
+ static void write_profile(BITSTREAM_PROFILE profile,
+                           struct aom_write_bit_buffer *wb) {
+   assert(PROFILE_0 <= profile && profile < MAX_PROFILES);
+   aom_wb_write_literal(wb, profile, PROFILE_BITS);
+ }
+
+ // Writes the bit depth.
+ //   Profile 0/1: [0] for 8-bit, [1] for 10-bit
+ //   Profile 2:   [0] for 8-bit, [10] for 10-bit, [11] for 12-bit
+ static void write_bitdepth(const SequenceHeader *const seq_params,
+                            struct aom_write_bit_buffer *wb) {
+   const int high_bitdepth = (seq_params->bit_depth != AOM_BITS_8);
+   aom_wb_write_bit(wb, high_bitdepth);
+
+   // Only profile 2 needs a second bit to tell 10-bit from 12-bit.
+   if (seq_params->profile == PROFILE_2 && high_bitdepth)
+     aom_wb_write_bit(wb, seq_params->bit_depth != AOM_BITS_10);
+ }
+
+ // Writes the colour configuration: bit depth, the monochrome flag (not coded
+ // for profile 1), the optional colour description, the colour range, the
+ // chroma subsampling bits (only coded for 12-bit profile 2), the chroma
+ // sample position (4:2:0 only) and finally the separate_uv_delta_q flag.
+ // Monochrome streams stop right after the colour range.
+ static void write_color_config(const SequenceHeader *const seq_params,
+                                struct aom_write_bit_buffer *wb) {
+   write_bitdepth(seq_params, wb);
+
+   // Monochrome bit.
+   const int is_monochrome = seq_params->monochrome;
+   if (seq_params->profile == PROFILE_1) {
+     assert(!is_monochrome);
+   } else {
+     aom_wb_write_bit(wb, is_monochrome);
+   }
+
+   // Colour description: present unless all three fields are unspecified.
+   const int all_unspecified =
+       seq_params->color_primaries == AOM_CICP_CP_UNSPECIFIED &&
+       seq_params->transfer_characteristics == AOM_CICP_TC_UNSPECIFIED &&
+       seq_params->matrix_coefficients == AOM_CICP_MC_UNSPECIFIED;
+   if (all_unspecified) {
+     aom_wb_write_bit(wb, 0);  // No color description present
+   } else {
+     aom_wb_write_bit(wb, 1);  // Color description present
+     aom_wb_write_literal(wb, seq_params->color_primaries, 8);
+     aom_wb_write_literal(wb, seq_params->transfer_characteristics, 8);
+     aom_wb_write_literal(wb, seq_params->matrix_coefficients, 8);
+   }
+
+   if (is_monochrome) {
+     // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+     aom_wb_write_bit(wb, seq_params->color_range);
+     return;
+   }
+
+   // It would be better to remove this dependency too.
+   const int is_srgb_identity =
+       seq_params->color_primaries == AOM_CICP_CP_BT_709 &&
+       seq_params->transfer_characteristics == AOM_CICP_TC_SRGB &&
+       seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY;
+
+   if (is_srgb_identity) {
+     // Neither colour range nor subsampling is coded for this combination.
+     assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+     assert(seq_params->profile == PROFILE_1 ||
+            (seq_params->profile == PROFILE_2 &&
+             seq_params->bit_depth == AOM_BITS_12));
+   } else {
+     // 0: [16, 235] (i.e. xvYCC), 1: [0, 255]
+     aom_wb_write_bit(wb, seq_params->color_range);
+
+     switch (seq_params->profile) {
+       case PROFILE_0:
+         // 420 only
+         assert(seq_params->subsampling_x == 1 &&
+                seq_params->subsampling_y == 1);
+         break;
+       case PROFILE_1:
+         // 444 only
+         assert(seq_params->subsampling_x == 0 &&
+                seq_params->subsampling_y == 0);
+         break;
+       case PROFILE_2:
+         if (seq_params->bit_depth != AOM_BITS_12) {
+           // 422 only
+           assert(seq_params->subsampling_x == 1 &&
+                  seq_params->subsampling_y == 0);
+           break;
+         }
+         // 420, 444 or 422
+         aom_wb_write_bit(wb, seq_params->subsampling_x);
+         if (seq_params->subsampling_x == 0) {
+           assert(seq_params->subsampling_y == 0 &&
+                  "4:4:0 subsampling not allowed in AV1");
+         } else {
+           aom_wb_write_bit(wb, seq_params->subsampling_y);
+         }
+         break;
+       default: break;
+     }
+
+     if (seq_params->matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+       assert(seq_params->subsampling_x == 0 && seq_params->subsampling_y == 0);
+     }
+     if (seq_params->subsampling_x == 1 && seq_params->subsampling_y == 1) {
+       aom_wb_write_literal(wb, seq_params->chroma_sample_position, 2);
+     }
+   }
+
+   aom_wb_write_bit(wb, seq_params->separate_uv_delta_q);
+ }
+
+ // Writes the timing info: units per display tick and time scale (32 bits
+ // each), the equal-picture-interval bit and, when that bit is set, the
+ // ticks per picture minus 1 as a uvlc code.
+ static void write_timing_info_header(AV1_COMMON *const cm,
+                                      struct aom_write_bit_buffer *wb) {
+   // Number of units in tick.
+   aom_wb_write_unsigned_literal(wb, cm->timing_info.num_units_in_display_tick,
+                                 32);
+   // Time scale.
+   aom_wb_write_unsigned_literal(wb, cm->timing_info.time_scale, 32);
+   // Equal picture interval bit.
+   aom_wb_write_bit(wb, cm->timing_info.equal_picture_interval);
+
+   if (!cm->timing_info.equal_picture_interval) return;
+
+   // Ticks per picture.
+   aom_wb_write_uvlc(wb, cm->timing_info.num_ticks_per_picture - 1);
+ }
+
+ // Writes the decoder model info: three field lengths (each minus 1, in 5
+ // bits) and, after the first of them, the 32-bit number of units in a
+ // decoding tick.
+ static void write_decoder_model_info(AV1_COMMON *const cm,
+                                      struct aom_write_bit_buffer *wb) {
+   const int delay_length_minus1 =
+       cm->buffer_model.encoder_decoder_buffer_delay_length - 1;
+   const int removal_length_minus1 =
+       cm->buffer_model.buffer_removal_time_length - 1;
+   const int presentation_length_minus1 =
+       cm->buffer_model.frame_presentation_time_length - 1;
+
+   aom_wb_write_literal(wb, delay_length_minus1, 5);
+   // Number of units in decoding tick.
+   aom_wb_write_unsigned_literal(
+       wb, cm->buffer_model.num_units_in_decoding_tick, 32);
+   aom_wb_write_literal(wb, removal_length_minus1, 5);
+   aom_wb_write_literal(wb, presentation_length_minus1, 5);
+ }
+
+ // Writes the decoder model parameters of operating point |op_num|: decoder
+ // and encoder buffer delays (both using the delay length from the buffer
+ // model) and the low-delay flag.  Also resets that operating point's buffer
+ // removal time.  Reports an error through aom_internal_error() when |op_num|
+ // exceeds MAX_NUM_OPERATING_POINTS.
+ static void write_dec_model_op_parameters(AV1_COMMON *const cm,
+                                           struct aom_write_bit_buffer *wb,
+                                           int op_num) {
+   if (op_num > MAX_NUM_OPERATING_POINTS) {
+     aom_internal_error(
+         &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+         "Encoder does not support %d decoder model operating points", op_num);
+   }
+
+   const int delay_bits = cm->buffer_model.encoder_decoder_buffer_delay_length;
+
+   aom_wb_write_unsigned_literal(wb, cm->op_params[op_num].decoder_buffer_delay,
+                                 delay_bits);
+   aom_wb_write_unsigned_literal(wb, cm->op_params[op_num].encoder_buffer_delay,
+                                 delay_bits);
+   aom_wb_write_bit(wb, cm->op_params[op_num].low_delay_mode_flag);
+
+   // Reset the decoded frame counter.
+   cm->op_frame_timing[op_num].buffer_removal_time = 0;
+ }
+
+ // Writes the frame presentation time, using the bit count stored in the
+ // buffer model.
+ static void write_tu_pts_info(AV1_COMMON *const cm,
+                               struct aom_write_bit_buffer *wb) {
+   const int num_bits = cm->buffer_model.frame_presentation_time_length;
+   aom_wb_write_unsigned_literal(wb, cm->frame_presentation_time, num_bits);
+ }
+
+ // Writes the film grain parameters of the current frame.
+ //
+ // The parameters are first copied into the current frame buffer.  After the
+ // apply bit and the random seed, inter frames carry an "update parameters"
+ // bit; when it is 0 only the index of a reference whose stored parameters
+ // match is written.  Otherwise the full parameter set follows: scaling
+ // points, AR coefficients, shifts, chroma multipliers and flags.
+ //
+ // Side effects on cm->film_grain_params: the random seed is advanced for the
+ // next frame, update_parameters is forced to 1 on non-inter frames, and the
+ // chroma fields are zeroed where the syntax does not code them.
+ static void write_film_grain_params(AV1_COMP *cpi,
+                                     struct aom_write_bit_buffer *wb) {
+   AV1_COMMON *const cm = &cpi->common;
+   aom_film_grain_t *pars = &cm->film_grain_params;
+
+   cm->cur_frame->film_grain_params = *pars;
+
+   aom_wb_write_bit(wb, pars->apply_grain);
+   if (!pars->apply_grain) return;
+
+   aom_wb_write_literal(wb, pars->random_seed, 16);
+
+   pars->random_seed += 3381;  // Changing random seed for film grain
+   if (!pars->random_seed)     // Random seed should not be zero
+     pars->random_seed += 7391;
+   if (cm->frame_type == INTER_FRAME)
+     aom_wb_write_bit(wb, pars->update_parameters);
+   else
+     pars->update_parameters = 1;
+   if (!pars->update_parameters) {
+     RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+     int ref_frame, buf_idx;
+     int ref_idx = 0;
+     for (ref_frame = LAST_FRAME; ref_frame < REF_FRAMES; ref_frame++) {
+       ref_idx = get_ref_frame_map_idx(cpi, ref_frame);
+       assert(ref_idx != INVALID_IDX);
+       buf_idx = cm->ref_frame_map[ref_idx];
+       if (!frame_bufs[buf_idx].film_grain_params_present) continue;
+
+       // The reference to signal is one whose parameters are the SAME as
+       // the current ones, so stop on a match (memcmp == 0).  The random seed
+       // and the update flag legitimately differ from frame to frame, so they
+       // are taken out of the comparison.
+       aom_film_grain_t candidate = frame_bufs[buf_idx].film_grain_params;
+       candidate.random_seed = pars->random_seed;
+       candidate.update_parameters = pars->update_parameters;
+       if (memcmp(pars, &candidate, sizeof(*pars)) == 0) break;
+     }
+     assert(ref_frame < REF_FRAMES);
+     aom_wb_write_literal(wb, ref_idx, 3);
+     return;
+   }
+
+   // Scaling functions parameters
+   aom_wb_write_literal(wb, pars->num_y_points, 4);  // max 14
+   for (int i = 0; i < pars->num_y_points; i++) {
+     aom_wb_write_literal(wb, pars->scaling_points_y[i][0], 8);
+     aom_wb_write_literal(wb, pars->scaling_points_y[i][1], 8);
+   }
+
+   if (!cm->seq_params.monochrome)
+     aom_wb_write_bit(wb, pars->chroma_scaling_from_luma);
+   else
+     pars->chroma_scaling_from_luma = 0;  // for monochrome override to 0
+
+   if (cm->seq_params.monochrome || pars->chroma_scaling_from_luma ||
+       ((cm->seq_params.subsampling_x == 1) &&
+        (cm->seq_params.subsampling_y == 1) && (pars->num_y_points == 0))) {
+     // Chroma scaling points are not coded in these cases.
+     pars->num_cb_points = 0;
+     pars->num_cr_points = 0;
+   } else {
+     aom_wb_write_literal(wb, pars->num_cb_points, 4);  // max 10
+     for (int i = 0; i < pars->num_cb_points; i++) {
+       aom_wb_write_literal(wb, pars->scaling_points_cb[i][0], 8);
+       aom_wb_write_literal(wb, pars->scaling_points_cb[i][1], 8);
+     }
+
+     aom_wb_write_literal(wb, pars->num_cr_points, 4);  // max 10
+     for (int i = 0; i < pars->num_cr_points; i++) {
+       aom_wb_write_literal(wb, pars->scaling_points_cr[i][0], 8);
+       aom_wb_write_literal(wb, pars->scaling_points_cr[i][1], 8);
+     }
+   }
+
+   aom_wb_write_literal(wb, pars->scaling_shift - 8, 2);  // 8 + value
+
+   // AR coefficients
+   // Only sent if the corresponding scaling function has
+   // more than 0 points
+
+   aom_wb_write_literal(wb, pars->ar_coeff_lag, 2);
+
+   int num_pos_luma = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+   int num_pos_chroma = num_pos_luma;
+   // Chroma gets one extra coefficient when luma has scaling points.
+   if (pars->num_y_points > 0) ++num_pos_chroma;
+
+   if (pars->num_y_points)
+     for (int i = 0; i < num_pos_luma; i++)
+       aom_wb_write_literal(wb, pars->ar_coeffs_y[i] + 128, 8);
+
+   if (pars->num_cb_points || pars->chroma_scaling_from_luma)
+     for (int i = 0; i < num_pos_chroma; i++)
+       aom_wb_write_literal(wb, pars->ar_coeffs_cb[i] + 128, 8);
+
+   if (pars->num_cr_points || pars->chroma_scaling_from_luma)
+     for (int i = 0; i < num_pos_chroma; i++)
+       aom_wb_write_literal(wb, pars->ar_coeffs_cr[i] + 128, 8);
+
+   aom_wb_write_literal(wb, pars->ar_coeff_shift - 6, 2);  // 6 + value
+
+   aom_wb_write_literal(wb, pars->grain_scale_shift, 2);
+
+   if (pars->num_cb_points) {
+     aom_wb_write_literal(wb, pars->cb_mult, 8);
+     aom_wb_write_literal(wb, pars->cb_luma_mult, 8);
+     aom_wb_write_literal(wb, pars->cb_offset, 9);
+   }
+
+   if (pars->num_cr_points) {
+     aom_wb_write_literal(wb, pars->cr_mult, 8);
+     aom_wb_write_literal(wb, pars->cr_luma_mult, 8);
+     aom_wb_write_literal(wb, pars->cr_offset, 9);
+   }
+
+   aom_wb_write_bit(wb, pars->overlap_flag);
+
+   aom_wb_write_bit(wb, pars->clip_to_restricted_range);
+ }
+
+ // Signals the superblock size with one bit: 1 when seq_params->sb_size is
+ // BLOCK_128X128, 0 otherwise (the asserts only admit BLOCK_64X64 as the
+ // alternative).
+ static void write_sb_size(SequenceHeader *seq_params,
+                           struct aom_write_bit_buffer *wb) {
+   // Explicit "use" of both parameters, as in the original.
+   (void)seq_params;
+   (void)wb;
+
+   // Sanity checks: mib_size must agree with both the block-size table and
+   // its own log2 value, and only two superblock sizes are accepted.
+   assert(mi_size_wide[seq_params->sb_size] == seq_params->mib_size);
+   assert((1 << seq_params->mib_size_log2) == seq_params->mib_size);
+   assert(seq_params->sb_size == BLOCK_64X64 ||
+          seq_params->sb_size == BLOCK_128X128);
+
+   const int use_128x128 = (seq_params->sb_size == BLOCK_128X128);
+   aom_wb_write_bit(wb, use_128x128);
+ }
+
+ // Writes the frame-size limits and the coding-tool enable flags of the
+ // sequence header to `wb`.
+ //
+ // Side effects: stores the derived bit counts, the maximum frame dimensions
+ // and (unless the reduced still-picture header is in use) the frame-id
+ // parameters into cm->seq_params before writing them.
+ static void write_sequence_header(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *seq_params = &cm->seq_params;
+
+ // A non-zero forced maximum overrides the configured frame dimension.
+ int max_frame_width = cpi->oxcf.forced_max_frame_width
+ ? cpi->oxcf.forced_max_frame_width
+ : cpi->oxcf.width;
+ int max_frame_height = cpi->oxcf.forced_max_frame_height
+ ? cpi->oxcf.forced_max_frame_height
+ : cpi->oxcf.height;
+ // max((int)ceil(log2(max_frame_width)), 1)
+ const int num_bits_width =
+ (max_frame_width > 1) ? get_msb(max_frame_width - 1) + 1 : 1;
+ // max((int)ceil(log2(max_frame_height)), 1)
+ const int num_bits_height =
+ (max_frame_height > 1) ? get_msb(max_frame_height - 1) + 1 : 1;
+ // The bit counts are coded minus one in 4 bits, so they must not exceed 16.
+ assert(num_bits_width <= 16);
+ assert(num_bits_height <= 16);
+
+ seq_params->num_bits_width = num_bits_width;
+ seq_params->num_bits_height = num_bits_height;
+ seq_params->max_frame_width = max_frame_width;
+ seq_params->max_frame_height = max_frame_height;
+
+ // Bit counts first, then the (minus one) dimensions using those counts.
+ aom_wb_write_literal(wb, num_bits_width - 1, 4);
+ aom_wb_write_literal(wb, num_bits_height - 1, 4);
+ aom_wb_write_literal(wb, max_frame_width - 1, num_bits_width);
+ aom_wb_write_literal(wb, max_frame_height - 1, num_bits_height);
+
+ // Frame id signaling; skipped entirely for the reduced still-picture header.
+ if (!seq_params->reduced_still_picture_hdr) {
+ seq_params->frame_id_numbers_present_flag =
+ cm->large_scale_tile ? 0 : cm->error_resilient_mode;
+ seq_params->frame_id_length = FRAME_ID_LENGTH;
+ seq_params->delta_frame_id_length = DELTA_FRAME_ID_LENGTH;
+
+ aom_wb_write_bit(wb, seq_params->frame_id_numbers_present_flag);
+ if (seq_params->frame_id_numbers_present_flag) {
+ // We must always have delta_frame_id_length < frame_id_length,
+ // in order for a frame to be referenced with a unique delta.
+ // Avoid wasting bits by using a coding that enforces this restriction.
+ aom_wb_write_literal(wb, seq_params->delta_frame_id_length - 2, 4);
+ aom_wb_write_literal(
+ wb,
+ seq_params->frame_id_length - seq_params->delta_frame_id_length - 1,
+ 3);
+ }
+ }
+
+ write_sb_size(seq_params, wb);
+
+ aom_wb_write_bit(wb, seq_params->enable_filter_intra);
+ aom_wb_write_bit(wb, seq_params->enable_intra_edge_filter);
+
+ // Inter-coding tool flags; not written for the reduced still-picture header.
+ if (!seq_params->reduced_still_picture_hdr) {
+ aom_wb_write_bit(wb, seq_params->enable_interintra_compound);
+ aom_wb_write_bit(wb, seq_params->enable_masked_compound);
+ aom_wb_write_bit(wb, seq_params->enable_warped_motion);
+ aom_wb_write_bit(wb, seq_params->enable_dual_filter);
+
+ aom_wb_write_bit(wb, seq_params->enable_order_hint);
+
+ if (seq_params->enable_order_hint) {
+ aom_wb_write_bit(wb, seq_params->enable_jnt_comp);
+ aom_wb_write_bit(wb, seq_params->enable_ref_frame_mvs);
+ }
+ // force_screen_content_tools == 2 is coded as a single 1 bit; any other
+ // value is coded as a 0 bit followed by the value itself in one bit.
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_screen_content_tools);
+ }
+ // force_integer_mv uses the same two-step coding, but is only written
+ // when screen content tools are not forced off; otherwise it must be 2.
+ if (seq_params->force_screen_content_tools > 0) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, 1);
+ } else {
+ aom_wb_write_bit(wb, 0);
+ aom_wb_write_bit(wb, seq_params->force_integer_mv);
+ }
+ } else {
+ assert(seq_params->force_integer_mv == 2);
+ }
+ if (seq_params->enable_order_hint)
+ aom_wb_write_literal(wb, seq_params->order_hint_bits_minus_1, 3);
+ }
+
+ aom_wb_write_bit(wb, seq_params->enable_superres);
+ aom_wb_write_bit(wb, seq_params->enable_cdef);
+ aom_wb_write_bit(wb, seq_params->enable_restoration);
+ }
+
+ // Writes one global motion model to `wb`.
+ //
+ // params:     the model to code; params->wmtype selects which of the
+ //             wmmat[] entries are written.
+ // ref_params: reference model; every coefficient is coded relative to the
+ //             matching ref_params coefficient through
+ //             aom_wb_write_signed_primitive_refsubexpfin().
+ // allow_hp:   when zero, a TRANSLATION-only model uses one bit less of range
+ //             and one more bit of precision reduction for wmmat[0..1].
+ static void write_global_motion_params(const WarpedMotionParams *params,
+ const WarpedMotionParams *ref_params,
+ struct aom_write_bit_buffer *wb,
+ int allow_hp) {
+ const TransformationType type = params->wmtype;
+
+ // Model type: "not IDENTITY" bit, then "is ROTZOOM" bit, then (if not
+ // ROTZOOM) "is TRANSLATION" bit.
+ aom_wb_write_bit(wb, type != IDENTITY);
+ if (type != IDENTITY) {
+ aom_wb_write_bit(wb, type == ROTZOOM);
+ if (type != ROTZOOM) aom_wb_write_bit(wb, type == TRANSLATION);
+ }
+
+ // wmmat[2] and wmmat[3]; wmmat[2] has (1 << GM_ALPHA_PREC_BITS) subtracted
+ // from both the reference and the coded value after the precision shift.
+ if (type >= ROTZOOM) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[2] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ }
+
+ // wmmat[4] and wmmat[5]; wmmat[5] gets the same offset as wmmat[2] above.
+ if (type >= AFFINE) {
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (params->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (params->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+
+ // wmmat[0] and wmmat[1], written last. Range and precision depend on
+ // whether the model is TRANSLATION-only and, in that case, on allow_hp.
+ if (type >= TRANSLATION) {
+ const int trans_bits = (type == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ const int trans_prec_diff = (type == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[0] >> trans_prec_diff),
+ (params->wmmat[0] >> trans_prec_diff));
+ aom_wb_write_signed_primitive_refsubexpfin(
+ wb, (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_params->wmmat[1] >> trans_prec_diff),
+ (params->wmmat[1] >> trans_prec_diff));
+ }
+ }
+
+ // Writes the global motion model of every inter reference, LAST_FRAME through
+ // ALTREF_FRAME, to `wb`. Each model is coded relative to the one stored for
+ // the same reference in cm->prev_frame, or relative to default_warp_params
+ // when cm->prev_frame is NULL.
+ static void write_global_motion(AV1_COMP *cpi,
+                                 struct aom_write_bit_buffer *wb) {
+   AV1_COMMON *const cm = &cpi->common;
+
+   for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+     const WarpedMotionParams *ref_params = &default_warp_params;
+     if (cm->prev_frame) ref_params = &cm->prev_frame->global_motion[ref];
+
+     write_global_motion_params(&cm->global_motion[ref], ref_params, wb,
+                                cm->allow_high_precision_mv);
+
+     // TODO(sarahparker, debargha): The logic in the commented out code below
+     // does not work currently and causes mismatches when resize is on.
+     // Fix it before turning the optimization back on.
+     /*
+     YV12_BUFFER_CONFIG *ref_buf = get_ref_frame_buffer(cpi, ref);
+     if (cpi->source->y_crop_width == ref_buf->y_crop_width &&
+         cpi->source->y_crop_height == ref_buf->y_crop_height) {
+       write_global_motion_params(&cm->global_motion[ref],
+                                  &cm->prev_frame->global_motion[ref], wb,
+                                  cm->allow_high_precision_mv);
+     } else {
+       assert(cm->global_motion[ref].wmtype == IDENTITY &&
+              "Invalid warp type for frames of different resolutions");
+     }
+     */
+     /*
+     printf("Frame %d/%d: Enc Ref %d: %d %d %d %d\n",
+            cm->current_video_frame, cm->show_frame, ref,
+            cm->global_motion[ref].wmmat[0],
+            cm->global_motion[ref].wmmat[1], cm->global_motion[ref].wmmat[2],
+            cm->global_motion[ref].wmmat[3]);
+     */
+   }
+ }
+
+ // Re-validates cm->frame_refs_short_signaling for the current frame and
+ // clears it when short signaling cannot be used, i.e. when
+ //   - two or more inter references share the same frame buffer, or
+ //   - the references derived by av1_set_frame_refs() from the LAST and
+ //     GOLDEN map indices differ from the encoder's own choice.
+ // When the flag ends up cleared after the derivation step, cm->frame_refs is
+ // restored to its value on entry.
+ static void check_frame_refs_short_signaling(AV1_COMP *const cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   if (!cm->frame_refs_short_signaling) return;
+
+   // Count the distinct frame buffers used by the inter references.
+   int seen[FRAME_BUFFERS] = { 0 };
+   int distinct_refs = 0;
+   for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+     const int buf = get_ref_frame_buf_idx(cpi, ref);
+     if (buf == INVALID_IDX) continue;
+     assert(buf >= 0 && buf < FRAME_BUFFERS);
+     if (!seen[buf]) {
+       seen[buf] = 1;
+       ++distinct_refs;
+     }
+   }
+
+   // Fewer distinct buffers than references means at least two references
+   // are duplicates; short signaling is only used when all are distinct.
+   if (distinct_refs < INTER_REFS_PER_FRAME) {
+     cm->frame_refs_short_signaling = 0;
+     return;
+   }
+
+   // Keep a copy of the encoder's choice, then let av1_set_frame_refs()
+   // overwrite cm->frame_refs following the short-signaling policy.
+   RefBuffer saved_refs[INTER_REFS_PER_FRAME];
+   memcpy(saved_refs, cm->frame_refs, sizeof(saved_refs));
+
+   const int last_map_idx = get_ref_frame_map_idx(cpi, LAST_FRAME);
+   const int golden_map_idx = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+   av1_set_frame_refs(cm, last_map_idx, golden_map_idx);
+
+   // Short signaling stays on only if every derived buffer index matches the
+   // encoder-side one.
+   int ref_idx = 0;
+   while (ref_idx < INTER_REFS_PER_FRAME &&
+          cm->frame_refs[ref_idx].idx == saved_refs[ref_idx].idx) {
+     ++ref_idx;
+   }
+   if (ref_idx < INTER_REFS_PER_FRAME) cm->frame_refs_short_signaling = 0;
+
+ #if 0 // For debug
+   printf("\nFrame=%d: \n", cm->current_video_frame);
+   printf("***frame_refs_short_signaling=%d\n", cm->frame_refs_short_signaling);
+   for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+     printf("enc_ref(map_idx=%d, buf_idx=%d)=%d, vs. "
+            "dec_ref(map_idx=%d, buf_idx=%d)=%d\n",
+            get_ref_frame_map_idx(cpi, ref_frame),
+            get_ref_frame_buf_idx(cpi, ref_frame), ref_frame,
+            cm->frame_refs[ref_frame - LAST_FRAME].map_idx,
+            cm->frame_refs[ref_frame - LAST_FRAME].idx, ref_frame);
+   }
+ #endif // 0
+
+   // Put the encoder's original references back when short signaling is off.
+   if (!cm->frame_refs_short_signaling) {
+     memcpy(cm->frame_refs, saved_refs, sizeof(saved_refs));
+   }
+ }
+
+ // New function based on HLS R18
+ //
+ // Writes the uncompressed frame header for the current frame to `wb`.
+ // `saved_wb` is only forwarded to write_tile_info() and
+ // write_ext_tile_info().
+ //
+ // Besides writing bits, this function updates encoder state, including
+ // cm->is_reference_frame, cm->frame_type, cm->frame_refs_short_signaling,
+ // cm->invalid_delta_frame_id_minus_1, cpi->refresh_frame_mask and
+ // cm->fb_of_context_type[]. For a show-existing-frame it returns right after
+ // the short header has been written.
+ static void write_uncompressed_header_obu(AV1_COMP *cpi,
+ struct aom_write_bit_buffer *saved_wb,
+ struct aom_write_bit_buffer *wb) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ // NOTE: By default all coded frames to be used as a reference
+ cm->is_reference_frame = 1;
+ cm->frame_type = cm->intra_only ? INTRA_ONLY_FRAME : cm->frame_type;
+
+ if (seq_params->still_picture) {
+ assert(cm->show_existing_frame == 0);
+ assert(cm->show_frame == 1);
+ assert(cm->frame_type == KEY_FRAME);
+ }
+ // With the reduced still-picture header none of show_existing_frame,
+ // frame_type, show_frame or error_resilient_mode is written.
+ if (!seq_params->reduced_still_picture_hdr) {
+ if (encode_show_existing_frame(cm)) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ const int frame_to_show = cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+ if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Buffer %d does not contain a reconstructed frame",
+ frame_to_show);
+ }
+ ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+ aom_wb_write_bit(wb, 1); // show_existing_frame
+ aom_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0) {
+ write_tu_pts_info(cm, wb);
+ }
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ int display_frame_id = cm->ref_frame_id[cpi->existing_fb_idx_to_show];
+ aom_wb_write_literal(wb, display_frame_id, frame_id_len);
+ }
+
+ if (cm->reset_decoder_state &&
+ frame_bufs[frame_to_show].frame_type != KEY_FRAME) {
+ aom_internal_error(
+ &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "show_existing_frame to reset state on KEY_FRAME only");
+ }
+
+ // Nothing else is written for a show-existing-frame header.
+ return;
+ } else {
+ aom_wb_write_bit(wb, 0); // show_existing_frame
+ }
+
+ aom_wb_write_literal(wb, cm->frame_type, 2);
+
+ aom_wb_write_bit(wb, cm->show_frame);
+ if (cm->show_frame) {
+ if (seq_params->decoder_model_info_present_flag &&
+ cm->timing_info.equal_picture_interval == 0)
+ write_tu_pts_info(cm, wb);
+ } else {
+ aom_wb_write_bit(wb, cm->showable_frame);
+ }
+ // error_resilient_mode is not written for S-frames (asserted to be on) or
+ // for shown key frames.
+ if (frame_is_sframe(cm)) {
+ assert(cm->error_resilient_mode);
+ } else if (!(cm->frame_type == KEY_FRAME && cm->show_frame)) {
+ aom_wb_write_bit(wb, cm->error_resilient_mode);
+ }
+ }
+ aom_wb_write_bit(wb, cm->disable_cdf_update);
+
+ // Per-frame screen-content / integer-MV flags are only written when the
+ // sequence-level value is 2; otherwise the frame value must equal it.
+ if (seq_params->force_screen_content_tools == 2) {
+ aom_wb_write_bit(wb, cm->allow_screen_content_tools);
+ } else {
+ assert(cm->allow_screen_content_tools ==
+ seq_params->force_screen_content_tools);
+ }
+
+ if (cm->allow_screen_content_tools) {
+ if (seq_params->force_integer_mv == 2) {
+ aom_wb_write_bit(wb, cm->cur_frame_force_integer_mv);
+ } else {
+ assert(cm->cur_frame_force_integer_mv == seq_params->force_integer_mv);
+ }
+ } else {
+ assert(cm->cur_frame_force_integer_mv == 0);
+ }
+
+ cm->invalid_delta_frame_id_minus_1 = 0;
+ int frame_size_override_flag = 0;
+ cm->frame_refs_short_signaling = 0;
+
+ if (seq_params->reduced_still_picture_hdr) {
+ assert(cm->width == seq_params->max_frame_width &&
+ cm->height == seq_params->max_frame_height);
+ } else {
+ if (seq_params->frame_id_numbers_present_flag) {
+ int frame_id_len = seq_params->frame_id_length;
+ aom_wb_write_literal(wb, cm->current_frame_id, frame_id_len);
+ }
+
+ if (cm->width > seq_params->max_frame_width ||
+ cm->height > seq_params->max_frame_height) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "Frame dimensions are larger than the maximum values");
+ }
+
+ // The override flag is implied (1) for S-frames and therefore not written;
+ // otherwise it is set when the frame size differs from the sequence
+ // maximum.
+ frame_size_override_flag =
+ frame_is_sframe(cm) ? 1
+ : (cm->width != seq_params->max_frame_width ||
+ cm->height != seq_params->max_frame_height);
+ if (!frame_is_sframe(cm)) aom_wb_write_bit(wb, frame_size_override_flag);
+
+ if (seq_params->enable_order_hint)
+ aom_wb_write_literal(wb, cm->frame_offset,
+ seq_params->order_hint_bits_minus_1 + 1);
+
+ if (!cm->error_resilient_mode && !frame_is_intra_only(cm)) {
+ aom_wb_write_literal(wb, cm->primary_ref_frame, PRIMARY_REF_BITS);
+ }
+ }
+
+ // Decoder model: buffer removal times, one per operating point that has
+ // decoder model parameters and either has idc == 0 or includes the current
+ // temporal and spatial layer in its idc mask.
+ if (seq_params->decoder_model_info_present_flag) {
+ aom_wb_write_bit(wb, cm->buffer_removal_time_present);
+ if (cm->buffer_removal_time_present) {
+ for (int op_num = 0;
+ op_num < seq_params->operating_points_cnt_minus_1 + 1; op_num++) {
+ if (cm->op_params[op_num].decoder_model_param_present_flag) {
+ if (((seq_params->operating_point_idc[op_num] >>
+ cm->temporal_layer_id) &
+ 0x1 &&
+ (seq_params->operating_point_idc[op_num] >>
+ (cm->spatial_layer_id + 8)) &
+ 0x1) ||
+ seq_params->operating_point_idc[op_num] == 0) {
+ aom_wb_write_unsigned_literal(
+ wb, cm->op_frame_timing[op_num].buffer_removal_time,
+ cm->buffer_model.buffer_removal_time_length);
+ // The stored removal time is advanced after each write; wrapping
+ // back to 0 is reported as an error.
+ cm->op_frame_timing[op_num].buffer_removal_time++;
+ if (cm->op_frame_timing[op_num].buffer_removal_time == 0) {
+ aom_internal_error(&cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+ "buffer_removal_time overflowed");
+ }
+ }
+ }
+ }
+ }
+ }
+ // Refresh mask: written for every frame type except shown key frames and
+ // S-frames, where it is asserted to be 0xFF instead.
+ cpi->refresh_frame_mask = get_refresh_mask(cpi);
+ if (cm->frame_type == KEY_FRAME) {
+ if (!cm->show_frame) { // unshown keyframe (forward keyframe)
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else {
+ assert(cpi->refresh_frame_mask == 0xFF);
+ }
+ } else {
+ if (cm->frame_type == INTRA_ONLY_FRAME) {
+ assert(cpi->refresh_frame_mask != 0xFF);
+ int updated_fb = -1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ // If more than one frame is refreshed, it doesn't matter which one
+ // we pick, so pick the first.
+ if (cpi->refresh_frame_mask & (1 << i)) {
+ updated_fb = i;
+ break;
+ }
+ }
+ assert(updated_fb >= 0);
+ cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+ if (cm->frame_type == INTER_FRAME) {
+ aom_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+ } else {
+ assert(frame_is_sframe(cm) && cpi->refresh_frame_mask == 0xFF);
+ }
+ int updated_fb = -1;
+ for (int i = 0; i < REF_FRAMES; i++) {
+ // If more than one frame is refreshed, it doesn't matter which one
+ // we pick, so pick the first.
+ if (cpi->refresh_frame_mask & (1 << i)) {
+ updated_fb = i;
+ break;
+ }
+ }
+ // large scale tile sometimes won't refresh any fbs
+ if (updated_fb >= 0) {
+ cm->fb_of_context_type[cm->frame_context_idx] = updated_fb;
+ }
+
+ if (!cpi->refresh_frame_mask) {
+ // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+ // will not be used as a reference
+ cm->is_reference_frame = 0;
+ }
+ }
+ }
+
+ if (!frame_is_intra_only(cm) || cpi->refresh_frame_mask != 0xFF) {
+ // Write all ref frame order hints if error_resilient_mode == 1
+ if (cm->error_resilient_mode && seq_params->enable_order_hint) {
+ RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+ for (int ref_idx = 0; ref_idx < REF_FRAMES; ref_idx++) {
+ // Get buffer index
+ const int buf_idx = cm->ref_frame_map[ref_idx];
+ assert(buf_idx >= 0 && buf_idx < FRAME_BUFFERS);
+
+ // Write order hint to bit stream
+ aom_wb_write_literal(wb, frame_bufs[buf_idx].cur_frame_offset,
+ seq_params->order_hint_bits_minus_1 + 1);
+ }
+ }
+ }
+
+ // Frame size, and per frame type either the intrabc flag (key / intra-only
+ // frames) or the reference and motion-vector setup (inter frames and
+ // S-frames).
+ if (cm->frame_type == KEY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, cm->allow_intrabc);
+ // all eight fbs are refreshed, pick one that will live long enough
+ cm->fb_of_context_type[REGULAR_FRAME] = 0;
+ } else {
+ if (cm->frame_type == INTRA_ONLY_FRAME) {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ assert(!av1_superres_scaled(cm) || !cm->allow_intrabc);
+ if (cm->allow_screen_content_tools && !av1_superres_scaled(cm))
+ aom_wb_write_bit(wb, cm->allow_intrabc);
+ } else if (cm->frame_type == INTER_FRAME || frame_is_sframe(cm)) {
+ MV_REFERENCE_FRAME ref_frame;
+
+ // NOTE: Error resilient mode turns off frame_refs_short_signaling
+ // automatically.
+ // With FRAME_REFS_SHORT_SIGNALING defined to 0 the flag keeps the value 0
+ // assigned earlier in this function, so the short-signaling branches
+ // below are not taken.
+ #define FRAME_REFS_SHORT_SIGNALING 0
+ #if FRAME_REFS_SHORT_SIGNALING
+ cm->frame_refs_short_signaling = seq_params->enable_order_hint;
+ #endif // FRAME_REFS_SHORT_SIGNALING
+
+ if (cm->frame_refs_short_signaling) {
+ // NOTE(zoeliu@google.com):
+ // An example solution for encoder-side implementation on frame refs
+ // short signaling, which is only turned on when the encoder side
+ // decision on ref frames is identical to that at the decoder side.
+ check_frame_refs_short_signaling(cpi);
+ }
+
+ if (seq_params->enable_order_hint)
+ aom_wb_write_bit(wb, cm->frame_refs_short_signaling);
+
+ if (cm->frame_refs_short_signaling) {
+ const int lst_ref = get_ref_frame_map_idx(cpi, LAST_FRAME);
+ aom_wb_write_literal(wb, lst_ref, REF_FRAMES_LOG2);
+
+ const int gld_ref = get_ref_frame_map_idx(cpi, GOLDEN_FRAME);
+ aom_wb_write_literal(wb, gld_ref, REF_FRAMES_LOG2);
+ }
+
+ // Per reference: the map index (unless short signaling is on) and, when
+ // frame ids are present, the frame id delta relative to the current frame.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
+ if (!cm->frame_refs_short_signaling)
+ aom_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
+ REF_FRAMES_LOG2);
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i = get_ref_frame_map_idx(cpi, ref_frame);
+ int frame_id_len = seq_params->frame_id_length;
+ int diff_len = seq_params->delta_frame_id_length;
+ // Difference of the frame ids modulo 2^frame_id_len, minus one.
+ int delta_frame_id_minus_1 =
+ ((cm->current_frame_id - cm->ref_frame_id[i] +
+ (1 << frame_id_len)) %
+ (1 << frame_id_len)) -
+ 1;
+ // An out-of-range delta is only flagged here; it is still written.
+ if (delta_frame_id_minus_1 < 0 ||
+ delta_frame_id_minus_1 >= (1 << diff_len))
+ cm->invalid_delta_frame_id_minus_1 = 1;
+ aom_wb_write_literal(wb, delta_frame_id_minus_1, diff_len);
+ }
+ }
+
+ if (!cm->error_resilient_mode && frame_size_override_flag) {
+ write_frame_size_with_refs(cpi, wb);
+ } else {
+ write_frame_size(cm, frame_size_override_flag, wb);
+ }
+
+ // Forced integer MVs imply no high-precision MVs, so the bit is skipped.
+ if (cm->cur_frame_force_integer_mv) {
+ cm->allow_high_precision_mv = 0;
+ } else {
+ aom_wb_write_bit(wb, cm->allow_high_precision_mv);
+ }
+ fix_interp_filter(cm, cpi->td.counts);
+ write_frame_interp_filter(cm->interp_filter, wb);
+ aom_wb_write_bit(wb, cm->switchable_motion_mode);
+ if (frame_might_allow_ref_frame_mvs(cm)) {
+ aom_wb_write_bit(wb, cm->allow_ref_frame_mvs);
+ } else {
+ assert(cm->allow_ref_frame_mvs == 0);
+ }
+ }
+ }
+
+ const int might_bwd_adapt =
+ !(seq_params->reduced_still_picture_hdr) && !(cm->disable_cdf_update);
+ if (cm->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ // The written bit is 1 when frame context refresh is disabled.
+ if (might_bwd_adapt) {
+ aom_wb_write_bit(
+ wb, cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_DISABLED);
+ }
+
+ write_tile_info(cm, saved_wb, wb);
+ encode_quantization(cm, wb);
+ encode_segmentation(cm, xd, wb);
+
+ // Delta-q / delta-lf signaling; only present when base_qindex > 0.
+ if (cm->delta_q_present_flag) assert(cm->base_qindex > 0);
+ if (cm->base_qindex > 0) {
+ aom_wb_write_bit(wb, cm->delta_q_present_flag);
+ if (cm->delta_q_present_flag) {
+ aom_wb_write_literal(wb, get_msb(cm->delta_q_res), 2);
+ xd->current_qindex = cm->base_qindex;
+ if (cm->allow_intrabc)
+ assert(cm->delta_lf_present_flag == 0);
+ else
+ aom_wb_write_bit(wb, cm->delta_lf_present_flag);
+ if (cm->delta_lf_present_flag) {
+ aom_wb_write_literal(wb, get_msb(cm->delta_lf_res), 2);
+ aom_wb_write_bit(wb, cm->delta_lf_multi);
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+ }
+
+ // In-loop filter parameters are skipped entirely when all_lossless is set;
+ // loop filter and CDEF are additionally skipped when coded_lossless is set.
+ if (cm->all_lossless) {
+ assert(!av1_superres_scaled(cm));
+ } else {
+ if (!cm->coded_lossless) {
+ encode_loopfilter(cm, wb);
+ encode_cdef(cm, wb);
+ }
+ encode_restoration_mode(cm, wb);
+ }
+
+ write_tx_mode(cm, &cm->tx_mode, wb);
+
+ if (cpi->allow_comp_inter_inter) {
+ const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
+
+ aom_wb_write_bit(wb, use_hybrid_pred);
+ }
+
+ if (cm->is_skip_mode_allowed) aom_wb_write_bit(wb, cm->skip_mode_flag);
+
+ if (frame_might_allow_warped_motion(cm))
+ aom_wb_write_bit(wb, cm->allow_warped_motion);
+ else
+ assert(!cm->allow_warped_motion);
+
+ aom_wb_write_bit(wb, cm->reduced_tx_set_used);
+
+ if (!frame_is_intra_only(cm)) write_global_motion(cpi, wb);
+
+ // Film grain parameters are written for shown or showable frames. For
+ // frames that are not INTER_FRAME, update_parameters is forced to 1 for the
+ // duration of the write and then restored.
+ if (seq_params->film_grain_params_present &&
+ (cm->show_frame || cm->showable_frame)) {
+ int flip_back_update_parameters_flag = 0;
+ if (cm->frame_type != INTER_FRAME &&
+ cm->film_grain_params.update_parameters == 0) {
+ cm->film_grain_params.update_parameters = 1;
+ flip_back_update_parameters_flag = 1;
+ }
+ write_film_grain_params(cpi, wb);
+
+ if (flip_back_update_parameters_flag)
+ cm->film_grain_params.update_parameters = 0;
+ }
+
+ if (cm->large_scale_tile) write_ext_tile_info(cm, saved_wb, wb);
+ }
+
+ // Returns how many bytes (1..4) are needed to store `size` when the top
+ // `spare_msbs` bits of the field are reserved and must stay clear of the
+ // value. Returns -1 when `size` does not fit into (32 - spare_msbs) bits.
+ static int choose_size_bytes(uint32_t size, int spare_msbs) {
+   // Reject values that would spill into the reserved top bits of 4 bytes.
+   if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0) return -1;
+
+   // Shift the value up so the reserved bits count as part of its width.
+   const uint32_t normalised = size << spare_msbs;
+
+   // Drop leading all-zero bytes, keeping at least one byte.
+   int num_bytes = 4;
+   while (num_bytes > 1 && (normalised >> (8 * (num_bytes - 1))) == 0) {
+     --num_bytes;
+   }
+   return num_bytes;
+ }
+
+ // Stores `val` at `dst` using exactly `sz` bytes. One byte is written
+ // directly (low 8 bits of `val`); 2, 3 and 4 bytes go through the
+ // mem_put_le16/24/32 helpers. Any other `sz` trips an assert.
+ static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+   if (sz == 1) {
+     dst[0] = (uint8_t)(val & 0xff);
+   } else if (sz == 2) {
+     mem_put_le16(dst, val);
+   } else if (sz == 3) {
+     mem_put_le24(dst, val);
+   } else if (sz == 4) {
+     mem_put_le32(dst, val);
+   } else {
+     assert(0 && "Invalid size");
+   }
+ }
+
+ // Compacts, in place, tile data in `dst` that was written with 4-byte size
+ // fields, so that each size field only takes as many bytes as needed.
+ //
+ // dst:                 buffer holding `data_size` bytes of tile data.
+ // max_tile_size:       largest tile size; determines the tile size field
+ //                      width.
+ // max_tile_col_size:   largest tile column size; only used when
+ //                      cm->large_scale_tile is set.
+ // tile_size_bytes:     out, chosen width of a tile size field.
+ // tile_col_size_bytes: out, chosen width of a tile column size field.
+ //
+ // Returns the size of the data after compaction; `data_size` unchanged when
+ // both widths are 4 and nothing has to move.
+ static int remux_tiles(const AV1_COMMON *const cm, uint8_t *dst,
+ const uint32_t data_size, const uint32_t max_tile_size,
+ const uint32_t max_tile_col_size,
+ int *const tile_size_bytes,
+ int *const tile_col_size_bytes) {
+ // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+ int tsb;
+ int tcsb;
+
+ if (cm->large_scale_tile) {
+ // The top bit in the tile size field indicates tile copy mode, so we
+ // have 1 less bit to code the tile size
+ tsb = choose_size_bytes(max_tile_size, 1);
+ tcsb = choose_size_bytes(max_tile_col_size, 0);
+ } else {
+ tsb = choose_size_bytes(max_tile_size, 0);
+ tcsb = 4; // This is ignored
+ (void)max_tile_col_size;
+ }
+
+ // choose_size_bytes() returns -1 when the value does not fit.
+ assert(tsb > 0);
+ assert(tcsb > 0);
+
+ *tile_size_bytes = tsb;
+ *tile_col_size_bytes = tcsb;
+ // Already at full width: nothing to rewrite.
+ if (tsb == 4 && tcsb == 4) return data_size;
+
+ // Write and read cursors into dst; the write cursor never passes the read
+ // cursor because fields only shrink.
+ uint32_t wpos = 0;
+ uint32_t rpos = 0;
+
+ if (cm->large_scale_tile) {
+ int tile_row;
+ int tile_col;
+
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ // All but the last column has a column header
+ if (tile_col < cm->tile_cols - 1) {
+ uint32_t tile_col_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // Adjust the tile column size by the number of bytes removed
+ // from the tile size fields.
+ tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+ mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+ wpos += tcsb;
+ }
+
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ // All, including the last row has a header
+ uint32_t tile_header = mem_get_le32(dst + rpos);
+ rpos += 4;
+
+ // If this is a copy tile, we need to shift the MSB to the
+ // top bit of the new width, and there is no data to copy.
+ if (tile_header >> 31 != 0) {
+ if (tsb < 4) tile_header >>= 32 - 8 * tsb;
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+ } else {
+ mem_put_varsize(dst + wpos, tsb, tile_header);
+ wpos += tsb;
+
+ // The stored header is the payload size minus
+ // AV1_MIN_TILE_SIZE_BYTES; add it back to get the byte count to move.
+ tile_header += AV1_MIN_TILE_SIZE_BYTES;
+ memmove(dst + wpos, dst + rpos, tile_header);
+ rpos += tile_header;
+ wpos += tile_header;
+ }
+ }
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+ // Regular (non large-scale) tiles: every tile except the last is preceded
+ // by a size field; the last tile takes whatever data remains.
+ const int n_tiles = cm->tile_cols * cm->tile_rows;
+ int n;
+
+ for (n = 0; n < n_tiles; n++) {
+ int tile_size;
+
+ if (n == n_tiles - 1) {
+ tile_size = data_size - rpos;
+ } else {
+ tile_size = mem_get_le32(dst + rpos);
+ rpos += 4;
+ mem_put_varsize(dst + wpos, tsb, tile_size);
+ // The stored value is the payload size minus AV1_MIN_TILE_SIZE_BYTES.
+ tile_size += AV1_MIN_TILE_SIZE_BYTES;
+ wpos += tsb;
+ }
+
+ memmove(dst + wpos, dst + rpos, tile_size);
+
+ rpos += tile_size;
+ wpos += tile_size;
+ }
+
+ assert(rpos > wpos);
+ assert(rpos == data_size);
+
+ return wpos;
+ }
+
+ // Writes an OBU header for `obu_type` at `dst` and returns its size in
+ // bytes. The first byte carries, in order: the forbidden bit (0), the 4-bit
+ // type, the extension flag, obu_has_payload_length_field (always 1) and a
+ // reserved 0 bit. When `obu_extension` is non-zero its low 8 bits follow as
+ // an extension byte.
+ uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+                           uint8_t *const dst) {
+   struct aom_write_bit_buffer wb = { dst, 0 };
+   const int has_extension = obu_extension ? 1 : 0;
+
+   aom_wb_write_literal(&wb, 0, 1);  // forbidden bit.
+   aom_wb_write_literal(&wb, (int)obu_type, 4);
+   aom_wb_write_literal(&wb, has_extension, 1);
+   aom_wb_write_literal(&wb, 1, 1);  // obu_has_payload_length_field
+   aom_wb_write_literal(&wb, 0, 1);  // reserved
+
+   if (obu_extension) {
+     aom_wb_write_literal(&wb, obu_extension & 0xFF, 8);
+   }
+
+   const uint32_t header_bytes = aom_wb_bytes_written(&wb);
+   return header_bytes;
+ }
+
+ // Encodes `obu_payload_size` with aom_uleb_encode() into `dest`, starting
+ // right after the `obu_header_size` header bytes.
+ // Returns AOM_CODEC_OK on success, AOM_CODEC_ERROR if the encoding fails.
+ int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+                         uint8_t *dest) {
+   uint8_t *const size_field = dest + obu_header_size;
+   size_t coded_obu_size = 0;  // filled in by the encoder, not used here
+
+   const int failed =
+       aom_uleb_encode(obu_payload_size, sizeof(obu_payload_size), size_field,
+                       &coded_obu_size) != 0;
+   return failed ? AOM_CODEC_ERROR : AOM_CODEC_OK;
+ }
+
+ // Opens a gap for the payload length field: moves the `obu_payload_size`
+ // payload bytes that directly follow the header in `data` forward by the
+ // size aom_uleb_size_in_bytes() reports for that length.
+ // Returns the size of the length field in bytes.
+ static size_t obu_memmove(uint32_t obu_header_size, uint32_t obu_payload_size,
+                           uint8_t *data) {
+   const size_t length_field_size = aom_uleb_size_in_bytes(obu_payload_size);
+   const uint32_t src_offset = obu_header_size;
+   const uint32_t dst_offset = (uint32_t)length_field_size + obu_header_size;
+
+   // Source and destination overlap, hence memmove rather than memcpy.
+   memmove(data + dst_offset, data + src_offset, obu_payload_size);
+   return length_field_size;
+ }
+
+ // Appends the trailing bits: a single 1 bit when the buffer is not byte
+ // aligned, or a full 0x80 byte when it already is.
+ static void add_trailing_bits(struct aom_write_bit_buffer *wb) {
+   if (!aom_wb_is_byte_aligned(wb)) {
+     // assumes that the other bits are already 0s
+     aom_wb_write_bit(wb, 1);
+     return;
+   }
+   aom_wb_write_literal(wb, 0x80, 8);
+ }
+
+ // Converts the level `bl` to its seq_level_idx and writes it to `wb` using
+ // LEVEL_BITS bits. The index is asserted to be valid.
+ static void write_bitstream_level(BitstreamLevel bl,
+                                   struct aom_write_bit_buffer *wb) {
+   const uint8_t level_idx = major_minor_to_seq_level_idx(bl);
+   assert(is_valid_seq_level_idx(level_idx));
+   aom_wb_write_literal(wb, level_idx, LEVEL_BITS);
+ }
+
+ // Writes the payload of a sequence header OBU to `dst` and returns the
+ // number of bytes written, trailing bits included.
+ //
+ // Order of the fields: profile, still-picture flags, then either a single
+ // level (reduced still-picture header) or the timing / decoder model /
+ // operating point information, followed by write_sequence_header(), the
+ // color config and the film grain presence flag.
+ uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst) {
+ AV1_COMMON *const cm = &cpi->common;
+ struct aom_write_bit_buffer wb = { dst, 0 };
+ uint32_t size = 0;
+
+ write_profile(cm->seq_params.profile, &wb);
+
+ // Still picture or not
+ aom_wb_write_bit(&wb, cm->seq_params.still_picture);
+ // The reduced header may only be used for still pictures.
+ assert(IMPLIES(!cm->seq_params.still_picture,
+ !cm->seq_params.reduced_still_picture_hdr));
+ // whether to use reduced still picture header
+ aom_wb_write_bit(&wb, cm->seq_params.reduced_still_picture_hdr);
+
+ if (cm->seq_params.reduced_still_picture_hdr) {
+ // Reduced header: no timing or model information, one level only.
+ assert(cm->timing_info_present == 0);
+ assert(cm->seq_params.decoder_model_info_present_flag == 0);
+ assert(cm->seq_params.display_model_info_present_flag == 0);
+ write_bitstream_level(cm->seq_params.level[0], &wb);
+ } else {
+ aom_wb_write_bit(&wb, cm->timing_info_present); // timing info present flag
+
+ if (cm->timing_info_present) {
+ // timing_info
+ write_timing_info_header(cm, &wb);
+ aom_wb_write_bit(&wb, cm->seq_params.decoder_model_info_present_flag);
+ if (cm->seq_params.decoder_model_info_present_flag) {
+ write_decoder_model_info(cm, &wb);
+ }
+ }
+ aom_wb_write_bit(&wb, cm->seq_params.display_model_info_present_flag);
+ aom_wb_write_literal(&wb, cm->seq_params.operating_points_cnt_minus_1,
+ OP_POINTS_CNT_MINUS_1_BITS);
+ int i;
+ // Per operating point: idc, level, tier (only for level major > 3) and the
+ // optional decoder / display model parameters.
+ for (i = 0; i < cm->seq_params.operating_points_cnt_minus_1 + 1; i++) {
+ aom_wb_write_literal(&wb, cm->seq_params.operating_point_idc[i],
+ OP_POINTS_IDC_BITS);
+ write_bitstream_level(cm->seq_params.level[i], &wb);
+ if (cm->seq_params.level[i].major > 3)
+ aom_wb_write_bit(&wb, cm->seq_params.tier[i]);
+ if (cm->seq_params.decoder_model_info_present_flag) {
+ aom_wb_write_bit(&wb,
+ cm->op_params[i].decoder_model_param_present_flag);
+ if (cm->op_params[i].decoder_model_param_present_flag)
+ write_dec_model_op_parameters(cm, &wb, i);
+ }
+ if (cm->seq_params.display_model_info_present_flag) {
+ aom_wb_write_bit(&wb,
+ cm->op_params[i].display_model_param_present_flag);
+ if (cm->op_params[i].display_model_param_present_flag) {
+ // The delay is coded minus one in 4 bits.
+ assert(cm->op_params[i].initial_display_delay <= 10);
+ aom_wb_write_literal(&wb, cm->op_params[i].initial_display_delay - 1,
+ 4);
+ }
+ }
+ }
+ }
+ write_sequence_header(cpi, &wb);
+
+ write_color_config(&cm->seq_params, &wb);
+
+ aom_wb_write_bit(&wb, cm->seq_params.film_grain_params_present);
+
+ add_trailing_bits(&wb);
+
+ size = aom_wb_bytes_written(&wb);
+ return size;
+ }
+
+ // Writes the uncompressed frame header at `dst`, followed by the trailing
+ // bits when `append_trailing_bits` is non-zero. `saved_wb` is passed through
+ // to write_uncompressed_header_obu(). Returns the number of bytes written.
+ static uint32_t write_frame_header_obu(AV1_COMP *cpi,
+                                        struct aom_write_bit_buffer *saved_wb,
+                                        uint8_t *const dst,
+                                        int append_trailing_bits) {
+   struct aom_write_bit_buffer wb = { dst, 0 };
+
+   write_uncompressed_header_obu(cpi, saved_wb, &wb);
+   if (append_trailing_bits) {
+     add_trailing_bits(&wb);
+   }
+
+   return aom_wb_bytes_written(&wb);
+ }
+
+ // Writes a tile group header at `dst` and returns its size in bytes.
+ // Nothing is written (and 0 is returned) when `tiles_log2` is zero.
+ // Otherwise the header is the tile_start_and_end_present_flag bit, followed,
+ // when the flag is set, by `startTile` and `endTile` in `tiles_log2` bits
+ // each.
+ static uint32_t write_tile_group_header(uint8_t *const dst, int startTile,
+                                         int endTile, int tiles_log2,
+                                         int tile_start_and_end_present_flag) {
+   if (tiles_log2 == 0) return 0;
+
+   struct aom_write_bit_buffer wb = { dst, 0 };
+   aom_wb_write_bit(&wb, tile_start_and_end_present_flag);
+
+   if (tile_start_and_end_present_flag) {
+     aom_wb_write_literal(&wb, startTile, tiles_log2);
+     aom_wb_write_literal(&wb, endTile, tiles_log2);
+   }
+
+   const uint32_t header_bytes = aom_wb_bytes_written(&wb);
+   return header_bytes;
+ }
+
+ // Information about a frame header, handed to write_tiles_in_tg_obus() as a
+ // read-only pointer.
+ // NOTE(review): the field descriptions below are inferred from the field
+ // names only; confirm them against the code that fills in this struct.
+ typedef struct {
+ uint8_t *frame_header;  // presumably the start of the written frame header
+ size_t obu_header_byte_offset;  // presumably the offset of the OBU header byte
+ size_t total_length;  // presumably the total length in bytes
+ } FrameHeaderInfo;
+
+static uint32_t write_tiles_in_tg_obus(AV1_COMP *const cpi, uint8_t *const dst,
+ struct aom_write_bit_buffer *saved_wb,
+ uint8_t obu_extension_header,
+ const FrameHeaderInfo *fh_info) {  // Returns the bytes written to 'dst'.
+ AV1_COMMON *const cm = &cpi->common;
+ aom_writer mode_bc;  // entropy writer, restarted for every tile
+ int tile_row, tile_col;
+ TileBufferEnc(*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
+ uint32_t total_size = 0;  // bytes produced so far; returned at the end
+ const int tile_cols = cm->tile_cols;
+ const int tile_rows = cm->tile_rows;
+ unsigned int tile_size = 0;
+ unsigned int max_tile_size = 0;
+ unsigned int max_tile_col_size = 0;  // only updated on the large_scale_tile path
+ const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
+ // Fixed size tile groups for the moment
+ const int num_tg_hdrs = cm->num_tg;
+ const int tg_size =
+ (cm->large_scale_tile)
+ ? 1
+ : (tile_rows * tile_cols + num_tg_hdrs - 1) / num_tg_hdrs;  // ceiling division
+ int tile_count = 0;  // tiles emitted so far in the current tile group
+ int curr_tg_data_size = 0;  // bytes of the current tile group OBU so far
+ uint8_t *data = dst;
+ int new_tg = 1;  // set when the next tile starts a new tile group
+ const int have_tiles = tile_cols * tile_rows > 1;
+ int first_tg = 1;  // cleared once the first tile group has been finished
+
+ cm->largest_tile_id = 0;
+
+ if (cm->large_scale_tile) {
+ // For large_scale_tile case, we always have only one tile group, so it can
+ // be written as an OBU_FRAME.
+ const OBU_TYPE obu_type = OBU_FRAME;
+ const uint32_t tg_hdr_size = write_obu_header(obu_type, 0, data);
+ data += tg_hdr_size;
+
+ const uint32_t frame_header_size =
+ write_frame_header_obu(cpi, saved_wb, data, 0);
+ data += frame_header_size;
+ total_size += frame_header_size;
+
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+ {
+ char fn[20] = "./fh";
+ fn[4] = cm->current_video_frame / 100 + '0';
+ fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+ fn[6] = (cm->current_video_frame % 10) + '0';
+ fn[7] = '\0';
+ av1_print_uncompressed_frame_header(data - frame_header_size,
+ frame_header_size, fn);
+ }
+#endif // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+ int tile_size_bytes = 0;
+ int tile_col_size_bytes = 0;
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ TileInfo tile_info;
+ const int is_last_col = (tile_col == tile_cols - 1);
+ const uint32_t col_offset = total_size;
+
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ // The last column does not have a column header
+ if (!is_last_col) total_size += 4;
+
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ const int data_offset = have_tiles ? 4 : 0;
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ buf->data = dst + total_size + tg_hdr_size;
+
+ // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+ // even for the last one, unless no tiling is used at all.
+ total_size += data_offset;
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = !cm->large_scale_tile;  // always 0 on this path
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+ aom_start_encode(&mode_bc, buf->data + data_offset);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+ buf->size = tile_size;
+
+ // Record the maximum tile size we see, so we can compact headers later.
+ if (tile_size > max_tile_size) {
+ max_tile_size = tile_size;
+ cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ }
+
+ if (have_tiles) {
+ // tile header: size of this tile, or copy offset
+ uint32_t tile_header = tile_size - AV1_MIN_TILE_SIZE_BYTES;
+ const int tile_copy_mode =
+ ((AOMMAX(cm->tile_width, cm->tile_height) << MI_SIZE_LOG2) <= 256)
+ ? 1
+ : 0;
+
+ // If tile_copy_mode = 1, check if this tile is a copy tile.
+ // Very low chances to have copy tiles on the key frames, so don't
+ // search on key frames to reduce unnecessary search.
+ if (cm->frame_type != KEY_FRAME && tile_copy_mode) {
+ const int identical_tile_offset =
+ find_identical_tile(tile_row, tile_col, tile_buffers);
+
+ if (identical_tile_offset > 0) {
+ tile_size = 0;  // a copy tile contributes no payload bytes
+ tile_header = identical_tile_offset | 0x80;
+ tile_header <<= 24;  // flag bit and offset end up in the top byte
+ }
+ }
+
+ mem_put_le32(buf->data, tile_header);
+ }
+
+ total_size += tile_size;
+ }
+
+ if (!is_last_col) {
+ uint32_t col_size = total_size - col_offset - 4;  // excludes the 4-byte header
+ mem_put_le32(dst + col_offset + tg_hdr_size, col_size);
+
+ // Record the maximum tile column size we see.
+ max_tile_col_size = AOMMAX(max_tile_col_size, col_size);
+ }
+ }
+
+ if (have_tiles) {
+ total_size = remux_tiles(cm, data, total_size - frame_header_size,
+ max_tile_size, max_tile_col_size,
+ &tile_size_bytes, &tile_col_size_bytes);
+ total_size += frame_header_size;
+ }
+
+ // In EXT_TILE case, only use 1 tile group. Follow the obu syntax, write
+ // current tile group size before tile data(include tile column header).
+ // Tile group size doesn't include the bytes storing tg size.
+ total_size += tg_hdr_size;
+ const uint32_t obu_payload_size = total_size - tg_hdr_size;
+ const size_t length_field_size =
+ obu_memmove(tg_hdr_size, obu_payload_size, dst);
+ if (write_uleb_obu_size(tg_hdr_size, obu_payload_size, dst) !=
+ AOM_CODEC_OK) {
+ assert(0);
+ }
+ total_size += (uint32_t)length_field_size;
+ saved_wb->bit_buffer += length_field_size;  // header moved by the size field
+
+ // Now fill in the gaps in the uncompressed header.
+ if (have_tiles) {
+ assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_col_size_bytes - 1, 2);
+
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+ }
+ return total_size;
+ }
+
+ uint32_t obu_header_size = 0;
+ uint8_t *tile_data_start = dst + total_size;
+ for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+
+ for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+ const int tile_idx = tile_row * tile_cols + tile_col;
+ TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+ TileDataEnc *this_tile = &cpi->tile_data[tile_idx];
+ int is_last_tile_in_tg = 0;
+
+ if (new_tg) {
+ data = dst + total_size;
+
+ // A new tile group begins at this tile. Write the obu header and
+ // tile group header
+ const OBU_TYPE obu_type =
+ (num_tg_hdrs == 1) ? OBU_FRAME : OBU_TILE_GROUP;
+ curr_tg_data_size =
+ write_obu_header(obu_type, obu_extension_header, data);
+ obu_header_size = curr_tg_data_size;
+
+ if (num_tg_hdrs == 1) {
+ curr_tg_data_size += write_frame_header_obu(
+ cpi, saved_wb, data + curr_tg_data_size, 0);
+ }
+ curr_tg_data_size += write_tile_group_header(
+ data + curr_tg_data_size, tile_idx,
+ AOMMIN(tile_idx + tg_size - 1, tile_cols * tile_rows - 1),
+ n_log2_tiles, cm->num_tg > 1);
+ total_size += curr_tg_data_size;
+ tile_data_start += curr_tg_data_size;
+ new_tg = 0;
+ tile_count = 0;
+ }
+ tile_count++;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+
+ if (tile_count == tg_size || tile_idx == (tile_cols * tile_rows - 1)) {
+ is_last_tile_in_tg = 1;
+ new_tg = 1;
+ } else {
+ is_last_tile_in_tg = 0;
+ }
+
+ buf->data = dst + total_size;
+
+ // The last tile of the tile group does not have a header.
+ if (!is_last_tile_in_tg) total_size += 4;
+
+ // Initialise tile context from the frame context
+ this_tile->tctx = *cm->fc;
+ cpi->td.mb.e_mbd.tile_ctx = &this_tile->tctx;
+ mode_bc.allow_update_cdf = 1;
+ mode_bc.allow_update_cdf =
+ mode_bc.allow_update_cdf && !cm->disable_cdf_update;
+ const int num_planes = av1_num_planes(cm);
+ av1_reset_loop_restoration(&cpi->td.mb.e_mbd, num_planes);
+
+ aom_start_encode(&mode_bc, dst + total_size);
+ write_modes(cpi, &tile_info, &mode_bc, tile_row, tile_col);
+ aom_stop_encode(&mode_bc);
+ tile_size = mode_bc.pos;
+ assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES);
+
+ curr_tg_data_size += (tile_size + (is_last_tile_in_tg ? 0 : 4));
+ buf->size = tile_size;
+ if (tile_size > max_tile_size) {
+ cm->largest_tile_id = tile_cols * tile_row + tile_col;
+ max_tile_size = tile_size;
+ }
+
+ if (!is_last_tile_in_tg) {
+ // size of this tile
+ mem_put_le32(buf->data, tile_size - AV1_MIN_TILE_SIZE_BYTES);
+ } else {
+ // write current tile group size
+ const uint32_t obu_payload_size = curr_tg_data_size - obu_header_size;
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ assert(0);
+ }
+ curr_tg_data_size += (int)length_field_size;
+ total_size += (uint32_t)length_field_size;
+ tile_data_start += length_field_size;
+ if (num_tg_hdrs == 1) {
+ // if this tg is combined with the frame header then update saved
+ // frame header base offset according to length field size
+ saved_wb->bit_buffer += length_field_size;
+ }
+
+ if (!first_tg && cm->error_resilient_mode) {
+ // Make room for a duplicate Frame Header OBU.
+ memmove(data + fh_info->total_length, data, curr_tg_data_size);
+
+ // Insert a copy of the Frame Header OBU.
+ memcpy(data, fh_info->frame_header, fh_info->total_length);
+
+ // Force context update tile to be the first tile in error
+ // resilient mode as the duplicate frame headers will have
+ // context_update_tile_id set to 0
+ cm->largest_tile_id = 0;
+
+ // Rewrite the OBU header to change the OBU type to Redundant Frame
+ // Header.
+ write_obu_header(OBU_REDUNDANT_FRAME_HEADER, obu_extension_header,
+ &data[fh_info->obu_header_byte_offset]);
+
+ data += fh_info->total_length;
+
+ curr_tg_data_size += (int)(fh_info->total_length);
+ total_size += (uint32_t)(fh_info->total_length);
+ }
+ first_tg = 0;
+ }
+
+ total_size += tile_size;
+ }
+ }
+
+ if (have_tiles) {
+ // Fill in context_update_tile_id indicating the tile to use for the
+ // cdf update. The encoder currently sets it to the largest tile
+ // (but is up to the encoder)
+ aom_wb_overwrite_literal(saved_wb, cm->largest_tile_id,
+ cm->log2_tile_cols + cm->log2_tile_rows);
+ // If more than one tile group, tile_size_bytes takes the default value 4
+ // and does not need to be set. For a single tile group it is set in the
+ // section below.
+ if (num_tg_hdrs == 1) {
+ int tile_size_bytes = 4, unused;
+ const uint32_t tile_data_offset = (uint32_t)(tile_data_start - dst);
+ const uint32_t tile_data_size = total_size - tile_data_offset;
+
+ total_size =
+ remux_tiles(cm, tile_data_start, tile_data_size, max_tile_size,
+ max_tile_col_size, &tile_size_bytes, &unused);
+ total_size += tile_data_offset;
+ assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+
+ aom_wb_overwrite_literal(saved_wb, tile_size_bytes - 1, 2);
+
+ // Update the OBU length if remux_tiles() reduced the size.
+ uint64_t payload_size;
+ size_t length_field_size;
+ int res =
+ aom_uleb_decode(dst + obu_header_size, total_size - obu_header_size,
+ &payload_size, &length_field_size);
+ assert(res == 0);
+ (void)res;
+
+ const uint64_t new_payload_size =
+ total_size - obu_header_size - length_field_size;
+ if (new_payload_size != payload_size) {
+ size_t new_length_field_size;
+ res = aom_uleb_encode(new_payload_size, length_field_size,
+ dst + obu_header_size, &new_length_field_size);
+ assert(res == 0);
+ if (new_length_field_size < length_field_size) {
+ const size_t src_offset = obu_header_size + length_field_size;
+ const size_t dst_offset = obu_header_size + new_length_field_size;
+ memmove(dst + dst_offset, dst + src_offset, (size_t)payload_size);  // NOTE(review): moves the old payload_size, not new_payload_size - confirm intended
+ total_size -= (int)(length_field_size - new_length_field_size);
+ }
+ }
+ }
+ }
+ return total_size;
+}
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dst, size_t *size) {
+ uint8_t *data = dst;  // write cursor; advanced past each OBU as it is emitted
+ uint32_t data_size;
+ AV1_COMMON *const cm = &cpi->common;
+ uint32_t obu_header_size = 0;
+ uint32_t obu_payload_size = 0;
+ FrameHeaderInfo fh_info = { NULL, 0, 0 };  // filled in only if a Frame Header OBU is written
+ const uint8_t obu_extension_header =
+ cm->temporal_layer_id << 5 | cm->spatial_layer_id << 3 | 0;
+
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_reset_write();
+#endif
+
+ // The TD is now written outside the frame encode loop
+
+ // write sequence header obu on shown KEY_FRAMEs; its size field is uleb-coded
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ obu_header_size = write_obu_header(OBU_SEQUENCE_HEADER, 0, data);
+
+ obu_payload_size = write_sequence_header_obu(cpi, data + obu_header_size);
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ data += obu_header_size + obu_payload_size + length_field_size;
+ }
+
+ const int write_frame_header =
+ (cm->num_tg > 1 || encode_show_existing_frame(cm));  // standalone Frame Header OBU needed
+ struct aom_write_bit_buffer saved_wb;  // passed to the header writer; later used to patch header fields
+ if (write_frame_header) {
+ // Write Frame Header OBU.
+ fh_info.frame_header = data;
+ obu_header_size =
+ write_obu_header(OBU_FRAME_HEADER, obu_extension_header, data);
+ obu_payload_size =
+ write_frame_header_obu(cpi, &saved_wb, data + obu_header_size, 1);
+
+ const size_t length_field_size =
+ obu_memmove(obu_header_size, obu_payload_size, data);
+ if (write_uleb_obu_size(obu_header_size, obu_payload_size, data) !=
+ AOM_CODEC_OK) {
+ return AOM_CODEC_ERROR;
+ }
+
+ fh_info.obu_header_byte_offset = 0;
+ fh_info.total_length =
+ obu_header_size + obu_payload_size + length_field_size;
+ data += fh_info.total_length;
+
+ // Since length_field_size is determined adaptively after frame header
+ // encoding, saved_wb must be adjusted accordingly.
+ saved_wb.bit_buffer += length_field_size;
+ }
+
+ if (encode_show_existing_frame(cm)) {
+ data_size = 0;  // no tile data follows the frame header in this case
+ } else {
+ // Write the tile data as OBU_FRAME / OBU_TILE_GROUP OBUs; each OBU gets a
+ // uleb-coded size field (see write_tiles_in_tg_obus)
+ data_size = write_tiles_in_tg_obus(cpi, data, &saved_wb,
+ obu_extension_header, &fh_info);
+ }
+ data += data_size;
+ *size = data - dst;  // total number of bytes written to 'dst'
+ return AOM_CODEC_OK;
+}
diff --git a/third_party/aom/av1/encoder/bitstream.h b/third_party/aom/av1/encoder/bitstream.h
new file mode 100644
index 0000000000..465ccaed57
--- /dev/null
+++ b/third_party/aom/av1/encoder/bitstream.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BITSTREAM_H_
+#define AOM_AV1_ENCODER_BITSTREAM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct aom_write_bit_buffer;
+
+// Writes only the OBU Sequence Header payload, and returns the size of the
+// payload written to 'dst'. This function does not write the OBU header, the
+// optional extension, or the OBU size to 'dst'.
+uint32_t write_sequence_header_obu(AV1_COMP *cpi, uint8_t *const dst);
+
+// Writes the OBU header byte, and the OBU header extension byte when
+// 'obu_extension' is non-zero. Returns number of bytes written to 'dst'.
+uint32_t write_obu_header(OBU_TYPE obu_type, int obu_extension,
+ uint8_t *const dst);
+
+int write_uleb_obu_size(uint32_t obu_header_size, uint32_t obu_payload_size,
+ uint8_t *dest);
+
+int av1_pack_bitstream(AV1_COMP *const cpi, uint8_t *dest, size_t *size);
+
+static INLINE int av1_preserve_existing_gf(AV1_COMP *cpi) {
+  // Internal overlay frames must not swap the gf and arf indices.
+  return cpi->rc.is_src_frame_alt_ref ? !cpi->rc.is_src_frame_ext_arf : 0;
+}
+
+void av1_write_tx_type(const AV1_COMMON *const cm, const MACROBLOCKD *xd,
+ int blk_row, int blk_col, int plane, TX_SIZE tx_size,
+ aom_writer *w);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BITSTREAM_H_
diff --git a/third_party/aom/av1/encoder/block.h b/third_party/aom/av1/encoder/block.h
new file mode 100644
index 0000000000..0bc5dea825
--- /dev/null
+++ b/third_party/aom/av1/encoder/block.h
@@ -0,0 +1,452 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_BLOCK_H_
+#define AOM_AV1_ENCODER_BLOCK_H_
+
+#include "av1/common/entropymv.h"
+#include "av1/common/entropy.h"
+#include "av1/common/mvref_common.h"
+#include "av1/encoder/hash.h"
+#if CONFIG_DIST_8X8
+#include "aom/aomcx.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ unsigned int sse;  // presumably a sum of squared differences - TODO confirm
+ int sum;  // presumably the signed sum of differences - TODO confirm
+ unsigned int var;  // presumably a variance derived from the above - TODO confirm
+} DIFF;
+
+typedef struct macroblock_plane {
+ DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);  // declared with 16-byte alignment
+ tran_low_t *qcoeff;
+ tran_low_t *coeff;
+ uint16_t *eobs;
+ uint8_t *txb_entropy_ctx;
+ struct buf_2d src;
+
+ // Quantizer settings
+ // These are used/accessed only in the quantization process
+ // RDO does not / must not depend on any of these values
+ // All values below share the coefficient scale/shift used in TX
+ const int16_t *quant_fp_QTX;
+ const int16_t *round_fp_QTX;
+ const int16_t *quant_QTX;
+ const int16_t *quant_shift_QTX;
+ const int16_t *zbin_QTX;
+ const int16_t *round_QTX;
+ const int16_t *dequant_QTX;
+} MACROBLOCK_PLANE;  // struct macroblock holds MAX_MB_PLANE of these in 'plane'
+
+typedef struct {  // int cost tables; the first index is sized by a *_CONTEXTS count
+ int txb_skip_cost[TXB_SKIP_CONTEXTS][2];
+ int base_eob_cost[SIG_COEF_CONTEXTS_EOB][3];
+ int base_cost[SIG_COEF_CONTEXTS][4];
+ int eob_extra_cost[EOB_COEF_CONTEXTS][2];
+ int dc_sign_cost[DC_SIGN_CONTEXTS][2];
+ int lps_cost[LEVEL_CONTEXTS][COEFF_BASE_RANGE + 1];
+} LV_MAP_COEFF_COST;  // struct macroblock keeps one per [TX_SIZES][PLANE_TYPES]
+
+typedef struct {
+ int eob_cost[2][11];  // NOTE(review): meaning of the 2 and 11 dimensions is not visible here
+} LV_MAP_EOB_COST;  // struct macroblock keeps a [7][2] array of these in 'eob_costs'
+
+typedef struct {  // Per-plane arrays (first index ranges over MAX_MB_PLANE).
+ tran_low_t tcoeff[MAX_MB_PLANE][MAX_SB_SQUARE];
+ uint16_t eobs[MAX_MB_PLANE][MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ uint8_t txb_skip_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+ int dc_sign_ctx[MAX_MB_PLANE]
+ [MAX_SB_SQUARE / (TX_SIZE_W_MIN * TX_SIZE_H_MIN)];
+} CB_COEFF_BUFFER;  // all but tcoeff: one entry per TX_SIZE_W_MIN x TX_SIZE_H_MIN unit
+
+typedef struct {
+ int16_t mode_context[MODE_CTX_REF_FRAMES];
+ // TODO(angiebird): Reduce the buffer size according to sb_type
+ tran_low_t *tcoeff[MAX_MB_PLANE];  // per-plane pointers; presumably into a CB_COEFF_BUFFER - TODO confirm
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_skip_ctx[MAX_MB_PLANE];
+ int *dc_sign_ctx[MAX_MB_PLANE];
+ uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];  // presumably valid entries per ref_mv_stack row - TODO confirm
+ CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+ int_mv global_mvs[REF_FRAMES];
+ int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+} MB_MODE_INFO_EXT;  // struct macroblock points at one via 'mbmi_ext'
+
+typedef struct {  // Min/max bounds for motion vector columns and rows.
+ int col_min;
+ int col_max;
+ int row_min;
+ int row_max;
+} MvLimits;  // used for the 'mv_limits' member of struct macroblock
+
+typedef struct {
+ uint8_t best_palette_color_map[MAX_PALETTE_SQUARE];
+ int kmeans_data_buf[2 * MAX_PALETTE_SQUARE];  // NOTE(review): twice the color map size; layout not visible here
+} PALETTE_BUFFER;  // struct macroblock points at one via 'palette_buffer'
+
+typedef struct {  // Element type of MB_RD_RECORD::tx_rd_info.
+ TX_SIZE tx_size;
+ TX_SIZE inter_tx_size[INTER_TX_SIZE_BUF_LEN];
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ TX_TYPE txk_type[TXK_TYPE_BUF_LEN];
+ RD_STATS rd_stats;
+ uint32_t hash_value;  // presumably the key this entry is looked up by - TODO confirm
+} MB_RD_INFO;
+
+#define RD_RECORD_BUFFER_LEN 8
+typedef struct {
+ MB_RD_INFO tx_rd_info[RD_RECORD_BUFFER_LEN]; // Circular buffer.
+ int index_start;  // presumably the slot of the oldest valid entry - TODO confirm
+ int num;  // presumably the number of valid entries - TODO confirm
+ CRC32C crc_calculator; // Hash function.
+} MB_RD_RECORD;
+
+typedef struct {  // Element type of TXB_RD_RECORD::tx_rd_info and TXB_RD_INFO_NODE::rd_info_array.
+ int64_t dist;
+ int64_t sse;
+ int rate;
+ uint16_t eob;
+ TX_TYPE tx_type;
+ uint16_t entropy_context;
+ uint8_t txb_entropy_ctx;
+ uint8_t valid;  // presumably nonzero once the entry holds usable data - TODO confirm
+ uint8_t fast; // This is not being used now.
+} TXB_RD_INFO;
+
+#define TX_SIZE_RD_RECORD_BUFFER_LEN 256
+typedef struct {
+ uint32_t hash_vals[TX_SIZE_RD_RECORD_BUFFER_LEN];  // presumably hash_vals[i] keys tx_rd_info[i] - TODO confirm
+ TXB_RD_INFO tx_rd_info[TX_SIZE_RD_RECORD_BUFFER_LEN];
+ int index_start;  // NOTE(review): same field names as MB_RD_RECORD's circular buffer - confirm same use
+ int num;
+} TXB_RD_RECORD;
+
+typedef struct tx_size_rd_info_node {  // Tree node with up to four children.
+ TXB_RD_INFO *rd_info_array; // Points to array of size TX_TYPES.
+ struct tx_size_rd_info_node *children[4];  // pointers to nodes of the same type
+} TXB_RD_INFO_NODE;
+
+// Region size for mode decision sampling in the first pass of partition
+// search(two_pass_partition_search speed feature), in units of mi size(4).
+// Used by the mode_pruning_based_on_two_pass_partition_search speed feature.
+#define FIRST_PARTITION_PASS_SAMPLE_REGION 8
+#define FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2 3
+#define FIRST_PARTITION_PASS_STATS_TABLES \
+ (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2) * \
+ (MAX_MIB_SIZE >> FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+#define FIRST_PARTITION_PASS_STATS_STRIDE \
+ (MAX_MIB_SIZE_LOG2 - FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2)
+
+static INLINE int av1_first_partition_pass_stats_index(int mi_row, int mi_col) {
+  // Row-major index of the sample region that contains (mi_row, mi_col).
+  const int shift = FIRST_PARTITION_PASS_SAMPLE_REGION_LOG2;
+  const int region_row = (mi_row & MAX_MIB_MASK) >> shift;
+  const int region_col = (mi_col & MAX_MIB_MASK) >> shift;
+  return region_col + (region_row << FIRST_PARTITION_PASS_STATS_STRIDE);
+}
+
+typedef struct {  // struct macroblock holds FIRST_PARTITION_PASS_STATS_TABLES of these.
+ uint8_t ref0_counts[REF_FRAMES]; // Counters for ref_frame[0].
+ uint8_t ref1_counts[REF_FRAMES]; // Counters for ref_frame[1].
+ int sample_counts; // Number of samples collected.
+} FIRST_PARTITION_PASS_STATS;
+
+#define MAX_INTERP_FILTER_STATS 64
+typedef struct {  // Entry type of the 'interp_filter_stats' table in struct macroblock.
+ InterpFilters filters;
+ int_mv mv[2];  // two entries; presumably one per reference of the prediction - TODO confirm
+ int8_t ref_frames[2];
+ COMPOUND_TYPE comp_type;
+} INTERPOLATION_FILTER_STATS;
+
+typedef struct macroblock MACROBLOCK;
+struct macroblock {
+ struct macroblock_plane plane[MAX_MB_PLANE];
+
+ // Determine if one would go with reduced complexity transform block
+ // search model to select prediction modes, or full complexity model
+ // to select transform kernel.
+ int rd_model;
+
+ // Indicate if the encoder is running in the first pass partition search.
+ // In that case, apply certain speed features therein to reduce the overhead
+ // cost in the first pass search.
+ int cb_partition_scan;
+
+ FIRST_PARTITION_PASS_STATS
+ first_partition_pass_stats[FIRST_PARTITION_PASS_STATS_TABLES];  // indexed via av1_first_partition_pass_stats_index() - TODO confirm
+
+ // [comp_idx][saved stat_idx]
+ INTERPOLATION_FILTER_STATS interp_filter_stats[2][MAX_INTERP_FILTER_STATS];
+ int interp_filter_stats_idx[2];
+
+ // Activate constrained coding block partition search range.
+ int use_cb_search_range;
+
+ // Inter macroblock RD search info.
+ MB_RD_RECORD mb_rd_record;
+
+ // Inter transform block RD search info. for square TX sizes.
+ TXB_RD_RECORD txb_rd_record_8X8[(MAX_MIB_SIZE >> 1) * (MAX_MIB_SIZE >> 1)];
+ TXB_RD_RECORD txb_rd_record_16X16[(MAX_MIB_SIZE >> 2) * (MAX_MIB_SIZE >> 2)];
+ TXB_RD_RECORD txb_rd_record_32X32[(MAX_MIB_SIZE >> 3) * (MAX_MIB_SIZE >> 3)];
+ TXB_RD_RECORD txb_rd_record_64X64[(MAX_MIB_SIZE >> 4) * (MAX_MIB_SIZE >> 4)];
+
+ // Intra transform block RD search info. for square TX sizes.
+ TXB_RD_RECORD txb_rd_record_intra;
+
+ MACROBLOCKD e_mbd;
+ MB_MODE_INFO_EXT *mbmi_ext;
+ int skip_block;
+ int qindex;
+
+ // The equivalent error at the current rdmult of one whole bit (not one
+ // bitcost unit).
+ int errorperbit;
+ // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for large blocks.
+ int sadperbit16;
+ // The equivalent SAD error of one (whole) bit at the current quantizer
+ // for sub-8x8 blocks.
+ int sadperbit4;
+ int rdmult;
+ int mb_energy;
+ int sb_energy_level;
+ int *m_search_count_ptr;
+ int *ex_search_count_ptr;
+
+ unsigned int txb_split_count;
+
+ // These are set to their default values at the beginning, and then adjusted
+ // further in the encoding process.
+ BLOCK_SIZE min_partition_size;
+ BLOCK_SIZE max_partition_size;
+
+ unsigned int max_mv_context[REF_FRAMES];
+ unsigned int source_variance;
+ unsigned int pred_sse[REF_FRAMES];
+ int pred_mv_sad[REF_FRAMES];
+
+ int *nmvjointcost;
+ int nmv_vec_cost[MV_JOINTS];
+ int *nmvcost[2];
+ int *nmvcost_hp[2];
+ int **mv_cost_stack;
+ int **mvcost;
+
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
+
+ PALETTE_BUFFER *palette_buffer;
+
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+
+ // buffer for hash value calculation of a block
+ // used only in av1_get_block_hash_value()
+ // [first hash/second hash]
+ // [two buffers used ping-pong]
+ uint32_t *hash_value_buffer[2][2];
+
+ CRC_CALCULATOR crc_calculator1;
+ CRC_CALCULATOR crc_calculator2;
+ int g_crc_initialized;
+
+ // These define limits to motion vector components to prevent them
+ // from extending outside the UMV borders
+ MvLimits mv_limits;
+
+ uint8_t blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];  // bit 'plane' = skip flag; bit 'plane + 4' = debug "unset" marker (see set_blk_skip)
+
+ int skip;
+ int skip_chroma_rd;
+ int skip_cost[SKIP_CONTEXTS][2];
+
+ int skip_mode; // 0: off; 1: on
+ int skip_mode_cost[SKIP_CONTEXTS][2];
+
+ int compound_idx;
+
+ LV_MAP_COEFF_COST coeff_costs[TX_SIZES][PLANE_TYPES];
+ LV_MAP_EOB_COST eob_costs[7][2];
+ uint16_t cb_offset;
+
+ // mode costs
+ int intra_inter_cost[INTRA_INTER_CONTEXTS][2];
+
+ int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+ int zeromv_mode_cost[GLOBALMV_MODE_CONTEXTS][2];
+ int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+ int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+
+ int comp_inter_cost[COMP_INTER_CONTEXTS][2];
+ int single_ref_cost[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ int comp_ref_type_cost[COMP_REF_TYPE_CONTEXTS]
+ [CDF_SIZE(COMP_REFERENCE_TYPES)];
+ int uni_comp_ref_cost[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1]
+ [CDF_SIZE(2)];
+ // Cost for signaling ref_frame[0] (LAST_FRAME, LAST2_FRAME, LAST3_FRAME or
+ // GOLDEN_FRAME) in bidir-comp mode.
+ int comp_ref_cost[REF_CONTEXTS][FWD_REFS - 1][2];
+ // Cost for signaling ref_frame[1] (ALTREF_FRAME, ALTREF2_FRAME, or
+ // BWDREF_FRAME) in bidir-comp mode.
+ int comp_bwdref_cost[REF_CONTEXTS][BWD_REFS - 1][2];
+ int inter_compound_mode_cost[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ int compound_type_cost[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ int wedge_idx_cost[BLOCK_SIZES_ALL][16];
+ int interintra_cost[BLOCK_SIZE_GROUPS][2];
+ int wedge_interintra_cost[BLOCK_SIZES_ALL][2];
+ int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ int motion_mode_cost[BLOCK_SIZES_ALL][MOTION_MODES];
+ int motion_mode_cost1[BLOCK_SIZES_ALL][2];
+ int intra_uv_mode_cost[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
+ int filter_intra_cost[BLOCK_SIZES_ALL][2];
+ int filter_intra_mode_cost[FILTER_INTRA_MODES];
+ int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+ int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ int palette_y_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ int palette_uv_size_cost[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ int palette_y_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_uv_color_cost[PALETTE_SIZES][PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ int palette_y_mode_cost[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ int palette_uv_mode_cost[PALETTE_UV_MODE_CONTEXTS][2];
+ // The rate associated with each alpha codeword
+ int cfl_cost[CFL_JOINT_SIGNS][CFL_PRED_PLANES][CFL_ALPHABET_SIZE];
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+ int txfm_partition_cost[TXFM_PARTITION_CONTEXTS][2];
+ int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ int angle_delta_cost[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ int switchable_restore_cost[RESTORE_SWITCHABLE_TYPES];
+ int wiener_restore_cost[2];
+ int sgrproj_restore_cost[2];
+ int intrabc_cost[2];
+
+ // Used to store sub partition's choices.
+ MV pred_mv[REF_FRAMES];
+
+ // Store the best motion vector during motion search
+ int_mv best_mv;
+ // Store the second best motion vector during full-pixel motion search
+ int_mv second_best_mv;
+
+ // use default transform and skip transform type search for intra modes
+ int use_default_intra_tx_type;
+ // use default transform and skip transform type search for inter modes
+ int use_default_inter_tx_type;
+#if CONFIG_DIST_8X8
+ int using_dist_8x8;
+ aom_tune_metric tune_metric;
+#endif // CONFIG_DIST_8X8
+ int comp_idx_cost[COMP_INDEX_CONTEXTS][2];
+ int comp_group_idx_cost[COMP_GROUP_IDX_CONTEXTS][2];
+ // Bit flags for pruning tx type search, tx split, etc.
+ int tx_search_prune[EXT_TX_SET_TYPES];
+ int must_find_valid_partition;
+ int tx_split_prune_flag; // Flag to skip tx split RD search.
+ int recalc_luma_mc_data; // Flag to indicate recalculation of MC data during
+ // interpolation filter search
+};
+
+static INLINE int is_rect_tx_allowed_bsize(BLOCK_SIZE bsize) {
+  static const char kRectTxAllowed[BLOCK_SIZES_ALL] = {
+    /* BLOCK_4X4     */ 0,
+    /* BLOCK_4X8     */ 1,
+    /* BLOCK_8X4     */ 1,
+    /* BLOCK_8X8     */ 0,
+    /* BLOCK_8X16    */ 1,
+    /* BLOCK_16X8    */ 1,
+    /* BLOCK_16X16   */ 0,
+    /* BLOCK_16X32   */ 1,
+    /* BLOCK_32X16   */ 1,
+    /* BLOCK_32X32   */ 0,
+    /* BLOCK_32X64   */ 1,
+    /* BLOCK_64X32   */ 1,
+    /* BLOCK_64X64   */ 0,
+    /* BLOCK_64X128  */ 0,
+    /* BLOCK_128X64  */ 0,
+    /* BLOCK_128X128 */ 0,
+    /* BLOCK_4X16    */ 1,
+    /* BLOCK_16X4    */ 1,
+    /* BLOCK_8X32    */ 1,
+    /* BLOCK_32X8    */ 1,
+    /* BLOCK_16X64   */ 1,
+    /* BLOCK_64X16   */ 1,
+  };
+  // Table lookup only: 'bsize' is not range-checked here.
+  return kRectTxAllowed[bsize];
+}
+
+static INLINE int is_rect_tx_allowed(const MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi) {
+  if (xd->lossless[mbmi->segment_id]) return 0;  // never on lossless segments
+  return is_rect_tx_allowed_bsize(mbmi->sb_type) != 0;
+}
+
+static INLINE int tx_size_to_depth(TX_SIZE tx_size, BLOCK_SIZE bsize) {
+  // Steps along sub_tx_size_map from max_txsize_rect_lookup[bsize] to tx_size.
+  int depth = 0;
+  TX_SIZE cur = max_txsize_rect_lookup[bsize];
+  for (; cur != tx_size; cur = sub_tx_size_map[cur]) {
+    ++depth;
+    assert(depth <= MAX_TX_DEPTH);
+  }
+  return depth;
+}
+
+static INLINE void set_blk_skip(MACROBLOCK *x, int plane, int blk_idx,
+                                int skip) {
+  const unsigned long skip_bit = 1UL << plane;  // this plane's skip flag bit
+  if (skip) {
+    x->blk_skip[blk_idx] |= skip_bit;
+  } else {
+    x->blk_skip[blk_idx] &= ~skip_bit;
+  }
+#ifndef NDEBUG
+  // Debug only: bit (plane + 4) marks a plane as "not yet set". Writing luma
+  // marks both chroma planes as pending so a missing chroma write is caught.
+  if (plane == 0) {
+    x->blk_skip[blk_idx] |= (1UL << (1 + 4)) | (1UL << (2 + 4));
+  }
+  // This plane has now been written, so clear its pending bit.
+  x->blk_skip[blk_idx] &= ~(1UL << (plane + 4));
+#endif
+}
+
+static INLINE int is_blk_skip(MACROBLOCK *x, int plane, int blk_idx) {
+  const int flags = x->blk_skip[blk_idx];
+#ifndef NDEBUG
+  // The plane's "not yet set" debug bit must already have been cleared.
+  assert((flags & (1UL << (plane + 4))) == 0);
+  // Only bits covered by 0x77 are ever used; anything else is garbage data.
+  assert((flags & 0x88) == 0);
+#endif
+  return (flags >> plane) & 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_BLOCK_H_
diff --git a/third_party/aom/av1/encoder/blockiness.c b/third_party/aom/av1/encoder/blockiness.c
new file mode 100644
index 0000000000..f7cff9e532
--- /dev/null
+++ b/third_party/aom/av1/encoder/blockiness.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/common.h"
+#include "av1/common/filter.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+static int horizontal_filter(const uint8_t *s) {  // Edge response along a row: taps -2, 6, -6, 2 on s[-2], s[-1], s[0], s[1].
+ return (s[1] - s[-2]) * 2 + (s[-1] - s[0]) * 6;
+}
+
+static int vertical_filter(const uint8_t *s, int p) {  // Same taps down a column with stride p: s[-2p], s[-p], s[0], s[p].
+ return (s[p] - s[-2 * p]) * 2 + (s[-p] - s[0]) * 6;
+}
+
+static int variance(int sum, int sum_squared, int size) {  // Mean of squares minus squared mean, using truncating integer division.
+ return sum_squared / size - (sum / size) * (sum / size);  // size must be nonzero
+}
+// Calculate a blockiness level for a vertical block edge.
+// This function returns a blockiness metric that's defined as follows.
+
+// Pixels on each of the 4 rows crossing the edge ('|' is the block edge):
+// p0 q0 | r0 s0
+// p1 q1 | r1 s1
+// p2 q2 | r2 s2
+// p3 q3 | r3 s3
+
+// blockiness = p0*-2+q0*6+r0*-6+s0*2 +
+// p1*-2+q1*6+r1*-6+s1*2 +
+// p2*-2+q2*6+r2*-6+s2*2 +
+// p3*-2+q3*6+r3*-6+s3*2 ;
+
+// reconstructed_blockiness = max(abs(blockiness of buffer r) -
+// abs(blockiness of buffer s), 0)
+//
+// I make the assumption that flat blocks are much more visible than high
+// contrast blocks. As such, I scale the result of the blockiness calc
+// by dividing the blockiness by the variance of the pixels on either side
+// of the edge as follows:
+// var_0 = (r0^2+r1^2+r2^2+r3^2) / size - ((r0 + r1 + r2 + r3) / size)^2
+// var_1 = (q0^2+q1^2+q2^2+q3^2) / size - ((q0 + q1 + q2 + q3) / size)^2
+// Both variances are taken from buffer s only, with integer division.
+// The returned value is reconstructed_blockiness / ( 1 + var_0 + var_1 ).
+static int blockiness_vertical(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {  // s/r point at the first pixel right of the edge; sp/rp are their row strides
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, s += sp, r += rp) {  // one iteration per row along the edge
+ s_blockiness += horizontal_filter(s);
+ r_blockiness += horizontal_filter(r);
+ sum_0 += s[0];  // pixel just after the edge
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-1];  // pixel just before the edge
+ sum_sq_1 += s[-1] * s[-1];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)  // only blockiness that r has in excess of s counts
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Calculate a blockiness level for a horizontal block edge: the same metric
+// as blockiness_vertical(), but filtering down columns with strides sp / rp.
+static int blockiness_horizontal(const uint8_t *s, int sp, const uint8_t *r,
+ int rp, int size) {  // s/r point at the first pixel below the edge
+ int s_blockiness = 0;
+ int r_blockiness = 0;
+ int sum_0 = 0;
+ int sum_sq_0 = 0;
+ int sum_1 = 0;
+ int sum_sq_1 = 0;
+ int i;
+ int var_0;
+ int var_1;
+ for (i = 0; i < size; ++i, ++s, ++r) {  // one iteration per column along the edge
+ s_blockiness += vertical_filter(s, sp);
+ r_blockiness += vertical_filter(r, rp);
+ sum_0 += s[0];  // pixel just below the edge
+ sum_sq_0 += s[0] * s[0];
+ sum_1 += s[-sp];  // pixel just above the edge
+ sum_sq_1 += s[-sp] * s[-sp];
+ }
+ var_0 = variance(sum_0, sum_sq_0, size);
+ var_1 = variance(sum_1, sum_sq_1, size);
+ r_blockiness = abs(r_blockiness);
+ s_blockiness = abs(s_blockiness);
+
+ if (r_blockiness > s_blockiness)  // only blockiness that r has in excess of s counts
+ return (r_blockiness - s_blockiness) / (1 + var_0 + var_1);
+ else
+ return 0;
+}
+
+// Returns the average blockiness of img2 relative to img1 over the frame,
+// examining the 4x4 grid edges (steps of 4); 0 for frames under 16 pixels.
+double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch, int width,
+ int height) {  // img1 feeds the `s` argument of the edge helpers, img2 the `r` argument
+ double blockiness = 0;
+ int i, j;
+ aom_clear_system_state();
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4) {  // NOTE(review): for sizes not a multiple of 4 the helpers read past row/column height-1 / width-1; presumably the buffers are padded - confirm
+ if (i > 0 && j > 0) {  // skip the frame's top row and left column of blocks; the loop bounds already ensure i < height, j < width
+ blockiness +=
+ blockiness_vertical(img1 + j, img1_pitch, img2 + j, img2_pitch, 4);
+ blockiness += blockiness_horizontal(img1 + j, img1_pitch, img2 + j,
+ img2_pitch, 4);
+ }
+ }
+ }
+ if (width * height >= 16) blockiness /= width * height / 16;  // normalise by the 4x4 block count; avoid 0/0 = NaN on tiny frames
+ return blockiness;
+}
diff --git a/third_party/aom/av1/encoder/context_tree.c b/third_party/aom/av1/encoder/context_tree.c
new file mode 100644
index 0000000000..57f59f304b
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 1] = {  // Square block sizes, smallest first; indexed by tree level in av1_setup_pc_tree().
+ BLOCK_4X4, BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64, BLOCK_128X128,
+};
+
+static void alloc_mode_context(AV1_COMMON *cm, int num_pix,
+ PICK_MODE_CONTEXT *ctx) {  // Allocates ctx's per-block buffers for a block of num_pix pixels.
+ const int num_planes = av1_num_planes(cm);
+ int i;
+ const int num_blk = num_pix / 16;  // one entry per 16 pixels (stored as num_4x4_blk)
+ ctx->num_4x4_blk = num_blk;
+
+ CHECK_MEM_ERROR(cm, ctx->blk_skip, aom_calloc(num_blk, sizeof(uint8_t)));  // one byte per 4x4 block, from aom_calloc
+ for (i = 0; i < num_planes; ++i) {  // coefficient buffers: num_pix entries; eobs / entropy ctx: num_blk entries
+ CHECK_MEM_ERROR(cm, ctx->coeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->coeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->qcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->qcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->dqcoeff[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i])));
+ CHECK_MEM_ERROR(cm, ctx->eobs[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->eobs[i])));
+ CHECK_MEM_ERROR(
+ cm, ctx->txb_entropy_ctx[i],
+ aom_memalign(32, num_blk * sizeof(*ctx->txb_entropy_ctx[i])));
+ }
+
+ if (num_pix <= MAX_PALETTE_SQUARE) {  // larger blocks get no color index maps; those pointers are left untouched
+ for (i = 0; i < 2; ++i) {
+ CHECK_MEM_ERROR(
+ cm, ctx->color_index_map[i],
+ aom_memalign(32, num_pix * sizeof(*ctx->color_index_map[i])));
+ }
+ }
+}
+
+static void free_mode_context(PICK_MODE_CONTEXT *ctx, const int num_planes) {  // Releases every buffer alloc_mode_context() may have set and nulls the pointers.
+ int i;
+ aom_free(ctx->blk_skip);
+ ctx->blk_skip = 0;
+ for (i = 0; i < num_planes; ++i) {  // per-plane buffers
+ aom_free(ctx->coeff[i]);
+ ctx->coeff[i] = 0;
+ aom_free(ctx->qcoeff[i]);
+ ctx->qcoeff[i] = 0;
+ aom_free(ctx->dqcoeff[i]);
+ ctx->dqcoeff[i] = 0;
+ aom_free(ctx->eobs[i]);
+ ctx->eobs[i] = 0;
+ aom_free(ctx->txb_entropy_ctx[i]);
+ ctx->txb_entropy_ctx[i] = 0;
+ }
+
+ for (i = 0; i < 2; ++i) {  // may never have been allocated; presumably aom_free() accepts NULL like free()
+ aom_free(ctx->color_index_map[i]);
+ ctx->color_index_map[i] = 0;
+ }
+}
+
+static void alloc_tree_contexts(AV1_COMMON *cm, PC_TREE *tree, int num_pix,
+ int is_leaf) {  // Allocates the mode contexts of one tree node covering num_pix pixels.
+ alloc_mode_context(cm, num_pix, &tree->none);
+
+ if (is_leaf) return;  // leaf nodes only get the `none` context
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontal[0]);  // two-way partitions: each part is half the node
+ alloc_mode_context(cm, num_pix / 2, &tree->vertical[0]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontal[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->vertical[1]);
+
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontala[0]);  // three-way "a" shapes: quarter, quarter, half
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontala[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontala[2]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->horizontalb[0]);  // three-way "b" shapes: half, quarter, quarter
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[1]);
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontalb[2]);
+
+ alloc_mode_context(cm, num_pix / 4, &tree->verticala[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticala[1]);
+ alloc_mode_context(cm, num_pix / 2, &tree->verticala[2]);
+
+ alloc_mode_context(cm, num_pix / 2, &tree->verticalb[0]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticalb[1]);
+ alloc_mode_context(cm, num_pix / 4, &tree->verticalb[2]);
+
+ for (int i = 0; i < 4; ++i) {  // four-way strips: each part is a quarter of the node
+ alloc_mode_context(cm, num_pix / 4, &tree->horizontal4[i]);
+ alloc_mode_context(cm, num_pix / 4, &tree->vertical4[i]);
+ }
+}
+
+static void free_tree_contexts(PC_TREE *tree, const int num_planes) {  // Frees every mode context of one node, including ones a leaf never allocated.
+ int i;
+ for (i = 0; i < 3; i++) {  // three-way partition contexts
+ free_mode_context(&tree->horizontala[i], num_planes);
+ free_mode_context(&tree->horizontalb[i], num_planes);
+ free_mode_context(&tree->verticala[i], num_planes);
+ free_mode_context(&tree->verticalb[i], num_planes);
+ }
+ for (i = 0; i < 4; ++i) {  // four-way partition contexts
+ free_mode_context(&tree->horizontal4[i], num_planes);
+ free_mode_context(&tree->vertical4[i], num_planes);
+ }
+ free_mode_context(&tree->none, num_planes);
+ free_mode_context(&tree->horizontal[0], num_planes);
+ free_mode_context(&tree->horizontal[1], num_planes);
+ free_mode_context(&tree->vertical[0], num_planes);
+ free_mode_context(&tree->vertical[1], num_planes);
+}
+
+// This function sets up a tree of contexts such that each square partition
+// level has contexts for none, horizontal, vertical, and split, along with
+// a block_size value and a selected block_size which represents the state
+// of our search.
+void av1_setup_pc_tree(AV1_COMMON *cm, ThreadData *td) {
+ int i, j;
+ const int tree_nodes_inc = 1024;
+ const int leaf_factor = 4;
+ const int leaf_nodes = 256 * leaf_factor;  // 1024 leaves, each of size square[0]
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;  // leaves plus five parent levels = 1365 nodes
+ int pc_tree_index = 0;
+ PC_TREE *this_pc;
+ int square_index = 1;
+ int nodes;
+
+ aom_free(td->pc_tree);  // NOTE(review): frees only the node array, not contexts from an earlier setup - presumably av1_free_pc_tree() runs first; confirm
+ CHECK_MEM_ERROR(cm, td->pc_tree,
+ aom_calloc(tree_nodes, sizeof(*td->pc_tree)));
+ this_pc = &td->pc_tree[0];  // next node to be handed out as a child
+
+ // Sets up all the leaf nodes in the tree.
+ for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ tree->block_size = square[0];
+ alloc_tree_contexts(cm, tree, 16, 1);  // 16 pixels, leaf
+ }
+
+ // Each node has 4 leaf nodes, fill each block_size level of the tree
+ // from leafs to the root.
+ for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {  // 256, 64, 16, 4, 1 nodes per level
+ for (i = 0; i < nodes; ++i) {
+ PC_TREE *const tree = &td->pc_tree[pc_tree_index];
+ alloc_tree_contexts(cm, tree, 16 << (2 * square_index), 0);  // pixel count quadruples per level
+ tree->block_size = square[square_index];
+ for (j = 0; j < 4; j++) tree->split[j] = this_pc++;  // children are four consecutive nodes of the level below
+ ++pc_tree_index;
+ }
+ ++square_index;
+ }
+
+ // Set up the root node for the largest superblock size
+ i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+ td->pc_root[i] = &td->pc_tree[tree_nodes - 1];  // the last node allocated is the top of the tree
+ td->pc_root[i]->none.best_mode_index = 2;
+ // Set up the root nodes for the rest of the possible superblock sizes
+ while (--i >= 0) {
+ td->pc_root[i] = td->pc_root[i + 1]->split[0];  // each smaller root is the first child of the next larger one
+ td->pc_root[i]->none.best_mode_index = 2;
+ }
+}
+
+void av1_free_pc_tree(ThreadData *td, const int num_planes) {  // Frees all node contexts and the node array; no-op when td->pc_tree is NULL.
+ if (td->pc_tree != NULL) {
+ const int tree_nodes_inc = 1024;
+ const int tree_nodes = tree_nodes_inc + 256 + 64 + 16 + 4 + 1;  // must match the count used in av1_setup_pc_tree()
+ for (int i = 0; i < tree_nodes; ++i) {
+ free_tree_contexts(&td->pc_tree[i], num_planes);
+ }
+ aom_free(td->pc_tree);
+ td->pc_tree = NULL;
+ }
+}
+
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx) {  // Copies mode decision state from src_ctx to dst_ctx; coefficient buffers are not copied.
+ dst_ctx->mic = src_ctx->mic;
+ dst_ctx->mbmi_ext = src_ctx->mbmi_ext;
+
+ dst_ctx->num_4x4_blk = src_ctx->num_4x4_blk;
+ dst_ctx->skip = src_ctx->skip;
+ dst_ctx->skippable = src_ctx->skippable;
+ dst_ctx->best_mode_index = src_ctx->best_mode_index;
+
+ memcpy(dst_ctx->blk_skip, src_ctx->blk_skip,  // copies contents, so dst's buffer must hold at least src's num_4x4_blk bytes
+ sizeof(uint8_t) * src_ctx->num_4x4_blk);
+
+ dst_ctx->hybrid_pred_diff = src_ctx->hybrid_pred_diff;
+ dst_ctx->comp_pred_diff = src_ctx->comp_pred_diff;
+ dst_ctx->single_pred_diff = src_ctx->single_pred_diff;
+
+ dst_ctx->rate = src_ctx->rate;
+ dst_ctx->dist = src_ctx->dist;
+ dst_ctx->rdcost = src_ctx->rdcost;
+ dst_ctx->rd_mode_is_ready = src_ctx->rd_mode_is_ready;
+
+ memcpy(dst_ctx->pred_mv, src_ctx->pred_mv, sizeof(MV) * REF_FRAMES);
+ dst_ctx->pred_interp_filter = src_ctx->pred_interp_filter;
+
+ dst_ctx->partition = src_ctx->partition;  // NOTE(review): skip_ref_frame_mask is not copied here - confirm that is intended
+}
diff --git a/third_party/aom/av1/encoder/context_tree.h b/third_party/aom/av1/encoder/context_tree.h
new file mode 100644
index 0000000000..4efc349852
--- /dev/null
+++ b/third_party/aom/av1/encoder/context_tree.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CONTEXT_TREE_H_
+#define AOM_AV1_ENCODER_CONTEXT_TREE_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/block.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct AV1Common;
+struct ThreadData;
+
+typedef enum {  // Partition search policy for a node; stored in PC_TREE::cb_search_range.
+ // Search all the partition types in this plane.
+ SEARCH_FULL_PLANE = 0,
+ // Only search none_partition coding block.
+ NONE_PARTITION_PLANE = 1,
+ // Search all the partition types in this plane except split.
+ SEARCH_SAME_PLANE = 2,
+ // Skip search partition on this plane. Go split directly.
+ SPLIT_PLANE = 3,
+} CB_TREE_SEARCH;
+
+// Structure to hold snapshot of coding context during the mode picking process
+typedef struct {
+ MB_MODE_INFO mic;
+ MB_MODE_INFO_EXT mbmi_ext;
+ uint8_t *color_index_map[2];  // heap buffers; may be unallocated for large blocks
+ uint8_t *blk_skip;  // one byte per 4x4 block (num_4x4_blk entries)
+
+ tran_low_t *coeff[MAX_MB_PLANE];  // per-plane heap buffers
+ tran_low_t *qcoeff[MAX_MB_PLANE];
+ tran_low_t *dqcoeff[MAX_MB_PLANE];
+ uint16_t *eobs[MAX_MB_PLANE];
+ uint8_t *txb_entropy_ctx[MAX_MB_PLANE];
+
+ int num_4x4_blk;
+ int skip;
+ // For current partition, only if all Y, U, and V transform blocks'
+ // coefficients are quantized to 0, skippable is set to 1.
+ int skippable;
+ int best_mode_index;
+ int hybrid_pred_diff;
+ int comp_pred_diff;
+ int single_pred_diff;
+ // Skip certain ref frames during RD search of rectangular partitions.
+ int skip_ref_frame_mask;
+
+ // TODO(jingning) Use RD_COST struct here instead. This involves a broader
+ // scope of refactoring.
+ int rate;
+ int64_t dist;
+ int64_t rdcost;
+ int rd_mode_is_ready; // Flag to indicate whether rd pick mode decision has
+ // been made.
+
+ // motion vector cache for adaptive motion search control in partition
+ // search loop
+ MV pred_mv[REF_FRAMES];
+ InterpFilter pred_interp_filter;
+ PARTITION_TYPE partition;
+} PICK_MODE_CONTEXT;
+
+typedef struct {  // Per-node statistics record, embedded in PC_TREE as pc_tree_stats.
+ int valid;
+ int split;
+ int skip;
+ int64_t rdcost;
+ int sub_block_split[4];  // the [4] arrays presumably hold one entry per split child - confirm
+ int sub_block_skip[4];
+ int64_t sub_block_rdcost[4];
+} PC_TREE_STATS;
+
+typedef struct PC_TREE {  // One node of the partition context tree: a square block plus a context per partition shape.
+ int index;
+ PARTITION_TYPE partitioning;
+ BLOCK_SIZE block_size;  // square size covered by this node
+ PICK_MODE_CONTEXT none;  // context for the unsplit block
+ PICK_MODE_CONTEXT horizontal[2];  // two-way partitions
+ PICK_MODE_CONTEXT vertical[2];
+ PICK_MODE_CONTEXT horizontala[3];  // three-way partitions
+ PICK_MODE_CONTEXT horizontalb[3];
+ PICK_MODE_CONTEXT verticala[3];
+ PICK_MODE_CONTEXT verticalb[3];
+ PICK_MODE_CONTEXT horizontal4[4];  // four-way partitions
+ PICK_MODE_CONTEXT vertical4[4];
+ CB_TREE_SEARCH cb_search_range;
+ struct PC_TREE *split[4];  // the four child nodes; not owned, they live in the same node array
+ PC_TREE_STATS pc_tree_stats;
+} PC_TREE;
+
+void av1_setup_pc_tree(struct AV1Common *cm, struct ThreadData *td);
+void av1_free_pc_tree(struct ThreadData *td, const int num_planes);
+void av1_copy_tree_context(PICK_MODE_CONTEXT *dst_ctx,
+ PICK_MODE_CONTEXT *src_ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_CONTEXT_TREE_H_
diff --git a/third_party/aom/av1/encoder/corner_detect.c b/third_party/aom/av1/encoder/corner_detect.c
new file mode 100644
index 0000000000..e4c59dd9c6
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "av1/encoder/corner_detect.h"
+
+// Fast_9 wrapper: detects corners in buf and copies at most max_points of them into points
+#define FAST_BARRIER 18  // threshold argument passed to fast9_detect_nonmax()
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points) {  // returns the number of corners written (0 if none)
+ int num_points;
+ xy *const frm_corners_xy = fast9_detect_nonmax(buf, width, height, stride,
+ FAST_BARRIER, &num_points);
+ num_points = (num_points <= max_points ? num_points : max_points);  // clamp to the caller's capacity
+ if (num_points > 0 && frm_corners_xy) {
+ memcpy(points, frm_corners_xy, sizeof(*frm_corners_xy) * num_points);  // points must hold num_points xy records (presumably two ints each - confirm)
+ free(frm_corners_xy);
+ return num_points;
+ }
+ free(frm_corners_xy);  // free(NULL) is a no-op, so this is safe when detection returned nothing
+ return 0;
+}
diff --git a/third_party/aom/av1/encoder/corner_detect.h b/third_party/aom/av1/encoder/corner_detect.h
new file mode 100644
index 0000000000..cab59a7743
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_detect.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_CORNER_DETECT_H_
+#define AOM_AV1_ENCODER_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+int fast_corner_detect(unsigned char *buf, int width, int height, int stride,
+ int *points, int max_points);
+
+#endif // AOM_AV1_ENCODER_CORNER_DETECT_H_
diff --git a/third_party/aom/av1/encoder/corner_match.c b/third_party/aom/av1/encoder/corner_match.c
new file mode 100644
index 0000000000..29e934debd
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/corner_match.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute MATCH_SZ_SQ * sum(v^2) - sum(v)^2, i.e. var(im) * MATCH_SZ_SQ^2,
+ over a MATCH_SZ by MATCH_SZ window of im centered at (x, y).
+*/
+static double compute_variance(unsigned char *im, int stride, int x, int y) {
+ int sum = 0;
+ int sumsq = 0;
+ int var;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {  // window spans MATCH_SZ_BY2 pixels each side of (x, y); the caller keeps it in bounds
+ sum += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ sumsq += im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
+ im[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ }
+ var = sumsq * MATCH_SZ_SQ - sum * sum;
+ return (double)var;
+}
+
+/* Compute corr(im1, im2) * MATCH_SZ_SQ * stddev(im1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double compute_cross_correlation_c(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int v1, v2;
+ int sum1 = 0;
+ int sum2 = 0;
+ int sumsq2 = 0;
+ int cross = 0;
+ int var2, cov;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {  // both windows must lie inside their images; not checked here
+ v1 = im1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+ v2 = im2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ sum1 += v1;
+ sum2 += v2;
+ sumsq2 += v2 * v2;
+ cross += v1 * v2;
+ }
+ var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;  // MATCH_SZ_SQ^2 * var(im2)
+ cov = cross * MATCH_SZ_SQ - sum1 * sum2;  // MATCH_SZ_SQ^2 * cov(im1, im2)
+ return cov / sqrt((double)var2);  // a flat im2 window gives 0 / 0 (NaN in IEEE arithmetic), which fails any `>` comparison
+}
+
+static int is_eligible_point(int pointx, int pointy, int width, int height) {  // True when the MATCH_SZ window centered on the point fits inside the image.
+ return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
+ pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
+}
+
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+ int point2y, int width, int height) {  // True when the two points are at most max(width, height) / 16 apart.
+ const int thresh = (width < height ? height : width) >> 4;
+ return ((point1x - point2x) * (point1x - point2x) +  // compares squared distances, so no sqrt is needed
+ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
+}
+
+static void improve_correspondence(unsigned char *frm, unsigned char *ref,
+ int width, int height, int frm_stride,
+ int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {  // Refines each match in place: first the ref point, then the frame point.
+ int i;
+ for (i = 0; i < num_correspondences; ++i) {  // pass 1: move (rx, ry) within +/-SEARCH_SZ_BY2 to the best-correlated spot
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;  // only offsets with positive correlation can win
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ if (!is_eligible_distance(correspondences[i].x, correspondences[i].y,
+ correspondences[i].rx + x,
+ correspondences[i].ry + y, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, correspondences[i].x, correspondences[i].y, ref,
+ ref_stride, correspondences[i].rx + x, correspondences[i].ry + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += best_x;  // stays put (offset 0) when no candidate qualified
+ correspondences[i].ry += best_y;
+ }
+ for (i = 0; i < num_correspondences; ++i) {  // pass 2: same search for (x, y), with the image roles swapped
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(correspondences[i].x + x,
+ correspondences[i].y + y, width, height))
+ continue;
+ if (!is_eligible_distance(
+ correspondences[i].x + x, correspondences[i].y + y,
+ correspondences[i].rx, correspondences[i].ry, width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ ref, ref_stride, correspondences[i].rx, correspondences[i].ry, frm,
+ frm_stride, correspondences[i].x + x, correspondences[i].y + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts) {  // Matches frame corners to ref corners; returns the number of matches written.
+ // TODO(sarahparker) Improve this to include 2-way match
+ int i, j;
+ Correspondence *correspondences = (Correspondence *)correspondence_pts;  // output viewed as (x, y, rx, ry) records; needs room for up to num_frm_corners of them
+ int num_correspondences = 0;
+ for (i = 0; i < num_frm_corners; ++i) {  // corners are stored as (x, y) pairs: [2 * i], [2 * i + 1]
+ double best_match_ncc = 0.0;
+ double template_norm;
+ int best_match_j = -1;  // stays -1 when no ref corner has positive correlation
+ if (!is_eligible_point(frm_corners[2 * i], frm_corners[2 * i + 1], width,
+ height))
+ continue;
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(frm_corners[2 * i], frm_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ match_ncc = compute_cross_correlation(
+ frm, frm_stride, frm_corners[2 * i], frm_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+ // but need to account for the normalization in compute_cross_correlation.
+ template_norm = compute_variance(frm, frm_stride, frm_corners[2 * i],
+ frm_corners[2 * i + 1]);
+ if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {  // the right side is >= 0, so passing implies best_match_j >= 0
+ correspondences[num_correspondences].x = frm_corners[2 * i];
+ correspondences[num_correspondences].y = frm_corners[2 * i + 1];
+ correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(frm, ref, width, height, frm_stride, ref_stride,
+ correspondences, num_correspondences);  // local refinement of both end points
+ return num_correspondences;
+}
diff --git a/third_party/aom/av1/encoder/corner_match.h b/third_party/aom/av1/encoder/corner_match.h
new file mode 100644
index 0000000000..535d2faed1
--- /dev/null
+++ b/third_party/aom/av1/encoder/corner_match.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_CORNER_MATCH_H_
+#define AOM_AV1_ENCODER_CORNER_MATCH_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+typedef struct {  // One matched point pair.
+ int x, y;  // point in the current frame
+ int rx, ry;  // matching point in the reference frame
+} Correspondence;
+
+int determine_correspondence(unsigned char *frm, int *frm_corners,
+ int num_frm_corners, unsigned char *ref,
+ int *ref_corners, int num_ref_corners, int width,
+ int height, int frm_stride, int ref_stride,
+ int *correspondence_pts);
+
+#endif // AOM_AV1_ENCODER_CORNER_MATCH_H_
diff --git a/third_party/aom/av1/encoder/cost.c b/third_party/aom/av1/encoder/cost.c
new file mode 100644
index 0000000000..323e2aed58
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/common/entropy.h"
+
+// av1_prob_cost[i - 128] = round(-log2(i/256.) * (1 << AV1_PROB_COST_SHIFT)); i = 128~255.
+const uint16_t av1_prob_cost[128] = {
+ 512, 506, 501, 495, 489, 484, 478, 473, 467, 462, 456, 451, 446, 441, 435,
+ 430, 425, 420, 415, 410, 405, 400, 395, 390, 385, 380, 375, 371, 366, 361,
+ 356, 352, 347, 343, 338, 333, 329, 324, 320, 316, 311, 307, 302, 298, 294,
+ 289, 285, 281, 277, 273, 268, 264, 260, 256, 252, 248, 244, 240, 236, 232,
+ 228, 224, 220, 216, 212, 209, 205, 201, 197, 194, 190, 186, 182, 179, 175,
+ 171, 168, 164, 161, 157, 153, 150, 146, 143, 139, 136, 132, 129, 125, 122,
+ 119, 115, 112, 109, 105, 102, 99, 95, 92, 89, 86, 82, 79, 76, 73,
+ 70, 66, 63, 60, 57, 54, 51, 48, 45, 42, 38, 35, 32, 29, 26,
+ 23, 20, 18, 15, 12, 9, 6, 3,
+};
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map) {  // Fills costs[] with the cost of each symbol of cdf; inv_map (may be NULL) remaps output indices.
+ int i;
+ aom_cdf_prob prev_cdf = 0;
+ for (i = 0;; ++i) {  // ends at the terminating CDF entry, see the break below
+ aom_cdf_prob p15 = AOM_ICDF(cdf[i]) - prev_cdf;  // probability of symbol i as a difference of consecutive CDF values
+ p15 = (p15 < EC_MIN_PROB) ? EC_MIN_PROB : p15;  // clamp below at EC_MIN_PROB
+ prev_cdf = AOM_ICDF(cdf[i]);
+
+ if (inv_map)
+ costs[inv_map[i]] = av1_cost_symbol(p15);
+ else
+ costs[i] = av1_cost_symbol(p15);
+
+ // Stop once we reach the end of the CDF
+ if (cdf[i] == AOM_ICDF(CDF_PROB_TOP)) break;
+ }
+}
diff --git a/third_party/aom/av1/encoder/cost.h b/third_party/aom/av1/encoder/cost.h
new file mode 100644
index 0000000000..af5b098370
--- /dev/null
+++ b/third_party/aom/av1/encoder/cost.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_COST_H_
+#define AOM_AV1_ENCODER_COST_H_
+
+#include "aom_dsp/prob.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern const uint16_t av1_prob_cost[128];
+
+// The factor to scale from cost in bits to cost in av1_prob_cost units.
+#define AV1_PROB_COST_SHIFT 9
+
+// Cost of coding an n bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define av1_cost_literal(n) ((n) * (1 << AV1_PROB_COST_SHIFT))
+
+// Calculate the cost of a symbol with probability p15 / 2^15, in av1_prob_cost units
+static INLINE int av1_cost_symbol(aom_cdf_prob p15) {
+ assert(0 < p15 && p15 < CDF_PROB_TOP);
+ const int shift = CDF_PROB_BITS - 1 - get_msb(p15);  // left shift that normalizes p15 (presumably get_msb() is the index of the top set bit)
+ const int prob = get_prob(p15 << shift, CDF_PROB_TOP);  // normalized probability; the table lookup below needs prob in 128..255
+ assert(prob >= 128);
+ return av1_prob_cost[prob - 128] + av1_cost_literal(shift);  // table cost plus one bit per normalization shift
+}
+
+void av1_cost_tokens_from_cdf(int *costs, const aom_cdf_prob *cdf,
+ const int *inv_map);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_COST_H_
diff --git a/third_party/aom/av1/encoder/dwt.c b/third_party/aom/av1/encoder/dwt.c
new file mode 100644
index 0000000000..04088b25f9
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "config/av1_rtcd.h"
+#include "av1/encoder/dwt.h"
+
+// Note: block length must be even for this implementation
+//
+// Splits the `length` samples at x into length / 2 lowpass and length / 2
+// highpass values (presumably the 5/3 analysis filter, going by the name).
+// The first loop is `while (--n)` with n = length / 2, so length must be at
+// least 2.
+static void analysis_53_row(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ // Pass 1: walk x in (even, odd) pairs. For every pair except the last:
+ //   lowpass[i]  = 2 * even
+ //   highpass[i] = odd - ((even + next_even + 1) >> 1)
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++) * 2;
+ *b++ = *x - ((r + x[1] + 1) >> 1);
+ x++;
+ }
+ // Last pair: there is no following even sample, so highpass = odd - even.
+ *a = (r = *x++) * 2;
+ *b = *x - r;
+
+ // Pass 2: lowpass[i] += (highpass[i - 1] + highpass[i] + 1) >> 1, where
+ // highpass[0] stands in for the missing highpass[-1].
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+// Column counterpart of analysis_53_row(): same two-pass structure, but with
+// different scaling. Here the lowpass value starts as the even sample itself
+// (not doubled) and the highpass value is shifted down.
+// As in the row version, `while (--n)` with n = length / 2 requires
+// length >= 2.
+static void analysis_53_col(int length, tran_low_t *x, tran_low_t *lowpass,
+ tran_low_t *highpass) {
+ int n;
+ tran_low_t r, *a, *b;
+
+ // Pass 1: walk x in (even, odd) pairs. For every pair except the last:
+ //   lowpass[i]  = even
+ //   highpass[i] = (2 * odd - (even + next_even) + 2) >> 2
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ while (--n) {
+ *a++ = (r = *x++);
+ *b++ = (((*x) * 2) - (r + x[1]) + 2) >> 2;
+ x++;
+ }
+ // Last pair: no following even sample, so highpass = (odd - even + 1) >> 1.
+ *a = (r = *x++);
+ *b = (*x - r + 1) >> 1;
+
+ // Pass 2: lowpass[i] += (highpass[i - 1] + highpass[i] + 1) >> 1, where
+ // highpass[0] stands in for the missing highpass[-1].
+ n = length >> 1;
+ b = highpass;
+ a = lowpass;
+ r = *highpass;
+ while (n--) {
+ *a++ += (r + (*b) + 1) >> 1;
+ r = *b++;
+ }
+}
+
+// Applies up to `levels` levels of 2-D decomposition to a width x height
+// block, using analysis_53_row() on rows and analysis_53_col() on columns.
+//   x, pitch_x     : input samples and their row pitch. When hbd is nonzero,
+//                    x is converted with CONVERT_TO_SHORTPTR and read as
+//                    uint16_t; otherwise it is read as uint8_t.
+//   c, pitch_c     : coefficient buffer (output) and its row pitch.
+//   dwt_scale_bits : left shift applied to every input sample before the
+//                    transform.
+// Stops early, leaving the remaining levels unapplied, as soon as either
+// dimension of the current band is smaller than 2.
+// NOTE(review): buffer has 2 * DWT_MAX_LENGTH entries and the column pass uses
+// 2 * nh of them, so width and height are assumed to be <= DWT_MAX_LENGTH;
+// that is not checked here.
+static void dyadic_analyze_53_uint8_input(int levels, int width, int height,
+ uint8_t *x, int pitch_x,
+ tran_low_t *c, int pitch_c,
+ int dwt_scale_bits, int hbd) {
+ int lv, i, j, nh, nw, hh = height, hw = width;
+ tran_low_t buffer[2 * DWT_MAX_LENGTH];
+
+ // Load the input into c, scaled up by dwt_scale_bits.
+ if (hbd) {
+ uint16_t *x16 = CONVERT_TO_SHORTPTR(x);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x16[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ } else {
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j++) {
+ c[i * pitch_c + j] = x[i * pitch_x + j] << dwt_scale_bits;
+ }
+ }
+ }
+
+ for (lv = 0; lv < levels; lv++) {
+ // nh x nw is the band transformed at this level; hh / hw become its
+ // halved size (rounded up), which is where the highpass half starts.
+ nh = hh;
+ hh = (hh + 1) >> 1;
+ nw = hw;
+ hw = (hw + 1) >> 1;
+ if ((nh < 2) || (nw < 2)) return;
+ // Row pass: copy the row aside, then write lowpass to the start of the row
+ // and highpass starting at column hw.
+ for (i = 0; i < nh; i++) {
+ memcpy(buffer, &c[i * pitch_c], nw * sizeof(tran_low_t));
+ analysis_53_row(nw, buffer, &c[i * pitch_c], &c[i * pitch_c] + hw);
+ }
+ // Column pass: gather the column into buffer[nh .. 2 * nh), transform it
+ // into buffer[0 .. hh) (lowpass) and buffer[hh ..) (highpass), then
+ // scatter buffer[0 .. nh) back into the column.
+ for (j = 0; j < nw; j++) {
+ for (i = 0; i < nh; i++) buffer[i + nh] = c[i * pitch_c + j];
+ analysis_53_col(nh, buffer + nh, buffer, buffer + hh);
+ for (i = 0; i < nh; i++) c[i * pitch_c + j] = buffer[i];
+ }
+ }
+}
+
+// Forward DWT of one 8x8 block. `input` is read with row pitch `stride`
+// (as 16-bit samples when hbd is nonzero); `output` receives the 8x8
+// coefficients with a row pitch of 8.
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+                               int hbd) {
+  const int num_levels = 4;
+  const int block_dim = 8;
+  const int scale_bits = 2;
+  dyadic_analyze_53_uint8_input(num_levels, block_dim, block_dim, input,
+                                stride, output, block_dim, scale_bits, hbd);
+}
+
+// Sums |coefficient| over a bw x bh block with row pitch `stride`, skipping
+// the top-left quadrant (rows < bh / 2 and columns < bw / 2).
+int av1_haar_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  const int half_h = bh / 2;
+  const int half_w = bw / 2;
+  int total = 0;
+
+  for (int row = 0; row < bh; ++row) {
+    for (int col = 0; col < bw; ++col) {
+      if (row < half_h && col < half_w) continue;
+      total += abs(output[row * stride + col]);
+    }
+  }
+  return total;
+}
+
+// Sums |coefficient| over a bw x bh block with row pitch `stride`, skipping
+// only the coefficient at row 0, column 0.
+uint64_t av1_dct_ac_sad(tran_low_t *output, int bw, int bh, int stride) {
+  uint64_t total = 0;
+
+  for (int row = 0; row < bh; ++row) {
+    for (int col = 0; col < bw; ++col) {
+      if (row == 0 && col == 0) continue;
+      total += abs(output[row * stride + col]);
+    }
+  }
+
+  return total;
+}
+
+// Returns sse - sum^2 / (bw * bh) for a bw x bh block of 8-bit samples read
+// with row pitch `stride`, i.e. the sum of squared deviations from the block
+// mean. The result is NOT divided by the number of pixels.
+//
+// An empty block (bw <= 0 or bh <= 0) yields 0 rather than dividing by zero.
+uint32_t av1_variance(uint8_t *input, int bw, int bh, int stride) {
+  if (bw <= 0 || bh <= 0) return 0;
+
+  // 64-bit sum so that sum * sum cannot overflow before the division.
+  int64_t sum = 0;
+  uint32_t sse = 0;
+
+  for (int r = 0; r < bh; ++r) {
+    for (int c = 0; c < bw; ++c) {
+      const uint32_t pix = input[r * stride + c];
+      sum += pix;
+      sse += pix * pix;
+    }
+  }
+  return sse - (uint32_t)((sum * sum) / ((int64_t)bw * bh));
+}
+
+// Transforms one 8x8 block of `input` with av1_fdwt8x8_uint8_input_c() into a
+// local coefficient buffer and returns av1_haar_ac_sad() of the result.
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd) {
+  enum { kBlockDim = 8 };
+  tran_low_t coeffs[kBlockDim * kBlockDim];
+
+  av1_fdwt8x8_uint8_input_c(input, coeffs, stride, hbd);
+  return av1_haar_ac_sad(coeffs, kBlockDim, kBlockDim, kBlockDim);
+}
diff --git a/third_party/aom/av1/encoder/dwt.h b/third_party/aom/av1/encoder/dwt.h
new file mode 100644
index 0000000000..37306c6a5f
--- /dev/null
+++ b/third_party/aom/av1/encoder/dwt.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_DWT_H_
+#define AOM_AV1_ENCODER_DWT_H_
+
+#include "av1/common/common.h"
+#include "av1/common/enums.h"
+
+// Sizes the scratch buffer in dwt.c (2 * DWT_MAX_LENGTH entries).
+#define DWT_MAX_LENGTH 64
+
+// NOTE(review): dwt.c as added alongside this header does not define
+// av1_fdwt8x8 -- confirm it is defined elsewhere or drop the declaration.
+void av1_fdwt8x8(tran_low_t *input, tran_low_t *output, int stride);
+// Forward DWT of one 8x8 block read from `input` with row pitch `stride`;
+// writes 8x8 coefficients to `output` with a pitch of 8. When hbd is nonzero
+// the input is read as 16-bit samples.
+void av1_fdwt8x8_uint8_input_c(uint8_t *input, tran_low_t *output, int stride,
+ int hbd);
+// Runs av1_fdwt8x8_uint8_input_c() on the block and returns the sum of
+// absolute coefficients outside the top-left 4x4 quadrant.
+int av1_haar_ac_sad_8x8_uint8_input(uint8_t *input, int stride, int hbd);
+
+#endif // AOM_AV1_ENCODER_DWT_H_
diff --git a/third_party/aom/av1/encoder/encodeframe.c b/third_party/aom/av1/encoder/encodeframe.c
new file mode 100644
index 0000000000..cb226c59e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.c
@@ -0,0 +1,5739 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/system_state.h"
+
+#if CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mv.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/tile_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/global_motion.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/partition_model_weights.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/tokenize.h"
+
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+ ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ int *rate);
+
+// This is used as a reference when computing the source variance for the
+// purposes of activity masking.
+// Eventually this should be replaced by custom no-reference routines,
+// which will be faster.
+// Every one of the 128 entries is 128; av1_get_sby_perpixel_variance() passes
+// this row with a stride of 0.
+static const uint8_t AV1_VAR_OFFS[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+// 16-bit flat reference row (all 128 entries are 128), used by
+// av1_high_get_sby_perpixel_variance() for bd == 8 and as its default.
+static const uint16_t AV1_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128
+};
+
+// 16-bit flat reference row for bd == 10: all 128 entries are 128 * 4 (512).
+// Used by av1_high_get_sby_perpixel_variance().
+static const uint16_t AV1_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4,
+ 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4, 128 * 4
+};
+
+// 16-bit flat reference row for bd == 12: all 128 entries are 128 * 16 (2048).
+// Used by av1_high_get_sby_perpixel_variance().
+static const uint16_t AV1_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16, 128 * 16,
+ 128 * 16, 128 * 16
+};
+
+#if CONFIG_FP_MB_STATS
+// Lookup tables compiled only when CONFIG_FP_MB_STATS is set. Each has
+// BLOCK_SIZES_ALL entries; presumably indexed by BLOCK_SIZE and giving the
+// block width / height in units of 16x16 blocks (minimum 1) -- confirm
+// against the BLOCK_SIZE enum order.
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES_ALL] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 1, 1, 1, 2, 2, 4
+};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES_ALL] = {
+ 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 1, 1, 2, 1, 4, 2
+};
+#endif // CONFIG_FP_MB_STATS
+
+// Variance of the block `ref` (size `bs`) measured against the flat
+// AV1_VAR_OFFS row (stride 0), then scaled down with rounding by
+// num_pels_log2_lookup[bs] bits.
+unsigned int av1_get_sby_perpixel_variance(const AV1_COMP *cpi,
+                                           const struct buf_2d *ref,
+                                           BLOCK_SIZE bs) {
+  unsigned int unused_sse;
+  const unsigned int block_var = cpi->fn_ptr[bs].vf(
+      ref->buf, ref->stride, AV1_VAR_OFFS, 0, &unused_sse);
+  return ROUND_POWER_OF_TWO(block_var, num_pels_log2_lookup[bs]);
+}
+
+// High-bitdepth variant of av1_get_sby_perpixel_variance(): picks the flat
+// 16-bit reference row that matches `bd` (10, 12, or 8 / anything else) and
+// returns the variance of `ref` against it, scaled down with rounding by
+// num_pels_log2_lookup[bs] bits.
+unsigned int av1_high_get_sby_perpixel_variance(const AV1_COMP *cpi,
+                                                const struct buf_2d *ref,
+                                                BLOCK_SIZE bs, int bd) {
+  const uint16_t *flat_row;
+  switch (bd) {
+    case 10: flat_row = AV1_HIGH_VAR_OFFS_10; break;
+    case 12: flat_row = AV1_HIGH_VAR_OFFS_12; break;
+    case 8:
+    default: flat_row = AV1_HIGH_VAR_OFFS_8; break;
+  }
+
+  unsigned int unused_sse;
+  const unsigned int block_var =
+      cpi->fn_ptr[bs].vf(ref->buf, ref->stride, CONVERT_TO_BYTEPTR(flat_row),
+                         0, &unused_sse);
+  return ROUND_POWER_OF_TWO(block_var, num_pels_log2_lookup[bs]);
+}
+
+// Variance of the block `ref` against the co-located block in the LAST_FRAME
+// reference buffer's y plane, scaled down with rounding by
+// num_pels_log2_lookup[bs] bits.
+static unsigned int get_sby_perpixel_diff_variance(const AV1_COMP *const cpi,
+                                                   const struct buf_2d *ref,
+                                                   int mi_row, int mi_col,
+                                                   BLOCK_SIZE bs) {
+  const YV12_BUFFER_CONFIG *last = get_ref_frame_buffer(cpi, LAST_FRAME);
+  assert(last != NULL);
+
+  // Pixel position of (mi_row, mi_col) inside the reference y plane.
+  uint8_t *const last_y =
+      &last->y_buffer[mi_row * MI_SIZE * last->y_stride + mi_col * MI_SIZE];
+
+  unsigned int unused_sse;
+  const unsigned int diff_var = cpi->fn_ptr[bs].vf(
+      ref->buf, ref->stride, last_y, last->y_stride, &unused_sse);
+  return ROUND_POWER_OF_TWO(diff_var, num_pels_log2_lookup[bs]);
+}
+
+// Picks a fixed partition size from the 64x64 difference variance against the
+// last frame: the larger the variance, the smaller the block.
+//   var <    8 -> BLOCK_64X64
+//   var <  128 -> BLOCK_32X32
+//   var < 2048 -> BLOCK_16X16
+//   otherwise  -> BLOCK_8X8
+static BLOCK_SIZE get_rd_var_based_fixed_partition(AV1_COMP *cpi, MACROBLOCK *x,
+                                                   int mi_row, int mi_col) {
+  const unsigned int diff_var = get_sby_perpixel_diff_variance(
+      cpi, &x->plane[0].src, mi_row, mi_col, BLOCK_64X64);
+
+  if (diff_var >= 2048) return BLOCK_8X8;
+  if (diff_var >= 128) return BLOCK_16X16;
+  if (diff_var >= 8) return BLOCK_32X32;
+  return BLOCK_64X64;
+}
+
+// Lighter version of set_offsets that only sets the mode info
+// pointers: xd->mi, xd->mi[0] and x->mbmi_ext for (mi_row, mi_col).
+static void set_mode_info_offsets(const AV1_COMP *const cpi,
+                                  MACROBLOCK *const x, MACROBLOCKD *const xd,
+                                  int mi_row, int mi_col) {
+  const AV1_COMMON *const cm = &cpi->common;
+  // The mi grid is addressed with xd->mi_stride, the mbmi_ext array with
+  // cm->mi_cols.
+  const int grid_offset = xd->mi_stride * mi_row + mi_col;
+  const int ext_offset = mi_row * cm->mi_cols + mi_col;
+
+  xd->mi = cm->mi_grid_visible + grid_offset;
+  xd->mi[0] = cm->mi + grid_offset;
+  x->mbmi_ext = cpi->mbmi_ext_base + ext_offset;
+}
+
+// Points the encoder block state `x` (and its embedded MACROBLOCKD) at the
+// block of size `bsize` located at (mi_row, mi_col) in `tile`: mode info
+// pointers, entropy / txfm contexts, destination and source plane buffers,
+// MV limits, edge distances and the default rdmult. It does not touch the
+// segment id; set_offsets() does that on top of this function.
+static void set_offsets_without_segment_id(const AV1_COMP *const cpi,
+ const TileInfo *const tile,
+ MACROBLOCK *const x, int mi_row,
+ int mi_col, BLOCK_SIZE bsize) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+ set_skip_context(xd, mi_row, mi_col, num_planes);
+ // Above context is per tile row and indexed by column; left context is
+ // indexed by the row masked with MAX_MIB_MASK.
+ xd->above_txfm_context = cm->above_txfm_context[tile->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+ // Set up destination pointers.
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+
+ // Set up limit values for MV components.
+ // Mv beyond the range do not produce new/different prediction block.
+ x->mv_limits.row_min =
+ -(((mi_row + mi_height) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.col_min = -(((mi_col + mi_width) * MI_SIZE) + AOM_INTERP_EXTEND);
+ x->mv_limits.row_max = (cm->mi_rows - mi_row) * MI_SIZE + AOM_INTERP_EXTEND;
+ x->mv_limits.col_max = (cm->mi_cols - mi_col) * MI_SIZE + AOM_INTERP_EXTEND;
+
+ set_plane_n4(xd, mi_width, mi_height, num_planes);
+
+ // Set up distance of MB to edge of frame in 1/8th pel units.
+ // The assert requires the block position to be aligned to the block size.
+ assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+ set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width, cm->mi_rows,
+ cm->mi_cols);
+
+ // Set up source buffers.
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+
+ // R/D setup.
+ x->rdmult = cpi->rd.RDMULT;
+
+ // required by av1_append_sub8x8_mvs_for_idx() and av1_find_best_ref_mvs()
+ xd->tile = *tile;
+}
+
+// Full block setup: runs set_offsets_without_segment_id(), records the block
+// position in xd->cfl, and then resolves the segment id.
+// The segment id is reset to 0. When segmentation is enabled and
+// cpi->vaq_refresh is not set, it is read from the segmentation map (0 if
+// the selected map is NULL). Plane quantizers are re-initialized whenever
+// segmentation is enabled.
+static void set_offsets(const AV1_COMP *const cpi, const TileInfo *const tile,
+                        MACROBLOCK *const x, int mi_row, int mi_col,
+                        BLOCK_SIZE bsize) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+
+  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  xd->cfl.mi_row = mi_row;
+  xd->cfl.mi_col = mi_col;
+
+  mbmi->segment_id = 0;
+
+  // Nothing more to do without segmentation.
+  if (!seg->enabled) return;
+
+  if (!cpi->vaq_refresh) {
+    const uint8_t *const seg_map =
+        seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+    mbmi->segment_id =
+        seg_map ? get_segment_id(cm, seg_map, bsize, mi_row, mi_col) : 0;
+  }
+  av1_init_plane_quantizers(cpi, x, mbmi->segment_id);
+}
+
+// Rebuilds mbmi->interp_filters from its own two per-direction filters:
+// each direction is extracted and the pair is packed again with
+// av1_make_interp_filters().
+static void reset_intmv_filter_type(MB_MODE_INFO *mbmi) {
+  const InterpFilter dir0_filter =
+      av1_extract_interp_filter(mbmi->interp_filters, 0);
+  const InterpFilter dir1_filter =
+      av1_extract_interp_filter(mbmi->interp_filters, 1);
+  mbmi->interp_filters = av1_make_interp_filters(dir0_filter, dir1_filter);
+}
+
+// For each of the two filter directions, counts the block's interpolation
+// filter under its prediction context and, when allow_update_cdf is set,
+// adapts the matching switchable_interp CDF in the tile context.
+static void update_filter_type_count(uint8_t allow_update_cdf,
+                                     FRAME_COUNTS *counts,
+                                     const MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi) {
+  for (int dir = 0; dir < 2; ++dir) {
+    const int pred_ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    const InterpFilter chosen =
+        av1_extract_interp_filter(mbmi->interp_filters, dir);
+
+    ++counts->switchable_interp[pred_ctx][chosen];
+    if (!allow_update_cdf) continue;
+    update_cdf(xd->tile_ctx->switchable_interp_cdf[pred_ctx], chosen,
+               SWITCHABLE_FILTERS);
+  }
+}
+
+// For GLOBALMV / GLOBAL_GLOBALMV blocks, adds the block area (in mi units,
+// mi_size_wide * mi_size_high) to rdc->global_motion_used for each reference
+// frame the block uses. Other modes are ignored.
+static void update_global_motion_used(PREDICTION_MODE mode, BLOCK_SIZE bsize,
+                                      const MB_MODE_INFO *mbmi,
+                                      RD_COUNTS *rdc) {
+  if (mode != GLOBALMV && mode != GLOBAL_GLOBALMV) return;
+
+  const int block_area = mi_size_wide[bsize] * mi_size_high[bsize];
+  const int num_refs = 1 + has_second_ref(mbmi);
+  for (int idx = 0; idx < num_refs; ++idx) {
+    rdc->global_motion_used[mbmi->ref_frame[idx]] += block_area;
+  }
+}
+
+// Re-derives mbmi->tx_size after the segment id may have changed, then
+// resets the dependent state: inter_tx_size (inter blocks only), txk_type
+// (all DCT_DCT), x->blk_skip (zeroed) and x->skip (0).
+//   lossless segment          -> TX_4X4
+//   tx_mode == TX_MODE_SELECT -> current size, raised to the size given by
+//                                depth_to_tx_size(MAX_TX_DEPTH, bsize)
+//   any other tx_mode         -> tx_size_from_tx_mode()
+static void reset_tx_size(MACROBLOCK *x, MB_MODE_INFO *mbmi,
+                          const TX_MODE tx_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  if (xd->lossless[mbmi->segment_id]) {
+    mbmi->tx_size = TX_4X4;
+  } else if (tx_mode == TX_MODE_SELECT) {
+    const BLOCK_SIZE bsize = mbmi->sb_type;
+    const TX_SIZE deepest_tx_size = depth_to_tx_size(MAX_TX_DEPTH, bsize);
+    mbmi->tx_size = (TX_SIZE)TXSIZEMAX(mbmi->tx_size, deepest_tx_size);
+  } else {
+    mbmi->tx_size = tx_size_from_tx_mode(mbmi->sb_type, tx_mode);
+  }
+
+  if (is_inter_block(mbmi)) {
+    memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+  }
+  memset(mbmi->txk_type, DCT_DCT, sizeof(mbmi->txk_type[0]) * TXK_TYPE_BUF_LEN);
+  av1_zero(x->blk_skip);
+  x->skip = 0;
+}
+
+// Commits the mode decision stored in `ctx` for the block of size `bsize` at
+// (mi_row, mi_col): copies ctx->mic and ctx->mbmi_ext into the frame's mode
+// info, restores the per-plane coefficient buffers and skip flags from ctx,
+// applies AQ-related segment id updates, and points the covered mi grid
+// entries at the block's mode info.
+// When dry_run is nonzero the function returns after that. Otherwise it also
+// updates statistics (mode counts under CONFIG_INTERNAL_STATS, global motion
+// usage, interpolation filter counts, compound prediction diffs) and copies
+// the block's motion vectors with av1_copy_frame_mvs().
+static void update_state(const AV1_COMP *const cpi,
+ const TileDataEnc *const tile_data, ThreadData *td,
+ const PICK_MODE_CONTEXT *const ctx, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, RUN_TYPE dry_run) {
+ int i, x_idx, y;
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ RD_COUNTS *const rdc = &td->rd_counts;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const MB_MODE_INFO *const mi = &ctx->mic;
+ MB_MODE_INFO *const mi_addr = xd->mi[0];
+ const struct segmentation *const seg = &cm->seg;
+ const int bw = mi_size_wide[mi->sb_type];
+ const int bh = mi_size_high[mi->sb_type];
+ const int mis = cm->mi_stride;
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+
+ assert(mi->sb_type == bsize);
+
+ // Copy the picked mode info and its extension into the frame buffers.
+ *mi_addr = *mi;
+ *x->mbmi_ext = ctx->mbmi_ext;
+
+ reset_intmv_filter_type(mi_addr);
+
+ memcpy(x->blk_skip, ctx->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+ x->skip = ctx->skip;
+
+ // If segmentation in use
+ if (seg->enabled) {
+ // For in frame complexity AQ copy the segment id from the segment map.
+ if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ mi_addr->segment_id =
+ map ? get_segment_id(cm, map, bsize, mi_row, mi_col) : 0;
+ reset_tx_size(x, mi_addr, cm->tx_mode);
+ }
+ // Else for cyclic refresh mode update the segment map, set the segment id
+ // and then update the quantizer.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_update_segment(cpi, mi_addr, mi_row, mi_col, bsize,
+ ctx->rate, ctx->dist, x->skip);
+ reset_tx_size(x, mi_addr, cm->tx_mode);
+ }
+ // Fall back to UV_DC_PRED if CfL was picked but is_cfl_allowed() now
+ // rejects it.
+ if (mi_addr->uv_mode == UV_CFL_PRED && !is_cfl_allowed(xd))
+ mi_addr->uv_mode = UV_DC_PRED;
+ }
+
+ // Point the per-plane working buffers at the ones stored in ctx.
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+ // Restore the coding context of the MB to that that was in place
+ // when the mode was picked for it
+ // Every mi grid entry covered by the block is pointed at mi_addr; the
+ // mb_to_right_edge / mb_to_bottom_edge tests presumably skip positions that
+ // fall outside the frame.
+ for (y = 0; y < mi_height; y++)
+ for (x_idx = 0; x_idx < mi_width; x_idx++)
+ if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx &&
+ (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+ xd->mi[x_idx + y * mis] = mi_addr;
+ }
+
+ if (cpi->oxcf.aq_mode) av1_init_plane_quantizers(cpi, x, mi_addr->segment_id);
+
+ // Everything below only updates statistics and frame MVs; skip it for dry
+ // runs.
+ if (dry_run) return;
+
+#if CONFIG_INTERNAL_STATS
+ {
+ unsigned int *const mode_chosen_counts =
+ (unsigned int *)cpi->mode_chosen_counts; // Cast const away.
+ if (frame_is_intra_only(cm)) {
+ static const int kf_mode_index[] = {
+ THR_DC /*DC_PRED*/,
+ THR_V_PRED /*V_PRED*/,
+ THR_H_PRED /*H_PRED*/,
+ THR_D45_PRED /*D45_PRED*/,
+ THR_D135_PRED /*D135_PRED*/,
+ THR_D113_PRED /*D113_PRED*/,
+ THR_D157_PRED /*D157_PRED*/,
+ THR_D203_PRED /*D203_PRED*/,
+ THR_D67_PRED /*D67_PRED*/,
+ THR_SMOOTH, /*SMOOTH_PRED*/
+ THR_SMOOTH_V, /*SMOOTH_V_PRED*/
+ THR_SMOOTH_H, /*SMOOTH_H_PRED*/
+ THR_PAETH /*PAETH_PRED*/,
+ };
+ ++mode_chosen_counts[kf_mode_index[mi_addr->mode]];
+ } else {
+ // Note how often each mode chosen as best
+ ++mode_chosen_counts[ctx->best_mode_index];
+ }
+ }
+#endif
+ if (!frame_is_intra_only(cm)) {
+ if (is_inter_block(mi_addr)) {
+ // TODO(sarahparker): global motion stats need to be handled per-tile
+ // to be compatible with tile-based threading.
+ update_global_motion_used(mi_addr->mode, bsize, mi_addr, rdc);
+ }
+
+ if (cm->interp_filter == SWITCHABLE &&
+ mi_addr->motion_mode != WARPED_CAUSAL &&
+ !is_nontrans_global_motion(xd, xd->mi[0])) {
+ update_filter_type_count(tile_data->allow_update_cdf, td->counts, xd,
+ mi_addr);
+ }
+
+ rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+ rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+ rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+ }
+
+ // Clamp the block extent to the frame before copying its MVs.
+ const int x_mis = AOMMIN(bw, cm->mi_cols - mi_col);
+ const int y_mis = AOMMIN(bh, cm->mi_rows - mi_row);
+ av1_copy_frame_mvs(cm, mi, mi_row, mi_col, x_mis, y_mis);
+}
+
+// Makes `src` the current frame of x->e_mbd and points each plane's source
+// buffer descriptor (x->plane[i].src) at the block located at
+// (mi_row, mi_col). Planes 1 and above share the chroma (index 1) crop
+// sizes and stride.
+void av1_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
+                          int mi_row, int mi_col, const int num_planes) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Set current frame pointer.
+  xd->cur_buf = src;
+
+  // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
+  // the static analysis warnings.
+  const int plane_count = AOMMIN(num_planes, MAX_MB_PLANE);
+  for (int plane = 0; plane < plane_count; ++plane) {
+    const int is_uv = plane > 0;
+    setup_pred_plane(&x->plane[plane].src, xd->mi[0]->sb_type,
+                     src->buffers[plane], src->crop_widths[is_uv],
+                     src->crop_heights[is_uv], src->strides[is_uv], mi_row,
+                     mi_col, NULL, xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+  }
+}
+
+// Re-initializes the plane quantizers of `x` for `segment_id` and returns the
+// rd multiplier computed from that segment's qindex plus cm->y_dc_delta_q.
+static int set_segment_rdmult(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              int8_t segment_id) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  av1_init_plane_quantizers(cpi, x, segment_id);
+  aom_clear_system_state();
+
+  const int seg_qindex = av1_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+  const int rd_qindex = seg_qindex + cm->y_dc_delta_q;
+  return av1_compute_rd_mult(cpi, rd_qindex);
+}
+
+// Returns the rd multiplier for the block's delta-q adjusted qindex:
+// cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q.
+static int set_deltaq_rdmult(const AV1_COMP *const cpi, MACROBLOCKD *const xd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int rd_qindex =
+      cm->base_qindex + xd->delta_qindex + cm->y_dc_delta_q;
+  return av1_compute_rd_mult(cpi, rd_qindex);
+}
+
+// Picks the coding mode for one block of size `bsize` at (mi_row, mi_col)
+// that is part of `partition`, and reports the result in *rd_cost and in
+// ctx->rate / ctx->dist / ctx->rdcost.
+//   - If best_rd < 0, no search is done: ctx->rdcost is set to INT64_MAX and
+//     *rd_cost is invalidated.
+//   - If ctx->rd_mode_is_ready, the mode stored in ctx is reused and the
+//     function returns without searching.
+//   - Otherwise x->rdmult is adjusted for the active AQ / delta-q mode, the
+//     intra or inter mode search runs, and x->rdmult is restored afterwards.
+static void rd_pick_sb_modes(AV1_COMP *const cpi, TileDataEnc *tile_data,
+ MACROBLOCK *const x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, PARTITION_TYPE partition,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi;
+ MB_MODE_INFO *ctx_mbmi = &ctx->mic;
+ struct macroblock_plane *const p = x->plane;
+ struct macroblockd_plane *const pd = xd->plane;
+ const AQ_MODE aq_mode = cpi->oxcf.aq_mode;
+ const DELTAQ_MODE deltaq_mode = cpi->oxcf.deltaq_mode;
+ int i, orig_rdmult;
+
+ // A negative budget means nothing can beat the current best: bail out.
+ if (best_rd < 0) {
+ ctx->rdcost = INT64_MAX;
+ ctx->skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+
+ aom_clear_system_state();
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ mbmi = xd->mi[0];
+
+ // Either reuse the mode already stored in ctx, or start a fresh mode info
+ // for this block size / partition.
+ if (ctx->rd_mode_is_ready) {
+ assert(ctx_mbmi->sb_type == bsize);
+ assert(ctx_mbmi->partition == partition);
+ *mbmi = *ctx_mbmi;
+ rd_cost->rate = ctx->rate;
+ rd_cost->dist = ctx->dist;
+ rd_cost->rdcost = ctx->rdcost;
+ } else {
+ mbmi->sb_type = bsize;
+ mbmi->partition = partition;
+ }
+
+#if CONFIG_RD_DEBUG
+ mbmi->mi_row = mi_row;
+ mbmi->mi_col = mi_col;
+#endif
+
+ // Point the per-plane working buffers at the ones owned by ctx.
+ for (i = 0; i < num_planes; ++i) {
+ p[i].coeff = ctx->coeff[i];
+ p[i].qcoeff = ctx->qcoeff[i];
+ pd[i].dqcoeff = ctx->dqcoeff[i];
+ p[i].eobs = ctx->eobs[i];
+ p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+ }
+
+ for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i];
+
+ if (!ctx->rd_mode_is_ready) {
+ ctx->skippable = 0;
+
+ // Set to zero to make sure we do not use the previous encoded frame stats
+ mbmi->skip = 0;
+
+ // Reset skip mode flag.
+ mbmi->skip_mode = 0;
+ }
+
+ x->skip_chroma_rd =
+ !is_chroma_reference(mi_row, mi_col, bsize, xd->plane[1].subsampling_x,
+ xd->plane[1].subsampling_y);
+
+ // Reused mode: restore the remaining cached state and skip the search.
+ if (ctx->rd_mode_is_ready) {
+ x->skip = ctx->skip;
+ *x->mbmi_ext = ctx->mbmi_ext;
+ return;
+ }
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ x->source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ x->source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+
+ // Save rdmult before it might be changed, so it can be restored later.
+ orig_rdmult = x->rdmult;
+
+ if (aq_mode == VARIANCE_AQ) {
+ if (cpi->vaq_refresh) {
+ const int energy = bsize <= BLOCK_16X16
+ ? x->mb_energy
+ : av1_log_block_var(cpi, x, bsize);
+ mbmi->segment_id = energy;
+ }
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == COMPLEXITY_AQ) {
+ x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
+ } else if (aq_mode == CYCLIC_REFRESH_AQ) {
+ // If segment is boosted, use rdmult for that segment.
+ if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
+ x->rdmult = av1_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
+ }
+
+ // Delta-q, when enabled, overrides whatever the AQ branch above selected.
+ if (deltaq_mode > 0) x->rdmult = set_deltaq_rdmult(cpi, xd);
+
+ // Find best coding mode & reconstruct the MB so it is available
+ // as a predictor for MBs that follow in the SB
+ if (frame_is_intra_only(cm)) {
+ av1_rd_pick_intra_mode_sb(cpi, x, mi_row, mi_col, rd_cost, bsize, ctx,
+ best_rd);
+ } else {
+ if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col,
+ rd_cost, bsize, ctx, best_rd);
+ } else {
+ av1_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+ bsize, ctx, best_rd);
+ }
+ }
+
+ // Examine the resulting rate and for AQ mode 2 make a segment choice.
+ if ((rd_cost->rate != INT_MAX) && (aq_mode == COMPLEXITY_AQ) &&
+ (bsize >= BLOCK_16X16) &&
+ (cm->frame_type == KEY_FRAME || cpi->refresh_alt_ref_frame ||
+ cpi->refresh_alt2_ref_frame ||
+ (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref))) {
+ av1_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
+ }
+
+ x->rdmult = orig_rdmult;
+
+ // TODO(jingning) The rate-distortion optimization flow needs to be
+ // refactored to provide proper exit/return handle.
+ if (rd_cost->rate == INT_MAX) rd_cost->rdcost = INT64_MAX;
+
+ // Cache the result in ctx so that it can be reused.
+ ctx->rate = rd_cost->rate;
+ ctx->dist = rd_cost->dist;
+ ctx->rdcost = rd_cost->rdcost;
+}
+
+// Records the single-reference inter mode as a chain of up to three binary
+// decisions, each with its own context taken from `mode_context`:
+//   1. NEWMV or not       (newmv_cdf)
+//   2. GLOBALMV or not    (zeromv_cdf)
+//   3. NEARESTMV or not   (refmv_cdf)
+// The chain stops at the first decision that identifies the mode. Counts are
+// only kept under CONFIG_ENTROPY_STATS; CDFs are only adapted when
+// allow_update_cdf is set.
+static void update_inter_mode_stats(FRAME_CONTEXT *fc, FRAME_COUNTS *counts,
+                                    PREDICTION_MODE mode, int16_t mode_context,
+                                    uint8_t allow_update_cdf) {
+  (void)counts;
+
+  // Decision 1: NEWMV (symbol 0) versus anything else (symbol 1).
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  const int not_newmv = (mode != NEWMV);
+#if CONFIG_ENTROPY_STATS
+  ++counts->newmv_mode[newmv_ctx][not_newmv];
+#endif
+  if (allow_update_cdf) update_cdf(fc->newmv_cdf[newmv_ctx], not_newmv, 2);
+  if (!not_newmv) return;
+
+  // Decision 2: GLOBALMV (symbol 0) versus anything else (symbol 1).
+  const int16_t globalmv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  const int not_globalmv = (mode != GLOBALMV);
+#if CONFIG_ENTROPY_STATS
+  ++counts->zeromv_mode[globalmv_ctx][not_globalmv];
+#endif
+  if (allow_update_cdf) {
+    update_cdf(fc->zeromv_cdf[globalmv_ctx], not_globalmv, 2);
+  }
+  if (!not_globalmv) return;
+
+  // Decision 3: NEARESTMV (symbol 0) versus anything else (symbol 1).
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+#if CONFIG_ENTROPY_STATS
+  ++counts->refmv_mode[refmv_ctx][mode != NEARESTMV];
+#endif
+  if (allow_update_cdf) {
+    update_cdf(fc->refmv_cdf[refmv_ctx], mode != NEARESTMV, 2);
+  }
+}
+
+// Updates palette statistics for one block.
+//   - When the block's mode is DC_PRED: whether a Y palette is used
+//     (palette_size[0] > 0) and, if so, its size.
+//   - When the block's uv_mode is UV_DC_PRED: whether a UV palette is used
+//     (palette_size[1] > 0) and, if so, its size. The context for the UV
+//     flag is whether a Y palette is present.
+// Counts are only kept under CONFIG_ENTROPY_STATS; CDFs are only adapted when
+// allow_update_cdf is set.
+static void update_palette_cdf(MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+                               FRAME_COUNTS *counts, uint8_t allow_update_cdf) {
+  FRAME_CONTEXT *const fc = xd->tile_ctx;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int bsize_ctx = av1_get_palette_bsize_ctx(mbmi->sb_type);
+
+  (void)counts;
+
+  if (mbmi->mode == DC_PRED) {
+    const int y_colors = pmi->palette_size[0];
+    const int y_mode_ctx = av1_get_palette_mode_ctx(xd);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_y_mode[bsize_ctx][y_mode_ctx][y_colors > 0];
+#endif
+    if (allow_update_cdf) {
+      update_cdf(fc->palette_y_mode_cdf[bsize_ctx][y_mode_ctx], y_colors > 0,
+                 2);
+    }
+    if (y_colors > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_y_size[bsize_ctx][y_colors - PALETTE_MIN_SIZE];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(fc->palette_y_size_cdf[bsize_ctx],
+                   y_colors - PALETTE_MIN_SIZE, PALETTE_SIZES);
+      }
+    }
+  }
+
+  if (mbmi->uv_mode == UV_DC_PRED) {
+    const int uv_colors = pmi->palette_size[1];
+    const int uv_mode_ctx = (pmi->palette_size[0] > 0);
+
+#if CONFIG_ENTROPY_STATS
+    ++counts->palette_uv_mode[uv_mode_ctx][uv_colors > 0];
+#endif
+    if (allow_update_cdf) {
+      update_cdf(fc->palette_uv_mode_cdf[uv_mode_ctx], uv_colors > 0, 2);
+    }
+
+    if (uv_colors > 0) {
+#if CONFIG_ENTROPY_STATS
+      ++counts->palette_uv_size[bsize_ctx][uv_colors - PALETTE_MIN_SIZE];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(fc->palette_uv_size_cdf[bsize_ctx],
+                   uv_colors - PALETTE_MIN_SIZE, PALETTE_SIZES);
+      }
+    }
+  }
+}
+
+// Records the intra syntax elements of one block.  For every element handled
+// below, the matching CDF in xd->tile_ctx is adapted when allow_update_cdf is
+// non-zero and, in CONFIG_ENTROPY_STATS builds, the matching entry of counts
+// is incremented.
+//
+//   above_mi, left_mi  neighbours used to pick the y-mode model when
+//                      intraonly is non-zero.
+//   intraonly          non-zero: y mode uses get_y_mode_cdf(fc, above_mi,
+//                      left_mi); zero: y mode uses fc->y_mode_cdf indexed by
+//                      size_group_lookup[bsize].
+//   mi_row, mi_col     only passed to is_chroma_reference(); when that returns
+//                      zero the function returns before any chroma element.
+static void sum_intra_stats(const AV1_COMMON *const cm, FRAME_COUNTS *counts,
+ MACROBLOCKD *xd, const MB_MODE_INFO *const mbmi,
+ const MB_MODE_INFO *above_mi,
+ const MB_MODE_INFO *left_mi, const int intraonly,
+ const int mi_row, const int mi_col,
+ uint8_t allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const PREDICTION_MODE y_mode = mbmi->mode;
+ const UV_PREDICTION_MODE uv_mode = mbmi->uv_mode;
+ // counts is only referenced in CONFIG_ENTROPY_STATS builds.
+ (void)counts;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+
+ // Luma prediction mode.
+ if (intraonly) {
+#if CONFIG_ENTROPY_STATS
+ const PREDICTION_MODE above = av1_above_block_mode(above_mi);
+ const PREDICTION_MODE left = av1_left_block_mode(left_mi);
+ const int above_ctx = intra_mode_context[above];
+ const int left_ctx = intra_mode_context[left];
+ ++counts->kf_y_mode[above_ctx][left_ctx][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(get_y_mode_cdf(fc, above_mi, left_mi), y_mode, INTRA_MODES);
+ } else {
+#if CONFIG_ENTROPY_STATS
+ ++counts->y_mode[size_group_lookup[bsize]][y_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(fc->y_mode_cdf[size_group_lookup[bsize]], y_mode, INTRA_MODES);
+ }
+
+ // Filter-intra on/off flag and, when on, the filter-intra mode.
+ if (av1_filter_intra_allowed(cm, mbmi)) {
+ const int use_filter_intra_mode =
+ mbmi->filter_intra_mode_info.use_filter_intra;
+#if CONFIG_ENTROPY_STATS
+ ++counts->filter_intra[mbmi->sb_type][use_filter_intra_mode];
+ if (use_filter_intra_mode) {
+ ++counts
+ ->filter_intra_mode[mbmi->filter_intra_mode_info.filter_intra_mode];
+ }
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(fc->filter_intra_cdfs[mbmi->sb_type], use_filter_intra_mode,
+ 2);
+ if (use_filter_intra_mode) {
+ update_cdf(fc->filter_intra_mode_cdf,
+ mbmi->filter_intra_mode_info.filter_intra_mode,
+ FILTER_INTRA_MODES);
+ }
+ }
+ }
+ // Luma angle delta, shifted by MAX_ANGLE_DELTA so the symbol is
+ // non-negative; the model is indexed by the mode's offset from V_PRED.
+ if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[mbmi->mode - V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->angle_delta_cdf[mbmi->mode - V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_Y] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ }
+
+ // Everything below is chroma-related and skipped for blocks that are not a
+ // chroma reference.
+ if (!is_chroma_reference(mi_row, mi_col, bsize,
+ xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y))
+ return;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->uv_mode[is_cfl_allowed(xd)][y_mode][uv_mode];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ const CFL_ALLOWED_TYPE cfl_allowed = is_cfl_allowed(xd);
+ // The uv-mode alphabet is one symbol shorter when CfL is not allowed.
+ update_cdf(fc->uv_mode_cdf[cfl_allowed][y_mode], uv_mode,
+ UV_INTRA_MODES - !cfl_allowed);
+ }
+ // CfL: joint sign first, then one alpha index per plane whose sign is not
+ // CFL_SIGN_ZERO.
+ if (uv_mode == UV_CFL_PRED) {
+ const int joint_sign = mbmi->cfl_alpha_signs;
+ const int idx = mbmi->cfl_alpha_idx;
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_sign[joint_sign];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->cfl_sign_cdf, joint_sign, CFL_JOINT_SIGNS);
+ if (CFL_SIGN_U(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_U(joint_sign)][CFL_IDX_U(idx)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(cdf_u, CFL_IDX_U(idx), CFL_ALPHABET_SIZE);
+ }
+ if (CFL_SIGN_V(joint_sign) != CFL_SIGN_ZERO) {
+ aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+
+#if CONFIG_ENTROPY_STATS
+ ++counts->cfl_alpha[CFL_CONTEXT_V(joint_sign)][CFL_IDX_V(idx)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(cdf_v, CFL_IDX_V(idx), CFL_ALPHABET_SIZE);
+ }
+ }
+ // Chroma angle delta; uses the same angle_delta counters/CDFs as luma,
+ // indexed by the uv mode's offset from UV_V_PRED.
+ if (av1_is_directional_mode(get_uv_mode(uv_mode)) &&
+ av1_use_angle_delta(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->angle_delta[uv_mode - UV_V_PRED]
+ [mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->angle_delta_cdf[uv_mode - UV_V_PRED],
+ mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA,
+ 2 * MAX_ANGLE_DELTA + 1);
+ }
+ }
+ // Palette flags and sizes are handled by update_palette_cdf().
+ if (av1_allow_palette(cm->allow_screen_content_tools, bsize))
+ update_palette_cdf(xd, mbmi, counts, allow_update_cdf);
+}
+
+// Records the mode-info syntax elements of the block at xd->mi[0].
+//
+// For each element handled below, the matching CDF in xd->tile_ctx is adapted
+// when tile_data->allow_update_cdf is non-zero and, in CONFIG_ENTROPY_STATS
+// builds, the matching entry of td->counts is incremented.  In addition the
+// function
+//   - copies the block's current_qindex / delta_lf values into xd when the
+//     delta-q condition below holds,
+//   - sets skip_mode_used_flag / compound_ref_used_flag in td->rd_counts,
+//   - calls set_ref_ptrs() for skip-mode and inter blocks on non-intra-only
+//     frames.
+//
+//   mi_row, mi_col  position of the block in mi units; used for the
+//                   superblock-corner test and forwarded to sum_intra_stats().
+static void update_stats(const AV1_COMMON *const cm, TileDataEnc *tile_data,
+ ThreadData *td, int mi_row, int mi_col) {
+ MACROBLOCK *x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ const uint8_t allow_update_cdf = tile_data->allow_update_cdf;
+
+ // delta quant applies to both intra and inter
+ // True when (mi_row, mi_col) is the top-left corner of a superblock.
+ const int super_block_upper_left =
+ ((mi_row & (cm->seq_params.mib_size - 1)) == 0) &&
+ ((mi_col & (cm->seq_params.mib_size - 1)) == 0);
+
+ const int seg_ref_active =
+ segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_REF_FRAME);
+
+ // skip_mode flag.
+ if (cm->skip_mode_flag && !seg_ref_active && is_comp_ref_allowed(bsize)) {
+ const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip_mode[skip_mode_ctx][mbmi->skip_mode]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->skip_mode_cdfs[skip_mode_ctx], mbmi->skip_mode, 2);
+ }
+
+ // skip flag; not recorded for skip-mode blocks or when the segment fixes
+ // the reference frame.
+ if (!mbmi->skip_mode) {
+ if (!seg_ref_active) {
+ const int skip_ctx = av1_get_skip_context(xd);
+#if CONFIG_ENTROPY_STATS
+ td->counts->skip[skip_ctx][mbmi->skip]++;
+#endif
+ if (allow_update_cdf) update_cdf(fc->skip_cdfs[skip_ctx], mbmi->skip, 2);
+ }
+ }
+
+ // Delta-q / delta-lf: only for the first block of a superblock, and not for
+ // a skipped block that covers the whole superblock.  Only counters are
+ // updated here (no CDF adaptation); xd is then brought up to date with the
+ // block's values.
+ if (cm->delta_q_present_flag &&
+ (bsize != cm->seq_params.sb_size || !mbmi->skip) &&
+ super_block_upper_left) {
+#if CONFIG_ENTROPY_STATS
+ const int dq =
+ (mbmi->current_qindex - xd->current_qindex) / cm->delta_q_res;
+ const int absdq = abs(dq);
+ for (int i = 0; i < AOMMIN(absdq, DELTA_Q_SMALL); ++i) {
+ td->counts->delta_q[i][1]++;
+ }
+ if (absdq < DELTA_Q_SMALL) td->counts->delta_q[absdq][0]++;
+#endif
+ xd->current_qindex = mbmi->current_qindex;
+ if (cm->delta_lf_present_flag) {
+ if (cm->delta_lf_multi) {
+ // Two fewer loop-filter deltas are tracked when there is a single plane.
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+#if CONFIG_ENTROPY_STATS
+ const int delta_lf =
+ (mbmi->delta_lf[lf_id] - xd->delta_lf[lf_id]) / cm->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf_multi[lf_id][i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf_multi[lf_id][abs_delta_lf][0]++;
+#endif
+ xd->delta_lf[lf_id] = mbmi->delta_lf[lf_id];
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ const int delta_lf =
+ (mbmi->delta_lf_from_base - xd->delta_lf_from_base) /
+ cm->delta_lf_res;
+ const int abs_delta_lf = abs(delta_lf);
+ for (int i = 0; i < AOMMIN(abs_delta_lf, DELTA_LF_SMALL); ++i) {
+ td->counts->delta_lf[i][1]++;
+ }
+ if (abs_delta_lf < DELTA_LF_SMALL)
+ td->counts->delta_lf[abs_delta_lf][0]++;
+#endif
+ xd->delta_lf_from_base = mbmi->delta_lf_from_base;
+ }
+ }
+ }
+
+ // Intra blocks: all intra syntax is handled by sum_intra_stats().
+ if (!is_inter_block(mbmi)) {
+ sum_intra_stats(cm, td->counts, xd, mbmi, xd->above_mbmi, xd->left_mbmi,
+ frame_is_intra_only(cm), mi_row, mi_col,
+ tile_data->allow_update_cdf);
+ }
+
+ // IntraBC on/off flag.
+ if (av1_allow_intrabc(cm)) {
+ if (allow_update_cdf)
+ update_cdf(fc->intrabc_cdf, is_intrabc_block(mbmi), 2);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->intrabc[is_intrabc_block(mbmi)];
+#endif // CONFIG_ENTROPY_STATS
+ }
+
+ // The rest only applies to frames that may contain inter blocks.
+ if (!frame_is_intra_only(cm)) {
+ RD_COUNTS *rdc = &td->rd_counts;
+
+ FRAME_COUNTS *const counts = td->counts;
+
+ // Skip-mode blocks: only the usage flags and reference pointers are set;
+ // none of the statistics below are recorded.
+ if (mbmi->skip_mode) {
+ rdc->skip_mode_used_flag = 1;
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ assert(has_second_ref(mbmi));
+ rdc->compound_ref_used_flag = 1;
+ }
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ return;
+ }
+
+ const int inter_block = is_inter_block(mbmi);
+
+ if (!seg_ref_active) {
+#if CONFIG_ENTROPY_STATS
+ counts->intra_inter[av1_get_intra_inter_context(xd)][inter_block]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->intra_inter_cdf[av1_get_intra_inter_context(xd)],
+ inter_block, 2);
+ }
+ // If the segment reference feature is enabled we have only a single
+ // reference frame allowed for the segment so exclude it from
+ // the reference frame counts used to work out probabilities.
+ if (inter_block) {
+ const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+ const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ // Single vs. compound reference flag.
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ if (has_second_ref(mbmi))
+ // This flag is also updated for 4x4 blocks
+ rdc->compound_ref_used_flag = 1;
+ if (is_comp_ref_allowed(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->comp_inter[av1_get_reference_mode_context(xd)]
+ [has_second_ref(mbmi)]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(av1_get_reference_mode_cdf(xd), has_second_ref(mbmi),
+ 2);
+ }
+ }
+ }
+
+ if (has_second_ref(mbmi)) {
+ // Compound reference: type (uni-/bi-directional) first, then the
+ // binary decisions that identify the reference pair.
+ const COMP_REFERENCE_TYPE comp_ref_type = has_uni_comp_refs(mbmi)
+ ? UNIDIR_COMP_REFERENCE
+ : BIDIR_COMP_REFERENCE;
+ if (allow_update_cdf) {
+ update_cdf(av1_get_comp_reference_type_cdf(xd), comp_ref_type,
+ COMP_REFERENCE_TYPES);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref_type[av1_get_comp_reference_type_context(xd)]
+ [comp_ref_type]++;
+#endif // CONFIG_ENTROPY_STATS
+
+ if (comp_ref_type == UNIDIR_COMP_REFERENCE) {
+ const int bit = (ref0 == BWDREF_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p(xd)][0]
+ [bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ const int bit1 = (ref1 == LAST3_FRAME || ref1 == GOLDEN_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p1(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p1(xd)][1]
+ [bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit1) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_uni_comp_ref_p2(xd),
+ ref1 == GOLDEN_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->uni_comp_ref[av1_get_pred_context_uni_comp_ref_p2(xd)]
+ [2][ref1 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ // Bi-directional pair: ref0 is resolved first, then ref1.
+ const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_comp_ref_p(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p1(xd),
+ ref0 == LAST2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p1(xd)][1]
+ [ref0 == LAST2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_ref_p2(xd),
+ ref0 == GOLDEN_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_ref[av1_get_pred_context_comp_ref_p2(xd)][2]
+ [ref0 == GOLDEN_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p(xd),
+ ref1 == ALTREF_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p(xd)][0]
+ [ref1 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref1 != ALTREF_FRAME) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_comp_bwdref_p1(xd),
+ ref1 == ALTREF2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->comp_bwdref[av1_get_pred_context_comp_bwdref_p1(xd)][1]
+ [ref1 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ } else {
+ // Single reference: a sequence of binary decisions on ref0.
+ const int bit = (ref0 >= BWDREF_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_single_ref_p1(xd), bit, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p1(xd)][0][bit]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (bit) {
+ assert(ref0 <= ALTREF_FRAME);
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p2(xd),
+ ref0 == ALTREF_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p2(xd)][1]
+ [ref0 == ALTREF_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (ref0 != ALTREF_FRAME) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p6(xd),
+ ref0 == ALTREF2_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p6(xd)][5]
+ [ref0 == ALTREF2_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ } else {
+ const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+ if (allow_update_cdf)
+ update_cdf(av1_get_pred_cdf_single_ref_p3(xd), bit1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts
+ ->single_ref[av1_get_pred_context_single_ref_p3(xd)][2][bit1]++;
+#endif // CONFIG_ENTROPY_STATS
+ if (!bit1) {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p4(xd),
+ ref0 != LAST_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p4(xd)][3]
+ [ref0 != LAST_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ if (allow_update_cdf) {
+ update_cdf(av1_get_pred_cdf_single_ref_p5(xd),
+ ref0 != LAST3_FRAME, 2);
+ }
+#if CONFIG_ENTROPY_STATS
+ counts->single_ref[av1_get_pred_context_single_ref_p5(xd)][4]
+ [ref0 != LAST3_FRAME]++;
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ }
+
+ // Inter-intra: on/off flag, then (when on) mode, wedge flag and wedge
+ // index.  ref_frame[1] == INTRA_FRAME marks an inter-intra block here.
+ if (cm->seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi)) {
+ const int bsize_group = size_group_lookup[bsize];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][1]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->interintra_cdf[bsize_group], 1, 2);
+#if CONFIG_ENTROPY_STATS
+ counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->interintra_mode_cdf[bsize_group],
+ mbmi->interintra_mode, INTERINTRA_MODES);
+ }
+ if (is_interintra_wedge_used(bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_interintra_cdf[bsize],
+ mbmi->use_wedge_interintra, 2);
+ }
+ if (mbmi->use_wedge_interintra) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interintra_wedge_index]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interintra_wedge_index, 16);
+ }
+ }
+ }
+ } else {
+#if CONFIG_ENTROPY_STATS
+ counts->interintra[bsize_group][0]++;
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->interintra_cdf[bsize_group], 0, 2);
+ }
+ }
+
+ // Motion mode: which model is used depends on the most permissive mode
+ // allowed for this block (full motion_mode symbol vs. OBMC flag only).
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ const MOTION_MODE motion_allowed =
+ cm->switchable_motion_mode
+ ? motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion)
+ : SIMPLE_TRANSLATION;
+ if (mbmi->ref_frame[1] != INTRA_FRAME) {
+ if (motion_allowed == WARPED_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->motion_mode[bsize][mbmi->motion_mode]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->motion_mode_cdf[bsize], mbmi->motion_mode,
+ MOTION_MODES);
+ }
+ } else if (motion_allowed == OBMC_CAUSAL) {
+#if CONFIG_ENTROPY_STATS
+ counts->obmc[bsize][mbmi->motion_mode == OBMC_CAUSAL]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->obmc_cdf[bsize], mbmi->motion_mode == OBMC_CAUSAL,
+ 2);
+ }
+ }
+ }
+
+ // Compound prediction: group index, then either the compound index
+ // (group 0) or the masked compound type (group 1).
+ if (has_second_ref(mbmi)) {
+ assert(cm->reference_mode != SINGLE_REFERENCE &&
+ is_inter_compound_mode(mbmi->mode) &&
+ mbmi->motion_mode == SIMPLE_TRANSLATION);
+
+ const int masked_compound_used =
+ is_any_masked_compound_used(bsize) &&
+ cm->seq_params.enable_masked_compound;
+ if (masked_compound_used) {
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->comp_group_idx[comp_group_idx_ctx][mbmi->comp_group_idx];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->comp_group_idx_cdf[comp_group_idx_ctx],
+ mbmi->comp_group_idx, 2);
+ }
+ }
+
+ if (mbmi->comp_group_idx == 0) {
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_index[comp_index_ctx][mbmi->compound_idx];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->compound_index_cdf[comp_index_ctx],
+ mbmi->compound_idx, 2);
+ }
+ } else {
+ assert(masked_compound_used);
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->compound_type[bsize][mbmi->interinter_comp.type - 1];
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->compound_type_cdf[bsize],
+ mbmi->interinter_comp.type - 1, COMPOUND_TYPES - 1);
+ }
+ }
+ }
+ }
+ // Inter-inter wedge index; shares wedge_idx counters/CDFs with the
+ // inter-intra wedge index above.
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ if (is_interinter_compound_used(COMPOUND_WEDGE, bsize)) {
+#if CONFIG_ENTROPY_STATS
+ counts->wedge_idx[bsize][mbmi->interinter_comp.wedge_index]++;
+#endif
+ if (allow_update_cdf) {
+ update_cdf(fc->wedge_idx_cdf[bsize],
+ mbmi->interinter_comp.wedge_index, 16);
+ }
+ }
+ }
+ }
+ }
+
+ // Inter prediction mode and (counters only) the DRL index decisions.
+ if (inter_block &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ int16_t mode_ctx;
+ const PREDICTION_MODE mode = mbmi->mode;
+
+ mode_ctx =
+ av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+ if (has_second_ref(mbmi)) {
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+#endif
+ if (allow_update_cdf)
+ update_cdf(fc->inter_compound_mode_cdf[mode_ctx],
+ INTER_COMPOUND_OFFSET(mode), INTER_COMPOUND_MODES);
+ } else {
+ update_inter_mode_stats(fc, counts, mode, mode_ctx, allow_update_cdf);
+ }
+
+ // NEWMV / NEW_NEWMV: up to two DRL decisions, stopping at the chosen
+ // ref_mv_idx.  Only the counters are touched in this loop; no CDF is
+ // adapted here.
+ int mode_allowed = (mbmi->mode == NEWMV);
+ mode_allowed |= (mbmi->mode == NEW_NEWMV);
+ if (mode_allowed) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 0; idx < 2; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+#endif
+
+ if (mbmi->ref_mv_idx == idx) break;
+ }
+ }
+ }
+
+ // Modes containing NEARMV: same scheme, with the stack index offset by
+ // one relative to ref_mv_idx.  Again counters only.
+ if (have_nearmv_in_inter_mode(mbmi->mode)) {
+ uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+ int idx;
+
+ for (idx = 1; idx < 3; ++idx) {
+ if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+#if CONFIG_ENTROPY_STATS
+ uint8_t drl_ctx =
+ av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+ ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+#endif
+
+ if (mbmi->ref_mv_idx == idx - 1) break;
+ }
+ }
+ }
+ }
+ }
+}
+
+// Snapshot of the above/left context state of one block, filled by
+// save_context() and written back by restore_context().
+typedef struct {
+ // Above / left entropy contexts; each plane's copy starts at its own offset
+ // within the array.
+ ENTROPY_CONTEXT a[MAX_MIB_SIZE * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[MAX_MIB_SIZE * MAX_MB_PLANE];
+ // Above / left partition (segmentation of the partition tree) contexts.
+ PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+ PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+ // Saved values of xd->above_txfm_context / xd->left_txfm_context (the
+ // pointers themselves).
+ TXFM_CONTEXT *p_ta;
+ TXFM_CONTEXT *p_tl;
+ // Copies of the transform-size context entries those pointers referred to.
+ TXFM_CONTEXT ta[MAX_MIB_SIZE];
+ TXFM_CONTEXT tl[MAX_MIB_SIZE];
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+// Writes a snapshot taken by save_context() back into the macroblock state:
+// per-plane above/left entropy contexts, above/left partition contexts, and
+// the transform-size contexts (both the xd pointers and the entries they
+// point at).
+//
+//   mi_row, mi_col  block position in mi units; the left-side arrays are
+//                   indexed by mi_row & MAX_MIB_MASK.
+//   bsize           block size that determines how many entries are copied.
+//   num_planes      number of planes whose entropy contexts are restored.
+static void restore_context(MACROBLOCK *x,
+                            const RD_SEARCH_MACROBLOCK_CONTEXT *ctx, int mi_row,
+                            int mi_col, BLOCK_SIZE bsize,
+                            const int num_planes) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int cols_4x4 = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int rows_4x4 = block_size_high[bsize] >> tx_size_high_log2[0];
+  const int sb_row = mi_row & MAX_MIB_MASK;
+  const int mi_cols = mi_size_wide[bsize];
+  const int mi_rows = mi_size_high[bsize];
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    // Subsampled planes have proportionally fewer context entries.
+    const size_t above_bytes = (sizeof(ENTROPY_CONTEXT) * cols_4x4) >> ss_x;
+    const size_t left_bytes = (sizeof(ENTROPY_CONTEXT) * rows_4x4) >> ss_y;
+
+    memcpy(xd->above_context[plane] + (mi_col >> ss_x),
+           ctx->a + cols_4x4 * plane, above_bytes);
+    memcpy(xd->left_context[plane] + (sb_row >> ss_y),
+           ctx->l + rows_4x4 * plane, left_bytes);
+  }
+
+  memcpy(xd->above_seg_context + mi_col, ctx->sa,
+         sizeof(*xd->above_seg_context) * mi_cols);
+  memcpy(xd->left_seg_context + sb_row, ctx->sl,
+         sizeof(xd->left_seg_context[0]) * mi_rows);
+
+  // Re-point the transform contexts first, then refill what they point at.
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * mi_cols);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * mi_rows);
+}
+
+// Copies the context state covering one block into *ctx so that it can be
+// written back later with restore_context(): per-plane above/left entropy
+// contexts, above/left partition contexts, and the transform-size contexts
+// together with the xd pointers that address them.
+//
+//   mi_row, mi_col  block position in mi units; the left-side arrays are
+//                   indexed by mi_row & MAX_MIB_MASK.
+//   bsize           block size that determines how many entries are copied.
+//   num_planes      number of planes whose entropy contexts are saved.
+static void save_context(const MACROBLOCK *x, RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize,
+                         const int num_planes) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int cols_4x4 = block_size_wide[bsize] >> tx_size_wide_log2[0];
+  const int rows_4x4 = block_size_high[bsize] >> tx_size_high_log2[0];
+  const int sb_row = mi_row & MAX_MIB_MASK;
+  const int mi_cols = mi_size_wide[bsize];
+  const int mi_rows = mi_size_high[bsize];
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int ss_x = xd->plane[plane].subsampling_x;
+    const int ss_y = xd->plane[plane].subsampling_y;
+    // Subsampled planes have proportionally fewer context entries.
+    const size_t above_bytes = (sizeof(ENTROPY_CONTEXT) * cols_4x4) >> ss_x;
+    const size_t left_bytes = (sizeof(ENTROPY_CONTEXT) * rows_4x4) >> ss_y;
+
+    memcpy(ctx->a + cols_4x4 * plane,
+           xd->above_context[plane] + (mi_col >> ss_x), above_bytes);
+    memcpy(ctx->l + rows_4x4 * plane,
+           xd->left_context[plane] + (sb_row >> ss_y), left_bytes);
+  }
+
+  memcpy(ctx->sa, xd->above_seg_context + mi_col,
+         sizeof(*xd->above_seg_context) * mi_cols);
+  memcpy(ctx->sl, xd->left_seg_context + sb_row,
+         sizeof(xd->left_seg_context[0]) * mi_rows);
+
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * mi_cols);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * mi_rows);
+  // Remember where the transform contexts lived as well as their contents.
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+}
+
+// Encodes one block at (mi_row, mi_col) using the mode decision held in ctx.
+//
+// The block's state is installed with set_offsets() / update_state() and the
+// block is passed to encode_superblock().  When dry_run is zero the function
+// additionally selects the coefficient buffer beforehand and afterwards
+// advances x->cb_offset, fixes up the loop-filter deltas and comp_group_idx
+// stored in the mode info, and records the block with update_stats().
+//
+//   partition  partition type stored into the block's mode info.
+//   rate       forwarded unchanged to encode_superblock().
+static void encode_b(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                     ThreadData *td, TOKENEXTRA **tp, int mi_row, int mi_col,
+                     RUN_TYPE dry_run, BLOCK_SIZE bsize,
+                     PARTITION_TYPE partition,
+                     const PICK_MODE_CONTEXT *const ctx, int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  MACROBLOCK *const mb = &td->mb;
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+
+  set_offsets(cpi, tile_info, mb, mi_row, mi_col, bsize);
+  MB_MODE_INFO *const mode_info = mbd->mi[0];
+  mode_info->partition = partition;
+  update_state(cpi, tile_data, td, ctx, mi_row, mi_col, bsize, dry_run);
+
+  if (!dry_run) {
+    av1_set_coeff_buffer(cpi, mb, mi_row, mi_col);
+  }
+
+  encode_superblock(cpi, tile_data, td, tp, dry_run, mi_row, mi_col, bsize,
+                    rate);
+
+  // Everything below is bookkeeping for real (non-dry) runs only.
+  if (dry_run) return;
+
+  mb->cb_offset += block_size_wide[bsize] * block_size_high[bsize];
+
+  // A skipped block covering the whole superblock takes its loop-filter
+  // deltas from the running values kept in the macroblock descriptor.
+  if (bsize == cm->seq_params.sb_size && mode_info->skip == 1 &&
+      cm->delta_lf_present_flag) {
+    const int lf_count =
+        av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+    for (int lf_id = 0; lf_id < lf_count; ++lf_id) {
+      mode_info->delta_lf[lf_id] = mbd->delta_lf[lf_id];
+    }
+    mode_info->delta_lf_from_base = mbd->delta_lf_from_base;
+  }
+
+  // Group 0 when compound_idx is 0 or the compound type is plain averaging,
+  // group 1 otherwise.
+  if (has_second_ref(mode_info)) {
+    mode_info->comp_group_idx =
+        (mode_info->compound_idx != 0 &&
+         mode_info->interinter_comp.type != COMPOUND_AVERAGE);
+  }
+
+  update_stats(cm, tile_data, td, mi_row, mi_col);
+}
+
+// Encodes the square region of size bsize at (mi_row, mi_col) according to
+// the partitioning stored in pc_tree.
+//
+// Each leaf of the partition is handed to encode_b() with the matching
+// PICK_MODE_CONTEXT from pc_tree; PARTITION_SPLIT recurses into the four
+// quadrants.  Regions that start outside the frame are ignored.  On real
+// (non-dry) runs the partition symbol is also recorded: counted in
+// td->counts in CONFIG_ENTROPY_STATS builds and fed to the partition CDF when
+// tile_data->allow_update_cdf is set.  Finally the partition context is
+// updated via update_ext_partition_context().
+static void encode_sb(const AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int mi_row,
+ int mi_col, RUN_TYPE dry_run, BLOCK_SIZE bsize,
+ PC_TREE *pc_tree, int *rate) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // Half the block width in mi units; used as the offset of the second
+ // half / quadrant in both directions.
+ const int hbs = mi_size_wide[bsize] / 2;
+ const int is_partition_root = bsize >= BLOCK_8X8;
+ // -1 marks "no partition symbol for this block size".
+ const int ctx = is_partition_root
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : -1;
+ const PARTITION_TYPE partition = pc_tree->partitioning;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ int quarter_step = mi_size_wide[bsize] / 4;
+ int i;
+ // Size of one quadrant; used for the two small blocks of the A/B
+ // partition types.
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ if (!dry_run && ctx >= 0) {
+ const int has_rows = (mi_row + hbs) < cm->mi_rows;
+ const int has_cols = (mi_col + hbs) < cm->mi_cols;
+
+ // The partition symbol is only recorded when the block's lower and right
+ // halves both start inside the frame.
+ if (has_rows && has_cols) {
+#if CONFIG_ENTROPY_STATS
+ td->counts->partition[ctx][partition]++;
+#endif
+
+ if (tile_data->allow_update_cdf) {
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+ update_cdf(fc->partition_cdf[ctx], partition,
+ partition_cdf_length(bsize));
+ }
+ }
+ }
+
+ switch (partition) {
+ case PARTITION_NONE:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->none, rate);
+ break;
+ case PARTITION_VERT:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->vertical[0], rate);
+ // The right half is skipped when it starts outside the frame.
+ if (mi_col + hbs < cm->mi_cols) {
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->vertical[1], rate);
+ }
+ break;
+ case PARTITION_HORZ:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal[0], rate);
+ // The bottom half is skipped when it starts outside the frame.
+ if (mi_row + hbs < cm->mi_rows) {
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal[1], rate);
+ }
+ break;
+ case PARTITION_SPLIT:
+ // Recurse into the quadrants: top-left, top-right, bottom-left,
+ // bottom-right.
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, dry_run, subsize,
+ pc_tree->split[0], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ pc_tree->split[1], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ pc_tree->split[2], rate);
+ encode_sb(cpi, td, tile_data, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ subsize, pc_tree->split[3], rate);
+ break;
+
+ // Three-way partitions: two quadrant-sized blocks (bsize2) plus one
+ // block of size subsize.
+ case PARTITION_HORZ_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->horizontala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontala[2], rate);
+ break;
+ case PARTITION_HORZ_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->horizontalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, &pc_tree->horizontalb[2], rate);
+ break;
+ case PARTITION_VERT_A:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col, dry_run, bsize2,
+ partition, &pc_tree->verticala[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, subsize,
+ partition, &pc_tree->verticala[2], rate);
+
+ break;
+ case PARTITION_VERT_B:
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->verticalb[0], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row, mi_col + hbs, dry_run, bsize2,
+ partition, &pc_tree->verticalb[1], rate);
+ encode_b(cpi, tile_data, td, tp, mi_row + hbs, mi_col + hbs, dry_run,
+ bsize2, partition, &pc_tree->verticalb[2], rate);
+ break;
+ // Four-way strips: the first strip is always encoded, later ones stop at
+ // the frame edge.
+ case PARTITION_HORZ_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_row = mi_row + i * quarter_step;
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+ encode_b(cpi, tile_data, td, tp, this_mi_row, mi_col, dry_run, subsize,
+ partition, &pc_tree->horizontal4[i], rate);
+ }
+ break;
+ case PARTITION_VERT_4:
+ for (i = 0; i < 4; ++i) {
+ int this_mi_col = mi_col + i * quarter_step;
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+ encode_b(cpi, tile_data, td, tp, mi_row, this_mi_col, dry_run, subsize,
+ partition, &pc_tree->vertical4[i], rate);
+ }
+ break;
+ default: assert(0 && "Invalid partition type."); break;
+ }
+
+ update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+}
+
+// Finds a block size that fits into the mi rows/columns still available.
+//
+// When no rows or no columns are left, returns the smaller of bsize and
+// BLOCK_8X8 and leaves *bh / *bw untouched.  Otherwise bsize is lowered three
+// enum steps at a time until its mi dimensions fit into rows_left x cols_left
+// or it is no longer greater than zero; *bh and *bw receive the mi height and
+// width of the last size examined.
+static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize, int rows_left,
+                                      int cols_left, int *bh, int *bw) {
+  if (rows_left <= 0 || cols_left <= 0) return AOMMIN(bsize, BLOCK_8X8);
+
+  while (bsize > 0) {
+    *bh = mi_size_high[bsize];
+    *bw = mi_size_wide[bsize];
+    if (*bh <= rows_left && *bw <= cols_left) break;
+    bsize -= 3;
+  }
+  return bsize;
+}
+
+// Assigns block sizes inside a superblock that is only partly inside the
+// tile.
+//
+// Walks the superblock in mi units; at every visited position the mode-info
+// pointer in mib is set to the matching entry of mi and its sb_type is chosen
+// by find_partition_size() from what is left below and to the right.  The
+// step to the next position is whatever height/width find_partition_size()
+// last wrote, starting from bh_in / bw_in.
+static void set_partial_sb_partition(const AV1_COMMON *const cm,
+                                     MB_MODE_INFO *mi, int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining, BLOCK_SIZE bsize,
+                                     MB_MODE_INFO **mib) {
+  const int sb_mi_size = cm->seq_params.mib_size;
+  const int stride = cm->mi_stride;
+  int step_rows = bh_in;
+  int row = 0;
+
+  while (row < sb_mi_size) {
+    // The column step restarts from bw_in on every row of blocks.
+    int step_cols = bw_in;
+    int col = 0;
+
+    while (col < sb_mi_size) {
+      const int offset = row * stride + col;
+      MB_MODE_INFO *const entry = mi + offset;
+
+      mib[offset] = entry;
+      entry->sb_type =
+          find_partition_size(bsize, mi_rows_remaining - row,
+                              mi_cols_remaining - col, &step_rows, &step_cols);
+      col += step_cols;
+    }
+    row += step_rows;
+  }
+}
+
+// Gives every mode-info entry of the superblock at (mi_row, mi_col) the same
+// block size, bsize.
+//
+// A superblock that lies completely inside the tile is tiled uniformly with
+// bsize.  One that crosses the bottom or right tile edge is delegated to
+// set_partial_sb_partition(), which picks the largest size that still fits at
+// each position.
+static void set_fixed_partitioning(AV1_COMP *cpi, const TileInfo *const tile,
+                                   MB_MODE_INFO **mib, int mi_row, int mi_col,
+                                   BLOCK_SIZE bsize) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int rows_left = tile->mi_row_end - mi_row;
+  const int cols_left = tile->mi_col_end - mi_col;
+  MB_MODE_INFO *const sb_origin = cm->mi + mi_row * cm->mi_stride + mi_col;
+  const int step_rows = mi_size_high[bsize];
+  const int step_cols = mi_size_wide[bsize];
+
+  assert((rows_left > 0) && (cols_left > 0));
+
+  // Partial superblock: let the helper choose sizes that fit.
+  if (cols_left < cm->seq_params.mib_size ||
+      rows_left < cm->seq_params.mib_size) {
+    set_partial_sb_partition(cm, sb_origin, step_rows, step_cols, rows_left,
+                             cols_left, bsize, mib);
+    return;
+  }
+
+  // Whole superblock is "in image": apply the requested size everywhere.
+  for (int row = 0; row < cm->seq_params.mib_size; row += step_rows) {
+    for (int col = 0; col < cm->seq_params.mib_size; col += step_cols) {
+      const int offset = row * cm->mi_stride + col;
+      MB_MODE_INFO *const entry = sb_origin + offset;
+
+      mib[offset] = entry;
+      entry->sb_type = bsize;
+    }
+  }
+}
+
+static void rd_use_partition(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, MB_MODE_INFO **mib,
+ TOKENEXTRA **tp, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, int *rate, int64_t *dist,
+ int do_recon, PC_TREE *pc_tree) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int bs = mi_size_wide[bsize];
+ const int hbs = bs / 2;
+ int i;
+ const int pl = (bsize >= BLOCK_8X8)
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const PARTITION_TYPE partition =
+ (bsize >= BLOCK_8X8) ? get_partition(cm, mi_row, mi_col, bsize)
+ : PARTITION_NONE;
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, partition);
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ RD_STATS last_part_rdc, none_rdc, chosen_rdc;
+ BLOCK_SIZE sub_subsize = BLOCK_4X4;
+ int splits_below = 0;
+ BLOCK_SIZE bs_type = mib[0]->sb_type;
+ int do_partition_search = 1;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_invalid_rd_stats(&last_part_rdc);
+ av1_invalid_rd_stats(&none_rdc);
+ av1_invalid_rd_stats(&chosen_rdc);
+
+ pc_tree->partitioning = partition;
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+ }
+
+ if (do_partition_search &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ cpi->sf.adjust_partitioning_from_last_frame) {
+ // Check if any of the sub blocks are further split.
+ if (partition == PARTITION_SPLIT && subsize > BLOCK_8X8) {
+ sub_subsize = get_partition_subsize(subsize, PARTITION_SPLIT);
+ splits_below = 1;
+ for (i = 0; i < 4; i++) {
+ int jj = i >> 1, ii = i & 0x01;
+ MB_MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
+ if (this_mi && this_mi->sb_type >= sub_subsize) {
+ splits_below = 0;
+ }
+ }
+ }
+
+ // If partition is not none try none unless each of the 4 splits are split
+ // even further..
+ if (partition != PARTITION_NONE && !splits_below &&
+ mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+ pc_tree->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+
+ if (none_rdc.rate < INT_MAX) {
+ none_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+ none_rdc.rdcost = RDCOST(x->rdmult, none_rdc.rate, none_rdc.dist);
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ mib[0]->sb_type = bs_type;
+ pc_tree->partitioning = partition;
+ }
+ }
+ for (int b = 0; b < 2; ++b) {
+ pc_tree->horizontal[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 3; ++b) {
+ pc_tree->horizontala[b].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[b].skip_ref_frame_mask = 0;
+ pc_tree->verticala[b].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[b].skip_ref_frame_mask = 0;
+ }
+ for (int b = 0; b < 4; ++b) {
+ pc_tree->horizontal4[b].skip_ref_frame_mask = 0;
+ pc_tree->vertical4[b].skip_ref_frame_mask = 0;
+ }
+ switch (partition) {
+ case PARTITION_NONE:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_NONE, bsize, ctx_none, INT64_MAX);
+ break;
+ case PARTITION_HORZ:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+ INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_row + hbs < cm->mi_rows) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + hbs, mi_col, &tmp_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_VERT:
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0],
+ INT64_MAX);
+ if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 &&
+ mi_col + hbs < cm->mi_cols) {
+ RD_STATS tmp_rdc;
+ const PICK_MODE_CONTEXT *const ctx_v = &pc_tree->vertical[0];
+ av1_init_rd_stats(&tmp_rdc);
+ update_state(cpi, tile_data, td, ctx_v, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row,
+ mi_col, subsize, NULL);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + hbs, &tmp_rdc,
+ PARTITION_VERT, subsize,
+ &pc_tree->vertical[bsize > BLOCK_8X8], INT64_MAX);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ last_part_rdc.rdcost += tmp_rdc.rdcost;
+ }
+ break;
+ case PARTITION_SPLIT:
+ last_part_rdc.rate = 0;
+ last_part_rdc.dist = 0;
+ last_part_rdc.rdcost = 0;
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ int jj = i >> 1, ii = i & 0x01;
+ RD_STATS tmp_rdc;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ av1_init_rd_stats(&tmp_rdc);
+ rd_use_partition(cpi, td, tile_data,
+ mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
+ mi_row + y_idx, mi_col + x_idx, subsize, &tmp_rdc.rate,
+ &tmp_rdc.dist, i != 3, pc_tree->split[i]);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&last_part_rdc);
+ break;
+ }
+ last_part_rdc.rate += tmp_rdc.rate;
+ last_part_rdc.dist += tmp_rdc.dist;
+ }
+ break;
+ case PARTITION_VERT_A:
+ case PARTITION_VERT_B:
+ case PARTITION_HORZ_A:
+ case PARTITION_HORZ_B:
+ case PARTITION_HORZ_4:
+ case PARTITION_VERT_4:
+ assert(0 && "Cannot handle extended partition types");
+ default: assert(0); break;
+ }
+
+ if (last_part_rdc.rate < INT_MAX) {
+ last_part_rdc.rate += x->partition_cost[pl][partition];
+ last_part_rdc.rdcost =
+ RDCOST(x->rdmult, last_part_rdc.rate, last_part_rdc.dist);
+ }
+
+ if (do_partition_search && cpi->sf.adjust_partitioning_from_last_frame &&
+ cpi->sf.partition_search_type == SEARCH_PARTITION &&
+ partition != PARTITION_SPLIT && bsize > BLOCK_8X8 &&
+ (mi_row + bs < cm->mi_rows || mi_row + hbs == cm->mi_rows) &&
+ (mi_col + bs < cm->mi_cols || mi_col + hbs == cm->mi_cols)) {
+ BLOCK_SIZE split_subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ chosen_rdc.rate = 0;
+ chosen_rdc.dist = 0;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->partitioning = PARTITION_SPLIT;
+
+ // Split partition.
+ for (i = 0; i < 4; i++) {
+ int x_idx = (i & 1) * hbs;
+ int y_idx = (i >> 1) * hbs;
+ RD_STATS tmp_rdc;
+
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
+
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ pc_tree->split[i]->partitioning = PARTITION_NONE;
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + y_idx, mi_col + x_idx,
+ &tmp_rdc, PARTITION_SPLIT, split_subsize,
+ &pc_tree->split[i]->none, INT64_MAX);
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
+ av1_invalid_rd_stats(&chosen_rdc);
+ break;
+ }
+
+ chosen_rdc.rate += tmp_rdc.rate;
+ chosen_rdc.dist += tmp_rdc.dist;
+
+ if (i != 3)
+ encode_sb(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ OUTPUT_ENABLED, split_subsize, pc_tree->split[i], NULL);
+
+ chosen_rdc.rate += x->partition_cost[pl][PARTITION_NONE];
+ }
+ if (chosen_rdc.rate < INT_MAX) {
+ chosen_rdc.rate += x->partition_cost[pl][PARTITION_SPLIT];
+ chosen_rdc.rdcost = RDCOST(x->rdmult, chosen_rdc.rate, chosen_rdc.dist);
+ }
+ }
+
+ // If last_part is better set the partitioning to that.
+ if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
+ mib[0]->sb_type = bsize;
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = partition;
+ chosen_rdc = last_part_rdc;
+ }
+ // If none was better set the partitioning to that.
+ if (none_rdc.rdcost < chosen_rdc.rdcost) {
+ if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
+ chosen_rdc = none_rdc;
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+ // We must have chosen a partitioning and encoding or we'll fail later on.
+ // No other opportunities for success.
+ if (bsize == cm->seq_params.sb_size)
+ assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
+
+ if (do_recon) {
+ if (bsize == cm->seq_params.sb_size) {
+ // NOTE: To get estimate for rate due to the tokens, use:
+ // int rate_coeffs = 0;
+ // encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_COSTCOEFFS,
+ // bsize, pc_tree, &rate_coeffs);
+ x->cb_offset = 0;
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ *rate = chosen_rdc.rate;
+ *dist = chosen_rdc.dist;
+}
+
+/* clang-format off */
+// Lower-bound relaxation table, indexed by a BLOCK_SIZE value (the trailing
+// comment on each row names the block sizes that row's entries correspond
+// to). It maps an observed minimum block size to the smallest size the
+// partition search should still consider. In this part of the file it is read
+// by rd_auto_partition_range() (RELAXED_NEIGHBORING_MIN_MAX mode) and by
+// set_partition_range() (when the observed min and max coincide).
+static const BLOCK_SIZE min_partition_size[BLOCK_SIZES_ALL] = {
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_4X4, // 4x8, 8x4, 8x8
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 8x16, 16x8, 16x16
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 16x32, 32x16, 32x32
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 32x64, 64x32, 64x64
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 64x128, 128x64, 128x128
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
+};
+
+// Upper-bound relaxation table, indexed by a BLOCK_SIZE value (the trailing
+// comment on each row names the block sizes that row's entries correspond
+// to). It maps an observed maximum block size to the largest size the
+// partition search should still consider. In this part of the file it is read
+// by rd_auto_partition_range() (RELAXED_NEIGHBORING_MIN_MAX mode) and by
+// set_partition_range() (when the observed min and max coincide).
+static const BLOCK_SIZE max_partition_size[BLOCK_SIZES_ALL] = {
+ BLOCK_8X8, // 4x4
+ BLOCK_16X16, BLOCK_16X16, BLOCK_16X16, // 4x8, 8x4, 8x8
+ BLOCK_32X32, BLOCK_32X32, BLOCK_32X32, // 8x16, 16x8, 16x16
+ BLOCK_64X64, BLOCK_64X64, BLOCK_64X64, // 16x32, 32x16, 32x32
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 32x64, 64x32, 64x64
+ BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST, // 64x128, 128x64, 128x128
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 4x16, 16x4, 8x32
+ BLOCK_32X32, BLOCK_LARGEST, BLOCK_LARGEST, // 32x8, 16x64, 64x16
+};
+
+// Largest square block size that is less than or equal to the given block
+// size. Indexed by a BLOCK_SIZE value; the trailing comment on each row names
+// the block sizes that row's entries correspond to. In this part of the file
+// it is read by rd_auto_partition_range() to pick a square size that is
+// guaranteed to fit under the chosen maximum.
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES_ALL] = {
+ BLOCK_4X4, // 4x4
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x8, 8x4, 8x8
+ BLOCK_8X8, BLOCK_8X8, BLOCK_16X16, // 8x16, 16x8, 16x16
+ BLOCK_16X16, BLOCK_16X16, BLOCK_32X32, // 16x32, 32x16, 32x32
+ BLOCK_32X32, BLOCK_32X32, BLOCK_64X64, // 32x64, 64x32, 64x64
+ BLOCK_64X64, BLOCK_64X64, BLOCK_128X128, // 64x128, 128x64, 128x128
+ BLOCK_4X4, BLOCK_4X4, BLOCK_8X8, // 4x16, 16x4, 8x32
+ BLOCK_8X8, BLOCK_16X16, BLOCK_16X16, // 32x8, 16x64, 64x16
+};
+/* clang-format on */
+
+// Scans the mib_size x mib_size grid of mode-info pointers starting at mib
+// (rows are xd->mi_stride entries apart) and folds every sb_type found into
+// *min_block_size / *max_block_size. A NULL mode-info entry is counted as
+// BLOCK_4X4.
+//
+// The outputs are only ever tightened/widened, never reset: the caller must
+// initialize them, which lets repeated calls accumulate a range over several
+// superblocks. The scan area is currently fixed to one superblock; a size
+// parameter could be added if other region sizes are ever needed.
+static void get_sb_partition_size_range(const AV1_COMMON *const cm,
+                                        MACROBLOCKD *xd, MB_MODE_INFO **mib,
+                                        BLOCK_SIZE *min_block_size,
+                                        BLOCK_SIZE *max_block_size) {
+  const int sb_mi_size = cm->seq_params.mib_size;
+
+  for (int row = 0; row < sb_mi_size; ++row) {
+    // Offset of the first entry of this row within the mode-info grid.
+    const int row_base = row * xd->mi_stride;
+
+    for (int col = 0; col < sb_mi_size; ++col) {
+      const MB_MODE_INFO *const entry = mib[row_base + col];
+      const BLOCK_SIZE observed = entry ? entry->sb_type : BLOCK_4X4;
+
+      if (observed < *min_block_size) *min_block_size = observed;
+      if (observed > *max_block_size) *max_block_size = observed;
+    }
+  }
+}
+
+// Reports whether the mi-row span [mi_row, mi_row + mi_step) contains the top
+// or the bottom edge of the active image area. In most cases that is the
+// "real" frame edge (row 0 / cpi->common.mi_rows); in two-pass mode the edges
+// are moved inwards by the inactive zone from the first-pass stats, which
+// covers formatting bars embedded in the stream.
+// Returns 1 when an edge falls inside the span, 0 otherwise.
+static int active_h_edge(const AV1_COMP *cpi, int mi_row, int mi_step) {
+  const int span_end = mi_row + mi_step;
+  int top = 0;
+  int bottom = cpi->common.mi_rows;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    const int inactive_mi =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_rows * 2);
+
+    top += inactive_mi;
+    bottom -= inactive_mi;
+    // The bottom edge may not end up above the top edge.
+    if (bottom < top) bottom = top;
+  }
+
+  const int top_in_span = (top >= mi_row) && (top < span_end);
+  const int bottom_in_span = (bottom >= mi_row) && (bottom < span_end);
+  return (top_in_span || bottom_in_span) ? 1 : 0;
+}
+
+// Reports whether the mi-column span [mi_col, mi_col + mi_step) contains the
+// left or the right edge of the active image area. In most cases that is the
+// "real" frame edge (column 0 / cpi->common.mi_cols); in two-pass mode the
+// edges are moved inwards by the inactive zone from the first-pass stats,
+// which covers formatting bars embedded in the stream.
+// Returns 1 when an edge falls inside the span, 0 otherwise.
+static int active_v_edge(const AV1_COMP *cpi, int mi_col, int mi_step) {
+  const int span_end = mi_col + mi_step;
+  int left = 0;
+  int right = cpi->common.mi_cols;
+
+  // For two pass account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB column.
+    const int inactive_mi =
+        (int)(cpi->twopass.this_frame_stats.inactive_zone_cols * 2);
+
+    left += inactive_mi;
+    right -= inactive_mi;
+    // The right edge may not end up to the left of the left edge.
+    if (right < left) right = left;
+  }
+
+  const int left_in_span = (left >= mi_col) && (left < span_end);
+  const int right_in_span = (right >= mi_col) && (right < span_end);
+  return (left_in_span || right_in_span) ? 1 : 0;
+}
+
+// Reports whether the superblock whose top-left corner is (mi_row, mi_col)
+// touches an edge of the active image, horizontally or vertically. In most
+// cases this is the "real" edge unless there are formatting bars embedded in
+// the stream. Returns 1 if it does, 0 otherwise.
+static int active_edge_sb(const AV1_COMP *cpi, int mi_row, int mi_col) {
+  const int sb_mi_size = cpi->common.seq_params.mib_size;
+
+  if (active_h_edge(cpi, mi_row, sb_mi_size)) return 1;
+  return active_v_edge(cpi, mi_col, sb_mi_size) ? 1 : 0;
+}
+
+// Look at neighboring blocks and set a min and max partition size based on
+// what they chose.
+//
+// The range is accumulated from up to three superblock-sized regions: the
+// co-located region of the previous frame (non-key frames only), the
+// superblock to the left and the superblock above. The result is then
+// adjusted for the space left in the tile, for active image edges and for the
+// speed-feature limits, and finally capped at the sequence superblock size.
+//
+// Outputs: *min_block_size and *max_block_size.
+static void rd_auto_partition_range(AV1_COMP *cpi, const TileInfo *const tile,
+ MACROBLOCKD *const xd, int mi_row,
+ int mi_col, BLOCK_SIZE *min_block_size,
+ BLOCK_SIZE *max_block_size) {
+ AV1_COMMON *const cm = &cpi->common;
+ MB_MODE_INFO **mi = xd->mi;
+ // A neighbor only counts if it is flagged available AND its mode-info
+ // pointer is non-NULL.
+ const int left_in_image = xd->left_available && mi[-1];
+ const int above_in_image = xd->up_available && mi[-xd->mi_stride];
+ const int mi_rows_remaining = tile->mi_row_end - mi_row;
+ const int mi_cols_remaining = tile->mi_col_end - mi_col;
+ int bh, bw;
+ // Widest possible range; kept as-is when there is nothing to predict from
+ // (key frame with neither neighbor available).
+ BLOCK_SIZE min_size = BLOCK_4X4;
+ BLOCK_SIZE max_size = BLOCK_LARGEST;
+
+ // Trap case where we do not have a prediction.
+ if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
+ // Default "min to max" and "max to min"
+ min_size = BLOCK_LARGEST;
+ max_size = BLOCK_4X4;
+
+ // NOTE: each call to get_sb_partition_size_range() uses the previous
+ // passed in values for min and max as a starting point.
+ // Find the min and max partition used in previous frame at this location
+ if (cm->frame_type != KEY_FRAME) {
+ MB_MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
+ get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
+ }
+ // Find the min and max partition sizes used in the left superblock
+ if (left_in_image) {
+ MB_MODE_INFO **left_sb_mi = &mi[-cm->seq_params.mib_size];
+ get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
+ }
+ // Find the min and max partition sizes used in the above superblock.
+ if (above_in_image) {
+ MB_MODE_INFO **above_sb_mi =
+ &mi[-xd->mi_stride * cm->seq_params.mib_size];
+ get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
+ }
+
+ // Adjust observed min and max for "relaxed" auto partition case.
+ if (cpi->sf.auto_min_max_partition_size == RELAXED_NEIGHBORING_MIN_MAX) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+ }
+
+ // Check border cases where max and min from neighbors may not be legal.
+ // NOTE(review): find_partition_size() is defined elsewhere; bh and bw are
+ // only passed as outputs here and are not used afterwards.
+ max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
+ &bh, &bw);
+ min_size = AOMMIN(min_size, max_size);
+
+ // Test for blocks at the edge of the active image.
+ // This may be the actual edge of the image or where there are formatting
+ // bars.
+ if (active_edge_sb(cpi, mi_row, mi_col)) {
+ min_size = BLOCK_4X4;
+ } else {
+ min_size = AOMMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
+ }
+
+ // When min_size has reached use_square_partition_only_threshold, make sure
+ // at least one square partition is allowed by limiting min_size to the
+ // largest square size that fits under max_size.
+ if (min_size >= cpi->sf.use_square_partition_only_threshold) {
+ min_size = AOMMIN(min_size, next_square_size[max_size]);
+ }
+
+ // Never report a size above the sequence's superblock size.
+ *min_block_size = AOMMIN(min_size, cm->seq_params.sb_size);
+ *max_block_size = AOMMIN(max_size, cm->seq_params.sb_size);
+}
+
+// TODO(jingning) refactor functions setting partition search range
+//
+// Derives a block-size search range [*min_bs, *max_bs] for the block of size
+// bsize at (mi_row, mi_col) from the sb_type values found in:
+//   - the co-located region of the previous frame's mode-info grid,
+//   - the mode-info column immediately to the left (if xd->left_available),
+//   - the mode-info row immediately above (if xd->up_available).
+// A NULL mode-info entry is counted as bsize. When the observed minimum and
+// maximum coincide, the range is widened through min_partition_size[] and
+// max_partition_size[]. Both outputs are capped at the sequence superblock
+// size.
+static void set_partition_range(const AV1_COMMON *const cm,
+                                const MACROBLOCKD *const xd, int mi_row,
+                                int mi_col, BLOCK_SIZE bsize,
+                                BLOCK_SIZE *const min_bs,
+                                BLOCK_SIZE *const max_bs) {
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int idx_str = cm->mi_stride * mi_row + mi_col;
+  MB_MODE_INFO **const prev_mi_grid = cm->prev_mi_grid_visible;
+  BLOCK_SIZE min_size = cm->seq_params.sb_size;  // default values
+  BLOCK_SIZE max_size = BLOCK_4X4;
+
+  // Test the grid base pointer itself. The address of an element,
+  // &grid[idx_str], is non-NULL for every idx_str != 0 even when the grid is
+  // missing, so testing that address would not protect the reads below.
+  if (prev_mi_grid) {
+    MB_MODE_INFO **const prev_mi = &prev_mi_grid[idx_str];
+    for (int idy = 0; idy < mi_height; ++idy) {
+      for (int idx = 0; idx < mi_width; ++idx) {
+        const MB_MODE_INFO *const mi = prev_mi[idy * cm->mi_stride + idx];
+        const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+        min_size = AOMMIN(min_size, bs);
+        max_size = AOMMAX(max_size, bs);
+      }
+    }
+  }
+
+  // Column of mode-info entries directly to the left of the block.
+  if (xd->left_available) {
+    for (int idy = 0; idy < mi_height; ++idy) {
+      const MB_MODE_INFO *const mi = xd->mi[idy * cm->mi_stride - 1];
+      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+      min_size = AOMMIN(min_size, bs);
+      max_size = AOMMAX(max_size, bs);
+    }
+  }
+
+  // Row of mode-info entries directly above the block.
+  if (xd->up_available) {
+    for (int idx = 0; idx < mi_width; ++idx) {
+      const MB_MODE_INFO *const mi = xd->mi[idx - cm->mi_stride];
+      const BLOCK_SIZE bs = mi ? mi->sb_type : bsize;
+      min_size = AOMMIN(min_size, bs);
+      max_size = AOMMAX(max_size, bs);
+    }
+  }
+
+  // A degenerate (single-size) range is relaxed so the search still has some
+  // freedom around the observed size.
+  if (min_size == max_size) {
+    min_size = min_partition_size[min_size];
+    max_size = max_partition_size[max_size];
+  }
+
+  *min_bs = AOMMIN(min_size, cm->seq_params.sb_size);
+  *max_bs = AOMMIN(max_size, cm->seq_params.sb_size);
+}
+
+// Copies the macroblock's predicted motion vectors (x->pred_mv) into the
+// given mode-decision context; load_pred_mv() performs the reverse copy.
+static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
+  const size_t pred_mv_bytes = sizeof(x->pred_mv);
+
+  memcpy(ctx->pred_mv, x->pred_mv, pred_mv_bytes);
+}
+
+// Copies the predicted motion vectors held in the given mode-decision context
+// back into the macroblock (x->pred_mv); store_pred_mv() performs the reverse
+// copy.
+static INLINE void load_pred_mv(MACROBLOCK *x,
+                                const PICK_MODE_CONTEXT *const ctx) {
+  const size_t pred_mv_bytes = sizeof(x->pred_mv);
+
+  memcpy(x->pred_mv, ctx->pred_mv, pred_mv_bytes);
+}
+
+#if CONFIG_FP_MB_STATS
+// Threshold table with BLOCK_SIZES entries, compiled only when
+// CONFIG_FP_MB_STATS is enabled. It is declared without `static`, so it has
+// external linkage. It is not referenced in this part of the file.
+// NOTE(review): presumably indexed by BLOCK_SIZE and compared against a
+// qindex-derived value when deciding whether to skip -- confirm against the
+// code that reads it. The original TODO flags the last three entries as
+// unverified.
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] = {
+ 0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
+ // TODO(debargha): What are the correct numbers here?
+ 130, 130, 150
+};
+// Threshold table with BLOCK_SIZES entries, compiled only when
+// CONFIG_FP_MB_STATS is enabled. It is declared without `static`, so it has
+// external linkage. It is not referenced in this part of the file.
+// NOTE(review): presumably indexed by BLOCK_SIZE and compared against a
+// qindex-derived value when deciding whether to split -- confirm against the
+// code that reads it. The original TODO flags the last three entries as
+// unverified.
+const int qindex_split_threshold_lookup[BLOCK_SIZES] = {
+ 0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
+ // TODO(debargha): What are the correct numbers here?
+ 160, 160, 240
+};
+// Threshold table with BLOCK_SIZES entries, compiled only when
+// CONFIG_FP_MB_STATS is enabled. It is declared without `static`, so it has
+// external linkage. It is not referenced in this part of the file.
+// NOTE(review): presumably indexed by BLOCK_SIZE and giving a count of 16x16
+// blocks used as a complexity threshold -- confirm against the code that
+// reads it. The original TODO flags the last three entries as unverified.
+const int complexity_16x16_blocks_threshold[BLOCK_SIZES] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
+ // TODO(debargha): What are the correct numbers here?
+ 8, 8, 10
+};
+
+// Motion direction categories returned by get_motion_direction_fp().
+// The numeric values are significant: get_motion_inconsistency() scores two
+// directions by the absolute difference of their values, so the opposite
+// pairs LEFT/RIGHT and UP/DOWN are deliberately two apart. Do not renumber
+// without updating that function.
+typedef enum {
+ MV_ZERO = 0,
+ MV_LEFT = 1,
+ MV_UP = 2,
+ MV_RIGHT = 3,
+ MV_DOWN = 4,
+ MV_INVALID
+} MOTION_DIRECTION;
+
+// Decodes a first-pass per-block stats byte into a motion direction. The
+// mask bits are tested in a fixed priority order (zero, left, right, up);
+// when none of them is set the direction is reported as MV_DOWN.
+static INLINE MOTION_DIRECTION get_motion_direction_fp(uint8_t fp_byte) {
+  if (fp_byte & FPMB_MOTION_ZERO_MASK) return MV_ZERO;
+  if (fp_byte & FPMB_MOTION_LEFT_MASK) return MV_LEFT;
+  if (fp_byte & FPMB_MOTION_RIGHT_MASK) return MV_RIGHT;
+  if (fp_byte & FPMB_MOTION_UP_MASK) return MV_UP;
+  return MV_DOWN;
+}
+
+// Scores how much two motion directions disagree:
+//   0 - identical directions,
+//   2 - enum values exactly two apart (LEFT/RIGHT and UP/DOWN; with the
+//       current numbering this also covers ZERO/UP),
+//   1 - any other combination.
+static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
+                                           MOTION_DIRECTION that_mv) {
+  if (this_mv == that_mv) return 0;
+
+  if (abs(this_mv - that_mv) == 2) return 2;
+  return 1;
+}
+#endif
+
+// Try searching for an encoding for the given subblock. Returns zero if the
+// rdcost is already too high (to tell the caller not to bother searching for
+// encodings of further subblocks)
+//
+// The subblock's stats are written to *this_rdc and accumulated into
+// *sum_rdc; best_rdc->rdcost is the budget the accumulated cost is compared
+// against. prev_ctx is only read to reload the predicted motion vectors (when
+// adaptive_motion_search is on); this_ctx receives the mode decision for this
+// subblock. When is_last is zero and the budget has not been exceeded, the
+// subblock is also committed with a dry-run encode so that following
+// subblocks see its state.
+static int rd_try_subblock(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp, int is_last,
+ int mi_row, int mi_col, BLOCK_SIZE subsize,
+ RD_STATS *best_rdc, RD_STATS *sum_rdc,
+ RD_STATS *this_rdc, PARTITION_TYPE partition,
+ PICK_MODE_CONTEXT *prev_ctx,
+ PICK_MODE_CONTEXT *this_ctx) {
+// Function-local helper macros; both are #undef'd at the end of the function.
+// RTS_X_RATE_NOCOEF_ARG expands to nothing here.
+#define RTS_X_RATE_NOCOEF_ARG
+#define RTS_MAX_RDCOST best_rdc->rdcost
+
+ MACROBLOCK *const x = &td->mb;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, prev_ctx);
+
+ // Budget left for this subblock; INT64_MAX is passed through unchanged
+ // rather than being subtracted from.
+ const int64_t rdcost_remaining = best_rdc->rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc->rdcost - sum_rdc->rdcost);
+
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, this_rdc,
+ RTS_X_RATE_NOCOEF_ARG partition, subsize, this_ctx,
+ rdcost_remaining);
+
+ // A rate of INT_MAX marks the subblock result as invalid; poison the
+ // running total so the comparison below fails.
+ if (this_rdc->rate == INT_MAX) {
+ sum_rdc->rdcost = INT64_MAX;
+ } else {
+ sum_rdc->rate += this_rdc->rate;
+ sum_rdc->dist += this_rdc->dist;
+ sum_rdc->rdcost += this_rdc->rdcost;
+ }
+
+ if (sum_rdc->rdcost >= RTS_MAX_RDCOST) return 0;
+
+ // The last subblock does not need to be committed: nothing is searched
+ // after it within this partition.
+ if (!is_last) {
+ update_state(cpi, tile_data, td, this_ctx, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+ }
+
+ return 1;
+
+#undef RTS_X_RATE_NOCOEF_ARG
+#undef RTS_MAX_RDCOST
+}
+
+// Evaluates a three-way partition of the block at (mi_row, mi_col, bsize).
+// The three subblocks are given by (mi_rowN, mi_colN, subsizeN) for N = 0..2
+// and are searched in that order via rd_try_subblock(), using ctxs[N] as the
+// per-subblock mode context; ctx serves as the "previous" context for the
+// first subblock. The search stops as soon as the accumulated cost reaches
+// best_rdc->rdcost. If the complete partition is cheaper, *best_rdc is
+// overwritten with its stats and pc_tree->partitioning is set to partition;
+// otherwise both are left untouched.
+static void rd_test_partition3(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ PC_TREE *pc_tree, RD_STATS *best_rdc,
+ PICK_MODE_CONTEXT ctxs[3],
+ PICK_MODE_CONTEXT *ctx, int mi_row, int mi_col,
+ BLOCK_SIZE bsize, PARTITION_TYPE partition,
+ int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
+ int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
+ int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_STATS sum_rdc, this_rdc;
+// Function-local macro that expands to nothing here; #undef'd at the end.
+#define RTP_STX_TRY_ARGS
+ int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+ // Start the running total with the cost of signalling the partition type.
+ av1_init_rd_stats(&sum_rdc);
+ sum_rdc.rate = x->partition_cost[pl][partition];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row0, mi_col0, subsize0,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, ctx, &ctxs[0]))
+ return;
+
+ if (!rd_try_subblock(cpi, td, tile_data, tp, 0, mi_row1, mi_col1, subsize1,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, &ctxs[0], &ctxs[1]))
+ return;
+
+ // With the new layout of mixed partitions for PARTITION_HORZ_B and
+ // PARTITION_VERT_B, the last subblock might start past halfway through the
+ // main block, so we might signal it even though the subblock lies strictly
+ // outside the image. In that case, we won't spend any bits coding it and the
+ // difference (obviously) doesn't contribute to the error.
+ // NOTE(review): try_block2 is hard-wired to 1 here, so the third subblock is
+ // always searched.
+ const int try_block2 = 1;
+ if (try_block2 &&
+ !rd_try_subblock(cpi, td, tile_data, tp, 1, mi_row2, mi_col2, subsize2,
+ best_rdc, &sum_rdc, &this_rdc,
+ RTP_STX_TRY_ARGS partition, &ctxs[1], &ctxs[2]))
+ return;
+
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+
+ // Recompute the cost from the summed rate and distortion (the running
+ // rdcost was a sum of per-subblock costs) and compare once more.
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost >= best_rdc->rdcost) return;
+
+ *best_rdc = sum_rdc;
+ pc_tree->partitioning = partition;
+
+#undef RTP_STX_TRY_ARGS
+}
+
+// Resets a partition-tree node to its default search state (PARTITION_NONE,
+// full-plane search range, skip flag cleared) and then does the same for its
+// four split children, recursing while the node size is at least 8x8.
+static void reset_partition(PC_TREE *pc_tree, BLOCK_SIZE bsize) {
+  pc_tree->partitioning = PARTITION_NONE;
+  pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+  pc_tree->none.skip = 0;
+
+  // Nodes below 8x8 have no children to reset.
+  if (bsize < BLOCK_8X8) return;
+
+  const BLOCK_SIZE child_size = get_partition_subsize(bsize, PARTITION_SPLIT);
+  int child = 0;
+  while (child < 4) {
+    reset_partition(pc_tree->split[child], child_size);
+    ++child;
+  }
+}
+
+// Recursive rate-distortion search restricted to square partitions: for the
+// square block of size bsize at (mi_row, mi_col) it evaluates PARTITION_NONE
+// and, where allowed, PARTITION_SPLIT (recursing into the four quadrants).
+//
+// Inputs/outputs:
+//   best_rd  - cost budget; a negative value makes the function invalidate
+//              *rd_cost and return immediately.
+//   rd_cost  - receives the stats of the best choice found.
+//   pc_tree  - receives the chosen partitioning, the cb_search_range hint and
+//              the pc_tree_stats bookkeeping for this node.
+//   none_rd  - optional (may be NULL); receives the rdcost reported for the
+//              PARTITION_NONE attempt, or 0 if that attempt was not made.
+//
+// On a valid result, and unless this node is the last quadrant of its parent
+// (pc_tree->index == 3), non-superblock nodes are committed with a dry-run
+// encode_sb() so that following siblings see their state.
+static void rd_pick_sqr_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int64_t best_rd,
+ PC_TREE *pc_tree, int64_t *none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ int tmp_partition_cost[PARTITION_TYPES];
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc, pn_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const int *partition_cost =
+ pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+ const int num_planes = av1_num_planes(cm);
+
+ int64_t split_rd[4] = { 0, 0, 0, 0 };
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int has_rows = (mi_row + mi_step < cm->mi_rows);
+ const int has_cols = (mi_col + mi_step < cm->mi_cols);
+
+ if (none_rd) *none_rd = 0;
+
+ // PARTITION_NONE is only tried when the block's midpoint lies inside the
+ // frame in both directions.
+ int partition_none_allowed = has_rows && has_cols;
+
+ // tp_orig is otherwise only read by the assert at the end of the function
+ // and split_rd is only written; presumably these casts keep builds free of
+ // unused-variable warnings.
+ (void)*tp_orig;
+ (void)split_rd;
+
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+ pc_tree->pc_tree_stats.valid = 1;
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c)
+ if (!(has_rows && has_cols)) {
+ assert(bsize_at_least_8x8 && pl >= 0);
+ const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+ for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+ if (has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+ } else if (has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+ } else {
+ // At the bottom right, we always split
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block). Setting it to a fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_init_rd_stats(&sum_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ // The caller's budget acts as the cost to beat.
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ // Snapshot the entropy/transform contexts so each candidate partition starts
+ // from the same state.
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pc_tree->partitioning = PARTITION_NONE;
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ // Reserve the partition signalling cost out of the budget handed to the
+ // mode search.
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+
+ pc_tree->pc_tree_stats.rdcost = ctx_none->rdcost;
+ pc_tree->pc_tree_stats.skip = ctx_none->skip;
+
+ // Reported before the partition cost is added below.
+ if (none_rd) *none_rd = this_rdc.rdcost;
+ if (this_rdc.rate != INT_MAX) {
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += pt_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ pc_tree->cb_search_range = SEARCH_FULL_PLANE;
+
+ // If all y, u, v transform blocks in this partition are skippable, and
+ // the dist & rate are within the thresholds, the partition search is
+ // terminated for current branch of the partition search tree.
+ // The dist & rate thresholds are set to 0 at speed 0 to disable the
+ // early termination at that speed.
+ if (!x->e_mbd.lossless[xd->mi[0]->segment_id] &&
+ (ctx_none->skippable && best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr)) {
+ do_square_split = 0;
+ }
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // store estimated motion vector
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ // Remember the best result before the split search (the PARTITION_NONE
+ // result, or the caller's budget if that attempt did not improve on it).
+ int64_t temp_best_rdcost = best_rdc.rdcost;
+ pn_rdc = best_rdc;
+
+ // PARTITION_SPLIT
+ if (do_square_split) {
+ int reached_last_index = 0;
+ subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int idx;
+
+ // Search the four quadrants in raster order; stop early once the running
+ // cost can no longer beat the best cost so far.
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < temp_best_rdcost; ++idx) {
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ // Quadrants that start outside the frame are skipped.
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &split_rd[idx];
+ // TODO(Cherma) : Account for partition cost while passing best rd to
+ // rd_pick_sqr_partition()
+ rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row + y_idx,
+ mi_col + x_idx, subsize, &this_rdc,
+ temp_best_rdcost - sum_rdc.rdcost,
+ pc_tree->split[idx], p_split_rd);
+
+ pc_tree->pc_tree_stats.sub_block_rdcost[idx] = this_rdc.rdcost;
+ pc_tree->pc_tree_stats.sub_block_skip[idx] =
+ pc_tree->split[idx]->none.skip;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+ // idx only reaches 4 if the loop was not left through `break` or through
+ // the cost condition.
+ reached_last_index = (idx == 4);
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rate += partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ }
+
+ // Derive the cb_search_range hint stored on this node from how the
+ // split and none results compare.
+ int has_split = 0;
+ if (pc_tree->partitioning == PARTITION_SPLIT) {
+ for (int cb_idx = 0; cb_idx <= AOMMIN(idx, 3); ++cb_idx) {
+ if (pc_tree->split[cb_idx]->partitioning == PARTITION_SPLIT)
+ ++has_split;
+ }
+
+ if (has_split >= 3 || sum_rdc.rdcost < (pn_rdc.rdcost >> 1)) {
+ pc_tree->cb_search_range = SPLIT_PLANE;
+ }
+ }
+
+ if (pc_tree->partitioning == PARTITION_NONE) {
+ pc_tree->cb_search_range = SEARCH_SAME_PLANE;
+ if (pn_rdc.dist <= sum_rdc.dist)
+ pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+ }
+
+ if (pn_rdc.rate == INT_MAX) pc_tree->cb_search_range = NONE_PARTITION_PLANE;
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } // if (do_square_split)
+
+ pc_tree->pc_tree_stats.split = pc_tree->partitioning == PARTITION_SPLIT;
+ if (do_square_split) {
+ for (int i = 0; i < 4; ++i) {
+ pc_tree->pc_tree_stats.sub_block_split[i] =
+ pc_tree->split[i]->partitioning == PARTITION_SPLIT;
+ }
+ }
+
+ // TODO(jbb): This code added so that we avoid static analysis
+ // warning related to the fact that best_rd isn't used after this
+ // point. This code should be refactored so that the duplicate
+ // checks occur in some sub function and thus are used...
+ (void)best_rd;
+ *rd_cost = best_rdc;
+
+ // Commit the chosen partitioning unless this node is the last quadrant of
+ // its parent. At superblock level only the saved context is restored; the
+ // dry-run encode is done for smaller nodes only.
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->seq_params.sb_size) {
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == cm->seq_params.sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ // The search must not have advanced the token pointer.
+ assert(tp_orig == *tp);
+ }
+}
+
+// Number of input features used by the two_pass_*_partition_weights_* tables;
+// each table holds FEATURE_SIZE + 1 floats.
+#define FEATURE_SIZE 19
+// Weight table with FEATURE_SIZE + 1 entries. It is not referenced in this
+// part of the file.
+// NOTE(review): the name suggests weights of a "split" partition decision for
+// 128x128 blocks in the two-pass partition search, with the extra last entry
+// presumably a bias/offset term -- confirm against the code that reads it.
+static const float two_pass_split_partition_weights_128[FEATURE_SIZE + 1] = {
+ 2.683936f, -0.193620f, -4.106470f, -0.141320f, -0.282289f,
+ 0.125296f, -1.134961f, 0.862757f, -0.418799f, -0.637666f,
+ 0.016232f, 0.345013f, 0.018823f, -0.393394f, -1.130700f,
+ 0.695357f, 0.112569f, -0.341975f, -0.513882f, 5.7488966f,
+};
+
+// Weight table with FEATURE_SIZE + 1 entries. It is not referenced in this
+// part of the file.
+// NOTE(review): the name suggests weights of a "split" partition decision for
+// 64x64 blocks in the two-pass partition search, with the extra last entry
+// presumably a bias/offset term -- confirm against the code that reads it.
+static const float two_pass_split_partition_weights_64[FEATURE_SIZE + 1] = {
+ 2.990993f, 0.423273f, -0.926544f, 0.454646f, -0.292698f,
+ -1.311632f, -0.284432f, 0.717141f, -0.419257f, -0.574760f,
+ -0.674444f, 0.669047f, -0.374255f, 0.380624f, -0.804036f,
+ 0.264021f, 0.004163f, 1.896802f, 0.924287f, 0.13490619f,
+};
+
+static const float two_pass_split_partition_weights_32[FEATURE_SIZE + 1] = {
+ 2.795181f, -0.136943f, -0.924842f, 0.405330f, -0.463505f,
+ -0.584076f, -0.831472f, 0.382985f, -0.597544f, -0.138915f,
+ -1.354350f, 0.466035f, -0.553961f, 0.213202f, -1.166429f,
+ 0.010776f, -0.096236f, 2.335084f, 1.699857f, -0.58178353f,
+};
+
+static const float two_pass_split_partition_weights_16[FEATURE_SIZE + 1] = {
+ 1.987888f, -0.431100f, -1.687703f, 0.262602f, -0.425298f,
+ -0.463870f, -1.493457f, 0.470917f, -0.528457f, -0.087700f,
+ -1.815092f, 0.152883f, -0.337908f, 0.093679f, -1.548267f,
+ -0.042387f, -0.000861f, 2.556746f, 1.619192f, 0.03643292f,
+};
+
+static const float two_pass_split_partition_weights_8[FEATURE_SIZE + 1] = {
+ 2.188344f, -0.817528f, -2.119219f, 0.000000f, -0.348167f,
+ -0.658074f, -1.960362f, 0.000000f, -0.403080f, 0.282699f,
+ -2.061088f, 0.000000f, -0.431919f, -0.127960f, -1.099550f,
+ 0.000000f, 0.121622f, 2.017455f, 2.058228f, -0.15475988f,
+};
+
+static const float two_pass_none_partition_weights_128[FEATURE_SIZE + 1] = {
+ -1.006689f, 0.777908f, 4.461072f, -0.395782f, -0.014610f,
+ -0.853863f, 0.729997f, -0.420477f, 0.282429f, -1.194595f,
+ 3.181220f, -0.511416f, 0.117084f, -1.149348f, 1.507990f,
+ -0.477212f, 0.202963f, -1.469581f, 0.624461f, -0.89081228f,
+};
+
+static const float two_pass_none_partition_weights_64[FEATURE_SIZE + 1] = {
+ -1.241117f, 0.844878f, 5.638803f, -0.489780f, -0.108796f,
+ -4.576821f, 1.540624f, -0.477519f, 0.227791f, -1.443968f,
+ 1.586911f, -0.505125f, 0.140764f, -0.464194f, 1.466658f,
+ -0.641166f, 0.195412f, 1.427905f, 2.080007f, -1.98272777f,
+};
+
+static const float two_pass_none_partition_weights_32[FEATURE_SIZE + 1] = {
+ -2.130825f, 0.476023f, 5.907343f, -0.516002f, -0.097471f,
+ -2.662754f, 0.614858f, -0.576728f, 0.085261f, -0.031901f,
+ 0.727842f, -0.600034f, 0.079326f, 0.324328f, 0.504502f,
+ -0.547105f, -0.037670f, 0.304995f, 0.369018f, -2.66299987f,
+};
+
+static const float two_pass_none_partition_weights_16[FEATURE_SIZE + 1] = {
+ -1.626410f, 0.872047f, 5.414965f, -0.554781f, -0.084514f,
+ -3.020550f, 0.467632f, -0.382280f, 0.199568f, 0.426220f,
+ 0.829426f, -0.467100f, 0.153098f, 0.662994f, 0.327545f,
+ -0.560106f, -0.141610f, 0.403372f, 0.523991f, -3.02891231f,
+};
+
+static const float two_pass_none_partition_weights_8[FEATURE_SIZE + 1] = {
+ -1.463349f, 0.375376f, 4.751430f, 0.000000f, -0.184451f,
+ -1.655447f, 0.443214f, 0.000000f, 0.127961f, 0.152435f,
+ 0.083288f, 0.000000f, 0.143105f, 0.438012f, 0.073238f,
+ 0.000000f, -0.278137f, 0.186134f, 0.073737f, -1.6494962f,
+};
+
+// Evaluates two linear models on the stored partition statistics of a block.
+//   *split_score: confidence (x100, truncated) of picking the split partition.
+//   *none_score:  confidence (x100, truncated) of picking the none partition.
+// Returns 1 when both scores were written. Returns 0, leaving the outputs
+// untouched, when the statistics are not valid or no model exists for bsize.
+static int ml_prune_2pass_split_partition(const PC_TREE_STATS *pc_tree_stats,
+                                          BLOCK_SIZE bsize, int *split_score,
+                                          int *none_score) {
+  if (!pc_tree_stats->valid) return 0;
+
+  // Pick the weight tables that belong to this block size.
+  const float *w_split = NULL;
+  const float *w_none = NULL;
+  if (bsize == BLOCK_128X128) {
+    w_split = two_pass_split_partition_weights_128;
+    w_none = two_pass_none_partition_weights_128;
+  } else if (bsize == BLOCK_64X64) {
+    w_split = two_pass_split_partition_weights_64;
+    w_none = two_pass_none_partition_weights_64;
+  } else if (bsize == BLOCK_32X32) {
+    w_split = two_pass_split_partition_weights_32;
+    w_none = two_pass_none_partition_weights_32;
+  } else if (bsize == BLOCK_16X16) {
+    w_split = two_pass_split_partition_weights_16;
+    w_none = two_pass_none_partition_weights_16;
+  } else if (bsize == BLOCK_8X8) {
+    w_split = two_pass_split_partition_weights_8;
+    w_none = two_pass_none_partition_weights_8;
+  } else if (bsize != BLOCK_4X4) {
+    assert(0 && "Unexpected bsize.");
+  }
+  // BLOCK_4X4 has no tables; neither does an unexpected size when asserts
+  // are compiled out.
+  if (w_split == NULL || w_none == NULL) return 0;
+
+  aom_clear_system_state();
+
+  float features[FEATURE_SIZE];
+  float *f = features;
+
+  // Three features describing the whole block.
+  const int whole_rd = (int)AOMMIN(INT_MAX, pc_tree_stats->rdcost);
+  const int whole_rd_ok = whole_rd > 0 && whole_rd < 1000000000;
+  *f++ = (float)pc_tree_stats->split;
+  *f++ = (float)pc_tree_stats->skip;
+  *f++ = (float)whole_rd_ok;
+
+  // Four features for each of the four sub-blocks.
+  for (int blk = 0; blk < 4; ++blk) {
+    const int sub_rd =
+        (int)AOMMIN(INT_MAX, pc_tree_stats->sub_block_rdcost[blk]);
+    const int sub_rd_ok = sub_rd > 0 && sub_rd < 1000000000;
+    // The sub-block/whole-block RD ratio is only used when both costs are
+    // usable and the sub-block is cheaper; otherwise it is pinned to 1.
+    const int use_ratio = whole_rd_ok && sub_rd_ok && sub_rd < whole_rd;
+    *f++ = (float)pc_tree_stats->sub_block_split[blk];
+    *f++ = (float)pc_tree_stats->sub_block_skip[blk];
+    *f++ = (float)sub_rd_ok;
+    *f++ = use_ratio ? (float)sub_rd / (float)whole_rd : 1.0f;
+  }
+  assert(f == features + FEATURE_SIZE);
+
+  // Each table stores FEATURE_SIZE weights followed by a constant offset.
+  float split_acc = w_split[FEATURE_SIZE];
+  for (int k = 0; k < FEATURE_SIZE; ++k) split_acc += features[k] * w_split[k];
+
+  float none_acc = w_none[FEATURE_SIZE];
+  for (int k = 0; k < FEATURE_SIZE; ++k) none_acc += features[k] * w_none[k];
+
+  *split_score = (int)(split_acc * 100);
+  *none_score = (int)(none_acc * 100);
+  return 1;
+}
+#undef FEATURE_SIZE
+
+// Uses a neural network to decide whether the horizontal and/or vertical
+// rectangular partition of a square block can be pruned.
+// Inputs to the model are 5 RD-cost ratios (none and the four split
+// sub-blocks, each relative to best_rd) and 4 source-variance ratios (each
+// quarter relative to the whole block).
+// *dst_prune_horz / *dst_prune_vert are set to 1 when the corresponding
+// probability is at or below the per-size threshold; they are never cleared.
+static void ml_prune_rect_partition(const AV1_COMP *const cpi,
+                                    const MACROBLOCK *const x, BLOCK_SIZE bsize,
+                                    int64_t best_rd, int64_t none_rd,
+                                    int64_t *split_rd,
+                                    int *const dst_prune_horz,
+                                    int *const dst_prune_vert) {
+  if (bsize < BLOCK_8X8) return;
+  if (best_rd >= 1000000000) return;
+  if (best_rd < 1) best_rd = 1;  // keep the divisor below strictly positive
+
+  // Model and pruning threshold for this block size.
+  const NN_CONFIG *nn_config = NULL;
+  float cur_thresh = 0.0f;
+  switch (bsize) {
+    case BLOCK_8X8:
+      cur_thresh = 0.01f;
+      nn_config = &av1_rect_partition_nnconfig_8;
+      break;
+    case BLOCK_16X16:
+      cur_thresh = 0.01f;
+      nn_config = &av1_rect_partition_nnconfig_16;
+      break;
+    case BLOCK_32X32:
+      cur_thresh = 0.004f;
+      nn_config = &av1_rect_partition_nnconfig_32;
+      break;
+    case BLOCK_64X64:
+      cur_thresh = 0.002f;
+      nn_config = &av1_rect_partition_nnconfig_64;
+      break;
+    case BLOCK_128X128:
+      cur_thresh = 0.002f;
+      nn_config = &av1_rect_partition_nnconfig_128;
+      break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (nn_config == NULL) return;
+  aom_clear_system_state();
+
+  // 1. Compute input features
+  float features[9];
+
+  // RD cost ratios; a cost outside (0, 1e9) leaves the ratio at 1.
+  const float rd_norm = (float)best_rd;
+  features[0] = (none_rd > 0 && none_rd < 1000000000)
+                    ? (float)none_rd / rd_norm
+                    : 1.0f;
+  for (int q = 0; q < 4; ++q) {
+    const int64_t rd = split_rd[q];
+    features[1 + q] =
+        (rd > 0 && rd < 1000000000) ? (float)rd / rd_norm : 1.0f;
+  }
+
+  // Variance ratios
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  int whole_var =
+      is_hbd ? av1_high_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize,
+                                                  xd->bd)
+             : av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+  if (whole_var < 1) whole_var = 1;
+
+  const BLOCK_SIZE quad_bsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+  const int half = block_size_wide[bsize] / 2;
+  struct buf_2d quad_src;
+  quad_src.stride = x->plane[0].src.stride;
+  for (int q = 0; q < 4; ++q) {
+    // Quarter q: bit 0 selects the right half, bit 1 the bottom half.
+    const int col_off = (q & 1) ? half : 0;
+    const int row_off = (q & 2) ? half : 0;
+    quad_src.buf = x->plane[0].src.buf + col_off + row_off * quad_src.stride;
+    const int quad_var =
+        is_hbd ? av1_high_get_sby_perpixel_variance(cpi, &quad_src, quad_bsize,
+                                                    xd->bd)
+               : av1_get_sby_perpixel_variance(cpi, &quad_src, quad_bsize);
+    features[5 + q] = (float)quad_var / (float)whole_var;
+  }
+
+  // 2. Do the prediction and prune 0-2 partitions based on their probabilities
+  float raw_scores[3] = { 0.0f };
+  float probs[3] = { 0.0f };
+  av1_nn_predict(features, nn_config, raw_scores);
+  av1_nn_softmax(raw_scores, probs, 3);
+
+  // probs[0] is the probability that both rectangular partitions are worse
+  // than the current best_rd; probs[1] / probs[2] drive the horz / vert
+  // pruning decisions below.
+  if (probs[1] <= cur_thresh) *dst_prune_horz = 1;
+  if (probs[2] <= cur_thresh) *dst_prune_vert = 1;
+}
+
+// Use a ML model to predict if horz_a, horz_b, vert_a, and vert_b should be
+// considered.
+// The model sees part_ctx, var_ctx and eight RD ratios (2 horz, 2 vert and
+// 4 split sub-blocks, each relative to best_rd). It produces 16 scores, one
+// per combination of the four AB partitions: bit 0 of the score index stands
+// for horz_a, bit 1 for horz_b, bit 2 for vert_a and bit 3 for vert_b.
+// When no model applies (bsize below 16x16, or best_rd >= 1e9) the four
+// *_allowed outputs are left untouched.
+static void ml_prune_ab_partition(BLOCK_SIZE bsize, int part_ctx, int var_ctx,
+                                  int64_t best_rd, int64_t horz_rd[2],
+                                  int64_t vert_rd[2], int64_t split_rd[4],
+                                  int *const horza_partition_allowed,
+                                  int *const horzb_partition_allowed,
+                                  int *const verta_partition_allowed,
+                                  int *const vertb_partition_allowed) {
+  if (bsize < BLOCK_8X8 || best_rd >= 1000000000) return;
+
+  const NN_CONFIG *nn_config = NULL;
+  if (bsize == BLOCK_16X16) {
+    nn_config = &av1_ab_partition_nnconfig_16;
+  } else if (bsize == BLOCK_32X32) {
+    nn_config = &av1_ab_partition_nnconfig_32;
+  } else if (bsize == BLOCK_64X64) {
+    nn_config = &av1_ab_partition_nnconfig_64;
+  } else if (bsize == BLOCK_128X128) {
+    nn_config = &av1_ab_partition_nnconfig_128;
+  } else if (bsize != BLOCK_8X8) {
+    assert(0 && "Unexpected bsize.");
+  }
+  // BLOCK_8X8 has no model.
+  if (nn_config == NULL) return;
+
+  aom_clear_system_state();
+
+  // Collect the sub-block RD costs in the order horz[2], vert[2], split[4].
+  // A cost outside (0, 1e9) is recorded as 0.
+  int sub_rd[8] = { 0 };
+  const int64_t *const rd_groups[3] = { horz_rd, vert_rd, split_rd };
+  const int rd_group_len[3] = { 2, 2, 4 };
+  int slot = 0;
+  for (int g = 0; g < 3; ++g) {
+    for (int k = 0; k < rd_group_len[g]; ++k, ++slot) {
+      const int64_t rd = rd_groups[g][k];
+      if (rd > 0 && rd < 1000000000) sub_rd[slot] = (int)rd;
+    }
+  }
+
+  // Generate features.
+  float features[10];
+  const int whole_rd = (int)AOMMIN(INT_MAX, best_rd);
+  features[0] = (float)part_ctx;
+  features[1] = (float)var_ctx;
+  for (int k = 0; k < 8; ++k) {
+    // Ratio between the sub-block RD and the whole-block RD; 1 when the
+    // sub-block cost is missing or not below the whole-block cost.
+    features[2 + k] = (sub_rd[k] > 0 && sub_rd[k] < whole_rd)
+                          ? (float)sub_rd[k] / (float)whole_rd
+                          : 1.0f;
+  }
+
+  // Calculate scores using the NN model.
+  float score[16] = { 0.0f };
+  av1_nn_predict(features, nn_config, score);
+  int int_score[16];
+  int max_score = -1000;
+  for (int k = 0; k < 16; ++k) {
+    int_score[k] = (int)(100 * score[k]);
+    if (int_score[k] > max_score) max_score = int_score[k];
+  }
+
+  // Make decisions based on the model scores: every combination whose score
+  // is within a size-dependent margin of the best one keeps its partitions.
+  int thresh = max_score;
+  if (bsize == BLOCK_16X16) {
+    thresh -= 150;
+  } else if (bsize == BLOCK_32X32) {
+    thresh -= 100;
+  }
+
+  int allow_horza = 0;
+  int allow_horzb = 0;
+  int allow_verta = 0;
+  int allow_vertb = 0;
+  for (int k = 0; k < 16; ++k) {
+    if (int_score[k] < thresh) continue;
+    if (k & 1) allow_horza = 1;
+    if (k & 2) allow_horzb = 1;
+    if (k & 4) allow_verta = 1;
+    if (k & 8) allow_vertb = 1;
+  }
+  *horza_partition_allowed = allow_horza;
+  *horzb_partition_allowed = allow_horzb;
+  *verta_partition_allowed = allow_verta;
+  *vertb_partition_allowed = allow_vertb;
+}
+
+#define FEATURES 18
+#define LABELS 4
+// Use a ML model to predict if horz4 and vert4 should be considered.
+// Features: part_ctx, the bit count of pb_source_variance, eight RD ratios
+// (2 horz, 2 vert, 4 split sub-blocks relative to best_rd) and eight source
+// variance ratios (four 4:1 strips, then four 1:4 strips, relative to
+// pb_source_variance). The model yields LABELS scores; bit 0 of the score
+// index stands for horz4 and bit 1 for vert4.
+// When best_rd >= 1e9 or no model exists for bsize, the two *_allowed
+// outputs are left untouched.
+static void ml_prune_4_partition(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                                 BLOCK_SIZE bsize, int part_ctx,
+                                 int64_t best_rd, int64_t horz_rd[2],
+                                 int64_t vert_rd[2], int64_t split_rd[4],
+                                 int *const partition_horz4_allowed,
+                                 int *const partition_vert4_allowed,
+                                 unsigned int pb_source_variance, int mi_row,
+                                 int mi_col) {
+  if (best_rd >= 1000000000) return;
+
+  const NN_CONFIG *nn_config = NULL;
+  if (bsize == BLOCK_16X16) {
+    nn_config = &av1_4_partition_nnconfig_16;
+  } else if (bsize == BLOCK_32X32) {
+    nn_config = &av1_4_partition_nnconfig_32;
+  } else if (bsize == BLOCK_64X64) {
+    nn_config = &av1_4_partition_nnconfig_64;
+  } else {
+    assert(0 && "Unexpected bsize.");
+  }
+  if (nn_config == NULL) return;
+
+  aom_clear_system_state();
+
+  // Generate features.
+  float features[FEATURES];
+  int n = 0;
+  features[n++] = (float)part_ctx;
+  features[n++] = (float)get_unsigned_bits(pb_source_variance);
+
+  // Sub-block RD costs in the order horz[2], vert[2], split[4]; a cost
+  // outside (0, 1e9) is recorded as 0.
+  const int whole_rd = (int)AOMMIN(INT_MAX, best_rd);
+  int sub_rd[8] = { 0 };
+  const int64_t *const rd_groups[3] = { horz_rd, vert_rd, split_rd };
+  const int rd_group_len[3] = { 2, 2, 4 };
+  int slot = 0;
+  for (int g = 0; g < 3; ++g) {
+    for (int k = 0; k < rd_group_len[g]; ++k, ++slot) {
+      const int64_t rd = rd_groups[g][k];
+      if (rd > 0 && rd < 1000000000) sub_rd[slot] = (int)rd;
+    }
+  }
+  for (int k = 0; k < 8; ++k) {
+    // Ratio between the sub-block RD and the whole-block RD; 1 when the
+    // sub-block cost is missing or not below the whole-block cost.
+    features[n++] = (sub_rd[k] > 0 && sub_rd[k] < whole_rd)
+                        ? (float)sub_rd[k] / (float)whole_rd
+                        : 1.0f;
+  }
+
+  // Get variance of the 1:4 and 4:1 sub-blocks.
+  unsigned int horz_4_source_var[4] = { 0 };
+  unsigned int vert_4_source_var[4] = { 0 };
+  {
+    const BLOCK_SIZE horz_4_bs = get_partition_subsize(bsize, PARTITION_HORZ_4);
+    const BLOCK_SIZE vert_4_bs = get_partition_subsize(bsize, PARTITION_VERT_4);
+    av1_setup_src_planes(x, cpi->source, mi_row, mi_col,
+                         av1_num_planes(&cpi->common));
+    const int src_stride = x->plane[0].src.stride;
+    const uint8_t *const src = x->plane[0].src.buf;
+    const MACROBLOCKD *const xd = &x->e_mbd;
+
+    // The variance is measured against a constant reference buffer; which
+    // one depends on whether the frame is high bitdepth and on its depth.
+    // The frame flags are read only after the source planes were set up.
+    const uint8_t *flat_ref = AV1_VAR_OFFS;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (xd->bd == 10) {
+        flat_ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_10);
+      } else if (xd->bd == 12) {
+        flat_ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_12);
+      } else {
+        flat_ref = CONVERT_TO_BYTEPTR(AV1_HIGH_VAR_OFFS_8);
+      }
+    }
+
+    const int horz_step = block_size_high[horz_4_bs] * src_stride;
+    const int vert_step = block_size_wide[vert_4_bs];
+    for (int s = 0; s < 4; ++s) {
+      unsigned int sse;
+      // Strip s of the horizontal split sits s strip-heights down; strip s
+      // of the vertical split sits s strip-widths to the right.
+      const unsigned int horz_var = cpi->fn_ptr[horz_4_bs].vf(
+          src + s * horz_step, src_stride, flat_ref, 0, &sse);
+      const unsigned int vert_var = cpi->fn_ptr[vert_4_bs].vf(
+          src + s * vert_step, src_stride, flat_ref, 0, &sse);
+      horz_4_source_var[s] =
+          ROUND_POWER_OF_TWO(horz_var, num_pels_log2_lookup[horz_4_bs]);
+      vert_4_source_var[s] =
+          ROUND_POWER_OF_TWO(vert_var, num_pels_log2_lookup[vert_4_bs]);
+    }
+  }
+
+  // Ratios between each strip's variance and the whole-block variance,
+  // clamped to [0.1, 10]: first the four 4:1 strips, then the four 1:4 ones.
+  const float denom = (float)(pb_source_variance + 1);
+  const float low_b = 0.1f;
+  const float high_b = 10.0f;
+  for (int s = 0; s < 8; ++s) {
+    const unsigned int strip_var =
+        (s < 4) ? horz_4_source_var[s] : vert_4_source_var[s - 4];
+    float var_ratio = (float)(strip_var + 1) / denom;
+    if (var_ratio < low_b) var_ratio = low_b;
+    if (var_ratio > high_b) var_ratio = high_b;
+    features[n++] = var_ratio;
+  }
+  assert(n == FEATURES);
+
+  // Calculate scores using the NN model.
+  float score[LABELS] = { 0.0f };
+  av1_nn_predict(features, nn_config, score);
+  int int_score[LABELS];
+  int max_score = -1000;
+  for (int k = 0; k < LABELS; ++k) {
+    int_score[k] = (int)(100 * score[k]);
+    if (int_score[k] > max_score) max_score = int_score[k];
+  }
+
+  // Make decisions based on the model scores: every label whose score is
+  // within a size-dependent margin of the best one keeps its partitions.
+  int thresh = max_score;
+  if (bsize == BLOCK_16X16 || bsize == BLOCK_32X32) {
+    thresh -= 500;
+  } else if (bsize == BLOCK_64X64) {
+    thresh -= 200;
+  }
+
+  int allow_horz4 = 0;
+  int allow_vert4 = 0;
+  for (int k = 0; k < LABELS; ++k) {
+    if (int_score[k] < thresh) continue;
+    if (k & 1) allow_horz4 = 1;
+    if (k & 2) allow_vert4 = 1;
+  }
+  *partition_horz4_allowed = allow_horz4;
+  *partition_vert4_allowed = allow_vert4;
+}
+#undef FEATURES
+#undef LABELS
+
+#define FEATURES 4
+// ML-based partition search breakout.
+// Feeds four features (scaled rate, per-pixel distortion, source variance
+// and squared DC quantizer step / 256) to the model for bsize. Returns 1
+// when the score (x100, truncated) reaches the speed-feature threshold for
+// that size. Returns 0 otherwise, and also when no model exists for bsize
+// or the threshold is negative.
+static int ml_predict_breakout(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                               const MACROBLOCK *const x,
+                               const RD_STATS *const rd_stats,
+                               unsigned int pb_source_variance) {
+  // Index shared by the model list and the per-size threshold array.
+  int model = -1;
+  switch (bsize) {
+    case BLOCK_8X8: model = 0; break;
+    case BLOCK_16X16: model = 1; break;
+    case BLOCK_32X32: model = 2; break;
+    case BLOCK_64X64: model = 3; break;
+    case BLOCK_128X128: model = 4; break;
+    default: assert(0 && "Unexpected bsize.");
+  }
+  if (model < 0) return 0;
+
+  const NN_CONFIG *const nn_configs[5] = {
+    &av1_partition_breakout_nnconfig_8,  &av1_partition_breakout_nnconfig_16,
+    &av1_partition_breakout_nnconfig_32, &av1_partition_breakout_nnconfig_64,
+    &av1_partition_breakout_nnconfig_128,
+  };
+  const NN_CONFIG *const nn_config = nn_configs[model];
+  const int thresh = cpi->sf.ml_partition_search_breakout_thresh[model];
+  if (thresh < 0) return 0;
+
+  // Generate feature values.
+  float features[FEATURES];
+  aom_clear_system_state();
+
+  const int num_pels_log2 = num_pels_log2_lookup[bsize];
+
+  // Rate, scaled by rdmult and normalized by the pixel count.
+  const float rate_f = (float)AOMMIN(rd_stats->rate, INT_MAX);
+  features[0] =
+      ((float)x->rdmult / 128.0f / 512.0f / (float)(1 << num_pels_log2)) *
+      rate_f;
+
+  // Distortion per pixel.
+  features[1] = (float)(AOMMIN(rd_stats->dist, INT_MAX) >> num_pels_log2);
+
+  features[2] = (float)pb_source_variance;
+
+  const int dc_q = (int)x->plane[0].dequant_QTX[0];
+  features[3] = (float)(dc_q * dc_q) / 256.0f;
+
+  // Calculate score using the NN model.
+  float score = 0.0f;
+  av1_nn_predict(features, nn_config, &score);
+
+  // Make decision.
+  return (int)(score * 100) >= thresh;
+}
+#undef FEATURES
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(AV1_COMP *const cpi, ThreadData *td,
+ TileDataEnc *tile_data, TOKENEXTRA **tp,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ RD_STATS *rd_cost, int64_t best_rd,
+ PC_TREE *pc_tree, int64_t *none_rd) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int mi_step = mi_size_wide[bsize] / 2;
+ RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
+ const TOKENEXTRA *const tp_orig = *tp;
+ PICK_MODE_CONTEXT *ctx_none = &pc_tree->none;
+ int tmp_partition_cost[PARTITION_TYPES];
+ BLOCK_SIZE subsize;
+ RD_STATS this_rdc, sum_rdc, best_rdc;
+ const int bsize_at_least_8x8 = (bsize >= BLOCK_8X8);
+ int do_square_split = bsize_at_least_8x8;
+ const int pl = bsize_at_least_8x8
+ ? partition_plane_context(xd, mi_row, mi_col, bsize)
+ : 0;
+ const int *partition_cost =
+ pl >= 0 ? x->partition_cost[pl] : x->partition_cost[0];
+
+ int do_rectangular_split = 1;
+ int64_t cur_none_rd = 0;
+ int64_t split_rd[4] = { 0, 0, 0, 0 };
+ int64_t horz_rd[2] = { 0, 0 };
+ int64_t vert_rd[2] = { 0, 0 };
+
+ int split_ctx_is_ready[2] = { 0, 0 };
+ int horz_ctx_is_ready = 0;
+ int vert_ctx_is_ready = 0;
+ BLOCK_SIZE bsize2 = get_partition_subsize(bsize, PARTITION_SPLIT);
+
+ if (best_rd < 0) {
+ pc_tree->none.rdcost = INT64_MAX;
+ pc_tree->none.skip = 0;
+ av1_invalid_rd_stats(rd_cost);
+ return;
+ }
+ if (bsize == cm->seq_params.sb_size) x->must_find_valid_partition = 0;
+
+ // Override skipping rectangular partition operations for edge blocks
+ const int has_rows = (mi_row + mi_step < cm->mi_rows);
+ const int has_cols = (mi_col + mi_step < cm->mi_cols);
+ const int xss = x->e_mbd.plane[1].subsampling_x;
+ const int yss = x->e_mbd.plane[1].subsampling_y;
+
+ BLOCK_SIZE min_size = x->min_partition_size;
+ BLOCK_SIZE max_size = x->max_partition_size;
+
+ if (none_rd) *none_rd = 0;
+
+#if CONFIG_FP_MB_STATS
+ unsigned int src_diff_var = UINT_MAX;
+ int none_complexity = 0;
+#endif
+
+ int partition_none_allowed = has_rows && has_cols;
+ int partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ int partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+
+ (void)*tp_orig;
+
+ // Override partition costs at the edges of the frame in the same
+ // way as in read_partition (see decodeframe.c)
+ if (!(has_rows && has_cols)) {
+ assert(bsize_at_least_8x8 && pl >= 0);
+ const aom_cdf_prob *partition_cdf = cm->fc->partition_cdf[pl];
+ for (int i = 0; i < PARTITION_TYPES; ++i) tmp_partition_cost[i] = INT_MAX;
+ if (has_cols) {
+ // At the bottom, the two possibilities are HORZ and SPLIT
+ aom_cdf_prob bot_cdf[2];
+ partition_gather_vert_alike(bot_cdf, partition_cdf, bsize);
+ static const int bot_inv_map[2] = { PARTITION_HORZ, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, bot_cdf, bot_inv_map);
+ } else if (has_rows) {
+ // At the right, the two possibilities are VERT and SPLIT
+ aom_cdf_prob rhs_cdf[2];
+ partition_gather_horz_alike(rhs_cdf, partition_cdf, bsize);
+ static const int rhs_inv_map[2] = { PARTITION_VERT, PARTITION_SPLIT };
+ av1_cost_tokens_from_cdf(tmp_partition_cost, rhs_cdf, rhs_inv_map);
+ } else {
+ // At the bottom right, we always split
+ tmp_partition_cost[PARTITION_SPLIT] = 0;
+ }
+
+ partition_cost = tmp_partition_cost;
+ }
+
+#ifndef NDEBUG
+ // Nothing should rely on the default value of this array (which is just
+ // leftover from encoding the previous block. Setting it to fixed pattern
+ // when debugging.
+ // bit 0, 1, 2 are blk_skip of each plane
+ // bit 4, 5, 6 are initialization checking of each plane
+ memset(x->blk_skip, 0x77, sizeof(x->blk_skip));
+#endif // NDEBUG
+
+ assert(mi_size_wide[bsize] == mi_size_high[bsize]);
+
+ av1_init_rd_stats(&this_rdc);
+ av1_invalid_rd_stats(&best_rdc);
+ best_rdc.rdcost = best_rd;
+
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+
+ if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+ x->mb_energy = av1_log_block_var(cpi, x, bsize);
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ const int cb_partition_search_ctrl =
+ ((pc_tree->index == 0 || pc_tree->index == 3) +
+ get_chessboard_index(cm->current_video_frame)) &
+ 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
+ // Determine partition types in search according to the speed features.
+ // The threshold set here has to be of square block size.
+ if (cpi->sf.auto_min_max_partition_size) {
+ const int no_partition_allowed = (bsize <= max_size && bsize >= min_size);
+ // Note: Further partitioning is NOT allowed when bsize == min_size already.
+ const int partition_allowed = (bsize <= max_size && bsize > min_size);
+ partition_none_allowed &= no_partition_allowed;
+ partition_horz_allowed &= partition_allowed || !has_rows;
+ partition_vert_allowed &= partition_allowed || !has_cols;
+ do_square_split &= bsize > min_size;
+ }
+
+ if (bsize > cpi->sf.use_square_partition_only_threshold) {
+ partition_horz_allowed &= !has_rows;
+ partition_vert_allowed &= !has_cols;
+ }
+
+ if (bsize > BLOCK_4X4 && x->use_cb_search_range &&
+ cpi->sf.auto_min_max_partition_size == 0) {
+ int split_score = 0;
+ int none_score = 0;
+ const int score_valid = ml_prune_2pass_split_partition(
+ &pc_tree->pc_tree_stats, bsize, &split_score, &none_score);
+ if (score_valid) {
+ {
+ const int only_split_thresh = 300;
+ const int no_none_thresh = 250;
+ const int no_split_thresh = 0;
+ if (split_score > only_split_thresh) {
+ partition_none_allowed = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ } else if (split_score > no_none_thresh) {
+ partition_none_allowed = 0;
+ }
+ if (split_score < no_split_thresh) do_square_split = 0;
+ }
+ {
+ const int no_split_thresh = 120;
+ const int no_none_thresh = -120;
+ if (none_score > no_split_thresh && partition_none_allowed)
+ do_square_split = 0;
+ if (none_score < no_none_thresh) partition_none_allowed = 0;
+ }
+ } else {
+ if (pc_tree->cb_search_range == SPLIT_PLANE) {
+ partition_none_allowed = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ }
+ if (pc_tree->cb_search_range == SEARCH_SAME_PLANE) do_square_split = 0;
+ if (pc_tree->cb_search_range == NONE_PARTITION_PLANE) {
+ do_square_split = 0;
+ partition_horz_allowed = 0;
+ partition_vert_allowed = 0;
+ }
+ }
+
+ // Fall back to default values in case all partition modes are rejected.
+ if (partition_none_allowed == 0 && do_square_split == 0 &&
+ partition_horz_allowed == 0 && partition_vert_allowed == 0) {
+ do_square_split = bsize_at_least_8x8;
+ partition_none_allowed = has_rows && has_cols;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ }
+ }
+
+ xd->above_txfm_context = cm->above_txfm_context[tile_info->tile_row] + mi_col;
+ xd->left_txfm_context =
+ xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+ save_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src, mi_row,
+ mi_col, bsize);
+ }
+
+ // Decide whether we shall split directly and skip searching NONE by using
+ // the first pass block statistics
+ if (cpi->use_fp_mb_stats && bsize >= BLOCK_32X32 && do_square_split &&
+ partition_none_allowed && src_diff_var > 4 &&
+ cm->base_qindex < qindex_split_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ // compute a complexity measure, basically measure inconsistency of motion
+ // vectors obtained from the first pass in the current block
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+
+ MOTION_DIRECTION this_mv;
+ MOTION_DIRECTION right_mv;
+ MOTION_DIRECTION bottom_mv;
+
+ this_mv =
+ get_motion_direction_fp(cpi->twopass.this_frame_mb_stats[mb_index]);
+
+ // to its right
+ if (c != mb_col_end - 1) {
+ right_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + 1]);
+ none_complexity += get_motion_inconsistency(this_mv, right_mv);
+ }
+
+ // to its bottom
+ if (r != mb_row_end - 1) {
+ bottom_mv = get_motion_direction_fp(
+ cpi->twopass.this_frame_mb_stats[mb_index + cm->mb_cols]);
+ none_complexity += get_motion_inconsistency(this_mv, bottom_mv);
+ }
+
+ // do not count its left and top neighbors to avoid double counting
+ }
+ }
+
+ if (none_complexity > complexity_16x16_blocks_threshold[bsize]) {
+ partition_none_allowed = 0;
+ }
+ }
+#endif
+
+ // Ref frames picked in the [i_th] quarter subblock during square partition
+ // RD search. It may be used to prune ref frame selection of rect partitions.
+ int ref_frames_used[4] = {
+ 0,
+ };
+
+BEGIN_PARTITION_SEARCH:
+ if (x->must_find_valid_partition) {
+ partition_none_allowed = has_rows && has_cols;
+ partition_horz_allowed = has_cols && yss <= xss && bsize_at_least_8x8;
+ partition_vert_allowed = has_rows && xss <= yss && bsize_at_least_8x8;
+ }
+
+ // Partition block source pixel variance.
+ unsigned int pb_source_variance = UINT_MAX;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8) partition_horz_allowed = 0;
+ if (block_size_wide[bsize] <= 8) partition_vert_allowed = 0;
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8)
+ do_square_split = 0;
+ }
+#endif
+
+ // PARTITION_NONE
+ if (partition_none_allowed) {
+ int pt_cost = 0;
+ if (bsize_at_least_8x8) {
+ pt_cost = partition_cost[PARTITION_NONE] < INT_MAX
+ ? partition_cost[PARTITION_NONE]
+ : 0;
+ }
+ int64_t partition_rd_cost = RDCOST(x->rdmult, pt_cost, 0);
+ int64_t best_remain_rdcost = (best_rdc.rdcost == INT64_MAX)
+ ? INT64_MAX
+ : (best_rdc.rdcost - partition_rd_cost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_NONE, bsize, ctx_none, best_remain_rdcost);
+ pb_source_variance = x->source_variance;
+ if (none_rd) *none_rd = this_rdc.rdcost;
+ cur_none_rd = this_rdc.rdcost;
+ if (this_rdc.rate != INT_MAX) {
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int ref_type = av1_ref_frame_type(ctx_none->mic.ref_frame);
+ for (int i = 0; i < 4; ++i) {
+ ref_frames_used[i] |= (1 << ref_type);
+ }
+ }
+ if (bsize_at_least_8x8) {
+ this_rdc.rate += pt_cost;
+ this_rdc.rdcost = RDCOST(x->rdmult, this_rdc.rate, this_rdc.dist);
+ }
+
+ if (this_rdc.rdcost < best_rdc.rdcost) {
+ // Adjust dist breakout threshold according to the partition size.
+ const int64_t dist_breakout_thr =
+ cpi->sf.partition_search_breakout_dist_thr >>
+ ((2 * (MAX_SB_SIZE_LOG2 - 2)) -
+ (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]));
+ const int rate_breakout_thr =
+ cpi->sf.partition_search_breakout_rate_thr *
+ num_pels_log2_lookup[bsize];
+
+ best_rdc = this_rdc;
+ if (bsize_at_least_8x8) pc_tree->partitioning = PARTITION_NONE;
+
+ if ((do_square_split || do_rectangular_split) &&
+ !x->e_mbd.lossless[xd->mi[0]->segment_id] && ctx_none->skippable) {
+ const int use_ml_based_breakout =
+ bsize <= cpi->sf.use_square_partition_only_threshold &&
+ bsize > BLOCK_4X4 && xd->bd == 8;
+ if (use_ml_based_breakout) {
+ if (ml_predict_breakout(cpi, bsize, x, &this_rdc,
+ pb_source_variance)) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+ // If all y, u, v transform blocks in this partition are skippable,
+ // and the dist & rate are within the thresholds, the partition
+ // search is terminated for current branch of the partition search
+ // tree. The dist & rate thresholds are set to 0 at speed 0 to
+ // disable the early termination at that speed.
+ if (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+
+#if CONFIG_FP_MB_STATS
+ // Check if every 16x16 first pass block statistics has zero
+ // motion and the corresponding first pass residue is small enough.
+ // If that is the case, check the difference variance between the
+ // current frame and the last frame. If the variance is small enough,
+ // stop further splitting in RD optimization
+ if (cpi->use_fp_mb_stats && do_square_split &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ AOMMIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ AOMMIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if (!(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_MOTION_ZERO_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_SMALL_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ if (src_diff_var == UINT_MAX) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
+ src_diff_var = get_sby_perpixel_diff_variance(
+ cpi, &x->plane[0].src, mi_row, mi_col, bsize);
+ }
+ if (src_diff_var < 8) {
+ do_square_split = 0;
+ do_rectangular_split = 0;
+ }
+ }
+ }
+#endif
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // store estimated motion vector
+ if (cpi->sf.adaptive_motion_search) store_pred_mv(x, ctx_none);
+
+ // PARTITION_SPLIT
+ if (do_square_split) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ sum_rdc.rate = partition_cost[PARTITION_SPLIT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ int idx;
+ for (idx = 0; idx < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++idx) {
+ const int x_idx = (idx & 1) * mi_step;
+ const int y_idx = (idx >> 1) * mi_step;
+
+ if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ continue;
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ pc_tree->split[idx]->index = idx;
+ int64_t *p_split_rd = &split_rd[idx];
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ if (cpi->sf.prune_ref_frame_for_rect_partitions)
+ pc_tree->split[idx]->none.rate = INT_MAX;
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row + y_idx, mi_col + x_idx,
+ subsize, &this_rdc, best_remain_rdcost,
+ pc_tree->split[idx], p_split_rd);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ break;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions &&
+ pc_tree->split[idx]->none.rate != INT_MAX) {
+ const int ref_type =
+ av1_ref_frame_type(pc_tree->split[idx]->none.mic.ref_frame);
+ ref_frames_used[idx] |= (1 << ref_type);
+ }
+ if (idx <= 1 && (bsize <= BLOCK_8X8 ||
+ pc_tree->split[idx]->partitioning == PARTITION_NONE)) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->split[idx]->none.mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) split_ctx_is_ready[idx] = 1;
+ }
+ }
+ }
+ }
+ const int reached_last_index = (idx == 4);
+
+ if (reached_last_index && sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_SPLIT;
+ }
+ } else if (cpi->sf.less_rectangular_check_level > 0) {
+ // skip rectangular partition test when larger block size
+ // gives better rd cost
+ if (cpi->sf.less_rectangular_check_level == 2 || idx <= 2)
+ do_rectangular_split &= !partition_none_allowed;
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ } // if (do_split)
+
+ pc_tree->horizontal[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontal[1].skip_ref_frame_mask = 0;
+ pc_tree->vertical[0].skip_ref_frame_mask = 0;
+ pc_tree->vertical[1].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames) pc_tree->horizontal[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames) pc_tree->horizontal[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->vertical[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->vertical[1].skip_ref_frame_mask = ~used_frames;
+ }
+
+ int prune_horz = 0;
+ int prune_vert = 0;
+ if (cpi->sf.ml_prune_rect_partition && !frame_is_intra_only(cm) &&
+ (partition_horz_allowed || partition_vert_allowed)) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ ml_prune_rect_partition(cpi, x, bsize, best_rdc.rdcost, cur_none_rd,
+ split_rd, &prune_horz, &prune_vert);
+ }
+
+ // PARTITION_HORZ
+ if (partition_horz_allowed && !prune_horz &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ);
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->horizontal[0].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[0],
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ horz_rd[0] = this_rdc.rdcost;
+
+ if (sum_rdc.rdcost < best_rdc.rdcost && has_rows) {
+ const PICK_MODE_CONTEXT *const ctx_h = &pc_tree->horizontal[0];
+ const MB_MODE_INFO *const mbmi = &pc_tree->horizontal[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) horz_ctx_is_ready = 1;
+ }
+ update_state(cpi, tile_data, td, ctx_h, mi_row, mi_col, subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_h);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->horizontal[1].pred_interp_filter =
+ av1_extract_interp_filter(ctx_h->mic.interp_filters, 0);
+ }
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col, &this_rdc,
+ PARTITION_HORZ, subsize, &pc_tree->horizontal[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+ horz_rd[1] = this_rdc.rdcost;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_HORZ;
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT
+ if (partition_vert_allowed && !prune_vert &&
+ (do_rectangular_split || active_v_edge(cpi, mi_col, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ subsize = get_partition_subsize(bsize, PARTITION_VERT);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->vertical[0].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ sum_rdc.rate = partition_cost[PARTITION_VERT];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+ int64_t best_remain_rdcost = best_rdc.rdcost == INT64_MAX
+ ? INT64_MAX
+ : (best_rdc.rdcost - sum_rdc.rdcost);
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[0],
+ best_remain_rdcost);
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ vert_rd[0] = this_rdc.rdcost;
+ const int64_t vert_max_rdcost = best_rdc.rdcost;
+ if (sum_rdc.rdcost < vert_max_rdcost && has_cols) {
+ const MB_MODE_INFO *const mbmi = &pc_tree->vertical[0].mic;
+ const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ // Neither palette mode nor cfl predicted
+ if (pmi->palette_size[0] == 0 && pmi->palette_size[1] == 0) {
+ if (mbmi->uv_mode != UV_CFL_PRED) vert_ctx_is_ready = 1;
+ }
+ update_state(cpi, tile_data, td, &pc_tree->vertical[0], mi_row, mi_col,
+ subsize, 1);
+ encode_superblock(cpi, tile_data, td, tp, DRY_RUN_NORMAL, mi_row, mi_col,
+ subsize, NULL);
+
+ if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx_none);
+
+ if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+ partition_none_allowed) {
+ pc_tree->vertical[1].pred_interp_filter =
+ av1_extract_interp_filter(ctx_none->mic.interp_filters, 0);
+ }
+ rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+ PARTITION_VERT, subsize, &pc_tree->vertical[1],
+ best_rdc.rdcost - sum_rdc.rdcost);
+ vert_rd[1] = this_rdc.rdcost;
+
+ if (this_rdc.rate == INT_MAX) {
+ sum_rdc.rdcost = INT64_MAX;
+ } else {
+ sum_rdc.rate += this_rdc.rate;
+ sum_rdc.dist += this_rdc.dist;
+ sum_rdc.rdcost += this_rdc.rdcost;
+ }
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_VERT;
+ }
+ }
+
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ if (pb_source_variance == UINT_MAX) {
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ pb_source_variance = av1_high_get_sby_perpixel_variance(
+ cpi, &x->plane[0].src, bsize, xd->bd);
+ } else {
+ pb_source_variance =
+ av1_get_sby_perpixel_variance(cpi, &x->plane[0].src, bsize);
+ }
+ }
+
+ const int ext_partition_allowed =
+ do_rectangular_split && bsize > BLOCK_8X8 && partition_none_allowed;
+
+ // The standard AB partitions are allowed whenever ext-partition-types are
+ // allowed
+ int horzab_partition_allowed = ext_partition_allowed;
+ int vertab_partition_allowed = ext_partition_allowed;
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 8 || block_size_wide[bsize] <= 8) {
+ horzab_partition_allowed = 0;
+ vertab_partition_allowed = 0;
+ }
+ }
+#endif
+
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ if (cpi->sf.prune_ext_partition_types_search_level == 1) {
+ // TODO(debargha,huisu@google.com): may need to tune the threshold for
+ // pb_source_variance.
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ (pc_tree->partitioning == PARTITION_NONE &&
+ pb_source_variance < 32) ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ } else {
+ horzab_partition_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ vertab_partition_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_SPLIT);
+ }
+ horz_rd[0] = (horz_rd[0] < INT64_MAX ? horz_rd[0] : 0);
+ horz_rd[1] = (horz_rd[1] < INT64_MAX ? horz_rd[1] : 0);
+ vert_rd[0] = (vert_rd[0] < INT64_MAX ? vert_rd[0] : 0);
+ vert_rd[1] = (vert_rd[1] < INT64_MAX ? vert_rd[1] : 0);
+ split_rd[0] = (split_rd[0] < INT64_MAX ? split_rd[0] : 0);
+ split_rd[1] = (split_rd[1] < INT64_MAX ? split_rd[1] : 0);
+ split_rd[2] = (split_rd[2] < INT64_MAX ? split_rd[2] : 0);
+ split_rd[3] = (split_rd[3] < INT64_MAX ? split_rd[3] : 0);
+ }
+ int horza_partition_allowed = horzab_partition_allowed;
+ int horzb_partition_allowed = horzab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ const int64_t horz_a_rd = horz_rd[1] + split_rd[0] + split_rd[1];
+ const int64_t horz_b_rd = horz_rd[0] + split_rd[2] + split_rd[3];
+ switch (cpi->sf.prune_ext_partition_types_search_level) {
+ case 1:
+ horza_partition_allowed &= (horz_a_rd / 16 * 14 < best_rdc.rdcost);
+ horzb_partition_allowed &= (horz_b_rd / 16 * 14 < best_rdc.rdcost);
+ break;
+ case 2:
+ default:
+ horza_partition_allowed &= (horz_a_rd / 16 * 15 < best_rdc.rdcost);
+ horzb_partition_allowed &= (horz_b_rd / 16 * 15 < best_rdc.rdcost);
+ break;
+ }
+ }
+
+ int verta_partition_allowed = vertab_partition_allowed;
+ int vertb_partition_allowed = vertab_partition_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level) {
+ const int64_t vert_a_rd = vert_rd[1] + split_rd[0] + split_rd[2];
+ const int64_t vert_b_rd = vert_rd[0] + split_rd[1] + split_rd[3];
+ switch (cpi->sf.prune_ext_partition_types_search_level) {
+ case 1:
+ verta_partition_allowed &= (vert_a_rd / 16 * 14 < best_rdc.rdcost);
+ vertb_partition_allowed &= (vert_b_rd / 16 * 14 < best_rdc.rdcost);
+ break;
+ case 2:
+ default:
+ verta_partition_allowed &= (vert_a_rd / 16 * 15 < best_rdc.rdcost);
+ vertb_partition_allowed &= (vert_b_rd / 16 * 15 < best_rdc.rdcost);
+ break;
+ }
+ }
+
+ if (cpi->sf.ml_prune_ab_partition && ext_partition_allowed &&
+ partition_horz_allowed && partition_vert_allowed) {
+ // TODO(huisu@google.com): x->source_variance may not be the current block's
+ // variance. The correct one to use is pb_source_variance.
+ // Need to re-train the model to fix it.
+ ml_prune_ab_partition(bsize, pc_tree->partitioning,
+ get_unsigned_bits(x->source_variance),
+ best_rdc.rdcost, horz_rd, vert_rd, split_rd,
+ &horza_partition_allowed, &horzb_partition_allowed,
+ &verta_partition_allowed, &vertb_partition_allowed);
+ }
+
+ // PARTITION_HORZ_A
+ if (partition_horz_allowed && horza_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_A);
+ pc_tree->horizontala[0].rd_mode_is_ready = 0;
+ pc_tree->horizontala[1].rd_mode_is_ready = 0;
+ pc_tree->horizontala[2].rd_mode_is_ready = 0;
+ if (split_ctx_is_ready[0]) {
+ av1_copy_tree_context(&pc_tree->horizontala[0], &pc_tree->split[0]->none);
+ pc_tree->horizontala[0].mic.partition = PARTITION_HORZ_A;
+ pc_tree->horizontala[0].rd_mode_is_ready = 1;
+ if (split_ctx_is_ready[1]) {
+ av1_copy_tree_context(&pc_tree->horizontala[1],
+ &pc_tree->split[1]->none);
+ pc_tree->horizontala[1].mic.partition = PARTITION_HORZ_A;
+ pc_tree->horizontala[1].rd_mode_is_ready = 1;
+ }
+ }
+ pc_tree->horizontala[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames)
+ pc_tree->horizontala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2] | ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontala[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_A, mi_row, mi_col, bsize2, mi_row,
+ mi_col + mi_step, bsize2, mi_row + mi_step, mi_col,
+ subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+ // PARTITION_HORZ_B
+ if (partition_horz_allowed && horzb_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_B);
+ pc_tree->horizontalb[0].rd_mode_is_ready = 0;
+ pc_tree->horizontalb[1].rd_mode_is_ready = 0;
+ pc_tree->horizontalb[2].rd_mode_is_ready = 0;
+ if (horz_ctx_is_ready) {
+ av1_copy_tree_context(&pc_tree->horizontalb[0], &pc_tree->horizontal[0]);
+ pc_tree->horizontalb[0].mic.partition = PARTITION_HORZ_B;
+ pc_tree->horizontalb[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->horizontalb[0].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[1].skip_ref_frame_mask = 0;
+ pc_tree->horizontalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[1];
+ if (used_frames)
+ pc_tree->horizontalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames)
+ pc_tree->horizontalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames)
+ pc_tree->horizontalb[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->horizontalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_HORZ_B, mi_row, mi_col, subsize,
+ mi_row + mi_step, mi_col, bsize2, mi_row + mi_step,
+ mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT_A
+ if (partition_vert_allowed && verta_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_A);
+ pc_tree->verticala[0].rd_mode_is_ready = 0;
+ pc_tree->verticala[1].rd_mode_is_ready = 0;
+ pc_tree->verticala[2].rd_mode_is_ready = 0;
+ if (split_ctx_is_ready[0]) {
+ av1_copy_tree_context(&pc_tree->verticala[0], &pc_tree->split[0]->none);
+ pc_tree->verticala[0].mic.partition = PARTITION_VERT_A;
+ pc_tree->verticala[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->verticala[0].skip_ref_frame_mask = 0;
+ pc_tree->verticala[1].skip_ref_frame_mask = 0;
+ pc_tree->verticala[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0];
+ if (used_frames) pc_tree->verticala[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[2];
+ if (used_frames) pc_tree->verticala[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1] | ref_frames_used[3];
+ if (used_frames) pc_tree->verticala[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticala, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_A, mi_row, mi_col, bsize2,
+ mi_row + mi_step, mi_col, bsize2, mi_row,
+ mi_col + mi_step, subsize);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+ // PARTITION_VERT_B
+ if (partition_vert_allowed && vertb_partition_allowed) {
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_B);
+ pc_tree->verticalb[0].rd_mode_is_ready = 0;
+ pc_tree->verticalb[1].rd_mode_is_ready = 0;
+ pc_tree->verticalb[2].rd_mode_is_ready = 0;
+ if (vert_ctx_is_ready) {
+ av1_copy_tree_context(&pc_tree->verticalb[0], &pc_tree->vertical[0]);
+ pc_tree->verticalb[0].mic.partition = PARTITION_VERT_B;
+ pc_tree->verticalb[0].rd_mode_is_ready = 1;
+ }
+ pc_tree->verticalb[0].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[1].skip_ref_frame_mask = 0;
+ pc_tree->verticalb[2].skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ int used_frames;
+ used_frames = ref_frames_used[0] | ref_frames_used[2];
+ if (used_frames) pc_tree->verticalb[0].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[1];
+ if (used_frames) pc_tree->verticalb[1].skip_ref_frame_mask = ~used_frames;
+ used_frames = ref_frames_used[3];
+ if (used_frames) pc_tree->verticalb[2].skip_ref_frame_mask = ~used_frames;
+ }
+ rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+ pc_tree->verticalb, ctx_none, mi_row, mi_col, bsize,
+ PARTITION_VERT_B, mi_row, mi_col, subsize, mi_row,
+ mi_col + mi_step, bsize2, mi_row + mi_step,
+ mi_col + mi_step, bsize2);
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // partition4_allowed is 1 if we can use a PARTITION_HORZ_4 or
+ // PARTITION_VERT_4 for this block. This is almost the same as
+ // ext_partition_allowed, except that we don't allow 128x32 or 32x128 blocks,
+ // so we require that bsize is not BLOCK_128X128.
+ const int partition4_allowed =
+ ext_partition_allowed && bsize != BLOCK_128X128;
+ int partition_horz4_allowed = partition4_allowed && partition_horz_allowed;
+ int partition_vert4_allowed = partition4_allowed && partition_vert_allowed;
+ if (cpi->sf.prune_ext_partition_types_search_level == 2) {
+ partition_horz4_allowed &= (pc_tree->partitioning == PARTITION_HORZ ||
+ pc_tree->partitioning == PARTITION_HORZ_A ||
+ pc_tree->partitioning == PARTITION_HORZ_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ partition_vert4_allowed &= (pc_tree->partitioning == PARTITION_VERT ||
+ pc_tree->partitioning == PARTITION_VERT_A ||
+ pc_tree->partitioning == PARTITION_VERT_B ||
+ pc_tree->partitioning == PARTITION_SPLIT ||
+ pc_tree->partitioning == PARTITION_NONE);
+ }
+ if (cpi->sf.ml_prune_4_partition && partition4_allowed &&
+ partition_horz_allowed && partition_vert_allowed) {
+ ml_prune_4_partition(cpi, x, bsize, pc_tree->partitioning, best_rdc.rdcost,
+ horz_rd, vert_rd, split_rd, &partition_horz4_allowed,
+ &partition_vert4_allowed, pb_source_variance, mi_row,
+ mi_col);
+ }
+
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8) {
+ if (block_size_high[bsize] <= 16 || block_size_wide[bsize] <= 16) {
+ partition_horz4_allowed = 0;
+ partition_vert4_allowed = 0;
+ }
+ }
+#endif
+
+ // PARTITION_HORZ_4
+ if (partition_horz4_allowed && has_rows &&
+ (do_rectangular_split || active_h_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ const int quarter_step = mi_size_high[bsize] / 4;
+ PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+ subsize = get_partition_subsize(bsize, PARTITION_HORZ_4);
+ sum_rdc.rate = partition_cost[PARTITION_HORZ_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_row = mi_row + i * quarter_step;
+
+ if (i > 0 && this_mi_row >= cm->mi_rows) break;
+
+ PICK_MODE_CONTEXT *ctx_this = &pc_tree->horizontal4[i];
+
+ ctx_this->rd_mode_is_ready = 0;
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[1])
+ : (ref_frames_used[2] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), this_mi_row,
+ mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_HORZ_4, ctx_prev, ctx_this))
+ break;
+
+ ctx_prev = ctx_this;
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_HORZ_4;
+ }
+ }
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ // PARTITION_VERT_4
+ if (partition_vert4_allowed && has_cols &&
+ (do_rectangular_split || active_v_edge(cpi, mi_row, mi_step))) {
+ av1_init_rd_stats(&sum_rdc);
+ const int quarter_step = mi_size_wide[bsize] / 4;
+ PICK_MODE_CONTEXT *ctx_prev = ctx_none;
+
+ subsize = get_partition_subsize(bsize, PARTITION_VERT_4);
+ sum_rdc.rate = partition_cost[PARTITION_VERT_4];
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, 0);
+
+ for (int i = 0; i < 4; ++i) {
+ const int this_mi_col = mi_col + i * quarter_step;
+
+ if (i > 0 && this_mi_col >= cm->mi_cols) break;
+
+ PICK_MODE_CONTEXT *ctx_this = &pc_tree->vertical4[i];
+
+ ctx_this->rd_mode_is_ready = 0;
+ ctx_this->skip_ref_frame_mask = 0;
+ if (cpi->sf.prune_ref_frame_for_rect_partitions) {
+ const int used_frames = i <= 1
+ ? (ref_frames_used[0] | ref_frames_used[2])
+ : (ref_frames_used[1] | ref_frames_used[3]);
+ if (used_frames) ctx_this->skip_ref_frame_mask = ~used_frames;
+ }
+ if (!rd_try_subblock(cpi, td, tile_data, tp, (i == 3), mi_row,
+ this_mi_col, subsize, &best_rdc, &sum_rdc, &this_rdc,
+ PARTITION_VERT_4, ctx_prev, ctx_this))
+ break;
+
+ ctx_prev = ctx_this;
+ }
+
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ sum_rdc.rdcost = RDCOST(x->rdmult, sum_rdc.rate, sum_rdc.dist);
+ if (sum_rdc.rdcost < best_rdc.rdcost) {
+ best_rdc = sum_rdc;
+ pc_tree->partitioning = PARTITION_VERT_4;
+ }
+ }
+ restore_context(x, &x_ctx, mi_row, mi_col, bsize, num_planes);
+ }
+
+ if (bsize == cm->seq_params.sb_size && best_rdc.rate == INT_MAX) {
+ // Did not find a valid partition, go back and search again, with less
+ // constraint on which partition types to search.
+ x->must_find_valid_partition = 1;
+ goto BEGIN_PARTITION_SEARCH;
+ }
+
+ // TODO(jbb): This code added so that we avoid static analysis
+ // warning related to the fact that best_rd isn't used after this
+ // point. This code should be refactored so that the duplicate
+ // checks occur in some sub function and thus are used...
+ (void)best_rd;
+ *rd_cost = best_rdc;
+
+ if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
+ pc_tree->index != 3) {
+ if (bsize == cm->seq_params.sb_size) {
+ x->cb_offset = 0;
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, OUTPUT_ENABLED, bsize,
+ pc_tree, NULL);
+ } else {
+ encode_sb(cpi, td, tile_data, tp, mi_row, mi_col, DRY_RUN_NORMAL, bsize,
+ pc_tree, NULL);
+ }
+ }
+
+ if (bsize == cm->seq_params.sb_size) {
+ assert(best_rdc.rate < INT_MAX);
+ assert(best_rdc.dist < INT64_MAX);
+ } else {
+ assert(tp_orig == *tp);
+ }
+}
+
+ // Saturate every first-partition-pass stats table: each byte of ref0_counts
+ // and ref1_counts is filled with 0xff and sample_counts is set to INT_MAX.
+ static void init_first_partition_pass_stats_tables(
+     FIRST_PARTITION_PASS_STATS *stats) {
+   FIRST_PARTITION_PASS_STATS *const tables_end =
+       stats + FIRST_PARTITION_PASS_STATS_TABLES;
+   for (FIRST_PARTITION_PASS_STATS *table = stats; table != tables_end;
+        ++table) {
+     memset(table->ref0_counts, 0xff, sizeof(table->ref0_counts));
+     memset(table->ref1_counts, 0xff, sizeof(table->ref1_counts));
+     table->sample_counts = INT_MAX;
+   }
+}
+
+ // Recursively mark pc_tree_stats as invalid for `pt` and for every node
+ // reachable through its four split[] children. A NULL node is a no-op.
+ static INLINE void clear_pc_tree_stats(PC_TREE *pt) {
+   if (pt != NULL) {
+     pt->pc_tree_stats.valid = 0;
+     int child = 0;
+     while (child < 4) {
+       clear_pc_tree_stats(pt->split[child]);
+       ++child;
+     }
+   }
+}
+
+ // Minimum sample_counts for a first-pass stats table to be used as-is; below
+ // this, encode_rd_sb_row resets its ref0/ref1 counts to 0xff (all available).
+#define FIRST_PARTITION_PASS_MIN_SAMPLES 16
+
+static void encode_rd_sb_row(AV1_COMP *cpi, ThreadData *td,
+ TileDataEnc *tile_data, int mi_row,
+ TOKENEXTRA **tp) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const TileInfo *const tile_info = &tile_data->tile_info;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const int leaf_nodes = 256;
+
+ // Initialize the left context for the new SB row
+ av1_zero_left_context(xd);
+
+ // Reset delta for every tile
+ if (mi_row == tile_info->mi_row_start) {
+ if (cm->delta_q_present_flag) xd->current_qindex = cm->base_qindex;
+ if (cm->delta_lf_present_flag) {
+ av1_reset_loop_filter_delta(xd, av1_num_planes(cm));
+ }
+ }
+
+ PC_TREE *const pc_root =
+ td->pc_root[cm->seq_params.mib_size_log2 - MIN_MIB_SIZE_LOG2];
+ // Code each SB in the row
+ for (int mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
+ mi_col += cm->seq_params.mib_size) {
+ av1_fill_coeff_costs(&td->mb, xd->tile_ctx, num_planes);
+ av1_fill_mode_rates(cm, x, xd->tile_ctx);
+
+ if (sf->adaptive_pred_interp_filter) {
+ for (int i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+
+ av1_zero(x->txb_rd_record_8X8);
+ av1_zero(x->txb_rd_record_16X16);
+ av1_zero(x->txb_rd_record_32X32);
+ av1_zero(x->txb_rd_record_64X64);
+ av1_zero(x->txb_rd_record_intra);
+
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
+ const struct segmentation *const seg = &cm->seg;
+ int seg_skip = 0;
+ if (seg->enabled) {
+ const uint8_t *const map =
+ seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+ const int segment_id =
+ map ? get_segment_id(cm, map, cm->seq_params.sb_size, mi_row, mi_col)
+ : 0;
+ seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+ }
+ xd->cur_frame_force_integer_mv = cm->cur_frame_force_integer_mv;
+
+ x->sb_energy_level = 0;
+ if (cm->delta_q_present_flag) {
+ // Delta-q modulation based on variance
+ av1_setup_src_planes(x, cpi->source, mi_row, mi_col, num_planes);
+
+ int offset_qindex;
+ if (DELTAQ_MODULATION == 1) {
+ const int block_wavelet_energy_level =
+ av1_block_wavelet_energy_level(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_wavelet_energy_level;
+ offset_qindex = av1_compute_deltaq_from_energy_level(
+ cpi, block_wavelet_energy_level);
+ } else {
+ const int block_var_level =
+ av1_log_block_var(cpi, x, cm->seq_params.sb_size);
+ x->sb_energy_level = block_var_level;
+ offset_qindex =
+ av1_compute_deltaq_from_energy_level(cpi, block_var_level);
+ }
+ const int qmask = ~(cm->delta_q_res - 1);
+ int current_qindex = clamp(cm->base_qindex + offset_qindex,
+ cm->delta_q_res, 256 - cm->delta_q_res);
+ current_qindex =
+ ((current_qindex - cm->base_qindex + cm->delta_q_res / 2) & qmask) +
+ cm->base_qindex;
+ assert(current_qindex > 0);
+
+ xd->delta_qindex = current_qindex - cm->base_qindex;
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ xd->mi[0]->current_qindex = current_qindex;
+ av1_init_plane_quantizers(cpi, x, xd->mi[0]->segment_id);
+ if (cpi->oxcf.deltaq_mode == DELTA_Q_LF) {
+ const int lfmask = ~(cm->delta_lf_res - 1);
+ const int delta_lf_from_base =
+ ((offset_qindex / 2 + cm->delta_lf_res / 2) & lfmask);
+
+ // pre-set the delta lf for loop filter. Note that this value is set
+ // before mi is assigned for each block in current superblock
+ for (int j = 0;
+ j < AOMMIN(cm->seq_params.mib_size, cm->mi_rows - mi_row); j++) {
+ for (int k = 0;
+ k < AOMMIN(cm->seq_params.mib_size, cm->mi_cols - mi_col); k++) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .delta_lf_from_base =
+ clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ const int frame_lf_count =
+ av1_num_planes(cm) > 1 ? FRAME_LF_COUNT : FRAME_LF_COUNT - 2;
+ for (int lf_id = 0; lf_id < frame_lf_count; ++lf_id) {
+ cm->mi[(mi_row + j) * cm->mi_stride + (mi_col + k)]
+ .delta_lf[lf_id] =
+ clamp(delta_lf_from_base, -MAX_LOOP_FILTER, MAX_LOOP_FILTER);
+ }
+ }
+ }
+ }
+ }
+
+ int dummy_rate;
+ int64_t dummy_dist;
+ RD_STATS dummy_rdc;
+ const int idx_str = cm->mi_stride * mi_row + mi_col;
+ MB_MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+ x->source_variance = UINT_MAX;
+ if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ const BLOCK_SIZE bsize =
+ seg_skip ? cm->seq_params.sb_size : sf->always_this_block_size;
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+ pc_root);
+ } else if (cpi->partition_search_skippable_frame) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ const BLOCK_SIZE bsize =
+ get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
+ set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
+ rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rate, &dummy_dist, 1,
+ pc_root);
+ } else {
+ // If required set upper and lower partition size limits
+ if (sf->auto_min_max_partition_size) {
+ set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->seq_params.sb_size);
+ rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
+ &x->min_partition_size, &x->max_partition_size);
+ }
+
+ reset_partition(pc_root, cm->seq_params.sb_size);
+ x->use_cb_search_range = 0;
+ init_first_partition_pass_stats_tables(x->first_partition_pass_stats);
+ // Do the first pass if we need two pass partition search
+ if (cpi->sf.two_pass_partition_search &&
+ cpi->sf.use_square_partition_only_threshold > BLOCK_4X4 &&
+ mi_row + mi_size_high[cm->seq_params.sb_size] < cm->mi_rows &&
+ mi_col + mi_size_wide[cm->seq_params.sb_size] < cm->mi_cols &&
+ cm->frame_type != KEY_FRAME) {
+ x->cb_partition_scan = 1;
+ // Reset the stats tables.
+ if (sf->mode_pruning_based_on_two_pass_partition_search)
+ av1_zero(x->first_partition_pass_stats);
+ clear_pc_tree_stats(pc_root);
+ rd_pick_sqr_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX,
+ pc_root, NULL);
+ x->cb_partition_scan = 0;
+
+ x->source_variance = UINT_MAX;
+ if (sf->adaptive_pred_interp_filter) {
+ for (int i = 0; i < leaf_nodes; ++i) {
+ td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
+ td->pc_tree[i].horizontal[1].pred_interp_filter = SWITCHABLE;
+ }
+ }
+
+ x->mb_rd_record.num = x->mb_rd_record.index_start = 0;
+ av1_zero(x->txb_rd_record_8X8);
+ av1_zero(x->txb_rd_record_16X16);
+ av1_zero(x->txb_rd_record_32X32);
+ av1_zero(x->txb_rd_record_64X64);
+ av1_zero(x->txb_rd_record_intra);
+ av1_zero(x->pred_mv);
+ pc_root->index = 0;
+
+ for (int idy = 0; idy < mi_size_high[cm->seq_params.sb_size]; ++idy) {
+ for (int idx = 0; idx < mi_size_wide[cm->seq_params.sb_size]; ++idx) {
+ const int offset = cm->mi_stride * (mi_row + idy) + (mi_col + idx);
+ cm->mi_grid_visible[offset] = 0;
+ }
+ }
+
+ x->use_cb_search_range = 1;
+
+ if (sf->mode_pruning_based_on_two_pass_partition_search) {
+ for (int i = 0; i < FIRST_PARTITION_PASS_STATS_TABLES; ++i) {
+ FIRST_PARTITION_PASS_STATS *const stat =
+ &x->first_partition_pass_stats[i];
+ if (stat->sample_counts < FIRST_PARTITION_PASS_MIN_SAMPLES) {
+ // If there are not enough samples collected, make all available.
+ memset(stat->ref0_counts, 0xff, sizeof(stat->ref0_counts));
+ memset(stat->ref1_counts, 0xff, sizeof(stat->ref1_counts));
+ } else if (sf->selective_ref_frame < 2) {
+ // ALTREF2_FRAME and BWDREF_FRAME may be skipped during the
+ // initial partition scan, so we don't eliminate them.
+ stat->ref0_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref1_counts[ALTREF2_FRAME] = 0xff;
+ stat->ref0_counts[BWDREF_FRAME] = 0xff;
+ stat->ref1_counts[BWDREF_FRAME] = 0xff;
+ }
+ }
+ }
+ }
+
+ rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col,
+ cm->seq_params.sb_size, &dummy_rdc, INT64_MAX, pc_root,
+ NULL);
+ }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ // TODO(angiebird): Let inter_mode_rd_model_estimation support multi-tile.
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ av1_inter_mode_data_fit(tile_data, x->rdmult);
+ }
+#endif
+ }
+}
+
+ // Prepares the encoder's macroblock state for a new frame: hands the current
+ // source frame to av1_setup_src_planes() (at position 0, 0) and passes the
+ // sequence's chroma subsampling to av1_setup_block_planes().
+ static void init_encode_frame_mb_context(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   MACROBLOCK *const mb = &cpi->td.mb;
+   const int plane_count = av1_num_planes(cm);
+
+   // Source planes first, then the block-plane setup on the embedded
+   // MACROBLOCKD.
+   av1_setup_src_planes(mb, cpi->source, 0, 0, plane_count);
+   av1_setup_block_planes(&mb->e_mbd, cm->seq_params.subsampling_x,
+                          cm->seq_params.subsampling_y, plane_count);
+ }
+
+ // Classifies the frame being coded as INTRA_FRAME, ALTREF_FRAME, GOLDEN_FRAME
+ // or LAST_FRAME, based on whether it is intra-only, on the rate-control
+ // overlay flags, and on which reference buffers it refreshes.
+ static MV_REFERENCE_FRAME get_frame_type(const AV1_COMP *cpi) {
+   if (frame_is_intra_only(&cpi->common)) return INTRA_FRAME;
+
+   // We will not update the golden frame with an internal overlay frame.
+   const int is_alt_overlay =
+       cpi->rc.is_src_frame_alt_ref && cpi->refresh_golden_frame;
+   if (is_alt_overlay || cpi->rc.is_src_frame_ext_arf) return ALTREF_FRAME;
+
+   const int refreshes_long_term_ref = cpi->refresh_golden_frame ||
+                                       cpi->refresh_alt2_ref_frame ||
+                                       cpi->refresh_alt_ref_frame;
+   if (refreshes_long_term_ref) return GOLDEN_FRAME;
+
+   // TODO(zoeliu): To investigate whether a frame_type other than
+   // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+   return LAST_FRAME;
+ }
+
+ // Chooses the frame-level transform mode: ONLY_4X4 when the frame is coded
+ // lossless, otherwise a mode derived from the tx_size_search_method speed
+ // feature, falling back to the mode already stored in the common state.
+ static TX_MODE select_tx_mode(const AV1_COMP *cpi) {
+   if (cpi->common.coded_lossless) return ONLY_4X4;
+
+   switch (cpi->sf.tx_size_search_method) {
+     case USE_LARGESTALL: return TX_MODE_LARGEST;
+     case USE_FULL_RD:
+     case USE_FAST_RD: return TX_MODE_SELECT;
+     default: return cpi->common.tx_mode;
+   }
+ }
+
+ // (Re)allocates cpi->tile_data with one TileDataEnc per tile of the current
+ // tile_cols x tile_rows grid, records the capacity in cpi->allocated_tiles,
+ // and initialises every tile's thresh_freq_fact[][] to 32 and mode_map[][]
+ // to the identity mapping.
+ void av1_alloc_tile_data(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int tile_cols = cm->tile_cols;
+   const int tile_rows = cm->tile_rows;
+   const int num_tiles = tile_cols * tile_rows;
+
+   // aom_free() accepts NULL, so no guard is needed. Clear the recorded
+   // capacity before reallocating so that it never describes the buffer that
+   // was just released (NOTE(review): CHECK_MEM_ERROR is presumed to bail out
+   // on allocation failure -- confirm against its definition).
+   aom_free(cpi->tile_data);
+   cpi->allocated_tiles = 0;
+   CHECK_MEM_ERROR(cm, cpi->tile_data,
+                   aom_memalign(32, num_tiles * sizeof(*cpi->tile_data)));
+   cpi->allocated_tiles = num_tiles;
+
+   for (int tile_row = 0; tile_row < tile_rows; ++tile_row) {
+     for (int tile_col = 0; tile_col < tile_cols; ++tile_col) {
+       TileDataEnc *const tile_data =
+           &cpi->tile_data[tile_row * tile_cols + tile_col];
+       for (int i = 0; i < BLOCK_SIZES_ALL; ++i) {
+         for (int j = 0; j < MAX_MODES; ++j) {
+           tile_data->thresh_freq_fact[i][j] = 32;
+           tile_data->mode_map[i][j] = j;
+         }
+       }
+     }
+   }
+ }
+
+ // Initialises the per-tile encoder state for the current frame. For every
+ // tile, in raster order, this fills in its TileInfo, assigns its slice of the
+ // token buffer and of the token-list array (each tile starts where the
+ // previous tile's share ends), and decides whether CDF updates are allowed.
+ void av1_init_tile_data(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_planes = av1_num_planes(cm);
+   const int tile_cols = cm->tile_cols;
+   const int tile_rows = cm->tile_rows;
+   // Running cursors into the shared buffers, plus the size of the share that
+   // was handed to the previously visited tile.
+   TOKENEXTRA *tok_cursor = cpi->tile_tok[0][0];
+   TOKENLIST *tplist_cursor = cpi->tplist[0][0];
+   unsigned int prev_tile_tokens = 0;
+   int prev_tile_sb_rows = 0;
+
+   for (int row = 0; row < tile_rows; ++row) {
+     for (int col = 0; col < tile_cols; ++col) {
+       TileDataEnc *const tile_data = &cpi->tile_data[row * tile_cols + col];
+       TileInfo *const tile_info = &tile_data->tile_info;
+       av1_tile_init(tile_info, cm, row, col);
+
+       // Token buffer slice for this tile.
+       tok_cursor += prev_tile_tokens;
+       cpi->tile_tok[row][col] = tok_cursor;
+       prev_tile_tokens = allocated_tokens(
+           *tile_info, cm->seq_params.mib_size_log2 + MI_SIZE_LOG2, num_planes);
+
+       // Token-list slice for this tile (one entry per superblock row).
+       tplist_cursor += prev_tile_sb_rows;
+       cpi->tplist[row][col] = tplist_cursor;
+       prev_tile_sb_rows = av1_get_sb_rows_in_tile(cm, tile_data->tile_info);
+
+       // CDF updates are allowed only when neither large-scale tile mode nor
+       // disable_cdf_update is set.
+       tile_data->allow_update_cdf =
+           !cm->large_scale_tile && !cm->disable_cdf_update;
+     }
+   }
+ }
+
+ // Encodes one superblock row of a tile. Records in the tile's token list
+ // entry for that row where its tokens start and stop and how many were
+ // produced, and asserts that the row stayed within its token allocation.
+ void av1_encode_sb_row(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                        int tile_col, int mi_row) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_planes = av1_num_planes(cm);
+   TileDataEnc *const this_tile =
+       &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+   const TileInfo *const tile_info = &this_tile->tile_info;
+   const int sb_size_log2 = cm->seq_params.mib_size_log2 + MI_SIZE_LOG2;
+
+   // These two are only consumed by the assert below.
+   const int tile_mb_cols =
+       (tile_info->mi_col_end - tile_info->mi_col_start + 2) >> 2;
+   const int num_mb_rows_in_sb = ((1 << sb_size_log2) + 8) >> 4;
+
+   const int sb_row_in_tile =
+       (mi_row - tile_info->mi_row_start) >> cm->seq_params.mib_size_log2;
+   TOKENLIST *const row_tokens =
+       &cpi->tplist[tile_row][tile_col][sb_row_in_tile];
+
+   TOKENEXTRA *tok = NULL;
+   get_start_tok(cpi, tile_row, tile_col, mi_row, &tok, sb_size_log2,
+                 num_planes);
+   row_tokens->start = tok;
+
+   encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
+
+   row_tokens->stop = tok;
+   row_tokens->count = (unsigned int)(row_tokens->stop - row_tokens->start);
+
+   assert((unsigned int)(tok - row_tokens->start) <=
+          get_token_alloc(num_mb_rows_in_sb, tile_mb_cols, sb_size_log2,
+                          num_planes));
+
+   // Silence unused-variable warnings when assert() compiles to nothing.
+   (void)tile_mb_cols;
+   (void)num_mb_rows_in_sb;
+ }
+
+ // Encodes a single tile: resets the per-tile contexts and counters on the
+ // given thread data, then encodes the tile one superblock row at a time.
+ void av1_encode_tile(AV1_COMP *cpi, ThreadData *td, int tile_row,
+                      int tile_col) {
+   AV1_COMMON *const cm = &cpi->common;
+   TileDataEnc *const this_tile =
+       &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
+   const TileInfo *const tile_info = &this_tile->tile_info;
+   MACROBLOCK *const x = &td->mb;
+   MACROBLOCKD *const xd = &x->e_mbd;
+
+ #if CONFIG_COLLECT_INTER_MODE_RD_STATS
+   av1_inter_mode_data_init(this_tile);
+ #endif
+
+   av1_zero_above_context(cm, xd, tile_info->mi_col_start,
+                          tile_info->mi_col_end, tile_row);
+   av1_init_above_context(cm, xd, tile_row);
+
+   // Zero the tile's motion search counters and point the thread's counter
+   // pointers at them.
+   this_tile->m_search_count = 0;   // Count of motion search hits.
+   this_tile->ex_search_count = 0;  // Exhaustive mesh search hits.
+   x->m_search_count_ptr = &this_tile->m_search_count;
+   x->ex_search_count_ptr = &this_tile->ex_search_count;
+
+   // The tile starts from a private copy of the frame context.
+   this_tile->tctx = *cm->fc;
+   xd->tile_ctx = &this_tile->tctx;
+
+   cfl_init(&xd->cfl, &cm->seq_params);
+
+   av1_crc32c_calculator_init(&x->mb_rd_record.crc_calculator);
+
+   td->intrabc_used_this_tile = 0;
+
+   for (int mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
+        mi_row += cm->seq_params.mib_size) {
+     av1_encode_sb_row(cpi, td, tile_row, tile_col, mi_row);
+   }
+ }
+
+ // Single-threaded tile encode: makes sure cpi->tile_data can hold every tile,
+ // re-initialises the per-tile state, and encodes the tiles in raster order
+ // using the encoder's own thread data.
+ static void encode_tiles(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int tile_cols = cm->tile_cols;
+   const int tile_rows = cm->tile_rows;
+   const int tiles_needed = tile_cols * tile_rows;
+
+   // Allocate when nothing is allocated yet or the buffer is too small.
+   if (cpi->tile_data == NULL || cpi->allocated_tiles < tiles_needed) {
+     av1_alloc_tile_data(cpi);
+   }
+
+   av1_init_tile_data(cpi);
+
+   for (int row = 0; row < tile_rows; ++row) {
+     for (int col = 0; col < tile_cols; ++col) {
+       av1_encode_tile(cpi, &cpi->td, row, col);
+       // Remember whether any tile used intra block copy.
+       cpi->intrabc_used |= cpi->td.intrabc_used_this_tile;
+     }
+   }
+ }
+
+ #if CONFIG_FP_MB_STATS
+ // Locates the first-pass macroblock stats for the current frame inside the
+ // first-pass stats buffer. Stores the address in *this_frame_mb_stats and
+ // returns 1, or returns EOF (leaving the output untouched) when the computed
+ // address lies beyond mb_stats_end.
+ static int input_fpmb_stats(FIRSTPASS_MB_STATS *firstpass_mb_stats,
+                             AV1_COMMON *cm, uint8_t **this_frame_mb_stats) {
+   // One byte per macroblock, cm->MBs macroblocks per frame.
+   uint8_t *const frame_stats =
+       firstpass_mb_stats->mb_stats_start +
+       cm->current_video_frame * cm->MBs * sizeof(uint8_t);
+
+   if (frame_stats > firstpass_mb_stats->mb_stats_end) {
+     return EOF;
+   }
+
+   *this_frame_mb_stats = frame_stats;
+   return 1;
+ }
+ #endif
+
+ #define GLOBAL_TRANS_TYPES_ENC 3 // highest motion model to search
+ // Returns the cost of coding the global motion parameters `gm` relative to
+ // the reference parameters `ref_gm`. The per-parameter counts returned by
+ // aom_count_signed_primitive_refsubexpfin() are summed and the total is
+ // shifted left by AV1_PROB_COST_SHIFT.
+ //
+ // The switch relies on fallthrough: AFFINE/ROTZOOM first count wmmat[2..3]
+ // (and wmmat[4..5] for AFFINE), then fall into TRANSLATION, which counts
+ // wmmat[0..1]. IDENTITY contributes nothing.
+ //
+ // `allow_hp` only affects the pure TRANSLATION model: when it is zero the
+ // translation uses one bit less and one more bit of precision difference.
+ static int gm_get_params_cost(const WarpedMotionParams *gm,
+ const WarpedMotionParams *ref_gm, int allow_hp) {
+ int params_cost = 0;
+ int trans_bits, trans_prec_diff;
+ switch (gm->wmtype) {
+ case AFFINE:
+ case ROTZOOM:
+ // wmmat[2] is counted with (1 << GM_ALPHA_PREC_BITS) subtracted.
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[2] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[3] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[3] >> GM_ALPHA_PREC_DIFF));
+ // Only models at or above AFFINE also count wmmat[4] and wmmat[5].
+ if (gm->wmtype >= AFFINE) {
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[4] >> GM_ALPHA_PREC_DIFF),
+ (gm->wmmat[4] >> GM_ALPHA_PREC_DIFF));
+ // wmmat[5] is counted with (1 << GM_ALPHA_PREC_BITS) subtracted.
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ GM_ALPHA_MAX + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) -
+ (1 << GM_ALPHA_PREC_BITS),
+ (gm->wmmat[5] >> GM_ALPHA_PREC_DIFF) - (1 << GM_ALPHA_PREC_BITS));
+ }
+ AOM_FALLTHROUGH_INTENDED;
+ case TRANSLATION:
+ // Bit width and precision shift depend on whether the model is a pure
+ // translation or a higher-order model that reached here by fallthrough.
+ trans_bits = (gm->wmtype == TRANSLATION)
+ ? GM_ABS_TRANS_ONLY_BITS - !allow_hp
+ : GM_ABS_TRANS_BITS;
+ trans_prec_diff = (gm->wmtype == TRANSLATION)
+ ? GM_TRANS_ONLY_PREC_DIFF + !allow_hp
+ : GM_TRANS_PREC_DIFF;
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[0] >> trans_prec_diff),
+ (gm->wmmat[0] >> trans_prec_diff));
+ params_cost += aom_count_signed_primitive_refsubexpfin(
+ (1 << trans_bits) + 1, SUBEXPFIN_K,
+ (ref_gm->wmmat[1] >> trans_prec_diff),
+ (gm->wmmat[1] >> trans_prec_diff));
+ AOM_FALLTHROUGH_INTENDED;
+ case IDENTITY: break;
+ default: assert(0);
+ }
+ return (params_cost << AV1_PROB_COST_SHIFT);
+ }
+
+ // Returns nonzero when a global motion search should be run for reference
+ // `frame` under the gm_search_type speed feature: always for
+ // GM_FULL_SEARCH, never for GM_DISABLE_SEARCH, and for every reference except
+ // LAST2_FRAME and LAST3_FRAME under GM_REDUCED_REF_SEARCH.
+ // `num_refs_using_gm` is currently unused.
+ static int do_gm_search_logic(SPEED_FEATURES *const sf, int num_refs_using_gm,
+                               int frame) {
+   (void)num_refs_using_gm;
+
+   if (sf->gm_search_type == GM_FULL_SEARCH) return 1;
+   if (sf->gm_search_type == GM_REDUCED_REF_SEARCH) {
+     return frame != LAST2_FRAME && frame != LAST3_FRAME;
+   }
+   if (sf->gm_search_type == GM_DISABLE_SEARCH) return 0;
+
+   // Unknown search type: trips the assert in debug builds, otherwise search.
+   assert(0);
+   return 1;
+ }
+
+ // Estimate if the source frame is screen content, based on the portion of
+ // blocks that have no more than 4 (experimentally selected) luma colors.
+ //
+ // The frame is scanned in whole 16x16 blocks (partial blocks at the right and
+ // bottom edges are ignored). A block is counted when it has more than one and
+ // at most `limit` colors. Returns nonzero when the counted blocks cover more
+ // than 10% of the width * height area.
+ //
+ // `use_hbd` selects av1_count_colors_highbd() (with bit depth `bd`) over
+ // av1_count_colors(); `stride` is the distance between rows of `src`.
+ static int is_screen_content(const uint8_t *src, int use_hbd, int bd,
+                              int stride, int width, int height) {
+   assert(src != NULL);
+   int counts = 0;
+   const int blk_w = 16;
+   const int blk_h = 16;
+   const int limit = 4;
+   for (int r = 0; r + blk_h <= height; r += blk_h) {
+     for (int c = 0; c + blk_w <= width; c += blk_w) {
+       int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+       const int n_colors =
+           use_hbd ? av1_count_colors_highbd(src + r * stride + c, stride, blk_w,
+                                             blk_h, bd, count_buf)
+                   : av1_count_colors(src + r * stride + c, stride, blk_w, blk_h,
+                                      count_buf);
+       if (n_colors > 1 && n_colors <= limit) counts++;
+     }
+   }
+   // The threshold is 10%. Compare in 64 bits: for very large frames both
+   // counts * 16 * 16 * 10 and width * height can exceed INT_MAX, and signed
+   // overflow is undefined behavior.
+   return (int64_t)counts * blk_h * blk_w * 10 > (int64_t)width * height;
+ }
+
+ // Reference-frame enable flag for each reference frame index. Entry 0 is 0
+ // (no flag); entries 1..7 hold the AOM_*_FLAG bits that this file tests
+ // against cpi->ref_frame_flags when indexing by LAST_FRAME..ALTREF_FRAME.
+ static const uint8_t ref_frame_flag_list[REF_FRAMES] = { 0,
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };
+
+ // Enforce the number of references for each arbitrary frame limited to
+ // (INTER_REFS_PER_FRAME - 1)
+ //
+ // Does nothing unless every one of the INTER_REFS_PER_FRAME references is
+ // currently enabled in cpi->ref_frame_flags. In that case exactly one of
+ // LAST/LAST2/LAST3/BWDREF/ALTREF2 has its flag cleared; GOLDEN and ALTREF are
+ // never candidates.
+ static void enforce_max_ref_frames(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MV_REFERENCE_FRAME ref_frame;
+ int total_valid_refs = 0;
+ // Count the references that are currently enabled.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])
+ total_valid_refs++;
+ }
+
+ // NOTE(zoeliu): When all the possible reference frames are available, we
+ // reduce the number of reference frames by 1, following the rules of:
+ // (1) Retain GOLDEN_FRAME/ALTREF_FRAME;
+ // (2) Check the earliest 2 remaining reference frames, and remove the one
+ // with the lower quality factor, otherwise if both have been coded at
+ // the same quality level, remove the earliest reference frame.
+
+ if (total_valid_refs == INTER_REFS_PER_FRAME) {
+ // UINT_MAX doubles as the "not yet set" marker for both offsets.
+ unsigned int min_ref_offset = UINT_MAX;
+ unsigned int second_min_ref_offset = UINT_MAX;
+ MV_REFERENCE_FRAME earliest_ref_frames[2] = { LAST3_FRAME, LAST2_FRAME };
+ int earliest_buf_idxes[2] = { 0 };
+
+ // Locate the earliest two reference frames except GOLDEN/ALTREF.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Retain GOLDEN/ALTREF
+ if (ref_frame == GOLDEN_FRAME || ref_frame == ALTREF_FRAME) continue;
+
+ const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+ if (buf_idx >= 0) {
+ const unsigned int ref_offset =
+ cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+
+ if (min_ref_offset == UINT_MAX) {
+ // First candidate seen.
+ min_ref_offset = ref_offset;
+ earliest_ref_frames[0] = ref_frame;
+ earliest_buf_idxes[0] = buf_idx;
+ } else {
+ if (get_relative_dist(cm, ref_offset, min_ref_offset) < 0) {
+ // New earliest: the previous earliest becomes the second earliest.
+ second_min_ref_offset = min_ref_offset;
+ earliest_ref_frames[1] = earliest_ref_frames[0];
+ earliest_buf_idxes[1] = earliest_buf_idxes[0];
+
+ min_ref_offset = ref_offset;
+ earliest_ref_frames[0] = ref_frame;
+ earliest_buf_idxes[0] = buf_idx;
+ } else if (second_min_ref_offset == UINT_MAX ||
+ get_relative_dist(cm, ref_offset, second_min_ref_offset) <
+ 0) {
+ // New second earliest.
+ second_min_ref_offset = ref_offset;
+ earliest_ref_frames[1] = ref_frame;
+ earliest_buf_idxes[1] = buf_idx;
+ }
+ }
+ }
+ }
+ // Check the coding quality factors of the two earliest reference frames.
+ RATE_FACTOR_LEVEL ref_rf_level[2];
+ double ref_rf_deltas[2];
+ for (int i = 0; i < 2; ++i) {
+ ref_rf_level[i] = cpi->frame_rf_level[earliest_buf_idxes[i]];
+ ref_rf_deltas[i] = rate_factor_deltas[ref_rf_level[i]];
+ }
+ // Both arrays are unused when USE_RF_LEVEL_TO_ENFORCE is 0.
+ (void)ref_rf_level;
+ (void)ref_rf_deltas;
+
+ #define USE_RF_LEVEL_TO_ENFORCE 1
+ #if USE_RF_LEVEL_TO_ENFORCE
+ // If both earliest two reference frames are coded using the same rate-
+ // factor, disable the earliest reference frame; Otherwise disable the
+ // reference frame that uses a lower rate-factor delta.
+ const MV_REFERENCE_FRAME ref_frame_to_disable =
+ (ref_rf_deltas[0] <= ref_rf_deltas[1]) ? earliest_ref_frames[0]
+ : earliest_ref_frames[1];
+ #else
+ // Always disable the earliest reference frame
+ const MV_REFERENCE_FRAME ref_frame_to_disable = earliest_ref_frames[0];
+ #endif // USE_RF_LEVEL_TO_ENFORCE
+ #undef USE_RF_LEVEL_TO_ENFORCE
+
+ // Clear the enable flag of the chosen reference.
+ switch (ref_frame_to_disable) {
+ case LAST_FRAME: cpi->ref_frame_flags &= ~AOM_LAST_FLAG; break;
+ case LAST2_FRAME: cpi->ref_frame_flags &= ~AOM_LAST2_FLAG; break;
+ case LAST3_FRAME: cpi->ref_frame_flags &= ~AOM_LAST3_FLAG; break;
+ case BWDREF_FRAME: cpi->ref_frame_flags &= ~AOM_BWD_FLAG; break;
+ case ALTREF2_FRAME: cpi->ref_frame_flags &= ~AOM_ALT2_FLAG; break;
+ default: break;
+ }
+ }
+ }
+
+ // Returns 1 when no valid reference of the current (non-intra) frame has a
+ // positive relative distance to the current frame offset, i.e. there is no
+ // backward reference; returns 0 as soon as one such reference is found.
+ static INLINE int av1_refs_are_one_sided(const AV1_COMMON *cm) {
+   assert(!frame_is_intra_only(cm));
+
+   for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) {
+     const int buf_idx = cm->frame_refs[i].idx;
+     if (buf_idx != INVALID_IDX) {
+       const int ref_offset =
+           cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+       // A positive distance marks a bwd reference.
+       if (get_relative_dist(cm, ref_offset, (int)cm->frame_offset) > 0) {
+         return 0;
+       }
+     }
+   }
+   return 1;
+ }
+
+ // Writes the frame offsets of the two skip-mode reference frames into
+ // ref_offset[0] and ref_offset[1]. Both entries are left at 0 when skip mode
+ // is not allowed for the current frame.
+ static INLINE void get_skip_mode_ref_offsets(const AV1_COMMON *cm,
+                                              int ref_offset[2]) {
+   ref_offset[0] = 0;
+   ref_offset[1] = 0;
+
+   if (cm->is_skip_mode_allowed) {
+     const int first_buf = cm->frame_refs[cm->ref_frame_idx_0].idx;
+     const int second_buf = cm->frame_refs[cm->ref_frame_idx_1].idx;
+     assert(first_buf != INVALID_IDX && second_buf != INVALID_IDX);
+
+     ref_offset[0] = cm->buffer_pool->frame_bufs[first_buf].cur_frame_offset;
+     ref_offset[1] = cm->buffer_pool->frame_bufs[second_buf].cur_frame_offset;
+   }
+ }
+
+ // Decides whether skip mode is enabled for the current frame. Calls
+ // av1_setup_skip_mode_allowed() first, then returns 0 when any of these hold:
+ //   - skip mode is not allowed for the frame;
+ //   - the distances from the current frame to the two skip-mode references
+ //     differ by more than one frame;
+ //   - all references are one-sided and lag_in_frames > 0;
+ //   - either skip-mode reference is disabled in cpi->ref_frame_flags.
+ // Returns 1 otherwise.
+ static int check_skip_mode_enabled(AV1_COMP *const cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+
+   av1_setup_skip_mode_allowed(cm);
+   if (!cm->is_skip_mode_allowed) return 0;
+
+   // Turn off skip mode if the temporal distances of the reference pair to the
+   // current frame are different by more than 1 frame.
+   const int cur_offset = (int)cm->frame_offset;
+   int ref_offset[2];
+   get_skip_mode_ref_offsets(cm, ref_offset);
+   const int cur_to_ref0 = get_relative_dist(cm, cur_offset, ref_offset[0]);
+   const int cur_to_ref1 = abs(get_relative_dist(cm, cur_offset, ref_offset[1]));
+   if (abs(cur_to_ref0 - cur_to_ref1) > 1) return 0;
+
+   // High Latency: Turn off skip mode if all refs are fwd.
+   if (cpi->all_one_sided_refs && cpi->oxcf.lag_in_frames > 0) return 0;
+
+   // Both skip-mode references must still be enabled. Use the file-level
+   // ref_frame_flag_list table rather than a private copy of the same values.
+   const int ref_frame[2] = { cm->ref_frame_idx_0 + LAST_FRAME,
+                              cm->ref_frame_idx_1 + LAST_FRAME };
+   if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[0]]) ||
+       !(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]]))
+     return 0;
+
+   return 1;
+ }
+
+ // Decides whether the global motion parameter computation can be skipped for
+ // a particular reference frame. Only LAST2_FRAME and LAST3_FRAME can be
+ // skipped, and only when GOLDEN_FRAME already has a non-IDENTITY model and
+ // the reference's offset is not after GOLDEN_FRAME's offset.
+ static INLINE int skip_gm_frame(AV1_COMMON *const cm, int ref_frame) {
+   const int is_last2_or_last3 =
+       ref_frame == LAST3_FRAME || ref_frame == LAST2_FRAME;
+   if (!is_last2_or_last3) return 0;
+   if (cm->global_motion[GOLDEN_FRAME].wmtype == IDENTITY) return 0;
+
+   const int dist_to_golden = get_relative_dist(
+       cm, cm->cur_frame->ref_frame_offset[ref_frame - LAST_FRAME],
+       cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]);
+   return dist_to_golden <= 0;
+ }
+
+ // Selects the default interpolation-filter skip flags: the luma-only default
+ // for single-plane content, the general default otherwise.
+ static void set_default_interp_skip_flags(AV1_COMP *cpi) {
+   if (av1_num_planes(&cpi->common) == 1) {
+     cpi->default_interp_skip_flags = DEFAULT_LUMA_INTERP_SKIP_FLAG;
+   } else {
+     cpi->default_interp_skip_flags = DEFAULT_INTERP_SKIP_FLAG;
+   }
+ }
+
+ // Runs one encode pass over the current frame. In order:
+ //   1. frame-level setup: partition size clamps, mode info pointers,
+ //      screen-content / intrabc flags, optional hash table for hash-based
+ //      motion estimation, per-segment lossless and qindex state, tx mode,
+ //      delta-q / delta-lf flags, quantizer and RD constants, segmentation
+ //      maps and loop filter deltas;
+ //   2. optional global motion search for each reference frame, with the
+ //      resulting parameter costs stored in cpi->gmparams_cost;
+ //   3. motion field setup and the skip mode decision;
+ //   4. tile encoding (multi-threaded or single-threaded), with the elapsed
+ //      time added to cpi->time_encode_sb_row;
+ //   5. clearing allow_intrabc when intrabc was allowed but never used.
+ static void encode_frame_internal(AV1_COMP *cpi) {
+ ThreadData *const td = &cpi->td;
+ MACROBLOCK *const x = &td->mb;
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+ int i;
+
+ // Neither partition size limit may exceed the superblock size.
+ x->min_partition_size = AOMMIN(x->min_partition_size, cm->seq_params.sb_size);
+ x->max_partition_size = AOMMIN(x->max_partition_size, cm->seq_params.sb_size);
+ #if CONFIG_DIST_8X8
+ x->using_dist_8x8 = cpi->oxcf.using_dist_8x8;
+ x->tune_metric = cpi->oxcf.tuning;
+ #endif
+ cm->setup_mi(cm);
+
+ xd->mi = cm->mi_grid_visible;
+ xd->mi[0] = cm->mi;
+
+ av1_zero(*td->counts);
+ av1_zero(rdc->comp_pred_diff);
+
+ // On intra-only frames decide whether screen content tools are allowed:
+ // a force value of 2 means "decide from the content setting or by analysing
+ // the source", any other value is used directly.
+ if (frame_is_intra_only(cm)) {
+ if (cm->seq_params.force_screen_content_tools == 2) {
+ cm->allow_screen_content_tools =
+ cpi->oxcf.content == AOM_CONTENT_SCREEN ||
+ is_screen_content(cpi->source->y_buffer,
+ cpi->source->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ cpi->source->y_stride, cpi->source->y_width,
+ cpi->source->y_height);
+ } else {
+ cm->allow_screen_content_tools =
+ cm->seq_params.force_screen_content_tools;
+ }
+ }
+
+ // Allow intrabc when screen content tools are enabled.
+ cm->allow_intrabc = cm->allow_screen_content_tools;
+ // Reset the flag.
+ cpi->intrabc_used = 0;
+ // Need to disable intrabc when superres is selected
+ if (av1_superres_scaled(cm)) {
+ cm->allow_intrabc = 0;
+ }
+
+ if (cpi->oxcf.pass != 1 && av1_use_hash_me(cm)) {
+ // add to hash table
+ const int pic_width = cpi->source->y_crop_width;
+ const int pic_height = cpi->source->y_crop_height;
+ // Two sets of scratch buffers ([0] and [1]). Each
+ // av1_generate_block_hash_value() call below reads one set and writes the
+ // other, so the sets alternate roles from one block size to the next.
+ uint32_t *block_hash_values[2][2];
+ int8_t *is_block_same[2][3];
+ int k, j;
+
+ for (k = 0; k < 2; k++) {
+ for (j = 0; j < 2; j++) {
+ CHECK_MEM_ERROR(cm, block_hash_values[k][j],
+ aom_malloc(sizeof(uint32_t) * pic_width * pic_height));
+ }
+
+ for (j = 0; j < 3; j++) {
+ CHECK_MEM_ERROR(cm, is_block_same[k][j],
+ aom_malloc(sizeof(int8_t) * pic_width * pic_height));
+ }
+ }
+
+ // Build hashes for 2x2 blocks, then for sizes 4, 8, 16, 32, 64 and 128,
+ // adding each size from 4 upwards to the current frame's hash table.
+ av1_hash_table_create(&cm->cur_frame->hash_table);
+ av1_generate_block_2x2_hash_value(cpi->source, block_hash_values[0],
+ is_block_same[0], &cpi->td.mb);
+ av1_generate_block_hash_value(cpi->source, 4, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 4);
+ av1_generate_block_hash_value(cpi->source, 8, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 8);
+ av1_generate_block_hash_value(cpi->source, 16, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 16);
+ av1_generate_block_hash_value(cpi->source, 32, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 32);
+ av1_generate_block_hash_value(cpi->source, 64, block_hash_values[0],
+ block_hash_values[1], is_block_same[0],
+ is_block_same[1], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[1], is_block_same[1][2],
+ pic_width, pic_height, 64);
+
+ av1_generate_block_hash_value(cpi->source, 128, block_hash_values[1],
+ block_hash_values[0], is_block_same[1],
+ is_block_same[0], &cpi->td.mb);
+ av1_add_to_hash_map_by_row_with_precal_data(
+ &cm->cur_frame->hash_table, block_hash_values[0], is_block_same[0][2],
+ pic_width, pic_height, 128);
+
+ // Release the scratch buffers.
+ for (k = 0; k < 2; k++) {
+ for (j = 0; j < 2; j++) {
+ aom_free(block_hash_values[k][j]);
+ }
+
+ for (j = 0; j < 3; j++) {
+ aom_free(is_block_same[k][j]);
+ }
+ }
+ }
+
+ // Per-segment state: a segment is lossless when its qindex and all of the
+ // frame's dc/ac delta-q values are zero. Lossless segments get a zero entry
+ // in optimize_seg_arr.
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ const int qindex = cm->seg.enabled
+ ? av1_get_qindex(&cm->seg, i, cm->base_qindex)
+ : cm->base_qindex;
+ xd->lossless[i] = qindex == 0 && cm->y_dc_delta_q == 0 &&
+ cm->u_dc_delta_q == 0 && cm->u_ac_delta_q == 0 &&
+ cm->v_dc_delta_q == 0 && cm->v_ac_delta_q == 0;
+ if (xd->lossless[i]) cpi->has_lossless_segment = 1;
+ xd->qindex[i] = qindex;
+ if (xd->lossless[i]) {
+ cpi->optimize_seg_arr[i] = 0;
+ } else {
+ cpi->optimize_seg_arr[i] = cpi->optimize_speed_feature;
+ }
+ }
+ cm->coded_lossless = is_coded_lossless(cm, xd);
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+
+ cm->tx_mode = select_tx_mode(cpi);
+
+ // Fix delta q resolution for the moment
+ cm->delta_q_res = DEFAULT_DELTA_Q_RES;
+ // Set delta_q_present_flag before it is used for the first time
+ cm->delta_lf_res = DEFAULT_DELTA_LF_RES;
+ cm->delta_q_present_flag = cpi->oxcf.deltaq_mode != NO_DELTA_Q;
+ cm->delta_lf_present_flag = cpi->oxcf.deltaq_mode == DELTA_Q_LF;
+ cm->delta_lf_multi = DEFAULT_DELTA_LF_MULTI;
+ // update delta_q_present_flag and delta_lf_present_flag based on base_qindex
+ cm->delta_q_present_flag &= cm->base_qindex > 0;
+ cm->delta_lf_present_flag &= cm->base_qindex > 0;
+
+ av1_frame_init_quantizer(cpi);
+
+ av1_initialize_rd_consts(cpi);
+ av1_initialize_me_consts(cpi, x, cm->base_qindex);
+ init_encode_frame_mb_context(cpi);
+ set_default_interp_skip_flags(cpi);
+ if (cm->prev_frame)
+ cm->last_frame_seg_map = cm->prev_frame->seg_map;
+ else
+ cm->last_frame_seg_map = NULL;
+ cm->current_frame_seg_map = cm->cur_frame->seg_map;
+ // Loop filter deltas: defaults when intrabc is allowed or the frame is coded
+ // lossless, otherwise inherited from the previous frame when there is one.
+ // The result is then stored on the current frame buffer.
+ if (cm->allow_intrabc || cm->coded_lossless) {
+ av1_set_default_ref_deltas(cm->lf.ref_deltas);
+ av1_set_default_mode_deltas(cm->lf.mode_deltas);
+ } else if (cm->prev_frame) {
+ memcpy(cm->lf.ref_deltas, cm->prev_frame->ref_deltas, REF_FRAMES);
+ memcpy(cm->lf.mode_deltas, cm->prev_frame->mode_deltas, MAX_MODE_LF_DELTAS);
+ }
+ memcpy(cm->cur_frame->ref_deltas, cm->lf.ref_deltas, REF_FRAMES);
+ memcpy(cm->cur_frame->mode_deltas, cm->lf.mode_deltas, MAX_MODE_LF_DELTAS);
+
+ // Special case: set prev_mi to NULL when the previous mode info
+ // context cannot be used.
+ cm->prev_mi = cm->allow_ref_frame_mvs ? cm->prev_mip : NULL;
+
+ x->txb_split_count = 0;
+
+ av1_zero(rdc->global_motion_used);
+ av1_zero(cpi->gmparams_cost);
+ #if !CONFIG_GLOBAL_MOTION_SEARCH
+ cpi->global_motion_search_done = 1;
+ #endif // !CONFIG_GLOBAL_MOTION_SEARCH
+ // Global motion search: only on INTER_FRAMEs with a source frame, and only
+ // while global_motion_search_done is still clear.
+ if (cpi->common.frame_type == INTER_FRAME && cpi->source &&
+ !cpi->global_motion_search_done) {
+ YV12_BUFFER_CONFIG *ref_buf[REF_FRAMES];
+ int frame;
+ double params_by_motion[RANSAC_NUM_MOTIONS * (MAX_PARAMDIM - 1)];
+ const double *params_this_motion;
+ int inliers_by_motion[RANSAC_NUM_MOTIONS];
+ WarpedMotionParams tmp_wm_params;
+ static const double kIdentityParams[MAX_PARAMDIM - 1] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0
+ };
+ int num_refs_using_gm = 0;
+
+ // References are visited from ALTREF_FRAME down to LAST_FRAME.
+ for (frame = ALTREF_FRAME; frame >= LAST_FRAME; --frame) {
+ ref_buf[frame] = get_ref_frame_buffer(cpi, frame);
+ int pframe;
+ cm->global_motion[frame] = default_warp_params;
+ const WarpedMotionParams *ref_params =
+ cm->prev_frame ? &cm->prev_frame->global_motion[frame]
+ : &default_warp_params;
+ // check for duplicate buffer
+ for (pframe = ALTREF_FRAME; pframe > frame; --pframe) {
+ if (ref_buf[frame] == ref_buf[pframe]) break;
+ }
+ if (pframe > frame) {
+ // Same buffer as an already processed reference: reuse its parameters.
+ memcpy(&cm->global_motion[frame], &cm->global_motion[pframe],
+ sizeof(WarpedMotionParams));
+ } else if (ref_buf[frame] &&
+ ref_buf[frame]->y_crop_width == cpi->source->y_crop_width &&
+ ref_buf[frame]->y_crop_height == cpi->source->y_crop_height &&
+ do_gm_search_logic(&cpi->sf, num_refs_using_gm, frame) &&
+ !(cpi->sf.selective_ref_gm && skip_gm_frame(cm, frame))) {
+ TransformationType model;
+ const int64_t ref_frame_error =
+ av1_frame_error(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, cpi->source->y_width,
+ cpi->source->y_height, cpi->source->y_stride);
+
+ // A zero error leaves the default parameters in place. Note that this
+ // `continue` also skips the gmparams_cost update at the bottom of the
+ // loop, so the entry keeps the zero written by av1_zero() above.
+ if (ref_frame_error == 0) continue;
+
+ aom_clear_system_state();
+ for (model = ROTZOOM; model < GLOBAL_TRANS_TYPES_ENC; ++model) {
+ int64_t best_warp_error = INT64_MAX;
+ // Initially set all params to identity.
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ memcpy(params_by_motion + (MAX_PARAMDIM - 1) * i, kIdentityParams,
+ (MAX_PARAMDIM - 1) * sizeof(*params_by_motion));
+ }
+
+ compute_global_motion_feature_based(
+ model, cpi->source, ref_buf[frame],
+ cpi->common.seq_params.bit_depth, inliers_by_motion,
+ params_by_motion, RANSAC_NUM_MOTIONS);
+
+ // Refine each candidate motion that has inliers and keep the one with
+ // the smallest warp error.
+ for (i = 0; i < RANSAC_NUM_MOTIONS; ++i) {
+ if (inliers_by_motion[i] == 0) continue;
+
+ params_this_motion = params_by_motion + (MAX_PARAMDIM - 1) * i;
+ convert_model_to_params(params_this_motion, &tmp_wm_params);
+
+ if (tmp_wm_params.wmtype != IDENTITY) {
+ const int64_t warp_error = refine_integerized_param(
+ &tmp_wm_params, tmp_wm_params.wmtype,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, xd->bd,
+ ref_buf[frame]->y_buffer, ref_buf[frame]->y_width,
+ ref_buf[frame]->y_height, ref_buf[frame]->y_stride,
+ cpi->source->y_buffer, cpi->source->y_width,
+ cpi->source->y_height, cpi->source->y_stride, 5,
+ best_warp_error);
+ if (warp_error < best_warp_error) {
+ best_warp_error = warp_error;
+ // Save the wm_params modified by refine_integerized_param()
+ // rather than motion index to avoid rerunning refine() below.
+ memcpy(&(cm->global_motion[frame]), &tmp_wm_params,
+ sizeof(WarpedMotionParams));
+ }
+ }
+ }
+ // Fall back to the default parameters when get_shear_params() fails.
+ if (cm->global_motion[frame].wmtype <= AFFINE)
+ if (!get_shear_params(&cm->global_motion[frame]))
+ cm->global_motion[frame] = default_warp_params;
+
+ if (cm->global_motion[frame].wmtype == TRANSLATION) {
+ cm->global_motion[frame].wmmat[0] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[0]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ cm->global_motion[frame].wmmat[1] =
+ convert_to_trans_prec(cm->allow_high_precision_mv,
+ cm->global_motion[frame].wmmat[1]) *
+ GM_TRANS_ONLY_DECODE_FACTOR;
+ }
+
+ // If the best error advantage found doesn't meet the threshold for
+ // this motion type, revert to IDENTITY.
+ if (!is_enough_erroradvantage(
+ (double)best_warp_error / ref_frame_error,
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->allow_high_precision_mv),
+ cpi->sf.gm_erroradv_type)) {
+ cm->global_motion[frame] = default_warp_params;
+ }
+ // Stop at the first model type that yields a usable result.
+ if (cm->global_motion[frame].wmtype != IDENTITY) break;
+ }
+ aom_clear_system_state();
+ }
+ if (cm->global_motion[frame].wmtype != IDENTITY) num_refs_using_gm++;
+ cpi->gmparams_cost[frame] =
+ gm_get_params_cost(&cm->global_motion[frame], ref_params,
+ cm->allow_high_precision_mv) +
+ cpi->gmtype_cost[cm->global_motion[frame].wmtype] -
+ cpi->gmtype_cost[IDENTITY];
+ }
+ // clear disabled ref_frames
+ for (frame = LAST_FRAME; frame <= ALTREF_FRAME; ++frame) {
+ const int ref_disabled =
+ !(cpi->ref_frame_flags & ref_frame_flag_list[frame]);
+ if (ref_disabled && cpi->sf.recode_loop != DISALLOW_RECODE) {
+ cpi->gmparams_cost[frame] = 0;
+ cm->global_motion[frame] = default_warp_params;
+ }
+ }
+ cpi->global_motion_search_done = 1;
+ }
+ // Store the frame's global motion parameters on the current frame buffer.
+ memcpy(cm->cur_frame->global_motion, cm->global_motion,
+ REF_FRAMES * sizeof(WarpedMotionParams));
+
+ av1_setup_motion_field(cm);
+
+ cpi->all_one_sided_refs =
+ frame_is_intra_only(cm) ? 0 : av1_refs_are_one_sided(cm);
+
+ cm->skip_mode_flag = check_skip_mode_enabled(cpi);
+
+ {
+ // Time the tile encode and add it to cpi->time_encode_sb_row.
+ struct aom_usec_timer emr_timer;
+ aom_usec_timer_start(&emr_timer);
+
+ #if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ input_fpmb_stats(&cpi->twopass.firstpass_mb_stats, cm,
+ &cpi->twopass.this_frame_mb_stats);
+ }
+ #endif
+
+ // Use the multi-threaded path when row-based multi-threading is on with
+ // more than one thread, or when more than one thread can be used across
+ // the tiles; otherwise encode the tiles on this thread.
+ if (cpi->row_mt && (cpi->oxcf.max_threads > 1))
+ av1_encode_tiles_mt(cpi);
+ else if (AOMMIN(cpi->oxcf.max_threads, cm->tile_cols * cm->tile_rows) > 1)
+ av1_encode_tiles_mt(cpi);
+ else
+ encode_tiles(cpi);
+
+ aom_usec_timer_mark(&emr_timer);
+ cpi->time_encode_sb_row += aom_usec_timer_elapsed(&emr_timer);
+ }
+
+ // If intrabc is allowed but never selected, reset the allow_intrabc flag.
+ if (cm->allow_intrabc && !cpi->intrabc_used) cm->allow_intrabc = 0;
+ if (cm->allow_intrabc) cm->delta_lf_present_flag = 0;
+ }
+
+// Frame-level encode driver.
+//
+// Derives cm->frame_offset, clamps the segmentation map to the last active
+// segment id, sets up the reference buffers and sign bias, and then runs
+// encode_frame_internal(). When cpi->sf.frame_parameter_update is set, the
+// frame-level reference mode, interpolation filter, skip mode and tx_mode are
+// chosen around that call, and the per-frame-type prediction thresholds are
+// updated from the RD counters gathered during it.
+void av1_encode_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ // Indicates whether or not to use a default reduced set for ext-tx
+ // rather than the potential full set of 16 transforms
+ cm->reduced_tx_set_used = 0;
+
+ // A frame that is not shown takes its offset from the ARF/BRF source offsets
+ // of the current GF group entry, capped at MAX_GF_INTERVAL - 1; a shown
+ // frame simply uses the current video frame number.
+ if (cm->show_frame == 0) {
+ int arf_offset = AOMMIN(
+ (MAX_GF_INTERVAL - 1),
+ cpi->twopass.gf_group.arf_src_offset[cpi->twopass.gf_group.index]);
+ int brf_offset =
+ cpi->twopass.gf_group.brf_src_offset[cpi->twopass.gf_group.index];
+ arf_offset = AOMMIN((MAX_GF_INTERVAL - 1), arf_offset + brf_offset);
+ cm->frame_offset = cm->current_video_frame + arf_offset;
+ } else {
+ cm->frame_offset = cm->current_video_frame;
+ }
+ // Wrap the offset to (order_hint_bits_minus_1 + 1) bits.
+ cm->frame_offset %= (1 << (cm->seq_params.order_hint_bits_minus_1 + 1));
+
+ // Make sure segment_id is no larger than last_active_segid.
+ if (cm->seg.enabled && cm->seg.update_map) {
+ const int mi_rows = cm->mi_rows;
+ const int mi_cols = cm->mi_cols;
+ const int last_active_segid = cm->seg.last_active_segid;
+ uint8_t *map = cpi->segmentation_map;
+ for (int mi_row = 0; mi_row < mi_rows; ++mi_row) {
+ for (int mi_col = 0; mi_col < mi_cols; ++mi_col) {
+ map[mi_col] = AOMMIN(map[mi_col], last_active_segid);
+ }
+ // The map is walked with a row stride of mi_cols.
+ map += mi_cols;
+ }
+ }
+
+ av1_setup_frame_buf_refs(cm);
+ if (cpi->sf.selective_ref_frame >= 2) enforce_max_ref_frames(cpi);
+ av1_setup_frame_sign_bias(cm);
+
+#if CONFIG_MISMATCH_DEBUG
+ mismatch_reset_frame(num_planes);
+#else
+ // num_planes is only consumed by the mismatch-debug build.
+ (void)num_planes;
+#endif
+
+ cpi->allow_comp_inter_inter = !frame_is_intra_only(cm);
+
+ if (cpi->sf.frame_parameter_update) {
+ int i;
+ RD_OPT *const rd_opt = &cpi->rd;
+ RD_COUNTS *const rdc = &cpi->td.rd_counts;
+
+ // This code does a single RD pass over the whole frame assuming
+ // either compound, single or hybrid prediction as per whatever has
+ // worked best for that type of frame in the past.
+ // It also predicts whether another coding mode would have worked
+ // better than this coding mode. If that is the case, it remembers
+ // that for subsequent frames.
+ // It does the same analysis for transform size selection also.
+ //
+ // TODO(zoeliu): To investigate whether a frame_type other than
+ // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
+ const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
+ int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
+ const int is_alt_ref = frame_type == ALTREF_FRAME;
+
+ /* prediction (compound, single or hybrid) mode selection */
+ // NOTE: "is_alt_ref" is true only for OVERLAY/INTNL_OVERLAY frames
+ if (is_alt_ref || !cpi->allow_comp_inter_inter)
+ cm->reference_mode = SINGLE_REFERENCE;
+ else
+ cm->reference_mode = REFERENCE_MODE_SELECT;
+
+ // Large-scale-tile coding pins the interpolation filter instead of leaving
+ // it switchable.
+ cm->interp_filter = SWITCHABLE;
+ if (cm->large_scale_tile) cm->interp_filter = EIGHTTAP_REGULAR;
+
+ cm->switchable_motion_mode = 1;
+
+ // Cleared here and inspected again after encode_frame_internal() returns.
+ rdc->compound_ref_used_flag = 0;
+ rdc->skip_mode_used_flag = 0;
+
+ encode_frame_internal(cpi);
+
+ // Move each threshold halfway towards this frame's comp_pred_diff averaged
+ // over cm->MBs.
+ for (i = 0; i < REFERENCE_MODES; ++i)
+ mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ // Use a flag that includes 4x4 blocks
+ if (rdc->compound_ref_used_flag == 0) {
+ cm->reference_mode = SINGLE_REFERENCE;
+#if CONFIG_ENTROPY_STATS
+ av1_zero(cpi->td.counts->comp_inter);
+#endif // CONFIG_ENTROPY_STATS
+ }
+ }
+ // Re-check on the skip mode status as reference mode may have been changed.
+ if (frame_is_intra_only(cm) || cm->reference_mode == SINGLE_REFERENCE) {
+ cm->is_skip_mode_allowed = 0;
+ cm->skip_mode_flag = 0;
+ }
+ // Skip mode was enabled but the counter shows it was never used.
+ if (cm->skip_mode_flag && rdc->skip_mode_used_flag == 0)
+ cm->skip_mode_flag = 0;
+
+ // With no transform-block split recorded, TX_MODE_SELECT is reduced to
+ // TX_MODE_LARGEST (not done for large-scale-tile coding).
+ if (!cm->large_scale_tile) {
+ if (cm->tx_mode == TX_MODE_SELECT && cpi->td.mb.txb_split_count == 0)
+ cm->tx_mode = TX_MODE_LARGEST;
+ }
+ } else {
+ encode_frame_internal(cpi);
+ }
+}
+
+// Recursively walks the transform partition of the plane-0 transform block at
+// (blk_row, blk_col), starting from candidate size `tx_size` at recursion
+// level `depth`.
+//
+// For every visited node it records the split / no-split decision (entropy
+// counters when CONFIG_ENTROPY_STATS is on, the tile CDF when
+// `allow_update_cdf` is non-zero), stores the resulting size in mbmi->tx_size
+// and refreshes the above/left transform contexts. Each split also bumps
+// x->txb_split_count. Blocks outside the visible area are ignored.
+static void update_txfm_count(MACROBLOCK *x, MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts, TX_SIZE tx_size, int depth,
+                              int blk_row, int blk_col,
+                              uint8_t allow_update_cdf) {
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE sb_type = mi->sb_type;
+  const int rows_limit = max_block_high(xd, sb_type, 0);
+  const int cols_limit = max_block_wide(xd, sb_type, 0);
+  const int part_ctx = txfm_partition_context(xd->above_txfm_context + blk_col,
+                                              xd->left_txfm_context + blk_row,
+                                              mi->sb_type, tx_size);
+  const int size_idx = av1_get_txb_size_index(sb_type, blk_row, blk_col);
+  const TX_SIZE coded_tx_size = mi->inter_tx_size[size_idx];
+
+  if (blk_row >= rows_limit || blk_col >= cols_limit) return;
+  assert(tx_size > TX_4X4);
+
+  // At the maximum depth nothing is signalled, so nothing is counted.
+  if (depth == MAX_VARTX_DEPTH) {
+    mi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  // Leaf: the candidate size is the one stored for this position.
+  if (tx_size == coded_tx_size) {
+#if CONFIG_ENTROPY_STATS
+    ++counts->txfm_partition[part_ctx][0];
+#endif
+    if (allow_update_cdf) {
+      update_cdf(xd->tile_ctx->txfm_partition_cdf[part_ctx], 0, 2);
+    }
+    mi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  // Split: account for it, then descend into the children.
+  const TX_SIZE child_tx_size = sub_tx_size_map[tx_size];
+  const int child_w = tx_size_wide_unit[child_tx_size];
+  const int child_h = tx_size_high_unit[child_tx_size];
+
+#if CONFIG_ENTROPY_STATS
+  ++counts->txfm_partition[part_ctx][1];
+#endif
+  if (allow_update_cdf) {
+    update_cdf(xd->tile_ctx->txfm_partition_cdf[part_ctx], 1, 2);
+  }
+  ++x->txb_split_count;
+
+  // 4x4 children cannot split further; finish here.
+  if (child_tx_size == TX_4X4) {
+    mi->inter_tx_size[size_idx] = TX_4X4;
+    mi->tx_size = TX_4X4;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+    return;
+  }
+
+  const int parent_h = tx_size_high_unit[tx_size];
+  const int parent_w = tx_size_wide_unit[tx_size];
+  for (int r = 0; r < parent_h; r += child_h) {
+    for (int c = 0; c < parent_w; c += child_w) {
+      update_txfm_count(x, xd, counts, child_tx_size, depth + 1, blk_row + r,
+                        blk_col + c, allow_update_cdf);
+    }
+  }
+}
+
+// Positions the above/left transform contexts for the block at
+// (mi_row, mi_col) and runs update_txfm_count() on every largest-size
+// transform block inside `plane_bsize`, in raster order.
+static void tx_partition_count_update(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                      BLOCK_SIZE plane_bsize, int mi_row,
+                                      int mi_col, FRAME_COUNTS *td_counts,
+                                      uint8_t allow_update_cdf) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  const TX_SIZE root_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int step_rows = tx_size_high_unit[root_tx_size];
+  const int step_cols = tx_size_wide_unit[root_tx_size];
+
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  for (int row = 0; row < units_high; row += step_rows) {
+    for (int col = 0; col < units_wide; col += step_cols) {
+      update_txfm_count(x, xd, td_counts, root_tx_size, 0, row, col,
+                        allow_update_cdf);
+    }
+  }
+}
+
+// Recursively walks the transform partition of the plane-0 transform block at
+// (blk_row, blk_col) and refreshes mbmi->tx_size and the above/left transform
+// contexts, without touching any counters or CDFs. Positions outside the
+// visible area are ignored.
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size, int blk_row,
+                             int blk_col) {
+  MB_MODE_INFO *const mi = xd->mi[0];
+  const BLOCK_SIZE sb_type = mi->sb_type;
+  const int rows_limit = max_block_high(xd, sb_type, 0);
+  const int cols_limit = max_block_wide(xd, sb_type, 0);
+  const int size_idx = av1_get_txb_size_index(sb_type, blk_row, blk_col);
+  const TX_SIZE coded_tx_size = mi->inter_tx_size[size_idx];
+
+  if (blk_row >= rows_limit || blk_col >= cols_limit) return;
+
+  // Leaf: the candidate size is the one stored for this position.
+  if (tx_size == coded_tx_size) {
+    mi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, tx_size, tx_size);
+    return;
+  }
+
+  // An 8x8 node that is not a leaf is forced down to 4x4.
+  if (tx_size == TX_8X8) {
+    mi->inter_tx_size[size_idx] = TX_4X4;
+    mi->tx_size = TX_4X4;
+    txfm_partition_update(xd->above_txfm_context + blk_col,
+                          xd->left_txfm_context + blk_row, TX_4X4, tx_size);
+    return;
+  }
+
+  const TX_SIZE child_tx_size = sub_tx_size_map[tx_size];
+  const int child_w = tx_size_wide_unit[child_tx_size];
+  const int child_h = tx_size_high_unit[child_tx_size];
+  const int parent_h = tx_size_high_unit[tx_size];
+  const int parent_w = tx_size_wide_unit[tx_size];
+
+  for (int r = 0; r < parent_h; r += child_h) {
+    for (int c = 0; c < parent_w; c += child_w) {
+      const int sub_row = blk_row + r;
+      const int sub_col = blk_col + c;
+      if (sub_row < rows_limit && sub_col < cols_limit) {
+        set_txfm_context(xd, child_tx_size, sub_row, sub_col);
+      }
+    }
+  }
+}
+
+// Positions the above/left transform contexts for the block at
+// (mi_row, mi_col) and runs set_txfm_context() on every largest-size
+// transform block inside `plane_bsize`, in raster order.
+static void tx_partition_set_contexts(const AV1_COMMON *const cm,
+                                      MACROBLOCKD *xd, BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col) {
+  const int units_wide = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int units_high = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+  const TX_SIZE root_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+  const int step_rows = tx_size_high_unit[root_tx_size];
+  const int step_cols = tx_size_wide_unit[root_tx_size];
+
+  xd->above_txfm_context = cm->above_txfm_context[xd->tile.tile_row] + mi_col;
+  xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  for (int row = 0; row < units_high; row += step_rows) {
+    for (int col = 0; col < units_wide; col += step_cols) {
+      set_txfm_context(xd, root_tx_size, row, col);
+    }
+  }
+}
+
+// Encodes (or dry-runs, depending on `dry_run`) the coding block of size
+// `bsize` at (mi_row, mi_col) using the mode information in xd->mi[0].
+//
+//  - Intra blocks: every plane is encoded with av1_encode_intra_block_plane(),
+//    palette colour maps are tokenized (real run) or costed
+//    (DRY_RUN_COSTCOEFFS), and the txb contexts are updated.
+//  - Inter blocks: reference planes are set up, the inter (and OBMC)
+//    predictors are built, then the residual is encoded and tokenized.
+//
+// Afterwards the transform-size statistics / CDFs (real run only) and the
+// transform contexts are updated, and luma is stored for CfL when the block is
+// an inter block that is not a chroma reference.
+//
+// `t` is the token output cursor. `rate` accumulates coefficient / colour-map
+// cost during cost dry runs and may be NULL.
+static void encode_superblock(const AV1_COMP *const cpi, TileDataEnc *tile_data,
+                              ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+                              int mi_row, int mi_col, BLOCK_SIZE bsize,
+                              int *rate) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO **mi_4x4 = xd->mi;
+  MB_MODE_INFO *mbmi = mi_4x4[0];
+  const int seg_skip =
+      segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP);
+  const int mis = cm->mi_stride;
+  const int mi_width = mi_size_wide[bsize];
+  const int mi_height = mi_size_high[bsize];
+  const int is_inter = is_inter_block(mbmi);
+
+  if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+      x->cb_partition_scan) {
+    // Rows span the block height and columns span the block width.
+    for (int row = mi_row; row < mi_row + mi_height;
+         row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+      for (int col = mi_col; col < mi_col + mi_width;
+           col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+        const int index = av1_first_partition_pass_stats_index(row, col);
+        FIRST_PARTITION_PASS_STATS *const stats =
+            &x->first_partition_pass_stats[index];
+        // Increase the counter of data samples.
+        ++stats->sample_counts;
+        // Increase the counter for ref_frame[0] and ref_frame[1]; both
+        // counters saturate at 255.
+        if (stats->ref0_counts[mbmi->ref_frame[0]] < 255)
+          ++stats->ref0_counts[mbmi->ref_frame[0]];
+        // The saturation check must look at the same entry that is bumped.
+        if (mbmi->ref_frame[1] >= 0 &&
+            stats->ref1_counts[mbmi->ref_frame[1]] < 255)
+          ++stats->ref1_counts[mbmi->ref_frame[1]];
+      }
+    }
+  }
+
+  if (!is_inter) {
+    xd->cfl.is_chroma_reference =
+        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
+    xd->cfl.store_y = store_cfl_required(cm, xd);
+    mbmi->skip = 1;
+    for (int plane = 0; plane < num_planes; ++plane) {
+      av1_encode_intra_block_plane(cpi, x, bsize, plane,
+                                   cpi->optimize_seg_arr[mbmi->segment_id],
+                                   mi_row, mi_col);
+    }
+
+    // If there is at least one lossless segment, force the skip flag of the
+    // intra block to 0 so that the segment_id is not changed in
+    // write_segment_id().
+    if (!cpi->common.seg.segid_preskip && cpi->common.seg.update_map &&
+        cpi->has_lossless_segment)
+      mbmi->skip = 0;
+
+    xd->cfl.store_y = 0;
+    if (av1_allow_palette(cm->allow_screen_content_tools, bsize)) {
+      for (int plane = 0; plane < AOMMIN(2, num_planes); ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          if (!dry_run) {
+            av1_tokenize_color_map(x, plane, t, bsize, mbmi->tx_size,
+                                   PALETTE_MAP, tile_data->allow_update_cdf,
+                                   td->counts);
+          } else if (dry_run == DRY_RUN_COSTCOEFFS) {
+            // Add the colour-map cost to the caller's accumulator. The value
+            // must go through the pointer; advancing the pointer itself would
+            // drop the cost and hand a shifted pointer to the calls below.
+            const int map_cost =
+                av1_cost_color_map(x, plane, bsize, mbmi->tx_size, PALETTE_MAP);
+            if (rate) *rate += map_cost;
+          }
+        }
+      }
+    }
+
+    av1_update_txb_context(cpi, td, dry_run, bsize, rate, mi_row, mi_col,
+                           tile_data->allow_update_cdf);
+  } else {
+    int ref;
+    const int is_compound = has_second_ref(mbmi);
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+    for (ref = 0; ref < 1 + is_compound; ++ref) {
+      YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi, mbmi->ref_frame[ref]);
+      assert(IMPLIES(!is_intrabc_block(mbmi), cfg));
+      av1_setup_pre_planes(xd, ref, cfg, mi_row, mi_col,
+                           &xd->block_refs[ref]->sf, num_planes);
+    }
+
+    av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+    if (mbmi->motion_mode == OBMC_CAUSAL)
+      av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+#if CONFIG_MISMATCH_DEBUG
+    if (dry_run == OUTPUT_ENABLED) {
+      for (int plane = 0; plane < num_planes; ++plane) {
+        const struct macroblockd_plane *pd = &xd->plane[plane];
+        int pixel_c, pixel_r;
+        mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, 0, 0,
+                        pd->subsampling_x, pd->subsampling_y);
+        if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                                 pd->subsampling_y))
+          continue;
+        mismatch_record_block_pre(pd->dst.buf, pd->dst.stride, cm->frame_offset,
+                                  plane, pixel_c, pixel_r, pd->width,
+                                  pd->height,
+                                  xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+      }
+    }
+#else
+    (void)num_planes;
+#endif
+
+    av1_encode_sb(cpi, x, bsize, mi_row, mi_col, dry_run);
+    av1_tokenize_sb_vartx(cpi, td, t, dry_run, mi_row, mi_col, bsize, rate,
+                          tile_data->allow_update_cdf);
+  }
+
+  // Statistics and CDF adaptation only happen on the real (non-dry) run.
+  if (!dry_run) {
+    if (av1_allow_intrabc(cm) && is_intrabc_block(mbmi))
+      td->intrabc_used_this_tile = 1;
+    if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id] &&
+        mbmi->sb_type > BLOCK_4X4 && !(is_inter && (mbmi->skip || seg_skip))) {
+      if (is_inter) {
+        tx_partition_count_update(cm, x, bsize, mi_row, mi_col, td->counts,
+                                  tile_data->allow_update_cdf);
+      } else {
+        if (mbmi->tx_size != max_txsize_rect_lookup[bsize])
+          ++x->txb_split_count;
+        if (block_signals_txsize(bsize)) {
+          const int tx_size_ctx = get_tx_size_context(xd);
+          const int32_t tx_size_cat = bsize_to_tx_size_cat(bsize);
+          const int depth = tx_size_to_depth(mbmi->tx_size, bsize);
+          const int max_depths = bsize_to_max_depth(bsize);
+
+          if (tile_data->allow_update_cdf)
+            update_cdf(xd->tile_ctx->tx_size_cdf[tx_size_cat][tx_size_ctx],
+                       depth, max_depths + 1);
+#if CONFIG_ENTROPY_STATS
+          ++td->counts->intra_tx_size[tx_size_cat][tx_size_ctx][depth];
+#endif
+        }
+      }
+      assert(IMPLIES(is_rect_tx(mbmi->tx_size), is_rect_tx_allowed(xd, mbmi)));
+    } else {
+      int i, j;
+      TX_SIZE intra_tx_size;
+      // The new intra coding scheme requires no change of transform size
+      if (is_inter) {
+        if (xd->lossless[mbmi->segment_id]) {
+          intra_tx_size = TX_4X4;
+        } else {
+          intra_tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+        }
+      } else {
+        intra_tx_size = mbmi->tx_size;
+      }
+
+      // Propagate the size to every mode-info unit of the block that lies
+      // inside the frame.
+      for (j = 0; j < mi_height; j++)
+        for (i = 0; i < mi_width; i++)
+          if (mi_col + i < cm->mi_cols && mi_row + j < cm->mi_rows)
+            mi_4x4[mis * j + i]->tx_size = intra_tx_size;
+
+      if (intra_tx_size != max_txsize_rect_lookup[bsize]) ++x->txb_split_count;
+    }
+  }
+
+  if (cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type) &&
+      is_inter && !(mbmi->skip || seg_skip) &&
+      !xd->lossless[mbmi->segment_id]) {
+    // On a real run the contexts were already refreshed by
+    // tx_partition_count_update() above.
+    if (dry_run) tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+  } else {
+    TX_SIZE tx_size = mbmi->tx_size;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter) {
+      if (xd->lossless[mbmi->segment_id]) {
+        tx_size = TX_4X4;
+      } else {
+        tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+      }
+    } else {
+      tx_size = (bsize > BLOCK_4X4) ? tx_size : TX_4X4;
+    }
+    mbmi->tx_size = tx_size;
+    set_txfm_ctxs(tx_size, xd->n4_w, xd->n4_h,
+                  (mbmi->skip || seg_skip) && is_inter_block(mbmi), xd);
+  }
+  CFL_CTX *const cfl = &xd->cfl;
+  if (is_inter_block(mbmi) &&
+      !is_chroma_reference(mi_row, mi_col, bsize, cfl->subsampling_x,
+                           cfl->subsampling_y) &&
+      is_cfl_allowed(xd)) {
+    cfl_store_block(xd, mbmi->sb_type, mbmi->tx_size);
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodeframe.h b/third_party/aom/av1/encoder/encodeframe.h
new file mode 100644
index 0000000000..e8cf9b4685
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodeframe.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEFRAME_H_
+#define AOM_AV1_ENCODER_ENCODEFRAME_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define DELTAQ_MODULATION 1 // 0: variance based, 1: wavelet AC energy based
+
+struct macroblock;
+struct yv12_buffer_config;
+struct AV1_COMP;
+struct ThreadData;
+
+// NOTE(review): only the declarations are visible in this header; the notes
+// below are inferred from names and signatures and should be confirmed against
+// the definitions.
+
+// Presumably points the source planes of `x` at `src` for the block position
+// (mi_row, mi_col), for the first `num_planes` planes — TODO confirm.
+void av1_setup_src_planes(struct macroblock *x,
+ const struct yv12_buffer_config *src, int mi_row,
+ int mi_col, const int num_planes);
+
+// Frame-level encode entry point for the compressor instance `cpi`.
+void av1_encode_frame(struct AV1_COMP *cpi);
+
+// Presumably allocate / initialise the per-tile encoder data — TODO confirm.
+void av1_alloc_tile_data(struct AV1_COMP *cpi);
+void av1_init_tile_data(struct AV1_COMP *cpi);
+// Presumably encode one tile (tile_row, tile_col), and one superblock row of a
+// tile starting at mi_row, using the thread-local data `td` — TODO confirm.
+void av1_encode_tile(struct AV1_COMP *cpi, struct ThreadData *td, int tile_row,
+ int tile_col);
+void av1_encode_sb_row(struct AV1_COMP *cpi, struct ThreadData *td,
+ int tile_row, int tile_col, int mi_row);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEFRAME_H_
diff --git a/third_party/aom/av1/encoder/encodemb.c b/third_party/aom/av1/encoder/encodemb.c
new file mode 100644
index 0000000000..ad12577e6e
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.c
@@ -0,0 +1,649 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/cfl.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+
+// Returns non-zero when either dimension is below 4, in which case the plain
+// C subtraction routines are used instead of the dispatched ones.
+static int check_subtract_block_size(int w, int h) {
+  return !(w >= 4 && h >= 4);
+}
+
+// Writes the residual (src8 - pred8) of a rows x cols block into `diff`,
+// picking the high-bitdepth or 8-bit routine from the current buffer's flags
+// and the C or dispatched variant from the block dimensions.
+static void subtract_block(const MACROBLOCKD *xd, int rows, int cols,
+                           int16_t *diff, ptrdiff_t diff_stride,
+                           const uint8_t *src8, ptrdiff_t src_stride,
+                           const uint8_t *pred8, ptrdiff_t pred_stride) {
+  const int force_c = check_subtract_block_size(rows, cols);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    if (force_c) {
+      aom_highbd_subtract_block_c(rows, cols, diff, diff_stride, src8,
+                                  src_stride, pred8, pred_stride, xd->bd);
+    } else {
+      aom_highbd_subtract_block(rows, cols, diff, diff_stride, src8, src_stride,
+                                pred8, pred_stride, xd->bd);
+    }
+  } else {
+    if (force_c) {
+      aom_subtract_block_c(rows, cols, diff, diff_stride, src8, src_stride,
+                           pred8, pred_stride);
+    } else {
+      aom_subtract_block(rows, cols, diff, diff_stride, src8, src_stride, pred8,
+                         pred_stride);
+    }
+  }
+}
+
+// Computes the residual of a single transform block: source minus the
+// prediction currently held in the destination buffer, written into the
+// plane's src_diff buffer at the position given by (blk_row, blk_col).
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+                      int blk_col, int blk_row, TX_SIZE tx_size) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int src_stride = p->src.stride;
+  const int dst_stride = pd->dst.stride;
+
+  // Block coordinates are scaled by the log2 width of the smallest transform
+  // to obtain sample offsets.
+  const int dst_off = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
+  const int src_off = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
+  const int diff_off = (blk_row * diff_stride + blk_col)
+                       << tx_size_wide_log2[0];
+
+  uint8_t *const pred = &pd->dst.buf[dst_off];
+  uint8_t *const source = &p->src.buf[src_off];
+  int16_t *const residual = &p->src_diff[diff_off];
+
+  subtract_block(xd, tx_size_high[tx_size], tx_size_wide[tx_size], residual,
+                 diff_stride, source, src_stride, pred, dst_stride);
+}
+
+// Computes the residual of a whole plane of the current block: source minus
+// the prediction in the destination buffer, stored in the plane's src_diff
+// buffer with a stride equal to the plane block width.
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const int width = block_size_wide[plane_bsize];
+  const int height = block_size_high[plane_bsize];
+
+  subtract_block(xd, height, width, p->src_diff, width, p->src.buf,
+                 p->src.stride, pd->dst.buf, pd->dst.stride);
+}
+
+// Runs coefficient optimisation on one transform block and returns its end of
+// block position.
+//
+// When the block has no coefficients, optimisation is disabled for its
+// segment, or the segment is lossless, the block is left untouched:
+// *rate_cost receives the cost of a skipped transform block and the current
+// eob is returned. Otherwise the work is delegated to av1_optimize_txb_new().
+// `fast_mode` is accepted but not used.
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+                   int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                   const TXB_CTX *const txb_ctx, int fast_mode,
+                   int *rate_cost) {
+  (void)fast_mode;
+
+  MACROBLOCKD *const xd = &mb->e_mbd;
+  struct macroblock_plane *const p = &mb->plane[plane];
+  const int eob = p->eobs[block];
+  const int segment_id = xd->mi[0]->segment_id;
+  const int can_optimize = eob != 0 && cpi->optimize_seg_arr[segment_id] &&
+                           !xd->lossless[segment_id];
+
+  if (can_optimize) {
+    return av1_optimize_txb_new(cpi, mb, plane, block, tx_size, tx_type,
+                                txb_ctx, rate_cost, cpi->oxcf.sharpness);
+  }
+
+  *rate_cost = av1_cost_skip_txb(mb, txb_ctx, plane, tx_size);
+  return eob;
+}
+
+// Column index into quant_func_list: selects the low- or high-bitdepth
+// variant of a quantizer. QUANT_FUNC_TYPES is the number of variants.
+typedef enum QUANT_FUNC {
+ QUANT_FUNC_LOWBD = 0,
+ QUANT_FUNC_HIGHBD = 1,
+ QUANT_FUNC_TYPES = 2
+} QUANT_FUNC;
+
+// Quantizer dispatch table, indexed as [quantizer type][QUANT_FUNC].
+// Each row holds the low-bitdepth and high-bitdepth facade of one quantizer;
+// the last row has no functions.
+// NOTE(review): the row order must match the AV1_XFORM_QUANT enum, which is
+// declared elsewhere — presumably FP, B, DC, SKIP_QUANT; confirm.
+static AV1_QUANT_FACADE
+ quant_func_list[AV1_XFORM_QUANT_TYPES][QUANT_FUNC_TYPES] = {
+ { av1_quantize_fp_facade, av1_highbd_quantize_fp_facade },
+ { av1_quantize_b_facade, av1_highbd_quantize_b_facade },
+ { av1_quantize_dc_facade, av1_highbd_quantize_dc_facade },
+ { NULL, NULL }
+ };
+
+// Forward-transforms the residual of one transform block and, unless
+// `xform_quant_idx` is AV1_XFORM_QUANT_SKIP_QUANT, quantizes the result.
+//
+// The residual is read from the plane's src_diff buffer at (blk_row, blk_col);
+// transform coefficients go to p->coeff, quantized / dequantized values to
+// p->qcoeff / pd->dqcoeff and the end-of-block position to p->eobs[block].
+// Finally p->txb_entropy_ctx[block] is either computed from the quantized
+// coefficients or cleared (see the note near the end).
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ AV1_XFORM_QUANT xform_quant_idx) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = block_size_wide[plane_bsize];
+ int seg_id = mbmi->segment_id;
+ const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+ // Use a flat matrix (i.e. no weighting) for 1D and Identity transforms
+ const qm_val_t *qmatrix =
+ IS_2D_TRANSFORM(tx_type) ? pd->seg_qmatrix[seg_id][qm_tx_size]
+ : cm->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+ const qm_val_t *iqmatrix =
+ IS_2D_TRANSFORM(tx_type)
+ ? pd->seg_iqmatrix[seg_id][qm_tx_size]
+ : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+
+ // Block coordinates are scaled by the log2 width of the smallest transform
+ // to obtain the sample offset into src_diff.
+ const int src_offset = (blk_row * diff_stride + blk_col);
+ const int16_t *src_diff = &p->src_diff[src_offset << tx_size_wide_log2[0]];
+ QUANT_PARAM qparam;
+ qparam.log_scale = av1_get_tx_scale(tx_size);
+ qparam.tx_size = tx_size;
+ qparam.qmatrix = qmatrix;
+ qparam.iqmatrix = iqmatrix;
+ TxfmParam txfm_param;
+ txfm_param.tx_type = tx_type;
+ txfm_param.tx_size = tx_size;
+ txfm_param.lossless = xd->lossless[mbmi->segment_id];
+ txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+ txfm_param.tx_size, is_inter_block(mbmi), cm->reduced_tx_set_used);
+
+ txfm_param.bd = xd->bd;
+ txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+
+ av1_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
+
+ if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
+ const int n_coeffs = av1_get_max_eob(tx_size);
+ if (LIKELY(!x->skip_block)) {
+ // Dispatch on quantizer type and on the bit-depth path.
+ quant_func_list[xform_quant_idx][txfm_param.is_hbd](
+ coeff, n_coeffs, p, qcoeff, dqcoeff, eob, scan_order, &qparam);
+ } else {
+ av1_quantize_skip(n_coeffs, qcoeff, dqcoeff, eob);
+ }
+ }
+ // NOTE: optimize_b_following is true means av1_optimize_b will be called.
+ // When the condition of doing optimize_b is changed,
+ // this flag needs to be updated simultaneously.
+ const int optimize_b_following =
+ (xform_quant_idx != AV1_XFORM_QUANT_FP) || (txfm_param.lossless);
+ if (optimize_b_following) {
+ p->txb_entropy_ctx[block] =
+ (uint8_t)av1_get_txb_entropy_context(qcoeff, scan_order, *eob);
+ } else {
+ p->txb_entropy_ctx[block] = 0;
+ }
+ return;
+}
+
+// Encodes one transform block of an inter-coded block: transform + quantize
+// (optionally followed by av1_optimize_b()), entropy-context update, and
+// reconstruction into the destination buffer when the block has coefficients.
+//
+// `arg` is a struct encode_b_args carrying the encoder instance, the
+// macroblock, the above/left entropy contexts, the optimize switch and the
+// output skip flag. mi_row, mi_col and dry_run are only used by the
+// mismatch-debug build.
+static void encode_block(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg,
+ int mi_row, int mi_col, RUN_TYPE dry_run) {
+ (void)mi_row;
+ (void)mi_col;
+ (void)dry_run;
+ struct encode_b_args *const args = arg;
+ const AV1_COMMON *const cm = &args->cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint8_t *dst;
+ ENTROPY_CONTEXT *a, *l;
+ int dummy_rate_cost = 0;
+
+ // Width of the plane block in smallest-transform units; used to index the
+ // per-block skip map.
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ dst = &pd->dst
+ .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+ a = &args->ta[blk_col];
+ l = &args->tl[blk_row];
+
+ if (!is_blk_skip(x, plane, blk_row * bw + blk_col) && !mbmi->skip_mode) {
+ TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ if (args->enable_optimize_b) {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ // The rate returned by the optimizer is not needed here.
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
+ } else {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ }
+ } else {
+ // Skipped block: no coefficients and a cleared entropy context.
+ p->eobs[block] = 0;
+ p->txb_entropy_ctx[block] = 0;
+ }
+
+ av1_set_txb_context(x, plane, block, tx_size, a, l);
+
+ if (p->eobs[block]) {
+ // At least one coefficient survived, so the enclosing block is not skipped.
+ *(args->skip) = 0;
+
+ TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ pd->dst.stride, p->eobs[block],
+ cm->reduced_tx_set_used);
+ }
+
+ if (p->eobs[block] == 0 && plane == 0) {
+ // TODO(debargha, jingning): Temporarily disable txk_type check for eob=0
+ // case. It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+#if 0
+ if (args->cpi->oxcf.aq_mode == NO_AQ &&
+ args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+ // TODO(jingning,angiebird,huisu@google.com): enable txk_check when
+ // enable_optimize_b is true to detect potential RD bug.
+ const uint8_t disable_txk_check = args->enable_optimize_b;
+ if (!disable_txk_check) {
+ assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+ blk_col)] == DCT_DCT);
+ }
+ }
+#endif
+ // A plane-0 block without coefficients is recorded as DCT_DCT.
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ }
+
+#if CONFIG_MISMATCH_DEBUG
+ if (dry_run == OUTPUT_ENABLED) {
+ int pixel_c, pixel_r;
+ BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+ int blk_w = block_size_wide[bsize];
+ int blk_h = block_size_high[bsize];
+ mi_to_pixel_loc(&pixel_c, &pixel_r, mi_col, mi_row, blk_col, blk_row,
+ pd->subsampling_x, pd->subsampling_y);
+ mismatch_record_block_tx(dst, pd->dst.stride, cm->frame_offset, plane,
+ pixel_c, pixel_r, blk_w, blk_h,
+ xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH);
+ }
+#endif
+}
+
+// Recursive driver for encoding the transform blocks of an inter block.
+//
+// For plane 0 the transform partition stored in mbmi->inter_tx_size is
+// followed: a node whose size matches the stored size is encoded with
+// encode_block(), any other node is split and its children are visited in
+// raster order. Chroma planes are always encoded directly at the size passed
+// in. Positions outside the visible area are ignored.
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg, int mi_row, int mi_col,
+                               RUN_TYPE dry_run) {
+  (void)mi_row;
+  (void)mi_col;
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int rows_limit = max_block_high(xd, plane_bsize, plane);
+  const int cols_limit = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= rows_limit || blk_col >= cols_limit) return;
+
+  TX_SIZE target_tx_size;
+  if (plane) {
+    target_tx_size = av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                           pd->subsampling_y);
+  } else {
+    target_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+        plane_bsize, blk_row, blk_col)];
+    assert(tx_size_wide[tx_size] >= tx_size_wide[target_tx_size] &&
+           tx_size_high[tx_size] >= tx_size_high[target_tx_size]);
+  }
+
+  // Leaf of the partition (or any chroma block): encode it as is.
+  if (tx_size == target_tx_size || plane) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg,
+                 mi_row, mi_col, dry_run);
+    return;
+  }
+
+  assert(tx_size < TX_SIZES_ALL);
+  const TX_SIZE child_tx_size = sub_tx_size_map[tx_size];
+  assert(IMPLIES(tx_size <= TX_4X4, child_tx_size == tx_size));
+  assert(IMPLIES(tx_size > TX_4X4, child_tx_size < tx_size));
+  // This is the square transform block partition entry point.
+  const int child_w = tx_size_wide_unit[child_tx_size];
+  const int child_h = tx_size_high_unit[child_tx_size];
+  const int step = child_h * child_w;
+  assert(child_w > 0 && child_h > 0);
+
+  const int parent_h = tx_size_high_unit[tx_size];
+  const int parent_w = tx_size_wide_unit[tx_size];
+  for (int r = 0; r < parent_h; r += child_h) {
+    for (int c = 0; c < parent_w; c += child_w) {
+      const int sub_row = blk_row + r;
+      const int sub_col = blk_col + c;
+
+      // Children outside the visible area are skipped and do not advance the
+      // block index.
+      if (sub_row >= rows_limit || sub_col >= cols_limit) continue;
+
+      encode_block_inter(plane, block, sub_row, sub_col, plane_bsize,
+                         child_tx_size, arg, mi_row, mi_col, dry_run);
+      block += step;
+    }
+  }
+}
+
+// Calls `visit` once for every transform block of `plane` inside the block of
+// size `bsize`, passing a running block index that advances by the transform
+// block area (in smallest-transform units) after each call.
+//
+// The plane is traversed in units no larger than a 64x64 luma area (scaled by
+// the plane's subsampling); inside each unit the transform blocks are visited
+// in raster order. Transform blocks lying wholly outside the visible frame
+// area are not visited.
+void av1_foreach_transformed_block_in_plane(
+    const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+    foreach_transformed_block_visitor visit, void *arg) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Transform size varies per plane, look it up in a common way.
+  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+  const BLOCK_SIZE plane_bsize =
+      get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+  const uint8_t tx_cols = tx_size_wide_unit[tx_size];
+  const uint8_t tx_rows = tx_size_high_unit[tx_size];
+  const int index_step = tx_cols * tx_rows;
+
+  // If mb_to_right_edge is < 0 we are in a situation in which
+  // the current block size extends into the UMV and we won't
+  // visit the sub blocks that are wholly within the UMV.
+  const int cols_limit = max_block_wide(xd, plane_bsize, plane);
+  const int rows_limit = max_block_high(xd, plane_bsize, plane);
+
+  const BLOCK_SIZE max_unit_bsize =
+      get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
+  const int full_unit_cols =
+      block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+  const int full_unit_rows =
+      block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+  const int unit_cols = AOMMIN(cols_limit, full_unit_cols);
+  const int unit_rows = AOMMIN(rows_limit, full_unit_rows);
+
+  int block_index = 0;
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (int unit_r = 0; unit_r < rows_limit; unit_r += unit_rows) {
+    const int row_end = AOMMIN(unit_rows + unit_r, rows_limit);
+    // Skip visiting the sub blocks that are wholly within the UMV.
+    for (int unit_c = 0; unit_c < cols_limit; unit_c += unit_cols) {
+      const int col_end = AOMMIN(unit_cols + unit_c, cols_limit);
+      for (int blk_row = unit_r; blk_row < row_end; blk_row += tx_rows) {
+        for (int blk_col = unit_c; blk_col < col_end; blk_col += tx_cols) {
+          visit(plane, block_index, blk_row, blk_col, plane_bsize, tx_size,
+                arg);
+          block_index += index_step;
+        }
+      }
+    }
+  }
+}
+
+// Runs av1_foreach_transformed_block_in_plane() on each of the first
+// `num_planes` planes, skipping any plane for which is_chroma_reference()
+// returns 0 at (mi_row, mi_col).
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+                                   BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                   foreach_transformed_block_visitor visit,
+                                   void *arg, const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                            pd->subsampling_y)) {
+      av1_foreach_transformed_block_in_plane(xd, bsize, plane, visit, arg);
+    }
+  }
+}
+
+// Argument bundle handed (as void *) to encode_block_pass1() by
+// av1_encode_sby_pass1().
+typedef struct encode_block_pass1_args {
+ // Codec-wide state; forwarded to av1_xform_quant().
+ AV1_COMMON *cm;
+ // Macroblock being encoded.
+ MACROBLOCK *x;
+} encode_block_pass1_args;
+
+// Visitor used by av1_encode_sby_pass1(): transforms and quantizes one
+// transform block with DCT_DCT / AV1_XFORM_QUANT_B and, when any coefficient
+// survives (eob > 0), adds the inverse transform into the destination buffer.
+// `arg` must point to an encode_block_pass1_args.
+static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  encode_block_pass1_args *const pass1 = (encode_block_pass1_args *)arg;
+  AV1_COMMON *const cm = pass1->cm;
+  MACROBLOCK *const x = pass1->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint8_t *const dst =
+      &pd->dst
+           .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
+
+  av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  DCT_DCT, AV1_XFORM_QUANT_B);
+
+  // Nothing to reconstruct when every coefficient quantized to zero.
+  if (p->eobs[block] == 0) return;
+
+  TxfmParam txfm_param;
+  txfm_param.bd = xd->bd;
+  txfm_param.is_hbd = get_bitdepth_data_path_index(xd);
+  txfm_param.tx_type = DCT_DCT;
+  txfm_param.tx_size = tx_size;
+  txfm_param.eob = p->eobs[block];
+  txfm_param.lossless = xd->lossless[xd->mi[0]->segment_id];
+  txfm_param.tx_set_type = av1_get_ext_tx_set_type(
+      txfm_param.tx_size, is_inter_block(xd->mi[0]), cm->reduced_tx_set_used);
+
+  if (txfm_param.is_hbd) {
+    av1_highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+  } else {
+    av1_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &txfm_param);
+  }
+}
+
+// Encodes plane 0 of a block: subtracts the prediction for the plane, then
+// runs encode_block_pass1() over each of its transform blocks.
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize) {
+  encode_block_pass1_args pass1_args = { cm, x };
+  const MACROBLOCKD *const xd = &x->e_mbd;
+
+  av1_subtract_plane(x, bsize, 0);
+  av1_foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1,
+                                         &pass1_args);
+}
+
+// Encodes the residual of a block through encode_block_inter(), for every
+// plane that is_chroma_reference() accepts at (mi_row, mi_col).
+//
+// mbmi->skip is set to 1 up front; the function then returns immediately if
+// x->skip is set. Otherwise each plane has its entropy contexts loaded, its
+// prediction subtracted, and its largest-size transform blocks visited in
+// units no larger than the plane-subsampled BLOCK_64X64.
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+                   int mi_row, int mi_col, RUN_TYPE dry_run) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct optimize_ctx ctx;
+  struct encode_b_args arg = { cpi,
+                               x,
+                               &ctx,
+                               &mbmi->skip,
+                               NULL,
+                               NULL,
+                               cpi->optimize_seg_arr[mbmi->segment_id] };
+
+  mbmi->skip = 1;
+
+  if (x->skip) return;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+
+    if (!is_chroma_reference(mi_row, mi_col, bsize, ss_x, ss_y)) continue;
+
+    // TODO(jingning): Clean this up.
+    const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ss_x, ss_y);
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsizec, ss_x, ss_y);
+    const int plane_cols =
+        block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int plane_rows =
+        block_size_high[plane_bsize] >> tx_size_high_log2[0];
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+
+    // Footprint of one largest-size transform block.
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int tx_cols = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+    const int tx_rows = block_size_high[txb_size] >> tx_size_high_log2[0];
+    const int step =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
+    av1_get_entropy_contexts(bsizec, pd, ctx.ta[plane], ctx.tl[plane]);
+
+    av1_subtract_plane(x, bsizec, plane);
+
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
+
+    // Size of one traversal unit, clamped to the plane block.
+    const BLOCK_SIZE unit_bsize =
+        get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+    const int unit_cols = AOMMIN(
+        plane_cols, block_size_wide[unit_bsize] >> tx_size_wide_log2[0]);
+    const int unit_rows = AOMMIN(
+        plane_rows, block_size_high[unit_bsize] >> tx_size_high_log2[0]);
+
+    int block = 0;
+    for (int unit_r = 0; unit_r < plane_rows; unit_r += unit_rows) {
+      for (int unit_c = 0; unit_c < plane_cols; unit_c += unit_cols) {
+        const int row_end = AOMMIN(unit_rows + unit_r, plane_rows);
+        const int col_end = AOMMIN(unit_cols + unit_c, plane_cols);
+        for (int blk_row = unit_r; blk_row < row_end; blk_row += tx_rows) {
+          for (int blk_col = unit_c; blk_col < col_end; blk_col += tx_cols) {
+            encode_block_inter(plane, block, blk_row, blk_col, plane_bsize,
+                               max_tx_size, &arg, mi_row, mi_col, dry_run);
+            block += step;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Visitor that intra-encodes one transform block and then stamps the block's
+// txb entropy context into the above/left context arrays carried in `arg`
+// (a struct encode_b_args).
+static void encode_block_intra_and_set_context(int plane, int block,
+                                               int blk_row, int blk_col,
+                                               BLOCK_SIZE plane_bsize,
+                                               TX_SIZE tx_size, void *arg) {
+  struct encode_b_args *const args = arg;
+
+  av1_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                         arg);
+
+  av1_set_txb_context(args->x, plane, block, tx_size, &args->ta[blk_col],
+                      &args->tl[blk_row]);
+}
+
+// Intra-encodes one transform block. `arg` must point to a
+// struct encode_b_args.
+//
+// Steps, in order:
+//  1. Build the intra prediction via av1_predict_intra_block_facade().
+//  2. For a plane-0 block that is_blk_skip() flags, force eob = 0 and clear
+//     its txb entropy context. Otherwise subtract the prediction and
+//     transform/quantize, running av1_optimize_b() afterwards when
+//     args->enable_optimize_b is set.
+//  3. If eob is non-zero, call av1_inverse_transform_block() on dst.
+//  4. For plane 0 with eob == 0, record DCT_DCT in mbmi->txk_type.
+//  5. Always clear *(args->skip); hand the block to cfl_store_tx() when this
+//     is the Y plane and xd->cfl.store_y is set.
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+ void *arg) {
+ struct encode_b_args *const args = arg;
+ const AV1_COMMON *const cm = &args->cpi->common;
+ MACROBLOCK *const x = args->x;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ struct macroblock_plane *const p = &x->plane[plane];
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ uint16_t *eob = &p->eobs[block];
+ const int dst_stride = pd->dst.stride;
+ uint8_t *dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ // av1_optimize_b() reports a rate through this; the value is not used here.
+ int dummy_rate_cost = 0;
+
+ av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+
+ // Width of the plane block, used to turn (blk_row, blk_col) into the linear
+ // index that is_blk_skip() expects.
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ if (plane == 0 && is_blk_skip(x, plane, blk_row * bw + blk_col)) {
+ *eob = 0;
+ p->txb_entropy_ctx[block] = 0;
+ } else {
+ av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+
+ const ENTROPY_CONTEXT *a = &args->ta[blk_col];
+ const ENTROPY_CONTEXT *l = &args->tl[blk_row];
+ if (args->enable_optimize_b) {
+ av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+ tx_size, tx_type, AV1_XFORM_QUANT_FP);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, a, l, &txb_ctx);
+ av1_optimize_b(args->cpi, x, plane, block, tx_size, tx_type, &txb_ctx, 1,
+ &dummy_rate_cost);
+ } else {
+ av1_xform_quant(
+ cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+ USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+ }
+ }
+
+ if (*eob) {
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, dst,
+ dst_stride, *eob, cm->reduced_tx_set_used);
+ }
+
+ if (*eob == 0 && plane == 0) {
+ // TODO(jingning): Temporarily disable txk_type check for eob=0 case.
+ // It is possible that certain collision in hash index would cause
+ // the assertion failure. To further optimize the rate-distortion
+ // performance, we need to re-visit this part and enable this assert
+ // again.
+#if 0
+ if (args->cpi->oxcf.aq_mode == NO_AQ
+ && args->cpi->oxcf.deltaq_mode == NO_DELTA_Q) {
+ assert(mbmi->txk_type[av1_get_txk_type_index(plane_bsize, blk_row,
+ blk_col)] == DCT_DCT);
+ }
+#endif
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ }
+
+ // For intra mode, skipped blocks are so rare that transmitting skip=1 is
+ // very expensive.
+ *(args->skip) = 0;
+
+ if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+ cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+ }
+}
+
+// Intra-encodes every transform block of `plane` in a block of size `bsize`.
+//
+// Does nothing when is_chroma_reference() rejects the plane at
+// (mi_row, mi_col). The above/left entropy contexts start zeroed and are
+// loaded with av1_get_entropy_contexts() only when `enable_optimize_b` is
+// non-zero.
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+                                  BLOCK_SIZE bsize, int plane,
+                                  int enable_optimize_b, int mi_row,
+                                  int mi_col) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                           pd->subsampling_y)) {
+    return;
+  }
+
+  ENTROPY_CONTEXT above_ctx[MAX_MIB_SIZE] = { 0 };
+  ENTROPY_CONTEXT left_ctx[MAX_MIB_SIZE] = { 0 };
+  if (enable_optimize_b) {
+    av1_get_entropy_contexts(bsize, pd, above_ctx, left_ctx);
+  }
+
+  struct encode_b_args arg = { cpi,       x,        NULL,
+                               &(xd->mi[0]->skip), above_ctx,
+                               left_ctx,  enable_optimize_b };
+  av1_foreach_transformed_block_in_plane(
+      xd, bsize, plane, encode_block_intra_and_set_context, &arg);
+}
diff --git a/third_party/aom/av1/encoder/encodemb.h b/third_party/aom/av1/encoder/encodemb.h
new file mode 100644
index 0000000000..39080de599
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemb.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMB_H_
+#define AOM_AV1_ENCODER_ENCODEMB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/tokenize.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Per-plane entropy context scratch arrays, one row of MAX_MIB_SIZE entries
+// for each of the MAX_MB_PLANE planes.
+struct optimize_ctx {
+ // Context array passed as the first output of av1_get_entropy_contexts()
+ // and indexed by blk_col by its users -- presumably the "above" context.
+ ENTROPY_CONTEXT ta[MAX_MB_PLANE][MAX_MIB_SIZE];
+ // Context array passed as the second output of av1_get_entropy_contexts()
+ // and indexed by blk_row by its users -- presumably the "left" context.
+ ENTROPY_CONTEXT tl[MAX_MB_PLANE][MAX_MIB_SIZE];
+};
+
+// Argument bundle passed (as void *) to the per-transform-block encode
+// callbacks such as av1_encode_block_intra().
+struct encode_b_args {
+ // Encoder instance.
+ const struct AV1_COMP *cpi;
+ // Macroblock being encoded.
+ MACROBLOCK *x;
+ // Backing storage for ta/tl; may be NULL when the caller supplies its own
+ // arrays (av1_encode_intra_block_plane() does this).
+ struct optimize_ctx *ctx;
+ // Points at the block's skip flag; av1_encode_block_intra() writes 0 here.
+ int8_t *skip;
+ // Entropy context array for the current plane, indexed by blk_col.
+ ENTROPY_CONTEXT *ta;
+ // Entropy context array for the current plane, indexed by blk_row.
+ ENTROPY_CONTEXT *tl;
+ // Non-zero makes av1_encode_block_intra() follow quantization with
+ // av1_optimize_b().
+ int8_t enable_optimize_b;
+};
+
+// Selector passed as `xform_quant_idx` to av1_xform_quant().
+// NOTE(review): what each mode does is decided inside av1_xform_quant(),
+// which is not defined in this header -- confirm there.
+typedef enum AV1_XFORM_QUANT {
+ AV1_XFORM_QUANT_FP = 0,
+ AV1_XFORM_QUANT_B = 1,
+ AV1_XFORM_QUANT_DC = 2,
+ AV1_XFORM_QUANT_SKIP_QUANT,
+ // Last enumerator; presumably the number of modes above.
+ AV1_XFORM_QUANT_TYPES,
+} AV1_XFORM_QUANT;
+
+void av1_encode_sb(const struct AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RUN_TYPE dry_run);
+
+void av1_foreach_transformed_block_in_plane(
+ const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
+ foreach_transformed_block_visitor visit, void *arg);
+
+void av1_foreach_transformed_block(const MACROBLOCKD *const xd,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ foreach_transformed_block_visitor visit,
+ void *arg, const int num_planes);
+
+void av1_encode_sby_pass1(AV1_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE bsize);
+
+void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
+ int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ AV1_XFORM_QUANT xform_quant_idx);
+
+int av1_optimize_b(const struct AV1_COMP *cpi, MACROBLOCK *mb, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int fast_mode, int *rate_cost);
+
+void av1_subtract_txb(MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize,
+ int blk_col, int blk_row, TX_SIZE tx_size);
+
+void av1_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
+
+// Fills the entropy context entries covered by one transform block with the
+// block's stored txb entropy context: tx_size_wide_unit[tx_size] entries
+// starting at `a` and tx_size_high_unit[tx_size] entries starting at `l`.
+static INLINE void av1_set_txb_context(MACROBLOCK *x, int plane, int block,
+                                       TX_SIZE tx_size, ENTROPY_CONTEXT *a,
+                                       ENTROPY_CONTEXT *l) {
+  const uint8_t ctx = x->plane[plane].txb_entropy_ctx[block];
+  const size_t above_bytes = tx_size_wide_unit[tx_size] * sizeof(*a);
+  const size_t left_bytes = tx_size_high_unit[tx_size] * sizeof(*l);
+
+  memset(a, ctx, above_bytes);
+  memset(l, ctx, left_bytes);
+}
+
+void av1_encode_block_intra(int plane, int block, int blk_row, int blk_col,
+ BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg);
+
+void av1_encode_intra_block_plane(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int plane,
+ int enable_optimize_b, int mi_row,
+ int mi_col);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMB_H_
diff --git a/third_party/aom/av1/encoder/encodemv.c b/third_party/aom/av1/encoder/encodemv.c
new file mode 100644
index 0000000000..42eb5abf62
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemv.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+// Returns the smallest magnitude offset that belongs to class `c`:
+// 0 for class 0, CLASS0_SIZE << (c + 2) for every other class.
+static INLINE int mv_class_base(MV_CLASS_TYPE c) {
+  if (!c) return 0;
+  return CLASS0_SIZE << (c + 2);
+}
+
+// Returns floor(log2(n)) for n != 0, and 0 for n == 0.
+static INLINE uint8_t log_in_base_2(unsigned int n) {
+  // get_msb() is only valid for non-zero input, so handle 0 separately.
+  if (n == 0) return 0;
+  return get_msb(n);
+}
+
+// Maps a zero-based magnitude `z` to its motion vector class. Values at or
+// above CLASS0_SIZE * 4096 fall into MV_CLASS_10; the rest use
+// floor(log2(z >> 3)). When `offset` is non-NULL it receives `z` minus the
+// base of the chosen class.
+static INLINE MV_CLASS_TYPE get_mv_class(int z, int *offset) {
+  MV_CLASS_TYPE mv_class;
+  if (z >= CLASS0_SIZE * 4096) {
+    mv_class = MV_CLASS_10;
+  } else {
+    mv_class = (MV_CLASS_TYPE)log_in_base_2(z >> 3);
+  }
+  if (offset != NULL) *offset = z - mv_class_base(mv_class);
+  return mv_class;
+}
+
+// Writes one non-zero motion vector difference component to `w`.
+//
+// Symbols are emitted in this order: sign, class, integer part (one symbol
+// for class 0, otherwise one bit at a time from the least significant), then
+// the fractional part if `precision` > MV_SUBPEL_NONE, and the
+// high-precision bit if `precision` > MV_SUBPEL_LOW_PRECISION.
+static void encode_mv_component(aom_writer *w, int comp, nmv_component *mvcomp,
+                                MvSubpelPrecision precision) {
+  assert(comp != 0);
+  const int sign = comp < 0;
+  const int mag = sign ? -comp : comp;
+  int offset;
+  const int mv_class = get_mv_class(mag - 1, &offset);
+  const int is_class0 = (mv_class == MV_CLASS_0);
+  const int d = offset >> 3;         // integer mv data
+  const int fr = (offset >> 1) & 3;  // fractional mv data
+  const int hp = offset & 1;         // high precision mv data
+
+  // Sign
+  aom_write_symbol(w, sign, mvcomp->sign_cdf, 2);
+
+  // Class
+  aom_write_symbol(w, mv_class, mvcomp->classes_cdf, MV_CLASSES);
+
+  // Integer bits
+  if (is_class0) {
+    aom_write_symbol(w, d, mvcomp->class0_cdf, CLASS0_SIZE);
+  } else {
+    const int num_bits = mv_class + CLASS0_BITS - 1;
+    for (int bit = 0; bit < num_bits; ++bit) {
+      aom_write_symbol(w, (d >> bit) & 1, mvcomp->bits_cdf[bit], 2);
+    }
+  }
+
+  // Fractional bits
+  if (precision > MV_SUBPEL_NONE) {
+    aom_write_symbol(w, fr,
+                     is_class0 ? mvcomp->class0_fp_cdf[d] : mvcomp->fp_cdf,
+                     MV_FP_SIZE);
+  }
+
+  // High precision bit
+  if (precision > MV_SUBPEL_LOW_PRECISION) {
+    aom_write_symbol(w, hp,
+                     is_class0 ? mvcomp->class0_hp_cdf : mvcomp->hp_cdf, 2);
+  }
+}
+
+// Fills `mvcost[-MV_MAX .. MV_MAX]` with the cost of coding each component
+// value under the CDFs in `mvcomp`. `mvcost` must therefore point at the
+// middle of its table, because negative indices are written. mvcost[0] is 0.
+//
+// Fractional costs are added only when `precision` > MV_SUBPEL_NONE, and
+// high-precision costs only when it is also > MV_SUBPEL_LOW_PRECISION.
+static void build_nmv_component_cost_table(int *mvcost,
+                                           const nmv_component *const mvcomp,
+                                           MvSubpelPrecision precision) {
+  int sign_cost[2], class_cost[MV_CLASSES], class0_cost[CLASS0_SIZE];
+  int bits_cost[MV_OFFSET_BITS][2];
+  int class0_fp_cost[CLASS0_SIZE][MV_FP_SIZE], fp_cost[MV_FP_SIZE];
+  int class0_hp_cost[2], hp_cost[2];
+  const int use_fp = precision > MV_SUBPEL_NONE;
+  const int use_hp = precision > MV_SUBPEL_LOW_PRECISION;
+
+  av1_cost_tokens_from_cdf(sign_cost, mvcomp->sign_cdf, NULL);
+  av1_cost_tokens_from_cdf(class_cost, mvcomp->classes_cdf, NULL);
+  av1_cost_tokens_from_cdf(class0_cost, mvcomp->class0_cdf, NULL);
+  for (int bit = 0; bit < MV_OFFSET_BITS; ++bit) {
+    av1_cost_tokens_from_cdf(bits_cost[bit], mvcomp->bits_cdf[bit], NULL);
+  }
+
+  for (int k = 0; k < CLASS0_SIZE; ++k) {
+    av1_cost_tokens_from_cdf(class0_fp_cost[k], mvcomp->class0_fp_cdf[k],
+                             NULL);
+  }
+  av1_cost_tokens_from_cdf(fp_cost, mvcomp->fp_cdf, NULL);
+
+  // The high-precision tables are only read below when use_hp is set.
+  if (use_hp) {
+    av1_cost_tokens_from_cdf(class0_hp_cost, mvcomp->class0_hp_cdf, NULL);
+    av1_cost_tokens_from_cdf(hp_cost, mvcomp->hp_cdf, NULL);
+  }
+
+  mvcost[0] = 0;
+  for (int v = 1; v <= MV_MAX; ++v) {
+    int offset;
+    const int mv_class = get_mv_class(v - 1, &offset);
+    const int is_class0 = (mv_class == MV_CLASS_0);
+    const int d = offset >> 3;        /* int mv data */
+    const int f = (offset >> 1) & 3;  /* fractional pel mv data */
+    const int e = offset & 1;         /* high precision mv data */
+    int cost = class_cost[mv_class];
+
+    if (is_class0) {
+      cost += class0_cost[d];
+    } else {
+      const int num_bits = mv_class + CLASS0_BITS - 1;
+      for (int bit = 0; bit < num_bits; ++bit) {
+        cost += bits_cost[bit][(d >> bit) & 1];
+      }
+    }
+
+    if (use_fp) {
+      cost += is_class0 ? class0_fp_cost[d][f] : fp_cost[f];
+      if (use_hp) {
+        cost += is_class0 ? class0_hp_cost[e] : hp_cost[e];
+      }
+    }
+
+    // Positive and negative values differ only in the sign symbol.
+    mvcost[v] = cost + sign_cost[0];
+    mvcost[-v] = cost + sign_cost[1];
+  }
+}
+
+// Writes the difference between `mv` and `ref` to `w`: first the joint type,
+// then the row and/or column component as the joint type requires.
+//
+// `usehp` is the sub-pel precision for the components; it is replaced by
+// MV_SUBPEL_NONE when the frame forces integer motion vectors. When
+// auto_mv_step_size is enabled, cpi->max_mv_magnitude is raised to cover the
+// larger absolute component of `mv` (in units of 8).
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+                   nmv_context *mvctx, int usehp) {
+  const MV diff = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE joint = av1_get_mv_joint(&diff);
+  const int precision =
+      cpi->common.cur_frame_force_integer_mv ? MV_SUBPEL_NONE : usehp;
+
+  aom_write_symbol(w, joint, mvctx->joints_cdf, MV_JOINTS);
+  if (mv_joint_vertical(joint)) {
+    encode_mv_component(w, diff.row, &mvctx->comps[0], precision);
+  }
+  if (mv_joint_horizontal(joint)) {
+    encode_mv_component(w, diff.col, &mvctx->comps[1], precision);
+  }
+
+  // If auto_mv_step_size is enabled then keep track of the largest
+  // motion vector component used.
+  if (cpi->sf.mv.auto_mv_step_size) {
+    const int abs_row = abs(mv->row);
+    const int abs_col = abs(mv->col);
+    const unsigned int maxv = AOMMAX(abs_row, abs_col) >> 3;
+    cpi->max_mv_magnitude = AOMMAX(maxv, cpi->max_mv_magnitude);
+  }
+}
+
+// Writes the difference between `mv` and `ref` to `w` with components coded
+// at MV_SUBPEL_NONE. Both vectors are asserted to have their low three bits
+// clear.
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+                   nmv_context *mvctx) {
+  // DV and ref DV should not have sub-pel.
+  assert((mv->col & 7) == 0);
+  assert((mv->row & 7) == 0);
+  assert((ref->col & 7) == 0);
+  assert((ref->row & 7) == 0);
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  const MV_JOINT_TYPE joint = av1_get_mv_joint(&delta);
+
+  aom_write_symbol(w, joint, mvctx->joints_cdf, MV_JOINTS);
+  if (mv_joint_vertical(joint)) {
+    encode_mv_component(w, delta.row, &mvctx->comps[0], MV_SUBPEL_NONE);
+  }
+  if (mv_joint_horizontal(joint)) {
+    encode_mv_component(w, delta.col, &mvctx->comps[1], MV_SUBPEL_NONE);
+  }
+}
+
+// Builds the joint-type cost table in `mvjoint` and the two per-component
+// cost tables in `mvcost[0]` and `mvcost[1]` from the CDFs in `ctx`.
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+                              const nmv_context *ctx,
+                              MvSubpelPrecision precision) {
+  av1_cost_tokens_from_cdf(mvjoint, ctx->joints_cdf, NULL);
+  for (int comp = 0; comp < 2; ++comp) {
+    build_nmv_component_cost_table(mvcost[comp], &ctx->comps[comp], precision);
+  }
+}
+
+// Looks up a reference motion vector in the candidate stack of `mbmi_ext`.
+//
+// When ref_frame[1] > INTRA_FRAME, entry `ref_mv_idx` supplies this_mv for
+// ref_idx 0 and comp_mv for ref_idx 1. Otherwise ref_idx must be 0 and the
+// result is entry `ref_mv_idx`'s this_mv if the index is below the stack
+// count, or the global motion vector for the reference type if not.
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+                                 const MV_REFERENCE_FRAME *ref_frame,
+                                 int ref_mv_idx,
+                                 const MB_MODE_INFO_EXT *mbmi_ext) {
+  const int8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const CANDIDATE_MV *const stack = mbmi_ext->ref_mv_stack[ref_frame_type];
+
+  if (ref_frame[1] > INTRA_FRAME) {
+    assert(ref_idx == 0 || ref_idx == 1);
+    return ref_idx == 0 ? stack[ref_mv_idx].this_mv
+                        : stack[ref_mv_idx].comp_mv;
+  }
+
+  assert(ref_idx == 0);
+  if (ref_mv_idx < mbmi_ext->ref_mv_count[ref_frame_type]) {
+    return stack[ref_mv_idx].this_mv;
+  }
+  return mbmi_ext->global_mvs[ref_frame_type];
+}
+
+// Returns the reference motion vector for reference `ref_idx` of the current
+// block. For NEAR_NEWMV and NEW_NEARMV the stack index is the block's
+// ref_mv_idx plus one; otherwise it is ref_mv_idx itself.
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx) {
+  const MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+  const int is_near_new =
+      mbmi->mode == NEAR_NEWMV || mbmi->mode == NEW_NEARMV;
+
+  assert(!is_near_new || has_second_ref(mbmi));
+
+  const int stack_idx = mbmi->ref_mv_idx + (is_near_new ? 1 : 0);
+  return av1_get_ref_mv_from_stack(ref_idx, mbmi->ref_frame, stack_idx,
+                                   x->mbmi_ext);
+}
+
+// Fetches stack entries 0 and 1 for the single reference `ref_frame` into
+// `*nearest_mv` and `*near_mv` respectively, passing each through
+// lower_mv_precision() with `allow_hp` and `is_integer`.
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+                                      const MB_MODE_INFO_EXT *mbmi_ext,
+                                      MV_REFERENCE_FRAME ref_frame,
+                                      int_mv *nearest_mv, int_mv *near_mv,
+                                      int is_integer) {
+  MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+  int_mv *const outputs[2] = { nearest_mv, near_mv };
+
+  for (int stack_idx = 0; stack_idx < 2; ++stack_idx) {
+    int_mv *const out = outputs[stack_idx];
+    *out = av1_get_ref_mv_from_stack(0, ref_frames, stack_idx, mbmi_ext);
+    lower_mv_precision(&out->as_mv, allow_hp, is_integer);
+  }
+}
diff --git a/third_party/aom/av1/encoder/encodemv.h b/third_party/aom/av1/encoder/encodemv.h
new file mode 100644
index 0000000000..37ff547c8c
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodemv.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODEMV_H_
+#define AOM_AV1_ENCODER_ENCODEMV_H_
+
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_encode_mv(AV1_COMP *cpi, aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx, int usehp);
+
+void av1_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
+ const nmv_context *mvctx,
+ MvSubpelPrecision precision);
+
+void av1_update_mv_count(ThreadData *td);
+
+void av1_encode_dv(aom_writer *w, const MV *mv, const MV *ref,
+ nmv_context *mvctx);
+int_mv av1_get_ref_mv(const MACROBLOCK *x, int ref_idx);
+int_mv av1_get_ref_mv_from_stack(int ref_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ int ref_mv_idx,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+void av1_find_best_ref_mvs_from_stack(int allow_hp,
+ const MB_MODE_INFO_EXT *mbmi_ext,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *nearest_mv, int_mv *near_mv,
+ int is_integer);
+
+// Classifies `mv` by which of its components are non-zero:
+// neither -> MV_JOINT_ZERO, col only -> MV_JOINT_HNZVZ,
+// row only -> MV_JOINT_HZVNZ, both -> MV_JOINT_HNZVNZ.
+static INLINE MV_JOINT_TYPE av1_get_mv_joint(const MV *mv) {
+  const int row_nonzero = mv->row != 0;
+  const int col_nonzero = mv->col != 0;
+
+  if (row_nonzero) return col_nonzero ? MV_JOINT_HNZVNZ : MV_JOINT_HZVNZ;
+  return col_nonzero ? MV_JOINT_HNZVZ : MV_JOINT_ZERO;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODEMV_H_
diff --git a/third_party/aom/av1/encoder/encoder.c b/third_party/aom/av1/encoder/encoder.c
new file mode 100644
index 0000000000..a2da2df899
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.c
@@ -0,0 +1,6437 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/grain_table.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom_dsp/psnr.h"
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/cdef.h"
+#include "av1/common/filter.h"
+#include "av1/common/idct.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/resize.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/aq_complexity.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/ethread.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/grain_test_vectors.h"
+#include "av1/encoder/hash_motion.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/temporal_filter.h"
+
+#define DEFAULT_EXPLICIT_ORDER_HINT_BITS 7
+
+// av1 uses 10,000,000 ticks/second as time stamp
+#define TICKS_PER_SEC 10000000LL
+
+#if CONFIG_ENTROPY_STATS
+FRAME_COUNTS aggregate_fc;
+#endif // CONFIG_ENTROPY_STATS
+
+#define AM_SEGMENT_ID_INACTIVE 7
+#define AM_SEGMENT_ID_ACTIVE 0
+
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
+
+// #define OUTPUT_YUV_REC
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
+#ifdef OUTPUT_YUV_REC
+FILE *yuv_rec_file;
+#define FILE_NAME_LEN 100
+#endif
+
+// Converts a scaling mode into a ratio *hr / *hs: NORMAL 1/1, FOURFIVE 4/5,
+// THREEFIVE 3/5, ONETWO 1/2. An unrecognised mode yields 1/1 and trips an
+// assertion.
+static INLINE void Scale2Ratio(AOM_SCALING mode, int *hr, int *hs) {
+  int num = 1;
+  int den = 1;
+  int recognised = 1;
+
+  switch (mode) {
+    case NORMAL: break;
+    case FOURFIVE:
+      num = 4;
+      den = 5;
+      break;
+    case THREEFIVE:
+      num = 3;
+      den = 5;
+      break;
+    case ONETWO: den = 2; break;
+    default: recognised = 0; break;
+  }
+
+  *hr = num;
+  *hs = den;
+  if (!recognised) assert(0);
+}
+
+// Marks all inactive blocks as active. Other segmentation features may be
+// set, so memset cannot be used; only entries equal to
+// AM_SEGMENT_ID_INACTIVE are rewritten. Does nothing unless the active map is
+// enabled or has a pending update.
+static void suppress_active_map(AV1_COMP *cpi) {
+  if (!cpi->active_map.enabled && !cpi->active_map.update) return;
+
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const int num_mi = cpi->common.mi_rows * cpi->common.mi_cols;
+  for (int idx = 0; idx < num_mi; ++idx) {
+    if (seg_map[idx] == AM_SEGMENT_ID_INACTIVE) {
+      seg_map[idx] = AM_SEGMENT_ID_ACTIVE;
+    }
+  }
+}
+
+// Applies a pending active-map update to the segmentation state.
+//
+// On intra-only frames the map is first disabled and an update is forced.
+// With a pending update and the map enabled, entries of the segmentation map
+// equal to AM_SEGMENT_ID_ACTIVE are replaced by the active-map value, and the
+// skip and loop-filter features are enabled on AM_SEGMENT_ID_INACTIVE with
+// loop-filter data of -MAX_LOOP_FILTER. With the map disabled those features
+// are turned off again. The update flag is cleared afterwards.
+static void apply_active_map(AV1_COMP *cpi) {
+  struct segmentation *const seg = &cpi->common.seg;
+  unsigned char *const seg_map = cpi->segmentation_map;
+  const unsigned char *const active_map = cpi->active_map.map;
+  const int inactive = AM_SEGMENT_ID_INACTIVE;
+
+  assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE);
+
+  if (frame_is_intra_only(&cpi->common)) {
+    cpi->active_map.enabled = 0;
+    cpi->active_map.update = 1;
+  }
+
+  if (!cpi->active_map.update) return;
+
+  if (cpi->active_map.enabled) {
+    const int num_mi = cpi->common.mi_rows * cpi->common.mi_cols;
+    for (int idx = 0; idx < num_mi; ++idx) {
+      if (seg_map[idx] == AM_SEGMENT_ID_ACTIVE) seg_map[idx] = active_map[idx];
+    }
+    av1_enable_segmentation(seg);
+    av1_enable_segfeature(seg, inactive, SEG_LVL_SKIP);
+    av1_enable_segfeature(seg, inactive, SEG_LVL_ALT_LF_Y_H);
+    av1_enable_segfeature(seg, inactive, SEG_LVL_ALT_LF_Y_V);
+    av1_enable_segfeature(seg, inactive, SEG_LVL_ALT_LF_U);
+    av1_enable_segfeature(seg, inactive, SEG_LVL_ALT_LF_V);
+
+    av1_set_segdata(seg, inactive, SEG_LVL_ALT_LF_Y_H, -MAX_LOOP_FILTER);
+    av1_set_segdata(seg, inactive, SEG_LVL_ALT_LF_Y_V, -MAX_LOOP_FILTER);
+    av1_set_segdata(seg, inactive, SEG_LVL_ALT_LF_U, -MAX_LOOP_FILTER);
+    av1_set_segdata(seg, inactive, SEG_LVL_ALT_LF_V, -MAX_LOOP_FILTER);
+  } else {
+    av1_disable_segfeature(seg, inactive, SEG_LVL_SKIP);
+    av1_disable_segfeature(seg, inactive, SEG_LVL_ALT_LF_Y_H);
+    av1_disable_segfeature(seg, inactive, SEG_LVL_ALT_LF_Y_V);
+    av1_disable_segfeature(seg, inactive, SEG_LVL_ALT_LF_U);
+    av1_disable_segfeature(seg, inactive, SEG_LVL_ALT_LF_V);
+    if (seg->enabled) {
+      seg->update_data = 1;
+      seg->update_map = 1;
+    }
+  }
+
+  cpi->active_map.update = 0;
+}
+
+// Installs a new active map given at 16x16 granularity.
+//
+// Returns -1 without touching anything when `rows` x `cols` does not match
+// the frame's mb_rows x mb_cols. Otherwise flags an update and returns 0:
+// a NULL map disables the active map; a non-NULL map is expanded to one entry
+// per mi unit (non-zero -> AM_SEGMENT_ID_ACTIVE, zero ->
+// AM_SEGMENT_ID_INACTIVE) and the active map is enabled.
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (rows != cm->mb_rows || cols != cm->mb_cols) return -1;
+
+  cpi->active_map.update = 1;
+  if (new_map_16x16 == NULL) {
+    cpi->active_map.enabled = 0;
+    return 0;
+  }
+
+  unsigned char *const active_map_8x8 = cpi->active_map.map;
+  const int mi_rows = cm->mi_rows;
+  const int mi_cols = cm->mi_cols;
+  // Shift that maps an mi coordinate onto its 16x16 cell.
+  const int row_shift = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+  const int col_shift = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+  for (int r = 0; r < mi_rows; ++r) {
+    const int src_row = (r >> row_shift) * cols;
+    const int dst_row = r * mi_cols;
+    for (int c = 0; c < mi_cols; ++c) {
+      active_map_8x8[dst_row + c] = new_map_16x16[src_row + (c >> col_shift)]
+                                        ? AM_SEGMENT_ID_ACTIVE
+                                        : AM_SEGMENT_ID_INACTIVE;
+    }
+  }
+  cpi->active_map.enabled = 1;
+  return 0;
+}
+
+// Exports the current active map at 16x16 granularity into `new_map_16x16`.
+//
+// Returns -1 when the map pointer is NULL or `rows` x `cols` does not match
+// the frame's mb_rows x mb_cols. Otherwise returns 0: with the active map
+// disabled every cell is 1; with it enabled a cell is 1 when any mi unit that
+// maps onto it has a segment id other than AM_SEGMENT_ID_INACTIVE.
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *new_map_16x16, int rows,
+                       int cols) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (rows != cm->mb_rows || cols != cm->mb_cols || !new_map_16x16) return -1;
+
+  memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+  if (!cpi->active_map.enabled) return 0;
+
+  const unsigned char *const seg_map_8x8 = cpi->segmentation_map;
+  const int mi_rows = cm->mi_rows;
+  const int mi_cols = cm->mi_cols;
+  // Shift that maps an mi coordinate onto its 16x16 cell.
+  const int row_shift = mi_size_high[BLOCK_16X16] == 2 ? 1 : 2;
+  const int col_shift = mi_size_wide[BLOCK_16X16] == 2 ? 1 : 2;
+
+  for (int r = 0; r < mi_rows; ++r) {
+    const int dst_row = (r >> row_shift) * cols;
+    const int src_row = r * mi_cols;
+    for (int c = 0; c < mi_cols; ++c) {
+      // Cyclic refresh segments are considered active despite not having
+      // AM_SEGMENT_ID_ACTIVE
+      new_map_16x16[dst_row + (c >> col_shift)] |=
+          seg_map_8x8[src_row + c] != AM_SEGMENT_ID_INACTIVE;
+    }
+  }
+  return 0;
+}
+
+// Enables high-precision motion vectors only when they are requested and the
+// frame does not force integer motion vectors, then points the macroblock's
+// mv_cost_stack at the matching cost tables (nmvcost_hp or nmvcost).
+static void set_high_precision_mv(AV1_COMP *cpi, int allow_high_precision_mv,
+                                  int cur_frame_force_integer_mv) {
+  MACROBLOCK *const mb = &cpi->td.mb;
+  const int use_hp =
+      allow_high_precision_mv && cur_frame_force_integer_mv == 0;
+
+  cpi->common.allow_high_precision_mv = use_hp;
+  if (use_hp) {
+    mb->mv_cost_stack = mb->nmvcost_hp;
+  } else {
+    mb->mv_cost_stack = mb->nmvcost;
+  }
+}
+
+// Chooses the superblock size for the sequence from the encoder config.
+//
+// An explicit 64X64 request always wins. An explicit 128X128 request is
+// honoured directly, except that under CONFIG_FILEOPTIONS it additionally
+// requires cm->options->ext_partition. In DYNAMIC mode the result is 128X128
+// unless a heuristic below picks 64X64.
+//
+// NOTE(review): with CONFIG_FILEOPTIONS, a 128X128 request whose
+// ext_partition guard fails falls through to the DYNAMIC assert below, which
+// would then fire in assert-enabled builds -- confirm this is intended.
+static BLOCK_SIZE select_sb_size(const AV1_COMP *const cpi) {
+ const AV1_COMMON *const cm = &cpi->common;
+
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_64X64)
+ return BLOCK_64X64;
+ // When CONFIG_FILEOPTIONS is set, the next `if` guards the 128X128 check
+ // that follows it; otherwise that check stands alone.
+#if CONFIG_FILEOPTIONS
+ if (cm->options && cm->options->ext_partition)
+#endif
+ if (cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_128X128)
+ return BLOCK_128X128;
+
+ assert(cpi->oxcf.superblock_size == AOM_SUPERBLOCK_SIZE_DYNAMIC);
+
+// TODO(any): Possibly could improve this with a heuristic.
+#if CONFIG_FILEOPTIONS
+ if (cm->options && !cm->options->ext_partition) return BLOCK_64X64;
+#endif
+
+ // When superres / resize is on, 'cm->width / height' can change between
+ // calls, so we don't apply this heuristic there. Also, this heuristic gives
+ // compression gain for speed >= 2 only.
+ if (cpi->oxcf.superres_mode == SUPERRES_NONE &&
+ cpi->oxcf.resize_mode == RESIZE_NONE && cpi->oxcf.speed >= 2) {
+ return (cm->width >= 480 && cm->height >= 360) ? BLOCK_128X128
+ : BLOCK_64X64;
+ }
+
+ return BLOCK_128X128;
+}
+
+ // Prepares per-frame coding state before encoding: selects the frame
+ // context index and primary reference frame, forces refresh flags and the
+ // superblock size on shown key frames and S-frames, and either resets the
+ // entropy context or loads it from the primary reference.
+ static void setup_frame(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ // Set up entropy context depending on frame type. The decoder mandates
+ // the use of the default context, index 0, for keyframes and inter
+ // frames where the error_resilient_mode or intra_only flag is set. For
+ // other inter-frames the encoder currently uses only two contexts;
+ // context 1 for ALTREF frames and context 0 for the others.
+
+ // Start with no primary reference; the inter path below may override it.
+ cm->primary_ref_frame = PRIMARY_REF_NONE;
+ if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
+ cm->force_primary_ref_none) {
+ av1_setup_past_independence(cm);
+ // Forget which buffer holds each context type ...
+ for (int i = 0; i < REF_FRAMES; i++) {
+ cm->fb_of_context_type[i] = -1;
+ }
+ // ... then record the buffer for REGULAR_FRAME: GOLDEN's map index when
+ // the frame is shown, ALTREF's otherwise.
+ cm->fb_of_context_type[REGULAR_FRAME] =
+ cm->show_frame ? get_ref_frame_map_idx(cpi, GOLDEN_FRAME)
+ : get_ref_frame_map_idx(cpi, ALTREF_FRAME);
+ cm->frame_context_idx = REGULAR_FRAME;
+ } else {
+ // Pick the context type from the GF-group update type and the refresh
+ // flags; earlier tests in the chain take priority.
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE)
+ cm->frame_context_idx = EXT_ARF_FRAME;
+ else if (cpi->refresh_alt_ref_frame)
+ cm->frame_context_idx = ARF_FRAME;
+ else if (cpi->rc.is_src_frame_alt_ref)
+ cm->frame_context_idx = OVERLAY_FRAME;
+ else if (cpi->refresh_golden_frame)
+ cm->frame_context_idx = GLD_FRAME;
+ else if (cpi->refresh_bwd_ref_frame)
+ cm->frame_context_idx = BRF_FRAME;
+ else
+ cm->frame_context_idx = REGULAR_FRAME;
+ // Use the reference whose buffer index equals the one recorded for this
+ // context type. The loop does not break, so the last match wins; if no
+ // reference matches, primary_ref_frame stays PRIMARY_REF_NONE.
+ int wanted_fb = cm->fb_of_context_type[cm->frame_context_idx];
+ for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ int fb = get_ref_frame_map_idx(cpi, ref_frame);
+ if (fb == wanted_fb) {
+ cm->primary_ref_frame = ref_frame - LAST_FRAME;
+ }
+ }
+ }
+
+ if (cm->frame_type == KEY_FRAME && cm->show_frame) {
+ // Shown key frame: refresh golden and alt-ref, clear filter statistics
+ // and re-select the superblock size.
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ set_use_reference_buffer(cm, 0);
+ } else if (frame_is_sframe(cm)) {
+ // S-frame: same as the key-frame path except set_use_reference_buffer()
+ // is not called.
+ cpi->refresh_golden_frame = 1;
+ cpi->refresh_alt_ref_frame = 1;
+ av1_zero(cpi->interp_filter_selected);
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ } else {
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ // No usable primary reference: reset contexts and force the
+ // segmentation map and data to be updated.
+ av1_setup_past_independence(cm);
+ cm->seg.update_map = 1;
+ cm->seg.update_data = 1;
+ } else {
+ // Inherit the entropy context saved for the primary reference's buffer.
+ *cm->fc = cm->frame_contexts[cm->frame_refs[cm->primary_ref_frame].idx];
+ }
+ // Only element [0] of interp_filter_selected is cleared on this path,
+ // unlike the key-frame / S-frame paths which clear the whole array.
+ av1_zero(cpi->interp_filter_selected[0]);
+ }
+
+ cm->prev_frame = get_prev_frame(cm);
+ cpi->vaq_refresh = 0;
+ }
+
+ // Resets the encoder's mode-info storage for a new frame: the current mip
+ // array and grid are zeroed in full, while only the top border row and the
+ // left border column of the previous-frame mip array are cleared.
+ static void enc_setup_mi(AV1_COMMON *cm) {
+   const int aligned_rows = calc_mi_size(cm->mi_rows);
+
+   // Current-frame mode info: rewind the visible pointer and wipe everything.
+   cm->mi = cm->mip;
+   memset(cm->mip, 0, cm->mi_stride * aligned_rows * sizeof(*cm->mip));
+
+   // Previous-frame mode info: keep the interior, clear only the borders.
+   cm->prev_mi = cm->prev_mip;
+   memset(cm->prev_mip, 0, sizeof(*cm->prev_mip) * cm->mi_stride);  // top row
+   for (int row = 0; row < aligned_rows; ++row) {
+     // First entry of each row forms the left border column.
+     memset(&cm->prev_mip[row * cm->mi_stride], 0, sizeof(*cm->prev_mip));
+   }
+
+   // Grids: rewind both visible pointers, wipe the current grid only.
+   cm->mi_grid_visible = cm->mi_grid_base;
+   cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+   memset(cm->mi_grid_base, 0,
+          cm->mi_stride * aligned_rows * sizeof(*cm->mi_grid_base));
+ }
+
+ // Allocates zero-initialised current/previous mode-info arrays and their
+ // pointer grids, each holding mi_size entries.
+ // Returns 0 on success and 1 as soon as any allocation fails; buffers
+ // allocated before the failure are left in cm for the caller to release.
+ static int enc_alloc_mi(AV1_COMMON *cm, int mi_size) {
+   cm->mip = aom_calloc(mi_size, sizeof(*cm->mip));
+   if (cm->mip == NULL) return 1;
+
+   cm->prev_mip = aom_calloc(mi_size, sizeof(*cm->prev_mip));
+   if (cm->prev_mip == NULL) return 1;
+
+   // Recorded once both mode-info arrays exist.
+   cm->mi_alloc_size = mi_size;
+
+   cm->mi_grid_base = aom_calloc(mi_size, sizeof(*cm->mi_grid_base));
+   if (cm->mi_grid_base == NULL) return 1;
+
+   cm->prev_mi_grid_base = aom_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
+   if (cm->prev_mi_grid_base == NULL) return 1;
+
+   return 0;
+ }
+
+ // Releases the mode-info arrays and grids allocated by enc_alloc_mi() and
+ // resets the owning pointers and the recorded allocation size.
+ static void enc_free_mi(AV1_COMMON *cm) {
+   aom_free(cm->mip);
+   aom_free(cm->prev_mip);
+   aom_free(cm->mi_grid_base);
+   aom_free(cm->prev_mi_grid_base);
+
+   // Clear every owner so a later free or allocation sees a clean state.
+   cm->mip = NULL;
+   cm->prev_mip = NULL;
+   cm->mi_grid_base = NULL;
+   cm->prev_mi_grid_base = NULL;
+   cm->mi_alloc_size = 0;
+ }
+
+ // Exchanges the current and previous mode-info storage (arrays and grids)
+ // so that the current frame's data becomes the "previous" data for the next
+ // frame, then re-derives the visible pointers from the swapped bases.
+ static void swap_mi_and_prev_mi(AV1_COMMON *cm) {
+   MB_MODE_INFO *const cur_mip = cm->mip;
+   MB_MODE_INFO **const cur_grid = cm->mi_grid_base;
+
+   // Swap the mode-info arrays.
+   cm->mip = cm->prev_mip;
+   cm->prev_mip = cur_mip;
+
+   // Swap the pointer grids.
+   cm->mi_grid_base = cm->prev_mi_grid_base;
+   cm->prev_mi_grid_base = cur_grid;
+
+   // Update the upper left visible macroblock ptrs.
+   cm->mi = cm->mip;
+   cm->prev_mi = cm->prev_mip;
+   cm->mi_grid_visible = cm->mi_grid_base;
+   cm->prev_mi_grid_visible = cm->prev_mi_grid_base;
+ }
+
+ // One-time global encoder initialisation. Runs the three *_rtcd() setup
+ // routines first, then the table/LUT initialisers for intra predictors,
+ // motion estimation, rate-control min-q and wedge masks.
+ // NOTE(review): presumably the rtcd calls must precede the others because
+ // they install the function pointers the later initialisers use — confirm.
+ void av1_initialize_enc(void) {
+ av1_rtcd();
+ aom_dsp_rtcd();
+ aom_scale_rtcd();
+ av1_init_intra_predictors();
+ av1_init_me_luts();
+ av1_rc_init_minq_luts();
+ av1_init_wedge_masks();
+ }
+
+ // Frees the extended per-mode-info buffer and clears its owning pointer.
+ // No NULL guard is needed: aom_free() is already called unguarded on
+ // possibly-NULL pointers elsewhere in this file.
+ static void dealloc_context_buffers_ext(AV1_COMP *cpi) {
+   aom_free(cpi->mbmi_ext_base);
+   cpi->mbmi_ext_base = NULL;
+ }
+
+ // (Re)allocates the zero-initialised extended per-mode-info buffer with one
+ // entry per mode-info unit (mi_rows * mi_cols). Any previous buffer is
+ // released first; allocation failure is reported via CHECK_MEM_ERROR.
+ static void alloc_context_buffers_ext(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_mi_units = cm->mi_rows * cm->mi_cols;
+
+   dealloc_context_buffers_ext(cpi);
+
+   CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+                   aom_calloc(num_mi_units, sizeof(*cpi->mbmi_ext_base)));
+ }
+
+ // Copies the new encoder configuration into cpi and refreshes the film
+ // grain state accordingly:
+ //  - a non-zero film_grain_test_vector selects one of the built-in test
+ //    vectors (copied on key frames only);
+ //  - otherwise a film_grain_table_filename causes a grain table to be
+ //    allocated and read from that file;
+ //  - otherwise film grain is disabled and the parameters are zeroed.
+ // Any previously loaded grain table is released first. Allocation failure
+ // is reported through aom_internal_error() with AOM_CODEC_MEM_ERROR.
+ static void update_film_grain_parameters(struct AV1_COMP *cpi,
+                                          const AV1EncoderConfig *oxcf) {
+   AV1_COMMON *const cm = &cpi->common;
+   cpi->oxcf = *oxcf;
+
+   // Drop the table from any earlier configuration (contents, then struct).
+   if (cpi->film_grain_table) {
+     aom_film_grain_table_free(cpi->film_grain_table);
+     aom_free(cpi->film_grain_table);
+     cpi->film_grain_table = NULL;
+   }
+
+   if (oxcf->film_grain_test_vector) {
+     cm->seq_params.film_grain_params_present = 1;
+     if (cm->frame_type == KEY_FRAME) {
+       // film_grain_test_vector is 1-based; 0 means "no test vector".
+       memcpy(&cm->film_grain_params,
+              film_grain_test_vectors + oxcf->film_grain_test_vector - 1,
+              sizeof(cm->film_grain_params));
+
+       cm->film_grain_params.bit_depth = cm->seq_params.bit_depth;
+       if (cm->seq_params.color_range == AOM_CR_FULL_RANGE) {
+         cm->film_grain_params.clip_to_restricted_range = 0;
+       }
+     }
+   } else if (oxcf->film_grain_table_filename) {
+     // Zero-initialised allocation; the result must be checked before use
+     // (the previous malloc + memset dereferenced NULL on failure).
+     cpi->film_grain_table = aom_calloc(1, sizeof(*cpi->film_grain_table));
+     if (!cpi->film_grain_table) {
+       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                          "Failed to allocate film grain table");
+       // Covers the case where aom_internal_error() returns to the caller.
+       return;
+     }
+
+     aom_film_grain_table_read(cpi->film_grain_table,
+                               oxcf->film_grain_table_filename, &cm->error);
+   } else {
+     cm->seq_params.film_grain_params_present = 0;
+     memset(&cm->film_grain_params, 0, sizeof(cm->film_grain_params));
+   }
+ }
+
+ // Releases every buffer owned by the compressor instance and clears the
+ // owning pointers so that a repeated call (or a later NULL check) is safe.
+ static void dealloc_compressor_data(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_planes = av1_num_planes(cm);
+
+   dealloc_context_buffers_ext(cpi);
+
+   aom_free(cpi->tile_data);
+   cpi->tile_data = NULL;
+
+   // Delete segmentation map
+   aom_free(cpi->segmentation_map);
+   cpi->segmentation_map = NULL;
+
+   av1_cyclic_refresh_free(cpi->cyclic_refresh);
+   cpi->cyclic_refresh = NULL;
+
+   aom_free(cpi->active_map.map);
+   cpi->active_map.map = NULL;
+
+   aom_free(cpi->td.mb.above_pred_buf);
+   cpi->td.mb.above_pred_buf = NULL;
+
+   aom_free(cpi->td.mb.left_pred_buf);
+   cpi->td.mb.left_pred_buf = NULL;
+
+   aom_free(cpi->td.mb.wsrc_buf);
+   cpi->td.mb.wsrc_buf = NULL;
+
+   for (int i = 0; i < 2; i++) {
+     for (int j = 0; j < 2; j++) {
+       aom_free(cpi->td.mb.hash_value_buffer[i][j]);
+       cpi->td.mb.hash_value_buffer[i][j] = NULL;
+     }
+   }
+   aom_free(cpi->td.mb.mask_buf);
+   cpi->td.mb.mask_buf = NULL;
+
+   aom_free(cm->tpl_mvs);
+   cm->tpl_mvs = NULL;
+
+   av1_free_ref_frame_buffers(cm->buffer_pool);
+   av1_free_txb_buf(cpi);
+   av1_free_context_buffers(cm);
+
+   aom_free_frame_buffer(&cpi->last_frame_uf);
+   av1_free_restoration_buffers(cm);
+   aom_free_frame_buffer(&cpi->trial_frame_rst);
+   aom_free_frame_buffer(&cpi->scaled_source);
+   aom_free_frame_buffer(&cpi->scaled_last_source);
+   aom_free_frame_buffer(&cpi->alt_ref_buffer);
+   av1_lookahead_destroy(cpi->lookahead);
+   // alloc_raw_frame_buffers() tests this pointer for NULL, so it must not
+   // be left dangling.
+   cpi->lookahead = NULL;
+
+   aom_free(cpi->tile_tok[0][0]);
+   cpi->tile_tok[0][0] = NULL;
+
+   aom_free(cpi->tplist[0][0]);
+   cpi->tplist[0][0] = NULL;
+
+   av1_free_pc_tree(&cpi->td, num_planes);
+
+   aom_free(cpi->td.mb.palette_buffer);
+   cpi->td.mb.palette_buffer = NULL;
+
+   aom_free(cpi->td.mb.tmp_conv_dst);
+   cpi->td.mb.tmp_conv_dst = NULL;
+   for (int j = 0; j < 2; ++j) {
+     aom_free(cpi->td.mb.tmp_obmc_bufs[j]);
+     cpi->td.mb.tmp_obmc_bufs[j] = NULL;
+   }
+
+ #if CONFIG_DENOISE
+   if (cpi->denoise_and_model) {
+     aom_denoise_and_model_free(cpi->denoise_and_model);
+     cpi->denoise_and_model = NULL;
+   }
+ #endif
+   if (cpi->film_grain_table) {
+     // aom_film_grain_table_free() is followed by aom_free() of the struct
+     // in update_film_grain_parameters(); do the same here so the struct
+     // allocated there is not leaked.
+     aom_film_grain_table_free(cpi->film_grain_table);
+     aom_free(cpi->film_grain_table);
+     cpi->film_grain_table = NULL;
+   }
+ }
+
+ // Takes a snapshot of the state that a re-code loop may modify: the MV
+ // cost tables and the current frame entropy context. The snapshot lives in
+ // cpi->coding_context and is put back by restore_coding_context().
+ static void save_coding_context(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   CODING_CONTEXT *const snapshot = &cpi->coding_context;
+
+   // Entropy context of the frame being coded.
+   snapshot->fc = *cm->fc;
+
+   // Motion vector cost tables.
+   av1_copy(snapshot->nmv_vec_cost, cpi->td.mb.nmv_vec_cost);
+   av1_copy(snapshot->nmv_costs, cpi->nmv_costs);
+   av1_copy(snapshot->nmv_costs_hp, cpi->nmv_costs_hp);
+ }
+
+ // Puts back the state captured by the most recent save_coding_context()
+ // call: the MV cost tables and the current frame entropy context.
+ static void restore_coding_context(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   CODING_CONTEXT *const snapshot = &cpi->coding_context;
+
+   // Entropy context of the frame being coded.
+   *cm->fc = snapshot->fc;
+
+   // Motion vector cost tables.
+   av1_copy(cpi->td.mb.nmv_vec_cost, snapshot->nmv_vec_cost);
+   av1_copy(cpi->nmv_costs, snapshot->nmv_costs);
+   av1_copy(cpi->nmv_costs_hp, snapshot->nmv_costs_hp);
+ }
+
+ // Configures segmentation for the current frame from static-content
+ // statistics. Four cases are handled:
+ //  - key frame: segmentation is disabled and all state cleared;
+ //  - alt-ref refresh: state is cleared, av1_update_mbgraph_stats() may
+ //    re-enable segmentation, and if so segment 1 gets Q / loop-filter deltas;
+ //  - other frames with segmentation enabled: features are set, cleared or
+ //    left untouched depending on the position in the GF/ARF group;
+ //  - other frames with segmentation disabled: nothing is changed.
+ static void configure_static_seg_features(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ struct segmentation *const seg = &cm->seg;
+
+ // 1 when the running average Q exceeds 48.0.
+ int high_q = (int)(rc->avg_q > 48.0);
+ int qi_delta;
+
+ // Disable and clear down for KF
+ if (cm->frame_type == KEY_FRAME) {
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation
+ av1_disable_segmentation(seg);
+
+ // Clear down the segment features.
+ av1_clearall_segfeatures(seg);
+ } else if (cpi->refresh_alt_ref_frame) {
+ // If this is an alt ref frame
+ // Clear down the global segmentation map
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+ seg->update_map = 0;
+ seg->update_data = 0;
+ cpi->static_mb_pct = 0;
+
+ // Disable segmentation and individual segment features by default
+ av1_disable_segmentation(seg);
+ av1_clearall_segfeatures(seg);
+
+ // Scan frames from current to arf frame.
+ // This function re-enables segmentation if appropriate.
+ av1_update_mbgraph_stats(cpi);
+
+ // If segmentation was enabled set those features needed for the
+ // arf itself.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+
+ // Q delta for segment 1 targets 0.875 * avg_q, with a further -2.
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875,
+ cm->seq_params.bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+ }
+ } else if (seg->enabled) {
+ // All other frames if segmentation has been enabled
+
+ // First normal frame in a valid gf or alt ref group
+ if (rc->frames_since_golden == 0) {
+ // Set up segment features for normal frames in an arf group
+ if (rc->source_alt_ref_active) {
+ // The map is kept; only the per-segment data is resent.
+ seg->update_map = 0;
+ seg->update_data = 1;
+
+ // Q delta for segment 1 targets 1.125 * avg_q, with a further +2.
+ qi_delta = av1_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125,
+ cm->seq_params.bit_depth);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_Q);
+
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_H, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_Y_V, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_U, -2);
+ av1_set_segdata(seg, 1, SEG_LVL_ALT_LF_V, -2);
+
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_H);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_Y_V);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_U);
+ av1_enable_segfeature(seg, 1, SEG_LVL_ALT_LF_V);
+
+ // Segment coding disabled for compred testing
+ // At high Q, or when static_mb_pct is 100, segment 1 is additionally
+ // pinned to ALTREF_FRAME with the skip feature enabled.
+ if (high_q || (cpi->static_mb_pct == 100)) {
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ } else {
+ // Disable segmentation and clear down features if alt ref
+ // is not active for this group
+
+ av1_disable_segmentation(seg);
+
+ memset(cpi->segmentation_map, 0, cm->mi_rows * cm->mi_cols);
+
+ seg->update_map = 0;
+ seg->update_data = 0;
+
+ av1_clearall_segfeatures(seg);
+ }
+ } else if (rc->is_src_frame_alt_ref) {
+ // Special case where we are coding over the top of a previous
+ // alt ref frame.
+ // Segment coding disabled for compred testing
+
+ // Enable ref frame features for segment 0 as well
+ av1_enable_segfeature(seg, 0, SEG_LVL_REF_FRAME);
+ av1_enable_segfeature(seg, 1, SEG_LVL_REF_FRAME);
+
+ // All mbs should use ALTREF_FRAME
+ av1_clear_segdata(seg, 0, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 0, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+ av1_clear_segdata(seg, 1, SEG_LVL_REF_FRAME);
+ av1_set_segdata(seg, 1, SEG_LVL_REF_FRAME, ALTREF_FRAME);
+
+ // Skip all MBs if high Q (0,0 mv and skip coeffs)
+ if (high_q) {
+ av1_enable_segfeature(seg, 0, SEG_LVL_SKIP);
+ av1_enable_segfeature(seg, 1, SEG_LVL_SKIP);
+ }
+ // Enable data update
+ // Note: seg->update_map is not modified on this path.
+ seg->update_data = 1;
+ } else {
+ // All other frames.
+
+ // No updates.. leave things as they are.
+ seg->update_map = 0;
+ seg->update_data = 0;
+ }
+ }
+ }
+
+ // Copies the segment id of every visible mode-info unit into
+ // cm->current_frame_seg_map. The source grid is walked with a row pitch of
+ // mi_stride, while the destination map is packed with mi_cols entries per
+ // row.
+ static void update_reference_segmentation_map(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+
+   for (int r = 0; r < cm->mi_rows; ++r) {
+     MB_MODE_INFO **const grid_row = cm->mi_grid_visible + r * cm->mi_stride;
+     uint8_t *const seg_row = cm->current_frame_seg_map + r * cm->mi_cols;
+
+     for (int c = 0; c < cm->mi_cols; ++c) {
+       seg_row[c] = grid_row[c]->segment_id;
+     }
+   }
+ }
+
+ // Allocates the buffers that hold raw (source-resolution) frames: the
+ // lookahead queue, created only if it does not exist yet, and the alt-ref
+ // buffer. Failures are reported with AOM_CODEC_MEM_ERROR.
+ static void alloc_raw_frame_buffers(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const SequenceHeader *const seq = &cm->seq_params;
+   const AV1EncoderConfig *const cfg = &cpi->oxcf;
+
+   if (cpi->lookahead == NULL) {
+     cpi->lookahead = av1_lookahead_init(
+         cfg->width, cfg->height, seq->subsampling_x, seq->subsampling_y,
+         seq->use_highbitdepth, cfg->lag_in_frames);
+     if (cpi->lookahead == NULL) {
+       aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                          "Failed to allocate lag buffers");
+     }
+   }
+
+   // TODO(agrange) Check if ARF is enabled and skip allocation if not.
+   const int alt_ref_failed = aom_realloc_frame_buffer(
+       &cpi->alt_ref_buffer, cfg->width, cfg->height, seq->subsampling_x,
+       seq->subsampling_y, seq->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+       cm->byte_alignment, NULL, NULL, NULL);
+   if (alt_ref_failed) {
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate altref buffer");
+   }
+ }
+
+ // (Re)allocates the encoder's working frame buffers. last_frame_uf,
+ // scaled_source and scaled_last_source use the coded size (cm->width x
+ // cm->height); trial_frame_rst uses the superres-upscaled size. Each
+ // failure is reported with AOM_CODEC_MEM_ERROR.
+ static void alloc_util_frame_buffers(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const SequenceHeader *const seq = &cm->seq_params;
+
+   // Last frame (see the error message below).
+   if (aom_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height,
+                                seq->subsampling_x, seq->subsampling_y,
+                                seq->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                                cm->byte_alignment, NULL, NULL, NULL)) {
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate last frame buffer");
+   }
+
+   // Trial restored frame, at upscaled resolution.
+   if (aom_realloc_frame_buffer(&cpi->trial_frame_rst,
+                                cm->superres_upscaled_width,
+                                cm->superres_upscaled_height,
+                                seq->subsampling_x, seq->subsampling_y,
+                                seq->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                                cm->byte_alignment, NULL, NULL, NULL)) {
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate trial restored frame buffer");
+   }
+
+   // Scaled copy of the current source.
+   if (aom_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height,
+                                seq->subsampling_x, seq->subsampling_y,
+                                seq->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+                                cm->byte_alignment, NULL, NULL, NULL)) {
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate scaled source buffer");
+   }
+
+   // Scaled copy of the previous source.
+   if (aom_realloc_frame_buffer(&cpi->scaled_last_source, cm->width,
+                                cm->height, seq->subsampling_x,
+                                seq->subsampling_y, seq->use_highbitdepth,
+                                AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL,
+                                NULL, NULL)) {
+     aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                        "Failed to allocate scaled last source buffer");
+   }
+ }
+
+ // Allocates the size-dependent encoder data for the current frame
+ // dimensions: common context buffers, transform-block buffers, the extended
+ // mode-info buffer, the token buffer, the per-superblock-row token list and
+ // the partition-context tree. Existing token buffers are released first.
+ static void alloc_compressor_data(AV1_COMP *cpi) {
+   AV1_COMMON *const cm = &cpi->common;
+   const int num_planes = av1_num_planes(cm);
+
+   av1_alloc_context_buffers(cm, cm->width, cm->height);
+
+   // Number of superblock rows: mi_rows rounded up to a whole superblock.
+   const int aligned_mi_rows =
+       ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+   const int num_sb_rows = aligned_mi_rows >> cm->seq_params.mib_size_log2;
+
+   av1_alloc_txb_buf(cpi);
+
+   alloc_context_buffers_ext(cpi);
+
+   // Token storage.
+   aom_free(cpi->tile_tok[0][0]);
+   const unsigned int num_tokens =
+       get_token_alloc(cm->mb_rows, cm->mb_cols, MAX_SB_SIZE_LOG2, num_planes);
+   CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
+                   aom_calloc(num_tokens, sizeof(*cpi->tile_tok[0][0])));
+
+   // Token list: one entry per superblock row for every possible tile.
+   aom_free(cpi->tplist[0][0]);
+   CHECK_MEM_ERROR(cm, cpi->tplist[0][0],
+                   aom_calloc(num_sb_rows * MAX_TILE_ROWS * MAX_TILE_COLS,
+                              sizeof(*cpi->tplist[0][0])));
+
+   av1_setup_pc_tree(cm, &cpi->td);
+ }
+
+ // Sets the encoder frame rate and lets rate control recompute its
+ // frame-rate dependent values. Rates below 0.1 are replaced by 30.
+ void av1_new_framerate(AV1_COMP *cpi, double framerate) {
+   if (framerate < 0.1) {
+     cpi->framerate = 30;
+   } else {
+     cpi->framerate = framerate;
+   }
+   av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height);
+ }
+
+ // Configures the tile layout in cm from the encoder configuration. Uniform
+ // spacing is used when either explicit tile size list is empty; otherwise
+ // tile column/row start positions are built from the explicit width and
+ // height lists (in superblock units).
+ static void set_tile_info(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int i, start_sb;
+
+ av1_get_tile_limits(cm);
+
+ // configure tile columns
+ if (cpi->oxcf.tile_width_count == 0 || cpi->oxcf.tile_height_count == 0) {
+ // Uniform spacing: clamp the requested log2 column count to
+ // [min_log2_tile_cols, max_log2_tile_cols].
+ cm->uniform_tile_spacing_flag = 1;
+ cm->log2_tile_cols = AOMMAX(cpi->oxcf.tile_columns, cm->min_log2_tile_cols);
+ cm->log2_tile_cols = AOMMIN(cm->log2_tile_cols, cm->max_log2_tile_cols);
+ } else {
+ int mi_cols = ALIGN_POWER_OF_TWO(cm->mi_cols, cm->seq_params.mib_size_log2);
+ int sb_cols = mi_cols >> cm->seq_params.mib_size_log2;
+ int size_sb, j = 0;
+ cm->uniform_tile_spacing_flag = 0;
+ // Lay out columns until the frame width is covered or MAX_TILE_COLS is
+ // reached. The width list is cycled (j wraps to 0) and each width is
+ // capped at max_tile_width_sb.
+ for (i = 0, start_sb = 0; start_sb < sb_cols && i < MAX_TILE_COLS; i++) {
+ cm->tile_col_start_sb[i] = start_sb;
+ size_sb = cpi->oxcf.tile_widths[j++];
+ if (j >= cpi->oxcf.tile_width_count) j = 0;
+ start_sb += AOMMIN(size_sb, cm->max_tile_width_sb);
+ }
+ cm->tile_cols = i;
+ // Terminating entry marks the right edge of the frame.
+ // NOTE(review): i can equal MAX_TILE_COLS here, so this assumes
+ // tile_col_start_sb has at least MAX_TILE_COLS + 1 entries — confirm.
+ cm->tile_col_start_sb[i] = sb_cols;
+ }
+ av1_calculate_tile_cols(cm);
+
+ // configure tile rows
+ if (cm->uniform_tile_spacing_flag) {
+ // Uniform spacing: clamp the requested log2 row count to
+ // [min_log2_tile_rows, max_log2_tile_rows].
+ cm->log2_tile_rows = AOMMAX(cpi->oxcf.tile_rows, cm->min_log2_tile_rows);
+ cm->log2_tile_rows = AOMMIN(cm->log2_tile_rows, cm->max_log2_tile_rows);
+ } else {
+ int mi_rows = ALIGN_POWER_OF_TWO(cm->mi_rows, cm->seq_params.mib_size_log2);
+ int sb_rows = mi_rows >> cm->seq_params.mib_size_log2;
+ int size_sb, j = 0;
+ // Same scheme as for columns, using the height list and
+ // max_tile_height_sb.
+ for (i = 0, start_sb = 0; start_sb < sb_rows && i < MAX_TILE_ROWS; i++) {
+ cm->tile_row_start_sb[i] = start_sb;
+ size_sb = cpi->oxcf.tile_heights[j++];
+ if (j >= cpi->oxcf.tile_height_count) j = 0;
+ start_sb += AOMMIN(size_sb, cm->max_tile_height_sb);
+ }
+ cm->tile_rows = i;
+ // Terminating entry marks the bottom edge of the frame.
+ // NOTE(review): assumes tile_row_start_sb has at least
+ // MAX_TILE_ROWS + 1 entries — confirm.
+ cm->tile_row_start_sb[i] = sb_rows;
+ }
+ av1_calculate_tile_rows(cm);
+ }
+
+ // Re-derives size-dependent state after cm->width / cm->height change:
+ // calls av1_set_mb_mi() with the new dimensions, re-initialises the context
+ // buffers and the macroblockd, zeroes the extended mode-info buffer and
+ // reconfigures the tile layout.
+ static void update_frame_size(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+
+ av1_set_mb_mi(cm, cm->width, cm->height);
+ av1_init_context_buffers(cm);
+ av1_init_macroblockd(cm, xd, NULL);
+ // NOTE(review): assumes mbmi_ext_base holds at least mi_rows * mi_cols
+ // entries for the updated dimensions — confirm callers reallocate first
+ // when the frame grows.
+ memset(cpi->mbmi_ext_base, 0,
+ cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
+ set_tile_info(cpi);
+ }
+
+ // Resets the reference buffer index table to the identity mapping
+ // (slot i -> buffer i) and clears the rate bookkeeping and the current POC.
+ static void init_buffer_indices(AV1_COMP *cpi) {
+   for (int slot = 0; slot < REF_FRAMES; ++slot) {
+     cpi->ref_fb_idx[slot] = slot;
+   }
+
+   cpi->rate_index = 0;
+   cpi->rate_size = 0;
+   cpi->cur_poc = -1;
+ }
+
+ // Returns nonzero when a width x height stream at fps frames per second
+ // fits within a level described by lvl_width x lvl_height at lvl_fps:
+ //  - the luma picture size must not exceed the level's picture size,
+ //  - the display sample rate (picture size * fps) must not exceed the
+ //    level's, and
+ //  - each dimension must not exceed the level dimension times lvl_dim_mult.
+ static INLINE int does_level_match(int width, int height, double fps,
+                                    int lvl_width, int lvl_height,
+                                    double lvl_fps, int lvl_dim_mult) {
+   // Widen to 64 bits *before* multiplying: the products of two ints can
+   // exceed INT_MAX for large frame sizes, which is undefined behaviour.
+   const int64_t lvl_luma_pels = (int64_t)lvl_width * lvl_height;
+   const double lvl_display_sample_rate = lvl_luma_pels * lvl_fps;
+   const int64_t luma_pels = (int64_t)width * height;
+   const double display_sample_rate = luma_pels * fps;
+   const int64_t max_width = (int64_t)lvl_width * lvl_dim_mult;
+   const int64_t max_height = (int64_t)lvl_height * lvl_dim_mult;
+
+   return luma_pels <= lvl_luma_pels &&
+          display_sample_rate <= lvl_display_sample_rate &&
+          width <= max_width && height <= max_height;
+ }
+
+ // Picks the lowest level whose picture size / display rate limits admit the
+ // configured stream and applies it, with the main tier, to every operating
+ // point. When no table entry matches, level { 9, 3 } is used. Also sets each
+ // operating point's bitrate and buffer size to the level maximum and raises
+ // AOM_CODEC_UNSUP_BITSTREAM when max_level_bitrate() returns 0.
+ static void set_bitstream_level_tier(SequenceHeader *seq, AV1_COMMON *cm,
+                                      const AV1EncoderConfig *oxcf) {
+   // TODO(any): This is a placeholder function that only addresses dimensions
+   // and max display sample rates.
+   // Need to add checks for max bit rate, max decoded luma sample rate, header
+   // rate, etc. that are not covered by this function.
+
+   // Level limits in ascending order; the first matching entry wins.
+   static const struct {
+     int width;
+     int height;
+     double fps;
+     int dim_mult;
+     int major;
+     int minor;
+   } level_limits[] = {
+     { 512, 288, 30.0, 4, 2, 0 },      { 704, 396, 30.0, 4, 2, 1 },
+     { 1088, 612, 30.0, 4, 3, 0 },     { 1376, 774, 30.0, 4, 3, 1 },
+     { 2048, 1152, 30.0, 3, 4, 0 },    { 2048, 1152, 60.0, 3, 4, 1 },
+     { 4096, 2176, 30.0, 2, 5, 0 },    { 4096, 2176, 60.0, 2, 5, 1 },
+     { 4096, 2176, 120.0, 2, 5, 2 },   { 8192, 4352, 30.0, 2, 6, 0 },
+     { 8192, 4352, 60.0, 2, 6, 1 },    { 8192, 4352, 120.0, 2, 6, 2 },
+     { 16384, 8704, 30.0, 2, 7, 0 },   { 16384, 8704, 60.0, 2, 7, 1 },
+     { 16384, 8704, 120.0, 2, 7, 2 },
+   };
+   const int num_levels = (int)(sizeof(level_limits) / sizeof(level_limits[0]));
+
+   BitstreamLevel bl = { 9, 3 };
+   for (int l = 0; l < num_levels; ++l) {
+     if (does_level_match(oxcf->width, oxcf->height, oxcf->init_framerate,
+                          level_limits[l].width, level_limits[l].height,
+                          level_limits[l].fps, level_limits[l].dim_mult)) {
+       bl.major = level_limits[l].major;
+       bl.minor = level_limits[l].minor;
+       break;
+     }
+   }
+
+   for (int op = 0; op < MAX_NUM_OPERATING_POINTS; ++op) {
+     seq->level[op] = bl;
+     seq->tier[op] = 0;  // setting main tier by default
+     // Set the maximum parameters for bitrate and buffer size for this
+     // profile, level, and tier
+     cm->op_params[op].bitrate = max_level_bitrate(
+         cm->seq_params.profile, major_minor_to_seq_level_idx(seq->level[op]),
+         seq->tier[op]);
+     // Level with seq_level_idx = 31 returns a high "dummy" bitrate to pass
+     // the check
+     if (cm->op_params[op].bitrate == 0) {
+       aom_internal_error(
+           &cm->error, AOM_CODEC_UNSUP_BITSTREAM,
+           "AV1 does not support this combination of profile, level, and tier.");
+     }
+     // Buffer size in bits/s is bitrate in bits/s * 1 s
+     cm->op_params[op].buffer_size = cm->op_params[op].bitrate;
+   }
+ }
+
+ // Fills in the sequence-level coding-tool flags of seq from the encoder
+ // configuration, derives level/tier via set_bitstream_level_tier(), and
+ // initialises operating_point_idc[].
+ static void init_seq_coding_tools(SequenceHeader *seq, AV1_COMMON *cm,
+                                   const AV1EncoderConfig *oxcf) {
+   // Still-picture signalling: a one-frame limit means still picture; the
+   // reduced header is used unless a full header was requested.
+   seq->still_picture = (oxcf->limit == 1);
+   seq->reduced_still_picture_hdr = seq->still_picture;
+   seq->reduced_still_picture_hdr &= !oxcf->full_still_picture_hdr;
+
+   seq->force_screen_content_tools = 2;
+   seq->force_integer_mv = 2;
+
+   // With a reduced still-picture header, order hints and frame ids are
+   // always off; otherwise they follow the configuration.
+   if (seq->still_picture && seq->reduced_still_picture_hdr) {
+     seq->enable_order_hint = 0;
+     seq->frame_id_numbers_present_flag = 0;
+   } else {
+     seq->enable_order_hint = oxcf->enable_order_hint;
+     seq->frame_id_numbers_present_flag = oxcf->large_scale_tile;
+   }
+
+   if (seq->enable_order_hint) {
+     seq->order_hint_bits_minus_1 = DEFAULT_EXPLICIT_ORDER_HINT_BITS - 1;
+   } else {
+     seq->order_hint_bits_minus_1 = -1;
+   }
+
+   // Tools that additionally require order hints.
+   seq->enable_jnt_comp = oxcf->enable_jnt_comp;
+   seq->enable_jnt_comp &= seq->enable_order_hint;
+   seq->enable_ref_frame_mvs = oxcf->enable_ref_frame_mvs;
+   seq->enable_ref_frame_mvs &= seq->enable_order_hint;
+
+   // Tools taken directly from the configuration.
+   seq->enable_dual_filter = oxcf->enable_dual_filter;
+   seq->enable_superres = oxcf->enable_superres;
+   seq->enable_cdef = oxcf->enable_cdef;
+   seq->enable_restoration = oxcf->enable_restoration;
+   seq->enable_warped_motion = oxcf->enable_warped_motion;
+
+   // Tools that are always enabled.
+   seq->enable_interintra_compound = 1;
+   seq->enable_masked_compound = 1;
+   seq->enable_intra_edge_filter = 1;
+   seq->enable_filter_intra = 1;
+
+   set_bitstream_level_tier(seq, cm, oxcf);
+
+   if (seq->operating_points_cnt_minus_1 == 0) {
+     seq->operating_point_idc[0] = 0;
+     return;
+   }
+
+   // Set operating_point_idc[] such that for the i-th operating point the
+   // first (operating_points_cnt - i) spatial layers and the first temporal
+   // layer are decoded. Note that the highest quality operating point should
+   // come first.
+   const int num_ops = seq->operating_points_cnt_minus_1 + 1;
+   for (int op = 0; op < num_ops; ++op) {
+     seq->operating_point_idc[op] = (~(~0u << (num_ops - op)) << 8) | 1;
+   }
+ }
+
+ // Initialises a freshly created compressor from the encoder configuration:
+ // copies sequence / colour / timing parameters into cm, sets up the decoder
+ // model for operating point 0, derives chroma subsampling, selects the
+ // superblock size, allocates size-dependent data and then applies the
+ // remaining settings through av1_change_config().
+ static void init_config(struct AV1_COMP *cpi, AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+
+ cpi->oxcf = *oxcf;
+ cpi->framerate = oxcf->init_framerate;
+
+ cm->seq_params.profile = oxcf->profile;
+ cm->seq_params.bit_depth = oxcf->bit_depth;
+ cm->seq_params.use_highbitdepth = oxcf->use_highbitdepth;
+ cm->seq_params.color_primaries = oxcf->color_primaries;
+ cm->seq_params.transfer_characteristics = oxcf->transfer_characteristics;
+ cm->seq_params.matrix_coefficients = oxcf->matrix_coefficients;
+ cm->seq_params.monochrome = oxcf->monochrome;
+ cm->seq_params.chroma_sample_position = oxcf->chroma_sample_position;
+ cm->seq_params.color_range = oxcf->color_range;
+ cm->timing_info_present = oxcf->timing_info_present;
+ cm->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+ cm->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ cm->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ cm->seq_params.display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ cm->seq_params.decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ cm->buffer_model.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ set_aom_dec_model_info(&cm->buffer_model);
+ set_dec_model_op_parameters(&cm->op_params[0]);
+ } else if (cm->timing_info_present &&
+ cm->timing_info.equal_picture_interval &&
+ !cm->seq_params.decoder_model_info_present_flag) {
+ // NOTE(review): the last term looks redundant — the flag was just copied
+ // from oxcf and the branch above already handled the non-zero case.
+ // set the decoder model parameters in resource availability mode
+ set_resource_availability_parameters(&cm->op_params[0]);
+ } else {
+ cm->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ // Derive chroma subsampling: monochrome is set to (1, 1); the
+ // BT.709 / sRGB / identity-matrix combination to (0, 0); otherwise it
+ // follows the profile — profile 0: (1, 1), profile 1: (0, 0), other
+ // profiles: the configured values at 12 bits, else (1, 0).
+ if (cm->seq_params.monochrome) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.color_primaries == AOM_CICP_CP_BT_709 &&
+ cm->seq_params.transfer_characteristics == AOM_CICP_TC_SRGB &&
+ cm->seq_params.matrix_coefficients == AOM_CICP_MC_IDENTITY) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.profile == 0) {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 1;
+ } else if (cm->seq_params.profile == 1) {
+ cm->seq_params.subsampling_x = 0;
+ cm->seq_params.subsampling_y = 0;
+ } else {
+ if (cm->seq_params.bit_depth == AOM_BITS_12) {
+ cm->seq_params.subsampling_x = oxcf->chroma_subsampling_x;
+ cm->seq_params.subsampling_y = oxcf->chroma_subsampling_y;
+ } else {
+ cm->seq_params.subsampling_x = 1;
+ cm->seq_params.subsampling_y = 0;
+ }
+ }
+ }
+
+ cm->width = oxcf->width;
+ cm->height = oxcf->height;
+ set_sb_size(&cm->seq_params,
+ select_sb_size(cpi)); // set sb size before allocations
+ alloc_compressor_data(cpi);
+
+ update_film_grain_parameters(cpi, oxcf);
+
+ // Single thread case: use counts in common.
+ cpi->td.counts = &cpi->counts;
+
+ // change includes all joint functionality
+ av1_change_config(cpi, oxcf);
+
+ cpi->static_mb_pct = 0;
+ cpi->ref_frame_flags = 0;
+
+ // Reset resize pending flags
+ cpi->resize_pending_width = 0;
+ cpi->resize_pending_height = 0;
+
+ init_buffer_indices(cpi);
+ }
+
+ // Converts the configured buffer levels from milliseconds into rate-control
+ // units (level_ms * target_bandwidth / 1000). An optimal or maximum level of
+ // 0 selects the default of target_bandwidth / 8.
+ static void set_rc_buffer_sizes(RATE_CONTROL *rc,
+                                 const AV1EncoderConfig *oxcf) {
+   const int64_t bw = oxcf->target_bandwidth;
+   const int64_t start_ms = oxcf->starting_buffer_level_ms;
+   const int64_t optimal_ms = oxcf->optimal_buffer_level_ms;
+   const int64_t max_ms = oxcf->maximum_buffer_size_ms;
+
+   rc->starting_buffer_level = start_ms * bw / 1000;
+
+   if (optimal_ms == 0) {
+     rc->optimal_buffer_level = bw / 8;
+   } else {
+     rc->optimal_buffer_level = optimal_ms * bw / 1000;
+   }
+
+   if (max_ms == 0) {
+     rc->maximum_buffer_size = bw / 8;
+   } else {
+     rc->maximum_buffer_size = max_ms * bw / 1000;
+   }
+ }
+
+ // Assigns the eight function-pointer fields (sdf, sdaf, vf, svf, svaf,
+ // sdx4df, jsdaf, jsvaf) of cpi->fn_ptr[BT]. Requires a variable named cpi in
+ // scope. The expansion is a plain sequence of statements ending in ';' and
+ // is not wrapped in do { } while (0), so it must not be used as the
+ // unbraced body of an if / else / loop.
+ #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
+ // Defines three static wrappers around the SAD function fnname:
+ // fnname_bits8 returns the result unchanged, fnname_bits10 shifts it right
+ // by 2 and fnname_bits12 shifts it right by 4.
+ // NOTE(review): presumably this rescales 10- and 12-bit SADs to the 8-bit
+ // range — confirm.
+ #define MAKE_BFP_SAD_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+ int source_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride) >> 4; \
+ }
+
+ // Same scheme as MAKE_BFP_SAD_WRAPPER for SAD functions that take an extra
+ // second_pred argument: defines fnname_bits8 (unchanged result),
+ // fnname_bits10 (result >> 2) and fnname_bits12 (result >> 4).
+ #define MAKE_BFP_SADAVG_WRAPPER(fnname) \
+ static unsigned int fnname##_bits8( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred); \
+ } \
+ static unsigned int fnname##_bits10( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 2; \
+ } \
+ static unsigned int fnname##_bits12( \
+ const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return fnname(src_ptr, source_stride, ref_ptr, ref_stride, second_pred) >> \
+ 4; \
+ }
+
+ // Defines three static wrappers around the 4-reference SAD function fnname,
+ // which writes its results into sad_array. fnname_bits8 leaves the results
+ // unchanged; fnname_bits10 and fnname_bits12 shift each of the four
+ // sad_array entries right by 2 and 4 respectively after the call.
+ #define MAKE_BFP_SAD4D_WRAPPER(fnname) \
+ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ } \
+ static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 2; \
+ } \
+ static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \
+ const uint8_t *const ref_ptr[], int ref_stride, \
+ unsigned int *sad_array) { \
+ int i; \
+ fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \
+ for (i = 0; i < 4; i++) sad_array[i] >>= 4; \
+ }
+
+/* Defines fnname_bits8 / _bits10 / _bits12 for kernels that take a second
+ * prediction buffer plus a JNT_COMP_PARAMS pointer. All six arguments are
+ * forwarded unchanged to `fnname`; the result is returned unshifted by
+ * _bits8, shifted right by 2 by _bits10 and shifted right by 4 by
+ * _bits12. */
+#define MAKE_BFP_JSADAVG_WRAPPER(fnname)                                     \
+  static unsigned int fnname##_bits8(const uint8_t *src, int src_stride,     \
+                                     const uint8_t *ref, int ref_stride,     \
+                                     const uint8_t *pred,                    \
+                                     const JNT_COMP_PARAMS *params) {        \
+    return fnname(src, src_stride, ref, ref_stride, pred, params);           \
+  }                                                                          \
+  static unsigned int fnname##_bits10(const uint8_t *src, int src_stride,    \
+                                      const uint8_t *ref, int ref_stride,    \
+                                      const uint8_t *pred,                   \
+                                      const JNT_COMP_PARAMS *params) {       \
+    return fnname(src, src_stride, ref, ref_stride, pred, params) >> 2;      \
+  }                                                                          \
+  static unsigned int fnname##_bits12(const uint8_t *src, int src_stride,    \
+                                      const uint8_t *ref, int ref_stride,    \
+                                      const uint8_t *pred,                   \
+                                      const JNT_COMP_PARAMS *params) {       \
+    return fnname(src, src_stride, ref, ref_stride, pred, params) >> 4;      \
+  }
+
+/* Wrapper instantiations, three lines per block size: the plain kernel, its
+ * _avg variant and its x4d variant. Each line expands to three static
+ * functions named <kernel>_bits8, <kernel>_bits10 and <kernel>_bits12. */
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x128x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x4x4d)
+
+/* Same three wrapper families (plain, _avg, x4d) for the block sizes whose
+ * sides are in a 1:4 or 4:1 ratio: 4x16, 16x4, 8x32, 32x8, 16x64, 64x16. */
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad4x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad4x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad4x16x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x4)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x4_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x4x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad8x32)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad8x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad8x32x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad32x8)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad32x8_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad32x8x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad16x64)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad16x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad16x64x4d)
+MAKE_BFP_SAD_WRAPPER(aom_highbd_sad64x16)
+MAKE_BFP_SADAVG_WRAPPER(aom_highbd_sad64x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(aom_highbd_sad64x16x4d)
+
+/* One instantiation per block size for the aom_highbd_jnt_sad*_avg kernels;
+ * each line yields <kernel>_bits8, <kernel>_bits10 and <kernel>_bits12. */
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad128x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x128_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad4x16_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x4_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad8x32_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad32x8_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad16x64_avg)
+MAKE_BFP_JSADAVG_WRAPPER(aom_highbd_jnt_sad64x16_avg)
+
+/* Stores two function pointers into cpi->fn_ptr[BT]: MCSDF into .msdf and
+ * MCSVF into .msvf. It is invoked below with the aom_highbd_masked_*
+ * functions. The expansion is two complete statements, each ending in its
+ * own ';', and it uses a variable named `cpi` from the expansion site. Call
+ * sites therefore need no trailing semicolon, but the macro must not be
+ * used as the unbraced body of an if/else. */
+#define HIGHBD_MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
+/* Defines fnname_bits8 / _bits10 / _bits12 for kernels that take a second
+ * prediction buffer, a mask buffer with its stride, and an invert flag. All
+ * eight arguments are forwarded unchanged to `fnname`; _bits8 returns the
+ * result as is, _bits10 shifts it right by 2 and _bits12 shifts it right
+ * by 4. */
+#define MAKE_MBFP_COMPOUND_SAD_WRAPPER(fnname)                               \
+  static unsigned int fnname##_bits8(                                        \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *pred, const uint8_t *mask,              \
+      int mask_stride, int invert) {                                         \
+    return fnname(src, src_stride, ref, ref_stride, pred, mask,              \
+                  mask_stride, invert);                                      \
+  }                                                                          \
+  static unsigned int fnname##_bits10(                                       \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *pred, const uint8_t *mask,              \
+      int mask_stride, int invert) {                                         \
+    return fnname(src, src_stride, ref, ref_stride, pred, mask,              \
+                  mask_stride, invert) >> 2;                                 \
+  }                                                                          \
+  static unsigned int fnname##_bits12(                                       \
+      const uint8_t *src, int src_stride, const uint8_t *ref,                \
+      int ref_stride, const uint8_t *pred, const uint8_t *mask,              \
+      int mask_stride, int invert) {                                         \
+    return fnname(src, src_stride, ref, ref_stride, pred, mask,              \
+                  mask_stride, invert) >> 4;                                 \
+  }
+
+/* One instantiation per block size for the aom_highbd_masked_sad* kernels;
+ * each line yields <kernel>_bits8, <kernel>_bits10 and <kernel>_bits12. */
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad128x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x128)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad4x16)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x4)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad8x32)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad32x8)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad16x64)
+MAKE_MBFP_COMPOUND_SAD_WRAPPER(aom_highbd_masked_sad64x16)
+
+/* Stores three function pointers into cpi->fn_ptr[BT]: OSDF into .osdf, OVF
+ * into .ovf and OSVF into .osvf. It is invoked below with the
+ * aom_highbd_*obmc_* functions. The expansion is three complete statements,
+ * each ending in its own ';', and it uses a variable named `cpi` from the
+ * expansion site. Call sites therefore need no trailing semicolon, but the
+ * macro must not be used as the unbraced body of an if/else. */
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+/* Defines fnname_bits8 / _bits10 / _bits12 for kernels that take a uint8_t
+ * buffer with its stride plus two int32_t buffers. The four arguments are
+ * forwarded unchanged to `fnname`; _bits8 returns the result as is, _bits10
+ * shifts it right by 2 and _bits12 shifts it right by 4. */
+#define MAKE_OBFP_SAD_WRAPPER(fnname)                                        \
+  static unsigned int fnname##_bits8(const uint8_t *pre, int pre_stride,     \
+                                     const int32_t *weighted_src,            \
+                                     const int32_t *mask) {                  \
+    return fnname(pre, pre_stride, weighted_src, mask);                      \
+  }                                                                          \
+  static unsigned int fnname##_bits10(const uint8_t *pre, int pre_stride,    \
+                                      const int32_t *weighted_src,           \
+                                      const int32_t *mask) {                 \
+    return fnname(pre, pre_stride, weighted_src, mask) >> 2;                 \
+  }                                                                          \
+  static unsigned int fnname##_bits12(const uint8_t *pre, int pre_stride,    \
+                                      const int32_t *weighted_src,           \
+                                      const int32_t *mask) {                 \
+    return fnname(pre, pre_stride, weighted_src, mask) >> 4;                 \
+  }
+
+/* One instantiation per block size for the aom_highbd_obmc_sad* kernels;
+ * each line yields <kernel>_bits8, <kernel>_bits10 and <kernel>_bits12. */
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x128)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad4x16)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x4)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad8x32)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad32x8)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad16x64)
+MAKE_OBFP_SAD_WRAPPER(aom_highbd_obmc_sad64x16)
+
+static void highbd_set_var_fns(AV1_COMP *const cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ if (cm->seq_params.use_highbitdepth) {
+ switch (cm->seq_params.bit_depth) {
+ case AOM_BITS_8:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits8,
+ aom_highbd_sad64x16_avg_bits8, aom_highbd_8_variance64x16,
+ aom_highbd_8_sub_pixel_variance64x16,
+ aom_highbd_8_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits8,
+ aom_highbd_jnt_sad64x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x16)
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits8,
+ aom_highbd_sad16x64_avg_bits8, aom_highbd_8_variance16x64,
+ aom_highbd_8_sub_pixel_variance16x64,
+ aom_highbd_8_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits8,
+ aom_highbd_jnt_sad16x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x64)
+
+ HIGHBD_BFP(
+ BLOCK_32X8, aom_highbd_sad32x8_bits8, aom_highbd_sad32x8_avg_bits8,
+ aom_highbd_8_variance32x8, aom_highbd_8_sub_pixel_variance32x8,
+ aom_highbd_8_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits8, aom_highbd_jnt_sad32x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x8)
+
+ HIGHBD_BFP(
+ BLOCK_8X32, aom_highbd_sad8x32_bits8, aom_highbd_sad8x32_avg_bits8,
+ aom_highbd_8_variance8x32, aom_highbd_8_sub_pixel_variance8x32,
+ aom_highbd_8_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits8, aom_highbd_jnt_sad8x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x32)
+
+ HIGHBD_BFP(
+ BLOCK_16X4, aom_highbd_sad16x4_bits8, aom_highbd_sad16x4_avg_bits8,
+ aom_highbd_8_variance16x4, aom_highbd_8_sub_pixel_variance16x4,
+ aom_highbd_8_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits8, aom_highbd_jnt_sad16x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x4)
+
+ HIGHBD_BFP(
+ BLOCK_4X16, aom_highbd_sad4x16_bits8, aom_highbd_sad4x16_avg_bits8,
+ aom_highbd_8_variance4x16, aom_highbd_8_sub_pixel_variance4x16,
+ aom_highbd_8_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits8, aom_highbd_jnt_sad4x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x16)
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits8,
+ aom_highbd_sad32x16_avg_bits8, aom_highbd_8_variance32x16,
+ aom_highbd_8_sub_pixel_variance32x16,
+ aom_highbd_8_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits8,
+ aom_highbd_jnt_sad32x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x16)
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits8,
+ aom_highbd_sad16x32_avg_bits8, aom_highbd_8_variance16x32,
+ aom_highbd_8_sub_pixel_variance16x32,
+ aom_highbd_8_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits8,
+ aom_highbd_jnt_sad16x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x32)
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits8,
+ aom_highbd_sad64x32_avg_bits8, aom_highbd_8_variance64x32,
+ aom_highbd_8_sub_pixel_variance64x32,
+ aom_highbd_8_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits8,
+ aom_highbd_jnt_sad64x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x32)
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits8,
+ aom_highbd_sad32x64_avg_bits8, aom_highbd_8_variance32x64,
+ aom_highbd_8_sub_pixel_variance32x64,
+ aom_highbd_8_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits8,
+ aom_highbd_jnt_sad32x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x64)
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits8,
+ aom_highbd_sad32x32_avg_bits8, aom_highbd_8_variance32x32,
+ aom_highbd_8_sub_pixel_variance32x32,
+ aom_highbd_8_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits8,
+ aom_highbd_jnt_sad32x32_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance32x32)
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits8,
+ aom_highbd_sad64x64_avg_bits8, aom_highbd_8_variance64x64,
+ aom_highbd_8_sub_pixel_variance64x64,
+ aom_highbd_8_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits8,
+ aom_highbd_jnt_sad64x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x64)
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits8,
+ aom_highbd_sad16x16_avg_bits8, aom_highbd_8_variance16x16,
+ aom_highbd_8_sub_pixel_variance16x16,
+ aom_highbd_8_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits8,
+ aom_highbd_jnt_sad16x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x16)
+
+ HIGHBD_BFP(
+ BLOCK_16X8, aom_highbd_sad16x8_bits8, aom_highbd_sad16x8_avg_bits8,
+ aom_highbd_8_variance16x8, aom_highbd_8_sub_pixel_variance16x8,
+ aom_highbd_8_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits8, aom_highbd_jnt_sad16x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance16x8)
+
+ HIGHBD_BFP(
+ BLOCK_8X16, aom_highbd_sad8x16_bits8, aom_highbd_sad8x16_avg_bits8,
+ aom_highbd_8_variance8x16, aom_highbd_8_sub_pixel_variance8x16,
+ aom_highbd_8_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits8, aom_highbd_jnt_sad8x16_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x16)
+
+ HIGHBD_BFP(BLOCK_8X8, aom_highbd_sad8x8_bits8,
+ aom_highbd_sad8x8_avg_bits8, aom_highbd_8_variance8x8,
+ aom_highbd_8_sub_pixel_variance8x8,
+ aom_highbd_8_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits8, aom_highbd_jnt_sad8x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x8)
+
+ HIGHBD_BFP(BLOCK_8X4, aom_highbd_sad8x4_bits8,
+ aom_highbd_sad8x4_avg_bits8, aom_highbd_8_variance8x4,
+ aom_highbd_8_sub_pixel_variance8x4,
+ aom_highbd_8_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits8, aom_highbd_jnt_sad8x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance8x4)
+
+ HIGHBD_BFP(BLOCK_4X8, aom_highbd_sad4x8_bits8,
+ aom_highbd_sad4x8_avg_bits8, aom_highbd_8_variance4x8,
+ aom_highbd_8_sub_pixel_variance4x8,
+ aom_highbd_8_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits8, aom_highbd_jnt_sad4x8_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x8)
+
+ HIGHBD_BFP(BLOCK_4X4, aom_highbd_sad4x4_bits8,
+ aom_highbd_sad4x4_avg_bits8, aom_highbd_8_variance4x4,
+ aom_highbd_8_sub_pixel_variance4x4,
+ aom_highbd_8_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits8, aom_highbd_jnt_sad4x4_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance4x4)
+
+ HIGHBD_BFP(
+ BLOCK_128X128, aom_highbd_sad128x128_bits8,
+ aom_highbd_sad128x128_avg_bits8, aom_highbd_8_variance128x128,
+ aom_highbd_8_sub_pixel_variance128x128,
+ aom_highbd_8_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits8, aom_highbd_jnt_sad128x128_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance128x128)
+
+ HIGHBD_BFP(BLOCK_128X64, aom_highbd_sad128x64_bits8,
+ aom_highbd_sad128x64_avg_bits8, aom_highbd_8_variance128x64,
+ aom_highbd_8_sub_pixel_variance128x64,
+ aom_highbd_8_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits8,
+ aom_highbd_jnt_sad128x64_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance128x64)
+
+ HIGHBD_BFP(BLOCK_64X128, aom_highbd_sad64x128_bits8,
+ aom_highbd_sad64x128_avg_bits8, aom_highbd_8_variance64x128,
+ aom_highbd_8_sub_pixel_variance64x128,
+ aom_highbd_8_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits8,
+ aom_highbd_jnt_sad64x128_avg_bits8,
+ aom_highbd_8_jnt_sub_pixel_avg_variance64x128)
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits8,
+ aom_highbd_8_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits8,
+ aom_highbd_8_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits8,
+ aom_highbd_8_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits8,
+ aom_highbd_8_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits8,
+ aom_highbd_8_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits8,
+ aom_highbd_obmc_variance128x128,
+ aom_highbd_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits8,
+ aom_highbd_obmc_variance128x64,
+ aom_highbd_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits8,
+ aom_highbd_obmc_variance64x128,
+ aom_highbd_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits8,
+ aom_highbd_obmc_variance64x64,
+ aom_highbd_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits8,
+ aom_highbd_obmc_variance64x32,
+ aom_highbd_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits8,
+ aom_highbd_obmc_variance32x64,
+ aom_highbd_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits8,
+ aom_highbd_obmc_variance32x32,
+ aom_highbd_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits8,
+ aom_highbd_obmc_variance32x16,
+ aom_highbd_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits8,
+ aom_highbd_obmc_variance16x32,
+ aom_highbd_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits8,
+ aom_highbd_obmc_variance16x16,
+ aom_highbd_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits8,
+ aom_highbd_obmc_variance8x16,
+ aom_highbd_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits8,
+ aom_highbd_obmc_variance16x8,
+ aom_highbd_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits8,
+ aom_highbd_obmc_variance8x8,
+ aom_highbd_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits8,
+ aom_highbd_obmc_variance4x8,
+ aom_highbd_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits8,
+ aom_highbd_obmc_variance8x4,
+ aom_highbd_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits8,
+ aom_highbd_obmc_variance4x4,
+ aom_highbd_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits8,
+ aom_highbd_obmc_variance64x16,
+ aom_highbd_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits8,
+ aom_highbd_obmc_variance16x64,
+ aom_highbd_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits8,
+ aom_highbd_obmc_variance32x8,
+ aom_highbd_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits8,
+ aom_highbd_obmc_variance8x32,
+ aom_highbd_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits8,
+ aom_highbd_obmc_variance16x4,
+ aom_highbd_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits8,
+ aom_highbd_obmc_variance4x16,
+ aom_highbd_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_10:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits10,
+ aom_highbd_sad64x16_avg_bits10, aom_highbd_10_variance64x16,
+ aom_highbd_10_sub_pixel_variance64x16,
+ aom_highbd_10_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits10,
+ aom_highbd_jnt_sad64x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits10,
+ aom_highbd_sad16x64_avg_bits10, aom_highbd_10_variance16x64,
+ aom_highbd_10_sub_pixel_variance16x64,
+ aom_highbd_10_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits10,
+ aom_highbd_jnt_sad16x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits10,
+ aom_highbd_sad32x8_avg_bits10, aom_highbd_10_variance32x8,
+ aom_highbd_10_sub_pixel_variance32x8,
+ aom_highbd_10_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits10,
+ aom_highbd_jnt_sad32x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits10,
+ aom_highbd_sad8x32_avg_bits10, aom_highbd_10_variance8x32,
+ aom_highbd_10_sub_pixel_variance8x32,
+ aom_highbd_10_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits10,
+ aom_highbd_jnt_sad8x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits10,
+ aom_highbd_sad16x4_avg_bits10, aom_highbd_10_variance16x4,
+ aom_highbd_10_sub_pixel_variance16x4,
+ aom_highbd_10_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits10,
+ aom_highbd_jnt_sad16x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits10,
+ aom_highbd_sad4x16_avg_bits10, aom_highbd_10_variance4x16,
+ aom_highbd_10_sub_pixel_variance4x16,
+ aom_highbd_10_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits10,
+ aom_highbd_jnt_sad4x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits10,
+ aom_highbd_sad32x16_avg_bits10, aom_highbd_10_variance32x16,
+ aom_highbd_10_sub_pixel_variance32x16,
+ aom_highbd_10_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits10,
+ aom_highbd_jnt_sad32x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits10,
+ aom_highbd_sad16x32_avg_bits10, aom_highbd_10_variance16x32,
+ aom_highbd_10_sub_pixel_variance16x32,
+ aom_highbd_10_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits10,
+ aom_highbd_jnt_sad16x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits10,
+ aom_highbd_sad64x32_avg_bits10, aom_highbd_10_variance64x32,
+ aom_highbd_10_sub_pixel_variance64x32,
+ aom_highbd_10_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits10,
+ aom_highbd_jnt_sad64x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits10,
+ aom_highbd_sad32x64_avg_bits10, aom_highbd_10_variance32x64,
+ aom_highbd_10_sub_pixel_variance32x64,
+ aom_highbd_10_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits10,
+ aom_highbd_jnt_sad32x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits10,
+ aom_highbd_sad32x32_avg_bits10, aom_highbd_10_variance32x32,
+ aom_highbd_10_sub_pixel_variance32x32,
+ aom_highbd_10_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits10,
+ aom_highbd_jnt_sad32x32_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits10,
+ aom_highbd_sad64x64_avg_bits10, aom_highbd_10_variance64x64,
+ aom_highbd_10_sub_pixel_variance64x64,
+ aom_highbd_10_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits10,
+ aom_highbd_jnt_sad64x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits10,
+ aom_highbd_sad16x16_avg_bits10, aom_highbd_10_variance16x16,
+ aom_highbd_10_sub_pixel_variance16x16,
+ aom_highbd_10_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits10,
+ aom_highbd_jnt_sad16x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits10,
+ aom_highbd_sad16x8_avg_bits10, aom_highbd_10_variance16x8,
+ aom_highbd_10_sub_pixel_variance16x8,
+ aom_highbd_10_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits10,
+ aom_highbd_jnt_sad16x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits10,
+ aom_highbd_sad8x16_avg_bits10, aom_highbd_10_variance8x16,
+ aom_highbd_10_sub_pixel_variance8x16,
+ aom_highbd_10_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits10,
+ aom_highbd_jnt_sad8x16_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits10, aom_highbd_sad8x8_avg_bits10,
+ aom_highbd_10_variance8x8, aom_highbd_10_sub_pixel_variance8x8,
+ aom_highbd_10_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits10, aom_highbd_jnt_sad8x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits10, aom_highbd_sad8x4_avg_bits10,
+ aom_highbd_10_variance8x4, aom_highbd_10_sub_pixel_variance8x4,
+ aom_highbd_10_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits10, aom_highbd_jnt_sad8x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits10, aom_highbd_sad4x8_avg_bits10,
+ aom_highbd_10_variance4x8, aom_highbd_10_sub_pixel_variance4x8,
+ aom_highbd_10_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits10, aom_highbd_jnt_sad4x8_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits10, aom_highbd_sad4x4_avg_bits10,
+ aom_highbd_10_variance4x4, aom_highbd_10_sub_pixel_variance4x4,
+ aom_highbd_10_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits10, aom_highbd_jnt_sad4x4_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits10,
+ aom_highbd_sad128x128_avg_bits10,
+ aom_highbd_10_variance128x128,
+ aom_highbd_10_sub_pixel_variance128x128,
+ aom_highbd_10_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits10,
+ aom_highbd_jnt_sad128x128_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(
+ BLOCK_128X64, aom_highbd_sad128x64_bits10,
+ aom_highbd_sad128x64_avg_bits10, aom_highbd_10_variance128x64,
+ aom_highbd_10_sub_pixel_variance128x64,
+ aom_highbd_10_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits10, aom_highbd_jnt_sad128x64_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(
+ BLOCK_64X128, aom_highbd_sad64x128_bits10,
+ aom_highbd_sad64x128_avg_bits10, aom_highbd_10_variance64x128,
+ aom_highbd_10_sub_pixel_variance64x128,
+ aom_highbd_10_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits10, aom_highbd_jnt_sad64x128_avg_bits10,
+ aom_highbd_10_jnt_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits10,
+ aom_highbd_10_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits10,
+ aom_highbd_10_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits10,
+ aom_highbd_10_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits10,
+ aom_highbd_10_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits10,
+ aom_highbd_10_obmc_variance128x128,
+ aom_highbd_10_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits10,
+ aom_highbd_10_obmc_variance128x64,
+ aom_highbd_10_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits10,
+ aom_highbd_10_obmc_variance64x128,
+ aom_highbd_10_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits10,
+ aom_highbd_10_obmc_variance64x64,
+ aom_highbd_10_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits10,
+ aom_highbd_10_obmc_variance64x32,
+ aom_highbd_10_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits10,
+ aom_highbd_10_obmc_variance32x64,
+ aom_highbd_10_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits10,
+ aom_highbd_10_obmc_variance32x32,
+ aom_highbd_10_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits10,
+ aom_highbd_10_obmc_variance32x16,
+ aom_highbd_10_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits10,
+ aom_highbd_10_obmc_variance16x32,
+ aom_highbd_10_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits10,
+ aom_highbd_10_obmc_variance16x16,
+ aom_highbd_10_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits10,
+ aom_highbd_10_obmc_variance8x16,
+ aom_highbd_10_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits10,
+ aom_highbd_10_obmc_variance16x8,
+ aom_highbd_10_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits10,
+ aom_highbd_10_obmc_variance8x8,
+ aom_highbd_10_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits10,
+ aom_highbd_10_obmc_variance4x8,
+ aom_highbd_10_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits10,
+ aom_highbd_10_obmc_variance8x4,
+ aom_highbd_10_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits10,
+ aom_highbd_10_obmc_variance4x4,
+ aom_highbd_10_obmc_sub_pixel_variance4x4)
+
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits10,
+ aom_highbd_10_obmc_variance64x16,
+ aom_highbd_10_obmc_sub_pixel_variance64x16)
+
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits10,
+ aom_highbd_10_obmc_variance16x64,
+ aom_highbd_10_obmc_sub_pixel_variance16x64)
+
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits10,
+ aom_highbd_10_obmc_variance32x8,
+ aom_highbd_10_obmc_sub_pixel_variance32x8)
+
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits10,
+ aom_highbd_10_obmc_variance8x32,
+ aom_highbd_10_obmc_sub_pixel_variance8x32)
+
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits10,
+ aom_highbd_10_obmc_variance16x4,
+ aom_highbd_10_obmc_sub_pixel_variance16x4)
+
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits10,
+ aom_highbd_10_obmc_variance4x16,
+ aom_highbd_10_obmc_sub_pixel_variance4x16)
+ break;
+
+ case AOM_BITS_12:
+ HIGHBD_BFP(BLOCK_64X16, aom_highbd_sad64x16_bits12,
+ aom_highbd_sad64x16_avg_bits12, aom_highbd_12_variance64x16,
+ aom_highbd_12_sub_pixel_variance64x16,
+ aom_highbd_12_sub_pixel_avg_variance64x16,
+ aom_highbd_sad64x16x4d_bits12,
+ aom_highbd_jnt_sad64x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x16);
+
+ HIGHBD_BFP(BLOCK_16X64, aom_highbd_sad16x64_bits12,
+ aom_highbd_sad16x64_avg_bits12, aom_highbd_12_variance16x64,
+ aom_highbd_12_sub_pixel_variance16x64,
+ aom_highbd_12_sub_pixel_avg_variance16x64,
+ aom_highbd_sad16x64x4d_bits12,
+ aom_highbd_jnt_sad16x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x64);
+
+ HIGHBD_BFP(BLOCK_32X8, aom_highbd_sad32x8_bits12,
+ aom_highbd_sad32x8_avg_bits12, aom_highbd_12_variance32x8,
+ aom_highbd_12_sub_pixel_variance32x8,
+ aom_highbd_12_sub_pixel_avg_variance32x8,
+ aom_highbd_sad32x8x4d_bits12,
+ aom_highbd_jnt_sad32x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x8);
+
+ HIGHBD_BFP(BLOCK_8X32, aom_highbd_sad8x32_bits12,
+ aom_highbd_sad8x32_avg_bits12, aom_highbd_12_variance8x32,
+ aom_highbd_12_sub_pixel_variance8x32,
+ aom_highbd_12_sub_pixel_avg_variance8x32,
+ aom_highbd_sad8x32x4d_bits12,
+ aom_highbd_jnt_sad8x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x32);
+
+ HIGHBD_BFP(BLOCK_16X4, aom_highbd_sad16x4_bits12,
+ aom_highbd_sad16x4_avg_bits12, aom_highbd_12_variance16x4,
+ aom_highbd_12_sub_pixel_variance16x4,
+ aom_highbd_12_sub_pixel_avg_variance16x4,
+ aom_highbd_sad16x4x4d_bits12,
+ aom_highbd_jnt_sad16x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x4);
+
+ HIGHBD_BFP(BLOCK_4X16, aom_highbd_sad4x16_bits12,
+ aom_highbd_sad4x16_avg_bits12, aom_highbd_12_variance4x16,
+ aom_highbd_12_sub_pixel_variance4x16,
+ aom_highbd_12_sub_pixel_avg_variance4x16,
+ aom_highbd_sad4x16x4d_bits12,
+ aom_highbd_jnt_sad4x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x16);
+
+ HIGHBD_BFP(BLOCK_32X16, aom_highbd_sad32x16_bits12,
+ aom_highbd_sad32x16_avg_bits12, aom_highbd_12_variance32x16,
+ aom_highbd_12_sub_pixel_variance32x16,
+ aom_highbd_12_sub_pixel_avg_variance32x16,
+ aom_highbd_sad32x16x4d_bits12,
+ aom_highbd_jnt_sad32x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x16);
+
+ HIGHBD_BFP(BLOCK_16X32, aom_highbd_sad16x32_bits12,
+ aom_highbd_sad16x32_avg_bits12, aom_highbd_12_variance16x32,
+ aom_highbd_12_sub_pixel_variance16x32,
+ aom_highbd_12_sub_pixel_avg_variance16x32,
+ aom_highbd_sad16x32x4d_bits12,
+ aom_highbd_jnt_sad16x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x32);
+
+ HIGHBD_BFP(BLOCK_64X32, aom_highbd_sad64x32_bits12,
+ aom_highbd_sad64x32_avg_bits12, aom_highbd_12_variance64x32,
+ aom_highbd_12_sub_pixel_variance64x32,
+ aom_highbd_12_sub_pixel_avg_variance64x32,
+ aom_highbd_sad64x32x4d_bits12,
+ aom_highbd_jnt_sad64x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x32);
+
+ HIGHBD_BFP(BLOCK_32X64, aom_highbd_sad32x64_bits12,
+ aom_highbd_sad32x64_avg_bits12, aom_highbd_12_variance32x64,
+ aom_highbd_12_sub_pixel_variance32x64,
+ aom_highbd_12_sub_pixel_avg_variance32x64,
+ aom_highbd_sad32x64x4d_bits12,
+ aom_highbd_jnt_sad32x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x64);
+
+ HIGHBD_BFP(BLOCK_32X32, aom_highbd_sad32x32_bits12,
+ aom_highbd_sad32x32_avg_bits12, aom_highbd_12_variance32x32,
+ aom_highbd_12_sub_pixel_variance32x32,
+ aom_highbd_12_sub_pixel_avg_variance32x32,
+ aom_highbd_sad32x32x4d_bits12,
+ aom_highbd_jnt_sad32x32_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance32x32);
+
+ HIGHBD_BFP(BLOCK_64X64, aom_highbd_sad64x64_bits12,
+ aom_highbd_sad64x64_avg_bits12, aom_highbd_12_variance64x64,
+ aom_highbd_12_sub_pixel_variance64x64,
+ aom_highbd_12_sub_pixel_avg_variance64x64,
+ aom_highbd_sad64x64x4d_bits12,
+ aom_highbd_jnt_sad64x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x64);
+
+ HIGHBD_BFP(BLOCK_16X16, aom_highbd_sad16x16_bits12,
+ aom_highbd_sad16x16_avg_bits12, aom_highbd_12_variance16x16,
+ aom_highbd_12_sub_pixel_variance16x16,
+ aom_highbd_12_sub_pixel_avg_variance16x16,
+ aom_highbd_sad16x16x4d_bits12,
+ aom_highbd_jnt_sad16x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x16);
+
+ HIGHBD_BFP(BLOCK_16X8, aom_highbd_sad16x8_bits12,
+ aom_highbd_sad16x8_avg_bits12, aom_highbd_12_variance16x8,
+ aom_highbd_12_sub_pixel_variance16x8,
+ aom_highbd_12_sub_pixel_avg_variance16x8,
+ aom_highbd_sad16x8x4d_bits12,
+ aom_highbd_jnt_sad16x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance16x8);
+
+ HIGHBD_BFP(BLOCK_8X16, aom_highbd_sad8x16_bits12,
+ aom_highbd_sad8x16_avg_bits12, aom_highbd_12_variance8x16,
+ aom_highbd_12_sub_pixel_variance8x16,
+ aom_highbd_12_sub_pixel_avg_variance8x16,
+ aom_highbd_sad8x16x4d_bits12,
+ aom_highbd_jnt_sad8x16_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x16);
+
+ HIGHBD_BFP(
+ BLOCK_8X8, aom_highbd_sad8x8_bits12, aom_highbd_sad8x8_avg_bits12,
+ aom_highbd_12_variance8x8, aom_highbd_12_sub_pixel_variance8x8,
+ aom_highbd_12_sub_pixel_avg_variance8x8,
+ aom_highbd_sad8x8x4d_bits12, aom_highbd_jnt_sad8x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x8);
+
+ HIGHBD_BFP(
+ BLOCK_8X4, aom_highbd_sad8x4_bits12, aom_highbd_sad8x4_avg_bits12,
+ aom_highbd_12_variance8x4, aom_highbd_12_sub_pixel_variance8x4,
+ aom_highbd_12_sub_pixel_avg_variance8x4,
+ aom_highbd_sad8x4x4d_bits12, aom_highbd_jnt_sad8x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance8x4);
+
+ HIGHBD_BFP(
+ BLOCK_4X8, aom_highbd_sad4x8_bits12, aom_highbd_sad4x8_avg_bits12,
+ aom_highbd_12_variance4x8, aom_highbd_12_sub_pixel_variance4x8,
+ aom_highbd_12_sub_pixel_avg_variance4x8,
+ aom_highbd_sad4x8x4d_bits12, aom_highbd_jnt_sad4x8_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x8);
+
+ HIGHBD_BFP(
+ BLOCK_4X4, aom_highbd_sad4x4_bits12, aom_highbd_sad4x4_avg_bits12,
+ aom_highbd_12_variance4x4, aom_highbd_12_sub_pixel_variance4x4,
+ aom_highbd_12_sub_pixel_avg_variance4x4,
+ aom_highbd_sad4x4x4d_bits12, aom_highbd_jnt_sad4x4_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance4x4);
+
+ HIGHBD_BFP(BLOCK_128X128, aom_highbd_sad128x128_bits12,
+ aom_highbd_sad128x128_avg_bits12,
+ aom_highbd_12_variance128x128,
+ aom_highbd_12_sub_pixel_variance128x128,
+ aom_highbd_12_sub_pixel_avg_variance128x128,
+ aom_highbd_sad128x128x4d_bits12,
+ aom_highbd_jnt_sad128x128_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance128x128);
+
+ HIGHBD_BFP(
+ BLOCK_128X64, aom_highbd_sad128x64_bits12,
+ aom_highbd_sad128x64_avg_bits12, aom_highbd_12_variance128x64,
+ aom_highbd_12_sub_pixel_variance128x64,
+ aom_highbd_12_sub_pixel_avg_variance128x64,
+ aom_highbd_sad128x64x4d_bits12, aom_highbd_jnt_sad128x64_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance128x64);
+
+ HIGHBD_BFP(
+ BLOCK_64X128, aom_highbd_sad64x128_bits12,
+ aom_highbd_sad64x128_avg_bits12, aom_highbd_12_variance64x128,
+ aom_highbd_12_sub_pixel_variance64x128,
+ aom_highbd_12_sub_pixel_avg_variance64x128,
+ aom_highbd_sad64x128x4d_bits12, aom_highbd_jnt_sad64x128_avg_bits12,
+ aom_highbd_12_jnt_sub_pixel_avg_variance64x128);
+
+ HIGHBD_MBFP(BLOCK_128X128, aom_highbd_masked_sad128x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x128)
+ HIGHBD_MBFP(BLOCK_128X64, aom_highbd_masked_sad128x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance128x64)
+ HIGHBD_MBFP(BLOCK_64X128, aom_highbd_masked_sad64x128_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x128)
+ HIGHBD_MBFP(BLOCK_64X64, aom_highbd_masked_sad64x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x64)
+ HIGHBD_MBFP(BLOCK_64X32, aom_highbd_masked_sad64x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x32)
+ HIGHBD_MBFP(BLOCK_32X64, aom_highbd_masked_sad32x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x64)
+ HIGHBD_MBFP(BLOCK_32X32, aom_highbd_masked_sad32x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x32)
+ HIGHBD_MBFP(BLOCK_32X16, aom_highbd_masked_sad32x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x16)
+ HIGHBD_MBFP(BLOCK_16X32, aom_highbd_masked_sad16x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x32)
+ HIGHBD_MBFP(BLOCK_16X16, aom_highbd_masked_sad16x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x16)
+ HIGHBD_MBFP(BLOCK_8X16, aom_highbd_masked_sad8x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x16)
+ HIGHBD_MBFP(BLOCK_16X8, aom_highbd_masked_sad16x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x8)
+ HIGHBD_MBFP(BLOCK_8X8, aom_highbd_masked_sad8x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x8)
+ HIGHBD_MBFP(BLOCK_4X8, aom_highbd_masked_sad4x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x8)
+ HIGHBD_MBFP(BLOCK_8X4, aom_highbd_masked_sad8x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x4)
+ HIGHBD_MBFP(BLOCK_4X4, aom_highbd_masked_sad4x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x4)
+ HIGHBD_MBFP(BLOCK_64X16, aom_highbd_masked_sad64x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance64x16)
+ HIGHBD_MBFP(BLOCK_16X64, aom_highbd_masked_sad16x64_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x64)
+ HIGHBD_MBFP(BLOCK_32X8, aom_highbd_masked_sad32x8_bits12,
+ aom_highbd_12_masked_sub_pixel_variance32x8)
+ HIGHBD_MBFP(BLOCK_8X32, aom_highbd_masked_sad8x32_bits12,
+ aom_highbd_12_masked_sub_pixel_variance8x32)
+ HIGHBD_MBFP(BLOCK_16X4, aom_highbd_masked_sad16x4_bits12,
+ aom_highbd_12_masked_sub_pixel_variance16x4)
+ HIGHBD_MBFP(BLOCK_4X16, aom_highbd_masked_sad4x16_bits12,
+ aom_highbd_12_masked_sub_pixel_variance4x16)
+ HIGHBD_OBFP(BLOCK_128X128, aom_highbd_obmc_sad128x128_bits12,
+ aom_highbd_12_obmc_variance128x128,
+ aom_highbd_12_obmc_sub_pixel_variance128x128)
+ HIGHBD_OBFP(BLOCK_128X64, aom_highbd_obmc_sad128x64_bits12,
+ aom_highbd_12_obmc_variance128x64,
+ aom_highbd_12_obmc_sub_pixel_variance128x64)
+ HIGHBD_OBFP(BLOCK_64X128, aom_highbd_obmc_sad64x128_bits12,
+ aom_highbd_12_obmc_variance64x128,
+ aom_highbd_12_obmc_sub_pixel_variance64x128)
+ HIGHBD_OBFP(BLOCK_64X64, aom_highbd_obmc_sad64x64_bits12,
+ aom_highbd_12_obmc_variance64x64,
+ aom_highbd_12_obmc_sub_pixel_variance64x64)
+ HIGHBD_OBFP(BLOCK_64X32, aom_highbd_obmc_sad64x32_bits12,
+ aom_highbd_12_obmc_variance64x32,
+ aom_highbd_12_obmc_sub_pixel_variance64x32)
+ HIGHBD_OBFP(BLOCK_32X64, aom_highbd_obmc_sad32x64_bits12,
+ aom_highbd_12_obmc_variance32x64,
+ aom_highbd_12_obmc_sub_pixel_variance32x64)
+ HIGHBD_OBFP(BLOCK_32X32, aom_highbd_obmc_sad32x32_bits12,
+ aom_highbd_12_obmc_variance32x32,
+ aom_highbd_12_obmc_sub_pixel_variance32x32)
+ HIGHBD_OBFP(BLOCK_32X16, aom_highbd_obmc_sad32x16_bits12,
+ aom_highbd_12_obmc_variance32x16,
+ aom_highbd_12_obmc_sub_pixel_variance32x16)
+ HIGHBD_OBFP(BLOCK_16X32, aom_highbd_obmc_sad16x32_bits12,
+ aom_highbd_12_obmc_variance16x32,
+ aom_highbd_12_obmc_sub_pixel_variance16x32)
+ HIGHBD_OBFP(BLOCK_16X16, aom_highbd_obmc_sad16x16_bits12,
+ aom_highbd_12_obmc_variance16x16,
+ aom_highbd_12_obmc_sub_pixel_variance16x16)
+ HIGHBD_OBFP(BLOCK_8X16, aom_highbd_obmc_sad8x16_bits12,
+ aom_highbd_12_obmc_variance8x16,
+ aom_highbd_12_obmc_sub_pixel_variance8x16)
+ HIGHBD_OBFP(BLOCK_16X8, aom_highbd_obmc_sad16x8_bits12,
+ aom_highbd_12_obmc_variance16x8,
+ aom_highbd_12_obmc_sub_pixel_variance16x8)
+ HIGHBD_OBFP(BLOCK_8X8, aom_highbd_obmc_sad8x8_bits12,
+ aom_highbd_12_obmc_variance8x8,
+ aom_highbd_12_obmc_sub_pixel_variance8x8)
+ HIGHBD_OBFP(BLOCK_4X8, aom_highbd_obmc_sad4x8_bits12,
+ aom_highbd_12_obmc_variance4x8,
+ aom_highbd_12_obmc_sub_pixel_variance4x8)
+ HIGHBD_OBFP(BLOCK_8X4, aom_highbd_obmc_sad8x4_bits12,
+ aom_highbd_12_obmc_variance8x4,
+ aom_highbd_12_obmc_sub_pixel_variance8x4)
+ HIGHBD_OBFP(BLOCK_4X4, aom_highbd_obmc_sad4x4_bits12,
+ aom_highbd_12_obmc_variance4x4,
+ aom_highbd_12_obmc_sub_pixel_variance4x4)
+ HIGHBD_OBFP(BLOCK_64X16, aom_highbd_obmc_sad64x16_bits12,
+ aom_highbd_12_obmc_variance64x16,
+ aom_highbd_12_obmc_sub_pixel_variance64x16)
+ HIGHBD_OBFP(BLOCK_16X64, aom_highbd_obmc_sad16x64_bits12,
+ aom_highbd_12_obmc_variance16x64,
+ aom_highbd_12_obmc_sub_pixel_variance16x64)
+ HIGHBD_OBFP(BLOCK_32X8, aom_highbd_obmc_sad32x8_bits12,
+ aom_highbd_12_obmc_variance32x8,
+ aom_highbd_12_obmc_sub_pixel_variance32x8)
+ HIGHBD_OBFP(BLOCK_8X32, aom_highbd_obmc_sad8x32_bits12,
+ aom_highbd_12_obmc_variance8x32,
+ aom_highbd_12_obmc_sub_pixel_variance8x32)
+ HIGHBD_OBFP(BLOCK_16X4, aom_highbd_obmc_sad16x4_bits12,
+ aom_highbd_12_obmc_variance16x4,
+ aom_highbd_12_obmc_sub_pixel_variance16x4)
+ HIGHBD_OBFP(BLOCK_4X16, aom_highbd_obmc_sad4x16_bits12,
+ aom_highbd_12_obmc_variance4x16,
+ aom_highbd_12_obmc_sub_pixel_variance4x16)
+ break;
+
+ default:
+ assert(0 &&
+ "cm->seq_params.bit_depth should be AOM_BITS_8, "
+ "AOM_BITS_10 or AOM_BITS_12");
+ }
+ }
+}
+
+/* Releases and re-creates the three per-frame maps held by the encoder: the
+ * segmentation map, the cyclic-refresh state and the active map. All three
+ * are sized from the current cm->mi_rows and cm->mi_cols, and every
+ * allocation result is checked with CHECK_MEM_ERROR. */
+static void realloc_segmentation_maps(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int mi_rows = cm->mi_rows;
+  const int mi_cols = cm->mi_cols;
+  const int mi_count = mi_rows * mi_cols;
+
+  // Encoder segmentation map: mi_count one-byte entries, all cleared to 0.
+  aom_free(cpi->segmentation_map);
+  CHECK_MEM_ERROR(cm, cpi->segmentation_map, aom_calloc(mi_count, 1));
+
+  // Cyclic background refresh state: drop any existing instance, then
+  // allocate a new one for the current dimensions.
+  if (cpi->cyclic_refresh) {
+    av1_cyclic_refresh_free(cpi->cyclic_refresh);
+  }
+  CHECK_MEM_ERROR(cm, cpi->cyclic_refresh,
+                  av1_cyclic_refresh_alloc(mi_rows, mi_cols));
+
+  // Map used to mark inactive areas: same size, also cleared to 0.
+  aom_free(cpi->active_map.map);
+  CHECK_MEM_ERROR(cm, cpi->active_map.map, aom_calloc(mi_count, 1));
+}
+/* Applies the encoder configuration in oxcf to the compressor instance cpi.
+ *
+ * In order, this function:
+ *  - copies profile, bit depth, colour description and timing fields from
+ *    oxcf into the sequence header / common state;
+ *  - selects how the decoder-model operating parameters are initialised;
+ *  - stores a copy of the whole configuration in cpi->oxcf;
+ *  - allocates the palette, convolve and OBMC scratch buffers if they are
+ *    still NULL;
+ *  - refreshes rate-control buffer sizes, frame rate and quality limits;
+ *  - updates render/frame dimensions and, when the frame outgrew the initial
+ *    size or the superblock size changed, frees and re-allocates the context
+ *    buffers, PC tree, compressor data and segmentation maps;
+ *  - re-installs tile info and the high-bit-depth function pointers.
+ *
+ * Sequence-level settings (superblock size, operating point count, coding
+ * tools) are only written while cpi->seq_params_locked is zero.
+ * Allocation results are checked with CHECK_MEM_ERROR.
+ */
+void av1_change_config(struct AV1_COMP *cpi, const AV1EncoderConfig *oxcf) {
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ RATE_CONTROL *const rc = &cpi->rc;
+ MACROBLOCK *const x = &cpi->td.mb;
+
+ if (seq_params->profile != oxcf->profile) seq_params->profile = oxcf->profile;
+ seq_params->bit_depth = oxcf->bit_depth;
+ seq_params->color_primaries = oxcf->color_primaries;
+ seq_params->transfer_characteristics = oxcf->transfer_characteristics;
+ seq_params->matrix_coefficients = oxcf->matrix_coefficients;
+ seq_params->monochrome = oxcf->monochrome;
+ seq_params->chroma_sample_position = oxcf->chroma_sample_position;
+ seq_params->color_range = oxcf->color_range;
+
+ assert(IMPLIES(seq_params->profile <= PROFILE_1,
+ seq_params->bit_depth <= AOM_BITS_10)); // profiles up to PROFILE_1 are limited to 10-bit depth
+
+ cm->timing_info_present = oxcf->timing_info_present;
+ cm->timing_info.num_units_in_display_tick =
+ oxcf->timing_info.num_units_in_display_tick;
+ cm->timing_info.time_scale = oxcf->timing_info.time_scale;
+ cm->timing_info.equal_picture_interval =
+ oxcf->timing_info.equal_picture_interval;
+ cm->timing_info.num_ticks_per_picture =
+ oxcf->timing_info.num_ticks_per_picture;
+
+ seq_params->display_model_info_present_flag =
+ oxcf->display_model_info_present_flag;
+ seq_params->decoder_model_info_present_flag =
+ oxcf->decoder_model_info_present_flag;
+ if (oxcf->decoder_model_info_present_flag) {
+ // set the decoder model parameters in schedule mode
+ cm->buffer_model.num_units_in_decoding_tick =
+ oxcf->buffer_model.num_units_in_decoding_tick;
+ cm->buffer_removal_time_present = 1;
+ set_aom_dec_model_info(&cm->buffer_model);
+ set_dec_model_op_parameters(&cm->op_params[0]);
+ } else if (cm->timing_info_present &&
+ cm->timing_info.equal_picture_interval &&
+ !seq_params->decoder_model_info_present_flag) {
+ /* set the decoder model parameters in resource availability mode.
+ * NOTE(review): the decoder_model_info_present_flag test above looks
+ * redundant: the flag was just copied from oxcf, and this is the else
+ * branch of the oxcf->decoder_model_info_present_flag check. */
+ set_resource_availability_parameters(&cm->op_params[0]);
+ } else {
+ cm->op_params[0].initial_display_delay =
+ 10; // Default value (not signaled)
+ }
+
+ update_film_grain_parameters(cpi, oxcf); // runs before cpi->oxcf is replaced on the next statement
+
+ cpi->oxcf = *oxcf; // struct copy: cpi keeps its own copy of the configuration
+ cpi->common.options = oxcf->cfg;
+ cpi->row_mt = oxcf->row_mt;
+ x->e_mbd.bd = (int)seq_params->bit_depth;
+ x->e_mbd.global_motion = cm->global_motion;
+
+ if ((oxcf->pass == 0) && (oxcf->rc_mode == AOM_Q)) { // pass 0 with AOM_Q rate control
+ rc->baseline_gf_interval = FIXED_GF_INTERVAL;
+ } else {
+ rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2; // midpoint of the allowed range
+ }
+
+ cpi->refresh_last_frame = 1;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+
+ cm->refresh_frame_context = (oxcf->frame_parallel_decoding_mode)
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED; // overrides the choice made just above
+
+ if (x->palette_buffer == NULL) { // allocated only if not already present
+ CHECK_MEM_ERROR(cm, x->palette_buffer,
+ aom_memalign(16, sizeof(*x->palette_buffer)));
+ }
+
+ if (x->tmp_conv_dst == NULL) { // allocated only if not already present
+ CHECK_MEM_ERROR(
+ cm, x->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * sizeof(*x->tmp_conv_dst)));
+ x->e_mbd.tmp_conv_dst = x->tmp_conv_dst; // e_mbd points at the same buffer
+ }
+ for (int i = 0; i < 2; ++i) {
+ if (x->tmp_obmc_bufs[i] == NULL) { // allocated only if not already present
+ CHECK_MEM_ERROR(cm, x->tmp_obmc_bufs[i],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*x->tmp_obmc_bufs[i])));
+ x->e_mbd.tmp_obmc_bufs[i] = x->tmp_obmc_bufs[i]; // e_mbd points at the same buffer
+ }
+ }
+
+ av1_reset_segment_features(cm);
+ set_high_precision_mv(cpi, 1, 0);
+
+ set_rc_buffer_sizes(rc, &cpi->oxcf);
+
+ // Under a configuration change, where maximum_buffer_size may change,
+ // keep buffer level clipped to the maximum allowed buffer size.
+ rc->bits_off_target = AOMMIN(rc->bits_off_target, rc->maximum_buffer_size);
+ rc->buffer_level = AOMMIN(rc->buffer_level, rc->maximum_buffer_size);
+
+ // Set up frame rate and related parameters rate control values.
+ av1_new_framerate(cpi, cpi->framerate);
+
+ // Set absolute upper and lower quality limits
+ rc->worst_quality = cpi->oxcf.worst_allowed_q;
+ rc->best_quality = cpi->oxcf.best_allowed_q;
+
+ cm->interp_filter = oxcf->large_scale_tile ? EIGHTTAP_REGULAR : SWITCHABLE;
+ cm->switchable_motion_mode = 1;
+
+ if (cpi->oxcf.render_width > 0 && cpi->oxcf.render_height > 0) {
+ cm->render_width = cpi->oxcf.render_width;
+ cm->render_height = cpi->oxcf.render_height;
+ } else { // no positive render size configured: fall back to oxcf width/height
+ cm->render_width = cpi->oxcf.width;
+ cm->render_height = cpi->oxcf.height;
+ }
+ cm->width = cpi->oxcf.width;
+ cm->height = cpi->oxcf.height;
+
+ int sb_size = seq_params->sb_size; // value before any update below; used to detect a change
+ // Superblock size should not be updated after the first key frame.
+ if (!cpi->seq_params_locked) {
+ set_sb_size(&cm->seq_params, select_sb_size(cpi));
+ }
+
+ if (cpi->initial_width || sb_size != seq_params->sb_size) {
+ if (cm->width > cpi->initial_width || cm->height > cpi->initial_height ||
+ seq_params->sb_size != sb_size) { // frame exceeds the initial size, or sb size changed
+ av1_free_context_buffers(cm);
+ av1_free_pc_tree(&cpi->td, num_planes);
+ alloc_compressor_data(cpi);
+ realloc_segmentation_maps(cpi);
+ cpi->initial_width = cpi->initial_height = 0; // NOTE(review): presumably re-established by later frame-size setup; confirm
+ }
+ }
+ update_frame_size(cpi);
+
+ cpi->alt_ref_source = NULL;
+ rc->is_src_frame_alt_ref = 0;
+
+ rc->is_bwd_ref_frame = 0;
+ rc->is_last_bipred_frame = 0;
+ rc->is_bipred_frame = 0;
+
+ set_tile_info(cpi);
+
+ cpi->ext_refresh_frame_flags_pending = 0;
+ cpi->ext_refresh_frame_context_pending = 0;
+
+ highbd_set_var_fns(cpi);
+
+ // Init sequence level coding tools
+ // This should not be called after the first key frame.
+ if (!cpi->seq_params_locked) {
+ seq_params->operating_points_cnt_minus_1 =
+ cm->number_spatial_layers > 1 ? cm->number_spatial_layers - 1 : 0; // 0 when there is at most one spatial layer
+ init_seq_coding_tools(&cm->seq_params, cm, oxcf);
+ }
+}
+
+AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool) {
+ unsigned int i;
+ AV1_COMP *volatile const cpi = aom_memalign(32, sizeof(AV1_COMP));
+ AV1_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL;
+
+ if (!cm) return NULL;
+
+ av1_zero(*cpi);
+
+ // The jmp_buf is valid only for the duration of the function that calls
+ // setjmp(). Therefore, this function must reset the 'setjmp' field to 0
+ // before it returns.
+ if (setjmp(cm->error.jmp)) {
+ cm->error.setjmp = 0;
+ av1_remove_compressor(cpi);
+ return 0;
+ }
+
+ cm->error.setjmp = 1;
+ cm->alloc_mi = enc_alloc_mi;
+ cm->free_mi = enc_free_mi;
+ cm->setup_mi = enc_setup_mi;
+
+ CHECK_MEM_ERROR(cm, cm->fc,
+ (FRAME_CONTEXT *)aom_memalign(32, sizeof(*cm->fc)));
+ CHECK_MEM_ERROR(cm, cm->frame_contexts,
+ (FRAME_CONTEXT *)aom_memalign(
+ 32, FRAME_CONTEXTS * sizeof(*cm->frame_contexts)));
+ memset(cm->fc, 0, sizeof(*cm->fc));
+ memset(cm->frame_contexts, 0, FRAME_CONTEXTS * sizeof(*cm->frame_contexts));
+
+ cpi->resize_state = 0;
+ cpi->resize_avg_qp = 0;
+ cpi->resize_buffer_underflow = 0;
+
+ cpi->common.buffer_pool = pool;
+
+ init_config(cpi, oxcf);
+ av1_rc_init(&cpi->oxcf, oxcf->pass, &cpi->rc);
+
+ cm->current_video_frame = 0;
+ cpi->seq_params_locked = 0;
+ cpi->partition_search_skippable_frame = 0;
+ cpi->tile_data = NULL;
+ cpi->last_show_frame_buf_idx = INVALID_IDX;
+
+ realloc_segmentation_maps(cpi);
+
+ memset(cpi->nmv_costs, 0, sizeof(cpi->nmv_costs));
+ memset(cpi->nmv_costs_hp, 0, sizeof(cpi->nmv_costs_hp));
+
+ for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]));
+ i++) {
+ CHECK_MEM_ERROR(
+ cm, cpi->mbgraph_stats[i].mb_stats,
+ aom_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1));
+ }
+
+#if CONFIG_FP_MB_STATS
+ cpi->use_fp_mb_stats = 0;
+ if (cpi->use_fp_mb_stats) {
+ // a place holder used to store the first pass mb stats in the first pass
+ CHECK_MEM_ERROR(cm, cpi->twopass.frame_mb_stats_buf,
+ aom_calloc(cm->MBs * sizeof(uint8_t), 1));
+ } else {
+ cpi->twopass.frame_mb_stats_buf = NULL;
+ }
+#endif
+
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
+#if CONFIG_INTERNAL_STATS
+ cpi->b_calculate_blockiness = 1;
+ cpi->b_calculate_consistency = 1;
+ cpi->total_inconsistency = 0;
+ cpi->psnr.worst = 100.0;
+ cpi->worst_ssim = 100.0;
+
+ cpi->count = 0;
+ cpi->bytes = 0;
+
+ if (cpi->b_calculate_psnr) {
+ cpi->total_sq_error = 0;
+ cpi->total_samples = 0;
+ cpi->tot_recode_hits = 0;
+ cpi->summed_quality = 0;
+ cpi->summed_weights = 0;
+ }
+
+ cpi->fastssim.worst = 100.0;
+ cpi->psnrhvs.worst = 100.0;
+
+ if (cpi->b_calculate_blockiness) {
+ cpi->total_blockiness = 0;
+ cpi->worst_blockiness = 0.0;
+ }
+
+ if (cpi->b_calculate_consistency) {
+ CHECK_MEM_ERROR(cm, cpi->ssim_vars,
+ aom_malloc(sizeof(*cpi->ssim_vars) * 4 *
+ cpi->common.mi_rows * cpi->common.mi_cols));
+ cpi->worst_consistency = 100.0;
+ }
+#endif
+#if CONFIG_ENTROPY_STATS
+ av1_zero(aggregate_fc);
+#endif // CONFIG_ENTROPY_STATS
+
+ cpi->first_time_stamp_ever = INT64_MAX;
+
+ cpi->td.mb.nmvcost[0] = &cpi->nmv_costs[0][MV_MAX];
+ cpi->td.mb.nmvcost[1] = &cpi->nmv_costs[1][MV_MAX];
+ cpi->td.mb.nmvcost_hp[0] = &cpi->nmv_costs_hp[0][MV_MAX];
+ cpi->td.mb.nmvcost_hp[1] = &cpi->nmv_costs_hp[1][MV_MAX];
+
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
+#ifdef OUTPUT_YUV_REC
+ yuv_rec_file = fopen("rec.yuv", "wb");
+#endif
+
+ if (oxcf->pass == 1) {
+ av1_init_first_pass(cpi);
+ } else if (oxcf->pass == 2) {
+ const size_t packet_sz = sizeof(FIRSTPASS_STATS);
+ const int packets = (int)(oxcf->two_pass_stats_in.sz / packet_sz);
+
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ const size_t psz = cpi->common.MBs * sizeof(uint8_t);
+ const int ps = (int)(oxcf->firstpass_mb_stats_in.sz / psz);
+
+ cpi->twopass.firstpass_mb_stats.mb_stats_start =
+ oxcf->firstpass_mb_stats_in.buf;
+ cpi->twopass.firstpass_mb_stats.mb_stats_end =
+ cpi->twopass.firstpass_mb_stats.mb_stats_start +
+ (ps - 1) * cpi->common.MBs * sizeof(uint8_t);
+ }
+#endif
+
+ cpi->twopass.stats_in_start = oxcf->two_pass_stats_in.buf;
+ cpi->twopass.stats_in = cpi->twopass.stats_in_start;
+ cpi->twopass.stats_in_end = &cpi->twopass.stats_in[packets - 1];
+
+ av1_init_second_pass(cpi);
+ }
+
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.above_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.above_pred_buf)));
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.left_pred_buf,
+ (uint8_t *)aom_memalign(16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*cpi->td.mb.left_pred_buf)));
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.wsrc_buf)));
+
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, cpi->td.mb.hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*cpi->td.mb.hash_value_buffer[0][0])));
+
+ cpi->td.mb.g_crc_initialized = 0;
+
+ CHECK_MEM_ERROR(cm, cpi->td.mb.mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*cpi->td.mb.mask_buf)));
+
+ av1_set_speed_features_framesize_independent(cpi);
+ av1_set_speed_features_framesize_dependent(cpi);
+
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, JSDAF, JSVAF) \
+ cpi->fn_ptr[BT].sdf = SDF; \
+ cpi->fn_ptr[BT].sdaf = SDAF; \
+ cpi->fn_ptr[BT].vf = VF; \
+ cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
+ cpi->fn_ptr[BT].sdx4df = SDX4DF; \
+ cpi->fn_ptr[BT].jsdaf = JSDAF; \
+ cpi->fn_ptr[BT].jsvaf = JSVAF;
+
+ BFP(BLOCK_4X16, aom_sad4x16, aom_sad4x16_avg, aom_variance4x16,
+ aom_sub_pixel_variance4x16, aom_sub_pixel_avg_variance4x16,
+ aom_sad4x16x4d, aom_jnt_sad4x16_avg, aom_jnt_sub_pixel_avg_variance4x16)
+
+ BFP(BLOCK_16X4, aom_sad16x4, aom_sad16x4_avg, aom_variance16x4,
+ aom_sub_pixel_variance16x4, aom_sub_pixel_avg_variance16x4,
+ aom_sad16x4x4d, aom_jnt_sad16x4_avg, aom_jnt_sub_pixel_avg_variance16x4)
+
+ BFP(BLOCK_8X32, aom_sad8x32, aom_sad8x32_avg, aom_variance8x32,
+ aom_sub_pixel_variance8x32, aom_sub_pixel_avg_variance8x32,
+ aom_sad8x32x4d, aom_jnt_sad8x32_avg, aom_jnt_sub_pixel_avg_variance8x32)
+
+ BFP(BLOCK_32X8, aom_sad32x8, aom_sad32x8_avg, aom_variance32x8,
+ aom_sub_pixel_variance32x8, aom_sub_pixel_avg_variance32x8,
+ aom_sad32x8x4d, aom_jnt_sad32x8_avg, aom_jnt_sub_pixel_avg_variance32x8)
+
+ BFP(BLOCK_16X64, aom_sad16x64, aom_sad16x64_avg, aom_variance16x64,
+ aom_sub_pixel_variance16x64, aom_sub_pixel_avg_variance16x64,
+ aom_sad16x64x4d, aom_jnt_sad16x64_avg,
+ aom_jnt_sub_pixel_avg_variance16x64)
+
+ BFP(BLOCK_64X16, aom_sad64x16, aom_sad64x16_avg, aom_variance64x16,
+ aom_sub_pixel_variance64x16, aom_sub_pixel_avg_variance64x16,
+ aom_sad64x16x4d, aom_jnt_sad64x16_avg,
+ aom_jnt_sub_pixel_avg_variance64x16)
+
+ BFP(BLOCK_128X128, aom_sad128x128, aom_sad128x128_avg, aom_variance128x128,
+ aom_sub_pixel_variance128x128, aom_sub_pixel_avg_variance128x128,
+ aom_sad128x128x4d, aom_jnt_sad128x128_avg,
+ aom_jnt_sub_pixel_avg_variance128x128)
+
+ BFP(BLOCK_128X64, aom_sad128x64, aom_sad128x64_avg, aom_variance128x64,
+ aom_sub_pixel_variance128x64, aom_sub_pixel_avg_variance128x64,
+ aom_sad128x64x4d, aom_jnt_sad128x64_avg,
+ aom_jnt_sub_pixel_avg_variance128x64)
+
+ BFP(BLOCK_64X128, aom_sad64x128, aom_sad64x128_avg, aom_variance64x128,
+ aom_sub_pixel_variance64x128, aom_sub_pixel_avg_variance64x128,
+ aom_sad64x128x4d, aom_jnt_sad64x128_avg,
+ aom_jnt_sub_pixel_avg_variance64x128)
+
+ BFP(BLOCK_32X16, aom_sad32x16, aom_sad32x16_avg, aom_variance32x16,
+ aom_sub_pixel_variance32x16, aom_sub_pixel_avg_variance32x16,
+ aom_sad32x16x4d, aom_jnt_sad32x16_avg,
+ aom_jnt_sub_pixel_avg_variance32x16)
+
+ BFP(BLOCK_16X32, aom_sad16x32, aom_sad16x32_avg, aom_variance16x32,
+ aom_sub_pixel_variance16x32, aom_sub_pixel_avg_variance16x32,
+ aom_sad16x32x4d, aom_jnt_sad16x32_avg,
+ aom_jnt_sub_pixel_avg_variance16x32)
+
+ BFP(BLOCK_64X32, aom_sad64x32, aom_sad64x32_avg, aom_variance64x32,
+ aom_sub_pixel_variance64x32, aom_sub_pixel_avg_variance64x32,
+ aom_sad64x32x4d, aom_jnt_sad64x32_avg,
+ aom_jnt_sub_pixel_avg_variance64x32)
+
+ BFP(BLOCK_32X64, aom_sad32x64, aom_sad32x64_avg, aom_variance32x64,
+ aom_sub_pixel_variance32x64, aom_sub_pixel_avg_variance32x64,
+ aom_sad32x64x4d, aom_jnt_sad32x64_avg,
+ aom_jnt_sub_pixel_avg_variance32x64)
+
+ BFP(BLOCK_32X32, aom_sad32x32, aom_sad32x32_avg, aom_variance32x32,
+ aom_sub_pixel_variance32x32, aom_sub_pixel_avg_variance32x32,
+ aom_sad32x32x4d, aom_jnt_sad32x32_avg,
+ aom_jnt_sub_pixel_avg_variance32x32)
+
+ BFP(BLOCK_64X64, aom_sad64x64, aom_sad64x64_avg, aom_variance64x64,
+ aom_sub_pixel_variance64x64, aom_sub_pixel_avg_variance64x64,
+ aom_sad64x64x4d, aom_jnt_sad64x64_avg,
+ aom_jnt_sub_pixel_avg_variance64x64)
+
+ BFP(BLOCK_16X16, aom_sad16x16, aom_sad16x16_avg, aom_variance16x16,
+ aom_sub_pixel_variance16x16, aom_sub_pixel_avg_variance16x16,
+ aom_sad16x16x4d, aom_jnt_sad16x16_avg,
+ aom_jnt_sub_pixel_avg_variance16x16)
+
+ BFP(BLOCK_16X8, aom_sad16x8, aom_sad16x8_avg, aom_variance16x8,
+ aom_sub_pixel_variance16x8, aom_sub_pixel_avg_variance16x8,
+ aom_sad16x8x4d, aom_jnt_sad16x8_avg, aom_jnt_sub_pixel_avg_variance16x8)
+
+ BFP(BLOCK_8X16, aom_sad8x16, aom_sad8x16_avg, aom_variance8x16,
+ aom_sub_pixel_variance8x16, aom_sub_pixel_avg_variance8x16,
+ aom_sad8x16x4d, aom_jnt_sad8x16_avg, aom_jnt_sub_pixel_avg_variance8x16)
+
+ BFP(BLOCK_8X8, aom_sad8x8, aom_sad8x8_avg, aom_variance8x8,
+ aom_sub_pixel_variance8x8, aom_sub_pixel_avg_variance8x8, aom_sad8x8x4d,
+ aom_jnt_sad8x8_avg, aom_jnt_sub_pixel_avg_variance8x8)
+
+ BFP(BLOCK_8X4, aom_sad8x4, aom_sad8x4_avg, aom_variance8x4,
+ aom_sub_pixel_variance8x4, aom_sub_pixel_avg_variance8x4, aom_sad8x4x4d,
+ aom_jnt_sad8x4_avg, aom_jnt_sub_pixel_avg_variance8x4)
+
+ BFP(BLOCK_4X8, aom_sad4x8, aom_sad4x8_avg, aom_variance4x8,
+ aom_sub_pixel_variance4x8, aom_sub_pixel_avg_variance4x8, aom_sad4x8x4d,
+ aom_jnt_sad4x8_avg, aom_jnt_sub_pixel_avg_variance4x8)
+
+ BFP(BLOCK_4X4, aom_sad4x4, aom_sad4x4_avg, aom_variance4x4,
+ aom_sub_pixel_variance4x4, aom_sub_pixel_avg_variance4x4, aom_sad4x4x4d,
+ aom_jnt_sad4x4_avg, aom_jnt_sub_pixel_avg_variance4x4)
+
+#define OBFP(BT, OSDF, OVF, OSVF) \
+ cpi->fn_ptr[BT].osdf = OSDF; \
+ cpi->fn_ptr[BT].ovf = OVF; \
+ cpi->fn_ptr[BT].osvf = OSVF;
+
+ OBFP(BLOCK_128X128, aom_obmc_sad128x128, aom_obmc_variance128x128,
+ aom_obmc_sub_pixel_variance128x128)
+ OBFP(BLOCK_128X64, aom_obmc_sad128x64, aom_obmc_variance128x64,
+ aom_obmc_sub_pixel_variance128x64)
+ OBFP(BLOCK_64X128, aom_obmc_sad64x128, aom_obmc_variance64x128,
+ aom_obmc_sub_pixel_variance64x128)
+ OBFP(BLOCK_64X64, aom_obmc_sad64x64, aom_obmc_variance64x64,
+ aom_obmc_sub_pixel_variance64x64)
+ OBFP(BLOCK_64X32, aom_obmc_sad64x32, aom_obmc_variance64x32,
+ aom_obmc_sub_pixel_variance64x32)
+ OBFP(BLOCK_32X64, aom_obmc_sad32x64, aom_obmc_variance32x64,
+ aom_obmc_sub_pixel_variance32x64)
+ OBFP(BLOCK_32X32, aom_obmc_sad32x32, aom_obmc_variance32x32,
+ aom_obmc_sub_pixel_variance32x32)
+ OBFP(BLOCK_32X16, aom_obmc_sad32x16, aom_obmc_variance32x16,
+ aom_obmc_sub_pixel_variance32x16)
+ OBFP(BLOCK_16X32, aom_obmc_sad16x32, aom_obmc_variance16x32,
+ aom_obmc_sub_pixel_variance16x32)
+ OBFP(BLOCK_16X16, aom_obmc_sad16x16, aom_obmc_variance16x16,
+ aom_obmc_sub_pixel_variance16x16)
+ OBFP(BLOCK_16X8, aom_obmc_sad16x8, aom_obmc_variance16x8,
+ aom_obmc_sub_pixel_variance16x8)
+ OBFP(BLOCK_8X16, aom_obmc_sad8x16, aom_obmc_variance8x16,
+ aom_obmc_sub_pixel_variance8x16)
+ OBFP(BLOCK_8X8, aom_obmc_sad8x8, aom_obmc_variance8x8,
+ aom_obmc_sub_pixel_variance8x8)
+ OBFP(BLOCK_4X8, aom_obmc_sad4x8, aom_obmc_variance4x8,
+ aom_obmc_sub_pixel_variance4x8)
+ OBFP(BLOCK_8X4, aom_obmc_sad8x4, aom_obmc_variance8x4,
+ aom_obmc_sub_pixel_variance8x4)
+ OBFP(BLOCK_4X4, aom_obmc_sad4x4, aom_obmc_variance4x4,
+ aom_obmc_sub_pixel_variance4x4)
+ OBFP(BLOCK_4X16, aom_obmc_sad4x16, aom_obmc_variance4x16,
+ aom_obmc_sub_pixel_variance4x16)
+ OBFP(BLOCK_16X4, aom_obmc_sad16x4, aom_obmc_variance16x4,
+ aom_obmc_sub_pixel_variance16x4)
+ OBFP(BLOCK_8X32, aom_obmc_sad8x32, aom_obmc_variance8x32,
+ aom_obmc_sub_pixel_variance8x32)
+ OBFP(BLOCK_32X8, aom_obmc_sad32x8, aom_obmc_variance32x8,
+ aom_obmc_sub_pixel_variance32x8)
+ OBFP(BLOCK_16X64, aom_obmc_sad16x64, aom_obmc_variance16x64,
+ aom_obmc_sub_pixel_variance16x64)
+ OBFP(BLOCK_64X16, aom_obmc_sad64x16, aom_obmc_variance64x16,
+ aom_obmc_sub_pixel_variance64x16)
+
+#define MBFP(BT, MCSDF, MCSVF) \
+ cpi->fn_ptr[BT].msdf = MCSDF; \
+ cpi->fn_ptr[BT].msvf = MCSVF;
+
+ MBFP(BLOCK_128X128, aom_masked_sad128x128,
+ aom_masked_sub_pixel_variance128x128)
+ MBFP(BLOCK_128X64, aom_masked_sad128x64, aom_masked_sub_pixel_variance128x64)
+ MBFP(BLOCK_64X128, aom_masked_sad64x128, aom_masked_sub_pixel_variance64x128)
+ MBFP(BLOCK_64X64, aom_masked_sad64x64, aom_masked_sub_pixel_variance64x64)
+ MBFP(BLOCK_64X32, aom_masked_sad64x32, aom_masked_sub_pixel_variance64x32)
+ MBFP(BLOCK_32X64, aom_masked_sad32x64, aom_masked_sub_pixel_variance32x64)
+ MBFP(BLOCK_32X32, aom_masked_sad32x32, aom_masked_sub_pixel_variance32x32)
+ MBFP(BLOCK_32X16, aom_masked_sad32x16, aom_masked_sub_pixel_variance32x16)
+ MBFP(BLOCK_16X32, aom_masked_sad16x32, aom_masked_sub_pixel_variance16x32)
+ MBFP(BLOCK_16X16, aom_masked_sad16x16, aom_masked_sub_pixel_variance16x16)
+ MBFP(BLOCK_16X8, aom_masked_sad16x8, aom_masked_sub_pixel_variance16x8)
+ MBFP(BLOCK_8X16, aom_masked_sad8x16, aom_masked_sub_pixel_variance8x16)
+ MBFP(BLOCK_8X8, aom_masked_sad8x8, aom_masked_sub_pixel_variance8x8)
+ MBFP(BLOCK_4X8, aom_masked_sad4x8, aom_masked_sub_pixel_variance4x8)
+ MBFP(BLOCK_8X4, aom_masked_sad8x4, aom_masked_sub_pixel_variance8x4)
+ MBFP(BLOCK_4X4, aom_masked_sad4x4, aom_masked_sub_pixel_variance4x4)
+
+ MBFP(BLOCK_4X16, aom_masked_sad4x16, aom_masked_sub_pixel_variance4x16)
+
+ MBFP(BLOCK_16X4, aom_masked_sad16x4, aom_masked_sub_pixel_variance16x4)
+
+ MBFP(BLOCK_8X32, aom_masked_sad8x32, aom_masked_sub_pixel_variance8x32)
+
+ MBFP(BLOCK_32X8, aom_masked_sad32x8, aom_masked_sub_pixel_variance32x8)
+
+ MBFP(BLOCK_16X64, aom_masked_sad16x64, aom_masked_sub_pixel_variance16x64)
+
+ MBFP(BLOCK_64X16, aom_masked_sad64x16, aom_masked_sub_pixel_variance64x16)
+
+ highbd_set_var_fns(cpi);
+
+ /* av1_init_quantizer() is first called here. Add check in
+ * av1_frame_init_quantizer() so that av1_init_quantizer is only
+ * called later when needed. This will avoid unnecessary calls of
+ * av1_init_quantizer() for every frame.
+ */
+ av1_init_quantizer(cpi);
+ av1_qm_init(cm);
+
+ av1_loop_filter_init(cm);
+ cm->superres_scale_denominator = SCALE_NUMERATOR;
+ cm->superres_upscaled_width = oxcf->width;
+ cm->superres_upscaled_height = oxcf->height;
+ av1_loop_restoration_precal();
+
+ cm->error.setjmp = 0;
+
+ return cpi;
+}
+
+#if CONFIG_INTERNAL_STATS /* Append-to-buffer helpers for the stats report.
+ * SNPRINT appends the string T to the NUL-terminated buffer H; SNPRINT2
+ * appends T formatted with the single value V. Both evaluate H several times
+ * and apply sizeof(H) to it, so H must be an actual char array (not a
+ * pointer). T is handed to snprintf() as the format string. */
+#define SNPRINT(H, T) snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
+
+#define SNPRINT2(H, T, V) \
+ snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V))
+#endif // CONFIG_INTERNAL_STATS
+
+// Tears down an encoder instance and releases the memory it owns. A NULL cpi
+// is ignored.
+//
+// When at least one frame has been coded, optional end-of-run statistics are
+// written first (counts.stt under CONFIG_ENTROPY_STATS, opsnr.stt under
+// CONFIG_INTERNAL_STATS). Failing to open either file only skips that report;
+// the teardown that follows always runs.
+void av1_remove_compressor(AV1_COMP *cpi) {
+  AV1_COMMON *cm;
+  unsigned int i;
+  int t;
+
+  if (!cpi) return;
+
+  cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+
+  if (cm->current_video_frame > 0) {
+#if CONFIG_ENTROPY_STATS
+    if (cpi->oxcf.pass != 1) {
+      fprintf(stderr, "Writing counts.stt\n");
+      FILE *f = fopen("counts.stt", "wb");
+      // fopen() may fail; never hand a NULL stream to fwrite()/fclose().
+      if (f != NULL) {
+        fwrite(&aggregate_fc, sizeof(aggregate_fc), 1, f);
+        fclose(f);
+      }
+    }
+#endif  // CONFIG_ENTROPY_STATS
+#if CONFIG_INTERNAL_STATS
+    aom_clear_system_state();
+
+    if (cpi->oxcf.pass != 1) {
+      char headings[512] = { 0 };
+      char results[512] = { 0 };
+      FILE *f = fopen("opsnr.stt", "a");
+      double time_encoded =
+          (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) /
+          10000000.000;
+      double total_encode_time =
+          (cpi->time_receive_data + cpi->time_compress_data) / 1000.000;
+      const double dr =
+          (double)cpi->bytes * (double)8 / (double)1000 / time_encoded;
+      const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
+      const double target_rate = (double)cpi->oxcf.target_bandwidth / 1000;
+      const double rate_err = ((100.0 * (dr - target_rate)) / target_rate);
+
+      // The report is only built when there is an open file to receive it.
+      if (f != NULL && cpi->b_calculate_psnr) {
+        const double total_psnr = aom_sse_to_psnr(
+            (double)cpi->total_samples, peak, (double)cpi->total_sq_error);
+        const double total_ssim =
+            100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+        snprintf(headings, sizeof(headings),
+                 "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
+                 "AOMSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
+                 "WstPsnr\tWstSsim\tWstFast\tWstHVS\t"
+                 "AVPsrnY\tAPsnrCb\tAPsnrCr");
+        snprintf(results, sizeof(results),
+                 "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
+                 "%7.3f\t%7.3f\t%7.3f",
+                 dr, cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr,
+                 cpi->psnr.stat[STAT_ALL] / cpi->count, total_psnr, total_ssim,
+                 total_ssim, cpi->fastssim.stat[STAT_ALL] / cpi->count,
+                 cpi->psnrhvs.stat[STAT_ALL] / cpi->count, cpi->psnr.worst,
+                 cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst,
+                 cpi->psnr.stat[STAT_Y] / cpi->count,
+                 cpi->psnr.stat[STAT_U] / cpi->count,
+                 cpi->psnr.stat[STAT_V] / cpi->count);
+
+        if (cpi->b_calculate_blockiness) {
+          SNPRINT(headings, "\t Block\tWstBlck");
+          SNPRINT2(results, "\t%7.3f", cpi->total_blockiness / cpi->count);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_blockiness);
+        }
+
+        if (cpi->b_calculate_consistency) {
+          double consistency =
+              aom_sse_to_psnr((double)cpi->total_samples, peak,
+                              (double)cpi->total_inconsistency);
+
+          SNPRINT(headings, "\tConsist\tWstCons");
+          SNPRINT2(results, "\t%7.3f", consistency);
+          SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
+        }
+        fprintf(f, "%s\t Time\tRcErr\tAbsErr\n", headings);
+        fprintf(f, "%s\t%8.0f\t%7.2f\t%7.2f\n", results, total_encode_time,
+                rate_err, fabs(rate_err));
+      }
+
+      if (f != NULL) fclose(f);
+    }
+#endif  // CONFIG_INTERNAL_STATS
+  }
+
+  for (t = 0; t < cpi->num_workers; ++t) {
+    AVxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    aom_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data. The last worker's td is not freed
+    // here.
+    if (t < cpi->num_workers - 1) {
+      aom_free(thread_data->td->palette_buffer);
+      aom_free(thread_data->td->tmp_conv_dst);
+      for (int j = 0; j < 2; ++j) {
+        aom_free(thread_data->td->tmp_obmc_bufs[j]);
+      }
+      aom_free(thread_data->td->above_pred_buf);
+      aom_free(thread_data->td->left_pred_buf);
+      aom_free(thread_data->td->wsrc_buf);
+      for (int x = 0; x < 2; x++) {
+        for (int y = 0; y < 2; y++) {
+          aom_free(thread_data->td->hash_value_buffer[x][y]);
+          thread_data->td->hash_value_buffer[x][y] = NULL;
+        }
+      }
+      aom_free(thread_data->td->mask_buf);
+      aom_free(thread_data->td->counts);
+      av1_free_pc_tree(thread_data->td, num_planes);
+      aom_free(thread_data->td);
+    }
+  }
+  aom_free(cpi->tile_thr_data);
+  aom_free(cpi->workers);
+
+  if (cpi->num_workers > 1) {
+    av1_loop_filter_dealloc(&cpi->lf_row_sync);
+    av1_loop_restoration_dealloc(&cpi->lr_row_sync, cpi->num_workers);
+  }
+
+  dealloc_compressor_data(cpi);
+
+  for (i = 0; i < sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0]);
+       ++i) {
+    aom_free(cpi->mbgraph_stats[i].mb_stats);
+  }
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    aom_free(cpi->twopass.frame_mb_stats_buf);
+    cpi->twopass.frame_mb_stats_buf = NULL;
+  }
+#endif
+#if CONFIG_INTERNAL_STATS
+  aom_free(cpi->ssim_vars);
+  cpi->ssim_vars = NULL;
+#endif  // CONFIG_INTERNAL_STATS
+
+  av1_remove_common(cm);
+  for (i = 0; i < FRAME_BUFFERS; ++i) {
+    av1_hash_table_destroy(&cm->buffer_pool->frame_bufs[i].hash_table);
+  }
+  if (cpi->sf.use_hash_based_trellis) hbt_destroy();
+  av1_free_ref_frame_buffers(cm->buffer_pool);
+  // cm points into *cpi, so cpi is released only after the last use of cm.
+  aom_free(cpi);
+
+#ifdef OUTPUT_YUV_SKINMAP
+  fclose(yuv_skinmap_file);
+#endif
+#ifdef OUTPUT_YUV_REC
+  fclose(yuv_rec_file);
+#endif
+}
+
+// Computes PSNR statistics between cpi->source and the frame to be shown and
+// appends them to the encoder's output packet list as an AOM_CODEC_PSNR_PKT.
+static void generate_psnr_packet(AV1_COMP *cpi) {
+  PSNR_STATS stats;
+  struct aom_codec_cx_pkt psnr_pkt;
+
+  aom_calc_highbd_psnr(cpi->source, cpi->common.frame_to_show, &stats,
+                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+
+  psnr_pkt.kind = AOM_CODEC_PSNR_PKT;
+  // Four entries of each statistic are carried over into the packet.
+  for (int k = 0; k < 4; ++k) {
+    psnr_pkt.data.psnr.psnr[k] = stats.psnr[k];
+    psnr_pkt.data.psnr.sse[k] = stats.sse[k];
+    psnr_pkt.data.psnr.samples[k] = stats.samples[k];
+  }
+  aom_codec_pkt_list_add(cpi->output_pkt_list, &psnr_pkt);
+}
+
+// Stores the externally supplied reference-frame flags in
+// cpi->ext_ref_frame_flags. Returns -1, leaving the stored value untouched,
+// when ref_frame_flags exceeds the mask with all INTER_REFS_PER_FRAME low
+// bits set; returns 0 otherwise.
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags) {
+  const int all_refs_mask = (1 << INTER_REFS_PER_FRAME) - 1;
+
+  if (ref_frame_flags <= all_refs_mask) {
+    cpi->ext_ref_frame_flags = ref_frame_flags;
+    return 0;
+  }
+  return -1;
+}
+
+// Expands the AOM_*_FLAG bits of ref_frame_upd_flags into the individual
+// cpi->ext_refresh_* flags (each stored as 0 or 1) and marks the external
+// refresh request as pending.
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_upd_flags) {
+  const int upd = ref_frame_upd_flags;
+
+  cpi->ext_refresh_last_frame = (upd & AOM_LAST_FLAG) ? 1 : 0;
+  cpi->ext_refresh_golden_frame = (upd & AOM_GOLD_FLAG) ? 1 : 0;
+  cpi->ext_refresh_alt_ref_frame = (upd & AOM_ALT_FLAG) ? 1 : 0;
+  cpi->ext_refresh_bwd_ref_frame = (upd & AOM_BWD_FLAG) ? 1 : 0;
+  cpi->ext_refresh_alt2_ref_frame = (upd & AOM_ALT2_FLAG) ? 1 : 0;
+  cpi->ext_refresh_frame_flags_pending = 1;
+}
+
+// Copies the reference frame selected by idx into sd.
+// Returns 0 on success, or -1 when get_ref_frame() yields no buffer for idx.
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame(cm, idx);
+
+  if (ref_buf == NULL) return -1;
+
+  aom_yv12_copy_frame(ref_buf, sd, num_planes);
+  return 0;
+}
+
+// Overwrites the reference frame selected by idx with the contents of sd.
+// Returns 0 on success, or -1 when get_ref_frame() yields no buffer for idx.
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  YV12_BUFFER_CONFIG *const ref_buf = get_ref_frame(cm, idx);
+
+  if (ref_buf == NULL) return -1;
+
+  aom_yv12_copy_frame(sd, ref_buf, num_planes);
+  return 0;
+}
+
+// Records an external request for the frame-context refresh setting: stores
+// `update` and flags the request as pending. Always returns 0.
+int av1_update_entropy(AV1_COMP *cpi, int update) {
+  cpi->ext_refresh_frame_context_pending = 1;
+  cpi->ext_refresh_frame_context = update;
+  return 0;
+}
+
+#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP)
+// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it
+// as YUV 420. We simply use the top-left pixels of the UV buffers, since we do
+// not denoise the UV channels at this time. If ever we implement UV channel
+// denoising we will have to modify this.
+//
+// Writes y_height rows of y_width bytes from the Y plane to f, followed by
+// uv_height rows of uv_width bytes from the U plane and then the V plane.
+// A plane whose height is not positive contributes no rows.
+void aom_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
+  const uint8_t *src = s->y_buffer;
+
+  // Counted loops are used instead of do { } while (--h) so that a zero
+  // height writes nothing rather than running past the end of the plane.
+  for (int row = 0; row < s->y_height; ++row) {
+    fwrite(src, s->y_width, 1, f);
+    src += s->y_stride;
+  }
+
+  src = s->u_buffer;
+  for (int row = 0; row < s->uv_height; ++row) {
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
+  }
+
+  src = s->v_buffer;
+  for (int row = 0; row < s->uv_height; ++row) {
+    fwrite(src, s->uv_width, 1, f);
+    src += s->uv_stride;
+  }
+}
+#endif
+
+static void check_show_existing_frame(AV1_COMP *cpi) { /*
+ * Updates cm->show_existing_frame from the GF group state and, when the flag
+ * ends up set, selects the reference buffer to show by writing
+ * cpi->existing_fb_idx_to_show. cpi->rc.is_src_frame_ext_arf is cleared on
+ * every path before returning. */
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ AV1_COMMON *const cm = &cpi->common;
+ const FRAME_UPDATE_TYPE next_frame_update_type =
+ gf_group->update_type[gf_group->index];
+#if USE_SYMM_MULTI_LAYER
+ const int which_arf = (cpi->new_bwdref_update_rule == 1) // 0/1 under new rule
+ ? gf_group->arf_update_idx[gf_group->index] > 0
+ : gf_group->arf_update_idx[gf_group->index];
+#else
+ const int which_arf = gf_group->arf_update_idx[gf_group->index];
+#endif
+
+ if (cm->show_existing_frame == 1) { // a flag that is already set is cleared
+ cm->show_existing_frame = 0;
+ } else if (cpi->rc.is_last_bipred_frame) {
+#if USE_SYMM_MULTI_LAYER
+ // NOTE: When new structure is used, every bwdref will have one overlay
+ // frame. Therefore, there is no need to find out which frame to
+ // show in advance.
+ if (cpi->new_bwdref_update_rule == 0) {
+#endif
+ // NOTE: If the current frame is a last bi-predictive frame, it is
+ // needed next to show the BWDREF_FRAME, which is pointed by
+ // the last_fb_idxes[0] after reference frame buffer update
+ cpi->rc.is_last_bipred_frame = 0;
+ cm->show_existing_frame = 1;
+ cpi->existing_fb_idx_to_show = cpi->ref_fb_idx[0];
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ } else if (cpi->is_arf_filter_off[which_arf] &&
+ (next_frame_update_type == OVERLAY_UPDATE ||
+ next_frame_update_type == INTNL_OVERLAY_UPDATE)) {
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
+ // Other parameters related to OVERLAY_UPDATE will be taken care of
+ // in av1_rc_get_second_pass_params(cpi)
+ cm->show_existing_frame = 1;
+ cpi->rc.is_src_frame_alt_ref = 1;
+ cpi->existing_fb_idx_to_show = (next_frame_update_type == OVERLAY_UPDATE)
+ ? cpi->ref_fb_idx[ALTREF_FRAME - 1]
+ : cpi->ref_fb_idx[bwdref_to_show - 1];
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0)
+#endif
+ cpi->is_arf_filter_off[which_arf] = 0; // guarded by the #if'd `if` above
+ }
+ cpi->rc.is_src_frame_ext_arf = 0;
+}
+
+#ifdef OUTPUT_YUV_REC
+// Writes `rows` rows of `width` samples, each `bytes_per_sample` bytes wide,
+// from `plane` to yuv_rec_file. `stride` is the distance between consecutive
+// rows in samples. Nothing is written when `rows` is not positive.
+static void write_rec_plane(const void *plane, int stride, int width,
+                            int bytes_per_sample, int rows) {
+  const uint8_t *row = (const uint8_t *)plane;
+  for (int r = 0; r < rows; ++r) {
+    fwrite(row, width, bytes_per_sample, yuv_rec_file);
+    row += stride * bytes_per_sample;
+  }
+}
+
+// Appends one reconstructed frame to yuv_rec_file and flushes the stream;
+// does nothing if the file is not open. The Y plane contributes cm->height
+// rows of s->y_width samples, the U and V planes s->uv_height rows of
+// s->uv_width samples each. Samples are written as 2 bytes when the buffer
+// has YV12_FLAG_HIGHBITDEPTH set and as 1 byte otherwise. A zero row count
+// now writes nothing; the former do { } while (--h) loops ran off the plane.
+void aom_write_one_yuv_frame(AV1_COMMON *cm, YV12_BUFFER_CONFIG *s) {
+  if (yuv_rec_file == NULL) return;
+
+  if (s->flags & YV12_FLAG_HIGHBITDEPTH) {
+    write_rec_plane(CONVERT_TO_SHORTPTR(s->y_buffer), s->y_stride, s->y_width,
+                    2, cm->height);
+    write_rec_plane(CONVERT_TO_SHORTPTR(s->u_buffer), s->uv_stride,
+                    s->uv_width, 2, s->uv_height);
+    write_rec_plane(CONVERT_TO_SHORTPTR(s->v_buffer), s->uv_stride,
+                    s->uv_width, 2, s->uv_height);
+  } else {
+    write_rec_plane(s->y_buffer, s->y_stride, s->y_width, 1, cm->height);
+    write_rec_plane(s->u_buffer, s->uv_stride, s->uv_width, 1, s->uv_height);
+    write_rec_plane(s->v_buffer, s->uv_stride, s->uv_width, 1, s->uv_height);
+  }
+
+  fflush(yuv_rec_file);
+}
+#endif // OUTPUT_YUV_REC
+
+#define GM_RECODE_LOOP_NUM4X4_FACTOR 192
+// For every reference from LAST_FRAME to ALTREF_FRAME whose global motion
+// model is not IDENTITY, resets the model to default_warp_params and zeroes
+// its parameter cost when global_motion_used * GM_RECODE_LOOP_NUM4X4_FACTOR is
+// below that cost. Returns 1 if any model was reset, otherwise 0.
+static int recode_loop_test_global_motion(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &cpi->td.rd_counts;
+  int needs_recode = 0;
+
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    if (cm->global_motion[ref].wmtype == IDENTITY) continue;
+    if (rdc->global_motion_used[ref] * GM_RECODE_LOOP_NUM4X4_FACTOR >=
+        cpi->gmparams_cost[ref]) {
+      continue;
+    }
+
+    cm->global_motion[ref] = default_warp_params;
+    assert(cm->global_motion[ref].wmtype == IDENTITY);
+    cpi->gmparams_cost[ref] = 0;
+    needs_recode = 1;
+    // TODO(sarahparker): The earlier condition for recoding here was:
+    // "recode |= (rdc->global_motion_used[i] > 0);". Can we bring something
+    // similar to that back to speed up global motion?
+  }
+  return needs_recode;
+}
+
+// Decides whether the frame should be coded again with a different q.
+// Returns 1 when recoding is permitted for this frame and either the projected
+// size lies outside [low_limit, high_limit] with q still adjustable in the
+// needed direction, or (AOM_CQ mode only) q is above the cq level while the
+// projected size is under 7/8 of the frame target. Returns 0 otherwise.
+static int recode_loop_test(AV1_COMP *cpi, int high_limit, int low_limit, int q,
+                            int maxq, int minq) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int is_kf_gf_arf = frame_is_kf_gf_arf(cpi);
+
+  const int recode_permitted =
+      (rc->projected_frame_size >= rc->max_frame_bandwidth) ||
+      (cpi->sf.recode_loop == ALLOW_RECODE) ||
+      (is_kf_gf_arf && (cpi->sf.recode_loop == ALLOW_RECODE_KFARFGF));
+  if (!recode_permitted) return 0;
+
+  // TODO(agrange) high_limit could be greater than the scale-down threshold.
+  const int overshoot = rc->projected_frame_size > high_limit && q < maxq;
+  const int undershoot = rc->projected_frame_size < low_limit && q > minq;
+  if (overshoot || undershoot) return 1;
+
+  if (oxcf->rc_mode != AOM_CQ) return 0;
+
+  // Deal with frame undershoot and whether or not we are
+  // below the automatically set cq level.
+  if (q > oxcf->cq_level &&
+      rc->projected_frame_size < ((rc->this_frame_target * 7) >> 3)) {
+    return 1;
+  }
+  return 0;
+}
+
+#define DUMP_REF_FRAME_IMAGES 0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+// Debug helper: writes ref_buf to file_name as raw planar data: cm->height
+// rows of cm->width bytes of Y, then (cm->height >> 1) rows of
+// (cm->width >> 1) bytes of U and of V. Returns AOM_CODEC_OK on success and
+// AOM_CODEC_MEM_ERROR when ref_buf is NULL or the file cannot be opened.
+static int dump_one_image(AV1_COMMON *cm,
+                          const YV12_BUFFER_CONFIG *const ref_buf,
+                          char *file_name) {
+  if (ref_buf == NULL) {
+    printf("Frame data buffer is NULL.\n");
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  FILE *const out = fopen(file_name, "wb");
+  if (out == NULL) {
+    printf("Unable to open file %s to write.\n", file_name);
+    return AOM_CODEC_MEM_ERROR;
+  }
+
+  const int luma_w = cm->width;
+  const int luma_h = cm->height;
+  const int chroma_w = cm->width >> 1;
+  const int chroma_h = cm->height >> 1;
+
+  // --- Y ---
+  for (int row = 0; row < luma_h; ++row) {
+    fwrite(ref_buf->y_buffer + row * ref_buf->y_stride, 1, luma_w, out);
+  }
+  // --- U ---
+  for (int row = 0; row < chroma_h; ++row) {
+    fwrite(ref_buf->u_buffer + row * ref_buf->uv_stride, 1, chroma_w, out);
+  }
+  // --- V ---
+  for (int row = 0; row < chroma_h; ++row) {
+    fwrite(ref_buf->v_buffer + row * ref_buf->uv_stride, 1, chroma_w, out);
+  }
+
+  fclose(out);
+
+  return AOM_CODEC_OK;
+}
+
+// Debug helper: dumps the buffer of every reference from LAST_FRAME to
+// ALTREF_FRAME to /tmp/enc_F<frame>_ref_<ref>.yuv via dump_one_image().
+static void dump_ref_frame_images(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref = LAST_FRAME;
+
+  while (ref <= ALTREF_FRAME) {
+    char path[256] = "";
+    snprintf(path, sizeof(path), "/tmp/enc_F%d_ref_%d.yuv",
+             cm->current_video_frame, ref);
+    dump_one_image(cm, get_ref_frame_buffer(cpi, ref), path);
+    ++ref;
+  }
+}
+#endif // DUMP_REF_FRAME_IMAGES == 1
+
+// Rotates the virtual buffer indices of the LAST reference group one step
+// towards the oldest slot, for use when LAST_FRAME is being replaced:
+//   LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+// Slot 0 is not written here; callers assign it afterwards.
+static INLINE void shift_last_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
+  const int move_filter_stats = !cpi->rc.is_src_frame_alt_ref;
+
+  for (int dst = LAST_REF_FRAMES - 1; dst > 0; --dst) {
+    const int src = dst - 1;
+    cpi->ref_fb_idx[dst] = cpi->ref_fb_idx[src];
+
+    // interp_filter_selected[0] belongs to the current coded frame, so the
+    // per-reference rows start at [LAST_FRAME], i.e. [1].
+    if (move_filter_stats) {
+      memcpy(cpi->interp_filter_selected[dst + LAST_FRAME],
+             cpi->interp_filter_selected[src + LAST_FRAME],
+             sizeof(cpi->interp_filter_selected[src + LAST_FRAME]));
+    }
+  }
+}
+
+#if USE_SYMM_MULTI_LAYER
+// Moves the virtual indices (and interp-filter statistics) of the backward
+// references one step away from BWDREF:
+//   BWD_REF -> ALT2_REF -> EXT_REF
+// which frees the BWD_REF position for the closest bwdref.
+static INLINE void rshift_bwd_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
+  static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+                                      EXTREF_FRAME - 1 };
+  int pos = 2;
+
+  while (pos > 0) {
+    const int dst = ordered_bwd[pos];
+    const int src = ordered_bwd[pos - 1];
+
+    // [0] is allocated to the current coded frame, i.e. bwdref
+    memcpy(cpi->interp_filter_selected[dst + LAST_FRAME],
+           cpi->interp_filter_selected[src + LAST_FRAME],
+           sizeof(cpi->interp_filter_selected[src + LAST_FRAME]));
+
+    cpi->ref_fb_idx[dst] = cpi->ref_fb_idx[src];
+    --pos;
+  }
+}
+
+// Moves the virtual indices (and interp-filter statistics) of the backward
+// references one step towards BWDREF:
+//   BWD_REF <- ALT2_REF <- EXT_REF
+// so that BWD_REF is updated for coding the next frame. The EXT_REF slot is
+// left unchanged by this function.
+static INLINE void lshift_bwd_ref_frames(AV1_COMP *cpi) {
+  // TODO(isbs): shift the scaled indices as well
+  static const int ordered_bwd[3] = { BWDREF_FRAME - 1, ALTREF2_FRAME - 1,
+                                      EXTREF_FRAME - 1 };
+  int pos = 0;
+
+  while (pos < 2) {
+    const int dst = ordered_bwd[pos];
+    const int src = ordered_bwd[pos + 1];
+
+    // [0] is allocated to the current coded frame, i.e. bwdref
+    memcpy(cpi->interp_filter_selected[dst + LAST_FRAME],
+           cpi->interp_filter_selected[src + LAST_FRAME],
+           sizeof(cpi->interp_filter_selected[src + LAST_FRAME]));
+
+    cpi->ref_fb_idx[dst] = cpi->ref_fb_idx[src];
+    ++pos;
+  }
+}
+#endif // USE_SYMM_MULTI_LAYER
+
+static void update_reference_frames(AV1_COMP *cpi) { /*
+ * Applies the reference-buffer updates implied by the frame just coded:
+ * ref_cnt_fb() points the refreshed cm->ref_frame_map[] slots at
+ * cm->new_fb_idx, and the virtual indices in cpi->ref_fb_idx[] are rotated or
+ * swapped to match. The rows of cpi->interp_filter_selected[] follow the same
+ * moves; row [0] holds the statistics of the current coded frame. */
+ AV1_COMMON *const cm = &cpi->common;
+
+ // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+ // for the purpose to verify no mismatch between encoder and decoder.
+ if (cm->show_frame) cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
+ // In the case of show_existing frame, we will not send fresh flag
+ // to decoder. Any change in the reference frame buffer can be done by
+ // switching the virtual indices.
+ if (cm->show_existing_frame) {
+ cpi->refresh_last_frame = 0;
+ cpi->refresh_golden_frame = 0;
+ cpi->refresh_bwd_ref_frame = 0;
+ cpi->refresh_alt2_ref_frame = 0;
+ cpi->refresh_alt_ref_frame = 0;
+
+ cpi->rc.is_bwd_ref_frame = 0;
+ cpi->rc.is_last_bipred_frame = 0;
+ cpi->rc.is_bipred_frame = 0;
+ }
+
+ BufferPool *const pool = cm->buffer_pool;
+
+ // At this point the new frame has been encoded.
+ // If any buffer copy / swapping is signaled it should be done here.
+
+ // Only update all of the reference buffers if a KEY_FRAME is also a
+ // show_frame. This ensures a fwd keyframe does not update all of the buffers
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
+ for (int ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ref_frame]],
+ cm->new_fb_idx);
+ }
+ return; // every slot now maps to the new frame; nothing else to shuffle
+ }
+
+ if (av1_preserve_existing_gf(cpi)) {
+ // We have decided to preserve the previously existing golden frame as our
+ // new ARF frame. However, in the short term in function
+ // av1_bitstream.c::get_refresh_mask() we left it in the GF slot and, if
+ // we're updating the GF with the current decoded frame, we save it to the
+ // ARF slot instead.
+ // We now have to update the ARF with the current frame and swap gld_fb_idx
+ // and alt_fb_idx so that, overall, we've stored the old GF in the new ARF
+ // slot and, if we're updating the GF, the current frame becomes the new GF.
+ int tmp;
+
+ // ARF in general is a better reference than overlay. We should keep ARF as
+ // reference instead of replacing it with overlay.
+
+ if (!cpi->preserve_arf_as_gld) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF_FRAME - 1]],
+ cm->new_fb_idx);
+ }
+
+ tmp = cpi->ref_fb_idx[ALTREF_FRAME - 1]; // swap the ALTREF and GOLDEN slots
+ cpi->ref_fb_idx[ALTREF_FRAME - 1] = cpi->ref_fb_idx[GOLDEN_FRAME - 1];
+ cpi->ref_fb_idx[GOLDEN_FRAME - 1] = tmp;
+
+ // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+ // cpi->interp_filter_selected[GOLDEN_FRAME]?
+ } else if (cpi->rc.is_src_frame_ext_arf && cm->show_existing_frame) {
+#if CONFIG_DEBUG
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ assert(gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE);
+#endif
+#if USE_SYMM_MULTI_LAYER
+ const int bwdref_to_show =
+ (cpi->new_bwdref_update_rule == 1) ? BWDREF_FRAME : ALTREF2_FRAME;
+#else
+ const int bwdref_to_show = ALTREF2_FRAME;
+#endif
+ // Deal with the special case for showing existing internal ALTREF_FRAME
+ // Refresh the LAST_FRAME with the ALTREF_FRAME and retire the LAST3_FRAME
+ // by updating the virtual indices.
+ const int tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+ shift_last_ref_frames(cpi);
+
+ cpi->ref_fb_idx[LAST_FRAME - 1] = cpi->ref_fb_idx[bwdref_to_show - 1];
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[bwdref_to_show],
+ sizeof(cpi->interp_filter_selected[bwdref_to_show]));
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 1) {
+ lshift_bwd_ref_frames(cpi);
+ // pass outdated forward reference frame (previous LAST3) to the
+ // spared space
+ cpi->ref_fb_idx[EXTREF_FRAME - 1] = tmp;
+ } else {
+#endif
+ cpi->ref_fb_idx[bwdref_to_show - 1] = tmp;
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ } else { /* For non key/golden frames */
+ // === ALTREF_FRAME ===
+ if (cpi->refresh_alt_ref_frame) {
+ int arf_idx = cpi->ref_fb_idx[ALTREF_FRAME - 1];
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === GOLDEN_FRAME ===
+ if (cpi->refresh_golden_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+ cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === BWDREF_FRAME ===
+ if (cpi->refresh_bwd_ref_frame) {
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule) {
+ // We shift the backward reference frame as follows:
+ // BWDREF -> ALTREF2 -> EXTREF
+ // and assign the newly coded frame to BWDREF so that it always
+ // keeps the nearest future frame
+ int tmp = cpi->ref_fb_idx[EXTREF_FRAME - 1];
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[tmp], cm->new_fb_idx);
+
+ rshift_bwd_ref_frames(cpi);
+ cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+ } else {
+#endif // USE_SYMM_MULTI_LAYER
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[BWDREF_FRAME - 1]],
+ cm->new_fb_idx);
+#if USE_SYMM_MULTI_LAYER
+ }
+#endif
+ memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+
+ // === ALTREF2_FRAME ===
+ if (cpi->refresh_alt2_ref_frame) {
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]],
+ cm->new_fb_idx);
+
+ memcpy(cpi->interp_filter_selected[ALTREF2_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+ }
+ }
+
+ if (cpi->refresh_last_frame) {
+ // NOTE(zoeliu): We have two layers of mapping (1) from the per-frame
+ // reference to the reference frame buffer virtual index; and then (2) from
+ // the virtual index to the reference frame buffer physical index:
+ //
+ // LAST_FRAME, ..., LAST3_FRAME, ..., ALTREF_FRAME
+ // | | |
+ // v v v
+ // ref_fb_idx[0], ..., ref_fb_idx[2], ..., ref_fb_idx[ALTREF_FRAME-1]
+ // | | |
+ // v v v
+ // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
+ //
+ // When refresh_last_frame is set, it is intended to retire LAST3_FRAME,
+ // have the other 2 LAST reference frames shifted as follows:
+ // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME
+ // , and then have LAST_FRAME refreshed by the newly coded frame.
+ //
+ // To fulfill it, the decoder will be notified to execute following 2 steps:
+ //
+ // (a) To change ref_frame_map[] and have the virtual index of LAST3_FRAME
+ // to point to the newly coded frame, i.e.
+ // ref_frame_map[ref_fb_idx[2]] => new_fb_idx;
+ //
+ // (b) To change the 1st layer mapping to have LAST_FRAME mapped to the
+ // original virtual index of LAST3_FRAME and have the other mappings
+ // shifted as follows:
+ // LAST_FRAME, LAST2_FRAME, LAST3_FRAME
+ // | | |
+ // v v v
+ // ref_fb_idx[2], ref_fb_idx[0], ref_fb_idx[1]
+ int tmp;
+
+ ref_cnt_fb(pool->frame_bufs,
+ &cm->ref_frame_map[cpi->ref_fb_idx[LAST_REF_FRAMES - 1]],
+ cm->new_fb_idx);
+
+ tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->ref_fb_idx[0] = tmp;
+
+ assert(cm->show_existing_frame == 0);
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[0],
+ sizeof(cpi->interp_filter_selected[0]));
+
+ // If the new structure is used, we will always have overlay frames coupled
+ // with bwdref frames. Therefore, we won't have to perform this update
+ // in advance (we do this update when the overlay frame shows up).
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && cpi->rc.is_last_bipred_frame) {
+#else
+ if (cpi->rc.is_last_bipred_frame) {
+#endif
+ // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the
+ // LAST3_FRAME by updating the virtual indices.
+ //
+ // NOTE: The source frame for BWDREF does not have a holding position as
+ // the OVERLAY frame for ALTREF's. Hence, to resolve the reference
+ // virtual index reshuffling for BWDREF, the encoder always
+ // specifies a LAST_BIPRED right before BWDREF and completes the
+ // reshuffling job accordingly.
+ tmp = cpi->ref_fb_idx[LAST_REF_FRAMES - 1];
+
+ shift_last_ref_frames(cpi);
+ cpi->ref_fb_idx[0] = cpi->ref_fb_idx[BWDREF_FRAME - 1];
+ cpi->ref_fb_idx[BWDREF_FRAME - 1] = tmp;
+
+ memcpy(cpi->interp_filter_selected[LAST_FRAME],
+ cpi->interp_filter_selected[BWDREF_FRAME],
+ sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+ }
+ }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+ // Dump out all reference frame images.
+ dump_ref_frame_images(cpi);
+#endif // DUMP_REF_FRAME_IMAGES
+}
+
+// Ensures frame buffer `buffer_idx` of the pool has its MV buffer (via
+// ensure_mv_buffer()) and stamps it with the current frame dimensions.
+// buffer_idx must not be INVALID_IDX.
+static INLINE void alloc_frame_mvs(AV1_COMMON *const cm, int buffer_idx) {
+  assert(buffer_idx != INVALID_IDX);
+  RefCntBuffer *const fb = cm->buffer_pool->frame_bufs + buffer_idx;
+
+  ensure_mv_buffer(fb, cm);
+  fb->height = cm->height;
+  fb->width = cm->width;
+}
+
+static void scale_references(AV1_COMP *cpi) { /*
+ * For each inter reference enabled in cpi->ref_frame_flags, records in
+ * cpi->scaled_ref_idx[] a buffer whose crop size matches the current frame:
+ * the reference's own buffer (ref_count incremented) when the sizes already
+ * match, otherwise a pool buffer that receives a rescaled copy. Returns
+ * early, leaving later references unprocessed, if no free buffer is found. */
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MV_REFERENCE_FRAME ref_frame;
+ const AOM_REFFRAME ref_mask[INTER_REFS_PER_FRAME] = {
+ AOM_LAST_FLAG, AOM_LAST2_FLAG, AOM_LAST3_FLAG, AOM_GOLD_FLAG,
+ AOM_BWD_FLAG, AOM_ALT2_FLAG, AOM_ALT_FLAG
+ };
+
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ // Need to convert from AOM_REFFRAME to index into ref_mask (subtract 1).
+ if (cpi->ref_frame_flags & ref_mask[ref_frame - 1]) {
+ BufferPool *const pool = cm->buffer_pool;
+ const YV12_BUFFER_CONFIG *const ref =
+ get_ref_frame_buffer(cpi, ref_frame);
+
+ if (ref == NULL) {
+ cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ continue;
+ }
+
+ if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
+ RefCntBuffer *new_fb_ptr = NULL;
+ int force_scaling = 0;
+ int new_fb = cpi->scaled_ref_idx[ref_frame - 1]; // reuse a recorded buffer
+ if (new_fb == INVALID_IDX) {
+ new_fb = get_free_fb(cm);
+ force_scaling = 1; // a newly obtained buffer is always rescaled into
+ }
+ if (new_fb == INVALID_IDX) return;
+ new_fb_ptr = &pool->frame_bufs[new_fb];
+ if (force_scaling || new_fb_ptr->buf.y_crop_width != cm->width ||
+ new_fb_ptr->buf.y_crop_height != cm->height) {
+ if (aom_realloc_frame_buffer(
+ &new_fb_ptr->buf, cm->width, cm->height,
+ cm->seq_params.subsampling_x, cm->seq_params.subsampling_y,
+ cm->seq_params.use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+ av1_resize_and_extend_frame(
+ ref, &new_fb_ptr->buf, (int)cm->seq_params.bit_depth, num_planes);
+ cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
+ alloc_frame_mvs(cm, new_fb);
+ }
+ } else {
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+ RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
+ buf->buf.y_crop_width = ref->y_crop_width;
+ buf->buf.y_crop_height = ref->y_crop_height;
+ cpi->scaled_ref_idx[ref_frame - 1] = buf_idx;
+ ++buf->ref_count; // undone in release_scaled_references()
+ }
+ } else {
+ if (cpi->oxcf.pass != 0) cpi->scaled_ref_idx[ref_frame - 1] = INVALID_IDX;
+ }
+ }
+}
+
+// Decrements the ref_count of every pool buffer recorded in
+// cpi->scaled_ref_idx[] and resets each such slot to INVALID_IDX.
+static void release_scaled_references(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  // TODO(isbs): only refresh the necessary frames, rather than all of them
+  for (int slot = 0; slot < REF_FRAMES; ++slot) {
+    const int fb_idx = cpi->scaled_ref_idx[slot];
+    if (fb_idx == INVALID_IDX) continue;  // Nothing held in this slot.
+
+    RefCntBuffer *const fb = &cm->buffer_pool->frame_bufs[fb_idx];
+    --fb->ref_count;
+    cpi->scaled_ref_idx[slot] = INVALID_IDX;
+  }
+}
+
+// Selects cpi->mv_step_param for the current frame and maintains
+// cpi->max_mv_magnitude when the auto_mv_step_size speed feature is on.
+static void set_mv_search_params(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const unsigned int size_limit = AOMMIN(cm->width, cm->height);
+
+  // Resolution-based default.
+  cpi->mv_step_param = av1_init_search_range(size_limit);
+
+  if (!cpi->sf.mv.auto_mv_step_size) return;
+
+  if (frame_is_intra_only(cm)) {
+    // Seed max_mv_magnitude for the first INTER frame that follows a
+    // key/intra-only frame.
+    cpi->max_mv_magnitude = size_limit;
+    return;
+  }
+
+  if (cm->show_frame) {
+    // Let the step size track twice the largest mv magnitude seen in the
+    // previous frame, but never exceed the resolution-based default.
+    cpi->mv_step_param =
+        av1_init_search_range(AOMMIN(size_limit, 2 * cpi->max_mv_magnitude));
+  }
+  cpi->max_mv_magnitude = 0;
+}
+
+// Resets per-frame encoder state that does not depend on the frame size:
+// global motion parameters, framesize-independent speed features, RD speed
+// thresholds, and the default interpolation filter / motion mode signalling.
+static void set_size_independent_vars(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
+    cm->global_motion[ref] = default_warp_params;
+  cpi->global_motion_search_done = 0;
+
+  av1_set_speed_features_framesize_independent(cpi);
+  av1_set_rd_speed_thresholds(cpi);
+  av1_set_rd_speed_thresholds_sub8x8(cpi);
+
+  cm->interp_filter = SWITCHABLE;
+  cm->switchable_motion_mode = 1;
+}
+
+// Sets up state that depends on the frame dimensions and picks the frame q.
+//
+// On return *q holds the q chosen by av1_rc_pick_q_and_bounds(), which also
+// fills *bottom_index and *top_index.
+static void set_size_dependent_vars(AV1_COMP *cpi, int *q, int *bottom_index,
+                                    int *top_index) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Speed features that depend on the dimensions of the frame.
+  av1_set_speed_features_framesize_dependent(cpi);
+
+  // Decide q and q bounds.
+  const int picked_q = av1_rc_pick_q_and_bounds(cpi, cm->width, cm->height,
+                                                bottom_index, top_index);
+  *q = picked_q;
+
+  if (!frame_is_intra_only(cm)) {
+    const int below_hp_thresh = picked_q < HIGH_PRECISION_MV_QTHRESH;
+    set_high_precision_mv(cpi, below_hp_thresh,
+                          cm->cur_frame_force_integer_mv);
+  }
+
+  // Experimental segmentation for enhanced coding of static regions. Only
+  // allowed in the second pass of a two pass encode, as it requires lagged
+  // coding, and only if the relevant speed feature flag is set.
+  if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation)
+    configure_static_seg_features(cpi);
+}
+
+// Initialises cpi->ss_cfg for the configured motion search method, using the
+// luma stride of cpi->scaled_source. Search methods other than NSTEP and
+// DIAMOND leave cpi->ss_cfg untouched.
+static void init_motion_estimation(AV1_COMP *cpi) {
+  const int luma_stride = cpi->scaled_source.y_stride;
+
+  switch (cpi->sf.mv.search_method) {
+    case NSTEP: av1_init3smotion_compensation(&cpi->ss_cfg, luma_stride); break;
+    case DIAMOND:
+      av1_init_dsmotion_compensation(&cpi->ss_cfg, luma_stride);
+      break;
+    default: break;
+  }
+}
+
+#define COUPLED_CHROMA_FROM_LUMA_RESTORATION 0
+// Fills rst[0..2].restoration_unit_size.
+//
+// Plane 0 gets RESTORATION_UNITSIZE_MAX when width * height exceeds
+// 352 * 288, otherwise half of that. Plane 1 is plane 0 shifted right by
+// min(sx, sy) when COUPLED_CHROMA_FROM_LUMA_RESTORATION is enabled (by 0
+// otherwise), and plane 2 copies plane 1.
+static void set_restoration_unit_size(int width, int height, int sx, int sy,
+                                      RestorationInfo *rst) {
+  // sx/sy are only consumed in the coupled configuration.
+  (void)sx;
+  (void)sy;
+#if COUPLED_CHROMA_FROM_LUMA_RESTORATION
+  const int chroma_shift = AOMMIN(sx, sy);
+#else
+  const int chroma_shift = 0;
+#endif  // COUPLED_CHROMA_FROM_LUMA_RESTORATION
+
+  const int above_cif = width * height > 352 * 288;
+  rst[0].restoration_unit_size =
+      above_cif ? RESTORATION_UNITSIZE_MAX : (RESTORATION_UNITSIZE_MAX >> 1);
+  rst[1].restoration_unit_size = rst[0].restoration_unit_size >> chroma_shift;
+  rst[2].restoration_unit_size = rst[1].restoration_unit_size;
+}
+
+// Resets reference-frame bookkeeping: invalidates cm->new_fb_idx and every
+// cm->ref_frame_map[] slot, clears the ref_count of every buffer in the pool
+// and, when screen content tools are forced on, initialises each buffer's
+// hash table.
+static void init_ref_frame_bufs(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+
+  cm->new_fb_idx = INVALID_IDX;
+  for (int i = 0; i < REF_FRAMES; ++i) cm->ref_frame_map[i] = INVALID_IDX;
+
+  // pool->frame_bufs[] is walked over FRAME_BUFFERS entries below, so clear
+  // the ref_count of all of them rather than only the first REF_FRAMES;
+  // otherwise a stale count could keep a buffer from ever being reused.
+  for (int i = 0; i < FRAME_BUFFERS; ++i) pool->frame_bufs[i].ref_count = 0;
+
+  if (cm->seq_params.force_screen_content_tools) {
+    for (int i = 0; i < FRAME_BUFFERS; ++i) {
+      av1_hash_table_init(&pool->frame_bufs[i].hash_table, &cpi->td.mb);
+    }
+  }
+}
+
+// (Re)initialises the encoder's frame buffers the first time it is called
+// (cpi->initial_width == 0) or whenever the bit-depth flag or chroma
+// subsampling differs from what is stored in the sequence header. In that
+// case the new format is stored and the current frame size is recorded as
+// the initial size; otherwise the call does nothing.
+static void check_initial_width(AV1_COMP *cpi, int use_highbitdepth,
+                                int subsampling_x, int subsampling_y) {
+  AV1_COMMON *const cm = &cpi->common;
+  SequenceHeader *const seq = &cm->seq_params;
+
+  const int same_format = seq->use_highbitdepth == use_highbitdepth &&
+                          seq->subsampling_x == subsampling_x &&
+                          seq->subsampling_y == subsampling_y;
+  if (cpi->initial_width && same_format) return;
+
+  seq->subsampling_x = subsampling_x;
+  seq->subsampling_y = subsampling_y;
+  seq->use_highbitdepth = use_highbitdepth;
+
+  alloc_raw_frame_buffers(cpi);
+  init_ref_frame_bufs(cpi);
+  alloc_util_frame_buffers(cpi);
+
+  init_motion_estimation(cpi);  // TODO(agrange) This can be removed.
+
+  cpi->initial_width = cm->width;
+  cpi->initial_height = cm->height;
+  cpi->initial_mbs = cm->MBs;
+}
+
+// Sets the coded frame size to width x height.
+//
+// Returns 1 (leaving cm->width/height unchanged) if width or height is <= 0,
+// and 0 otherwise. check_initial_width() runs before the validation. When
+// the new size exceeds a previously recorded initial size in either
+// dimension, the context buffers, pc tree, compressor data and segmentation
+// maps are reallocated and the recorded initial size is cleared.
+static int set_size_literal(AV1_COMP *cpi, int width, int height) {
+  AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const SequenceHeader *const seq = &cm->seq_params;
+  check_initial_width(cpi, seq->use_highbitdepth, seq->subsampling_x,
+                      seq->subsampling_y);
+
+  if (width <= 0 || height <= 0) return 1;
+
+  cm->width = width;
+  cm->height = height;
+
+  const int has_initial_size = cpi->initial_width && cpi->initial_height;
+  if (has_initial_size &&
+      (cm->width > cpi->initial_width || cm->height > cpi->initial_height)) {
+    av1_free_context_buffers(cm);
+    av1_free_pc_tree(&cpi->td, num_planes);
+    alloc_compressor_data(cpi);
+    realloc_segmentation_maps(cpi);
+    cpi->initial_height = 0;
+    cpi->initial_width = 0;
+  }
+  update_frame_size(cpi);
+
+  return 0;
+}
+
+// Prepares the encoder for coding a frame of width x height.
+//
+// Steps, in order:
+//  1. If the size differs from cm->width/height, apply it with
+//     set_size_literal(), refresh the mv search params and recompute
+//     cm->all_lossless.
+//  2. In pass 2, refresh the target rate for the (possibly new) size.
+//  3. (Re)allocate the mv buffer of the new frame buffer, the above-context
+//     buffers if they are too small, and the new frame buffer itself.
+//  4. Choose restoration unit sizes from the superres-upscaled dimensions and
+//     reset every plane's restoration type to RESTORE_NONE.
+//  5. Rebuild cm->frame_refs[] (buffer index, buffer pointer, scale factors)
+//     for LAST_FRAME..ALTREF_FRAME and set up the identity scale factors.
+//
+// Allocation failures are reported via aom_internal_error() with
+// AOM_CODEC_MEM_ERROR.
+static void set_frame_size(AV1_COMP *cpi, int width, int height) {
+ AV1_COMMON *const cm = &cpi->common;
+ const SequenceHeader *const seq_params = &cm->seq_params;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+ int ref_frame;
+
+ if (width != cm->width || height != cm->height) {
+ // There has been a change in the encoded frame size
+ // NOTE(review): the return value of set_size_literal() (1 for a
+ // non-positive dimension) is ignored here.
+ set_size_literal(cpi, width, height);
+ set_mv_search_params(cpi);
+ // Recalculate 'all_lossless' in case super-resolution was (un)selected.
+ cm->all_lossless = cm->coded_lossless && !av1_superres_scaled(cm);
+ }
+
+ if (cpi->oxcf.pass == 2) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ }
+
+ alloc_frame_mvs(cm, cm->new_fb_idx);
+
+ // Allocate above context buffers
+ // (only when the current allocation is too small in planes, columns or
+ // tile rows).
+ if (cm->num_allocated_above_context_planes < av1_num_planes(cm) ||
+ cm->num_allocated_above_context_mi_col < cm->mi_cols ||
+ cm->num_allocated_above_contexts < cm->tile_rows) {
+ av1_free_above_context_buffers(cm, cm->num_allocated_above_contexts);
+ if (av1_alloc_above_context_buffers(cm, cm->tile_rows))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate context buffers");
+ }
+
+ // Reset the frame pointers to the current frame size.
+ if (aom_realloc_frame_buffer(
+ get_frame_new_buffer(cm), cm->width, cm->height,
+ seq_params->subsampling_x, seq_params->subsampling_y,
+ seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS,
+ cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to allocate frame buffer");
+
+ // Restoration unit sizes are derived from the superres-upscaled size, not
+ // from the coded size.
+ const int frame_width = cm->superres_upscaled_width;
+ const int frame_height = cm->superres_upscaled_height;
+ set_restoration_unit_size(frame_width, frame_height,
+ seq_params->subsampling_x,
+ seq_params->subsampling_y, cm->rst_info);
+ for (int i = 0; i < num_planes; ++i)
+ cm->rst_info[i].frame_restoration_type = RESTORE_NONE;
+
+ av1_alloc_restoration_buffers(cm);
+ alloc_util_frame_buffers(cpi); // TODO(afergs): Remove? Gets called anyways.
+ init_motion_estimation(cpi);
+
+ // Point each inter reference slot at its buffer and compute the scale
+ // factors from the reference size to the current frame size.
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
+ const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+ ref_buf->idx = buf_idx;
+
+ if (buf_idx != INVALID_IDX) {
+ YV12_BUFFER_CONFIG *const buf = &cm->buffer_pool->frame_bufs[buf_idx].buf;
+ ref_buf->buf = buf;
+ av1_setup_scale_factors_for_frame(&ref_buf->sf, buf->y_crop_width,
+ buf->y_crop_height, cm->width,
+ cm->height);
+ // Scaled references get their borders extended.
+ if (av1_is_scaled(&ref_buf->sf))
+ aom_extend_frame_borders(buf, num_planes);
+ } else {
+ ref_buf->buf = NULL;
+ }
+ }
+
+ // Identity scale factors: current size to current size.
+ av1_setup_scale_factors_for_frame(&cm->sf_identity, cm->width, cm->height,
+ cm->width, cm->height);
+
+ set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
+}
+
+// Returns the resize scale denominator for the next frame.
+//
+// SCALE_NUMERATOR is returned in pass 1, for reduced still picture headers
+// and for RESIZE_NONE. RESIZE_FIXED uses the configured key-frame or
+// regular denominator; RESIZE_RANDOM draws from a function-local static
+// seed via lcg_rand16().
+static uint8_t calculate_next_resize_scale(const AV1_COMP *cpi) {
+  // Choose an arbitrary random number
+  static unsigned int seed = 56789;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+  if (oxcf->pass == 1 || cpi->common.seq_params.reduced_still_picture_hdr)
+    return SCALE_NUMERATOR;
+
+  switch (oxcf->resize_mode) {
+    case RESIZE_NONE: return SCALE_NUMERATOR;
+    case RESIZE_FIXED:
+      return (cpi->common.frame_type == KEY_FRAME)
+                 ? oxcf->resize_kf_scale_denominator
+                 : oxcf->resize_scale_denominator;
+    case RESIZE_RANDOM: return lcg_rand16(&seed) % 9 + 8;
+    default: assert(0); return SCALE_NUMERATOR;
+  }
+}
+
+// Returns the superres scale denominator for the next frame, chosen by
+// oxcf->superres_mode. SCALE_NUMERATOR is returned in pass 1 and for
+// SUPERRES_NONE.
+//
+// Side effects: SUPERRES_RANDOM advances a function-local static seed;
+// SUPERRES_QTHRESH calls av1_set_target_rate() and
+// av1_rc_pick_q_and_bounds() with the configured (oxcf) frame dimensions.
+static uint8_t calculate_next_superres_scale(AV1_COMP *cpi) {
+ // Choose an arbitrary random number
+ static unsigned int seed = 34567;
+ const AV1EncoderConfig *oxcf = &cpi->oxcf;
+ if (oxcf->pass == 1) return SCALE_NUMERATOR;
+ uint8_t new_denom = SCALE_NUMERATOR;
+
+ // Make sure that superres mode of the frame is consistent with the
+ // sequence-level flag.
+ assert(IMPLIES(oxcf->superres_mode != SUPERRES_NONE,
+ cpi->common.seq_params.enable_superres));
+ assert(IMPLIES(!cpi->common.seq_params.enable_superres,
+ oxcf->superres_mode == SUPERRES_NONE));
+
+ switch (oxcf->superres_mode) {
+ case SUPERRES_NONE: new_denom = SCALE_NUMERATOR; break;
+ case SUPERRES_FIXED:
+ if (cpi->common.frame_type == KEY_FRAME)
+ new_denom = oxcf->superres_kf_scale_denominator;
+ else
+ new_denom = oxcf->superres_scale_denominator;
+ break;
+ case SUPERRES_RANDOM: new_denom = lcg_rand16(&seed) % 9 + 8; break;
+ case SUPERRES_QTHRESH: {
+ // Frames whose rate factor delta is above 1.0 use the key-frame q
+ // threshold; all others use the regular threshold.
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const RATE_FACTOR_LEVEL rf_level = gf_group->rf_level[gf_group->index];
+ const double rate_factor_delta = rate_factor_deltas[rf_level];
+ const int qthresh = (rate_factor_delta <= 1.0)
+ ? oxcf->superres_qthresh
+ : oxcf->superres_kf_qthresh;
+ // Estimate the q this frame would get at the full configured size.
+ av1_set_target_rate(cpi, cpi->oxcf.width, cpi->oxcf.height);
+ int bottom_index, top_index;
+ const int q = av1_rc_pick_q_and_bounds(
+ cpi, cpi->oxcf.width, cpi->oxcf.height, &bottom_index, &top_index);
+ if (q < qthresh) {
+ // Below the threshold: no superres.
+ new_denom = SCALE_NUMERATOR;
+ } else {
+ // At or above the threshold: start at SCALE_NUMERATOR + 1 and add one
+ // per denom_step of q above qthresh, where denom_step is one eighth of
+ // (MAXQ - qthresh + 1); the result is capped at 2 * SCALE_NUMERATOR.
+ const uint8_t min_denom = SCALE_NUMERATOR + 1;
+ const uint8_t denom_step = (MAXQ - qthresh + 1) >> 3;
+
+ if (q == qthresh) {
+ new_denom = min_denom;
+ } else if (denom_step == 0) {
+ // Avoids dividing by zero below; use the largest denominator.
+ new_denom = SCALE_NUMERATOR << 1;
+ } else {
+ const uint8_t additional_denom = (q - qthresh) / denom_step;
+ new_denom =
+ AOMMIN(min_denom + additional_denom, SCALE_NUMERATOR << 1);
+ }
+ }
+ break;
+ }
+ default: assert(0);
+ }
+ return new_denom;
+}
+
+// Returns nonzero when
+//   resized_dim * SCALE_NUMERATOR >= orig_dim * denom / 2
+// (integer arithmetic), and zero otherwise.
+static int dimension_is_ok(int orig_dim, int resized_dim, int denom) {
+  const int resized_scaled = resized_dim * SCALE_NUMERATOR;
+  const int lower_bound = orig_dim * denom / 2;
+  return resized_scaled >= lower_bound;
+}
+
+// Checks the resize/superres combination in rsz against the original width
+// using dimension_is_ok(). The height is not examined.
+static int dimensions_are_ok(int owidth, int oheight, size_params_type *rsz) {
+  // Only need to check the width, as scaling is horizontal only.
+  (void)oheight;
+  const int resized_width = rsz->resize_width;
+  const int superres_denom = rsz->superres_denom;
+  return dimension_is_ok(owidth, resized_width, superres_denom);
+}
+
+// Ensures the resize/superres parameters in rsz satisfy dimensions_are_ok()
+// for an original frame of owidth x oheight, modifying rsz if needed.
+//
+// Only a scale whose mode is RANDOM may be altered:
+//  - superres RANDOM, resize not RANDOM: superres_denom is recomputed;
+//  - resize RANDOM, superres not RANDOM: the resize dimensions are recomputed;
+//  - both RANDOM: the larger denominator is decremented until the check
+//    passes or both denominators have reached SCALE_NUMERATOR;
+//  - neither RANDOM: nothing may be changed and 0 is returned.
+//
+// Returns nonzero if the (possibly adjusted) parameters pass the check.
+static int validate_size_scales(RESIZE_MODE resize_mode,
+ SUPERRES_MODE superres_mode, int owidth,
+ int oheight, size_params_type *rsz) {
+ if (dimensions_are_ok(owidth, oheight, rsz)) { // Nothing to do.
+ return 1;
+ }
+
+ // Calculate current resize scale.
+ // (the larger of the per-dimension denominators implied by the resized
+ // width and height)
+ int resize_denom =
+ AOMMAX(DIVIDE_AND_ROUND(owidth * SCALE_NUMERATOR, rsz->resize_width),
+ DIVIDE_AND_ROUND(oheight * SCALE_NUMERATOR, rsz->resize_height));
+
+ if (resize_mode != RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+ // Alter superres scale as needed to enforce conformity.
+ rsz->superres_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / resize_denom;
+ // If still failing, back off by one step (never below SCALE_NUMERATOR).
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (rsz->superres_denom > SCALE_NUMERATOR) --rsz->superres_denom;
+ }
+ } else if (resize_mode == RESIZE_RANDOM && superres_mode != SUPERRES_RANDOM) {
+ // Alter resize scale as needed to enforce conformity.
+ resize_denom =
+ (2 * SCALE_NUMERATOR * SCALE_NUMERATOR) / rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ // If still failing, back off by one step (never below SCALE_NUMERATOR)
+ // and recompute the resized dimensions from the original size.
+ if (!dimensions_are_ok(owidth, oheight, rsz)) {
+ if (resize_denom > SCALE_NUMERATOR) {
+ --resize_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ }
+ }
+ } else if (resize_mode == RESIZE_RANDOM && superres_mode == SUPERRES_RANDOM) {
+ // Alter both resize and superres scales as needed to enforce conformity.
+ // Each iteration lowers whichever denominator is currently larger
+ // (superres on ties) and recomputes the resized dimensions.
+ do {
+ if (resize_denom > rsz->superres_denom)
+ --resize_denom;
+ else
+ --rsz->superres_denom;
+ rsz->resize_width = owidth;
+ rsz->resize_height = oheight;
+ av1_calculate_scaled_size(&rsz->resize_width, &rsz->resize_height,
+ resize_denom);
+ } while (!dimensions_are_ok(owidth, oheight, rsz) &&
+ (resize_denom > SCALE_NUMERATOR ||
+ rsz->superres_denom > SCALE_NUMERATOR));
+ } else { // We are allowed to alter neither resize scale nor superres
+ // scale.
+ return 0;
+ }
+ return dimensions_are_ok(owidth, oheight, rsz);
+}
+
+// Calculates resize and superres params for next frame.
+//
+// In pass 1 the configured size with a SCALE_NUMERATOR superres denominator
+// is returned unchanged. Otherwise a pending resize request
+// (cpi->resize_pending_width/height, both nonzero) takes precedence and is
+// consumed; failing that, the configured size is scaled by the denominator
+// from calculate_next_resize_scale(). The superres denominator is then
+// chosen and the combination validated (assert on failure).
+size_params_type av1_calculate_next_size_params(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  size_params_type rsz = { oxcf->width, oxcf->height, SCALE_NUMERATOR };
+  if (oxcf->pass == 1) return rsz;
+
+  if (cpi->resize_pending_width && cpi->resize_pending_height) {
+    rsz.resize_width = cpi->resize_pending_width;
+    rsz.resize_height = cpi->resize_pending_height;
+    cpi->resize_pending_height = 0;
+    cpi->resize_pending_width = 0;
+  } else {
+    const int resize_denom = calculate_next_resize_scale(cpi);
+    rsz.resize_width = oxcf->width;
+    rsz.resize_height = oxcf->height;
+    av1_calculate_scaled_size(&rsz.resize_width, &rsz.resize_height,
+                              resize_denom);
+  }
+
+  rsz.superres_denom = calculate_next_superres_scale(cpi);
+
+  const int scales_valid =
+      validate_size_scales(oxcf->resize_mode, oxcf->superres_mode,
+                           oxcf->width, oxcf->height, &rsz);
+  if (!scales_valid) assert(0 && "Invalid scale parameters");
+  return rsz;
+}
+
+// Applies the size parameters in rsz: stores the resized dimensions as the
+// superres-upscaled size together with the superres denominator, derives the
+// coded size with av1_calculate_scaled_superres_size() and hands it to
+// set_frame_size().
+static void setup_frame_size_from_params(AV1_COMP *cpi, size_params_type *rsz) {
+  AV1_COMMON *cm = &cpi->common;
+
+  cm->superres_upscaled_width = rsz->resize_width;
+  cm->superres_upscaled_height = rsz->resize_height;
+  cm->superres_scale_denominator = rsz->superres_denom;
+
+  int coded_width = rsz->resize_width;
+  int coded_height = rsz->resize_height;
+  av1_calculate_scaled_superres_size(&coded_width, &coded_height,
+                                     rsz->superres_denom);
+  set_frame_size(cpi, coded_width, coded_height);
+}
+
+// Computes the size parameters for the next frame and applies them.
+static void setup_frame_size(AV1_COMP *cpi) {
+  size_params_type next_size = av1_calculate_next_size_params(cpi);
+  setup_frame_size_from_params(cpi, &next_size);
+}
+
+// Post-encode superres step. Does nothing unless the frame is superres
+// scaled. Otherwise the frame is upscaled with av1_superres_upscale() and
+// cpi->source is repointed at a source matching the upscaled size: the
+// unscaled source when no regular resize is active, or a freshly downscaled
+// copy in cpi->scaled_source when it is.
+static void superres_post_encode(AV1_COMP *cpi) {
+ AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+
+ if (!av1_superres_scaled(cm)) return;
+
+ assert(cpi->oxcf.enable_superres);
+ assert(!is_lossless_requested(&cpi->oxcf));
+ assert(!cm->all_lossless);
+
+ av1_superres_upscale(cm, NULL);
+
+ // If regular resizing is occurring the source will need to be downscaled to
+ // match the upscaled superres resolution. Otherwise the original source is
+ // used.
+ if (!av1_resize_scaled(cm)) {
+ cpi->source = cpi->unscaled_source;
+ // last_source is only replaced when one is currently set.
+ if (cpi->last_source != NULL) cpi->last_source = cpi->unscaled_last_source;
+ } else {
+ assert(cpi->unscaled_source->y_crop_width != cm->superres_upscaled_width);
+ assert(cpi->unscaled_source->y_crop_height != cm->superres_upscaled_height);
+ // Do downscale. cm->(width|height) has been updated by
+ // av1_superres_upscale
+ if (aom_realloc_frame_buffer(
+ &cpi->scaled_source, cm->superres_upscaled_width,
+ cm->superres_upscaled_height, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y, cm->seq_params.use_highbitdepth,
+ AOM_BORDER_IN_PIXELS, cm->byte_alignment, NULL, NULL, NULL))
+ aom_internal_error(
+ &cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to reallocate scaled source buffer for superres");
+ assert(cpi->scaled_source.y_crop_width == cm->superres_upscaled_width);
+ assert(cpi->scaled_source.y_crop_height == cm->superres_upscaled_height);
+ av1_resize_and_extend_frame(cpi->unscaled_source, &cpi->scaled_source,
+ (int)cm->seq_params.bit_depth, num_planes);
+ cpi->source = &cpi->scaled_source;
+ }
+}
+
+// Runs the post-reconstruction filter stages on cm->frame_to_show, in this
+// order: deblocking loop filter, CDEF, superres upscale, loop restoration.
+//
+// Stage gating (from the flags computed below):
+//  - deblocking is skipped for coded-lossless frames and large scale tile;
+//  - CDEF is additionally skipped when disabled in the sequence header;
+//  - restoration is skipped when disabled in the sequence header, for
+//    all-lossless frames and for large scale tile.
+// Skipped stages have their frame-level parameters reset to "off" values.
+static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &cpi->td.mb.e_mbd;
+
+ assert(IMPLIES(is_lossless_requested(&cpi->oxcf),
+ cm->coded_lossless && cm->all_lossless));
+
+ const int no_loopfilter = cm->coded_lossless || cm->large_scale_tile;
+ const int no_cdef =
+ !cm->seq_params.enable_cdef || cm->coded_lossless || cm->large_scale_tile;
+ const int no_restoration = !cm->seq_params.enable_restoration ||
+ cm->all_lossless || cm->large_scale_tile;
+
+ struct loopfilter *lf = &cm->lf;
+
+ if (no_loopfilter) {
+ lf->filter_level[0] = 0;
+ lf->filter_level[1] = 0;
+ } else {
+ // Pick the deblocking filter level and account the time spent in
+ // cpi->time_pick_lpf.
+ struct aom_usec_timer timer;
+
+ aom_clear_system_state();
+
+ aom_usec_timer_start(&timer);
+
+ av1_pick_filter_level(cpi->source, cpi, cpi->sf.lpf_pick);
+
+ aom_usec_timer_mark(&timer);
+ cpi->time_pick_lpf += aom_usec_timer_elapsed(&timer);
+ }
+
+ // Apply deblocking only when at least one of the two levels is nonzero;
+ // the multi-threaded path is used when more than one worker is available.
+ if (lf->filter_level[0] || lf->filter_level[1]) {
+#if LOOP_FILTER_BITMASK
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, 0, num_planes, 0);
+#else
+ if (cpi->num_workers > 1)
+ av1_loop_filter_frame_mt(cm->frame_to_show, cm, xd, 0, num_planes, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lf_row_sync);
+ else
+ av1_loop_filter_frame(cm->frame_to_show, cm, xd, 0, num_planes, 0);
+#endif
+ }
+
+ // First boundary-line save (last argument 0) happens before CDEF; the
+ // second one (last argument 1) happens after CDEF and superres below.
+ if (!no_restoration)
+ av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 0);
+
+ if (no_cdef) {
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ } else {
+ // Find CDEF parameters
+ av1_cdef_search(cm->frame_to_show, cpi->source, cm, xd,
+ cpi->sf.fast_cdef_search);
+
+ // Apply the filter
+ av1_cdef_frame(cm->frame_to_show, cm, xd);
+ }
+
+ // Upscale (if superres is active); this may also repoint cpi->source.
+ superres_post_encode(cpi);
+
+ if (no_restoration) {
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ } else {
+ av1_loop_restoration_save_boundary_lines(cm->frame_to_show, cm, 1);
+ av1_pick_filter_restoration(cpi->source, cpi);
+ // Run the restoration filter only if some plane selected a type other
+ // than RESTORE_NONE.
+ if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
+ cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
+ if (cpi->num_workers > 1)
+ av1_loop_restoration_filter_frame_mt(cm->frame_to_show, cm, 0,
+ cpi->workers, cpi->num_workers,
+ &cpi->lr_row_sync, &cpi->lr_ctxt);
+ else
+ av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0,
+ &cpi->lr_ctxt);
+ }
+ }
+}
+
+// Encodes the current frame exactly once (no rate-driven recode).
+//
+// Sets up size-independent and size-dependent state, scales the source and
+// (for inter frames) the references to the coded size, configures the
+// quantizer, adaptive-quantization mode and segmentation, then calls
+// av1_encode_frame(). Always returns AOM_CODEC_OK.
+static int encode_without_recode_loop(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int q = 0, bottom_index = 0, top_index = 0; // Dummy variables.
+
+ aom_clear_system_state();
+
+ set_size_independent_vars(cpi);
+
+ setup_frame_size(cpi);
+
+ assert(cm->width == cpi->scaled_source.y_crop_width);
+ assert(cm->height == cpi->scaled_source.y_crop_height);
+
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ // Use the source (and previous source, when present) at the coded size.
+ cpi->source =
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+ cpi->source->buf_8bit_valid = 0;
+ // Inter frames need their references at the coded size as well.
+ if (frame_is_intra_only(cm) == 0) {
+ scale_references(cpi);
+ }
+
+ av1_set_quantizer(cm, q);
+ setup_frame(cpi);
+ suppress_active_map(cpi);
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+ av1_cyclic_refresh_setup(cpi);
+ }
+ apply_active_map(cpi);
+ // With segmentation enabled, either inherit the segment features of the
+ // previous frame (no data update and a previous frame exists) or
+ // recalculate them; with segmentation disabled, clear the whole struct.
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ } else {
+ calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ // Record the segmentation features used with the current frame buffer.
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+
+ // transform / motion compensation build reconstruction frame
+ av1_encode_frame(cpi);
+
+ // Update some stats from cyclic refresh, and check if we should not update
+ // golden reference, for 1 pass CBR.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->frame_type != KEY_FRAME &&
+ (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == AOM_CBR))
+ av1_cyclic_refresh_check_golden_update(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+ aom_clear_system_state();
+ return AOM_CODEC_OK;
+}
+
+// Encodes the current frame, re-encoding with an adjusted q while the
+// projected frame size (or forced key frame quality) is out of range.
+//
+// Each iteration: scale source/references, set the quantizer, encode, and
+// (when the recode speed feature allows) do a dummy bitstream pack into
+// dest to obtain rc->projected_frame_size. The q search is a bracketed
+// search between q_low and q_high; the loop ends when q stops changing or no
+// recode is required.
+//
+// size/dest: output byte count and buffer used by av1_pack_bitstream() for
+// the dummy pack.
+// Returns AOM_CODEC_OK, or AOM_CODEC_ERROR if the dummy pack fails.
+static int encode_with_recode_loop(AV1_COMP *cpi, size_t *size, uint8_t *dest) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int bottom_index, top_index;
+ int loop_count = 0;
+ int loop_at_this_size = 0;
+ int loop = 0;
+ int overshoot_seen = 0;
+ int undershoot_seen = 0;
+ // Both limits are assigned on the first pass through the loop below
+ // (loop_count == 0) before they are read.
+ int frame_over_shoot_limit;
+ int frame_under_shoot_limit;
+ int q = 0, q_low = 0, q_high = 0;
+
+ set_size_independent_vars(cpi);
+
+ cpi->source->buf_8bit_valid = 0;
+
+ aom_clear_system_state();
+ setup_frame_size(cpi);
+ set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
+
+ do {
+ aom_clear_system_state();
+
+ if (loop_count == 0) {
+ // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed.
+ set_mv_search_params(cpi);
+
+ // Reset the loop state for new frame size.
+ overshoot_seen = 0;
+ undershoot_seen = 0;
+
+ q_low = bottom_index;
+ q_high = top_index;
+
+ loop_at_this_size = 0;
+
+ // Decide frame size bounds first time through.
+ av1_rc_compute_frame_size_bounds(cpi, rc->this_frame_target,
+ &frame_under_shoot_limit,
+ &frame_over_shoot_limit);
+ }
+
+ // if frame was scaled calculate global_motion_search again if already
+ // done
+ if (loop_count > 0 && cpi->source && cpi->global_motion_search_done)
+ if (cpi->source->y_crop_width != cm->width ||
+ cpi->source->y_crop_height != cm->height)
+ cpi->global_motion_search_done = 0;
+ cpi->source =
+ av1_scale_if_required(cm, cpi->unscaled_source, &cpi->scaled_source);
+ if (cpi->unscaled_last_source != NULL)
+ cpi->last_source = av1_scale_if_required(cm, cpi->unscaled_last_source,
+ &cpi->scaled_last_source);
+
+ if (frame_is_intra_only(cm) == 0) {
+ // On a recode, drop the references taken by the previous iteration
+ // before taking them again.
+ if (loop_count > 0) {
+ release_scaled_references(cpi);
+ }
+ scale_references(cpi);
+ }
+ av1_set_quantizer(cm, q);
+ // printf("Frame %d/%d: q = %d, frame_type = %d\n", cm->current_video_frame,
+ // cm->show_frame, q, cm->frame_type);
+
+ if (loop_count == 0) setup_frame(cpi);
+
+ // Base q-index may have changed, so we need to assign proper default coef
+ // probs before every iteration.
+ if (cm->primary_ref_frame == PRIMARY_REF_NONE ||
+ cm->frame_refs[cm->primary_ref_frame].idx < 0) {
+ av1_default_coef_probs(cm);
+ av1_setup_frame_contexts(cm);
+ }
+
+ // Variance adaptive and in frame q adjustment experiments are mutually
+ // exclusive.
+ if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
+ av1_vaq_frame_setup(cpi);
+ } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) {
+ av1_setup_in_frame_q_adj(cpi);
+ }
+ // With segmentation enabled, either inherit the previous frame's segment
+ // features or recalculate them; otherwise clear the segmentation struct.
+ if (cm->seg.enabled) {
+ if (!cm->seg.update_data && cm->prev_frame) {
+ segfeatures_copy(&cm->seg, &cm->prev_frame->seg);
+ } else {
+ calculate_segdata(&cm->seg);
+ }
+ } else {
+ memset(&cm->seg, 0, sizeof(cm->seg));
+ }
+ segfeatures_copy(&cm->cur_frame->seg, &cm->seg);
+
+ // transform / motion compensation build reconstruction frame
+ save_coding_context(cpi);
+ av1_encode_frame(cpi);
+
+ // Update the skip mb flag probabilities based on the distribution
+ // seen in the last encoder iteration.
+ // update_base_skip_probs(cpi);
+
+ aom_clear_system_state();
+
+ // Dummy pack of the bitstream using up to date stats to get an
+ // accurate estimate of output frame size to determine if we need
+ // to recode.
+ if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
+ // The coding context is restored both before and after the dummy pack.
+ restore_coding_context(cpi);
+
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ // *size is in bytes; the projected frame size is kept in bits.
+ rc->projected_frame_size = (int)(*size) << 3;
+ restore_coding_context(cpi);
+
+ if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
+ }
+
+ if (cpi->oxcf.rc_mode == AOM_Q) {
+ // Constant-q mode never recodes for rate.
+ loop = 0;
+ } else {
+ if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced &&
+ (rc->projected_frame_size < rc->max_frame_bandwidth)) {
+ // Forced key frame: steer q by the luma SSE against the source,
+ // targeting the band (ambient_err / 2, ambient_err].
+ int last_q = q;
+ int64_t kf_err;
+
+ int64_t high_err_target = cpi->ambient_err;
+ int64_t low_err_target = cpi->ambient_err >> 1;
+
+ if (cm->seq_params.use_highbitdepth) {
+ kf_err = aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ kf_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+ // Prevent possible divide by zero error below for perfect KF
+ kf_err += !kf_err;
+
+ // The key frame is not good enough or we can afford
+ // to make it better without undue risk of popping.
+ if ((kf_err > high_err_target &&
+ rc->projected_frame_size <= frame_over_shoot_limit) ||
+ (kf_err > low_err_target &&
+ rc->projected_frame_size <= frame_under_shoot_limit)) {
+ // Lower q_high
+ q_high = q > q_low ? q - 1 : q_low;
+
+ // Adjust Q
+ q = (int)((q * high_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low) >> 1);
+ } else if (kf_err < low_err_target &&
+ rc->projected_frame_size >= frame_under_shoot_limit) {
+ // The key frame is much better than the previous frame
+ // Raise q_low
+ q_low = q < q_high ? q + 1 : q_high;
+
+ // Adjust Q
+ q = (int)((q * low_err_target) / kf_err);
+ q = AOMMIN(q, (q_high + q_low + 1) >> 1);
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = q != last_q;
+ } else if (recode_loop_test(cpi, frame_over_shoot_limit,
+ frame_under_shoot_limit, q,
+ AOMMAX(q_high, top_index), bottom_index)) {
+ // Is the projected frame size out of range and are we allowed
+ // to attempt to recode.
+ int last_q = q;
+ int retries = 0;
+
+ // Frame size out of permitted range:
+ // Update correction factor & compute new Q to try...
+ // Frame is too large
+ if (rc->projected_frame_size > rc->this_frame_target) {
+ // Special case if the projected size is > the max allowed.
+ if (rc->projected_frame_size >= rc->max_frame_bandwidth)
+ q_high = rc->worst_quality;
+
+ // Raise q_low to at least the current value
+ q_low = q < q_high ? q + 1 : q_high;
+
+ if (undershoot_seen || loop_at_this_size > 1) {
+ // Update the rate correction factors, then bisect (rounding up)
+ // between q_low and q_high.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ q = (q_high + q_low + 1) / 2;
+ } else {
+ // Update the rate correction factors, then ask rate control for a
+ // new q.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width,
+ cm->height);
+
+ // Retry (at most 10 times) while the regulated q is still below
+ // the raised lower bound.
+ while (q < q_low && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ AOMMAX(q_high, top_index), cm->width,
+ cm->height);
+ retries++;
+ }
+ }
+
+ overshoot_seen = 1;
+ } else {
+ // Frame is too small
+ q_high = q > q_low ? q - 1 : q_low;
+
+ if (overshoot_seen || loop_at_this_size > 1) {
+ // Bisect (rounding down) between q_low and q_high.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = (q_high + q_low) / 2;
+ } else {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ // Special case reset for qlow for constrained quality.
+ // This should only trigger where there is very substantial
+ // undershoot on a frame and the auto cq level is above
+ // the user passed in value.
+ if (cpi->oxcf.rc_mode == AOM_CQ && q < q_low) {
+ q_low = q;
+ }
+
+ // Retry (at most 10 times) while the regulated q is still above
+ // the lowered upper bound.
+ while (q > q_high && retries < 10) {
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, bottom_index,
+ top_index, cm->width, cm->height);
+ retries++;
+ }
+ }
+
+ undershoot_seen = 1;
+ }
+
+ // Clamp Q to upper and lower limits:
+ q = clamp(q, q_low, q_high);
+
+ loop = (q != last_q);
+ } else {
+ loop = 0;
+ }
+ }
+
+ // Special case for overlay frame.
+ if (rc->is_src_frame_alt_ref &&
+ rc->projected_frame_size < rc->max_frame_bandwidth)
+ loop = 0;
+
+ // A global-motion driven recode request overrides the decisions above
+ // unless the gm_disable_recode speed feature is set.
+ if (!cpi->sf.gm_disable_recode) {
+ if (recode_loop_test_global_motion(cpi)) loop = 1;
+ }
+
+ if (loop) {
+ ++loop_count;
+ ++loop_at_this_size;
+
+#if CONFIG_INTERNAL_STATS
+ ++cpi->tot_recode_hits;
+#endif
+ }
+ } while (loop);
+
+ return AOM_CODEC_OK;
+}
+
+// Returns the reference flags (AOM_*_FLAG bits) to use for the current frame.
+//
+// Starts from cpi->ext_ref_frame_flags, clears AOM_GOLD_FLAG when
+// rc.frames_till_gf_update_due is INT_MAX, and then clears the flag of every
+// reference that maps to the same buffer as a higher-priority reference.
+// Priority, highest first:
+//   LAST, ALTREF, LAST2, LAST3, GOLDEN, BWDREF, ALTREF2.
+static int get_ref_frame_flags(const AV1_COMP *cpi) {
+  const int *const map = cpi->common.ref_frame_map;
+
+  // Buffer currently mapped to each reference.
+  const int last = map[cpi->ref_fb_idx[0]];
+  const int last2 = map[cpi->ref_fb_idx[1]];
+  const int last3 = map[cpi->ref_fb_idx[2]];
+  const int gld = map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]];
+  const int bwd = map[cpi->ref_fb_idx[BWDREF_FRAME - 1]];
+  const int alt2 = map[cpi->ref_fb_idx[ALTREF2_FRAME - 1]];
+  const int alt = map[cpi->ref_fb_idx[ALTREF_FRAME - 1]];
+
+  // After av1_apply_encoding_flags() is called, cpi->ref_frame_flags might be
+  // adjusted according to external encoder flags.
+  int flags = cpi->ext_ref_frame_flags;
+
+  if (cpi->rc.frames_till_gf_update_due == INT_MAX) flags &= ~AOM_GOLD_FLAG;
+
+  // ALTREF duplicates LAST.
+  if (alt == last) flags &= ~AOM_ALT_FLAG;
+
+  // LAST2 duplicates LAST or ALTREF.
+  if (last2 == last || last2 == alt) flags &= ~AOM_LAST2_FLAG;
+
+  // LAST3 duplicates LAST, ALTREF or LAST2.
+  if (last3 == last || last3 == alt || last3 == last2)
+    flags &= ~AOM_LAST3_FLAG;
+
+  // GOLDEN duplicates LAST, ALTREF, LAST2 or LAST3.
+  if (gld == last || gld == alt || gld == last2 || gld == last3)
+    flags &= ~AOM_GOLD_FLAG;
+
+  // BWDREF duplicates any of the above.
+  if ((bwd == last || bwd == alt || bwd == last2 || bwd == last3 ||
+       bwd == gld) &&
+      (flags & AOM_BWD_FLAG))
+    flags &= ~AOM_BWD_FLAG;
+
+  // ALTREF2 duplicates any other reference.
+  if ((alt2 == last || alt2 == alt || alt2 == last2 || alt2 == last3 ||
+       alt2 == gld || alt2 == bwd) &&
+      (flags & AOM_ALT2_FLAG))
+    flags &= ~AOM_ALT2_FLAG;
+
+  return flags;
+}
+
+// Copies the externally supplied per-frame settings (the cpi->ext_* fields,
+// set through calls such as av1_update_reference() and av1_update_entropy())
+// into the live encoder state.  The overrides guarded by a *_pending latch are
+// consumed here: the latch is cleared, so they are only valid for the next
+// frame passed to encode_frame_to_data_rate().
+static void set_ext_overrides(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Frame-level switches that are re-applied on every call.
+  if (cpi->ext_use_s_frame) {
+    cm->frame_type = S_FRAME;
+  }
+  cm->force_primary_ref_none = cpi->ext_use_primary_ref_none;
+  cm->allow_ref_frame_mvs = cpi->ext_use_ref_frame_mvs;
+
+  // One-shot override of the frame-context refresh mode.
+  if (cpi->ext_refresh_frame_context_pending) {
+    cpi->ext_refresh_frame_context_pending = 0;
+    cm->refresh_frame_context = cpi->ext_refresh_frame_context;
+  }
+
+  // One-shot override of which reference buffers are refreshed.
+  if (cpi->ext_refresh_frame_flags_pending) {
+    cpi->ext_refresh_frame_flags_pending = 0;
+    cpi->refresh_last_frame = cpi->ext_refresh_last_frame;
+    cpi->refresh_golden_frame = cpi->ext_refresh_golden_frame;
+    cpi->refresh_bwd_ref_frame = cpi->ext_refresh_bwd_ref_frame;
+    cpi->refresh_alt2_ref_frame = cpi->ext_refresh_alt2_ref_frame;
+    cpi->refresh_alt_ref_frame = cpi->ext_refresh_alt_ref_frame;
+  }
+
+  // A keyframe is already error resilient, and keyframes with
+  // error_resilient_mode interfere with the use of show_existing_frame when
+  // forward reference keyframes are enabled.  This has to stay after the
+  // S_FRAME override above because it reads cm->frame_type.
+  cm->error_resilient_mode =
+      cpi->ext_use_error_resilient && cm->frame_type != KEY_FRAME;
+}
+
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+//
+// Prints a one-line summary of the current frame and of each reference slot
+// (LAST .. ALTREF), then, if the frame is shown, appends its Y, U and V planes
+// to /tmp/enc_filtered_recon.yuv.  The file is opened with "wb" when
+// current_video_frame is 0 and with "ab" otherwise.  The chroma planes are
+// written at half the luma width and height.
+static void dump_filtered_recon_frames(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+  if (recon_buf == NULL) {
+    printf("Frame %d is not ready.\n", cm->current_video_frame);
+    return;
+  }
+
+  // Reference-enable flag for each reference frame type, indexed by the
+  // reference frame enum value (entry 0 is unused).
+  static const int flag_list[REF_FRAMES] = { 0,
+                                             AOM_LAST_FLAG,
+                                             AOM_LAST2_FLAG,
+                                             AOM_LAST3_FLAG,
+                                             AOM_GOLD_FLAG,
+                                             AOM_BWD_FLAG,
+                                             AOM_ALT2_FLAG,
+                                             AOM_ALT_FLAG };
+
+  printf(
+      "\n***Frame=%d (frame_offset=%d, show_frame=%d, "
+      "show_existing_frame=%d) "
+      "[LAST LAST2 LAST3 GOLDEN BWD ALT2 ALT]=[",
+      cm->current_video_frame, cm->frame_offset, cm->show_frame,
+      cm->show_existing_frame);
+  for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int buf_idx = cm->frame_refs[ref_frame - LAST_FRAME].idx;
+    const int has_buf = (buf_idx >= 0);
+    const int ref_offset =
+        has_buf ? (int)cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset
+                : -1;
+    const int ref_enabled = (cpi->ref_frame_flags & flag_list[ref_frame]) != 0;
+    const int rf_level = has_buf ? (int)cpi->frame_rf_level[buf_idx] : -1;
+    printf(" %d(%c-%d-%4.2f)", ref_offset, ref_enabled ? 'Y' : 'N', rf_level,
+           (buf_idx >= 0) ? rate_factor_deltas[cpi->frame_rf_level[buf_idx]]
+                          : -1);
+  }
+  printf(" ]\n");
+
+  if (!cm->show_frame) {
+    printf("Frame %d is a no show frame, so no image dump.\n",
+           cm->current_video_frame);
+    return;
+  }
+
+  // Start a fresh file on the first frame, append on every later one.
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  const int is_first_frame = (cm->current_video_frame == 0);
+  FILE *f_recon = fopen(file_name, is_first_frame ? "wb" : "ab");
+  if (f_recon == NULL) {
+    if (is_first_frame) {
+      printf("Unable to open file %s to write.\n", file_name);
+    } else {
+      printf("Unable to open file %s to append.\n", file_name);
+    }
+    return;
+  }
+
+  printf(
+      "\nFrame=%5d, encode_update_type[%5d]=%1d, frame_offset=%d, "
+      "show_frame=%d, show_existing_frame=%d, source_alt_ref_active=%d, "
+      "refresh_alt_ref_frame=%d, rf_level=%d, "
+      "y_stride=%4d, uv_stride=%4d, cm->width=%4d, cm->height=%4d\n\n",
+      cm->current_video_frame, cpi->twopass.gf_group.index,
+      cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+      cm->frame_offset, cm->show_frame, cm->show_existing_frame,
+      cpi->rc.source_alt_ref_active, cpi->refresh_alt_ref_frame,
+      cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index],
+      recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+#if 0
+  int ref_frame;
+  printf("get_ref_frame_map_idx: [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+    printf(" %d", get_ref_frame_map_idx(cpi, ref_frame));
+  printf(" ]\n");
+  printf("cm->new_fb_idx = %d\n", cm->new_fb_idx);
+  printf("cm->ref_frame_map = [");
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    printf(" %d", cm->ref_frame_map[ref_frame - LAST_FRAME]);
+  }
+  printf(" ]\n");
+#endif  // 0
+
+  // --- Y ---
+  for (int row = 0; row < cm->height; ++row) {
+    fwrite(&recon_buf->y_buffer[row * recon_buf->y_stride], 1, cm->width,
+           f_recon);
+  }
+
+  // --- U ---
+  for (int row = 0; row < (cm->height >> 1); ++row) {
+    fwrite(&recon_buf->u_buffer[row * recon_buf->uv_stride], 1,
+           (cm->width >> 1), f_recon);
+  }
+
+  // --- V ---
+  for (int row = 0; row < (cm->height >> 1); ++row) {
+    fwrite(&recon_buf->v_buffer[row * recon_buf->uv_stride], 1,
+           (cm->width >> 1), f_recon);
+  }
+
+  fclose(f_recon);
+}
+#endif // DUMP_RECON_FRAMES
+
+// Returns 1 when the current frame refreshes none of the reference buffers
+// (LAST, GOLDEN, BWDREF, ALTREF2, ALTREF) and 0 as soon as any refresh flag
+// is set.
+static INLINE int is_frame_droppable(AV1_COMP *cpi) {
+  if (cpi->refresh_last_frame || cpi->refresh_golden_frame) return 0;
+  if (cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame) return 0;
+  return !cpi->refresh_alt_ref_frame;
+}
+
+static int encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+ int skip_adapt,
+ unsigned int *frame_flags) {  // returns AOM_CODEC_OK or AOM_CODEC_ERROR
+ AV1_COMMON *const cm = &cpi->common;
+ SequenceHeader *const seq_params = &cm->seq_params;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ struct segmentation *const seg = &cm->seg;
+
+ set_ext_overrides(cpi);  // apply the externally supplied overrides first
+ aom_clear_system_state();
+
+ // The frame type has already been decided outside of this function call.
+ cm->cur_frame->intra_only = frame_is_intra_only(cm);
+ cm->cur_frame->frame_type = cm->frame_type;
+
+ // S_FRAMEs are always error resilient
+ cm->error_resilient_mode |= frame_is_sframe(cm);
+
+ cm->large_scale_tile = cpi->oxcf.large_scale_tile;
+ cm->single_tile_decoding = cpi->oxcf.single_tile_decoding;
+ if (cm->large_scale_tile) seq_params->frame_id_numbers_present_flag = 0;
+
+ cm->allow_ref_frame_mvs &= frame_might_allow_ref_frame_mvs(cm);
+ // cm->allow_ref_frame_mvs needs to be written into the frame header while
+ // cm->large_scale_tile is 1, therefore, "cm->large_scale_tile=1" case is
+ // separated from frame_might_allow_ref_frame_mvs().
+ cm->allow_ref_frame_mvs &= !cm->large_scale_tile;
+
+ cm->allow_warped_motion =
+ cpi->oxcf.allow_warped_motion && frame_might_allow_warped_motion(cm);
+
+ // Reset the frame packet stamp index.
+ if (cm->frame_type == KEY_FRAME && cm->show_frame)
+ cm->current_video_frame = 0;
+
+ // NOTE:
+ // (1) The ref_frame_flags are set up early because they are determined
+ // by the current frame properties;
+ // (2) This setup applies both to the show_existing_frame case and to
+ // all other cases;
+ // (3) It is skipped while current_video_frame is 0.
+ if (cm->current_video_frame > 0)
+ cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+ if (encode_show_existing_frame(cm)) {
+ // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+ // BWDREF_FRAME in the reference frame buffer.
+ if (cm->frame_type == KEY_FRAME) {
+ cm->reset_decoder_state = 1;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ cm->show_frame = 1;
+ cpi->frame_flags = *frame_flags;
+
+ restore_coding_context(cpi);
+
+ // Build the bitstream
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ cpi->seq_params_locked = 1;
+
+ // Set up frame to show to get ready for stats collection.
+ cm->frame_to_show = get_frame_new_buffer(cm);
+
+ // Update current frame offset.
+ cm->frame_offset =
+ cm->buffer_pool->frame_bufs[cm->new_fb_idx].cur_frame_offset;
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ // Update the LAST_FRAME in the reference frame buffer.
+ // NOTE:
+ // (1) For BWDREF_FRAME as the show_existing_frame, the reference frame
+ // update has been done previously when handling the LAST_BIPRED_FRAME
+ // right before BWDREF_FRAME (in the display order);
+ // (2) For INTNL_OVERLAY as the show_existing_frame, the reference frame
+ // update will be done when the following is called, which will
+ // exchange the virtual indexes between LAST_FRAME and ALTREF2_FRAME,
+ // so that:
+ // - LAST3 gets retired,
+ // - LAST2 becomes LAST3 and LAST becomes LAST2,
+ // - ALTREF2_FRAME serves as the new LAST_FRAME.
+ update_reference_frames(cpi);
+
+ // Update frame flags
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+ // Update the frame type
+ cm->last_frame_type = cm->frame_type;
+
+ // Since we allocate a spot for the OVERLAY frame in the gf group, we need
+ // to do post-encoding update accordingly.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ av1_set_target_rate(cpi, cm->width, cm->height);
+ av1_rc_postencode_update(cpi, *size);
+ }
+
+ ++cm->current_video_frame;
+
+ return AOM_CODEC_OK;
+ }
+
+ // Set default state for segment based loop filter update flags.
+ cm->lf.mode_ref_delta_update = 0;
+
+ // Set various flags etc to special state if it is a key frame.
+ if (frame_is_intra_only(cm) || frame_is_sframe(cm)) {
+ // Reset the loop filter deltas and segmentation map.
+ av1_reset_segment_features(cm);
+
+ // If segmentation is enabled force a map update for key frames.
+ if (seg->enabled) {
+ seg->update_map = 1;
+ seg->update_data = 1;
+ }
+
+ // The alternate reference frame cannot be active for a key frame.
+ cpi->rc.source_alt_ref_active = 0;
+ }
+ if (cpi->oxcf.mtu == 0) {
+ cm->num_tg = cpi->oxcf.num_tile_groups;
+ } else {
+ // Use a default value for the purposes of weighting costs in probability
+ // updates
+ cm->num_tg = DEFAULT_MAX_NUM_TG;
+ }
+
+ // For 1 pass CBR, check if we are dropping this frame.
+ // Never drop on key frame.
+ if (oxcf->pass == 0 && oxcf->rc_mode == AOM_CBR &&
+ cm->frame_type != KEY_FRAME) {
+ if (av1_rc_drop_frame(cpi)) {
+ av1_rc_postencode_update_drop_frame(cpi);
+ return AOM_CODEC_OK;  // dropped: returns before any bitstream is packed
+ }
+ }
+
+ aom_clear_system_state();
+
+#if CONFIG_INTERNAL_STATS
+ memset(cpi->mode_chosen_counts, 0,
+ MAX_MODES * sizeof(*cpi->mode_chosen_counts));
+#endif
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ /* Non-normative definition of current_frame_id ("frame counter" with
+ * wraparound) */
+ const int frame_id_length = FRAME_ID_LENGTH;
+ if (cm->current_frame_id == -1) {
+ int lsb, msb;
+ /* quasi-random initialization of current_frame_id for a key frame */
+ if (cpi->source->flags & YV12_FLAG_HIGHBITDEPTH) {
+ lsb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[0] & 0xff;
+ msb = CONVERT_TO_SHORTPTR(cpi->source->y_buffer)[1] & 0xff;
+ } else {
+ lsb = cpi->source->y_buffer[0] & 0xff;
+ msb = cpi->source->y_buffer[1] & 0xff;
+ }
+ cm->current_frame_id = ((msb << 8) + lsb) % (1 << frame_id_length);
+
+ // S_FRAMEs are meant for stitching together different streams of
+ // different resolutions, so current_frame_id must be the same across
+ // the different streams of the same content rather than random.
+ // 0x37 is the number chosen as the start
+ // point.
+ if (cpi->oxcf.sframe_enabled) cm->current_frame_id = 0x37;
+ } else {
+ cm->current_frame_id =
+ (cm->current_frame_id + 1 + (1 << frame_id_length)) %
+ (1 << frame_id_length);
+ }
+ }
+
+ switch (cpi->oxcf.cdf_update_mode) {
+ case 0: // No CDF update for any frames(4~6% compression loss).
+ cm->disable_cdf_update = 1;
+ break;
+ case 1: // Enable CDF update for all frames.
+ cm->disable_cdf_update = 0;
+ break;
+ case 2:
+ // Strategically determine at which frames to do CDF update.
+ // Currently only enable CDF update for all-intra and no-show frames(1.5%
+ // compression loss).
+ // TODO(huisu@google.com): design schemes for various trade-offs between
+ // compression quality and decoding speed.
+ cm->disable_cdf_update =
+ (frame_is_intra_only(cm) || !cm->show_frame) ? 0 : 1;
+ break;
+ }
+ cm->timing_info_present &= !seq_params->reduced_still_picture_hdr;
+
+ if (cpi->sf.recode_loop == DISALLOW_RECODE) {
+ if (encode_without_recode_loop(cpi) != AOM_CODEC_OK) return AOM_CODEC_ERROR;
+ } else {
+ if (encode_with_recode_loop(cpi, size, dest) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ }
+
+ cm->last_tile_cols = cm->tile_cols;
+ cm->last_tile_rows = cm->tile_rows;
+
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ av1_compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif // OUTPUT_YUV_SKINMAP
+
+ // Special case code to reduce pulsing when key frames are forced at a
+ // fixed interval. Note the reconstruction error if it is the frame before
+ // the forced key frame.
+ if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
+ if (seq_params->use_highbitdepth) {
+ cpi->ambient_err =
+ aom_highbd_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ } else {
+ cpi->ambient_err = aom_get_y_sse(cpi->source, get_frame_new_buffer(cm));
+ }
+ }
+
+ // If the encoder forced a KEY_FRAME decision or if frame is an S_FRAME
+ if ((cm->frame_type == KEY_FRAME && cm->show_frame) || frame_is_sframe(cm)) {
+ cpi->refresh_last_frame = 1;
+ }
+
+ cm->frame_to_show = get_frame_new_buffer(cm);
+ cm->frame_to_show->color_primaries = seq_params->color_primaries;
+ cm->frame_to_show->transfer_characteristics =
+ seq_params->transfer_characteristics;
+ cm->frame_to_show->matrix_coefficients = seq_params->matrix_coefficients;
+ cm->frame_to_show->monochrome = seq_params->monochrome;
+ cm->frame_to_show->chroma_sample_position =
+ seq_params->chroma_sample_position;
+ cm->frame_to_show->color_range = seq_params->color_range;
+ cm->frame_to_show->render_width = cm->render_width;
+ cm->frame_to_show->render_height = cm->render_height;
+
+ // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+ // off.
+
+ // Pick the loop filter level for the frame.
+ if (!cm->allow_intrabc) {
+ loopfilter_frame(cpi, cm);
+ } else {
+ cm->lf.filter_level[0] = 0;
+ cm->lf.filter_level[1] = 0;
+ cm->cdef_bits = 0;
+ cm->cdef_strengths[0] = 0;
+ cm->nb_cdef_strengths = 1;
+ cm->cdef_uv_strengths[0] = 0;
+ cm->rst_info[0].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[1].frame_restoration_type = RESTORE_NONE;
+ cm->rst_info[2].frame_restoration_type = RESTORE_NONE;
+ }
+
+ // TODO(debargha): Fix mv search range on encoder side
+ // aom_extend_frame_inner_borders(cm->frame_to_show, av1_num_planes(cm));
+ aom_extend_frame_borders(cm->frame_to_show, av1_num_planes(cm));
+
+#ifdef OUTPUT_YUV_REC
+ aom_write_one_yuv_frame(cm, cm->frame_to_show);
+#endif
+
+ // Build the bitstream
+ if (av1_pack_bitstream(cpi, dest, size) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ cpi->seq_params_locked = 1;
+
+ if (skip_adapt) return AOM_CODEC_OK;  // skips every post-encode update below
+
+ if (seq_params->frame_id_numbers_present_flag) {
+ int i;
+ // Update reference frame id values based on the value of refresh_frame_mask
+ for (i = 0; i < REF_FRAMES; i++) {
+ if ((cpi->refresh_frame_mask >> i) & 1) {
+ cm->ref_frame_id[i] = cm->current_frame_id;
+ }
+ }
+ }
+
+#if DUMP_RECON_FRAMES == 1
+ // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+ dump_filtered_recon_frames(cpi);
+#endif // DUMP_RECON_FRAMES
+
+ if (cm->seg.enabled) {
+ if (cm->seg.update_map) {
+ update_reference_segmentation_map(cpi);
+ } else if (cm->last_frame_seg_map) {
+ memcpy(cm->current_frame_seg_map, cm->last_frame_seg_map,
+ cm->mi_cols * cm->mi_rows * sizeof(uint8_t));
+ }
+ }
+
+ if (frame_is_intra_only(cm) == 0) {
+ release_scaled_references(cpi);
+ }
+
+ update_reference_frames(cpi);
+
+#if CONFIG_ENTROPY_STATS
+ av1_accumulate_frame_counts(&aggregate_fc, &cpi->counts);
+#endif // CONFIG_ENTROPY_STATS
+
+ if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+ *cm->fc = cpi->tile_data[cm->largest_tile_id].tctx;
+ av1_reset_cdf_symbol_counters(cm->fc);
+ }
+
+ if (cpi->refresh_golden_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_GOLDEN;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+
+ if (cpi->refresh_alt_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_ALTREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+ if (cpi->refresh_bwd_ref_frame == 1)
+ cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+ else
+ cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+
+ cm->last_frame_type = cm->frame_type;
+
+ av1_rc_postencode_update(cpi, *size);
+
+ if (cm->frame_type == KEY_FRAME) {
+ // Tell the caller that the frame was coded as a key frame
+ *frame_flags = cpi->frame_flags | FRAMEFLAGS_KEY;
+ } else {
+ *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+ }
+
+ // Clear the one shot update flags for segmentation map and mode/ref loop
+ // filter deltas.
+ cm->seg.update_map = 0;
+ cm->seg.update_data = 0;
+ cm->lf.mode_ref_delta_update = 0;
+
+ // A droppable frame might not be shown but it always
+ // takes a space in the gf group. Therefore, even when
+ // it is not shown, we still need to update the count down.
+
+ if (cm->show_frame) {
+ // TODO(zoeliu): We may only need to swap mi and prev_mi for those
+ // frames that are
+ // being used as reference.
+ swap_mi_and_prev_mi(cm);
+ // Don't increment frame counters if this was an altref buffer
+ // update not a real frame
+
+ ++cm->current_video_frame;
+ }
+
+ // NOTE: Shall not refer to any frame not used as reference.
+ if (cm->is_reference_frame) {
+ // keep track of the last coded dimensions
+ cm->last_width = cm->width;
+ cm->last_height = cm->height;
+
+ // reset to normal state now that we are done.
+ cm->last_show_frame = cm->show_frame;
+ }
+
+ return AOM_CODEC_OK;
+}
+
+// Advances the key-frame distance counters once for the frame just coded:
+// rc.frames_since_key goes up by one and rc.frames_to_key goes down by one.
+static INLINE void update_keyframe_counters(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+  // differently here for rc->avg_frame_bandwidth.
+  if (!cm->show_frame && !rc->is_bwd_ref_frame) return;
+
+  // A show_existing_frame that is neither an alt-ref source frame nor a key
+  // frame had its counters updated when it was originally encoded.
+  if (cm->show_existing_frame && !rc->is_src_frame_alt_ref &&
+      cm->frame_type != KEY_FRAME) {
+    return;
+  }
+
+  ++rc->frames_since_key;
+  --rc->frames_to_key;
+}
+
+// Counts rc.frames_till_gf_update_due down by one (never below zero) when the
+// frame is shown or when it refreshes no reference buffer.
+static INLINE void update_frames_till_gf_update(AV1_COMP *cpi) {
+  // TODO(weitinglin): Updating this counter for is_frame_droppable
+  // is a work-around to handle the condition when a frame is drop.
+  // We should fix the cpi->common.show_frame flag
+  // instead of checking the other condition to update the counter properly.
+  if (!cpi->common.show_frame && !is_frame_droppable(cpi)) return;
+
+  // Decrement count down till next gf
+  if (cpi->rc.frames_till_gf_update_due > 0) {
+    --cpi->rc.frames_till_gf_update_due;
+  }
+}
+
+// Moves twopass.gf_group.index on to the next entry, ready for the next frame.
+static INLINE void update_twopass_gf_group_index(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+
+  // A show_existing_frame that is neither an alt-ref source frame nor a key
+  // frame already had the index incremented when it was originally encoded.
+  if (cm->show_existing_frame && !cpi->rc.is_src_frame_alt_ref &&
+      cm->frame_type != KEY_FRAME) {
+    return;
+  }
+
+  ++cpi->twopass.gf_group.index;
+}
+
+// Runs the post-encode counter updates: key-frame counters, the golden-frame
+// countdown and, in pass 2 only, the gf group index.
+static void update_rc_counts(AV1_COMP *cpi) {
+  update_keyframe_counters(cpi);
+  update_frames_till_gf_update(cpi);
+
+  if (cpi->oxcf.pass != 2) return;
+  update_twopass_gf_group_index(cpi);
+}
+
+// One-pass encode of a single frame: picks the CBR or VBR rate-control
+// parameters, encodes the frame, then updates the rate-control counters.
+// Returns AOM_CODEC_OK, or AOM_CODEC_ERROR if the frame encode failed (in
+// which case the counters are left untouched).
+static int Pass0Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                       int skip_adapt, unsigned int *frame_flags) {
+  const int is_cbr = (cpi->oxcf.rc_mode == AOM_CBR);
+  if (is_cbr) {
+    av1_rc_get_one_pass_cbr_params(cpi);
+  } else {
+    av1_rc_get_one_pass_vbr_params(cpi);
+  }
+
+  const int status =
+      encode_frame_to_data_rate(cpi, size, dest, skip_adapt, frame_flags);
+  if (status != AOM_CODEC_OK) return AOM_CODEC_ERROR;
+
+  update_rc_counts(cpi);
+  check_show_existing_frame(cpi);
+  return AOM_CODEC_OK;
+}
+
+// Second-pass encode of a single frame: encodes the frame (skip_adapt is
+// always 0 here), then runs the two-pass post-encode update and the
+// rate-control counter updates.  Returns AOM_CODEC_OK, or AOM_CODEC_ERROR if
+// the frame encode failed.  The TXCOEFF_COST_TIMER sections only collect and
+// print timing statistics.
+static int Pass2Encode(AV1_COMP *cpi, size_t *size, uint8_t *dest,
+                       unsigned int *frame_flags) {
+#if CONFIG_MISMATCH_DEBUG
+  mismatch_move_frame_idx_w();
+#endif
+#if TXCOEFF_COST_TIMER
+  AV1_COMMON *cm = &cpi->common;
+  cm->txcoeff_cost_timer = 0;
+  cm->txcoeff_cost_count = 0;
+#endif
+
+  const int status = encode_frame_to_data_rate(cpi, size, dest, 0, frame_flags);
+  if (status != AOM_CODEC_OK) return AOM_CODEC_ERROR;
+
+#if TXCOEFF_COST_TIMER
+  cm->cum_txcoeff_cost_timer += cm->txcoeff_cost_timer;
+  fprintf(stderr,
+          "\ntxb coeff cost block number: %ld, frame time: %ld, cum time %ld "
+          "in us\n",
+          cm->txcoeff_cost_count, cm->txcoeff_cost_timer,
+          cm->cum_txcoeff_cost_timer);
+#endif
+
+  av1_twopass_postencode_update(cpi);
+  update_rc_counts(cpi);
+  check_show_existing_frame(cpi);
+  return AOM_CODEC_OK;
+}
+
+#if CONFIG_DENOISE
+// Runs the denoise-and-model step on the source frame `sd` and, when it
+// produces film grain parameters with apply_grain set, appends them to the
+// encoder's film grain table for [time_stamp, end_time].
+//
+// The denoiser state and the grain table are allocated on first use and kept
+// in cpi.  Returns 0 on success and -1 if either allocation fails (after
+// reporting AOM_CODEC_MEM_ERROR through aom_internal_error()).
+static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd,
+                            int block_size, float noise_level,
+                            int64_t time_stamp, int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Lazily create the denoiser / noise model.
+  if (cpi->denoise_and_model == NULL) {
+    cpi->denoise_and_model = aom_denoise_and_model_alloc(
+        cm->seq_params.bit_depth, block_size, noise_level);
+    if (cpi->denoise_and_model == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating denoise and model");
+      return -1;
+    }
+  }
+
+  // Lazily create a zero-initialised film grain table.
+  if (cpi->film_grain_table == NULL) {
+    cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table));
+    if (cpi->film_grain_table == NULL) {
+      aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+                         "Error allocating grain table");
+      return -1;
+    }
+    memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table));
+  }
+
+  // Record the grain parameters only when the run succeeded and grain is to
+  // be applied.
+  if (aom_denoise_and_model_run(cpi->denoise_and_model, sd,
+                                &cm->film_grain_params) &&
+      cm->film_grain_params.apply_grain) {
+    aom_film_grain_table_append(cpi->film_grain_table, time_stamp, end_time,
+                                &cm->film_grain_params);
+  }
+  return 0;
+}
+#endif
+
+// Accepts one raw source frame and queues it in the encoder's lookahead.
+//
+//   cpi          encoder instance
+//   frame_flags  per-frame flags, handed to av1_lookahead_push()
+//   sd           source picture; its chroma subsampling and high-bit-depth
+//                flag are checked against the sequence profile
+//   time_stamp,
+//   end_time     handed to av1_lookahead_push() (and to the film grain table
+//                when CONFIG_DENOISE is on and oxcf.noise_level > 0)
+//
+// Returns 0 on success, or -1 if denoising failed, the lookahead push failed,
+// or the chroma format does not match the sequence profile.  The profile
+// checks run after the frame has been pushed, and each mismatch is also
+// reported through aom_internal_error() with AOM_CODEC_INVALID_PARAM.
+// The time spent here is added to cpi->time_receive_data.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+                          YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+                          int64_t end_time) {
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  struct aom_usec_timer timer;
+  int res = 0;
+  const int subsampling_x = sd->subsampling_x;
+  const int subsampling_y = sd->subsampling_y;
+  const int use_highbitdepth = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+  check_initial_width(cpi, use_highbitdepth, subsampling_x, subsampling_y);
+
+  aom_usec_timer_start(&timer);
+
+#if CONFIG_DENOISE
+  if (cpi->oxcf.noise_level > 0)
+    if (apply_denoise_2d(cpi, sd, cpi->oxcf.noise_block_size,
+                         cpi->oxcf.noise_level, time_stamp, end_time) < 0)
+      res = -1;
+#endif  // CONFIG_DENOISE
+
+  if (av1_lookahead_push(cpi->lookahead, sd, time_stamp, end_time,
+                         use_highbitdepth, frame_flags))
+    res = -1;
+  aom_usec_timer_mark(&timer);
+  cpi->time_receive_data += aom_usec_timer_elapsed(&timer);
+
+  // Profile 0: anything that is not monochrome must be 4:2:0.
+  if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome &&
+      (subsampling_x != 1 || subsampling_y != 1)) {
+    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                       "Non-4:2:0 color format requires profile 1 or 2");
+    res = -1;
+  }
+  // Profile 1: 4:4:4 only.
+  if ((seq_params->profile == PROFILE_1) &&
+      !(subsampling_x == 0 && subsampling_y == 0)) {
+    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                       "Profile 1 requires 4:4:4 color format");
+    res = -1;
+  }
+  // Profile 2 at 8 or 10 bits: 4:2:2 only.  The message says "<= 10" so that
+  // it agrees with the condition, which includes 10-bit input.
+  if ((seq_params->profile == PROFILE_2) &&
+      (seq_params->bit_depth <= AOM_BITS_10) &&
+      !(subsampling_x == 1 && subsampling_y == 0)) {
+    aom_internal_error(&cm->error, AOM_CODEC_INVALID_PARAM,
+                       "Profile 2 bit-depth <= 10 requires 4:2:2 color format");
+    res = -1;
+  }
+
+  return res;
+}
+
+// Returns non-zero when the current frame is a key frame, refreshes any
+// reference buffer, is coded without error-resilient mode, or carries a
+// loop-filter delta or segmentation update.
+static int frame_is_reference(const AV1_COMP *cpi) {
+  const AV1_COMMON *cm = &cpi->common;
+
+  const int refreshes_a_buffer =
+      cpi->refresh_last_frame || cpi->refresh_golden_frame ||
+      cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame ||
+      cpi->refresh_alt_ref_frame;
+  const int updates_persistent_state =
+      !cm->error_resilient_mode || cm->lf.mode_ref_delta_update ||
+      cm->seg.update_map || cm->seg.update_data;
+
+  return cm->frame_type == KEY_FRAME || refreshes_a_buffer ||
+         updates_persistent_state;
+}
+
+// Re-estimates the encoder frame rate from the time stamps of `source` and
+// records those time stamps as the last ones seen.
+//
+// The very first frame, or a frame whose duration differs from the previous
+// one by 10% or more, sets the frame rate directly from this frame's
+// duration.  Otherwise the duration is averaged into the running frame rate
+// over an interval of at most one second (10000000 time-stamp units).
+static void adjust_frame_rate(AV1_COMP *cpi,
+                              const struct lookahead_entry *source) {
+  const double ticks_per_second = 10000000.0;
+  int64_t frame_ticks;
+  int step_update = 0;
+
+  if (source->ts_start == cpi->first_time_stamp_ever) {
+    frame_ticks = source->ts_end - source->ts_start;
+    step_update = 1;
+  } else {
+    int64_t prev_ticks =
+        cpi->last_end_time_stamp_seen - cpi->last_time_stamp_seen;
+
+    frame_ticks = source->ts_end - cpi->last_end_time_stamp_seen;
+
+    // Non-zero once the duration has changed by at least a tenth.
+    if (prev_ticks != 0) {
+      step_update = (int)((frame_ticks - prev_ticks) * 10 / prev_ticks);
+    }
+  }
+
+  if (frame_ticks != 0) {
+    if (step_update != 0) {
+      av1_new_framerate(cpi, ticks_per_second / frame_ticks);
+    } else {
+      // Blend this frame into the average over the last second, or over
+      // everything seen so far if less than a second has elapsed.
+      const double interval =
+          AOMMIN((double)(source->ts_end - cpi->first_time_stamp_ever),
+                 ticks_per_second);
+      double avg_ticks = ticks_per_second / cpi->framerate;
+      avg_ticks *= (interval - avg_ticks + frame_ticks);
+      avg_ticks /= interval;
+
+      av1_new_framerate(cpi, ticks_per_second / avg_ticks);
+    }
+  }
+
+  cpi->last_time_stamp_seen = source->ts_start;
+  cpi->last_end_time_stamp_seen = source->ts_end;
+}
+
+// Returns the offset of the source frame to use for the alt-ref frame, or 0
+// when the current frame is not an alt-ref.  In pass 2 the offset comes from
+// the gf group entry (only for ARF_UPDATE); otherwise it is
+// frames_till_gf_update_due while an alt-ref is pending.
+static int get_arf_src_index(AV1_COMP *cpi) {
+  if (!is_altref_enabled(cpi)) return 0;
+
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    if (gf_group->update_type[gf_group->index] != ARF_UPDATE) return 0;
+    return gf_group->arf_src_offset[gf_group->index];
+  }
+
+  RATE_CONTROL *const rc = &cpi->rc;
+  if (rc->source_alt_ref_pending) return rc->frames_till_gf_update_due;
+  return 0;
+}
+
+// Returns the source-frame offset to use for a backward reference frame, or 0
+// when bidirectional prediction is not enabled for the current gf group
+// entry (or, in pass 2, when the entry is not a BRF_UPDATE).
+static int get_brf_src_index(AV1_COMP *cpi) {
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  // TODO(zoeliu): We need to add the check on the -bwd_ref command line setup
+  // flag.
+  if (!gf_group->bidir_pred_enabled[gf_group->index]) return 0;
+
+  if (cpi->oxcf.pass != 2) {
+    // TODO(zoeliu): To re-visit the setup for this scenario
+    return cpi->rc.bipred_group_interval - 1;
+  }
+
+  if (gf_group->update_type[gf_group->index] == BRF_UPDATE) {
+    return gf_group->brf_src_offset[gf_group->index];
+  }
+  return 0;
+}
+
+// Returns the source-frame offset for an internal alt-ref frame
+// (INTNL_ARF_UPDATE), or 0 otherwise.  A non-zero result is only possible in
+// pass 2 with alt-refs enabled and cpi->num_extra_arfs non-zero.
+static int get_arf2_src_index(AV1_COMP *cpi) {
+  if (!is_altref_enabled(cpi) || !cpi->num_extra_arfs) return 0;
+  if (cpi->oxcf.pass != 2) return 0;
+
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  if (gf_group->update_type[gf_group->index] != INTNL_ARF_UPDATE) return 0;
+  return gf_group->arf_src_offset[gf_group->index];
+}
+
+// Works out whether the frame being coded is an alt-ref overlay (its source
+// is the frame already coded as an ARF) and, if so, clears
+// cpi->alt_ref_source and decides whether LAST_FRAME is refreshed.
+static void check_src_altref(AV1_COMP *cpi,
+                             const struct lookahead_entry *source) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // If pass == 2, the parameters set here will be reset in
+  // av1_rc_get_second_pass_params()
+  if (cpi->oxcf.pass == 2) {
+    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+    const int is_intnl_overlay =
+        gf_group->update_type[gf_group->index] == INTNL_OVERLAY_UPDATE;
+    const int is_overlay =
+        gf_group->update_type[gf_group->index] == OVERLAY_UPDATE;
+    rc->is_src_frame_alt_ref = is_intnl_overlay || is_overlay;
+    rc->is_src_frame_ext_arf = is_intnl_overlay;
+  } else {
+    // NOTE(review): is_src_frame_ext_arf is not written on this path, so the
+    // check below reads whatever value was stored earlier - confirm intended.
+    rc->is_src_frame_alt_ref =
+        cpi->alt_ref_source && (source == cpi->alt_ref_source);
+  }
+
+  if (!rc->is_src_frame_alt_ref) return;
+
+  // Current frame is an ARF overlay frame.
+  cpi->alt_ref_source = NULL;
+
+  // For INTNL_OVERLAY, when show_existing_frame == 0, LAST_FRAME does need to
+  // be refreshed, i.e. LAST3 gets retired, LAST2 becomes LAST3, LAST becomes
+  // LAST2, and INTNL_OVERLAY becomes LAST.
+  // Otherwise don't refresh the last buffer for an ARF overlay frame: it will
+  // become the GF, so preserve last as an alternative prediction option.
+  cpi->refresh_last_frame =
+      rc->is_src_frame_ext_arf && !cpi->common.show_existing_frame;
+}
+
+#if CONFIG_INTERNAL_STATS
+extern double av1_get_blockiness(const unsigned char *img1, int img1_pitch,
+ const unsigned char *img2, int img2_pitch,
+ int width, int height);
+
+// Adds one frame's per-plane (y, u, v) and combined (all) scores to the
+// running totals in `s`, and lowers s->worst to `all` when `all` is smaller.
+static void adjust_image_stat(double y, double u, double v, double all,
+                              ImageStat *s) {
+  // Same selection as AOMMIN(s->worst, all): keep s->worst only while it is
+  // strictly below the new score.
+  if (!(s->worst < all)) {
+    s->worst = all;
+  }
+
+  s->stat[STAT_ALL] += all;
+  s->stat[STAT_Y] += y;
+  s->stat[STAT_U] += u;
+  s->stat[STAT_V] += v;
+}
+
+static void compute_internal_stats(AV1_COMP *cpi, int frame_bytes) {  // accumulates per-frame quality stats into cpi
+ AV1_COMMON *const cm = &cpi->common;
+ double samples = 0.0;  // stays 0 unless the PSNR branch below runs
+ uint32_t in_bit_depth = 8;
+ uint32_t bit_depth = 8;
+
+#if CONFIG_INTER_STATS_ONLY
+ if (cm->frame_type == KEY_FRAME) return; // skip key frames entirely
+#endif
+ cpi->bytes += frame_bytes;
+
+ if (cm->seq_params.use_highbitdepth) {
+ in_bit_depth = cpi->oxcf.input_bit_depth;
+ bit_depth = cm->seq_params.bit_depth;
+ }
+ if (cm->show_frame) {  // quality metrics are gathered for shown frames only
+ const YV12_BUFFER_CONFIG *orig = cpi->source;
+ const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+ double y, u, v, frame_all;
+
+ cpi->count++;
+ if (cpi->b_calculate_psnr) {
+ PSNR_STATS psnr;
+ double frame_ssim2 = 0.0, weight = 0.0;
+ aom_clear_system_state();
+ // TODO(yaowu): unify these two versions into one.
+ aom_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+
+ adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3], psnr.psnr[0],
+ &cpi->psnr);
+ cpi->total_sq_error += psnr.sse[0];
+ cpi->total_samples += psnr.samples[0];
+ samples = psnr.samples[0];  // read again by the consistency code below
+ // TODO(yaowu): unify these two versions into one.
+ if (cm->seq_params.use_highbitdepth)
+ frame_ssim2 =
+ aom_highbd_calc_ssim(orig, recon, &weight, bit_depth, in_bit_depth);
+ else
+ frame_ssim2 = aom_calc_ssim(orig, recon, &weight);
+
+ cpi->worst_ssim = AOMMIN(cpi->worst_ssim, frame_ssim2);
+ cpi->summed_quality += frame_ssim2 * weight;
+ cpi->summed_weights += weight;
+
+#if 0
+ {
+ FILE *f = fopen("q_used.stt", "a");
+ double y2 = psnr.psnr[1];
+ double u2 = psnr.psnr[2];
+ double v2 = psnr.psnr[3];
+ double frame_psnr2 = psnr.psnr[0];
+ fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
+ cm->current_video_frame, y2, u2, v2,
+ frame_psnr2, frame_ssim2);
+ fclose(f);
+ }
+#endif
+ }
+ if (cpi->b_calculate_blockiness) {
+ if (!cm->seq_params.use_highbitdepth) {  // blockiness is skipped for high bit depth
+ const double frame_blockiness =
+ av1_get_blockiness(orig->y_buffer, orig->y_stride, recon->y_buffer,
+ recon->y_stride, orig->y_width, orig->y_height);
+ cpi->worst_blockiness = AOMMAX(cpi->worst_blockiness, frame_blockiness);
+ cpi->total_blockiness += frame_blockiness;
+ }
+
+ if (cpi->b_calculate_consistency) {  // NOTE(review): only reached when b_calculate_blockiness is set - confirm intended
+ if (!cm->seq_params.use_highbitdepth) {
+ const double this_inconsistency = aom_get_ssim_metrics(
+ orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+ orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+
+ const double peak = (double)((1 << in_bit_depth) - 1);
+ const double consistency =
+ aom_sse_to_psnr(samples, peak, cpi->total_inconsistency);  // uses the total before this frame is added
+ if (consistency > 0.0)
+ cpi->worst_consistency =
+ AOMMIN(cpi->worst_consistency, consistency);
+ cpi->total_inconsistency += this_inconsistency;
+ }
+ }
+ }
+
+ frame_all =
+ aom_calc_fastssim(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+ frame_all = aom_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+ adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+/*
+ * Decides whether the current frame should be coded with integer-only motion
+ * vectors: returns 1 to force integer MVs, 0 otherwise.
+ *
+ * The luma plane of cur_picture is scanned in non-overlapping 8x8 blocks
+ * (partial blocks at the right/bottom edge are skipped) and each block is
+ * put in at most one of these classes:
+ *   C - identical to the collocated block of last_picture,
+ *   S - not identical, but av1_hash_is_horizontal_perfect() or
+ *       av1_hash_is_vertical_perfect() holds for it,
+ *   M - neither of the above, but its hash pair has an exact match in
+ *       last_hash_table (only counted when av1_use_hash_me() is true).
+ * The per-frame rates (C+S+M)/T and M/T are appended to a 32-entry circular
+ * history kept in cpi; the history averages take part in the decision.
+ */
+static int is_integer_mv(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *cur_picture,
+ const YV12_BUFFER_CONFIG *last_picture,
+ hash_table *last_hash_table) {
+ aom_clear_system_state();
+ // check use hash ME
+ int k;
+ uint32_t hash_value_1;
+ uint32_t hash_value_2;
+
+ const int block_size = 8;
+ const double threshold_current = 0.8;  // minimum (C+S+M)/T for this frame
+ const double threshold_average = 0.95;  // minimum (C+S+M)/T history average
+ const int max_history_size = 32;  // wrap point of the rate history arrays
+ int T = 0; // total block
+ int C = 0; // match with collocated block
+ int S = 0; // smooth region but not match with collocated block
+ int M = 0; // match with other block
+
+ const int pic_width = cur_picture->y_width;
+ const int pic_height = cur_picture->y_height;
+ for (int i = 0; i + block_size <= pic_height; i += block_size) {
+ for (int j = 0; j + block_size <= pic_width; j += block_size) {
+ const int x_pos = j;
+ const int y_pos = i;
+ int match = 1;
+ T++;
+
+ // check whether collocated block match with current
+ uint8_t *p_cur = cur_picture->y_buffer;
+ uint8_t *p_ref = last_picture->y_buffer;
+ int stride_cur = cur_picture->y_stride;
+ int stride_ref = last_picture->y_stride;
+ p_cur += (y_pos * stride_cur + x_pos);
+ p_ref += (y_pos * stride_ref + x_pos);
+
+ if (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *p16_cur = CONVERT_TO_SHORTPTR(p_cur);
+ uint16_t *p16_ref = CONVERT_TO_SHORTPTR(p_ref);
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p16_cur[tmpX] != p16_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p16_cur += stride_cur;
+ p16_ref += stride_ref;
+ }
+ } else {
+ for (int tmpY = 0; tmpY < block_size && match; tmpY++) {
+ for (int tmpX = 0; tmpX < block_size && match; tmpX++) {
+ if (p_cur[tmpX] != p_ref[tmpX]) {
+ match = 0;
+ }
+ }
+ p_cur += stride_cur;
+ p_ref += stride_ref;
+ }
+ }
+
+ if (match) {
+ C++;
+ continue;
+ }
+
+ if (av1_hash_is_horizontal_perfect(cur_picture, block_size, x_pos,
+ y_pos) ||
+ av1_hash_is_vertical_perfect(cur_picture, block_size, x_pos, y_pos)) {
+ S++;
+ continue;
+ }
+
+ av1_get_block_hash_value(
+ cur_picture->y_buffer + y_pos * stride_cur + x_pos, stride_cur,
+ block_size, &hash_value_1, &hash_value_2,
+ (cur_picture->flags & YV12_FLAG_HIGHBITDEPTH), &cpi->td.mb);
+ // Hashing does not work for highbitdepth currently.
+ // TODO(Roger): Make it work for highbitdepth.
+ if (av1_use_hash_me(&cpi->common)) {
+ if (av1_has_exact_match(last_hash_table, hash_value_1, hash_value_2)) {
+ M++;
+ }
+ }
+ }
+ }
+
+ assert(T > 0);  // NOTE(review): only an assert guards the divisions below; T is 0 when the picture is smaller than 8x8 - confirm callers exclude that.
+ double csm_rate = ((double)(C + S + M)) / ((double)(T));
+ double m_rate = ((double)(M)) / ((double)(T));
+
+ cpi->csm_rate_array[cpi->rate_index] = csm_rate;
+ cpi->m_rate_array[cpi->rate_index] = m_rate;
+
+ cpi->rate_index = (cpi->rate_index + 1) % max_history_size;  // circular write position
+ cpi->rate_size++;
+ cpi->rate_size = AOMMIN(cpi->rate_size, max_history_size);  // number of valid history entries
+
+ if (csm_rate < threshold_current) {  // too few explained blocks in this frame
+ return 0;
+ }
+
+ if (C == T) {  // every block equals its collocated block
+ return 1;
+ }
+
+ double csm_average = 0.0;
+ double m_average = 0.0;
+
+ for (k = 0; k < cpi->rate_size; k++) {
+ csm_average += cpi->csm_rate_array[k];
+ m_average += cpi->m_rate_array[k];
+ }
+ csm_average /= cpi->rate_size;
+ m_average /= cpi->rate_size;
+
+ if (csm_average < threshold_average) {
+ return 0;
+ }
+
+ if (M > (T - C - S) / 3) {  // hash matches cover more than a third of the blocks that are neither C nor S
+ return 1;
+ }
+
+ if (csm_rate > 0.99 && m_rate > 0.01) {
+ return 1;
+ }
+
+ if (csm_average + m_average > 1.01) {
+ return 1;
+ }
+
+ return 0;
+}
+/*
+ * Takes the next source frame from the lookahead queue and runs the encode
+ * step selected by oxcf->pass on it.
+ *
+ * frame_flags: written with FRAMEFLAGS_KEY or 0 depending on whether the
+ *              source entry carries AOM_EFLAG_FORCE_KF, then handed to
+ *              Pass2Encode() / Pass0Encode().
+ * size:        reset to 0 before encoding and handed to the encode call.
+ * dest:        output buffer handed to the encode call.
+ * time_stamp,
+ * time_end:    written with the source entry's ts_start / ts_end.
+ * flush:       forwarded to av1_lookahead_pop(); forced to 1 here when a
+ *              pending forced keyframe cancels an alt-ref.
+ * timebase:    used to convert *time_stamp into frame_presentation_time.
+ *
+ * Returns 0 on success, -1 when no source frame or no free frame buffer is
+ * available, and AOM_CODEC_ERROR when an encode call fails or the converted
+ * presentation time does not fit in 32 bits.
+ */
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ const aom_rational_t *timebase) {
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ BufferPool *const pool = cm->buffer_pool;
+ RATE_CONTROL *const rc = &cpi->rc;
+ struct aom_usec_timer cmptimer;
+ YV12_BUFFER_CONFIG *force_src_buffer = NULL;
+ struct lookahead_entry *last_source = NULL;
+ struct lookahead_entry *source = NULL;
+ int arf_src_index;
+ int brf_src_index;
+ int i;
+
+#if CONFIG_BITSTREAM_DEBUG
+ assert(cpi->oxcf.max_threads == 0 &&
+ "bitstream debug tool does not support multithreading");
+ bitstream_queue_record_write();
+ bitstream_queue_set_frame_write(cm->current_video_frame * 2 + cm->show_frame);
+#endif
+
+ cm->showable_frame = 0;
+ aom_usec_timer_start(&cmptimer);
+
+ set_high_precision_mv(cpi, ALTREF_HIGH_PRECISION_MV, 0);
+
+ // Normal defaults
+ cm->refresh_frame_context = oxcf->frame_parallel_decoding_mode
+ ? REFRESH_FRAME_CONTEXT_DISABLED
+ : REFRESH_FRAME_CONTEXT_BACKWARD;
+ if (oxcf->large_scale_tile)
+ cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_DISABLED;
+
+ // default reference buffers update config
+ av1_configure_buffer_updates_firstpass(cpi, LF_UPDATE);
+
+ // Initialize fields related to forward keyframes
+ cpi->no_show_kf = 0;
+ cm->reset_decoder_state = 0;
+
+ // Don't allow a show_existing_frame to coincide with an error resilient or
+ // S-Frame. An exception can be made in the case of a keyframe, since it
+ // does not depend on any previous frames. We must make this exception here
+ // because of the use of show_existing_frame with forward coded keyframes.
+ struct lookahead_entry *lookahead_src = NULL;
+ if (cm->current_video_frame > 0)
+ lookahead_src = av1_lookahead_peek(cpi->lookahead, 0);
+
+ int use_show_existing = 1;  // stays 1 when no lookahead entry was peeked above
+ if (lookahead_src != NULL) {
+ const int is_error_resilient =
+ cpi->oxcf.error_resilient_mode ||
+ (lookahead_src->flags & AOM_EFLAG_ERROR_RESILIENT);
+ const int is_s_frame = cpi->oxcf.s_frame_mode ||
+ (lookahead_src->flags & AOM_EFLAG_SET_S_FRAME);
+ const int is_key_frame =
+ (rc->frames_to_key == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY);
+ use_show_existing = !(is_error_resilient || is_s_frame) || is_key_frame;
+ }
+
+ if (oxcf->pass == 2 && cm->show_existing_frame && use_show_existing) {
+ // Manage the source buffer and flush out the source frame that has been
+ // coded already; Also get prepared for PSNR calculation if needed.
+ if ((source = av1_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+ *size = 0;
+ return -1;
+ }
+ av1_apply_encoding_flags(cpi, source->flags);
+ cpi->source = &source->img;
+ // TODO(zoeliu): To track down to determine whether it's needed to adjust
+ // the frame rate.
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+
+ // We need to adjust frame rate for an overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference
+ // previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ // We need to update the gf_group for show_existing overlay frame
+ if (cpi->rc.is_src_frame_alt_ref) av1_rc_get_second_pass_params(cpi);
+
+ if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+
+ if (cpi->b_calculate_psnr) generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ compute_internal_stats(cpi, (int)(*size));
+#endif // CONFIG_INTERNAL_STATS
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ cm->show_existing_frame = 0;
+ return 0;
+ }
+
+ // Should we encode an arf frame.
+ arf_src_index = get_arf_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {  // NOTE(review): exact equality, so FORCE_KF combined with any other flag is not detected here (the frame_flags assignment further down tests the bit with &) - confirm intended.
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cpi->alt_ref_source = source;
+ // When arf_src_index == rc->frames_to_key, it indicates a fwd_kf
+ if (arf_src_index == rc->frames_to_key) {
+ // Skip temporal filtering and mark as intra_only if we have a fwd_kf
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ int which_arf = gf_group->arf_update_idx[gf_group->index];
+ cpi->is_arf_filter_off[which_arf] = 1;
+ cpi->no_show_kf = 1;
+ } else {
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+ }
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+ // In second pass, the buffer updates configure will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, ARF_UPDATE);
+ }
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+ // Should we encode an arf2 frame.
+ arf_src_index = get_arf2_src_index(cpi);
+ if (arf_src_index) {
+ for (i = 0; i <= arf_src_index; ++i) {
+ struct lookahead_entry *e = av1_lookahead_peek(cpi->lookahead, i);
+ // Avoid creating an alt-ref if there's a forced keyframe pending.
+ if (e == NULL) {
+ break;
+ } else if (e->flags == AOM_EFLAG_FORCE_KF) {  // NOTE(review): same exact-equality test as in the arf loop above.
+ arf_src_index = 0;
+ flush = 1;
+ break;
+ }
+ }
+ }
+
+ if (arf_src_index) {
+ assert(arf_src_index <= rc->frames_to_key);
+
+ if ((source = av1_lookahead_peek(cpi->lookahead, arf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cpi->alt_ref_source = source;
+
+ if (oxcf->arnr_max_frames > 0) {
+ // Produce the filtered ARF frame.
+ av1_temporal_filter(cpi, arf_src_index);
+ aom_extend_frame_borders(&cpi->alt_ref_buffer, num_planes);
+ force_src_buffer = &cpi->alt_ref_buffer;
+ }
+
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+ // In second pass, the buffer updates configure will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, INTNL_ARF_UPDATE);
+ }
+ }
+ rc->source_alt_ref_pending = 0;
+ }
+
+ rc->is_bwd_ref_frame = 0;
+ brf_src_index = get_brf_src_index(cpi);
+ if (brf_src_index) {
+ assert(brf_src_index <= rc->frames_to_key);
+ if ((source = av1_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+ cm->showable_frame = 1;
+ cm->show_frame = 0;
+ cm->intra_only = 0;
+
+ if (oxcf->pass < 2) {
+ // In second pass, the buffer updates configure will be set
+ // in the function av1_rc_get_second_pass_params
+ av1_configure_buffer_updates_firstpass(cpi, BIPRED_UPDATE);
+ }
+ }
+ }
+
+ if (!source) {  // none of the arf / arf2 / brf branches picked a source
+ // Get last frame source.
+ if (cm->current_video_frame > 0) {
+ if ((last_source = av1_lookahead_peek(cpi->lookahead, -1)) == NULL)
+ return -1;
+ }
+ if (cm->current_video_frame > 0) assert(last_source != NULL);
+ // Read in the source frame.
+ source = av1_lookahead_pop(cpi->lookahead, flush);
+
+ if (source != NULL) {
+ cm->show_frame = 1;
+ cm->intra_only = 0;
+
+ // Check to see if the frame should be encoded as an arf overlay.
+ check_src_altref(cpi, source);
+ }
+ }
+ if (source) {
+ cpi->unscaled_source = cpi->source =
+ force_src_buffer ? force_src_buffer : &source->img;  // the filtered ARF buffer replaces the raw image when one was produced
+ cpi->unscaled_last_source = last_source != NULL ? &last_source->img : NULL;
+
+ *time_stamp = source->ts_start;
+ *time_end = source->ts_end;
+ av1_apply_encoding_flags(cpi, source->flags);
+ *frame_flags = (source->flags & AOM_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0;
+
+ } else {
+ *size = 0;
+ if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) {
+ av1_end_first_pass(cpi); /* get last stats packet */
+ cpi->twopass.first_pass_done = 1;
+ }
+ return -1;
+ }
+
+ if (source->ts_start < cpi->first_time_stamp_ever) {
+ cpi->first_time_stamp_ever = source->ts_start;
+ cpi->last_end_time_stamp_seen = source->ts_start;
+ }
+
+ // Clear down mmx registers
+ aom_clear_system_state();
+
+ // adjust frame rates based on timestamps given
+ if (cm->show_frame) adjust_frame_rate(cpi, source);
+
+ // Find a free buffer for the new frame, releasing the reference previously
+ // held.
+ if (cm->new_fb_idx != INVALID_IDX) {
+ --pool->frame_bufs[cm->new_fb_idx].ref_count;
+ }
+ cm->new_fb_idx = get_free_fb(cm);
+
+ if (cm->new_fb_idx == INVALID_IDX) return -1;
+
+ // Retain the RF_LEVEL for the current newly coded frame.
+ cpi->frame_rf_level[cm->new_fb_idx] =
+ cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+
+ cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
+ cm->cur_frame->buf.buf_8bit_valid = 0;
+
+ if (cpi->film_grain_table) {
+ cm->seq_params.film_grain_params_present = aom_film_grain_table_lookup(
+ cpi->film_grain_table, *time_stamp, *time_end, 0 /* =erase */,
+ &cm->film_grain_params);
+ }
+ cm->cur_frame->film_grain_params_present =
+ cm->seq_params.film_grain_params_present;
+
+ // only one operating point supported now
+ const int64_t pts64 = ticks_to_timebase_units(timebase, *time_stamp);
+ if (pts64 < 0 || pts64 > UINT32_MAX) return AOM_CODEC_ERROR;  // must fit the uint32_t field assigned below
+ cpi->common.frame_presentation_time = (uint32_t)pts64;
+
+ // Start with a 0 size frame.
+ *size = 0;
+
+ cpi->frame_flags = *frame_flags;
+
+ if (oxcf->pass == 2) {
+ av1_rc_get_second_pass_params(cpi);
+ } else if (oxcf->pass == 1) {
+ setup_frame_size(cpi);
+ }
+
+ if (cpi->oxcf.pass != 0 || frame_is_intra_only(cm) == 1) {
+ for (i = 0; i < REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
+ }
+
+ cm->using_qmatrix = cpi->oxcf.using_qm;
+ cm->min_qmlevel = cpi->oxcf.qm_minlevel;
+ cm->max_qmlevel = cpi->oxcf.qm_maxlevel;
+
+ if (cm->seq_params.frame_id_numbers_present_flag) {
+ if (*time_stamp == 0) {
+ cpi->common.current_frame_id = -1;
+ }
+ }
+
+ cpi->cur_poc++;
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools &&
+ !frame_is_intra_only(cm)) {
+ if (cpi->common.seq_params.force_integer_mv == 2) {  // 2: decide per frame via is_integer_mv()
+ struct lookahead_entry *previous_entry =
+ av1_lookahead_peek(cpi->lookahead, cpi->previous_index);
+ if (!previous_entry)
+ cpi->common.cur_frame_force_integer_mv = 0;
+ else
+ cpi->common.cur_frame_force_integer_mv = is_integer_mv(
+ cpi, cpi->source, &previous_entry->img, cpi->previous_hash_table);
+ } else {
+ cpi->common.cur_frame_force_integer_mv =
+ cpi->common.seq_params.force_integer_mv;
+ }
+ } else {
+ cpi->common.cur_frame_force_integer_mv = 0;
+ }
+
+ if (oxcf->pass == 1) {
+ cpi->td.mb.e_mbd.lossless[0] = is_lossless_requested(oxcf);
+ av1_first_pass(cpi, source);
+ } else if (oxcf->pass == 2) {
+ if (Pass2Encode(cpi, size, dest, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ } else {
+ // One pass encode
+ if (Pass0Encode(cpi, size, dest, 0, frame_flags) != AOM_CODEC_OK)
+ return AOM_CODEC_ERROR;
+ }
+ if (oxcf->pass != 1 && cpi->common.allow_screen_content_tools) {
+ cpi->previous_hash_table = &cm->cur_frame->hash_table;
+ {
+ int l;  // offset of `source` within cpi->lookahead->buf; remembered in previous_index for the next call
+ for (l = -MAX_PRE_FRAMES; l < cpi->lookahead->max_sz; l++) {
+ if ((cpi->lookahead->buf + l) == source) {
+ cpi->previous_index = l;
+ break;
+ }
+ }
+
+ if (l == cpi->lookahead->max_sz) {  // loop ran to the end without finding `source`
+ aom_internal_error(&cm->error, AOM_CODEC_MEM_ERROR,
+ "Failed to find last frame original buffer");
+ }
+ }
+ }
+
+ if (!cm->large_scale_tile) {
+ cm->frame_contexts[cm->new_fb_idx] = *cm->fc;
+ }
+
+#define EXT_TILE_DEBUG 0
+#if EXT_TILE_DEBUG
+ if (cm->large_scale_tile && oxcf->pass == 2) {
+ char fn[20] = "./fc";
+ fn[4] = cm->current_video_frame / 100 + '0';
+ fn[5] = (cm->current_video_frame % 100) / 10 + '0';
+ fn[6] = (cm->current_video_frame % 10) + '0';
+ fn[7] = '\0';
+ av1_print_frame_contexts(cm->fc, fn);
+ }
+#endif // EXT_TILE_DEBUG
+#undef EXT_TILE_DEBUG
+
+ cm->showable_frame = !cm->show_frame && cm->showable_frame;
+
+ // No frame encoded, or frame was dropped, release scaled references.
+ if ((*size == 0) && (frame_is_intra_only(cm) == 0)) {
+ release_scaled_references(cpi);
+ }
+
+ if (*size > 0) {
+ cpi->droppable = !frame_is_reference(cpi);
+ }
+
+ aom_usec_timer_mark(&cmptimer);
+ cpi->time_compress_data += aom_usec_timer_elapsed(&cmptimer);
+
+ if (cpi->b_calculate_psnr && oxcf->pass != 1 && cm->show_frame)
+ generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+ if (oxcf->pass != 1) {
+ compute_internal_stats(cpi, (int)(*size));
+ }
+#endif // CONFIG_INTERNAL_STATS
+
+ aom_clear_system_state();
+
+ return 0;
+}
+
+// Copies the descriptor of the frame selected for display into |dest| and
+// overrides its luma/chroma dimensions with the current frame size.
+// Returns 0 on success, -1 when the frame is not shown or no frame_to_show
+// is set.
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  // Hidden frames have no preview; bail out before touching anything.
+  if (!cm->show_frame) return -1;
+
+  int status = -1;
+  if (cm->frame_to_show) {
+    *dest = *cm->frame_to_show;
+    dest->y_width = cm->width;
+    dest->y_height = cm->height;
+    dest->uv_width = cm->width >> cm->seq_params.subsampling_x;
+    dest->uv_height = cm->height >> cm->seq_params.subsampling_y;
+    status = 0;
+  }
+
+  aom_clear_system_state();
+  return status;
+}
+
+// Copies the buffer descriptor of the most recently shown frame into |frame|.
+// Returns 0 on success, -1 when last_show_frame_buf_idx is INVALID_IDX.
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+  if (cpi->last_show_frame_buf_idx != INVALID_IDX) {
+    *frame =
+        cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+    return 0;
+  }
+  return -1;
+}
+
+// Returns 1 when |a| and |b| agree on luma/chroma dimensions, strides, border
+// size and the high-bitdepth flag; 0 otherwise.
+static int equal_dimensions_and_border(const YV12_BUFFER_CONFIG *a,
+                                       const YV12_BUFFER_CONFIG *b) {
+  if (a->y_height != b->y_height || a->y_width != b->y_width) return 0;
+  if (a->uv_height != b->uv_height || a->uv_width != b->uv_width) return 0;
+  if (a->y_stride != b->y_stride || a->uv_stride != b->uv_stride) return 0;
+  if (a->border != b->border) return 0;
+
+  return (a->flags & YV12_FLAG_HIGHBITDEPTH) ==
+         (b->flags & YV12_FLAG_HIGHBITDEPTH);
+}
+
+// Copies |new_frame| into |sd| when both buffers have the same layout;
+// otherwise reports "Incorrect buffer dimensions" through cm->error.
+// Returns cm->error.error_code in either case.
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+                                       YV12_BUFFER_CONFIG *new_frame,
+                                       YV12_BUFFER_CONFIG *sd) {
+  const int num_planes = av1_num_planes(cm);
+
+  if (equal_dimensions_and_border(new_frame, sd)) {
+    aom_yv12_copy_frame(new_frame, sd, num_planes);
+  } else {
+    aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+                       "Incorrect buffer dimensions");
+  }
+
+  return cm->error.error_code;
+}
+
+// Records a pending resize computed from the configured frame size and the
+// two scaling modes. Returns -1 (leaving cpi untouched) when either mode is
+// above ONETWO, 0 otherwise.
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+                          AOM_SCALING vert_mode) {
+  if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+
+  // Each mode is turned into a ratio num/den by Scale2Ratio().
+  int h_num = 0, h_den = 0;
+  int v_num = 0, v_den = 0;
+  Scale2Ratio(horiz_mode, &h_num, &h_den);
+  Scale2Ratio(vert_mode, &v_num, &v_den);
+
+  // Ceiling division: always go to the next whole number.
+  const int scaled_width = (h_den - 1 + cpi->oxcf.width * h_num) / h_den;
+  const int scaled_height = (v_den - 1 + cpi->oxcf.height * v_num) / v_den;
+  cpi->resize_pending_width = scaled_width;
+  cpi->resize_pending_height = scaled_height;
+
+  return 0;
+}
+
+// Returns the base quantizer index stored in the encoder's common state.
+int av1_get_quantizer(AV1_COMP *cpi) {
+  const AV1_COMMON *const cm = &cpi->common;
+  return cm->base_qindex;
+}
+
+// Rewrites, in place, the sequence of OBUs stored in |buffer| from
+//   [obu_header][leb128 payload_size][payload]
+// to
+//   [leb128 obu_size][obu_header with has_size_field cleared][payload]
+// where obu_size = header size + payload size.
+//
+// buffer:     holds *frame_size bytes of OBUs on entry.
+// frame_size: in: number of input bytes; out: number of bytes after the
+//             conversion (only written on success).
+//
+// Returns AOM_CODEC_OK on success and AOM_CODEC_ERROR when a size field cannot
+// be decoded/encoded or an OBU claims more bytes than remain in the input.
+//
+// NOTE(review): the capacity of |buffer| is not known here. When the new size
+// field is longer than the old one the data grows by the difference, so the
+// caller has to provide that headroom - confirm at the call sites.
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *frame_size) {
+  size_t output_size = 0;
+  size_t remaining_size = *frame_size;  // input bytes not yet converted
+  uint8_t *buff_ptr = buffer;
+
+  // go through each OBU
+  while (remaining_size > 0) {
+    uint8_t saved_obu_header[2];
+    uint64_t obu_payload_size;
+    size_t length_of_payload_size;
+    // Bit 2 of the first header byte selects the 2-byte header form.
+    const uint32_t obu_header_size = (buff_ptr[0] >> 2) & 0x1 ? 2 : 1;
+
+    // A truncated header would make the copy below read past the input.
+    if (remaining_size < obu_header_size) return AOM_CODEC_ERROR;
+
+    // save the obu header (1 or 2 bytes)
+    memcpy(saved_obu_header, buff_ptr, obu_header_size);
+    // clear the obu_has_size_field
+    saved_obu_header[0] = (uint8_t)(saved_obu_header[0] & ~0x2);
+
+    // get the payload_size and length of payload_size; only the bytes after
+    // the header are available to the decoder
+    if (aom_uleb_decode(buff_ptr + obu_header_size,
+                        remaining_size - obu_header_size, &obu_payload_size,
+                        &length_of_payload_size) != 0) {
+      return AOM_CODEC_ERROR;
+    }
+    // bytes read for current obu (header + old size field so far)
+    size_t obu_bytes_read = obu_header_size + length_of_payload_size;
+
+    // Reject an OBU that claims more bytes than are left; otherwise the
+    // unsigned subtractions below would wrap around.
+    if (obu_bytes_read > remaining_size ||
+        obu_payload_size > remaining_size - obu_bytes_read) {
+      return AOM_CODEC_ERROR;
+    }
+
+    // calculate the length of size of the obu header plus payload
+    const uint64_t obu_size = obu_header_size + obu_payload_size;
+    const size_t length_of_obu_size = aom_uleb_size_in_bytes(obu_size);
+
+    // move the rest of data to new location
+    memmove(buff_ptr + length_of_obu_size + obu_header_size,
+            buff_ptr + obu_bytes_read, remaining_size - obu_bytes_read);
+    obu_bytes_read += (size_t)obu_payload_size;
+
+    // write the new obu size into exactly the space reserved for it
+    size_t coded_obu_size;
+    if (aom_uleb_encode(obu_size, length_of_obu_size, buff_ptr,
+                        &coded_obu_size) != 0 ||
+        coded_obu_size != length_of_obu_size) {
+      return AOM_CODEC_ERROR;
+    }
+
+    // write the saved (modified) obu_header following obu size
+    memmove(buff_ptr + length_of_obu_size, saved_obu_header, obu_header_size);
+
+    remaining_size -= obu_bytes_read;
+    buff_ptr += length_of_obu_size + (size_t)obu_size;
+    output_size += length_of_obu_size + (size_t)obu_size;
+  }
+
+  *frame_size = output_size;
+  return AOM_CODEC_OK;
+}
+
+// Translates the per-frame AOM_EFLAG_* bits in |flags| into encoder state:
+// the set of usable references, the set of references to refresh, the
+// ext_use_* toggles and, optionally, disabling the entropy update.
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags) {
+  // TODO(yunqingwang): For what references to use, external encoding flags
+  // should be consistent with internal reference frame selection. Need to
+  // ensure that there is not conflict between the two. In AV1 encoder, the
+  // priority rank for 7 reference frames are: LAST, ALTREF, LAST2, LAST3,
+  // GOLDEN, BWDREF, ALTREF2. If only one reference frame is used, it must be
+  // LAST.
+  cpi->ext_ref_frame_flags = AOM_REFFRAME_ALL;
+  if (flags & AOM_EFLAG_NO_REF_LAST) {
+    // Excluding LAST clears the whole external reference mask.
+    cpi->ext_ref_frame_flags = 0;
+  } else if (flags & (AOM_EFLAG_NO_REF_LAST2 | AOM_EFLAG_NO_REF_LAST3 |
+                      AOM_EFLAG_NO_REF_GF | AOM_EFLAG_NO_REF_ARF |
+                      AOM_EFLAG_NO_REF_BWD | AOM_EFLAG_NO_REF_ARF2)) {
+    int usable_refs = AOM_REFFRAME_ALL;
+
+    if (flags & AOM_EFLAG_NO_REF_LAST2) usable_refs ^= AOM_LAST2_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_LAST3) usable_refs ^= AOM_LAST3_FLAG;
+    if (flags & AOM_EFLAG_NO_REF_GF) usable_refs ^= AOM_GOLD_FLAG;
+
+    if (flags & AOM_EFLAG_NO_REF_ARF) {
+      // Dropping ALTREF also drops BWDREF and ALTREF2.
+      usable_refs ^= AOM_ALT_FLAG ^ AOM_BWD_FLAG ^ AOM_ALT2_FLAG;
+    } else {
+      if (flags & AOM_EFLAG_NO_REF_BWD) usable_refs ^= AOM_BWD_FLAG;
+      if (flags & AOM_EFLAG_NO_REF_ARF2) usable_refs ^= AOM_ALT2_FLAG;
+    }
+
+    av1_use_as_reference(cpi, usable_refs);
+  }
+
+  if (flags &
+      (AOM_EFLAG_NO_UPD_LAST | AOM_EFLAG_NO_UPD_GF | AOM_EFLAG_NO_UPD_ARF)) {
+    int refreshed_refs = AOM_REFFRAME_ALL;
+
+    // Refreshing LAST/LAST2/LAST3 is handled by 1 common flag.
+    if (flags & AOM_EFLAG_NO_UPD_LAST) refreshed_refs ^= AOM_LAST_FLAG;
+    if (flags & AOM_EFLAG_NO_UPD_GF) refreshed_refs ^= AOM_GOLD_FLAG;
+    if (flags & AOM_EFLAG_NO_UPD_ARF) {
+      // The ARF "no update" flag covers ALTREF, BWDREF and ALTREF2 together.
+      refreshed_refs ^= AOM_ALT_FLAG ^ AOM_BWD_FLAG ^ AOM_ALT2_FLAG;
+    }
+
+    av1_update_reference(cpi, refreshed_refs);
+  }
+
+  // Per-frame toggles: the configured value combined with the frame's flag.
+  const int ref_mvs_not_vetoed = (flags & AOM_EFLAG_NO_REF_FRAME_MVS) == 0;
+  const int wants_error_resilient = (flags & AOM_EFLAG_ERROR_RESILIENT) != 0;
+  const int wants_s_frame = (flags & AOM_EFLAG_SET_S_FRAME) != 0;
+
+  cpi->ext_use_ref_frame_mvs =
+      cpi->oxcf.allow_ref_frame_mvs & ref_mvs_not_vetoed;
+  cpi->ext_use_error_resilient =
+      cpi->oxcf.error_resilient_mode | wants_error_resilient;
+  cpi->ext_use_s_frame = cpi->oxcf.s_frame_mode | wants_s_frame;
+  cpi->ext_use_primary_ref_none = (flags & AOM_EFLAG_SET_PRIMARY_REF_NONE) != 0;
+
+  if (flags & AOM_EFLAG_NO_UPD_ENTROPY) {
+    av1_update_entropy(cpi, 0);
+  }
+}
+
+// Converts |n| timebase units into ticks: n * TICKS_PER_SEC * num / den,
+// evaluated left to right with a truncating division.
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n) {
+  const int64_t scaled = n * TICKS_PER_SEC * timebase->num;
+  return scaled / timebase->den;
+}
+
+// Converts |n| ticks back into timebase units. A bias of just under half a
+// unit (TICKS_PER_SEC * num / 2 - 1) is added before the truncating divisions.
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n) {
+  const int64_t bias = TICKS_PER_SEC * timebase->num / 2 - 1;
+  const int64_t biased = n * timebase->den + bias;
+  return biased / timebase->num / TICKS_PER_SEC;
+}
+
+// Builds a stand-alone sequence header OBU laid out as
+//   [1-byte OBU header][leb128 size field][sequence header payload]
+// and returns it in a newly malloc()ed aom_fixed_buf_t whose buf is also
+// malloc()ed. Returns NULL when |cpi| is NULL, the sequence header is empty or
+// does not fit the 512-byte scratch buffer, a header/size write fails, or an
+// allocation fails.
+// NOTE(review): presumably the caller releases both buf and the struct with
+// free() - confirm against the callers.
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi) {
+  if (cpi == NULL) return NULL;
+
+  // The payload is first written at offset 0, then shifted to make room for
+  // the OBU header and the size field in front of it.
+  uint8_t scratch[512] = { 0 };
+  const uint32_t sequence_header_size =
+      write_sequence_header_obu(cpi, scratch);
+  assert(sequence_header_size <= sizeof(scratch));
+  if (sequence_header_size == 0) return NULL;
+
+  const size_t obu_header_size = 1;
+  const size_t size_field_size = aom_uleb_size_in_bytes(sequence_header_size);
+  const size_t payload_offset = obu_header_size + size_field_size;
+  const size_t total_size = payload_offset + sequence_header_size;
+
+  if (total_size > sizeof(scratch)) return NULL;
+  memmove(scratch + payload_offset, scratch, sequence_header_size);
+
+  if (write_obu_header(OBU_SEQUENCE_HEADER, 0, scratch) != obu_header_size) {
+    return NULL;
+  }
+
+  size_t coded_size_field_size = 0;
+  if (aom_uleb_encode(sequence_header_size, size_field_size,
+                      scratch + obu_header_size,
+                      &coded_size_field_size) != 0) {
+    return NULL;
+  }
+  assert(coded_size_field_size == size_field_size);
+
+  aom_fixed_buf_t *const result = malloc(sizeof(*result));
+  if (result == NULL) return NULL;
+
+  result->buf = malloc(total_size);
+  if (result->buf == NULL) {
+    free(result);
+    return NULL;
+  }
+
+  memcpy(result->buf, scratch, total_size);
+  result->sz = total_size;
+  return result;
+}
diff --git a/third_party/aom/av1/encoder/encoder.h b/third_party/aom/av1/encoder/encoder.h
new file mode 100644
index 0000000000..ee7fc46379
--- /dev/null
+++ b/third_party/aom/av1/encoder/encoder.h
@@ -0,0 +1,985 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODER_H_
+#define AOM_AV1_ENCODER_ENCODER_H_
+
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomcx.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/thread_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
+#include "av1/common/timing.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/mbgraph.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/tokenize.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_dsp/ssim.h"
+#endif
+#include "aom_dsp/variance.h"
+#if CONFIG_DENOISE
+#include "aom_dsp/noise_model.h"
+#endif
+#include "aom/internal/aom_codec_internal.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Bundles the motion-vector cost tables (one cost per MV joint, plus two
+ * MV_VALS-sized tables in a regular and an "_hp" variant) with a complete
+ * FRAME_CONTEXT.
+ * NOTE(review): presumably a snapshot used to save/restore coding state; the
+ * code that fills and reads it is not in this header - confirm.
+ */
+typedef struct {
+ int nmv_vec_cost[MV_JOINTS];
+ int nmv_costs[2][MV_VALS];  // first index: presumably the MV component - confirm
+ int nmv_costs_hp[2][MV_VALS];
+
+ FRAME_CONTEXT fc;
+} CODING_CONTEXT;
+/*
+ * Frame categories, numbered from 0. The last enumerator,
+ * FRAME_CONTEXT_INDEXES, is not a category: it equals the number of
+ * categories listed before it.
+ */
+typedef enum {
+ // regular inter frame
+ REGULAR_FRAME = 0,
+ // alternate reference frame
+ ARF_FRAME = 1,
+ // overlay frame
+ OVERLAY_FRAME = 2,
+ // golden frame
+ GLD_FRAME = 3,
+ // backward reference frame
+ BRF_FRAME = 4,
+ // extra alternate reference frame
+ EXT_ARF_FRAME = 5,
+ FRAME_CONTEXT_INDEXES
+} FRAME_CONTEXT_INDEX;
+/*
+ * Scaling mode selector taken by av1_set_internal_size(), which rejects any
+ * value above ONETWO and converts the mode to a ratio with Scale2Ratio().
+ * NOTE(review): the names suggest the ratios 4/5, 3/5 and 1/2 - confirm
+ * against Scale2Ratio().
+ */
+typedef enum {
+ NORMAL = 0,
+ FOURFIVE = 1,
+ THREEFIVE = 2,
+ ONETWO = 3
+} AOM_SCALING;
+/*
+ * Encoder operating mode (type of AV1EncoderConfig::mode). GOOD is the only
+ * value defined.
+ */
+typedef enum {
+ // Good Quality Fast Encoding. The encoder balances quality with the amount of
+ // time it takes to encode the output. Speed setting controls how fast.
+ GOOD
+} MODE;
+/*
+ * Frame-type bit flags; each enumerator occupies its own bit so values can be
+ * tested with &. FRAMEFLAGS_KEY is the value written to *frame_flags for a
+ * source frame that carries AOM_EFLAG_FORCE_KF.
+ */
+typedef enum {
+ FRAMEFLAGS_KEY = 1 << 0,
+ FRAMEFLAGS_GOLDEN = 1 << 1,
+ FRAMEFLAGS_BWDREF = 1 << 2,
+ // TODO(zoeliu): To determine whether a frame flag is needed for ALTREF2_FRAME
+ FRAMEFLAGS_ALTREF = 1 << 3,
+} FRAMETYPE_FLAGS;
+/*
+ * Adaptive quantization mode (type of AV1EncoderConfig::aq_mode).
+ * AQ_MODE_COUNT is the number of modes, not a mode itself.
+ */
+typedef enum {
+ NO_AQ = 0,
+ VARIANCE_AQ = 1,
+ COMPLEXITY_AQ = 2,
+ CYCLIC_REFRESH_AQ = 3,
+ AQ_MODE_COUNT // This should always be the last member of the enum
+} AQ_MODE;
+typedef enum {  // Values for AV1EncoderConfig::deltaq_mode.
+ NO_DELTA_Q = 0,
+ DELTA_Q_ONLY = 1,
+ DELTA_Q_LF = 2,  // NOTE(review): presumably delta-q together with delta loop-filter levels - confirm
+ DELTAQ_MODE_COUNT // This should always be the last member of the enum
+} DELTAQ_MODE;
+/*
+ * Frame resize policy (type of AV1EncoderConfig::resize_mode).
+ * RESIZE_MODES is the number of policies, not a policy itself.
+ */
+typedef enum {
+ RESIZE_NONE = 0, // No frame resizing allowed.
+ RESIZE_FIXED = 1, // All frames are coded at the specified scale.
+ RESIZE_RANDOM = 2, // All frames are coded at a random scale.
+ RESIZE_MODES
+} RESIZE_MODE;
+/*
+ * Frame super-resolution policy (type of AV1EncoderConfig::superres_mode).
+ * SUPERRES_MODES is the number of policies, not a policy itself.
+ */
+typedef enum {
+ SUPERRES_NONE = 0, // No frame superres allowed
+ SUPERRES_FIXED = 1, // All frames are coded at the specified scale,
+ // and super-resolved.
+ SUPERRES_RANDOM = 2, // All frames are coded at a random scale,
+ // and super-resolved.
+ SUPERRES_QTHRESH = 3, // Superres scale for a frame is determined based on
+ // q_index
+ SUPERRES_MODES
+} SUPERRES_MODE;
+
+typedef struct AV1EncoderConfig {
+ BITSTREAM_PROFILE profile;
+ aom_bit_depth_t bit_depth; // Codec bit-depth.
+ int width; // width of data passed to the compressor
+ int height; // height of data passed to the compressor
+ int forced_max_frame_width; // forced maximum width of frame (if != 0)
+ int forced_max_frame_height; // forced maximum height of frame (if != 0)
+ unsigned int input_bit_depth; // Input bit depth.
+ double init_framerate; // set to passed in framerate
+ int64_t target_bandwidth; // bandwidth to be used in bits per second
+
+ int noise_sensitivity; // pre processing blur: recommendation 0
+ int sharpness; // sharpening output: recommendation 0:
+ int speed;
+ // maximum allowed bitrate for any intra frame in % of bitrate target.
+ unsigned int rc_max_intra_bitrate_pct;
+ // maximum allowed bitrate for any inter frame in % of bitrate target.
+ unsigned int rc_max_inter_bitrate_pct;
+ // percent of rate boost for golden frame in CBR mode.
+ unsigned int gf_cbr_boost_pct;
+
+ MODE mode;
+ int pass;
+
+ // Key Framing Operations
+ int auto_key; // autodetect cut scenes and set the keyframes
+ int key_freq; // maximum distance to key frame.
+ int sframe_dist;
+ int sframe_mode;
+ int sframe_enabled;
+ int lag_in_frames; // how many frames lag before we start encoding
+ int fwd_kf_enabled;
+
+ // ----------------------------------------------------------------
+ // DATARATE CONTROL OPTIONS
+
+ // vbr, cbr, constrained quality or constant quality
+ enum aom_rc_mode rc_mode;
+
+ // buffer targeting aggressiveness
+ int under_shoot_pct;
+ int over_shoot_pct;
+
+ // buffering parameters
+ int64_t starting_buffer_level_ms;
+ int64_t optimal_buffer_level_ms;
+ int64_t maximum_buffer_size_ms;
+
+ // Frame drop threshold.
+ int drop_frames_water_mark;
+
+ // controlling quality
+ int fixed_q;
+ int worst_allowed_q;
+ int best_allowed_q;
+ int cq_level;
+ AQ_MODE aq_mode; // Adaptive Quantization mode
+ DELTAQ_MODE deltaq_mode;
+ int enable_cdef;
+ int enable_restoration;
+ int disable_trellis_quant;
+ int using_qm;
+ int qm_y;
+ int qm_u;
+ int qm_v;
+ int qm_minlevel;
+ int qm_maxlevel;
+#if CONFIG_DIST_8X8
+ int using_dist_8x8;
+#endif
+ unsigned int num_tile_groups;
+ unsigned int mtu;
+
+ // Internal frame size scaling.
+ RESIZE_MODE resize_mode;
+ uint8_t resize_scale_denominator;
+ uint8_t resize_kf_scale_denominator;
+
+ // Frame Super-Resolution size scaling.
+ SUPERRES_MODE superres_mode;
+ uint8_t superres_scale_denominator;
+ uint8_t superres_kf_scale_denominator;
+ int superres_qthresh;
+ int superres_kf_qthresh;
+
+ // Enable feature to reduce the frame quantization every x frames.
+ int frame_periodic_boost;
+
+ // two pass datarate control
+ int two_pass_vbrbias; // two pass datarate control tweaks
+ int two_pass_vbrmin_section;
+ int two_pass_vbrmax_section;
+ // END DATARATE CONTROL OPTIONS
+ // ----------------------------------------------------------------
+
+ int enable_auto_arf;
+ int enable_auto_brf; // (b)ackward (r)ef (f)rame
+
+ /* Bitfield defining the error resiliency features to enable.
+ * Can provide decodable frames after losses in previous
+ * frames and decodable partitions after losses in the same frame.
+ */
+ unsigned int error_resilient_mode;
+
+ unsigned int s_frame_mode;
+
+ /* Bitfield defining the parallel decoding mode where the
+ * decoding in successive frames may be conducted in parallel
+ * just by decoding the frame headers.
+ */
+ unsigned int frame_parallel_decoding_mode;
+
+ unsigned int limit;
+
+ int arnr_max_frames;
+ int arnr_strength;
+
+ int min_gf_interval;
+ int max_gf_interval;
+
+ int row_mt;
+ int tile_columns;
+ int tile_rows;
+ int tile_width_count;
+ int tile_height_count;
+ int tile_widths[MAX_TILE_COLS];
+ int tile_heights[MAX_TILE_ROWS];
+
+ int max_threads;
+
+ aom_fixed_buf_t two_pass_stats_in;
+ struct aom_codec_pkt_list *output_pkt_list;
+
+#if CONFIG_FP_MB_STATS
+ aom_fixed_buf_t firstpass_mb_stats_in;
+#endif
+
+ aom_tune_metric tuning;
+ aom_tune_content content;
+ int use_highbitdepth;
+ aom_color_primaries_t color_primaries;
+ aom_transfer_characteristics_t transfer_characteristics;
+ aom_matrix_coefficients_t matrix_coefficients;
+ aom_chroma_sample_position_t chroma_sample_position;
+ int color_range;
+ int render_width;
+ int render_height;
+ aom_timing_info_type_t timing_info_type;
+ int timing_info_present;
+ aom_timing_info_t timing_info;
+ int decoder_model_info_present_flag;
+ int display_model_info_present_flag;
+ int buffer_removal_time_present;
+ aom_dec_model_info_t buffer_model;
+ aom_dec_model_op_parameters_t op_params[MAX_NUM_OPERATING_POINTS + 1];
+ aom_op_timing_info_t op_frame_timing[MAX_NUM_OPERATING_POINTS + 1];
+ int film_grain_test_vector;
+ const char *film_grain_table_filename;
+
+ uint8_t cdf_update_mode;
+ aom_superblock_size_t superblock_size;
+ unsigned int large_scale_tile;
+ unsigned int single_tile_decoding;
+ int monochrome;
+ unsigned int full_still_picture_hdr;
+ int enable_dual_filter;
+ unsigned int motion_vector_unit_test;
+ const cfg_options_t *cfg;
+ int enable_order_hint;
+ int enable_jnt_comp;
+ int enable_ref_frame_mvs;
+ unsigned int allow_ref_frame_mvs;
+ int enable_warped_motion;
+ int allow_warped_motion;
+ int enable_superres;
+ unsigned int save_as_annexb;
+
+#if CONFIG_DENOISE
+ float noise_level;
+ int noise_block_size;
+#endif
+
+ unsigned int chroma_subsampling_x;
+ unsigned int chroma_subsampling_y;
+} AV1EncoderConfig;
+
+static INLINE int is_lossless_requested(const AV1EncoderConfig *cfg) {
+  return !(cfg->best_allowed_q || cfg->worst_allowed_q);  // 1 iff both are 0
+}
+
+typedef struct FRAME_COUNTS {  // Event counters incremented by the encoder.
+// Note: This structure should only contain 'unsigned int' fields, or
+// aggregates built solely from 'unsigned int' fields/elements
+#if CONFIG_ENTROPY_STATS  // everything up to the #endif is stats-build only
+ unsigned int kf_y_mode[KF_MODE_CONTEXTS][KF_MODE_CONTEXTS][INTRA_MODES];
+ unsigned int angle_delta[DIRECTIONAL_MODES][2 * MAX_ANGLE_DELTA + 1];
+ unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
+ unsigned int uv_mode[CFL_ALLOWED_TYPES][INTRA_MODES][UV_INTRA_MODES];
+ unsigned int cfl_sign[CFL_JOINT_SIGNS];
+ unsigned int cfl_alpha[CFL_ALPHA_CONTEXTS][CFL_ALPHABET_SIZE];
+ unsigned int palette_y_mode[PALATTE_BSIZE_CTXS][PALETTE_Y_MODE_CONTEXTS][2];
+ unsigned int palette_uv_mode[PALETTE_UV_MODE_CONTEXTS][2];
+ unsigned int palette_y_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_uv_size[PALATTE_BSIZE_CTXS][PALETTE_SIZES];
+ unsigned int palette_y_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int palette_uv_color_index[PALETTE_SIZES]
+ [PALETTE_COLOR_INDEX_CONTEXTS]
+ [PALETTE_COLORS];
+ unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+ unsigned int txb_skip[TOKEN_CDF_Q_CTXS][TX_SIZES][TXB_SKIP_CONTEXTS][2];
+ unsigned int eob_extra[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [EOB_COEF_CONTEXTS][2];  // last index is the bit value
+ unsigned int dc_sign[PLANE_TYPES][DC_SIGN_CONTEXTS][2];
+ unsigned int coeff_lps[TX_SIZES][PLANE_TYPES][BR_CDF_SIZE - 1][LEVEL_CONTEXTS]
+ [2];
+ unsigned int eob_flag[TX_SIZES][PLANE_TYPES][EOB_COEF_CONTEXTS][2];
+ unsigned int eob_multi16[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][5];
+ unsigned int eob_multi32[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][6];
+ unsigned int eob_multi64[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][7];
+ unsigned int eob_multi128[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][8];
+ unsigned int eob_multi256[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][9];
+ unsigned int eob_multi512[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][10];
+ unsigned int eob_multi1024[TOKEN_CDF_Q_CTXS][PLANE_TYPES][2][11];
+ unsigned int coeff_lps_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [LEVEL_CONTEXTS][BR_CDF_SIZE];
+ unsigned int coeff_base_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS][NUM_BASE_LEVELS + 2];
+ unsigned int coeff_base_eob_multi[TOKEN_CDF_Q_CTXS][TX_SIZES][PLANE_TYPES]
+ [SIG_COEF_CONTEXTS_EOB][NUM_BASE_LEVELS + 1];
+ unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+ unsigned int zeromv_mode[GLOBALMV_MODE_CONTEXTS][2];
+ unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+ unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+ unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+ unsigned int wedge_idx[BLOCK_SIZES_ALL][16];
+ unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+ unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+ unsigned int wedge_interintra[BLOCK_SIZES_ALL][2];
+ unsigned int compound_type[BLOCK_SIZES_ALL][COMPOUND_TYPES - 1];
+ unsigned int motion_mode[BLOCK_SIZES_ALL][MOTION_MODES];
+ unsigned int obmc[BLOCK_SIZES_ALL][2];
+ unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
+ unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
+ unsigned int comp_ref_type[COMP_REF_TYPE_CONTEXTS][2];
+ unsigned int uni_comp_ref[UNI_COMP_REF_CONTEXTS][UNIDIR_COMP_REFS - 1][2];
+ unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS - 1][2];
+ unsigned int comp_ref[REF_CONTEXTS][FWD_REFS - 1][2];
+ unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS - 1][2];
+ unsigned int intrabc[2];
+
+ unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
+ unsigned int intra_tx_size[MAX_TX_CATS][TX_SIZE_CONTEXTS][MAX_TX_DEPTH + 1];
+ unsigned int skip_mode[SKIP_MODE_CONTEXTS][2];
+ unsigned int skip[SKIP_CONTEXTS][2];
+ unsigned int compound_index[COMP_INDEX_CONTEXTS][2];
+ unsigned int comp_group_idx[COMP_GROUP_IDX_CONTEXTS][2];
+ unsigned int delta_q[DELTA_Q_PROBS][2];
+ unsigned int delta_lf_multi[FRAME_LF_COUNT][DELTA_LF_PROBS][2];
+ unsigned int delta_lf[DELTA_LF_PROBS][2];
+
+ unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+ unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+ [TX_TYPES];
+ unsigned int filter_intra_mode[FILTER_INTRA_MODES];
+ unsigned int filter_intra[BLOCK_SIZES_ALL][2];
+ unsigned int switchable_restore[RESTORE_SWITCHABLE_TYPES];
+ unsigned int wiener_restore[2];
+ unsigned int sgrproj_restore[2];
+#endif // CONFIG_ENTROPY_STATS
+
+ unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
+ [SWITCHABLE_FILTERS];  // present in every build, not only stats builds
+} FRAME_COUNTS;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+#define INTER_MODE_RD_DATA_OVERALL_SIZE 6400
+
+typedef struct {  // TileDataEnc keeps one of these per block size.
+ int ready;  // NOTE(review): presumably set once a/b are usable - confirm
+ double a;  // NOTE(review): a/b look like fitted model parameters - confirm
+ double b;
+ double dist_mean;
+ double ld_mean;
+ double sse_mean;
+ double sse_sse_mean;
+ double sse_ld_mean;
+ int num;  // NOTE(review): presumably the sample count behind the sums - confirm
+ double dist_sum;
+ double ld_sum;
+ double sse_sum;
+ double sse_sse_sum;
+ double sse_ld_sum;
+} InterModeRdModel;
+
+typedef struct {  // An index paired with an RD value (see InterModesInfo).
+ int idx;
+ int64_t rd;
+} RdIdxPair;
+// TODO(angiebird): This is an estimated size. We still need to figure what is
+// the maximum number of modes.
+#define MAX_INTER_MODES 1024
+typedef struct inter_modes_info {  // Parallel arrays, MAX_INTER_MODES slots each.
+ int num;  // NOTE(review): presumably the number of slots in use - confirm
+ MB_MODE_INFO mbmi_arr[MAX_INTER_MODES];
+ int mode_rate_arr[MAX_INTER_MODES];
+ int64_t sse_arr[MAX_INTER_MODES];
+ int64_t est_rd_arr[MAX_INTER_MODES];
+ RdIdxPair rd_idx_pair_arr[MAX_INTER_MODES];
+} InterModesInfo;
+#endif
+
+// TODO(jingning) All spatially adaptive variables should go to TileDataEnc.
+typedef struct TileDataEnc {  // Encoder state kept per tile (cpi->tile_data[]).
+ TileInfo tile_info;  // tile bounds; mi_row/col start/end are read from here
+ int thresh_freq_fact[BLOCK_SIZES_ALL][MAX_MODES];
+ int mode_map[BLOCK_SIZES_ALL][MAX_MODES];
+ int m_search_count;
+ int ex_search_count;
+ CFL_CTX cfl;
+ DECLARE_ALIGNED(16, FRAME_CONTEXT, tctx);
+ uint8_t allow_update_cdf;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ InterModeRdModel inter_mode_rd_models[BLOCK_SIZES_ALL];  // one per block size
+ InterModesInfo inter_modes_info;
+#endif
+} TileDataEnc;
+
+typedef struct {  // A span of TOKENEXTRA entries plus a count.
+ TOKENEXTRA *start;
+ TOKENEXTRA *stop;  // NOTE(review): presumably one past the last token - confirm
+ unsigned int count;
+} TOKENLIST;
+
+typedef struct RD_COUNTS {  // Held per ThreadData (see ThreadData::rd_counts).
+ int64_t comp_pred_diff[REFERENCE_MODES];
+ // Stores number of 4x4 blocks using global motion per reference frame.
+ int global_motion_used[REF_FRAMES];
+ int compound_ref_used_flag;
+ int skip_mode_used_flag;
+} RD_COUNTS;
+
+typedef struct ThreadData {  // NOTE(review): presumably per-worker state - confirm
+ MACROBLOCK mb;
+ RD_COUNTS rd_counts;
+ FRAME_COUNTS *counts;
+ PC_TREE *pc_tree;
+ PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];  // one per log2 size
+ uint32_t *hash_value_buffer[2][2];
+ int32_t *wsrc_buf;
+ int32_t *mask_buf;
+ uint8_t *above_pred_buf;
+ uint8_t *left_pred_buf;
+ PALETTE_BUFFER *palette_buffer;
+ CONV_BUF_TYPE *tmp_conv_dst;
+ uint8_t *tmp_obmc_bufs[2];
+ int intrabc_used_this_tile;
+} ThreadData;
+
+struct EncWorkerData;
+
+typedef struct ActiveMap {  // Set/read through av1_set/get_active_map().
+ int enabled;
+ int update;  // NOTE(review): presumably "map changed, re-apply" - confirm
+ unsigned char *map;
+} ActiveMap;
+
+#if CONFIG_INTERNAL_STATS
+// types of stats
+typedef enum {  // Index into ImageStat::stat[].
+ STAT_Y,
+ STAT_U,
+ STAT_V,
+ STAT_ALL,
+ NUM_STAT_TYPES // This should always be the last member of the enum
+} StatType;
+
+typedef struct IMAGE_STAT {
+ double stat[NUM_STAT_TYPES];  // one slot per StatType value
+ double worst;  // NOTE(review): presumably the worst value seen so far - confirm
+} ImageStat;
+#endif // CONFIG_INTERNAL_STATS
+
+typedef struct {  // A frame buffer with a reference count (see uref_cnt_fb()).
+ int ref_count;
+ YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+typedef struct TileBufferEnc {  // One per tile in AV1_COMP::tile_buffers.
+ uint8_t *data;
+ size_t size;  // NOTE(review): presumably the byte length of data - confirm
+} TileBufferEnc;
+
+typedef struct AV1_COMP {  // Encoder instance state (the cpi of the av1_* API).
+ QUANTS quants;
+ ThreadData td;
+ FRAME_COUNTS counts;
+ MB_MODE_INFO_EXT *mbmi_ext_base;
+ CB_COEFF_BUFFER *coeff_buffer_base;  // (re)allocated by av1_alloc_txb_buf()
+ Dequants dequants;
+ AV1_COMMON common;
+ AV1EncoderConfig oxcf;
+ struct lookahead_ctx *lookahead;
+ struct lookahead_entry *alt_ref_source;
+ int no_show_kf;
+
+ int optimize_speed_feature;
+ int optimize_seg_arr[MAX_SEGMENTS];
+
+ YV12_BUFFER_CONFIG *source;
+ YV12_BUFFER_CONFIG *last_source; // NULL for first frame and alt_ref frames
+ YV12_BUFFER_CONFIG *unscaled_source;
+ YV12_BUFFER_CONFIG scaled_source;
+ YV12_BUFFER_CONFIG *unscaled_last_source;
+ YV12_BUFFER_CONFIG scaled_last_source;
+
+ // For a still frame, this flag is set to 1 to skip partition search.
+ int partition_search_skippable_frame;
+ double csm_rate_array[32];
+ double m_rate_array[32];
+ int rate_size;
+ int rate_index;
+ hash_table *previous_hash_table;
+ int previous_index;
+ int cur_poc; // DebugInfo
+
+ unsigned int row_mt;
+ int scaled_ref_idx[REF_FRAMES];
+ int ref_fb_idx[REF_FRAMES];  // read as ref_fb_idx[ref_frame - 1]
+ int refresh_fb_idx; // ref frame buffer index to refresh
+
+ int last_show_frame_buf_idx; // last show frame buffer index
+
+ int refresh_last_frame;
+ int refresh_golden_frame;
+ int refresh_bwd_ref_frame;
+ int refresh_alt2_ref_frame;
+ int refresh_alt_ref_frame;
+#if USE_SYMM_MULTI_LAYER
+ int new_bwdref_update_rule;
+#endif
+
+ int ext_refresh_frame_flags_pending;
+ int ext_refresh_last_frame;
+ int ext_refresh_golden_frame;
+ int ext_refresh_bwd_ref_frame;
+ int ext_refresh_alt2_ref_frame;
+ int ext_refresh_alt_ref_frame;
+
+ int ext_refresh_frame_context_pending;
+ int ext_refresh_frame_context;
+ int ext_use_ref_frame_mvs;
+ int ext_use_error_resilient;
+ int ext_use_s_frame;
+ int ext_use_primary_ref_none;
+
+ YV12_BUFFER_CONFIG last_frame_uf;
+ YV12_BUFFER_CONFIG trial_frame_rst;
+
+ // Ambient reconstruction err target for force key frames
+ int64_t ambient_err;
+
+ RD_OPT rd;
+
+ CODING_CONTEXT coding_context;
+
+ int gmtype_cost[TRANS_TYPES];
+ int gmparams_cost[REF_FRAMES];
+
+ int nmv_costs[2][MV_VALS];
+ int nmv_costs_hp[2][MV_VALS];
+
+ int64_t last_time_stamp_seen;
+ int64_t last_end_time_stamp_seen;
+ int64_t first_time_stamp_ever;
+
+ RATE_CONTROL rc;
+ double framerate;
+
+ // NOTE(zoeliu): Any inter frame allows maximum of REF_FRAMES inter
+ // references; Plus the currently coded frame itself, it is needed to allocate
+ // sufficient space to the size of the maximum possible number of frames.
+ int interp_filter_selected[REF_FRAMES + 1][SWITCHABLE];
+
+ struct aom_codec_pkt_list *output_pkt_list;
+
+ MBGRAPH_FRAME_STATS mbgraph_stats[MAX_LAG_BUFFERS];
+ int mbgraph_n_frames; // number of frames filled in the above
+ int static_mb_pct; // % forced skip mbs by segmentation
+ int ref_frame_flags;
+ int ext_ref_frame_flags;
+ RATE_FACTOR_LEVEL frame_rf_level[FRAME_BUFFERS];
+
+ SPEED_FEATURES sf;
+
+ unsigned int max_mv_magnitude;
+ int mv_step_param;
+
+ int allow_comp_inter_inter;
+ int all_one_sided_refs;
+
+ uint8_t *segmentation_map;
+
+ CYCLIC_REFRESH *cyclic_refresh;
+ ActiveMap active_map;
+
+ fractional_mv_step_fp *find_fractional_mv_step;
+ av1_diamond_search_fn_t diamond_search_sad;
+ aom_variance_fn_ptr_t fn_ptr[BLOCK_SIZES_ALL];
+ uint64_t time_receive_data;
+ uint64_t time_compress_data;
+ uint64_t time_pick_lpf;
+ uint64_t time_encode_sb_row;
+
+#if CONFIG_FP_MB_STATS
+ int use_fp_mb_stats;
+#endif
+
+ TWO_PASS twopass;
+
+ YV12_BUFFER_CONFIG alt_ref_buffer;
+
+#if CONFIG_INTERNAL_STATS
+ unsigned int mode_chosen_counts[MAX_MODES];
+
+ int count;
+ uint64_t total_sq_error;
+ uint64_t total_samples;
+ ImageStat psnr;
+
+ double total_blockiness;
+ double worst_blockiness;
+
+ int bytes;
+ double summed_quality;
+ double summed_weights;
+ unsigned int tot_recode_hits;
+ double worst_ssim;
+
+ ImageStat fastssim;
+ ImageStat psnrhvs;
+
+ int b_calculate_blockiness;
+ int b_calculate_consistency;
+
+ double total_inconsistency;
+ double worst_consistency;
+ Ssimv *ssim_vars;
+ Metrics metrics;
+#endif
+ int b_calculate_psnr;
+
+ int droppable;
+
+ int initial_width;
+ int initial_height;
+ int initial_mbs; // Number of MBs in the full-size frame; to be used to
+ // normalize the firstpass stats. This will differ from the
+ // number of MBs in the current frame when the frame is
+ // scaled.
+
+ // When resize is triggered through external control, the desired
+ // width/height are stored here until they are used for the next frame
+ // coded. They are effective for one frame only and are reset after
+ // use.
+ int resize_pending_width;
+ int resize_pending_height;
+
+ int frame_flags;
+
+ search_site_config ss_cfg;
+
+ TileDataEnc *tile_data;  // indexed as tile_row * tile_cols + tile_col
+ int allocated_tiles; // Keep track of memory allocated for tiles.
+
+ TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+ unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+ TOKENLIST *tplist[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+ int resize_state;
+ int resize_avg_qp;
+ int resize_buffer_underflow;
+ int resize_count;
+
+ // Whether the sequence parameters have already been transmitted and
+ // locked. Once locked, av1_change_config cannot change the sequence
+ // parameters.
+ int seq_params_locked;
+
+ // VARIANCE_AQ segment map refresh
+ int vaq_refresh;
+
+ // Multi-threading
+ int num_workers;
+ AVxWorker *workers;
+ struct EncWorkerData *tile_thr_data;
+ int refresh_frame_mask;
+ int existing_fb_idx_to_show;
+ int is_arf_filter_off[MAX_EXT_ARFS + 1];
+ int num_extra_arfs;
+ int arf_pos_in_gf[MAX_EXT_ARFS + 1];
+ int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
+ int global_motion_search_done;
+ tran_low_t *tcoeff_buf[MAX_MB_PLANE];
+ int extra_arf_allowed;
+ // A flag to indicate if intrabc is ever used in current frame.
+ int intrabc_used;
+ int dv_cost[2][MV_VALS];
+ // TODO(huisu@google.com): we can update dv_joint_cost per SB.
+ int dv_joint_cost[MV_JOINTS];
+ int has_lossless_segment;
+
+ // For frame refs short signaling:
+ // A mapping of each reference frame from its encoder side value to the
+ // decoder side value obtained following the short signaling procedure.
+ int ref_conv[REF_FRAMES];
+
+ AV1LfSync lf_row_sync;
+ AV1LrSync lr_row_sync;
+ AV1LrStruct lr_ctxt;
+
+ aom_film_grain_table_t *film_grain_table;
+#if CONFIG_DENOISE
+ struct aom_denoise_and_model_t *denoise_and_model;
+#endif
+ // Stores the default value of skip flag depending on chroma format
+ // Set as 1 for monochrome and 3 for other color formats
+ int default_interp_skip_flags;
+ int preserve_arf_as_gld;
+} AV1_COMP;
+
+// Must not be called more than once.
+void av1_initialize_enc(void);
+
+struct AV1_COMP *av1_create_compressor(AV1EncoderConfig *oxcf,
+ BufferPool *const pool);
+void av1_remove_compressor(AV1_COMP *cpi);
+
+void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf);
+
+// Receive a frame's worth of data. The caller can assume that a copy of this
+// frame is made, and not just a copy of the pointer.
+int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags,
+ YV12_BUFFER_CONFIG *sd, int64_t time_stamp,
+ int64_t end_time_stamp);
+
+int av1_get_compressed_data(AV1_COMP *cpi, unsigned int *frame_flags,
+ size_t *size, uint8_t *dest, int64_t *time_stamp,
+ int64_t *time_end, int flush,
+ const aom_rational_t *timebase);
+
+int av1_get_preview_raw_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *dest);
+
+int av1_get_last_show_frame(AV1_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
+aom_codec_err_t av1_copy_new_frame_enc(AV1_COMMON *cm,
+ YV12_BUFFER_CONFIG *new_frame,
+ YV12_BUFFER_CONFIG *sd);
+
+int av1_use_as_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+void av1_update_reference(AV1_COMP *cpi, int ref_frame_flags);
+
+int av1_copy_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd);
+
+int av1_update_entropy(AV1_COMP *cpi, int update);
+
+int av1_set_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_get_active_map(AV1_COMP *cpi, unsigned char *map, int rows, int cols);
+
+int av1_set_internal_size(AV1_COMP *cpi, AOM_SCALING horiz_mode,
+ AOM_SCALING vert_mode);
+
+int av1_get_quantizer(struct AV1_COMP *cpi);
+
+int av1_convert_sect5obus_to_annexb(uint8_t *buffer, size_t *input_size);
+
+int64_t timebase_units_to_ticks(const aom_rational_t *timebase, int64_t n);
+int64_t ticks_to_timebase_units(const aom_rational_t *timebase, int64_t n);
+
+static INLINE int frame_is_kf_gf_arf(const AV1_COMP *cpi) {
+  if (frame_is_intra_only(&cpi->common) || cpi->refresh_alt_ref_frame) return 1;
+  return cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref;
+}
+
+static INLINE int get_ref_frame_map_idx(const AV1_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  return (ref_frame < 1) ? INVALID_IDX : cpi->ref_fb_idx[ref_frame - 1];
+}
+
+static INLINE int get_ref_frame_buf_idx(const AV1_COMP *cpi,
+                                        MV_REFERENCE_FRAME ref_frame) {
+  // Two-step lookup: reference -> ref_frame_map slot -> frame buffer index.
+  const int slot = get_ref_frame_map_idx(cpi, ref_frame);
+  return (slot == INVALID_IDX) ? INVALID_IDX : cpi->common.ref_frame_map[slot];
+}
+
+// TODO(huisu@google.com, youzhou@microsoft.com): enable hash-me for HBD.
+static INLINE int av1_use_hash_me(const AV1_COMMON *const cm) {
+ return cm->allow_screen_content_tools;  // returns the flag value unchanged
+}
+
+static INLINE hash_table *av1_get_ref_frame_hash_map(
+    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  // Resolve the reference to its frame buffer and hand out that buffer's
+  // hash table; NULL when the reference has no valid buffer index.
+  const int fb = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (fb == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[fb].hash_table;
+}
+
+static INLINE YV12_BUFFER_CONFIG *get_ref_frame_buffer(
+    const AV1_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  // NULL when the reference has no valid buffer index.
+  const int fb = get_ref_frame_buf_idx(cpi, ref_frame);
+  if (fb == INVALID_IDX) return NULL;
+  return &cpi->common.buffer_pool->frame_bufs[fb].buf;
+}
+
+static INLINE int enc_is_ref_frame_buf(AV1_COMP *cpi, RefCntBuffer *frame_buf) {
+  // Scan the references LAST_FRAME..ALTREF_FRAME; return 1 as soon as one of
+  // them resolves to frame_buf, 0 when none does.
+  for (int ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref) {
+    const int fb = get_ref_frame_buf_idx(cpi, (MV_REFERENCE_FRAME)ref);
+    if (fb == INVALID_IDX) continue;
+    if (frame_buf == &cpi->common.buffer_pool->frame_bufs[fb]) return 1;
+  }
+  return 0;
+}
+
+// Token buffer is only used for palette tokens.
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols,
+                                           int sb_size_log2,
+                                           const int num_planes) {
+  // mb_* count 2^4-pixel units, so a superblock spans 2^shift of them.
+  const int shift = sb_size_log2 - 4;
+  const int rows_sb = ALIGN_POWER_OF_TWO(mb_rows, shift) >> shift;
+  const int cols_sb = ALIGN_POWER_OF_TWO(mb_cols, shift) >> shift;
+  const int sb_pels = (1 << sb_size_log2) * (1 << sb_size_log2);
+
+  // One palette token per pixel, on at most two planes.
+  const int toks_per_sb = AOMMIN(2, num_planes) * sb_pels;
+
+  // Token budget for the whole (superblock-aligned) area.
+  return rows_sb * cols_sb * toks_per_sb;
+}
+
+// Get the allocated token size for a tile. It does the same calculation as in
+// the frame token allocation.
+static INLINE unsigned int allocated_tokens(TileInfo tile, int sb_size_log2,
+                                            int num_planes) {
+  // Convert the tile's mi extent to mb units ((n + 2) >> 2) and size for that.
+  const int mb_rows = (tile.mi_row_end - tile.mi_row_start + 2) >> 2;
+  const int mb_cols = (tile.mi_col_end - tile.mi_col_start + 2) >> 2;
+  return get_token_alloc(mb_rows, mb_cols, sb_size_log2, num_planes);
+}
+
+static INLINE void get_start_tok(AV1_COMP *cpi, int tile_row, int tile_col,
+                                 int mi_row, TOKENEXTRA **tok, int sb_size_log2,
+                                 int num_planes) {
+  // Start from the tile's token base and skip the token budget of the mb
+  // rows between the tile's first row and mi_row.
+  const int tile_idx = tile_row * cpi->common.tile_cols + tile_col;
+  const TileInfo *const ti = &cpi->tile_data[tile_idx].tile_info;
+
+  const int mb_cols = (ti->mi_col_end - ti->mi_col_start + 2) >> 2;
+  const int mb_rows_above = (mi_row - ti->mi_row_start + 2) >> 2;
+  const unsigned int skip =
+      get_token_alloc(mb_rows_above, mb_cols, sb_size_log2, num_planes);
+
+  *tok = cpi->tile_tok[tile_row][tile_col] + skip;
+}
+
+void av1_apply_encoding_flags(AV1_COMP *cpi, aom_enc_frame_flags_t flags);
+
+#define ALT_MIN_LAG 3
+static INLINE int is_altref_enabled(const AV1_COMP *const cpi) {
+  return !(cpi->oxcf.lag_in_frames < ALT_MIN_LAG || !cpi->oxcf.enable_auto_arf);
+}
+
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+
+static INLINE void set_ref_ptrs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                MV_REFERENCE_FRAME ref0,
+                                MV_REFERENCE_FRAME ref1) {
+  const int slot0 = (ref0 < LAST_FRAME) ? 0 : ref0 - LAST_FRAME;  // clamp to 0
+  const int slot1 = (ref1 < LAST_FRAME) ? 0 : ref1 - LAST_FRAME;
+  xd->block_refs[0] = &cm->frame_refs[slot0];
+  xd->block_refs[1] = &cm->frame_refs[slot1];
+}
+
+static INLINE int get_chessboard_index(int frame_index) {
+ return frame_index & 0x1;  // lowest bit of the frame index
+}
+
+static INLINE int *cond_cost_list(const struct AV1_COMP *cpi, int *cost_list) {
+  return (cpi->sf.mv.subpel_search_method == SUBPEL_TREE) ? NULL : cost_list;
+}
+
+void av1_new_framerate(AV1_COMP *cpi, double framerate);
+
+#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
+
+// Update up-sampled reference frame index.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+                               int new_uidx) {
+  // Drop the reference held via the old index (if any), take one on the new.
+  if (*uidx >= 0) {
+    EncRefCntBuffer *const old_buf = &ubufs[*uidx];
+    if (old_buf->ref_count > 0) --old_buf->ref_count;
+  }
+  *uidx = new_uidx;
+  ++ubufs[new_uidx].ref_count;
+}
+
+// Returns 1 if a frame is scaled and 0 otherwise.
+static INLINE int av1_resize_scaled(const AV1_COMMON *cm) {
+  return cm->superres_upscaled_width != cm->render_width ||
+         cm->superres_upscaled_height != cm->render_height;
+}
+
+static INLINE int av1_frame_scaled(const AV1_COMMON *cm) {
+  return av1_superres_scaled(cm) ? 0 : av1_resize_scaled(cm);  // 0 or 1
+}
+
+// Don't allow a show_existing_frame to coincide with an error resilient
+// frame. An exception can be made for a forward keyframe since it has no
+// previous dependencies.
+static INLINE int encode_show_existing_frame(const AV1_COMMON *cm) {
+  if (!cm->show_existing_frame) return 0;
+  return !cm->error_resilient_mode || cm->frame_type == KEY_FRAME;
+}
+
+// Returns a Sequence Header OBU stored in an aom_fixed_buf_t, or NULL upon
+// failure. When a non-NULL aom_fixed_buf_t pointer is returned by this
+// function, the memory must be freed by the caller. Both the buf member of the
+// aom_fixed_buf_t, and the aom_fixed_buf_t pointer itself must be freed. Memory
+// returned must be freed via call to free().
+//
+// Note: The OBU returned is in Low Overhead Bitstream Format. Specifically,
+// the obu_has_size_field bit is set, and the buffer contains the obu_size
+// field.
+aom_fixed_buf_t *av1_get_global_headers(AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODER_H_
diff --git a/third_party/aom/av1/encoder/encodetxb.c b/third_party/aom/av1/encoder/encodetxb.c
new file mode 100644
index 0000000000..5a31d93d73
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.c
@@ -0,0 +1,2062 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodetxb.h"
+
+#include "aom_ports/mem.h"
+#include "av1/common/blockd.h"
+#include "av1/common/idct.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/encoder/bitstream.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+static int hbt_needs_init = 1;  // NOTE(review): presumably cleared after setup
+static CRC32C crc_calculator;
+static const int HBT_EOB = 16; // also the length in opt_qcoeff (cf. deltas[16])
+static const int HBT_TABLE_SIZE = 65536; // 16 bit: holds 65536 'arrays'
+static const int HBT_ARRAY_LENGTH = 256; // 8 bit: 256 entries
+// If removed in hbt_create_hashes or increased beyond int8_t, widen deltas type
+static const int HBT_KICKOUT = 3;
+
+typedef struct OptTxbQcoeff {  // Element type of hbt_hash_table.
+ // Use larger type if larger/no kickout value is used in hbt_create_hashes
+ int8_t deltas[16];
+ uint32_t hbt_qc_hash;
+ uint32_t hbt_ctx_hash;
+ int init;  // NOTE(review): presumably nonzero once the entry is filled - confirm
+ int rate_cost;
+} OptTxbQcoeff;
+
+OptTxbQcoeff *hbt_hash_table;  // NOTE(review): non-static global - intended?
+
+typedef struct LevelDownStats {  // NOTE(review): hedged notes below - confirm
+ int update;
+ tran_low_t low_qc;  // presumably the level one step nearer zero
+ tran_low_t low_dqc;  // presumably low_qc dequantized
+ int64_t dist0;
+ int rate;
+ int rate_low;
+ int64_t dist;
+ int64_t dist_low;
+ int64_t rd;
+ int64_t rd_low;
+ int64_t nz_rd;
+ int64_t rd_diff;
+ int cost_diff;
+ int64_t dist_diff;
+ int new_eob;
+} LevelDownStats;
+
+void av1_alloc_txb_buf(AV1_COMP *cpi) {
+  AV1_COMMON *cm = &cpi->common;
+  const int cell_log2 = cm->seq_params.mib_size_log2;  // cell = 2^this mi units
+  const int cell_rows = (cm->mi_rows >> cell_log2) + 1;
+  const int cell_cols = (cm->mi_cols >> cell_log2) + 1;
+  const int size = cell_rows * cell_cols;  // TODO(jingning): reduce further.
+  av1_free_txb_buf(cpi);  // release any previous buffer before reallocating
+  CHECK_MEM_ERROR(cm, cpi->coeff_buffer_base,
+                  aom_memalign(32, sizeof(*cpi->coeff_buffer_base) * size));
+}
+
+void av1_free_txb_buf(AV1_COMP *cpi) { aom_free(cpi->coeff_buffer_base); }  // NOTE(review): pointer is not reset to NULL
+
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                          int mi_row, int mi_col) {
+  // Aim x->mbmi_ext's per-plane buffers at the slot for (mi_row, mi_col).
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int cell_log2 = cm->seq_params.mib_size_log2;
+  const int cell_stride = (cm->mi_cols >> cell_log2) + 1;
+  const int cell_idx =
+      (mi_row >> cell_log2) * cell_stride + (mi_col >> cell_log2);
+  CB_COEFF_BUFFER *const cell = &cpi->coeff_buffer_base[cell_idx];
+  const int txb_offset = x->cb_offset / (TX_SIZE_W_MIN * TX_SIZE_H_MIN);
+  assert(x->cb_offset < (1 << num_pels_log2_lookup[cm->seq_params.sb_size]));
+  for (int p = 0; p < num_planes; ++p) {
+    x->mbmi_ext->tcoeff[p] = cell->tcoeff[p] + x->cb_offset;
+    x->mbmi_ext->eobs[p] = cell->eobs[p] + txb_offset;
+    x->mbmi_ext->txb_skip_ctx[p] = cell->txb_skip_ctx[p] + txb_offset;
+    x->mbmi_ext->dc_sign_ctx[p] = cell->dc_sign_ctx[p] + txb_offset;
+  }
+}
+
+static void write_golomb(aom_writer *w, int level) {
+  // Writes level + 1 as (length - 1) zero bits followed by its `length`
+  // significant bits, most significant first.
+  const int value = level + 1;
+  int length = 0;
+  for (int rest = value; rest; rest >>= 1) ++length;
+  assert(length > 0);
+
+  // Zero prefix.
+  for (int n = 1; n < length; ++n) aom_write_bit(w, 0);
+
+  for (int pos = length - 1; pos >= 0; --pos) {
+    aom_write_bit(w, (value >> pos) & 0x01);
+  }
+}
+
+static INLINE tran_low_t get_lower_coeff(tran_low_t qc) {
+  // Step the quantized level one unit toward zero; zero stays zero.
+  if (qc > 0) return qc - 1;
+  if (qc < 0) return qc + 1;
+  return 0;
+}
+
+static INLINE tran_low_t qcoeff_to_dqcoeff(tran_low_t qc, int coeff_idx,
+                                           int dqv, int shift,
+                                           const qm_val_t *iqmatrix) {
+  // With a matrix, dqv is first scaled by its entry (rounded, AOM_QM_BITS).
+  const int qm_round = 1 << (AOM_QM_BITS - 1);
+  if (iqmatrix) dqv = (iqmatrix[coeff_idx] * dqv + qm_round) >> AOM_QM_BITS;
+  const int magnitude = (abs(qc) * dqv) >> shift;
+  return (qc < 0) ? -magnitude : magnitude;
+}
+
+static INLINE int64_t get_coeff_dist(tran_low_t tcoeff, tran_low_t dqcoeff,
+                                     int shift) {
+  // Squared (tcoeff - dqcoeff) error, scaled by 2^shift before squaring.
+  const int64_t scaled_err = (tcoeff - dqcoeff) * (1 << shift);
+  return scaled_err * scaled_err;
+}
+
+static const int8_t eob_to_pos_small[33] = {  // indexed directly by eob (0..32)
+ 0, 1, 2, // 0-2
+ 3, 3, // 3-4
+ 4, 4, 4, 4, // 5-8
+ 5, 5, 5, 5, 5, 5, 5, 5, // 9-16
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6 // 17-32
+};
+
+static const int8_t eob_to_pos_large[17] = {  // index: min((eob - 1) >> 5, 16)
+ 6, // place holder: eob < 33 is looked up in eob_to_pos_small instead
+ 7, // 33-64
+ 8, 8, // 65-128
+ 9, 9, 9, 9, // 129-256
+ 10, 10, 10, 10, 10, 10, 10, 10, // 257-512
+ 11 // 513-
+};
+
+static INLINE int get_eob_pos_token(const int eob, int *const extra) {
+  // Map eob to its group token and report, through *extra, how far eob lies
+  // past the first eob value of that group.
+  int token;
+  if (eob >= 33) {
+    // Large eobs: one table entry per 32 positions, capped at entry 16.
+    const int bucket = AOMMIN((eob - 1) >> 5, 16);
+    token = eob_to_pos_large[bucket];
+  } else {
+    token = eob_to_pos_small[eob];
+  }
+  *extra = eob - k_eob_group_start[token];
+  return token;
+}
+
+#if CONFIG_ENTROPY_STATS
+void av1_update_eob_context(int cdf_idx, int eob, TX_SIZE tx_size,
+                            TX_CLASS tx_class, PLANE_TYPE plane,
+                            FRAME_CONTEXT *ec_ctx, FRAME_COUNTS *counts,
+                            uint8_t allow_update_cdf) {
+#else
+void av1_update_eob_context(int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+                            PLANE_TYPE plane, FRAME_CONTEXT *ec_ctx,
+                            uint8_t allow_update_cdf) {
+#endif
+  int eob_extra;  // offset of eob inside its group, filled in just below
+  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+
+  switch (eob_multi_size) {  // selects the eob-token CDF and counter set
+    case 0:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi16[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf16[plane][eob_multi_ctx], eob_pt - 1, 5);
+      break;
+    case 1:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi32[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf32[plane][eob_multi_ctx], eob_pt - 1, 6);
+      break;
+    case 2:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi64[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf)
+        update_cdf(ec_ctx->eob_flag_cdf64[plane][eob_multi_ctx], eob_pt - 1, 7);
+      break;
+    case 3:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi128[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf128[plane][eob_multi_ctx], eob_pt - 1,
+                   8);
+      }
+      break;
+    case 4:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi256[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf256[plane][eob_multi_ctx], eob_pt - 1,
+                   9);
+      }
+      break;
+    case 5:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi512[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf512[plane][eob_multi_ctx], eob_pt - 1,
+                   10);
+      }
+      break;
+    case 6:
+    default:
+#if CONFIG_ENTROPY_STATS
+      ++counts->eob_multi1024[cdf_idx][plane][eob_multi_ctx][eob_pt - 1];
+#endif
+      if (allow_update_cdf) {
+        update_cdf(ec_ctx->eob_flag_cdf1024[plane][eob_multi_ctx], eob_pt - 1,
+                   11);
+      }
+      break;
+  }
+
+  if (k_eob_offset_bits[eob_pt] > 0) {
+    int eob_ctx = eob_pt - 3;  // context shared by the counter and the CDF
+    int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+    int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;  // top offset bit
+#if CONFIG_ENTROPY_STATS  // index by eob_ctx (not eob_pt) to match the CDF
+    counts->eob_extra[cdf_idx][txs_ctx][plane][eob_ctx][bit]++;
+#endif  // CONFIG_ENTROPY_STATS
+    if (allow_update_cdf)
+      update_cdf(ec_ctx->eob_extra_cdf[txs_ctx][plane][eob_ctx], bit, 2);
+  }
+}
+
+// Rate of signalling the end-of-block position `eob`: the cost of its
+// position token (looked up in txb_eob_costs) plus, when the token carries
+// offset bits, the cost of the top offset bit from txb_costs and of the
+// remaining offset bits costed as literals.
+static int get_eob_cost(int eob, const LV_MAP_EOB_COST *txb_eob_costs,
+                        const LV_MAP_COEFF_COST *txb_costs, TX_CLASS tx_class) {
+  int eob_extra;
+  const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+  // Context 0 is used for TX_CLASS_2D, context 1 for every other class.
+  const int eob_multi_ctx = (tx_class != TX_CLASS_2D);
+  int total = txb_eob_costs->eob_cost[eob_multi_ctx][eob_pt - 1];
+
+  const int offset_bits = k_eob_offset_bits[eob_pt];
+  if (offset_bits <= 0) return total;
+
+  // Only the most significant offset bit has its own cost table entry.
+  const int top_bit_mask = 1 << (offset_bits - 1);
+  const int top_bit = (eob_extra & top_bit_mask) != 0;
+  total += txb_costs->eob_extra_cost[eob_pt - 3][top_bit];
+  if (offset_bits > 1) total += av1_cost_literal(offset_bits - 1);
+  return total;
+}
+
+// Cost of coding the sign of coefficient `qc`. Only the coefficient at index
+// 0 uses the context-dependent dc_sign_cost table; every other coefficient is
+// charged one literal bit.
+static INLINE int get_sign_bit_cost(tran_low_t qc, int coeff_idx,
+                                    const int (*dc_sign_cost)[2],
+                                    int dc_sign_ctx) {
+  if (coeff_idx != 0) return av1_cost_literal(1);
+  const int is_negative = (qc < 0);
+  return dc_sign_cost[dc_sign_ctx][is_negative];
+}
+
+// Base-range cost of a coefficient magnitude. Magnitudes below
+// 1 + NUM_BASE_LEVELS cost nothing here; larger ones index coeff_lps by their
+// distance from that threshold, saturating at COEFF_BASE_RANGE.
+// `ctx` is accepted for interface symmetry and is not used.
+static INLINE int get_br_cost(tran_low_t abs_qc, int ctx,
+                              const int *coeff_lps) {
+  const tran_low_t first_br_level = 1 + NUM_BASE_LEVELS;
+  const tran_low_t saturation_level = first_br_level + COEFF_BASE_RANGE;
+  (void)ctx;
+  if (abs_qc < first_br_level) return 0;
+  return (abs_qc >= saturation_level) ? coeff_lps[COEFF_BASE_RANGE]
+                                      : coeff_lps[abs_qc - first_br_level];
+}
+
+// Cost of the Golomb-coded remainder of a magnitude. Magnitudes below
+// 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE have no remainder and cost 0;
+// otherwise the remainder r is charged 2 * (get_msb(r) + 1) - 1 literal bits.
+static INLINE int get_golomb_cost(int abs_qc) {
+  const int golomb_start = 1 + NUM_BASE_LEVELS + COEFF_BASE_RANGE;
+  if (abs_qc < golomb_start) return 0;
+
+  const int remainder = abs_qc - COEFF_BASE_RANGE - NUM_BASE_LEVELS;
+  const int num_bits = get_msb(remainder) + 1;
+  return av1_cost_literal(2 * num_bits - 1);
+}
+
+// Rate of coding quantized coefficient `qc` at scan position `scan_idx`.
+// The base symbol uses base_eob_cost when `is_eob` is set and base_cost
+// otherwise; non-zero coefficients additionally pay for the sign and, above
+// NUM_BASE_LEVELS, for the base-range and Golomb parts.
+// With is_eob set the base_eob_cost lookup uses index min(|qc|, 3) - 1, so
+// callers must not pass qc == 0 together with is_eob.
+static int get_coeff_cost(const tran_low_t qc, const int scan_idx,
+                          const int is_eob, const TxbInfo *const txb_info,
+                          const LV_MAP_COEFF_COST *const txb_costs,
+                          const int coeff_ctx, const TX_CLASS tx_class) {
+  const tran_low_t abs_qc = abs(qc);
+  const int base_symbol = AOMMIN(abs_qc, 3);
+  int cost = is_eob ? txb_costs->base_eob_cost[coeff_ctx][base_symbol - 1]
+                    : txb_costs->base_cost[coeff_ctx][base_symbol];
+  if (qc == 0) return cost;
+
+  cost += get_sign_bit_cost(qc, scan_idx, txb_costs->dc_sign_cost,
+                            txb_info->txb_ctx->dc_sign_ctx);
+  if (abs_qc > NUM_BASE_LEVELS) {
+    const int pos = txb_info->scan_order->scan[scan_idx];
+    const int br_ctx =
+        get_br_ctx(txb_info->levels, pos, txb_info->bwl, tx_class);
+    cost += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+    cost += get_golomb_cost(abs_qc);
+  }
+  return cost;
+}
+
+// Context for the base-level symbol of the coefficient at `coeff_idx`.
+// For the end-of-block coefficient the context depends only on where
+// scan_idx falls relative to 1/8 and 1/4 of (height << bwl); for every other
+// coefficient it is derived from the magnitudes gathered by get_nz_mag().
+static INLINE int get_nz_map_ctx(const uint8_t *const levels,
+                                 const int coeff_idx, const int bwl,
+                                 const int height, const int scan_idx,
+                                 const int is_eob, const TX_SIZE tx_size,
+                                 const TX_CLASS tx_class) {
+  if (!is_eob) {
+    const int stats =
+        get_nz_mag(levels + get_padded_idx(coeff_idx, bwl), bwl, tx_class);
+    return get_nz_map_ctx_from_stats(stats, coeff_idx, bwl, tx_size, tx_class);
+  }
+
+  if (scan_idx == 0) return 0;
+  const int num_coeffs = height << bwl;
+  if (scan_idx <= num_coeffs / 8) return 1;
+  if (scan_idx <= num_coeffs / 4) return 2;
+  return 3;
+}
+
+static void get_dist_cost_stats(LevelDownStats *const stats, const int scan_idx,
+ const int is_eob,
+ const LV_MAP_COEFF_COST *const txb_costs,
+ const TxbInfo *const txb_info,
+ const TX_CLASS tx_class) { /*
+ * Fills *stats for the coefficient at scan position scan_idx, comparing the
+ * current quantized value qc against the candidate get_lower_coeff(qc).
+ *   rate / dist / rd             : cost of keeping qc
+ *   rate_low / dist_low / rd_low : cost of switching to low_qc
+ *   dist0                        : distortion when the coefficient is 0
+ * dist and dist_low are stored relative to dist0. is_eob selects the
+ * end-of-block context and cost tables. The coefficient must be non-zero
+ * (see the assert below). new_eob, update and nz_rd are only reset here.
+ */
+ const int16_t *const scan = txb_info->scan_order->scan;
+ const int coeff_idx = scan[scan_idx];
+ const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+ const uint8_t *const levels = txb_info->levels;
+ stats->new_eob = -1;  // reset every output field before computing anything
+ stats->update = 0;
+ stats->rd_low = 0;
+ stats->rd = 0;
+ stats->nz_rd = 0;
+ stats->dist_low = 0;
+ stats->rate_low = 0;
+ stats->low_qc = 0;
+
+ const tran_low_t tqc = txb_info->tcoeff[coeff_idx];
+ const int dqv = txb_info->dequant[coeff_idx != 0];  // entry 0 for index 0, entry 1 otherwise
+ const int coeff_ctx =
+ get_nz_map_ctx(levels, coeff_idx, txb_info->bwl, txb_info->height,
+ scan_idx, is_eob, txb_info->tx_size, tx_class);
+ const int qc_cost = get_coeff_cost(qc, scan_idx, is_eob, txb_info, txb_costs,
+ coeff_ctx, tx_class);
+ assert(qc != 0);
+ const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
+ txb_info->iqmatrix);
+ const int64_t dqc_dist = get_coeff_dist(tqc, dqc, txb_info->shift);
+
+ // Reference distortion: the coefficient quantized to 0.
+ const tran_low_t dqc0 =
+ qcoeff_to_dqcoeff(0, coeff_idx, dqv, txb_info->shift, txb_info->iqmatrix);
+
+ stats->dist0 = get_coeff_dist(tqc, dqc0, txb_info->shift);
+ stats->dist = dqc_dist - stats->dist0;  // distortion change relative to zeroing
+ stats->rate = qc_cost;
+
+ stats->rd = RDCOST(txb_info->rdmult, stats->rate, stats->dist);
+
+ stats->low_qc = get_lower_coeff(qc);
+
+ if (is_eob && stats->low_qc == 0) {
+ stats->rd_low = stats->rd; // disable selection of low_qc in this case.
+ } else {
+ if (stats->low_qc == 0) {
+ stats->dist_low = 0;  // zeroing is the reference point, so the delta is 0
+ } else {
+ stats->low_dqc = qcoeff_to_dqcoeff(stats->low_qc, coeff_idx, dqv,
+ txb_info->shift, txb_info->iqmatrix);
+ const int64_t low_dqc_dist =
+ get_coeff_dist(tqc, stats->low_dqc, txb_info->shift);
+ stats->dist_low = low_dqc_dist - stats->dist0;
+ }
+ const int low_qc_cost =
+ get_coeff_cost(stats->low_qc, scan_idx, is_eob, txb_info, txb_costs,
+ coeff_ctx, tx_class);  // same context as for qc
+ stats->rate_low = low_qc_cost;
+ stats->rd_low = RDCOST(txb_info->rdmult, stats->rate_low, stats->dist_low);
+ }
+}
+
+// Computes the regular (non-eob) stats for the coefficient at scan_idx and
+// then sets stats->nz_rd to the difference between the best non-eob RD cost
+// (min of rd and rd_low) and the best RD cost of coding the same coefficient
+// as the end-of-block coefficient.
+static void get_dist_cost_stats_with_eob(
+    LevelDownStats *const stats, const int scan_idx,
+    const LV_MAP_COEFF_COST *const txb_costs, const TxbInfo *const txb_info,
+    const TX_CLASS tx_class) {
+  get_dist_cost_stats(stats, scan_idx, /*is_eob=*/0, txb_costs, txb_info,
+                      tx_class);
+
+  const int coeff_idx = txb_info->scan_order->scan[scan_idx];
+  const tran_low_t qc = txb_info->qcoeff[coeff_idx];
+  // Context this coefficient would get if it were the last one.
+  const int eob_coeff_ctx = get_nz_map_ctx(
+      txb_info->levels, coeff_idx, txb_info->bwl, txb_info->height, scan_idx,
+      /*is_eob=*/1, txb_info->tx_size, tx_class);
+
+  const int qc_eob_cost = get_coeff_cost(qc, scan_idx, 1, txb_info, txb_costs,
+                                         eob_coeff_ctx, tx_class);
+  int64_t rd_eob = RDCOST(txb_info->rdmult, qc_eob_cost, stats->dist);
+
+  if (stats->low_qc != 0) {
+    const int low_qc_eob_cost =
+        get_coeff_cost(stats->low_qc, scan_idx, 1, txb_info, txb_costs,
+                       eob_coeff_ctx, tx_class);
+    const int64_t rd_eob_low =
+        RDCOST(txb_info->rdmult, low_qc_eob_cost, stats->dist_low);
+    if (rd_eob_low < rd_eob) rd_eob = rd_eob_low;
+  }
+
+  stats->nz_rd = AOMMIN(stats->rd_low, stats->rd) - rd_eob;
+}
+
+// Stores quantized value `qc` at coeff_idx and mirrors its magnitude, clamped
+// to [0, INT8_MAX], into the padded levels buffer.
+static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
+                                 const TxbInfo *const txb_info) {
+  const int padded_idx = get_padded_idx(coeff_idx, txb_info->bwl);
+  txb_info->qcoeff[coeff_idx] = qc;
+  txb_info->levels[padded_idx] = (uint8_t)clamp(abs(qc), 0, INT8_MAX);
+}
+
+// Sets the quantized coefficient (and its level entry) at coeff_idx to `qc`
+// and refreshes the matching dequantized coefficient.
+static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
+                                const TxbInfo *const txb_info) {
+  update_qcoeff(coeff_idx, qc, txb_info);
+
+  // dequant[0] applies to index 0, dequant[1] to every other index.
+  const int is_ac = (coeff_idx != 0);
+  const int dqv = txb_info->dequant[is_ac];
+  const tran_low_t dqc = qcoeff_to_dqcoeff(qc, coeff_idx, dqv, txb_info->shift,
+                                           txb_info->iqmatrix);
+  txb_info->dqcoeff[coeff_idx] = dqc;
+}
+
+// Builds the padded magnitude map used for context derivation.
+// `levels` points at the first real row of a buffer that also holds
+// TX_PAD_TOP rows above it and TX_PAD_BOTTOM rows plus TX_PAD_END bytes below
+// it; each row is `width` magnitudes followed by TX_PAD_HOR zero bytes.
+// All padding is zeroed and every magnitude is clamped to [0, INT8_MAX].
+void av1_txb_init_levels_c(const tran_low_t *const coeff, const int width,
+                           const int height, uint8_t *const levels) {
+  const int stride = width + TX_PAD_HOR;
+
+  // Zero the padding above and below the coefficient rows.
+  memset(levels - TX_PAD_TOP * stride, 0,
+         sizeof(*levels) * TX_PAD_TOP * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
+
+  for (int row = 0; row < height; row++) {
+    uint8_t *const dst = levels + row * stride;
+    const tran_low_t *const src = coeff + row * width;
+    for (int col = 0; col < width; col++) {
+      dst[col] = (uint8_t)clamp(abs(src[col]), 0, INT8_MAX);
+    }
+    // Horizontal padding at the end of each row.
+    for (int pad = 0; pad < TX_PAD_HOR; pad++) {
+      dst[width + pad] = 0;
+    }
+  }
+}
+
+// Writes the base-level context of each of the first `eob` coefficients in
+// scan order into coeff_contexts, indexed by coefficient position. The
+// coefficient at scan index eob - 1 is treated as the end-of-block one.
+void av1_get_nz_map_contexts_c(const uint8_t *const levels,
+                               const int16_t *const scan, const uint16_t eob,
+                               const TX_SIZE tx_size, const TX_CLASS tx_class,
+                               int8_t *const coeff_contexts) {
+  const int height = get_txb_high(tx_size);
+  const int bwl = get_txb_bwl(tx_size);
+  const int last_scan_idx = eob - 1;
+
+  for (int scan_idx = 0; scan_idx < eob; ++scan_idx) {
+    const int pos = scan[scan_idx];
+    const int is_last = (scan_idx == last_scan_idx);
+    coeff_contexts[pos] = get_nz_map_ctx(levels, pos, bwl, height, scan_idx,
+                                         is_last, tx_size, tx_class);
+  }
+}
+
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+ aom_writer *w, int blk_row, int blk_col, int plane,
+ TX_SIZE tx_size, const tran_low_t *tcoeff,
+ uint16_t eob, TXB_CTX *txb_ctx) { /*
+ * Writes one transform block's coefficients to bit writer `w`. Order of the
+ * emitted syntax, as implemented below:
+ *   1. the all-zero flag (eob == 0); nothing else is written if it is set
+ *   2. the transform type (av1_write_tx_type)
+ *   3. the eob position token and its offset bits
+ *   4. base levels and base-range symbols, from scan index eob - 1 down to 0
+ *   5. signs and Golomb remainders, from scan index 0 up to eob - 1
+ * tcoeff holds the values to code; txb_ctx supplies the contexts for the
+ * all-zero flag and for the sign of the coefficient at scan index 0.
+ */
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+ aom_write_symbol(w, eob == 0,
+ ec_ctx->txb_skip_cdf[txs_ctx][txb_ctx->txb_skip_ctx], 2);
+ if (eob == 0) return;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ int c;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+
+ av1_write_tx_type(cm, xd, blk_row, blk_col, plane, tx_size, w);
+
+ int eob_extra;
+ const int eob_pt = get_eob_pos_token(eob, &eob_extra);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const int eob_multi_ctx = (tx_class == TX_CLASS_2D) ? 0 : 1;
+ switch (eob_multi_size) {  // the CDF and its symbol count grow with the size class
+ case 0:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf16[plane_type][eob_multi_ctx], 5);
+ break;
+ case 1:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf32[plane_type][eob_multi_ctx], 6);
+ break;
+ case 2:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf64[plane_type][eob_multi_ctx], 7);
+ break;
+ case 3:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf128[plane_type][eob_multi_ctx], 8);
+ break;
+ case 4:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf256[plane_type][eob_multi_ctx], 9);
+ break;
+ case 5:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf512[plane_type][eob_multi_ctx], 10);
+ break;
+ default:
+ aom_write_symbol(w, eob_pt - 1,
+ ec_ctx->eob_flag_cdf1024[plane_type][eob_multi_ctx], 11);
+ break;
+ }
+
+ if (k_eob_offset_bits[eob_pt] > 0) {
+ const int eob_ctx = eob_pt - 3;
+ int eob_shift = k_eob_offset_bits[eob_pt] - 1;
+ int bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;  // top offset bit: coded with a CDF
+ aom_write_symbol(w, bit,
+ ec_ctx->eob_extra_cdf[txs_ctx][plane_type][eob_ctx], 2);
+ for (int i = 1; i < k_eob_offset_bits[eob_pt]; i++) {  // remaining bits, high to low, written raw
+ eob_shift = k_eob_offset_bits[eob_pt] - 1 - i;
+ bit = (eob_extra & (1 << eob_shift)) ? 1 : 0;
+ aom_write_bit(w, bit);
+ }
+ }
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ for (c = eob - 1; c >= 0; --c) {  // magnitudes, in reverse scan order
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = tcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (c == eob - 1) {
+ aom_write_symbol(
+ w, AOMMIN(level, 3) - 1,
+ ec_ctx->coeff_base_eob_cdf[txs_ctx][plane_type][coeff_ctx], 3);  // symbol is min(level, 3) - 1
+ } else {
+ aom_write_symbol(w, AOMMIN(level, 3),
+ ec_ctx->coeff_base_cdf[txs_ctx][plane_type][coeff_ctx],
+ 4);
+ }
+ if (level > NUM_BASE_LEVELS) {
+ // level exceeds NUM_BASE_LEVELS, so a base-range remainder is coded.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ aom_write_symbol(
+ w, k,
+ ec_ctx->coeff_br_cdf[AOMMIN(txs_ctx, TX_32X32)][plane_type][br_ctx],
+ BR_CDF_SIZE);
+ if (k < BR_CDF_SIZE - 1) break;  // a non-maximal symbol terminates the run
+ }
+ }
+ }
+
+ // Loop to code all signs in the transform block,
+ // starting with the sign of DC (if applicable)
+ for (c = 0; c < eob; ++c) {
+ const tran_low_t v = tcoeff[scan[c]];
+ const tran_low_t level = abs(v);
+ const int sign = (v < 0) ? 1 : 0;
+ if (level) {
+ if (c == 0) {
+ aom_write_symbol(
+ w, sign, ec_ctx->dc_sign_cdf[plane_type][txb_ctx->dc_sign_ctx], 2);
+ } else {
+ aom_write_bit(w, sign);
+ }
+ if (level > COEFF_BASE_RANGE + NUM_BASE_LEVELS)
+ write_golomb(w, level - COEFF_BASE_RANGE - 1 - NUM_BASE_LEVELS);  // part not covered by the base-range symbols
+ }
+ }
+}
+
+typedef struct encode_txb_args { /*
+ * Bundles the three handles needed to write a transform block.
+ * NOTE(review): no user of this struct is visible in this part of the file;
+ * presumably it is passed through a per-block callback. Confirm.
+ */
+ const AV1_COMMON *cm;  // read-only common state
+ MACROBLOCK *x;  // macroblock state
+ aom_writer *w;  // destination bit writer
+} ENCODE_TXB_ARGS;
+
+// Looks up the coefficients, eob and entropy contexts stored in x->mbmi_ext
+// for (plane, block) and forwards them to av1_write_coeffs_txb().
+static void write_coeffs_txb_wrap(const AV1_COMMON *cm, MACROBLOCK *x,
+                                  aom_writer *w, int plane, int block,
+                                  int blk_row, int blk_col, TX_SIZE tx_size) {
+  TXB_CTX txb_ctx = { x->mbmi_ext->txb_skip_ctx[plane][block],
+                      x->mbmi_ext->dc_sign_ctx[plane][block] };
+  const uint16_t eob = x->mbmi_ext->eobs[plane][block];
+  tran_low_t *const tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  av1_write_coeffs_txb(cm, xd, w, blk_row, blk_col, plane, tx_size, tcoeff, eob,
+                       &txb_ctx);
+}
+
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+ int mi_col, aom_writer *w, BLOCK_SIZE bsize) { /*
+ * Writes the coefficients of every transform block of the current block.
+ * The block is walked in units no larger than BLOCK_64X64; inside each unit
+ * the planes are visited in order and each plane's transform blocks are
+ * written row by row through write_coeffs_txb_wrap(). block[plane] is a
+ * running per-plane index that advances by the transform size (in units of
+ * tx_size_*_unit) after every transform block. Planes for which
+ * is_chroma_reference() returns 0 are skipped.
+ */
+ MACROBLOCKD *xd = &x->e_mbd;
+ const int num_planes = av1_num_planes(cm);
+ int block[MAX_MB_PLANE] = { 0 };
+ int row, col;
+ assert(bsize == get_plane_block_size(bsize, xd->plane[0].subsampling_x,
+ xd->plane[0].subsampling_y));
+ const int max_blocks_wide = max_block_wide(xd, bsize, 0);
+ const int max_blocks_high = max_block_high(xd, bsize, 0);
+ const BLOCK_SIZE max_unit_bsize = BLOCK_64X64;
+ int mu_blocks_wide = block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
+ int mu_blocks_high = block_size_high[max_unit_bsize] >> tx_size_high_log2[0];
+ mu_blocks_wide = AOMMIN(max_blocks_wide, mu_blocks_wide);  // never step past the block itself
+ mu_blocks_high = AOMMIN(max_blocks_high, mu_blocks_high);
+
+ for (row = 0; row < max_blocks_high; row += mu_blocks_high) {
+ for (col = 0; col < max_blocks_wide; col += mu_blocks_wide) {
+ for (int plane = 0; plane < num_planes; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ if (!is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+ pd->subsampling_y))
+ continue;
+ const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+ const int stepr = tx_size_high_unit[tx_size];
+ const int stepc = tx_size_wide_unit[tx_size];
+ const int step = stepr * stepc;
+
+ const int unit_height = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_high + row, max_blocks_high), pd->subsampling_y);  // end row of this unit, scaled by the plane's subsampling
+ const int unit_width = ROUND_POWER_OF_TWO(
+ AOMMIN(mu_blocks_wide + col, max_blocks_wide), pd->subsampling_x);
+ for (int blk_row = row >> pd->subsampling_y; blk_row < unit_height;
+ blk_row += stepr) {
+ for (int blk_col = col >> pd->subsampling_x; blk_col < unit_width;
+ blk_col += stepc) {
+ write_coeffs_txb_wrap(cm, x, w, plane, block[plane], blk_row,
+ blk_col, tx_size);
+ block[plane] += step;
+ }
+ }
+ }
+ }
+ }
+}
+
+// TODO(angiebird): use this function whenever it's possible
+// Rate of signalling `tx_type` for a transform block. Returns 0 whenever no
+// cost is looked up: for plane > 0, when at most one transform type is
+// available, when the segment is lossless, or when the extended transform
+// set index is not positive.
+static int get_tx_type_cost(const AV1_COMMON *cm, const MACROBLOCK *x,
+                            const MACROBLOCKD *xd, int plane, TX_SIZE tx_size,
+                            TX_TYPE tx_type) {
+  if (plane > 0) return 0;
+
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) <= 1 ||
+      xd->lossless[xd->mi[0]->segment_id]) {
+    return 0;
+  }
+
+  const int ext_tx_set =
+      get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+  if (ext_tx_set <= 0) return 0;
+
+  const TX_SIZE square_tx_size = txsize_sqr_map[tx_size];
+  if (is_inter) {
+    return x->inter_tx_type_costs[ext_tx_set][square_tx_size][tx_type];
+  }
+
+  // Intra costs are additionally indexed by prediction direction; a
+  // filter-intra block maps its filter mode to a direction first.
+  PREDICTION_MODE intra_dir;
+  if (mbmi->filter_intra_mode_info.use_filter_intra) {
+    intra_dir =
+        fimode_to_intradir[mbmi->filter_intra_mode_info.filter_intra_mode];
+  } else {
+    intra_dir = mbmi->mode;
+  }
+  return x->intra_tx_type_costs[ext_tx_set][square_tx_size][intra_dir]
+                               [tx_type];
+}
+
+static AOM_FORCE_INLINE int warehouse_efficients_txb(
+ const AV1_COMMON *const cm, const MACROBLOCK *x, const int plane,
+ const int block, const TX_SIZE tx_size, const TXB_CTX *const txb_ctx,
+ const struct macroblock_plane *p, const int eob,
+ const PLANE_TYPE plane_type, const LV_MAP_COEFF_COST *const coeff_costs,
+ const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+ const TX_CLASS tx_class) { /*
+ * Returns the rate of a non-empty transform block: the "not all zero" flag,
+ * the transform type, the eob position, and every coefficient up to eob.
+ * Coefficients are costed in three parts: the last one (scan index eob - 1,
+ * using base_eob_cost), the middle ones (eob - 2 down to 1), and the one at
+ * scan index 0, whose sign uses dc_sign_cost instead of a literal bit.
+ * The caller (av1_cost_coeffs_txb) handles eob == 0 before calling this.
+ */
+ const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ const int txb_skip_ctx = txb_ctx->txb_skip_ctx;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *const eob_costs =
+ &x->eob_costs[eob_multi_size][plane_type];
+ int cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ cost += get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+
+ cost += get_eob_cost(eob, eob_costs, coeff_costs, tx_class);
+
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ const int(*lps_cost)[COEFF_BASE_RANGE + 1] = coeff_costs->lps_cost;
+ int c = eob - 1;
+ {
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int sign = v >> 31;  // NOTE(review): 0 or -1 assuming 32-bit tran_low_t and arithmetic shift - confirm
+ const int level = (v ^ sign) - sign;  // |v| given the sign mask above
+ const int coeff_ctx = coeff_contexts[pos];
+ cost += coeff_costs->base_eob_cost[coeff_ctx][AOMMIN(level, 3) - 1];
+
+ if (v) {
+ // base-range and Golomb cost first; the sign cost follows below
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ if (c) {
+ cost += av1_cost_literal(1);
+ } else {
+ const int sign01 = (sign ^ sign) - sign;  // sign ^ sign is 0, so this is -sign: 0 or 1
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ return cost;  // the only coefficient was at scan index 0; nothing left to cost
+ }
+ }
+ }
+ const int(*base_cost)[4] = coeff_costs->base_cost;
+ for (c = eob - 2; c >= 1; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const int level = abs(v);
+ const int cost0 = base_cost[coeff_ctx][AOMMIN(level, 3)];
+ if (v) {
+ // sign bit cost
+ cost += av1_cost_literal(1);
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ }
+ cost += cost0;
+ }
+ if (c == 0) {  // true when the loop above ran down to index 0, i.e. eob >= 2
+ const int pos = scan[c];
+ const tran_low_t v = qcoeff[pos];
+ const int coeff_ctx = coeff_contexts[pos];
+ const int sign = v >> 31;
+ const int level = (v ^ sign) - sign;
+ cost += base_cost[coeff_ctx][AOMMIN(level, 3)];
+
+ if (v) {
+ // sign bit cost
+ const int sign01 = (sign ^ sign) - sign;  // equals -sign: 0 or 1
+ const int dc_sign_ctx = txb_ctx->dc_sign_ctx;
+ cost += coeff_costs->dc_sign_cost[dc_sign_ctx][sign01];
+ if (level > NUM_BASE_LEVELS) {
+ const int ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ const int base_range =
+ AOMMIN(level - 1 - NUM_BASE_LEVELS, COEFF_BASE_RANGE);
+ cost += lps_cost[ctx][base_range];
+ cost += get_golomb_cost(level);
+ }
+ }
+ }
+ return cost;
+}
+
+// Rate of coding the transform block `block` of `plane`. An empty block
+// (eob == 0) costs only the all-zero flag; otherwise the work is delegated to
+// warehouse_efficients_txb(), called with a literal transform class in each
+// case so that the force-inlined callee sees a compile-time constant.
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+                        const int plane, const int block, const TX_SIZE tx_size,
+                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx) {
+  const struct macroblock_plane *p = &x->plane[plane];
+  const int eob = p->eobs[block];
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const LV_MAP_COEFF_COST *const coeff_costs =
+      &x->coeff_costs[txs_ctx][plane_type];
+
+  if (eob == 0) {
+    return coeff_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const TX_CLASS tx_class = tx_type_to_class[tx_type];
+
+  switch (tx_class) {
+    case TX_CLASS_2D:
+      return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,
+                                      eob, plane_type, coeff_costs, xd, tx_type,
+                                      TX_CLASS_2D);
+    case TX_CLASS_HORIZ:
+      return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,
+                                      eob, plane_type, coeff_costs, xd, tx_type,
+                                      TX_CLASS_HORIZ);
+    case TX_CLASS_VERT:
+      return warehouse_efficients_txb(cm, x, plane, block, tx_size, txb_ctx, p,
+                                      eob, plane_type, coeff_costs, xd, tx_type,
+                                      TX_CLASS_VERT);
+    default: assert(false); return 0;
+  }
+}
+
+static int optimize_txb(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+ const LV_MAP_EOB_COST *txb_eob_costs, int *rate_cost) { /*
+ * Greedy rate-distortion optimisation of one transform block, walking the
+ * coefficients from the last one back to scan index 0. For each non-zero
+ * coefficient it may replace the value with the candidate low_qc, and while
+ * the eob is still movable it may truncate the block at the current
+ * position. Finally the whole block is compared against coding it as all
+ * zero. txb_info (qcoeff, dqcoeff, levels, eob) is updated in place.
+ * Returns 1 if anything changed, 0 otherwise. *rate_cost receives the rate
+ * of the chosen result; it is NOT written when eob is 0 on entry.
+ */
+ int update = 0;
+ if (txb_info->eob == 0) return update;
+ const int16_t *const scan = txb_info->scan_order->scan;
+ // rate of signalling the initial eob position
+ const int init_eob = txb_info->eob;
+ const TX_CLASS tx_class = tx_type_to_class[txb_info->tx_type];
+ const int eob_cost =
+ get_eob_cost(init_eob, txb_eob_costs, txb_costs, tx_class);
+
+ // backward optimize the level-k map
+ int accu_rate = eob_cost;  // rate accumulated for the coefficients visited so far
+ int64_t accu_dist = 0;  // matching distortion, relative to zeroing each coefficient
+ int64_t prev_eob_rd_cost = INT64_MAX;
+ int64_t cur_eob_rd_cost = 0;
+
+ {
+ const int si = init_eob - 1;
+ const int coeff_idx = scan[si];
+ LevelDownStats stats;
+ get_dist_cost_stats(&stats, si, si == init_eob - 1, txb_costs, txb_info,
+ tx_class);  // is_eob argument is always 1 here
+ if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+ update = 1;
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+
+ int si = init_eob - 2;
+ int8_t has_nz_tail = 0;  // non-zero coefficients kept unchanged since the last eob move
+ // eob is not fixed
+ for (; si >= 0 && has_nz_tail < 2; --si) {
+ assert(si != init_eob - 1);
+ const int coeff_idx = scan[si];
+ tran_low_t qc = txb_info->qcoeff[coeff_idx];
+
+ if (qc == 0) {
+ const int coeff_ctx =
+ get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+ txb_info->tx_size, tx_class);
+ accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ LevelDownStats stats;
+ get_dist_cost_stats_with_eob(&stats, si, txb_costs, txb_info, tx_class);
+ // check if it is better to make this the last significant coefficient
+ int cur_eob_rate =
+ get_eob_cost(si + 1, txb_eob_costs, txb_costs, tx_class);
+ cur_eob_rd_cost = RDCOST(txb_info->rdmult, cur_eob_rate, 0);
+ prev_eob_rd_cost =
+ RDCOST(txb_info->rdmult, accu_rate, accu_dist) + stats.nz_rd;
+ if (cur_eob_rd_cost <= prev_eob_rd_cost) {
+ update = 1;
+ for (int j = si + 1; j < txb_info->eob; j++) {  // zero everything after the new last coefficient
+ const int coeff_pos_j = scan[j];
+ update_coeff(coeff_pos_j, 0, txb_info);
+ }
+ txb_info->eob = si + 1;
+
+ // rerun cost calculation due to change of eob
+ accu_rate = cur_eob_rate;
+ accu_dist = 0;
+ get_dist_cost_stats(&stats, si, 1, txb_costs, txb_info, tx_class);
+ if ((stats.rd_low < stats.rd) && (stats.low_qc != 0)) {
+ update = 1;
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+
+ // reset non zero tail when new eob is found
+ has_nz_tail = 0;
+ } else {
+ int bUpdCoeff = 0;
+ if (stats.rd_low < stats.rd) {
+ if ((si < txb_info->eob - 1)) {
+ bUpdCoeff = 1;
+ update = 1;
+ }
+ } else {
+ ++has_nz_tail;
+ }
+
+ if (bUpdCoeff) {
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+ }
+ } // for (si)
+
+ // eob is fixed
+ for (; si >= 0; --si) {
+ assert(si != init_eob - 1);
+ const int coeff_idx = scan[si];
+ tran_low_t qc = txb_info->qcoeff[coeff_idx];
+
+ if (qc == 0) {
+ const int coeff_ctx =
+ get_lower_levels_ctx(txb_info->levels, coeff_idx, txb_info->bwl,
+ txb_info->tx_size, tx_class);
+ accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ LevelDownStats stats;
+ get_dist_cost_stats(&stats, si, 0, txb_costs, txb_info, tx_class);
+
+ int bUpdCoeff = 0;
+ if (stats.rd_low < stats.rd) {
+ if ((si < txb_info->eob - 1)) {
+ bUpdCoeff = 1;
+ update = 1;
+ }
+ }
+ if (bUpdCoeff) {
+ update_coeff(coeff_idx, stats.low_qc, txb_info);
+ accu_rate += stats.rate_low;
+ accu_dist += stats.dist_low;
+ } else {
+ accu_rate += stats.rate;
+ accu_dist += stats.dist;
+ }
+ }
+ } // for (si)
+
+ int non_zero_blk_rate =
+ txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][0];
+ prev_eob_rd_cost =
+ RDCOST(txb_info->rdmult, accu_rate + non_zero_blk_rate, accu_dist);
+
+ int zero_blk_rate =
+ txb_costs->txb_skip_cost[txb_info->txb_ctx->txb_skip_ctx][1];
+ int64_t zero_blk_rd_cost = RDCOST(txb_info->rdmult, zero_blk_rate, 0);
+ if (zero_blk_rd_cost <= prev_eob_rd_cost) {  // coding the block as all zero is at least as good
+ update = 1;
+ for (int j = 0; j < txb_info->eob; j++) {
+ const int coeff_pos_j = scan[j];
+ update_coeff(coeff_pos_j, 0, txb_info);
+ }
+ txb_info->eob = 0;
+ }
+
+ // record total rate cost
+ *rate_cost = zero_blk_rd_cost <= prev_eob_rd_cost
+ ? zero_blk_rate
+ : accu_rate + non_zero_blk_rate;
+
+ if (txb_info->eob > 0) {
+ *rate_cost += txb_info->tx_type_cost;
+ }
+
+ return update;
+}
+
+// Multiplier table dimensioned [REF_TYPES][PLANE_TYPES]; the numbers are empirically obtained. NOTE(review): not referenced in this part of the file - presumably scales rdmult per reference/plane type; confirm.
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+ { 17, 13 },
+ { 16, 10 },
+};
+
+// Allocates and zeroes the hash table of cached optimisation results
+// (HBT_TABLE_SIZE * HBT_ARRAY_LENGTH entries) and initialises the CRC
+// calculator used to build the hashes.
+// On allocation failure the table pointer stays NULL and hbt_needs_init is
+// left set, so nothing is written through a null pointer here and a later
+// call can retry; callers must not use hbt_hash_table while it is NULL.
+void hbt_init() {
+  const size_t table_bytes =
+      sizeof(OptTxbQcoeff) * HBT_TABLE_SIZE * HBT_ARRAY_LENGTH;
+  hbt_hash_table = aom_malloc(table_bytes);
+  if (hbt_hash_table == NULL) return;
+
+  memset(hbt_hash_table, 0, table_bytes);
+  av1_crc32c_calculator_init(&crc_calculator);  // 31 bit: qc & ctx
+
+  hbt_needs_init = 0;
+}
+
+// Releases the hash table. The pointer is cleared and hbt_needs_init is set
+// again so that a later hbt_create_hashes() re-allocates the table instead of
+// using freed memory, and a repeated hbt_destroy() does not free it twice.
+void hbt_destroy() {
+  aom_free(hbt_hash_table);
+  hbt_hash_table = NULL;
+  hbt_needs_init = 1;
+}
+
+// Handles a hash-table miss: runs optimize_txb() on the block and overwrites
+// the table entry selected by the two hashes with the resulting rate and with
+// the per-coefficient change ("delta") made by the optimisation, so that a
+// later hit can replay it.
+// Delta convention: a move towards zero is stored as a negative number,
+// whatever the sign of the coefficient. Returns the (possibly reduced) eob.
+// Precondition: txb_info->eob <= HBT_EOB (the local snapshot and the deltas
+// array hold HBT_EOB entries). `fast_mode` is unused.
+int hbt_hash_miss(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                  TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                  const LV_MAP_EOB_COST *txb_eob_costs,
+                  const struct macroblock_plane *p, int block, int fast_mode,
+                  int *rate_cost) {
+  (void)fast_mode;
+  const int16_t *scan = txb_info->scan_order->scan;
+  const int prev_eob = txb_info->eob;
+  assert(HBT_EOB <= 16);  // Lengthen array if allowing longer eob.
+  assert(prev_eob <= HBT_EOB);  // Otherwise prev_coeff/deltas would overflow.
+  int32_t prev_coeff[16];
+  for (int i = 0; i < prev_eob; i++) {
+    prev_coeff[i] = txb_info->qcoeff[scan[i]];
+  }
+  for (int i = prev_eob; i < HBT_EOB; i++) {
+    prev_coeff[i] = 0;  // For compiler peace of mind.
+  }
+
+  av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+                      txb_info->levels);
+
+  const int update =
+      optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+  // Overwrite old entry
+  const uint16_t hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+  const uint16_t hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+  OptTxbQcoeff *const entry =
+      &hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index];
+  entry->rate_cost = *rate_cost;
+  entry->init = 1;
+  entry->hbt_qc_hash = hbt_qc_hash;
+  entry->hbt_ctx_hash = hbt_ctx_hash;
+
+  assert(prev_eob >= txb_info->eob);  // eob can't get longer
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Record how coeff changed. Convention: towards zero is negative.
+    // The sign is taken from the ORIGINAL coefficient: a positive value that
+    // was reduced to zero must still yield a negative delta, otherwise a
+    // later hit would skip it.
+    const int32_t new_coeff = txb_info->qcoeff[scan[i]];
+    if (prev_coeff[i] > 0)
+      entry->deltas[i] = new_coeff - prev_coeff[i];
+    else
+      entry->deltas[i] = prev_coeff[i] - new_coeff;
+  }
+  for (int i = txb_info->eob; i < prev_eob; i++) {
+    // If eob got shorter, record that all after it changed to zero.
+    if (prev_coeff[i] > 0)
+      entry->deltas[i] = -prev_coeff[i];
+    else
+      entry->deltas[i] = prev_coeff[i];
+  }
+  for (int i = prev_eob; i < HBT_EOB; i++) {
+    // Record 'no change' after optimized coefficients run out.
+    entry->deltas[i] = 0;
+  }
+
+  if (update) {
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+  }
+  return txb_info->eob;
+}
+
+// Handles a hash-table hit: replays the cached deltas of the selected entry
+// onto the block's quantized coefficients, takes the cached rate, and, if any
+// coefficient changed, refreshes eob and the entropy context stored in `p`.
+// Returns the resulting eob.
+int hbt_hash_hit(uint32_t hbt_table_index, int hbt_array_index,
+                 TxbInfo *txb_info, const struct macroblock_plane *p, int block,
+                 int *rate_cost) {
+  const int16_t *scan = txb_info->scan_order->scan;
+  const OptTxbQcoeff *const entry =
+      &hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index];
+  int changed = 0;
+  int last_nz_plus_one = 0;
+
+  for (int i = 0; i < txb_info->eob; i++) {
+    const int pos = scan[i];
+    const int delta = entry->deltas[i];
+    // Delta convention is negatives go towards zero, so only apply those ones.
+    if (delta < 0) {
+      if (txb_info->qcoeff[pos] > 0) {
+        txb_info->qcoeff[pos] += delta;
+      } else {
+        txb_info->qcoeff[pos] -= delta;
+      }
+      changed = 1;
+      update_coeff(pos, txb_info->qcoeff[pos], txb_info);
+    }
+    if (txb_info->qcoeff[pos]) last_nz_plus_one = i + 1;
+  }
+
+  // Rate_cost can be calculated here instead (av1_cost_coeffs_txb), but
+  // it is expensive and gives little benefit as long as qc_hash is high bit
+  *rate_cost = entry->rate_cost;
+
+  if (changed) {
+    txb_info->eob = last_nz_plus_one;
+    p->eobs[block] = txb_info->eob;
+    p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+        txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+  }
+
+  return txb_info->eob;
+}
+
+// Looks up the table entry addressed by the two hashes. If the entry is
+// initialised and both stored hashes match, the cached result is replayed
+// (hbt_hash_hit); otherwise the block is optimised and the entry is
+// overwritten (hbt_hash_miss). Returns the resulting eob.
+int hbt_search_match(uint32_t hbt_ctx_hash, uint32_t hbt_qc_hash,
+                     TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                     const LV_MAP_EOB_COST *txb_eob_costs,
+                     const struct macroblock_plane *p, int block, int fast_mode,
+                     int *rate_cost) {
+  const int hbt_array_index = hbt_qc_hash % HBT_ARRAY_LENGTH;
+  const int hbt_table_index = hbt_ctx_hash % HBT_TABLE_SIZE;
+  const OptTxbQcoeff *const entry =
+      &hbt_hash_table[hbt_table_index * HBT_ARRAY_LENGTH + hbt_array_index];
+
+  // Check for qcoeff match
+  const int is_hit = entry->hbt_qc_hash == hbt_qc_hash &&
+                     entry->hbt_ctx_hash == hbt_ctx_hash && entry->init;
+  if (!is_hit) {
+    return hbt_hash_miss(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+                         txb_eob_costs, p, block, fast_mode, rate_cost);
+  }
+  return hbt_hash_hit(hbt_table_index, hbt_array_index, txb_info, p, block,
+                      rate_cost);
+}
+
+// Builds the two CRC32C hashes that key the hash-based trellis (hbt) cache and
+// hands them to hbt_search_match().
+//
+//  - qc_hash:  magnitudes of the first `eob` quantized coefficients in scan
+//              order, packed two bits per coefficient.
+//  - ctx_hash: low byte of each (tcoeff - dqcoeff) delta, followed by the low
+//              byte of the dequant values, txb contexts, eob, rdmult and
+//              tx_type, and the high byte of selected cost-table entries.
+//
+// If any coefficient magnitude exceeds HBT_KICKOUT the cache is bypassed:
+// optimize_txb() is run directly and txb_info->eob is returned. Otherwise the
+// return value is whatever hbt_search_match() returns. rate_cost is forwarded
+// untouched to whichever of the two does the work.
+int hbt_create_hashes(TxbInfo *txb_info, const LV_MAP_COEFF_COST *txb_costs,
+                      const LV_MAP_EOB_COST *txb_eob_costs,
+                      const struct macroblock_plane *p, int block,
+                      int fast_mode, int *rate_cost) {
+  // Initialize hash table if needed.
+  if (hbt_needs_init) {
+    hbt_init();
+  }
+
+  //// Hash creation
+  uint8_t txb_hash_data[256];  // Asserts below to ensure enough space.
+  const int16_t *scan = txb_info->scan_order->scan;
+  uint8_t chunk = 0;
+  int hash_data_index = 0;
+
+  // If this kickout value is removed or raised beyond int8_t,
+  // widen deltas type in OptTxbQcoeff struct.
+  assert((int8_t)HBT_KICKOUT == HBT_KICKOUT);  // If not, widen types.
+
+  // Make qc_hash.
+  int packing_index = 0;  // Bit offset inside the byte currently being filled.
+  for (int i = 0; i < txb_info->eob; i++) {
+    tran_low_t prechunk = txb_info->qcoeff[scan[i]];
+
+    // Softening: Improves speed. Aligns with signed deltas.
+    if (prechunk < 0) prechunk *= -1;
+
+    // Early kick out: Don't apply feature if there are large coeffs.
+    if (prechunk > HBT_KICKOUT) {
+      av1_txb_init_levels(txb_info->qcoeff, txb_info->width, txb_info->height,
+                          txb_info->levels);
+
+      const int update =
+          optimize_txb(txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+      if (update) {
+        p->eobs[block] = txb_info->eob;
+        p->txb_entropy_ctx[block] = av1_get_txb_entropy_context(
+            txb_info->qcoeff, txb_info->scan_order, txb_info->eob);
+      }
+      return txb_info->eob;
+    }
+
+    // Since coeffs are 0 to 3, only 2 bits are needed: pack into bytes
+    if (packing_index == 0) txb_hash_data[hash_data_index] = 0;
+    chunk = prechunk << packing_index;
+    packing_index += 2;
+    txb_hash_data[hash_data_index] |= chunk;
+
+    // Full byte:
+    if (packing_index == 8) {
+      packing_index = 0;
+      hash_data_index++;
+    }
+  }
+  // Count the trailing, partially filled byte. When eob is a multiple of four
+  // the last byte was already completed (and counted) inside the loop;
+  // counting one more would feed a byte that was never written into the CRC,
+  // making the hash depend on stack garbage.
+  if (packing_index != 0) hash_data_index++;
+  assert(hash_data_index <= 64);
+  // 31 bit qc_hash: index to array
+  uint32_t hbt_qc_hash =
+      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+
+  // Make ctx_hash.
+  hash_data_index = 0;
+  tran_low_t prechunk;
+
+  for (int i = 0; i < txb_info->eob; i++) {
+    // Save as magnitudes towards or away from zero.
+    if (txb_info->tcoeff[scan[i]] >= 0)
+      prechunk = txb_info->tcoeff[scan[i]] - txb_info->dqcoeff[scan[i]];
+    else
+      prechunk = txb_info->dqcoeff[scan[i]] - txb_info->tcoeff[scan[i]];
+
+    chunk = prechunk & 0xff;
+    txb_hash_data[hash_data_index++] = chunk;
+  }
+
+  // Extra ctx data:
+  // Include dequants.
+  txb_hash_data[hash_data_index++] = txb_info->dequant[0] & 0xff;
+  txb_hash_data[hash_data_index++] = txb_info->dequant[1] & 0xff;
+  chunk = txb_info->txb_ctx->txb_skip_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  chunk = txb_info->txb_ctx->dc_sign_ctx & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // eob
+  chunk = txb_info->eob & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // rdmult (int64); only the low byte is hashed.
+  chunk = txb_info->rdmult & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // tx_type
+  chunk = txb_info->tx_type & 0xff;
+  txb_hash_data[hash_data_index++] = chunk;
+  // base_eob_cost
+  for (int i = 1; i < 3; i++) {  // i = 0 are softened away
+    for (int j = 0; j < SIG_COEF_CONTEXTS_EOB; j++) {
+      chunk = (txb_costs->base_eob_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }
+  // eob_cost
+  for (int i = 0; i < 11; i++) {
+    for (int j = 0; j < 2; j++) {
+      chunk = (txb_eob_costs->eob_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }
+  // dc_sign_cost
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < DC_SIGN_CONTEXTS; j++) {
+      chunk = (txb_costs->dc_sign_cost[j][i] & 0xff00) >> 8;
+      txb_hash_data[hash_data_index++] = chunk;
+    }
+  }
+
+  assert(hash_data_index <= 256);
+  // 31 bit ctx_hash: used to index table
+  uint32_t hbt_ctx_hash =
+      av1_get_crc32c_value(&crc_calculator, txb_hash_data, hash_data_index);
+  //// End hash creation
+
+  return hbt_search_match(hbt_ctx_hash, hbt_qc_hash, txb_info, txb_costs,
+                          txb_eob_costs, p, block, fast_mode, rate_cost);
+}
+
+// Rate of coding magnitude abs_qc at position ci, for the common case where
+// the coefficient is neither the DC term (scan_idx != 0) nor the last one
+// (scan_idx != eob - 1). Sums the base-level cost, av1_cost_literal(1) for a
+// nonzero value and, above NUM_BASE_LEVELS, the base-range and golomb costs.
+static AOM_FORCE_INLINE int get_coeff_cost_simple(
+    int ci, tran_low_t abs_qc, int coeff_ctx,
+    const LV_MAP_COEFF_COST *txb_costs, int bwl, TX_CLASS tx_class,
+    const uint8_t *levels) {
+  assert(ci > 0);
+  const int base = txb_costs->base_cost[coeff_ctx][AOMMIN(abs_qc, 3)];
+  if (abs_qc == 0) return base;
+
+  int total = base + av1_cost_literal(1);
+  if (abs_qc <= NUM_BASE_LEVELS) return total;
+
+  const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+  total += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+  total += get_golomb_cost(abs_qc);
+  return total;
+}
+
+// Rate of coding magnitude abs_qc at position ci with no assumption about
+// where the coefficient sits in the scan.
+//  - is_last selects base_eob_cost (indexed by min(abs_qc, 3) - 1) instead of
+//    base_cost (indexed by min(abs_qc, 3)).
+//  - For a nonzero value, ci == 0 adds dc_sign_cost[dc_sign_ctx][sign]; any
+//    other position adds av1_cost_literal(1).
+//  - Above NUM_BASE_LEVELS the base-range and golomb costs are added.
+static INLINE int get_coeff_cost_general(int is_last, int ci, tran_low_t abs_qc,
+                                         int sign, int coeff_ctx,
+                                         int dc_sign_ctx,
+                                         const LV_MAP_COEFF_COST *txb_costs,
+                                         int bwl, TX_CLASS tx_class,
+                                         const uint8_t *levels) {
+  const int base_level = AOMMIN(abs_qc, 3);
+  int total = is_last ? txb_costs->base_eob_cost[coeff_ctx][base_level - 1]
+                      : txb_costs->base_cost[coeff_ctx][base_level];
+  if (abs_qc == 0) return total;
+
+  total += (ci == 0) ? txb_costs->dc_sign_cost[dc_sign_ctx][sign]
+                     : av1_cost_literal(1);
+
+  if (abs_qc > NUM_BASE_LEVELS) {
+    const int br_ctx = get_br_ctx(levels, ci, bwl, tx_class);
+    total += get_br_cost(abs_qc, br_ctx, txb_costs->lps_cost[br_ctx]);
+    total += get_golomb_cost(abs_qc);
+  }
+  return total;
+}
+
+// Computes the candidate obtained by lowering the magnitude abs_qc by one:
+// *qc_low receives the signed quantized value and *dqc_low its dequantized
+// counterpart ((abs_qc - 1) * dqv) >> shift, with the sign re-applied.
+// sign is expected to be 0 (non-negative) or 1 (negative); the asserts check
+// the branch-free negation against the plain conditional form.
+static INLINE void get_qc_dqc_low(tran_low_t abs_qc, int sign, int dqv,
+                                  int shift, tran_low_t *qc_low,
+                                  tran_low_t *dqc_low) {
+  const int neg_mask = -sign;  // 0 when sign == 0, all ones when sign == 1.
+
+  const tran_low_t lowered_mag = abs_qc - 1;
+  *qc_low = (neg_mask ^ lowered_mag) + sign;
+  assert((sign ? -lowered_mag : lowered_mag) == *qc_low);
+
+  const tran_low_t lowered_dq_mag = (lowered_mag * dqv) >> shift;
+  *dqc_low = (neg_mask ^ lowered_dq_mag) + sign;
+  assert((sign ? -lowered_dq_mag : lowered_dq_mag) == *dqc_low);
+}
+
+// Decides, for the coefficient at scan index si, whether to keep its quantized
+// value or lower its magnitude by one, whichever has the smaller RDCOST.
+// Works for any scan position (DC and last coefficient included).
+// On return *accu_rate has grown by the rate of the chosen value and
+// *accu_dist by its distortion relative to quantizing the coefficient to zero.
+// When the lowered value wins, qcoeff, dqcoeff and levels are updated in place.
+static INLINE void update_coeff_general(
+    int *accu_rate, int64_t *accu_dist, int si, int eob, TX_SIZE tx_size,
+    TX_CLASS tx_class, int bwl, int height, int64_t rdmult, int shift,
+    int dc_sign_ctx, const int16_t *dequant, const int16_t *scan,
+    const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+    tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels) {
+  const int dqv = dequant[si != 0];
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int is_last = si == (eob - 1);
+  const int coeff_ctx = get_lower_levels_ctx_general(
+      is_last, si, bwl, height, levels, ci, tx_size, tx_class);
+
+  // A zero coefficient has nothing to lower; only its rate is accumulated.
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+    return;
+  }
+
+  const int sign = (qc < 0) ? 1 : 0;
+  const tran_low_t abs_qc = abs(qc);
+  const tran_low_t tqc = tcoeff[ci];
+  const tran_low_t dqc = dqcoeff[ci];
+
+  // Candidate 1: keep the current value.
+  const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+  const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+  const int rate =
+      get_coeff_cost_general(is_last, ci, abs_qc, sign, coeff_ctx,
+                             dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+  const int64_t rd = RDCOST(rdmult, rate, dist);
+
+  // Candidate 2: magnitude lowered by one.
+  tran_low_t qc_low, dqc_low;
+  get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+  const tran_low_t abs_qc_low = abs_qc - 1;
+  const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+  const int rate_low =
+      get_coeff_cost_general(is_last, ci, abs_qc_low, sign, coeff_ctx,
+                             dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+  const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+  const int take_low = rd_low < rd;
+  if (take_low) {
+    qcoeff[ci] = qc_low;
+    dqcoeff[ci] = dqc_low;
+    levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+  }
+  *accu_rate += take_low ? rate_low : rate;
+  *accu_dist += (take_low ? dist_low : dist) - dist0;
+}
+
+// Cheaper variant of the keep-or-lower decision for coefficients that are
+// neither DC (si > 0) nor the last one (si != eob - 1). Only the rate is
+// accumulated. If the dequantized magnitude is already below the transform
+// coefficient's magnitude, the current value is kept without evaluating the
+// lowered candidate.
+static AOM_FORCE_INLINE void update_coeff_simple(
+    int *accu_rate, int si, int eob, TX_SIZE tx_size, TX_CLASS tx_class,
+    int bwl, int64_t rdmult, int shift, const int16_t *dequant,
+    const int16_t *scan, const LV_MAP_COEFF_COST *txb_costs,
+    const tran_low_t *tcoeff, tran_low_t *qcoeff, tran_low_t *dqcoeff,
+    uint8_t *levels) {
+  const int dqv = dequant[1];
+  (void)eob;  // Only referenced by the assert below.
+  assert(si != eob - 1);
+  assert(si > 0);
+
+  const int ci = scan[si];
+  const tran_low_t qc = qcoeff[ci];
+  const int coeff_ctx =
+      get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+
+  if (qc == 0) {
+    *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+    return;
+  }
+
+  const tran_low_t abs_qc = abs(qc);
+  const tran_low_t tqc = tcoeff[ci];
+  const tran_low_t dqc = dqcoeff[ci];
+  const int rate = get_coeff_cost_simple(ci, abs_qc, coeff_ctx, txb_costs,
+                                         bwl, tx_class, levels);
+
+  // Early out: keep the value when the reconstruction undershoots the target.
+  if (abs(dqc) < abs(tqc)) {
+    *accu_rate += rate;
+    return;
+  }
+
+  const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+  const int64_t rd = RDCOST(rdmult, rate, dist);
+
+  const int sign = (qc < 0) ? 1 : 0;
+  tran_low_t qc_low, dqc_low;
+  get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+  const tran_low_t abs_qc_low = abs_qc - 1;
+  const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift);
+  const int rate_low = get_coeff_cost_simple(ci, abs_qc_low, coeff_ctx,
+                                             txb_costs, bwl, tx_class, levels);
+  const int64_t rd_low = RDCOST(rdmult, rate_low, dist_low);
+
+  if (rd_low >= rd) {
+    *accu_rate += rate;
+    return;
+  }
+  qcoeff[ci] = qc_low;
+  dqcoeff[ci] = dqc_low;
+  levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+  *accu_rate += rate_low;
+}
+
+// Keep / lower / truncate decision for the coefficient at scan index si, used
+// while only a few nonzero coefficients (tracked in nz_ci[0..*nz_num)) lie
+// after it in scan order. Four candidates are compared with RDCOST:
+//   - keep the value, or lower its magnitude by one, with the current eob;
+//   - make this coefficient the last one (new eob = si + 1), again either
+//     kept or lowered, zeroing every coefficient listed in nz_ci.
+// The new-eob candidates are only taken when sharpness == 0.
+// Updates *accu_rate, *accu_dist, *eob, *nz_num, nz_ci, qcoeff, dqcoeff and
+// levels in place. Must not be called on the current last coefficient.
+static AOM_FORCE_INLINE void update_coeff_eob(
+ int *accu_rate, int64_t *accu_dist, int *eob, int *nz_num, int *nz_ci,
+ int si, TX_SIZE tx_size, TX_CLASS tx_class, int bwl, int height,
+ int dc_sign_ctx, int64_t rdmult, int shift, const int16_t *dequant,
+ const int16_t *scan, const LV_MAP_EOB_COST *txb_eob_costs,
+ const LV_MAP_COEFF_COST *txb_costs, const tran_low_t *tcoeff,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, uint8_t *levels, int sharpness) {
+ const int dqv = dequant[si != 0];
+ assert(si != *eob - 1);
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const int coeff_ctx =
+ get_lower_levels_ctx(levels, ci, bwl, tx_size, tx_class);
+ if (qc == 0) {
+ *accu_rate += txb_costs->base_cost[coeff_ctx][0];
+ } else {
+ int lower_level = 0;
+ const tran_low_t abs_qc = abs(qc);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int sign = (qc < 0) ? 1 : 0;
+ // Distortions are expressed relative to quantizing this coefficient to 0.
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+ int64_t dist = get_coeff_dist(tqc, dqc, shift) - dist0;
+ int rate =
+ get_coeff_cost_general(0, ci, abs_qc, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ // Unlike the other update_* helpers, rd here includes the accumulated
+ // rate/distortion so it can be compared against the new-eob candidates.
+ int64_t rd = RDCOST(rdmult, *accu_rate + rate, *accu_dist + dist);
+
+ tran_low_t qc_low, dqc_low;
+ get_qc_dqc_low(abs_qc, sign, dqv, shift, &qc_low, &dqc_low);
+ const tran_low_t abs_qc_low = abs_qc - 1;
+ const int64_t dist_low = get_coeff_dist(tqc, dqc_low, shift) - dist0;
+ const int rate_low =
+ get_coeff_cost_general(0, ci, abs_qc_low, sign, coeff_ctx, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ const int64_t rd_low =
+ RDCOST(rdmult, *accu_rate + rate_low, *accu_dist + dist_low);
+
+ int lower_level_new_eob = 0;
+ const int new_eob = si + 1;
+ // Temporarily clear the levels of the later nonzero coefficients so the
+ // contexts below are computed as if this coefficient were the last one.
+ // The saved values are restored further down if the eob is not moved.
+ uint8_t tmp_levels[3];
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ const int last_ci = nz_ci[ni];
+ tmp_levels[ni] = levels[get_padded_idx(last_ci, bwl)];
+ levels[get_padded_idx(last_ci, bwl)] = 0;
+ }
+
+ const int coeff_ctx_new_eob = get_lower_levels_ctx_general(
+ 1, si, bwl, height, levels, ci, tx_size, tx_class);
+ const int new_eob_cost =
+ get_eob_cost(new_eob, txb_eob_costs, txb_costs, tx_class);
+ // With a new eob everything accumulated so far is discarded, so the
+ // candidate's rate/distortion stand on their own.
+ int rate_coeff_eob =
+ new_eob_cost + get_coeff_cost_general(1, ci, abs_qc, sign,
+ coeff_ctx_new_eob, dc_sign_ctx,
+ txb_costs, bwl, tx_class, levels);
+ int64_t dist_new_eob = dist;
+ int64_t rd_new_eob = RDCOST(rdmult, rate_coeff_eob, dist_new_eob);
+
+ // The last coefficient must be nonzero, so the lowered variant is only a
+ // valid new-eob candidate when its magnitude stays above zero.
+ if (abs_qc_low > 0) {
+ const int rate_coeff_eob_low =
+ new_eob_cost +
+ get_coeff_cost_general(1, ci, abs_qc_low, sign, coeff_ctx_new_eob,
+ dc_sign_ctx, txb_costs, bwl, tx_class, levels);
+ const int64_t dist_new_eob_low = dist_low;
+ const int64_t rd_new_eob_low =
+ RDCOST(rdmult, rate_coeff_eob_low, dist_new_eob_low);
+ if (rd_new_eob_low < rd_new_eob) {
+ lower_level_new_eob = 1;
+ rd_new_eob = rd_new_eob_low;
+ rate_coeff_eob = rate_coeff_eob_low;
+ dist_new_eob = dist_new_eob_low;
+ }
+ }
+
+ if (rd_low < rd) {
+ lower_level = 1;
+ rd = rd_low;
+ rate = rate_low;
+ dist = dist_low;
+ }
+
+ if (sharpness == 0 && rd_new_eob < rd) {
+ // Move the eob: drop the later coefficients (their levels are already 0)
+ // and restart the accumulators from this coefficient.
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ int last_ci = nz_ci[ni];
+ // levels[get_padded_idx(last_ci, bwl)] = 0;
+ qcoeff[last_ci] = 0;
+ dqcoeff[last_ci] = 0;
+ }
+ *eob = new_eob;
+ *nz_num = 0;
+ *accu_rate = rate_coeff_eob;
+ *accu_dist = dist_new_eob;
+ lower_level = lower_level_new_eob;
+ } else {
+ // Keep the eob: undo the temporary clearing of levels.
+ for (int ni = 0; ni < *nz_num; ++ni) {
+ const int last_ci = nz_ci[ni];
+ levels[get_padded_idx(last_ci, bwl)] = tmp_levels[ni];
+ }
+ *accu_rate += rate;
+ *accu_dist += dist;
+ }
+
+ if (lower_level) {
+ qcoeff[ci] = qc_low;
+ dqcoeff[ci] = dqc_low;
+ levels[get_padded_idx(ci, bwl)] = AOMMIN(abs_qc_low, INT8_MAX);
+ }
+ // Track this coefficient as one of the trailing nonzero coefficients.
+ if (qcoeff[ci]) {
+ nz_ci[*nz_num] = ci;
+ ++*nz_num;
+ }
+ }
+}
+
+// Final check of the backward pass: compares coding the block as it stands
+// (accumulated rate plus non_skip_cost, accumulated distortion) against
+// signalling an all-zero block (skip_cost, zero distortion). When the all-zero
+// option is cheaper and sharpness == 0, the coefficients listed in
+// nz_ci[0..nz_num) are zeroed and *accu_rate and *eob are reset to 0.
+static INLINE void update_skip(int *accu_rate, int64_t accu_dist, int *eob,
+                               int nz_num, int *nz_ci, int64_t rdmult,
+                               int skip_cost, int non_skip_cost,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               int sharpness) {
+  const int64_t rd = RDCOST(rdmult, *accu_rate + non_skip_cost, accu_dist);
+  const int64_t rd_new_eob = RDCOST(rdmult, skip_cost, 0);
+  if (sharpness != 0 || rd_new_eob >= rd) return;
+
+  // levels is intentionally left alone: this is the last step that reads it.
+  for (int k = 0; k < nz_num; ++k) {
+    const int pos = nz_ci[k];
+    qcoeff[pos] = 0;
+    dqcoeff[pos] = 0;
+  }
+  *accu_rate = 0;
+  *eob = 0;
+}
+
+// Rate-distortion optimizes the quantized coefficients of one transform block
+// with a single backward pass over the scan order:
+//   1. the last coefficient (scan index eob - 1) is handled on its own;
+//   2. update_coeff_eob() runs while at most max_nz_num nonzero coefficients
+//      have been seen, and may move the eob earlier;
+//   3. if that loop consumed every coefficient, update_skip() decides whether
+//      to zero the whole block;
+//   4. update_coeff_simple() handles the remaining coefficients down to scan
+//      index 1, and update_coeff_general() the DC coefficient.
+// Writes the resulting eob and entropy context into p->eobs[block] and
+// p->txb_entropy_ctx[block], stores the accumulated rate in *rate_cost and
+// returns the eob.
+// NOTE(review): scan[eob - 1] is read unconditionally, so callers presumably
+// guarantee p->eobs[block] > 0 on entry - confirm.
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+ int block, TX_SIZE tx_size, TX_TYPE tx_type,
+ const TXB_CTX *const txb_ctx, int *rate_cost,
+ int sharpness) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const MB_MODE_INFO *mbmi = xd->mi[0];
+ const struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+ const int16_t *dequant = p->dequant_QTX;
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ assert(width == (1 << bwl));
+ const int is_inter = is_inter_block(mbmi);
+ const SCAN_ORDER *scan_order = get_scan(tx_size, tx_type);
+ const int16_t *scan = scan_order->scan;
+ const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+ const int eob_multi_size = txsize_log2_minus4[tx_size];
+ const LV_MAP_EOB_COST *txb_eob_costs =
+ &x->eob_costs[eob_multi_size][plane_type];
+
+ const int shift = av1_get_tx_scale(tx_size);
+ // The rd multiplier is scaled for bit depth and then right-shifted by
+ // sharpness plus an amount that depends on the AQ / delta-q mode.
+ const int64_t rdmult =
+ ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+ 2) >>
+ (sharpness +
+ (cpi->oxcf.aq_mode == VARIANCE_AQ && mbmi->segment_id < 4
+ ? 7 - mbmi->segment_id
+ : 2) +
+ (cpi->oxcf.aq_mode != VARIANCE_AQ &&
+ cpi->oxcf.deltaq_mode > NO_DELTA_Q && x->sb_energy_level < 0
+ ? (3 - x->sb_energy_level)
+ : 0));
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+
+ av1_txb_init_levels(qcoeff, width, height, levels);
+
+ // TODO(angirbird): check iqmatrix
+
+ const int non_skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][0];
+ const int skip_cost = txb_costs->txb_skip_cost[txb_ctx->txb_skip_ctx][1];
+ int eob = p->eobs[block];
+ const int eob_cost = get_eob_cost(eob, txb_eob_costs, txb_costs, tx_class);
+ int accu_rate = eob_cost;
+ int64_t accu_dist = 0;
+ int si = eob - 1;
+ const int ci = scan[si];
+ const tran_low_t qc = qcoeff[ci];
+ const tran_low_t abs_qc = abs(qc);
+ const int sign = qc < 0;
+ // nz_ci tracks the trailing nonzero coefficients seen so far; it starts with
+ // the last coefficient. update_coeff_eob() may append a third entry before
+ // the loop condition below stops calling it.
+ const int max_nz_num = 2;
+ int nz_num = 1;
+ int nz_ci[3] = { ci, 0, 0 };
+ // Last coefficient: it can only be lowered if that leaves it nonzero.
+ if (abs_qc >= 2) {
+ update_coeff_general(&accu_rate, &accu_dist, si, eob, tx_size, tx_class,
+ bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels);
+ --si;
+ } else {
+ assert(abs_qc == 1);
+ const int coeff_ctx = get_lower_levels_ctx_general(
+ 1, si, bwl, height, levels, ci, tx_size, tx_class);
+ accu_rate += get_coeff_cost_general(1, ci, abs_qc, sign, coeff_ctx,
+ txb_ctx->dc_sign_ctx, txb_costs, bwl,
+ tx_class, levels);
+ const tran_low_t tqc = tcoeff[ci];
+ const tran_low_t dqc = dqcoeff[ci];
+ const int64_t dist = get_coeff_dist(tqc, dqc, shift);
+ const int64_t dist0 = get_coeff_dist(tqc, 0, shift);
+ accu_dist += dist - dist0;
+ --si;
+ }
+
+// One switch case per transform class so the force-inlined helper is called
+// with a literal tx_class.
+#define UPDATE_COEFF_EOB_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 0 && nz_num <= max_nz_num; --si) { \
+ update_coeff_eob(&accu_rate, &accu_dist, &eob, &nz_num, nz_ci, si, \
+ tx_size, tx_class_literal, bwl, height, \
+ txb_ctx->dc_sign_ctx, rdmult, shift, dequant, scan, \
+ txb_eob_costs, txb_costs, tcoeff, qcoeff, dqcoeff, \
+ levels, sharpness); \
+ } \
+ break;
+ switch (tx_class) {
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_EOB_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_EOB_CASE
+ default: assert(false);
+ }
+
+ // Every coefficient was visited by the eob loop: consider an all-zero block.
+ if (si == -1 && nz_num <= max_nz_num) {
+ update_skip(&accu_rate, accu_dist, &eob, nz_num, nz_ci, rdmult, skip_cost,
+ non_skip_cost, qcoeff, dqcoeff, sharpness);
+ }
+
+#define UPDATE_COEFF_SIMPLE_CASE(tx_class_literal) \
+ case tx_class_literal: \
+ for (; si >= 1; --si) { \
+ update_coeff_simple(&accu_rate, si, eob, tx_size, tx_class_literal, bwl, \
+ rdmult, shift, dequant, scan, txb_costs, tcoeff, \
+ qcoeff, dqcoeff, levels); \
+ } \
+ break;
+ switch (tx_class) {
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_2D);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_HORIZ);
+ UPDATE_COEFF_SIMPLE_CASE(TX_CLASS_VERT);
+#undef UPDATE_COEFF_SIMPLE_CASE
+ default: assert(false);
+ }
+
+ // DC position
+ if (si == 0) {
+ // no need to update accu_dist because it's not used after this point
+ int64_t dummy_dist = 0;
+ update_coeff_general(&accu_rate, &dummy_dist, si, eob, tx_size, tx_class,
+ bwl, height, rdmult, shift, txb_ctx->dc_sign_ctx,
+ dequant, scan, txb_costs, tcoeff, qcoeff, dqcoeff,
+ levels);
+ }
+
+ // Add the skip flag cost, plus the tx_type cost for a non-empty block.
+ const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+ if (eob == 0)
+ accu_rate += skip_cost;
+ else
+ accu_rate += non_skip_cost + tx_type_cost;
+
+ p->eobs[block] = eob;
+ p->txb_entropy_ctx[block] =
+ av1_get_txb_entropy_context(qcoeff, scan_order, p->eobs[block]);
+
+ *rate_cost = accu_rate;
+ return eob;
+}
+
+// This function is deprecated, but we keep it here because hash trellis
+// is not integrated with av1_optimize_txb_new yet
+//
+// Gathers everything optimize_txb() needs about one transform block into a
+// TxbInfo and runs the optimization, either directly or, when the hash based
+// trellis speed feature is enabled and 0 < eob <= HBT_EOB, through
+// hbt_create_hashes(). On the direct path p->eobs[block] and
+// p->txb_entropy_ctx[block] are refreshed when optimize_txb() reports an
+// update. Returns the resulting eob; the rate is reported via *rate_cost.
+int av1_optimize_txb(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                     int blk_row, int blk_col, int block, TX_SIZE tx_size,
+                     TXB_CTX *txb_ctx, int fast_mode, int *rate_cost) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+  const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+  const int eob = p->eobs[block];
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const tran_low_t *tcoeff = BLOCK_OFFSET(p->coeff, block);
+  const int16_t *dequant = p->dequant_QTX;
+  const int seg_eob = av1_get_max_eob(tx_size);
+  const int bwl = get_txb_bwl(tx_size);
+  const int width = get_txb_wide(tx_size);
+  const int height = get_txb_high(tx_size);
+  const int is_inter = is_inter_block(mbmi);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const LV_MAP_COEFF_COST *txb_costs = &x->coeff_costs[txs_ctx][plane_type];
+  const int eob_multi_size = txsize_log2_minus4[tx_size];
+  // Point at the cost table instead of copying the whole struct onto the
+  // stack on every call; both callees only take a const pointer.
+  const LV_MAP_EOB_COST *txb_eob_costs =
+      &x->eob_costs[eob_multi_size][plane_type];
+
+  const int shift = av1_get_tx_scale(tx_size);
+  const int64_t rdmult =
+      ((x->rdmult * plane_rd_mult[is_inter][plane_type] << (2 * (xd->bd - 8))) +
+       2) >>
+      2;
+  uint8_t levels_buf[TX_PAD_2D];
+  uint8_t *const levels = set_levels(levels_buf, width);
+  const TX_SIZE qm_tx_size = av1_get_adjusted_tx_size(tx_size);
+  const qm_val_t *iqmatrix =
+      IS_2D_TRANSFORM(tx_type)
+          ? pd->seg_iqmatrix[mbmi->segment_id][qm_tx_size]
+          : cm->giqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size];
+  assert(width == (1 << bwl));
+  const int tx_type_cost = get_tx_type_cost(cm, x, xd, plane, tx_size, tx_type);
+  // Positional initializer: the order must match the TxbInfo declaration.
+  TxbInfo txb_info = {
+    qcoeff,   levels,       dqcoeff,    tcoeff,  dequant, shift,
+    tx_size,  txs_ctx,      tx_type,    bwl,     width,   height,
+    eob,      seg_eob,      scan_order, txb_ctx, rdmult,  &cm->coeff_ctx_table,
+    iqmatrix, tx_type_cost,
+  };
+
+  // Hash based trellis (hbt) speed feature: avoid expensive optimize_txb calls
+  // by storing the coefficient deltas in a hash table.
+  // Currently disabled in speedfeatures.c
+  if (eob <= HBT_EOB && eob > 0 && cpi->sf.use_hash_based_trellis) {
+    return hbt_create_hashes(&txb_info, txb_costs, txb_eob_costs, p, block,
+                             fast_mode, rate_cost);
+  }
+
+  av1_txb_init_levels(qcoeff, width, height, levels);
+
+  const int update =
+      optimize_txb(&txb_info, txb_costs, txb_eob_costs, rate_cost);
+
+  if (update) {
+    p->eobs[block] = txb_info.eob;
+    p->txb_entropy_ctx[block] =
+        av1_get_txb_entropy_context(qcoeff, scan_order, txb_info.eob);
+  }
+  return txb_info.eob;
+}
+
+// Returns the entropy context value for a transform block: the sum of the
+// magnitudes of the first `eob` coefficients in scan order, saturated at
+// COEFF_CONTEXT_MASK, with the DC sign information merged in by
+// set_dc_sign(). An empty block (eob == 0) yields 0.
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                const SCAN_ORDER *scan_order, int eob) {
+  if (eob == 0) return 0;
+
+  const int16_t *const scan = scan_order->scan;
+  int level_sum = 0;
+  for (int idx = 0; idx < eob; ++idx) {
+    level_sum += abs(qcoeff[scan[idx]]);
+    // Already saturated: the remaining coefficients cannot change the result.
+    if (level_sum > COEFF_CONTEXT_MASK) break;
+  }
+
+  int cul_level = AOMMIN(COEFF_CONTEXT_MASK, level_sum);
+  set_dc_sign(&cul_level, qcoeff[0]);
+  return cul_level;
+}
+
+// Per-transform-block visitor: recomputes the entropy context of the block
+// from its quantized coefficients and stores it through av1_set_contexts().
+// arg points to a struct tokenize_b_args.
+void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              void *arg) {
+  struct tokenize_b_args *const args = arg;
+  const AV1_COMMON *cm = &args->cpi->common;
+  MACROBLOCK *const x = &args->td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *p = &x->plane[plane];
+  struct macroblockd_plane *pd = &xd->plane[plane];
+
+  const TX_TYPE tx_type = av1_get_tx_type(pd->plane_type, xd, blk_row, blk_col,
+                                          tx_size, cm->reduced_tx_set_used);
+  const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const uint16_t eob = p->eobs[block];
+
+  const int cul_level = av1_get_txb_entropy_context(qcoeff, scan_order, eob);
+  av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+                   blk_row);
+}
+
+// Updates the transform-type statistics for one luma transform block.
+// Nothing is done for chroma planes, or when the tx_type is not signalled
+// (a single candidate type, base_qindex == 0, a skipped block, or an active
+// SEG_LVL_SKIP segment feature). Otherwise the inter or intra ext-tx CDF in
+// xd->tile_ctx is adapted when allow_update_cdf is set, and the matching
+// counter in `counts` is incremented when CONFIG_ENTROPY_STATS is enabled.
+static void update_tx_type_count(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int blk_row, int blk_col, int plane,
+ TX_SIZE tx_size, FRAME_COUNTS *counts,
+ uint8_t allow_update_cdf) {
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ int is_inter = is_inter_block(mbmi);
+ FRAME_CONTEXT *fc = xd->tile_ctx;
+#if !CONFIG_ENTROPY_STATS
+ (void)counts;
+#endif // !CONFIG_ENTROPY_STATS
+
+ // Only y plane's tx_type is updated
+ if (plane > 0) return;
+ TX_TYPE tx_type = av1_get_tx_type(PLANE_TYPE_Y, xd, blk_row, blk_col, tx_size,
+ cm->reduced_tx_set_used);
+ if (get_ext_tx_types(tx_size, is_inter, cm->reduced_tx_set_used) > 1 &&
+ cm->base_qindex > 0 && !mbmi->skip &&
+ !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+ const int eset = get_ext_tx_set(tx_size, is_inter, cm->reduced_tx_set_used);
+ if (eset > 0) {
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+ if (is_inter) {
+ if (allow_update_cdf) {
+ update_cdf(fc->inter_ext_tx_cdf[eset][txsize_sqr_map[tx_size]],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+#if CONFIG_ENTROPY_STATS
+ ++counts->inter_ext_tx[eset][txsize_sqr_map[tx_size]]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ } else {
+ // Intra CDFs are additionally indexed by prediction direction; filter
+ // intra modes are mapped to a direction through fimode_to_intradir.
+ PREDICTION_MODE intra_dir;
+ if (mbmi->filter_intra_mode_info.use_filter_intra)
+ intra_dir = fimode_to_intradir[mbmi->filter_intra_mode_info
+ .filter_intra_mode];
+ else
+ intra_dir = mbmi->mode;
+#if CONFIG_ENTROPY_STATS
+ ++counts->intra_ext_tx[eset][txsize_sqr_map[tx_size]][intra_dir]
+ [av1_ext_tx_ind[tx_set_type][tx_type]];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(
+ fc->intra_ext_tx_cdf[eset][txsize_sqr_map[tx_size]][intra_dir],
+ av1_ext_tx_ind[tx_set_type][tx_type],
+ av1_num_ext_tx_set[tx_set_type]);
+ }
+ }
+ }
+ }
+}
+
+// Per-transform-block visitor used for the real (non dry-run) pass.
+// For the given block it:
+//   - adapts the txb_skip CDF and records txb_skip_ctx and eob in x->mbmi_ext;
+//   - for a non-empty block, copies the quantized coefficients into
+//     x->mbmi_ext->tcoeff and adapts the tx-type, eob, base-level,
+//     base-range and DC-sign CDFs in xd->tile_ctx;
+//   - stores the block's entropy context through av1_set_contexts().
+// CDFs are only touched when args->allow_update_cdf is set; counters in
+// td->counts are only touched when CONFIG_ENTROPY_STATS is enabled.
+// arg points to a struct tokenize_b_args.
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, void *arg) {
+ struct tokenize_b_args *const args = arg;
+ const AV1_COMP *cpi = args->cpi;
+ const AV1_COMMON *cm = &cpi->common;
+ ThreadData *const td = args->td;
+ MACROBLOCK *const x = &td->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ struct macroblock_plane *p = &x->plane[plane];
+ struct macroblockd_plane *pd = &xd->plane[plane];
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int eob = p->eobs[block];
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, pd->above_context + blk_col,
+ pd->left_context + blk_row, &txb_ctx);
+ const int bwl = get_txb_bwl(tx_size);
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const uint8_t allow_update_cdf = args->allow_update_cdf;
+ const TX_SIZE txsize_ctx = get_txsize_entropy_ctx(tx_size);
+ FRAME_CONTEXT *ec_ctx = xd->tile_ctx;
+#if CONFIG_ENTROPY_STATS
+ int cdf_idx = cm->coef_cdf_category;
+#endif // CONFIG_ENTROPY_STATS
+
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->txb_skip[cdf_idx][txsize_ctx][txb_ctx.txb_skip_ctx][eob == 0];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->txb_skip_cdf[txsize_ctx][txb_ctx.txb_skip_ctx], eob == 0,
+ 2);
+ }
+
+ x->mbmi_ext->txb_skip_ctx[plane][block] = txb_ctx.txb_skip_ctx;
+ x->mbmi_ext->eobs[plane][block] = eob;
+
+ // Empty block: nothing else is coded, just reset its entropy context.
+ if (eob == 0) {
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, 0, blk_col, blk_row);
+ return;
+ }
+
+ // Keep a copy of the quantized coefficients in mbmi_ext.
+ tran_low_t *tcoeff = BLOCK_OFFSET(x->mbmi_ext->tcoeff[plane], block);
+ const int segment_id = mbmi->segment_id;
+ const int seg_eob = av1_get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+ const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ memcpy(tcoeff, qcoeff, sizeof(*tcoeff) * seg_eob);
+
+ uint8_t levels_buf[TX_PAD_2D];
+ uint8_t *const levels = set_levels(levels_buf, width);
+ av1_txb_init_levels(tcoeff, width, height, levels);
+ update_tx_type_count(cm, xd, blk_row, blk_col, plane, tx_size, td->counts,
+ allow_update_cdf);
+
+ const PLANE_TYPE plane_type = pd->plane_type;
+ const TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col,
+ tx_size, cm->reduced_tx_set_used);
+ const TX_CLASS tx_class = tx_type_to_class[tx_type];
+ const SCAN_ORDER *const scan_order = get_scan(tx_size, tx_type);
+ const int16_t *const scan = scan_order->scan;
+#if CONFIG_ENTROPY_STATS
+ av1_update_eob_context(cdf_idx, eob, tx_size, tx_class, plane_type, ec_ctx,
+ td->counts, allow_update_cdf);
+#else
+ av1_update_eob_context(eob, tx_size, tx_class, plane_type, ec_ctx,
+ allow_update_cdf);
+#endif
+
+ DECLARE_ALIGNED(16, int8_t, coeff_contexts[MAX_TX_SQUARE]);
+ av1_get_nz_map_contexts(levels, scan, eob, tx_size, tx_class, coeff_contexts);
+
+ // Walk the coefficients from the last one back to DC.
+ for (int c = eob - 1; c >= 0; --c) {
+ const int pos = scan[c];
+ const int coeff_ctx = coeff_contexts[pos];
+ const tran_low_t v = qcoeff[pos];
+ const tran_low_t level = abs(v);
+
+ if (allow_update_cdf) {
+ // The last coefficient is never zero, so its symbol is level - 1 drawn
+ // from a 3-symbol alphabet; all others use a 4-symbol alphabet.
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+ update_cdf(
+ ec_ctx->coeff_base_eob_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3) - 1, 3);
+ } else {
+ update_cdf(ec_ctx->coeff_base_cdf[txsize_ctx][plane_type][coeff_ctx],
+ AOMMIN(level, 3), 4);
+ }
+ }
+ {
+ // NOTE: the #if below spans the "} else {" line, so with
+ // CONFIG_ENTROPY_STATS disabled this block reduces to the assert alone.
+ if (c == eob - 1) {
+ assert(coeff_ctx < 4);
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_base_eob_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3) - 1];
+ } else {
+ ++td->counts->coeff_base_multi[cdf_idx][txsize_ctx][plane_type]
+ [coeff_ctx][AOMMIN(level, 3)];
+#endif
+ }
+ }
+ if (level > NUM_BASE_LEVELS) {
+ // Base-range part: coded in steps of BR_CDF_SIZE - 1 until a symbol
+ // below the maximum is produced or COEFF_BASE_RANGE is exhausted.
+ const int base_range = level - 1 - NUM_BASE_LEVELS;
+ const int br_ctx = get_br_ctx(levels, pos, bwl, tx_class);
+ for (int idx = 0; idx < COEFF_BASE_RANGE; idx += BR_CDF_SIZE - 1) {
+ const int k = AOMMIN(base_range - idx, BR_CDF_SIZE - 1);
+ if (allow_update_cdf) {
+ update_cdf(ec_ctx->coeff_br_cdf[AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx],
+ k, BR_CDF_SIZE);
+ }
+ for (int lps = 0; lps < BR_CDF_SIZE - 1; lps++) {
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps[AOMMIN(txsize_ctx, TX_32X32)][plane_type][lps]
+ [br_ctx][lps == k];
+#endif // CONFIG_ENTROPY_STATS
+ if (lps == k) break;
+ }
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->coeff_lps_multi[cdf_idx][AOMMIN(txsize_ctx, TX_32X32)]
+ [plane_type][br_ctx][k];
+#endif
+ if (k < BR_CDF_SIZE - 1) break;
+ }
+ }
+ }
+
+ // Update the context needed to code the DC sign (if applicable)
+ if (tcoeff[0] != 0) {
+ const int dc_sign = (tcoeff[0] < 0) ? 1 : 0;
+ const int dc_sign_ctx = txb_ctx.dc_sign_ctx;
+#if CONFIG_ENTROPY_STATS
+ ++td->counts->dc_sign[plane_type][dc_sign_ctx][dc_sign];
+#endif // CONFIG_ENTROPY_STATS
+ if (allow_update_cdf)
+ update_cdf(ec_ctx->dc_sign_cdf[plane_type][dc_sign_ctx], dc_sign, 2);
+ x->mbmi_ext->dc_sign_ctx[plane][block] = dc_sign_ctx;
+ }
+
+ const int cul_level = av1_get_txb_entropy_context(tcoeff, scan_order, eob);
+ av1_set_contexts(xd, pd, plane, plane_bsize, tx_size, cul_level, blk_col,
+ blk_row);
+}
+
+// Updates the coefficient entropy contexts for every transform block of the
+// current coding block.
+//  - Skipped block: the contexts are reset and nothing else happens.
+//  - dry_run == 0: av1_update_and_record_txb_context() is run per block.
+//  - dry_run == DRY_RUN_NORMAL: av1_update_txb_context_b() is run per block.
+//  - Any other dry_run value is unsupported (message printed, assert fires).
+// rate is currently unused.
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                            RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                            int mi_row, int mi_col, uint8_t allow_update_cdf) {
+  (void)rate;
+  (void)mi_row;
+  (void)mi_col;
+
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+
+  if (xd->mi[0]->skip) {
+    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+    return;
+  }
+
+  struct tokenize_b_args arg = { cpi, td, NULL, 0, allow_update_cdf };
+
+  if (!dry_run) {
+    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+                                  av1_update_and_record_txb_context, &arg,
+                                  num_planes);
+    return;
+  }
+  if (dry_run == DRY_RUN_NORMAL) {
+    av1_foreach_transformed_block(xd, bsize, mi_row, mi_col,
+                                  av1_update_txb_context_b, &arg, num_planes);
+    return;
+  }
+
+  printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+  assert(0);
+}
diff --git a/third_party/aom/av1/encoder/encodetxb.h b/third_party/aom/av1/encoder/encodetxb.h
new file mode 100644
index 0000000000..40ae343b0d
--- /dev/null
+++ b/third_party/aom/av1/encoder/encodetxb.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ENCODETXB_H_
+#define AOM_AV1_ENCODER_ENCODETXB_H_
+
+#include "config/aom_config.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/encoder.h"
+#include "aom_dsp/bitwriter.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Everything the transform-block coefficient optimizer needs to know about one
+// transform block. av1_optimize_txb() fills it with a positional initializer,
+// so the member order is part of the contract.
+typedef struct TxbInfo {
+ tran_low_t *qcoeff;  // quantized coefficients (modified in place)
+ uint8_t *levels; // absolute values and clamped to 255.
+ // NOTE(review): the update_* helpers in encodetxb.c store
+ // AOMMIN(level, INT8_MAX) into levels - confirm the actual clamp bound.
+ tran_low_t *dqcoeff;  // dequantized coefficients (modified in place)
+ const tran_low_t *tcoeff;  // transform coefficients before quantization
+ const int16_t *dequant;  // dequant values; [0] and [1] are read
+ int shift;  // from av1_get_tx_scale(tx_size)
+ TX_SIZE tx_size;
+ TX_SIZE txs_ctx;  // from get_txsize_entropy_ctx(tx_size)
+ TX_TYPE tx_type;
+ int bwl;  // log2 of width (width == 1 << bwl)
+ int width;
+ int height;
+ int eob;
+ int seg_eob;  // from av1_get_max_eob(tx_size)
+ const SCAN_ORDER *scan_order;
+ TXB_CTX *txb_ctx;  // txb_skip_ctx / dc_sign_ctx of the block
+ int64_t rdmult;
+ const LV_MAP_CTX_TABLE *coeff_ctx_table;
+ const qm_val_t *iqmatrix;
+ int tx_type_cost;  // from get_tx_type_cost()
+} TxbInfo;
+
+void av1_alloc_txb_buf(AV1_COMP *cpi);
+void av1_free_txb_buf(AV1_COMP *cpi);
+int av1_cost_coeffs_txb(const AV1_COMMON *const cm, const MACROBLOCK *x,
+                        const int plane, const int block, const TX_SIZE tx_size,
+                        const TX_TYPE tx_type, const TXB_CTX *const txb_ctx);
+void av1_write_coeffs_txb(const AV1_COMMON *const cm, MACROBLOCKD *xd,
+                          aom_writer *w, int blk_row, int blk_col, int plane,
+                          TX_SIZE tx_size, const tran_low_t *tcoeff,
+                          uint16_t eob, TXB_CTX *txb_ctx);
+void av1_write_coeffs_mb(const AV1_COMMON *const cm, MACROBLOCK *x, int mi_row,
+                         int mi_col, aom_writer *w, BLOCK_SIZE bsize);
+
+// Returns the entropy context of a transform block: the sum of the first `eob`
+// coefficient magnitudes in scan order, saturated, combined with the DC sign.
+int av1_get_txb_entropy_context(const tran_low_t *qcoeff,
+                                const SCAN_ORDER *scan_order, int eob);
+
+// Updates the coefficient entropy contexts of every transform block of the
+// current coding block; on a non-dry run CDFs and side information are
+// recorded as well.
+void av1_update_txb_context(const AV1_COMP *cpi, ThreadData *td,
+                            RUN_TYPE dry_run, BLOCK_SIZE bsize, int *rate,
+                            int mi_row, int mi_col, uint8_t allow_update_cdf);
+
+// Per-transform-block visitors; `arg` points to a struct tokenize_b_args.
+void av1_update_txb_context_b(int plane, int block, int blk_row, int blk_col,
+                              BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                              void *arg);
+
+void av1_update_and_record_txb_context(int plane, int block, int blk_row,
+                                       int blk_col, BLOCK_SIZE plane_bsize,
+                                       TX_SIZE tx_size, void *arg);
+
+void av1_set_coeff_buffer(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                          int mi_row, int mi_col);
+
+// Declared with (void): in C an empty parameter list is not a prototype and
+// would disable argument checking at call sites.
+void hbt_destroy(void);
+
+// Rate-distortion optimizes the quantized coefficients of one transform block.
+// Returns the resulting eob and stores the rate in *rate_cost.
+int av1_optimize_txb_new(const struct AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                         int block, TX_SIZE tx_size, TX_TYPE tx_type,
+                         const TXB_CTX *const txb_ctx, int *rate_cost,
+                         int sharpness);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_ENCODETXB_H_
diff --git a/third_party/aom/av1/encoder/ethread.c b/third_party/aom/av1/encoder/ethread.c
new file mode 100644
index 0000000000..e8ac30bb52
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ethread.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Folds the RD counters gathered in td_t into td: element-wise sums for
+// comp_pred_diff and global_motion_used, bitwise OR for the two "used" flags.
+static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
+  int idx;
+
+  td->rd_counts.skip_mode_used_flag |= td_t->rd_counts.skip_mode_used_flag;
+  td->rd_counts.compound_ref_used_flag |=
+      td_t->rd_counts.compound_ref_used_flag;
+
+  for (idx = 0; idx < REF_FRAMES; ++idx) {
+    td->rd_counts.global_motion_used[idx] +=
+        td_t->rd_counts.global_motion_used[idx];
+  }
+
+  for (idx = 0; idx < REFERENCE_MODES; ++idx) {
+    td->rd_counts.comp_pred_diff[idx] += td_t->rd_counts.comp_pred_diff[idx];
+  }
+}
+
+// Worker entry point. arg1 is the worker's EncWorkerData; the second argument
+// is ignored. Starting at tile index `start`, encodes every
+// cpi->num_workers-th tile (tiles are numbered in raster order) and always
+// returns 1.
+static int enc_worker_hook(void *arg1, void *unused) {
+  (void)unused;
+
+  EncWorkerData *const worker_data = (EncWorkerData *)arg1;
+  AV1_COMP *const cpi = worker_data->cpi;
+  const int cols = cpi->common.tile_cols;
+  const int num_tiles = cpi->common.tile_rows * cols;
+  int tile = worker_data->start;
+
+  while (tile < num_tiles) {
+    av1_encode_tile(cpi, worker_data->td, tile / cols, tile % cols);
+    tile += cpi->num_workers;
+  }
+
+  return 1;
+}
+
+// Allocates cpi->workers and cpi->tile_thr_data for num_workers entries and
+// initializes each worker, incrementing cpi->num_workers once per worker.
+// Every worker except the last gets its own ThreadData plus scratch buffers
+// and is reset through the worker interface; the last worker reuses cpi->td.
+// All allocations go through CHECK_MEM_ERROR.
+static void create_enc_workers(AV1_COMP *cpi, int num_workers) {
+ AV1_COMMON *const cm = &cpi->common;
+ const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+
+ // The worker array is not zero-filled here; each entry is set up by
+ // winterface->init() in the loop below.
+ CHECK_MEM_ERROR(cm, cpi->workers,
+ aom_malloc(num_workers * sizeof(*cpi->workers)));
+
+ CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+ aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ ++cpi->num_workers;
+ winterface->init(worker);
+
+ thread_data->cpi = cpi;
+
+ if (i < num_workers - 1) {
+ // Allocate thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td,
+ aom_memalign(32, sizeof(*thread_data->td)));
+ av1_zero(*thread_data->td);
+
+ // Set up pc_tree.
+ thread_data->td->pc_tree = NULL;
+ av1_setup_pc_tree(cm, thread_data->td);
+
+ // Per-thread prediction buffers, MAX_MB_PLANE * MAX_SB_SQUARE elements each.
+ CHECK_MEM_ERROR(cm, thread_data->td->above_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->above_pred_buf)));
+ CHECK_MEM_ERROR(cm, thread_data->td->left_pred_buf,
+ (uint8_t *)aom_memalign(
+ 16, MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->left_pred_buf)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->wsrc_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->wsrc_buf)));
+
+ // One hash buffer per [x][y] slot of the 2x2 hash_value_buffer table.
+ for (int x = 0; x < 2; x++)
+ for (int y = 0; y < 2; y++)
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->hash_value_buffer[x][y],
+ (uint32_t *)aom_malloc(
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0])));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->mask_buf,
+ (int32_t *)aom_memalign(
+ 16, MAX_SB_SQUARE * sizeof(*thread_data->td->mask_buf)));
+ // Allocate frame counters in thread data.
+ CHECK_MEM_ERROR(cm, thread_data->td->counts,
+ aom_calloc(1, sizeof(*thread_data->td->counts)));
+
+ // Allocate buffers used by palette coding mode.
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->palette_buffer,
+ aom_memalign(16, sizeof(*thread_data->td->palette_buffer)));
+
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_conv_dst,
+ aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE *
+ sizeof(*thread_data->td->tmp_conv_dst)));
+ for (int j = 0; j < 2; ++j) {
+ CHECK_MEM_ERROR(
+ cm, thread_data->td->tmp_obmc_bufs[j],
+ aom_memalign(16, 2 * MAX_MB_PLANE * MAX_SB_SQUARE *
+ sizeof(*thread_data->td->tmp_obmc_bufs[j])));
+ }
+
+ // Create threads
+ if (!winterface->reset(worker))
+ aom_internal_error(&cm->error, AOM_CODEC_ERROR,
+ "Tile encoder thread creation failed");
+ } else {
+ // Main thread acts as a worker and uses the thread data in cpi.
+ thread_data->td = &cpi->td;
+ }
+ winterface->sync(worker);
+ }
+}
+
+// Starts the first num_workers workers, giving worker i the starting tile
+// index i. The worker at index cpi->num_workers - 1 is run with execute();
+// all others are started with launch().
+static void launch_enc_workers(AV1_COMP *cpi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int idx = 0;
+
+  while (idx < num_workers) {
+    AVxWorker *const worker = &cpi->workers[idx];
+    EncWorkerData *const worker_data = (EncWorkerData *)worker->data1;
+
+    // Each worker begins at its own index and strides from there.
+    worker_data->start = idx;
+
+    if (idx != cpi->num_workers - 1) {
+      winterface->launch(worker);
+    } else {
+      winterface->execute(worker);
+    }
+    ++idx;
+  }
+}
+
+// Calls the worker interface's sync() on each of the first num_workers
+// workers, in index order.
+static void sync_enc_workers(AV1_COMP *cpi, int num_workers) {
+  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
+  int idx = 0;
+
+  while (idx < num_workers) {
+    winterface->sync(&cpi->workers[idx]);
+    ++idx;
+  }
+}
+
+// Merges per-worker results into cpi. intrabc_used is OR-ed in from every
+// worker; frame counts, RD counters and txb_split_count are merged only for
+// workers whose index is below cpi->num_workers - 1.
+static void accumulate_counters_enc_workers(AV1_COMP *cpi, int num_workers) {
+  int idx;
+
+  for (idx = 0; idx < num_workers; ++idx) {
+    EncWorkerData *const worker_data =
+        (EncWorkerData *)cpi->workers[idx].data1;
+    ThreadData *const td = worker_data->td;
+
+    cpi->intrabc_used |= td->intrabc_used_this_tile;
+
+    if (idx >= cpi->num_workers - 1) continue;
+
+    // Accumulate counters.
+    av1_accumulate_frame_counts(&cpi->counts, td->counts);
+    accumulate_rd_opt(&cpi->td, td);
+    cpi->td.mb.txb_split_count += td->mb.txb_split_count;
+  }
+}
+
+// Binds `hook` and the per-worker data to each of the first num_workers
+// workers and refreshes every non-main ThreadData from cpi->td before a frame
+// is encoded.
+static void prepare_enc_workers(AV1_COMP *cpi, AVxWorkerHook hook,
+ int num_workers) {
+ for (int i = 0; i < num_workers; i++) {
+ AVxWorker *const worker = &cpi->workers[i];
+ EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
+
+ worker->hook = hook;
+ worker->data1 = thread_data;
+ worker->data2 = NULL;
+
+ // Before encoding a frame, copy the thread data from cpi.
+ if (thread_data->td != &cpi->td) {
+ // The struct copy below overwrites mb's buffer pointers with cpi's, so
+ // they are re-pointed at this thread's own buffers right afterwards.
+ thread_data->td->mb = cpi->td.mb;
+ thread_data->td->rd_counts = cpi->td.rd_counts;
+ thread_data->td->mb.above_pred_buf = thread_data->td->above_pred_buf;
+ thread_data->td->mb.left_pred_buf = thread_data->td->left_pred_buf;
+ thread_data->td->mb.wsrc_buf = thread_data->td->wsrc_buf;
+ for (int x = 0; x < 2; x++) {
+ for (int y = 0; y < 2; y++) {
+ // Copy the hash contents from cpi's buffer, then use the private copy.
+ memcpy(thread_data->td->hash_value_buffer[x][y],
+ cpi->td.mb.hash_value_buffer[x][y],
+ AOM_BUFFER_SIZE_FOR_BLOCK_HASH *
+ sizeof(*thread_data->td->hash_value_buffer[0][0]));
+ thread_data->td->mb.hash_value_buffer[x][y] =
+ thread_data->td->hash_value_buffer[x][y];
+ }
+ }
+ thread_data->td->mb.mask_buf = thread_data->td->mask_buf;
+ }
+ if (thread_data->td->counts != &cpi->counts) {
+ memcpy(thread_data->td->counts, &cpi->counts, sizeof(cpi->counts));
+ }
+
+ // NOTE(review): this test uses the local num_workers while the block
+ // above tests td != &cpi->td; confirm both select the same workers when
+ // num_workers is smaller than cpi->num_workers.
+ if (i < num_workers - 1) {
+ thread_data->td->mb.palette_buffer = thread_data->td->palette_buffer;
+ thread_data->td->mb.tmp_conv_dst = thread_data->td->tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.tmp_obmc_bufs[j] =
+ thread_data->td->tmp_obmc_bufs[j];
+ }
+
+ thread_data->td->mb.e_mbd.tmp_conv_dst = thread_data->td->mb.tmp_conv_dst;
+ for (int j = 0; j < 2; ++j) {
+ thread_data->td->mb.e_mbd.tmp_obmc_bufs[j] =
+ thread_data->td->mb.tmp_obmc_bufs[j];
+ }
+ }
+ }
+}
+
+// Encodes all tiles of the current frame using the worker pool. The worker
+// count is capped by oxcf.max_threads and by the number of tiles; workers are
+// created on the first call and reused (never grown) afterwards.
+void av1_encode_tiles_mt(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int num_tiles = cm->tile_cols * cm->tile_rows;
+  int num_workers = AOMMIN(cpi->oxcf.max_threads, num_tiles);
+
+  if (cpi->tile_data == NULL || cpi->allocated_tiles < num_tiles) {
+    av1_alloc_tile_data(cpi);
+  }
+  av1_init_tile_data(cpi);
+
+  if (cpi->num_workers != 0) {
+    // Workers already exist: never use more than were created.
+    num_workers = AOMMIN(num_workers, cpi->num_workers);
+  } else {
+    // Only run once to create threads and allocate thread data.
+    create_enc_workers(cpi, num_workers);
+  }
+
+  prepare_enc_workers(cpi, enc_worker_hook, num_workers);
+  launch_enc_workers(cpi, num_workers);
+  sync_enc_workers(cpi, num_workers);
+  accumulate_counters_enc_workers(cpi, num_workers);
+}
+
+// Adds every counter in *counts to the matching counter in *acc_counts.
+// FRAME_COUNTS is made up solely of 'unsigned int' members, so both structs
+// are walked as flat arrays of sizeof(FRAME_COUNTS) / sizeof(unsigned int)
+// elements.
+void av1_accumulate_frame_counts(FRAME_COUNTS *acc_counts,
+                                 const FRAME_COUNTS *counts) {
+  unsigned int *dst = (unsigned int *)acc_counts;
+  const unsigned int *src = (const unsigned int *)counts;
+  unsigned int remaining = sizeof(FRAME_COUNTS) / sizeof(unsigned int);
+
+  while (remaining != 0) {
+    *dst += *src;
+    ++dst;
+    ++src;
+    --remaining;
+  }
+}
diff --git a/third_party/aom/av1/encoder/ethread.h b/third_party/aom/av1/encoder/ethread.h
new file mode 100644
index 0000000000..5de4b48038
--- /dev/null
+++ b/third_party/aom/av1/encoder/ethread.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ETHREAD_H_
+#define AOM_AV1_ENCODER_ETHREAD_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct AV1_COMP;
+struct ThreadData;
+// Declared at file scope so that the 'struct FRAME_COUNTS' named in the
+// av1_accumulate_frame_counts() prototype in this header does not get
+// prototype scope when the header is included before the struct's definition.
+struct FRAME_COUNTS;
+
+typedef struct EncWorkerData {
+ struct AV1_COMP *cpi;
+ struct ThreadData *td;
+ int start;
+} EncWorkerData;
+
+void av1_encode_tiles_mt(struct AV1_COMP *cpi);
+
+void av1_accumulate_frame_counts(struct FRAME_COUNTS *acc_counts,
+ const struct FRAME_COUNTS *counts);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ETHREAD_H_
diff --git a/third_party/aom/av1/encoder/extend.c b/third_party/aom/av1/encoder/extend.c
new file mode 100644
index 0000000000..e9621a5742
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/extend.h"
+
+// Copies a w x h plane of bytes from src to dst and fills a border around it
+// in dst by replicating edge pixels: extend_left/extend_right columns on each
+// row, then extend_top/extend_bottom full rows (including the side borders).
+// dst must have room for the border on every side.
+static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
+ uint8_t *dst, int dst_pitch, int w, int h,
+ int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+
+ // copy the left and right most columns out
+ const uint8_t *src_ptr1 = src;
+ const uint8_t *src_ptr2 = src + w - 1;
+ uint8_t *dst_ptr1 = dst - extend_left;
+ uint8_t *dst_ptr2 = dst + w;
+
+ // Per row: left border from the first pixel, the row itself, then the right
+ // border from the last pixel.
+ for (i = 0; i < h; i++) {
+ memset(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w);
+ memset(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ // (the sources are the already-extended first and last rows of dst).
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+ dst_ptr2 = dst + dst_pitch * (h)-extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize);
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize);
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+// 16-bit counterpart of copy_and_extend_plane(): src8/dst8 are converted with
+// CONVERT_TO_SHORTPTR and all widths, pitches and extensions are counted in
+// uint16_t samples rather than bytes.
+static void highbd_copy_and_extend_plane(const uint8_t *src8, int src_pitch,
+ uint8_t *dst8, int dst_pitch, int w,
+ int h, int extend_top, int extend_left,
+ int extend_bottom, int extend_right) {
+ int i, linesize;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ // copy the left and right most columns out
+ const uint16_t *src_ptr1 = src;
+ const uint16_t *src_ptr2 = src + w - 1;
+ uint16_t *dst_ptr1 = dst - extend_left;
+ uint16_t *dst_ptr2 = dst + w;
+
+ // Per row: left border, the row itself, right border.
+ for (i = 0; i < h; i++) {
+ aom_memset16(dst_ptr1, src_ptr1[0], extend_left);
+ memcpy(dst_ptr1 + extend_left, src_ptr1, w * sizeof(src_ptr1[0]));
+ aom_memset16(dst_ptr2, src_ptr2[0], extend_right);
+ src_ptr1 += src_pitch;
+ src_ptr2 += src_pitch;
+ dst_ptr1 += dst_pitch;
+ dst_ptr2 += dst_pitch;
+ }
+
+ // Now copy the top and bottom lines into each line of the respective
+ // borders
+ // (the sources are the already-extended first and last rows of dst).
+ src_ptr1 = dst - extend_left;
+ src_ptr2 = dst + dst_pitch * (h - 1) - extend_left;
+ dst_ptr1 = dst + dst_pitch * (-extend_top) - extend_left;
+ dst_ptr2 = dst + dst_pitch * (h)-extend_left;
+ linesize = extend_left + extend_right + w;
+
+ for (i = 0; i < extend_top; i++) {
+ memcpy(dst_ptr1, src_ptr1, linesize * sizeof(src_ptr1[0]));
+ dst_ptr1 += dst_pitch;
+ }
+
+ for (i = 0; i < extend_bottom; i++) {
+ memcpy(dst_ptr2, src_ptr2, linesize * sizeof(src_ptr2[0]));
+ dst_ptr2 += dst_pitch;
+ }
+}
+
+// Copies the cropped Y, U and V planes of src into dst and extends each plane
+// with replicated borders. Chroma extensions are the luma extensions shifted
+// right by 1 in each direction where the chroma size differs from the luma
+// size. Uses the 16-bit path when src has YV12_FLAG_HIGHBITDEPTH set.
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst) {
+ // Extend src frame in buffer
+ // Altref filtering assumes 16 pixel extension
+ const int et_y = 16;
+ const int el_y = 16;
+ // Motion estimation may use src block variance with the block size up
+ // to 64x64, so the right and bottom need to be extended to 64 multiple
+ // or up to 16, whichever is greater.
+ // The extension is measured from the cropped size, so it also covers the
+ // gap between the crop size and y_width / y_height.
+ const int er_y =
+ AOMMAX(src->y_width + 16, ALIGN_POWER_OF_TWO(src->y_width, 6)) -
+ src->y_crop_width;
+ const int eb_y =
+ AOMMAX(src->y_height + 16, ALIGN_POWER_OF_TWO(src->y_height, 6)) -
+ src->y_crop_height;
+ // 1 when the chroma plane is smaller than luma in that direction, else 0.
+ const int uv_width_subsampling = (src->uv_width != src->y_width);
+ const int uv_height_subsampling = (src->uv_height != src->y_height);
+ const int et_uv = et_y >> uv_height_subsampling;
+ const int el_uv = el_y >> uv_width_subsampling;
+ const int eb_uv = eb_y >> uv_height_subsampling;
+ const int er_uv = er_y >> uv_width_subsampling;
+
+ if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
+ highbd_copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width,
+ src->y_crop_height, et_y, el_y, eb_y, er_y);
+
+ highbd_copy_and_extend_plane(
+ src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+
+ highbd_copy_and_extend_plane(
+ src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ src->uv_crop_width, src->uv_crop_height, et_uv, el_uv, eb_uv, er_uv);
+ return;
+ }
+
+ copy_and_extend_plane(src->y_buffer, src->y_stride, dst->y_buffer,
+ dst->y_stride, src->y_crop_width, src->y_crop_height,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, src->uv_crop_width, src->uv_crop_height,
+ et_uv, el_uv, eb_uv, er_uv);
+}
+
+// Copies the srcw x srch rectangle at (srcx, srcy) from src to the same
+// position in dst, extending into dst's border only on the sides where the
+// rectangle touches the frame edge.
+// NOTE(review): chroma offsets and sizes are halved unconditionally, which
+// looks like it assumes 4:2:0 subsampling, and only the 8-bit plane copy is
+// used here — confirm against callers.
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw) {
+ // If the side is not touching the border then don't extend.
+ const int et_y = srcy ? 0 : dst->border;
+ const int el_y = srcx ? 0 : dst->border;
+ const int eb_y = srcy + srch != src->y_height
+ ? 0
+ : dst->border + dst->y_height - src->y_height;
+ const int er_y = srcx + srcw != src->y_width
+ ? 0
+ : dst->border + dst->y_width - src->y_width;
+ const int src_y_offset = srcy * src->y_stride + srcx;
+ const int dst_y_offset = srcy * dst->y_stride + srcx;
+
+ // Chroma values: extensions and sizes are halved with rounding, offsets are
+ // halved with truncation.
+ const int et_uv = ROUND_POWER_OF_TWO(et_y, 1);
+ const int el_uv = ROUND_POWER_OF_TWO(el_y, 1);
+ const int eb_uv = ROUND_POWER_OF_TWO(eb_y, 1);
+ const int er_uv = ROUND_POWER_OF_TWO(er_y, 1);
+ const int src_uv_offset = ((srcy * src->uv_stride) >> 1) + (srcx >> 1);
+ const int dst_uv_offset = ((srcy * dst->uv_stride) >> 1) + (srcx >> 1);
+ const int srch_uv = ROUND_POWER_OF_TWO(srch, 1);
+ const int srcw_uv = ROUND_POWER_OF_TWO(srcw, 1);
+
+ copy_and_extend_plane(src->y_buffer + src_y_offset, src->y_stride,
+ dst->y_buffer + dst_y_offset, dst->y_stride, srcw, srch,
+ et_y, el_y, eb_y, er_y);
+
+ copy_and_extend_plane(src->u_buffer + src_uv_offset, src->uv_stride,
+ dst->u_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+
+ copy_and_extend_plane(src->v_buffer + src_uv_offset, src->uv_stride,
+ dst->v_buffer + dst_uv_offset, dst->uv_stride, srcw_uv,
+ srch_uv, et_uv, el_uv, eb_uv, er_uv);
+}
diff --git a/third_party/aom/av1/encoder/extend.h b/third_party/aom/av1/encoder/extend.h
new file mode 100644
index 0000000000..e0432cc970
--- /dev/null
+++ b/third_party/aom/av1/encoder/extend.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_EXTEND_H_
+#define AOM_AV1_ENCODER_EXTEND_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_copy_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst);
+
+void av1_copy_and_extend_frame_with_rect(const YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *dst, int srcy,
+ int srcx, int srch, int srcw);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_EXTEND_H_
diff --git a/third_party/aom/av1/encoder/firstpass.c b/third_party/aom/av1/encoder/firstpass.c
new file mode 100644
index 0000000000..69dd20c526
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.c
@@ -0,0 +1,3480 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "aom_scale/aom_scale.h"
+#include "aom_scale/yv12config.h"
+
+#include "aom_dsp/variance.h"
+#include "av1/common/entropymv.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h" // av1_setup_dst_planes()
+#include "av1/common/txb_common.h"
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/block.h"
+#include "av1/encoder/dwt.h"
+#include "av1/encoder/encodeframe.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/reconinter_enc.h"
+
+#define OUTPUT_FPF 0
+#define ARF_STATS_OUTPUT 0
+
+#define GROUP_ADAPTIVE_MAXQ 1
+
+#define BOOST_BREAKOUT 12.5
+#define BOOST_FACTOR 12.5
+#define FACTOR_PT_LOW 0.70
+#define FACTOR_PT_HIGH 0.90
+#define FIRST_PASS_Q 10.0
+#define GF_MAX_BOOST 90.0
+#define INTRA_MODE_PENALTY 1024
+#define KF_MIN_FRAME_BOOST 80.0
+#define KF_MAX_FRAME_BOOST 128.0
+#define MIN_ARF_GF_BOOST 240
+#define MIN_DECAY_FACTOR 0.01
+#define MIN_KF_BOOST 300
+#define NEW_MV_MODE_PENALTY 32
+#define DARK_THRESH 64
+#define DEFAULT_GRP_WEIGHT 1.0
+#define RC_FACTOR_MIN 0.75
+#define RC_FACTOR_MAX 1.75
+#define MIN_FWD_KF_INTERVAL 8
+
+#define NCOUNT_INTRA_THRESH 8192
+#define NCOUNT_INTRA_FACTOR 3
+#define NCOUNT_FRAME_II_THRESH 5.0
+
+#define DOUBLE_DIVIDE_CHECK(x) ((x) < 0 ? (x)-0.000001 : (x) + 0.000001)
+
+#if ARF_STATS_OUTPUT
+unsigned int arf_count = 0;
+#endif
+
+// Moves the first pass stats read cursor (p->stats_in) to the given absolute
+// position. No seek or bounds check is performed; the pointer is stored
+// as-is.
+static void reset_fpf_position(TWO_PASS *p, const FIRSTPASS_STATS *position) {
+ p->stats_in = position;
+}
+
+// Read frame stats at an offset from the current position.
+// Returns a pointer to the entry `offset` frames away from p->stats_in, or
+// NULL when that entry lies outside [p->stats_in_start, p->stats_in_end).
+static const FIRSTPASS_STATS *read_frame_stats(const TWO_PASS *p, int offset) {
+  // Compare the offset against element distances rather than forming
+  // p->stats_in + offset first: computing a pointer that lands outside the
+  // array is undefined behavior even if it is never dereferenced.
+  if (offset >= 0) {
+    if (offset >= p->stats_in_end - p->stats_in) return NULL;
+  } else if (offset < p->stats_in_start - p->stats_in) {
+    return NULL;
+  }
+
+  return &p->stats_in[offset];
+}
+
+// Copies the stats entry at the read cursor into *fps and advances the
+// cursor. Returns 1 on success, or EOF (leaving *fps untouched) when the
+// cursor is at or past p->stats_in_end.
+static int input_stats(TWO_PASS *p, FIRSTPASS_STATS *fps) {
+  if (p->stats_in < p->stats_in_end) {
+    *fps = *p->stats_in;
+    p->stats_in += 1;
+    return 1;
+  }
+
+  return EOF;
+}
+
+// Adds an AOM_CODEC_STATS_PKT packet to pktlist whose buffer points at
+// `stats` and whose size is sizeof(FIRSTPASS_STATS). When OUTPUT_FPF is
+// enabled, the stats are also appended as one text line to "firstpass.stt".
+static void output_stats(FIRSTPASS_STATS *stats,
+                         struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt pkt;
+  pkt.kind = AOM_CODEC_STATS_PKT;
+  pkt.data.twopass_stats.buf = stats;
+  pkt.data.twopass_stats.sz = sizeof(FIRSTPASS_STATS);
+  aom_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#if OUTPUT_FPF
+  {
+    FILE *const fpfile = fopen("firstpass.stt", "a");
+
+    // The dump is best-effort: skip it when the file cannot be opened instead
+    // of handing a NULL stream to fprintf()/fclose().
+    if (fpfile != NULL) {
+      fprintf(fpfile,
+              "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf"
+              "%12.4lf %12.4lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf\n",
+              stats->frame, stats->weight, stats->intra_error,
+              stats->coded_error, stats->sr_coded_error, stats->pcnt_inter,
+              stats->pcnt_motion, stats->pcnt_second_ref, stats->pcnt_neutral,
+              stats->intra_skip_pct, stats->inactive_zone_rows,
+              stats->inactive_zone_cols, stats->MVr, stats->mvr_abs,
+              stats->MVc, stats->mvc_abs, stats->MVrv, stats->MVcv,
+              stats->mv_in_out_count, stats->new_mv_count, stats->count,
+              stats->duration);
+      fclose(fpfile);
+    }
+  }
+#endif
+}
+
+#if CONFIG_FP_MB_STATS
+// Adds an AOM_CODEC_FPMB_STATS_PKT packet to pktlist whose buffer points at
+// this_frame_mb_stats and whose size is stats_size elements of that buffer.
+static void output_fpmb_stats(uint8_t *this_frame_mb_stats, int stats_size,
+                              struct aom_codec_pkt_list *pktlist) {
+  struct aom_codec_cx_pkt mb_pkt;
+
+  mb_pkt.data.firstpass_mb_stats.sz = stats_size * sizeof(*this_frame_mb_stats);
+  mb_pkt.data.firstpass_mb_stats.buf = this_frame_mb_stats;
+  mb_pkt.kind = AOM_CODEC_FPMB_STATS_PKT;
+
+  aom_codec_pkt_list_add(pktlist, &mb_pkt);
+}
+#endif
+
+// Resets the fields listed below to 0.0, except duration, which is set to 1.0.
+// Only the fields named here are written.
+static void zero_stats(FIRSTPASS_STATS *section) {
+ section->frame = 0.0;
+ section->weight = 0.0;
+ section->intra_error = 0.0;
+ section->frame_avg_wavelet_energy = 0.0;
+ section->coded_error = 0.0;
+ section->sr_coded_error = 0.0;
+ section->pcnt_inter = 0.0;
+ section->pcnt_motion = 0.0;
+ section->pcnt_second_ref = 0.0;
+ section->pcnt_neutral = 0.0;
+ section->intra_skip_pct = 0.0;
+ section->inactive_zone_rows = 0.0;
+ section->inactive_zone_cols = 0.0;
+ section->MVr = 0.0;
+ section->mvr_abs = 0.0;
+ section->MVc = 0.0;
+ section->mvc_abs = 0.0;
+ section->MVrv = 0.0;
+ section->MVcv = 0.0;
+ section->mv_in_out_count = 0.0;
+ section->new_mv_count = 0.0;
+ section->count = 0.0;
+ section->duration = 1.0;
+}
+
+// Adds each field of *frame listed below into the matching field of *section.
+// Mirrors subtract_stats(), which removes the same set of fields.
+static void accumulate_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame += frame->frame;
+ section->weight += frame->weight;
+ section->intra_error += frame->intra_error;
+ section->frame_avg_wavelet_energy += frame->frame_avg_wavelet_energy;
+ section->coded_error += frame->coded_error;
+ section->sr_coded_error += frame->sr_coded_error;
+ section->pcnt_inter += frame->pcnt_inter;
+ section->pcnt_motion += frame->pcnt_motion;
+ section->pcnt_second_ref += frame->pcnt_second_ref;
+ section->pcnt_neutral += frame->pcnt_neutral;
+ section->intra_skip_pct += frame->intra_skip_pct;
+ section->inactive_zone_rows += frame->inactive_zone_rows;
+ section->inactive_zone_cols += frame->inactive_zone_cols;
+ section->MVr += frame->MVr;
+ section->mvr_abs += frame->mvr_abs;
+ section->MVc += frame->MVc;
+ section->mvc_abs += frame->mvc_abs;
+ section->MVrv += frame->MVrv;
+ section->MVcv += frame->MVcv;
+ section->mv_in_out_count += frame->mv_in_out_count;
+ section->new_mv_count += frame->new_mv_count;
+ section->count += frame->count;
+ section->duration += frame->duration;
+}
+
+// Subtracts each field of *frame listed below from the matching field of
+// *section. Mirrors accumulate_stats(), which adds the same set of fields.
+static void subtract_stats(FIRSTPASS_STATS *section,
+ const FIRSTPASS_STATS *frame) {
+ section->frame -= frame->frame;
+ section->weight -= frame->weight;
+ section->intra_error -= frame->intra_error;
+ section->frame_avg_wavelet_energy -= frame->frame_avg_wavelet_energy;
+ section->coded_error -= frame->coded_error;
+ section->sr_coded_error -= frame->sr_coded_error;
+ section->pcnt_inter -= frame->pcnt_inter;
+ section->pcnt_motion -= frame->pcnt_motion;
+ section->pcnt_second_ref -= frame->pcnt_second_ref;
+ section->pcnt_neutral -= frame->pcnt_neutral;
+ section->intra_skip_pct -= frame->intra_skip_pct;
+ section->inactive_zone_rows -= frame->inactive_zone_rows;
+ section->inactive_zone_cols -= frame->inactive_zone_cols;
+ section->MVr -= frame->MVr;
+ section->mvr_abs -= frame->mvr_abs;
+ section->MVc -= frame->MVc;
+ section->mvc_abs -= frame->mvc_abs;
+ section->MVrv -= frame->MVrv;
+ section->MVcv -= frame->MVcv;
+ section->mv_in_out_count -= frame->mv_in_out_count;
+ section->new_mv_count -= frame->new_mv_count;
+ section->count -= frame->count;
+ section->duration -= frame->duration;
+}
+
+// Calculate the linear size relative to a baseline of 1080P
+// Returns sqrt(initial_width * initial_height / BASE_SIZE), i.e. 1.0 for a
+// 1920x1080 frame.
+#define BASE_SIZE 2073600.0  // 1920x1080
+static double get_linear_size_factor(const AV1_COMP *cpi) {
+  // Widen to double before multiplying so the area cannot overflow int for
+  // very large frame dimensions.
+  const double this_area =
+      (double)cpi->initial_width * (double)cpi->initial_height;
+  return pow(this_area / BASE_SIZE, 0.5);
+}
+
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+// The result is 1 - (intra_skip_pct / 2 + 2 * inactive_zone_rows / mb_rows),
+// clamped to [MIN_ACTIVE_AREA, MAX_ACTIVE_AREA].
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const AV1_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame) {
+  return fclamp(1.0 - ((this_frame->intra_skip_pct / 2) +
+                       ((this_frame->inactive_zone_rows * 2) /
+                        (double)cpi->common.mb_rows)),
+                MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
+
+// Calculate a modified Error used in distributing bits between easier and
+// harder frames.
+// The frame's weighted coded error is normalized by the clip average, raised
+// to the power two_pass_vbrbias / 100, scaled back by the average, corrected
+// for active area, and clamped to [modified_error_min, modified_error_max].
+// NOTE(review): divides by twopass->total_stats.count twice; presumably the
+// caller guarantees it is non-zero — confirm.
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const AV1_COMP *cpi,
+ const TWO_PASS *twopass,
+ const AV1EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ // DOUBLE_DIVIDE_CHECK nudges av_err away from zero before the division.
+ double modified_error =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_error *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ return fclamp(modified_error, twopass->modified_error_min,
+ twopass->modified_error_max);
+}
+
+// This function returns the maximum target rate per frame:
+// avg_frame_bandwidth scaled by two_pass_vbrmax_section percent, limited to
+// the range [0, rc->max_frame_bandwidth].
+static int frame_max_bits(const RATE_CONTROL *rc,
+                          const AV1EncoderConfig *oxcf) {
+  // 64-bit arithmetic keeps the percentage product from overflowing.
+  int64_t limit = ((int64_t)rc->avg_frame_bandwidth *
+                   (int64_t)oxcf->two_pass_vbrmax_section) /
+                  100;
+
+  if (limit < 0) return 0;
+
+  if (limit > rc->max_frame_bandwidth) {
+    limit = rc->max_frame_bandwidth;
+  }
+  return (int)limit;
+}
+
+// Resets the accumulated first pass totals (cpi->twopass.total_stats) via
+// zero_stats().
+void av1_init_first_pass(AV1_COMP *cpi) {
+ zero_stats(&cpi->twopass.total_stats);
+}
+
+// Emits the accumulated first pass totals (cpi->twopass.total_stats) as a
+// stats packet on cpi->output_pkt_list.
+void av1_end_first_pass(AV1_COMP *cpi) {
+ output_stats(&cpi->twopass.total_stats, cpi->output_pkt_list);
+}
+
+// Maps a first pass block size to the matching 8-bit MSE function. Any size
+// other than 8x8, 16x8 or 8x16 gets the 16x16 function.
+static aom_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+  if (bsize == BLOCK_8X8) return aom_mse8x8;
+  if (bsize == BLOCK_16X8) return aom_mse16x8;
+  if (bsize == BLOCK_8X16) return aom_mse8x16;
+  return aom_mse16x16;
+}
+
+// Returns the sum of squared errors between the src and ref blocks, computed
+// by the MSE function selected for bsize. The function's own return value is
+// discarded; only the SSE output is used.
+static unsigned int get_prediction_error(BLOCK_SIZE bsize,
+                                         const struct buf_2d *src,
+                                         const struct buf_2d *ref) {
+  unsigned int sum_sq_err;
+  const aom_variance_fn_t mse_fn = get_block_variance_fn(bsize);
+
+  mse_fn(src->buf, src->stride, ref->buf, ref->stride, &sum_sq_err);
+  return sum_sq_err;
+}
+
+// Maps a first pass block size and bit depth to the matching high-bitdepth
+// MSE function. Bit depths other than 10 and 12 use the "highbd_8" variants;
+// block sizes other than 8x8, 16x8 or 8x16 use the 16x16 variant.
+static aom_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+                                                      int bd) {
+  if (bd == 10) {
+    switch (bsize) {
+      case BLOCK_8X8: return aom_highbd_10_mse8x8;
+      case BLOCK_16X8: return aom_highbd_10_mse16x8;
+      case BLOCK_8X16: return aom_highbd_10_mse8x16;
+      default: return aom_highbd_10_mse16x16;
+    }
+  }
+
+  if (bd == 12) {
+    switch (bsize) {
+      case BLOCK_8X8: return aom_highbd_12_mse8x8;
+      case BLOCK_16X8: return aom_highbd_12_mse16x8;
+      case BLOCK_8X16: return aom_highbd_12_mse8x16;
+      default: return aom_highbd_12_mse16x16;
+    }
+  }
+
+  // Every other bit depth falls back to the 8-bit high-bitdepth functions.
+  switch (bsize) {
+    case BLOCK_8X8: return aom_highbd_8_mse8x8;
+    case BLOCK_16X8: return aom_highbd_8_mse16x8;
+    case BLOCK_8X16: return aom_highbd_8_mse8x16;
+    default: return aom_highbd_8_mse16x16;
+  }
+}
+
+// High-bitdepth version of get_prediction_error(): returns the SSE between
+// the src and ref blocks using the MSE function selected for bsize and bd.
+static unsigned int highbd_get_prediction_error(BLOCK_SIZE bsize,
+                                                const struct buf_2d *src,
+                                                const struct buf_2d *ref,
+                                                int bd) {
+  unsigned int sum_sq_err;
+  const aom_variance_fn_t mse_fn = highbd_get_block_variance_fn(bsize, bd);
+
+  mse_fn(src->buf, src->stride, ref->buf, ref->stride, &sum_sq_err);
+  return sum_sq_err;
+}
+
+// Refine the motion search range according to the frame dimension
+// for first pass test.
+// Returns the smallest shift sr for which (min(width, height) << sr) is at
+// least MAX_FULL_PEL_VAL.
+static int get_search_range(const AV1_COMP *cpi) {
+  const int min_dim = AOMMIN(cpi->initial_width, cpi->initial_height);
+  int shift;
+
+  for (shift = 0; (min_dim << shift) < MAX_FULL_PEL_VAL; ++shift) {
+  }
+  return shift;
+}
+
+// Searches for a motion vector around ref_mv using cpi->diamond_search_sad.
+// Each candidate's error is re-measured with av1_get_mvpred_var using an MSE
+// function and has NEW_MV_MODE_PENALTY added. *best_motion_err and *best_mv
+// are updated only when a candidate beats the error passed in.
+static void first_pass_motion_search(AV1_COMP *cpi, MACROBLOCK *x,
+ const MV *ref_mv, MV *best_mv,
+ int *best_motion_err) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MV tmp_mv = kZeroMv;
+ // The search is centered on ref_mv with both components shifted right by 3.
+ MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 };
+ int num00, tmp_err, n;
+ const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+ // Local copy, so overriding .vf below does not modify cpi->fn_ptr.
+ aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+ const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
+
+ // The frame-size dependent range raises the first step and reduces the
+ // number of further steps by the same amount.
+ int step_param = 3;
+ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+ const int sr = get_search_range(cpi);
+ step_param += sr;
+ further_steps -= sr;
+
+ // Override the default variance function to use MSE.
+ v_fn_ptr.vf = get_block_variance_fn(bsize);
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ v_fn_ptr.vf = highbd_get_block_variance_fn(bsize, xd->bd);
+ }
+
+ // Center the initial step/diamond search on best mv.
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ // Add the penalty only when it cannot push the error past INT_MAX.
+ if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+
+ // Carry out further step/diamond searches as necessary.
+ n = num00;
+ num00 = 0;
+
+ while (n < further_steps) {
+ ++n;
+
+ // A non-zero num00 makes this iteration skip its search.
+ if (num00) {
+ --num00;
+ } else {
+ tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv,
+ step_param + n, x->sadperbit16, &num00,
+ &v_fn_ptr, ref_mv);
+ if (tmp_err < INT_MAX)
+ tmp_err = av1_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1);
+ if (tmp_err < INT_MAX - new_mv_mode_penalty)
+ tmp_err += new_mv_mode_penalty;
+
+ if (tmp_err < *best_motion_err) {
+ *best_motion_err = tmp_err;
+ *best_mv = tmp_mv;
+ }
+ }
+ }
+}
+
+// Picks the block size used for the macroblock at (|mb_row|, |mb_col|).
+// A dimension keeps its full 16-sample extent only when the second 8x8 half
+// of the macroblock still starts inside the frame (in mode-info units);
+// otherwise that dimension is reduced to 8.
+static BLOCK_SIZE get_bsize(const AV1_COMMON *cm, int mb_row, int mb_col) {
+  const int half_mb = mi_size_wide[BLOCK_8X8];
+  const int full_width =
+      mi_size_wide[BLOCK_16X16] * mb_col + half_mb < cm->mi_cols;
+  const int full_height =
+      mi_size_wide[BLOCK_16X16] * mb_row + half_mb < cm->mi_rows;
+
+  if (full_width) return full_height ? BLOCK_16X16 : BLOCK_16X8;
+  return full_height ? BLOCK_8X16 : BLOCK_8X8;
+}
+
+// Returns the lowest qindex whose real quantizer value (for |bit_depth|) is
+// at least FIRST_PASS_Q, or the last valid qindex when none qualifies.
+static int find_fp_qindex(aom_bit_depth_t bit_depth) {
+  for (int qindex = 0; qindex < QINDEX_RANGE; ++qindex) {
+    if (av1_convert_qindex_to_q(qindex, bit_depth) >= FIRST_PASS_Q)
+      return qindex;
+  }
+  return QINDEX_RANGE - 1;
+}
+
+// Chooses the frame type for the first pass: a key frame for the very first
+// frame or when the key-frame flag is set (and no alt-ref refresh is
+// pending), an inter frame otherwise. Periodic key frames are disabled.
+static void set_first_pass_params(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  const int code_as_key =
+      !cpi->refresh_alt_ref_frame &&
+      (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY));
+
+  cm->frame_type = code_as_key ? KEY_FRAME : INTER_FRAME;
+
+  // Do not use periodic key frames.
+  cpi->rc.frames_to_key = INT_MAX;
+}
+
+// Returns the population standard deviation of the first
+// |raw_motion_err_counts| entries of |raw_motion_err_list|, or 0 when the
+// count is 0. The caller fills the list with the 0,0-motion error of every
+// inter block measured against the last source frame.
+static double raw_motion_error_stdev(int *raw_motion_err_list,
+                                     int raw_motion_err_counts) {
+  if (raw_motion_err_counts == 0) return 0;
+
+  // Sum in 64 bits so that many large per-block errors cannot overflow.
+  int64_t total = 0;
+  for (int k = 0; k < raw_motion_err_counts; ++k) {
+    total += raw_motion_err_list[k];
+  }
+  const double mean = (double)total / raw_motion_err_counts;
+
+  double squared_dev_sum = 0;
+  for (int k = 0; k < raw_motion_err_counts; ++k) {
+    const double dev = raw_motion_err_list[k] - mean;
+    squared_dev_sum += dev * dev;
+  }
+  return sqrt(squared_dev_sum / raw_motion_err_counts);
+}
+
+// Intra error below which a macroblock is counted in intra_skip_count.
+#define UL_INTRA_THRESH 50
+// Marks image_data_start_row as "no textured row found yet".
+#define INVALID_ROW -1
+
+// Runs the first pass over one source frame.
+//
+// Every 16x16 macroblock is coded with an intra DC prediction and, on inter
+// frames, compared against zero-motion and searched-motion predictions from
+// the last frame (and from the golden frame once current_video_frame > 1 and
+// a golden buffer exists). The resulting error and motion statistics are
+// stored in twopass->this_frame_stats, emitted through output_stats() and
+// accumulated into twopass->total_stats. Afterwards the reference map is
+// updated so that the frame just coded becomes LAST_FRAME, and
+// cm->current_video_frame is incremented.
+//
+// |source| is only read for the frame duration (ts_end - ts_start).
+void av1_first_pass(AV1_COMP *cpi, const struct lookahead_entry *source) {
+  int mb_row, mb_col;
+  MACROBLOCK *const x = &cpi->td.mb;
+  AV1_COMMON *const cm = &cpi->common;
+  const SequenceHeader *const seq_params = &cm->seq_params;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TileInfo tile;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  const PICK_MODE_CONTEXT *ctx =
+      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
+  int i;
+
+  int recon_yoffset, recon_uvoffset;
+  int64_t intra_error = 0;
+  int64_t frame_avg_wavelet_energy = 0;
+  int64_t coded_error = 0;
+  int64_t sr_coded_error = 0;
+
+  int sum_mvr = 0, sum_mvc = 0;
+  int sum_mvr_abs = 0, sum_mvc_abs = 0;
+  int64_t sum_mvrs = 0, sum_mvcs = 0;
+  int mvcount = 0;
+  int intercount = 0;
+  int second_ref_count = 0;
+  const int intrapenalty = INTRA_MODE_PENALTY;
+  double neutral_count;
+  int intra_skip_count = 0;
+  int image_data_start_row = INVALID_ROW;
+  int new_mv_count = 0;
+  int sum_in_vectors = 0;
+  MV lastmv = kZeroMv;
+  TWO_PASS *twopass = &cpi->twopass;
+  int recon_y_stride, recon_uv_stride, uv_mb_height;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+  double intra_factor;
+  double brightness_factor;
+  BufferPool *const pool = cm->buffer_pool;
+  const int qindex = find_fp_qindex(seq_params->bit_depth);
+  const int mb_scale = mi_size_wide[BLOCK_16X16];
+
+  // One raw (source vs. last source) zero-motion error per inter macroblock;
+  // released once the standard deviation has been computed.
+  int *raw_motion_err_list;
+  int raw_motion_err_counts = 0;
+  CHECK_MEM_ERROR(
+      cm, raw_motion_err_list,
+      aom_calloc(cm->mb_rows * cm->mb_cols, sizeof(*raw_motion_err_list)));
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert(frame_is_intra_only(cm) || (lst_yv12 != NULL));
+
+#if CONFIG_FP_MB_STATS
+  if (cpi->use_fp_mb_stats) {
+    av1_zero_array(cpi->twopass.frame_mb_stats_buf, cpi->initial_mbs);
+  }
+#endif
+
+  aom_clear_system_state();
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+  x->e_mbd.mi[0]->sb_type = BLOCK_16X16;
+
+  intra_factor = 0.0;
+  brightness_factor = 0.0;
+  neutral_count = 0.0;
+
+  set_first_pass_params(cpi);
+  av1_set_quantizer(cm, qindex);
+
+  av1_setup_block_planes(&x->e_mbd, seq_params->subsampling_x,
+                         seq_params->subsampling_y, num_planes);
+
+  av1_setup_src_planes(x, cpi->source, 0, 0, num_planes);
+  av1_setup_dst_planes(xd->plane, seq_params->sb_size, new_yv12, 0, 0, 0,
+                       num_planes);
+
+  if (!frame_is_intra_only(cm)) {
+    av1_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL, num_planes);
+  }
+
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  // Don't store luma on the first pass since chroma is not computed
+  xd->cfl.store_y = 0;
+  av1_frame_init_quantizer(cpi);
+
+  for (i = 0; i < num_planes; ++i) {
+    p[i].coeff = ctx->coeff[i];
+    p[i].qcoeff = ctx->qcoeff[i];
+    pd[i].dqcoeff = ctx->dqcoeff[i];
+    p[i].eobs = ctx->eobs[i];
+    p[i].txb_entropy_ctx = ctx->txb_entropy_ctx[i];
+  }
+
+  av1_init_mv_probs(cm);
+  av1_init_lv_map(cm);
+  av1_initialize_rd_consts(cpi);
+
+  // Tiling is ignored in the first pass.
+  av1_tile_init(&tile, cm, 0, 0);
+
+  recon_y_stride = new_yv12->y_stride;
+  recon_uv_stride = new_yv12->uv_stride;
+  // 8 when the chroma planes are shorter than luma, 16 otherwise.
+  uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    MV best_ref_mv = kZeroMv;
+
+    // Reset above block coeffs.
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_limits.row_max =
+        ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      int this_error;
+      // True only along the top row and left column, excluding block (0, 0).
+      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+      double log_intra;
+      int level_sample;
+
+#if CONFIG_FP_MB_STATS
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
+
+      aom_clear_system_state();
+
+      const int idx_str = xd->mi_stride * mb_row * mb_scale + mb_col * mb_scale;
+      xd->mi = cm->mi_grid_visible + idx_str;
+      xd->mi[0] = cm->mi + idx_str;
+      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+      xd->left_available = (mb_col != 0);
+      xd->mi[0]->sb_type = bsize;
+      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+      set_mi_row_col(xd, &tile, mb_row * mb_scale, mi_size_high[bsize],
+                     mb_col * mb_scale, mi_size_wide[bsize], cm->mi_rows,
+                     cm->mi_cols);
+
+      set_plane_n4(xd, mi_size_wide[bsize], mi_size_high[bsize], num_planes);
+
+      // Do intra 16x16 prediction.
+      xd->mi[0]->segment_id = 0;
+      xd->lossless[xd->mi[0]->segment_id] = (qindex == 0);
+      xd->mi[0]->mode = DC_PRED;
+      xd->mi[0]->tx_size =
+          use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+      av1_encode_intra_block_plane(cpi, x, bsize, 0, 0, mb_row * 2, mb_col * 2);
+      this_error = aom_get_mb_ss(x->plane[0].src_diff);
+
+      // Keep a record of blocks that have almost no intra error residual
+      // (i.e. are in effect completely flat and untextured in the intra
+      // domain). In natural videos this is uncommon, but it is much more
+      // common in animations, graphics and screen content, so may be used
+      // as a signal to detect these types of content.
+      if (this_error < UL_INTRA_THRESH) {
+        ++intra_skip_count;
+      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
+        image_data_start_row = mb_row;
+      }
+
+      // Scale the error of 10- and 12-bit content back towards the 8-bit
+      // range.
+      if (seq_params->use_highbitdepth) {
+        switch (seq_params->bit_depth) {
+          case AOM_BITS_8: break;
+          case AOM_BITS_10: this_error >>= 4; break;
+          case AOM_BITS_12: this_error >>= 8; break;
+          default:
+            assert(0 &&
+                   "seq_params->bit_depth should be AOM_BITS_8, "
+                   "AOM_BITS_10 or AOM_BITS_12");
+            // Release the per-frame error list before bailing out so that
+            // this early return does not leak it when asserts are disabled.
+            aom_free(raw_motion_err_list);
+            return;
+        }
+      }
+
+      aom_clear_system_state();
+      log_intra = log(this_error + 1.0);
+      if (log_intra < 10.0)
+        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+      else
+        intra_factor += 1.0;
+
+      // Sample the top-left source pixel of the block as its brightness.
+      if (seq_params->use_highbitdepth)
+        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+      else
+        level_sample = x->plane[0].src.buf[0];
+      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
+        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
+      else
+        brightness_factor += 1.0;
+
+      // Intrapenalty below deals with situations where the intra and inter
+      // error scores are very low (e.g. a plain black frame).
+      // We do not have special cases in first pass for 0,0 and nearest etc so
+      // all inter modes carry an overhead cost estimate for the mv.
+      // When the error score is very low this causes us to pick all or lots of
+      // INTRA modes and throw lots of key frames.
+      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+      this_error += intrapenalty;
+
+      // Accumulate the intra error.
+      intra_error += (int64_t)this_error;
+
+      // Accumulate the Haar AC energy of the four 8x8 quadrants of the
+      // source macroblock.
+      int stride = x->plane[0].src.stride;
+      uint8_t *buf = x->plane[0].src.buf;
+      for (int r8 = 0; r8 < 2; ++r8)
+        for (int c8 = 0; c8 < 2; ++c8) {
+          int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+          frame_avg_wavelet_energy += av1_haar_ac_sad_8x8_uint8_input(
+              buf + c8 * 8 + r8 * 8 * stride, stride, hbd);
+        }
+
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        // initialization
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+      }
+#endif
+
+      // Set up limit values for motion vectors to prevent them extending
+      // outside the UMV borders.
+      x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+      x->mv_limits.col_max =
+          ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+      if (!frame_is_intra_only(cm)) {  // Do a motion search
+        int tmp_err, motion_error, raw_motion_error;
+        // Assume 0,0 motion with no mv overhead.
+        MV mv = kZeroMv, tmp_mv = kZeroMv;
+        struct buf_2d unscaled_last_source_buf_2d;
+
+        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+        } else {
+          motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                              &xd->plane[0].pre[0]);
+        }
+
+        // Compute the motion error of the 0,0 motion using the last source
+        // frame as the reference. Skip the further motion search on
+        // reconstructed frame if this error is small.
+        unscaled_last_source_buf_2d.buf =
+            cpi->unscaled_last_source->y_buffer + recon_yoffset;
+        unscaled_last_source_buf_2d.stride =
+            cpi->unscaled_last_source->y_stride;
+        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+          raw_motion_error = highbd_get_prediction_error(
+              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+        } else {
+          raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                  &unscaled_last_source_buf_2d);
+        }
+
+        // TODO(pengchong): Replace the hard-coded threshold
+        if (raw_motion_error > 25) {
+          // Test last reference frame using the previous best mv as the
+          // starting point (best reference) for the search.
+          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
+
+          // If the current best reference mv is not centered on 0,0 then do a
+          // 0,0 based search as well.
+          if (!is_zero_mv(&best_ref_mv)) {
+            tmp_err = INT_MAX;
+            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv, &tmp_err);
+
+            if (tmp_err < motion_error) {
+              motion_error = tmp_err;
+              mv = tmp_mv;
+            }
+          }
+
+          // Search in an older reference frame.
+          if ((cm->current_video_frame > 1) && gld_yv12 != NULL) {
+            // Assume 0,0 motion with no mv overhead.
+            int gf_motion_error;
+
+            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+              gf_motion_error = highbd_get_prediction_error(
+                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+            } else {
+              gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                     &xd->plane[0].pre[0]);
+            }
+
+            first_pass_motion_search(cpi, x, &kZeroMv, &tmp_mv,
+                                     &gf_motion_error);
+
+            if (gf_motion_error < motion_error && gf_motion_error < this_error)
+              ++second_ref_count;
+
+            // Reset to last frame as reference buffer.
+            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+            // In accumulating a score for the older reference frame take the
+            // best of the motion predicted score and the intra coded error
+            // (just as will be done for) accumulation of "coded_error" for
+            // the last frame.
+            if (gf_motion_error < this_error)
+              sr_coded_error += gf_motion_error;
+            else
+              sr_coded_error += this_error;
+          } else {
+            sr_coded_error += motion_error;
+          }
+        } else {
+          sr_coded_error += motion_error;
+        }
+
+        // Start by assuming that intra mode is best.
+        best_ref_mv.row = 0;
+        best_ref_mv.col = 0;
+
+#if CONFIG_FP_MB_STATS
+        if (cpi->use_fp_mb_stats) {
+          // intra prediction statistics
+          cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+          if (this_error > FPMB_ERROR_LARGE_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+          } else if (this_error < FPMB_ERROR_SMALL_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+          }
+        }
+#endif
+
+        if (motion_error <= this_error) {
+          aom_clear_system_state();
+
+          // Keep a count of cases where the inter and intra were very close
+          // and very low. This helps with scene cut detection for example in
+          // cropped clips with black bars at the sides or top and bottom.
+          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+              (this_error < (2 * intrapenalty))) {
+            neutral_count += 1.0;
+            // Also track cases where the intra is not much worse than the inter
+            // and use this in limiting the GF/arf group length.
+          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+            neutral_count +=
+                (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+          }
+
+          // Convert the full-pel search result to 1/8 pel units.
+          mv.row *= 8;
+          mv.col *= 8;
+          this_error = motion_error;
+          xd->mi[0]->mode = NEWMV;
+          xd->mi[0]->mv[0].as_mv = mv;
+          xd->mi[0]->tx_size = TX_4X4;
+          xd->mi[0]->ref_frame[0] = LAST_FRAME;
+          xd->mi[0]->ref_frame[1] = NONE_FRAME;
+          av1_build_inter_predictors_sby(cm, xd, mb_row * mb_scale,
+                                         mb_col * mb_scale, NULL, bsize);
+          av1_encode_sby_pass1(cm, x, bsize);
+          sum_mvr += mv.row;
+          sum_mvr_abs += abs(mv.row);
+          sum_mvc += mv.col;
+          sum_mvc_abs += abs(mv.col);
+          sum_mvrs += mv.row * mv.row;
+          sum_mvcs += mv.col * mv.col;
+          ++intercount;
+
+          best_ref_mv = mv;
+
+#if CONFIG_FP_MB_STATS
+          if (cpi->use_fp_mb_stats) {
+            // inter prediction statistics
+            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+            if (this_error > FPMB_ERROR_LARGE_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_LARGE_MASK;
+            } else if (this_error < FPMB_ERROR_SMALL_TH) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_ERROR_SMALL_MASK;
+            }
+          }
+#endif
+
+          if (!is_zero_mv(&mv)) {
+            ++mvcount;
+
+#if CONFIG_FP_MB_STATS
+            if (cpi->use_fp_mb_stats) {
+              cpi->twopass.frame_mb_stats_buf[mb_index] &=
+                  ~FPMB_MOTION_ZERO_MASK;
+              // check estimated motion direction
+              if (mv.col > 0 && mv.col >= abs(mv.row)) {
+                // right direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_RIGHT_MASK;
+              } else if (mv.row < 0 && abs(mv.row) >= abs(mv.col)) {
+                // up direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_UP_MASK;
+              } else if (mv.col < 0 && abs(mv.col) >= abs(mv.row)) {
+                // left direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_LEFT_MASK;
+              } else {
+                // down direction
+                cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                    FPMB_MOTION_DOWN_MASK;
+              }
+            }
+#endif
+
+            // Non-zero vector, was it different from the last non zero vector?
+            if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count;
+            lastmv = mv;
+
+            // Does the row vector point inwards or outwards?
+            if (mb_row < cm->mb_rows / 2) {
+              if (mv.row > 0)
+                --sum_in_vectors;
+              else if (mv.row < 0)
+                ++sum_in_vectors;
+            } else if (mb_row > cm->mb_rows / 2) {
+              if (mv.row > 0)
+                ++sum_in_vectors;
+              else if (mv.row < 0)
+                --sum_in_vectors;
+            }
+
+            // Does the col vector point inwards or outwards?
+            if (mb_col < cm->mb_cols / 2) {
+              if (mv.col > 0)
+                --sum_in_vectors;
+              else if (mv.col < 0)
+                ++sum_in_vectors;
+            } else if (mb_col > cm->mb_cols / 2) {
+              if (mv.col > 0)
+                ++sum_in_vectors;
+              else if (mv.col < 0)
+                --sum_in_vectors;
+            }
+          }
+        }
+        raw_motion_err_list[raw_motion_err_counts++] = raw_motion_error;
+      } else {
+        sr_coded_error += (int64_t)this_error;
+      }
+      coded_error += (int64_t)this_error;
+
+      // Adjust to the next column of MBs.
+      x->plane[0].src.buf += 16;
+      x->plane[1].src.buf += uv_mb_height;
+      x->plane[2].src.buf += uv_mb_height;
+
+      recon_yoffset += 16;
+      recon_uvoffset += uv_mb_height;
+    }
+    // Adjust to the next row of MBs.
+    // NOTE(review): plane[2] is advanced with plane[1]'s stride; presumably
+    // both chroma planes share one stride -- confirm.
+    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
+    x->plane[1].src.buf +=
+        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+    x->plane[2].src.buf +=
+        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+
+    aom_clear_system_state();
+  }
+  const double raw_err_stdev =
+      raw_motion_error_stdev(raw_motion_err_list, raw_motion_err_counts);
+  aom_free(raw_motion_err_list);
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((image_data_start_row > cm->mb_rows / 2) ||
+      (image_data_start_row == INVALID_ROW)) {
+    image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (image_data_start_row > 0) {
+    intra_skip_count =
+        AOMMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+  }
+
+  {
+    FIRSTPASS_STATS fps;
+    // The minimum error here ensures some bit allocation to frames even
+    // in static regions. The allocation per MB declines for larger formats
+    // where the typical "real" energy per MB also falls.
+    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+    // number of mbs is proportional to the image area.
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.MBs;
+    const double min_err = 200 * sqrt(num_mbs);
+
+    intra_factor = intra_factor / (double)num_mbs;
+    brightness_factor = brightness_factor / (double)num_mbs;
+    fps.weight = intra_factor * brightness_factor;
+
+    fps.frame = cm->current_video_frame;
+    fps.coded_error = (double)(coded_error >> 8) + min_err;
+    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
+    fps.intra_error = (double)(intra_error >> 8) + min_err;
+    fps.frame_avg_wavelet_energy = (double)frame_avg_wavelet_energy;
+    fps.count = 1.0;
+    fps.pcnt_inter = (double)intercount / num_mbs;
+    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
+    fps.pcnt_neutral = (double)neutral_count / num_mbs;
+    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
+    fps.inactive_zone_rows = (double)image_data_start_row;
+    fps.inactive_zone_cols = (double)0;  // TODO(paulwilkins): fix
+    fps.raw_error_stdev = raw_err_stdev;
+
+    if (mvcount > 0) {
+      fps.MVr = (double)sum_mvr / mvcount;
+      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
+      fps.MVc = (double)sum_mvc / mvcount;
+      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
+      fps.MVrv =
+          ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
+      fps.MVcv =
+          ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
+      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
+      fps.new_mv_count = new_mv_count;
+      fps.pcnt_motion = (double)mvcount / num_mbs;
+    } else {
+      fps.MVr = 0.0;
+      fps.mvr_abs = 0.0;
+      fps.MVc = 0.0;
+      fps.mvc_abs = 0.0;
+      fps.MVrv = 0.0;
+      fps.MVcv = 0.0;
+      fps.mv_in_out_count = 0.0;
+      fps.new_mv_count = 0.0;
+      fps.pcnt_motion = 0.0;
+    }
+
+    // TODO(paulwilkins): Handle the case when duration is set to 0, or
+    // something less than the full time between subsequent values of
+    // cpi->source_time_stamp.
+    fps.duration = (double)(source->ts_end - source->ts_start);
+
+    // Don't want to do output stats with a stack variable!
+    twopass->this_frame_stats = fps;
+    output_stats(&twopass->this_frame_stats, cpi->output_pkt_list);
+    accumulate_stats(&twopass->total_stats, &fps);
+
+#if CONFIG_FP_MB_STATS
+    if (cpi->use_fp_mb_stats) {
+      output_fpmb_stats(twopass->frame_mb_stats_buf, cpi->initial_mbs,
+                        cpi->output_pkt_list);
+    }
+#endif
+  }
+
+  // Copy the previous Last Frame back into gf and arf buffers if
+  // the prediction is good enough... but also don't allow it to lag too far.
+  if ((twopass->sr_update_lag > 3) ||
+      ((cm->current_video_frame > 0) &&
+       (twopass->this_frame_stats.pcnt_inter > 0.20) &&
+       ((twopass->this_frame_stats.intra_error /
+         DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
+    if (gld_yv12 != NULL) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+                 cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
+    }
+    twopass->sr_update_lag = 1;
+  } else {
+    ++twopass->sr_update_lag;
+  }
+
+  aom_extend_frame_borders(new_yv12, num_planes);
+
+  // The frame we just compressed now becomes the last frame.
+  ref_cnt_fb(pool->frame_bufs,
+             &cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]],
+             cm->new_fb_idx);
+
+  // Special case for the first frame. Copy into the GF buffer as a second
+  // reference.
+  if (cm->current_video_frame == 0 &&
+      cpi->ref_fb_idx[GOLDEN_FRAME - 1] != INVALID_IDX) {
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->ref_fb_idx[GOLDEN_FRAME - 1]],
+               cm->ref_frame_map[cpi->ref_fb_idx[LAST_FRAME - 1]]);
+  }
+
+  // Use this to see what the first pass reconstruction looks like.
+  if (0) {
+    char filename[512];
+    FILE *recon_file;
+    snprintf(filename, sizeof(filename), "enc%04d.yuv",
+             (int)cm->current_video_frame);
+
+    if (cm->current_video_frame == 0)
+      recon_file = fopen(filename, "wb");
+    else
+      recon_file = fopen(filename, "ab");
+
+    (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file);
+    fclose(recon_file);
+  }
+
+  ++cm->current_video_frame;
+}
+
+// Returns (err_per_mb / err_divisor) raised to a quantizer-dependent power,
+// clamped to [0.05, 5.0]. The power is the real quantizer value for |q| at
+// |bit_depth| times 0.01 plus |pt_low|, capped at |pt_high|.
+static double calc_correction_factor(double err_per_mb, double err_divisor,
+                                     double pt_low, double pt_high, int q,
+                                     aom_bit_depth_t bit_depth) {
+  const double scaled_err = err_per_mb / err_divisor;
+
+  // Adjustment based on actual quantizer to power term.
+  const double exponent =
+      AOMMIN(av1_convert_qindex_to_q(q, bit_depth) * 0.01 + pt_low, pt_high);
+
+  // A fractional power is only expected with a non-negative base.
+  assert(!(exponent < 1.0) || scaled_err >= 0.0);
+
+  return fclamp(pow(scaled_err, exponent), 0.05, 5.0);
+}
+
+// Base divisor applied to the per-MB error before the size correction.
+#define ERR_DIVISOR 100.0
+
+// Estimates the lowest q (searching upwards from rc->best_quality) at which
+// the predicted bits per macroblock no longer exceed the per-macroblock share
+// of |section_target_bandwidth|.
+//
+// |section_err| is the section error score, |inactive_zone| the fraction of
+// the frame treated as inactive (clamped to [0, 1]) and
+// |group_weight_factor| an extra multiplier on the correction factor.
+// Returns rc->worst_quality when the target bandwidth is not positive. In
+// AOM_CQ mode the result is never below oxcf->cq_level.
+static int get_twopass_worst_quality(const AV1_COMP *cpi,
+                                     const double section_err,
+                                     double inactive_zone,
+                                     int section_target_bandwidth,
+                                     double group_weight_factor) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  inactive_zone = fclamp(inactive_zone, 0.0, 1.0);
+
+  if (section_target_bandwidth <= 0) {
+    return rc->worst_quality;  // Highest value allowed
+  }
+
+  const int num_mbs =
+      (oxcf->resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs;
+  const int active_mbs = AOMMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
+  const double av_err_per_mb = section_err / active_mbs;
+  const double speed_term = 1.0;
+  double ediv_size_correction;
+  // Shift and divide in 64 bits and only then narrow. Narrowing the shifted
+  // value first would discard its high bits for large target bandwidths.
+  // active_mbs is at least 1 and the bandwidth is positive here.
+  const uint64_t norm_bits_per_mb =
+      ((uint64_t)section_target_bandwidth << BPER_MB_NORMBITS) /
+      (uint64_t)active_mbs;
+  const int target_norm_bits_per_mb =
+      (norm_bits_per_mb > (uint64_t)INT_MAX) ? INT_MAX : (int)norm_bits_per_mb;
+  int q;
+
+  // Larger image formats are expected to be a little harder to code
+  // relatively given the same prediction error score. This in part at
+  // least relates to the increased size and hence coding overheads of
+  // motion vectors. Some account of this is made through adjustment of
+  // the error divisor.
+  ediv_size_correction = AOMMAX(0.2, AOMMIN(5.0, get_linear_size_factor(cpi)));
+  if (ediv_size_correction < 1.0)
+    ediv_size_correction = -(1.0 / ediv_size_correction);
+  ediv_size_correction *= 4.0;
+
+  // Try and pick a max Q that will be high enough to encode the
+  // content at the given rate.
+  for (q = rc->best_quality; q < rc->worst_quality; ++q) {
+    const double factor = calc_correction_factor(
+        av_err_per_mb, ERR_DIVISOR - ediv_size_correction, FACTOR_PT_LOW,
+        FACTOR_PT_HIGH, q, cpi->common.seq_params.bit_depth);
+    const int bits_per_mb = av1_rc_bits_per_mb(
+        INTER_FRAME, q, factor * speed_term * group_weight_factor,
+        cpi->common.seq_params.bit_depth);
+    if (bits_per_mb <= target_norm_bits_per_mb) break;
+  }
+
+  // Restriction on active max q for constrained quality mode.
+  if (oxcf->rc_mode == AOM_CQ) q = AOMMAX(q, oxcf->cq_level);
+  return q;
+}
+
+// Fills rc->rf_level_maxq[] for every rate factor level from INTER_NORMAL
+// upwards: worst_quality plus that level's q delta, but never below
+// best_quality.
+static void setup_rf_level_maxq(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  for (int level = INTER_NORMAL; level < RATE_FACTOR_LEVELS; ++level) {
+    const int level_qdelta =
+        av1_frame_type_qdelta(cpi, level, rc->worst_quality);
+    rc->rf_level_maxq[level] =
+        AOMMAX(rc->worst_quality + level_qdelta, rc->best_quality);
+  }
+}
+
+// Prepares the two-pass state for the second pass. The total stats are
+// loaded from the entry at twopass->stats_in_end, from which the frame rate,
+// bit budget and modified error totals are derived. The rate control
+// bookkeeping is then reset. Returns early, with zeroed totals, when no
+// first-pass stats are available.
+void av1_init_second_pass(AV1_COMP *cpi) {
+  TWO_PASS *const twopass = &cpi->twopass;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  FIRSTPASS_STATS *const totals = &twopass->total_stats;
+
+  zero_stats(totals);
+  zero_stats(&twopass->total_left_stats);
+
+  if (twopass->stats_in_end == NULL) return;
+
+  *totals = *twopass->stats_in_end;
+  twopass->total_left_stats = *totals;
+
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+  const double frame_rate = 10000000.0 * totals->count / totals->duration;
+  av1_new_framerate(cpi, frame_rate);
+  twopass->bits_left =
+      (int64_t)(totals->duration * oxcf->target_bandwidth / 10000000.0);
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
+  // Scan the first pass file and calculate a modified total error based upon
+  // the bias/power function used to allocate bits.
+  {
+    const double mean_error =
+        totals->coded_error / DOUBLE_DIVIDE_CHECK(totals->count);
+    double error_sum = 0.0;
+
+    // The min/max limits are set before the scan that follows.
+    twopass->modified_error_min =
+        (mean_error * oxcf->two_pass_vbrmin_section) / 100;
+    twopass->modified_error_max =
+        (mean_error * oxcf->two_pass_vbrmax_section) / 100;
+
+    for (const FIRSTPASS_STATS *entry = twopass->stats_in;
+         entry < twopass->stats_in_end; ++entry) {
+      error_sum += calculate_modified_err(cpi, twopass, oxcf, entry);
+    }
+    twopass->modified_error_left = error_sum;
+  }
+
+  // Reset the vbr bits off target counters
+  cpi->rc.vbr_bits_off_target = 0;
+  cpi->rc.vbr_bits_off_target_fast = 0;
+
+  cpi->rc.rate_error_estimate = 0;
+
+  // Static sequence monitor variables.
+  twopass->kf_zeromotion_pct = 100;
+  twopass->last_kfgroup_zeromotion_pct = 100;
+
+  if (oxcf->resize_mode != RESIZE_NONE) {
+    setup_rf_level_maxq(cpi);
+  }
+}
+
+// Weights and limits used by get_sr_decay_rate().
+#define SR_DIFF_PART 0.0015       // Weight of the per-MB second-ref error gap.
+#define MOTION_AMP_PART 0.003     // Weight of the motion amplitude factor.
+#define INTRA_PART 0.005          // Weight of the modified intra percentage.
+#define DEFAULT_DECAY_LIMIT 0.75  // Cap on the inter-percentage based floor.
+#define LOW_SR_DIFF_TRHESH 0.1    // sr_diff must exceed this to reduce decay.
+#define SR_DIFF_MAX 128.0         // Upper bound applied to sr_diff.
+
+// Returns the decay rate derived from how much worse the second reference
+// predicts the frame than the last frame does (per macroblock), the amount
+// of motion and the share of intra-coded blocks. The result is never lower
+// than min(DEFAULT_DECAY_LIMIT, modified inter percentage).
+static double get_sr_decay_rate(const AV1_COMP *cpi,
+                                const FIRSTPASS_STATS *frame) {
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  const double sr_diff =
+      (frame->sr_coded_error - frame->coded_error) / num_mbs;
+  const double motion_amplitude_factor =
+      frame->pcnt_motion * ((frame->mvc_abs + frame->mvr_abs) / 2);
+
+  // When intra is nearly as good as inter, do not count the "neutral" blocks
+  // as inter coded.
+  const double modified_pct_inter =
+      ((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
+       (double)NCOUNT_FRAME_II_THRESH)
+          ? frame->pcnt_inter - frame->pcnt_neutral
+          : frame->pcnt_inter;
+  const double modified_pcnt_intra = 100 * (1.0 - modified_pct_inter);
+
+  double sr_decay = 1.0;
+  if (sr_diff > LOW_SR_DIFF_TRHESH) {
+    const double capped_sr_diff = AOMMIN(sr_diff, SR_DIFF_MAX);
+    sr_decay = 1.0 - (SR_DIFF_PART * capped_sr_diff) -
+               (MOTION_AMP_PART * motion_amplitude_factor) -
+               (INTRA_PART * modified_pcnt_intra);
+  }
+  return AOMMAX(sr_decay, AOMMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter));
+}
+
+// Returns the smaller of the second-reference decay rate and the share of
+// the frame that was inter coded without motion (pcnt_inter - pcnt_motion).
+static double get_zero_motion_factor(const AV1_COMP *cpi,
+                                     const FIRSTPASS_STATS *frame) {
+  double decay_rate = get_sr_decay_rate(cpi, frame);
+  const double static_pct = frame->pcnt_inter - frame->pcnt_motion;
+
+  return AOMMIN(decay_rate, static_pct);
+}
+
+// Exponent applied to the zero-motion share in get_prediction_decay_rate().
+#define ZM_POWER_FACTOR 0.75
+
+// Combines the second-reference decay rate of |next_frame| with a factor
+// based on its zero-motion share (pcnt_inter - pcnt_motion). The result is
+// never lower than that zero-motion factor.
+static double get_prediction_decay_rate(const AV1_COMP *cpi,
+                                        const FIRSTPASS_STATS *next_frame) {
+  const double decay_rate = get_sr_decay_rate(cpi, next_frame);
+  const double static_factor =
+      (0.95 * pow((next_frame->pcnt_inter - next_frame->pcnt_motion),
+                  ZM_POWER_FACTOR));
+
+  return AOMMAX(static_factor,
+                (decay_rate + ((1.0 - decay_rate) * static_factor)));
+}
+
+// Tests for a complex transition that is followed by a static section, for
+// example a fade between slides in a slide show. This helps with more
+// optimal kf and gf positioning.
+//
+// Returns 1 only when the interval is already longer than the minimum gf
+// interval, the decay rate has just jumped from below 0.9 to at least 0.999,
+// and each of the next |still_interval| stats entries is at least 99.9%
+// zero-motion inter. Returns 0 otherwise.
+static int detect_transition_to_still(AV1_COMP *cpi, int frame_interval,
+                                      int still_interval,
+                                      double loop_decay_rate,
+                                      double last_decay_rate) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const RATE_CONTROL *const rc = &cpi->rc;
+
+  // Break clause to detect very still sections after motion
+  // For example a static image after a fade or other transition
+  // instead of a clean scene cut.
+  if (frame_interval <= rc->min_gf_interval) return 0;
+  if (!(loop_decay_rate >= 0.999)) return 0;
+  if (!(last_decay_rate < 0.9)) return 0;
+
+  // Look ahead a few frames to see if static condition persists...
+  int still_frames = 0;
+  while (still_frames < still_interval) {
+    const FIRSTPASS_STATS *stats = &twopass->stats_in[still_frames];
+    if (stats >= twopass->stats_in_end) break;
+    if (stats->pcnt_inter - stats->pcnt_motion < 0.999) break;
+    ++still_frames;
+  }
+
+  // Only if it does do we signal a transition to still.
+  return still_frames == still_interval;
+}
+
+// Detects a flash through the high relative pcnt_second_ref score in the
+// frame following a flash frame; |offset| should reflect this.
+// Returns 0 when no stats are available at |offset|.
+static int detect_flash(const TWO_PASS *twopass, int offset) {
+  const FIRSTPASS_STATS *const following = read_frame_stats(twopass, offset);
+
+  if (following == NULL) return 0;
+
+  // What we are looking for here is a situation where there is a
+  // brief break in prediction (such as a flash) but subsequent frames
+  // are reasonably well predicted by an earlier (pre flash) frame.
+  // The recovery after a flash is indicated by a high pcnt_second_ref
+  // compared to pcnt_inter.
+  return following->pcnt_second_ref > following->pcnt_inter &&
+         following->pcnt_second_ref >= 0.5;
+}
+
+// Updates the motion related inputs of the GF/arf boost calculation from one
+// frame's stats. *mv_in_out is overwritten with this frame's weighted
+// in/out count; the three accumulators are added to.
+static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats,
+                                          double *mv_in_out,
+                                          double *mv_in_out_accumulator,
+                                          double *abs_mv_in_out_accumulator,
+                                          double *mv_ratio_accumulator) {
+  const double motion_pct = stats->pcnt_motion;
+
+  // Accumulate Motion In/Out of frame stats.
+  *mv_in_out = stats->mv_in_out_count * motion_pct;
+  *mv_in_out_accumulator += *mv_in_out;
+  *abs_mv_in_out_accumulator += fabs(*mv_in_out);
+
+  // The uniformity measure is only taken when more than 5% of the blocks
+  // moved.
+  if (!(motion_pct > 0.05)) return;
+
+  // Accumulate a measure of how uniform (or conversely how random) the motion
+  // field is (a ratio of abs(mv) / mv).
+  const double row_ratio =
+      fabs(stats->mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVr));
+  const double col_ratio =
+      fabs(stats->mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(stats->MVc));
+
+  // Each ratio is limited to the corresponding mean absolute component.
+  *mv_ratio_accumulator +=
+      motion_pct * (row_ratio < stats->mvr_abs ? row_ratio : stats->mvr_abs);
+  *mv_ratio_accumulator +=
+      motion_pct * (col_ratio < stats->mvc_abs ? col_ratio : stats->mvc_abs);
+}
+
+#define BASELINE_ERR_PER_MB 1000.0
+// Computes the boost contribution of a single frame.
+//   this_frame:           first pass stats of the frame.
+//   this_frame_mv_in_out: net motion in/out of the frame, nominally in the
+//                         range -1.0 to +1.0.
+//   max_boost:            upper bound, scaled by the same q correction that
+//                         is applied to the boost itself.
+static double calc_frame_boost(AV1_COMP *cpi, const FIRSTPASS_STATS *this_frame,
+                               double this_frame_mv_in_out, double max_boost) {
+  const double avg_q = av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.seq_params.bit_depth);
+  const double q_correction = AOMMIN((0.5 + (avg_q * 0.015)), 1.5);
+  int mb_count;
+  double boost;
+  double motion_scale;
+
+  if (cpi->oxcf.resize_mode != RESIZE_NONE)
+    mb_count = cpi->initial_mbs;
+  else
+    mb_count = cpi->common.MBs;
+
+  // Correct for any inactive region in the image.
+  mb_count = (int)AOMMAX(1, mb_count * calculate_active_area(cpi, this_frame));
+
+  // The underlying boost factor is based on the inter error ratio.
+  boost = (BASELINE_ERR_PER_MB * mb_count) /
+          DOUBLE_DIVIDE_CHECK(this_frame->coded_error);
+  boost = boost * BOOST_FACTOR * q_correction;
+
+  // New data coming into the frame (e.g. zoom out) raises the boost; a net
+  // balance of motion out of the frame (zoom in) lowers it slightly, halving
+  // it in the extreme case.
+  if (this_frame_mv_in_out > 0.0)
+    motion_scale = this_frame_mv_in_out * 2.0;
+  else
+    motion_scale = this_frame_mv_in_out / 2.0;
+  boost += boost * motion_scale;
+
+  return AOMMIN(boost, max_boost * q_correction);
+}
+
+// Sums the decayed per-frame boost over at most |num_frames| first pass
+// stats entries. The first entry read is at stats offset |first_offset| and
+// each following one is |step| entries further on (+1 walks forward, -1
+// walks backward). The walk stops early when stats run out.
+static double accumulate_arf_boost_score(AV1_COMP *cpi, int first_offset,
+                                         int step, int num_frames) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  double boost_score = 0.0;
+  double decay_accumulator = 1.0;
+  double this_frame_mv_in_out = 0.0;
+  // The accumulators below are outputs of accumulate_frame_motion_stats()
+  // that this calculation does not consume.
+  double mv_ratio_accumulator = 0.0;
+  double mv_in_out_accumulator = 0.0;
+  double abs_mv_in_out_accumulator = 0.0;
+
+  for (int n = 0; n < num_frames; ++n) {
+    const int frame_offset = first_offset + n * step;
+    const FIRSTPASS_STATS *const this_frame =
+        read_frame_stats(twopass, frame_offset);
+    if (this_frame == NULL) break;
+
+    // Update the motion related elements to the boost calculation.
+    accumulate_frame_motion_stats(
+        this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+        &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+
+    // We want to discount the flash frame itself and the recovery
+    // frame that follows as both will have poor scores.
+    const int flash_detected = detect_flash(twopass, frame_offset) ||
+                               detect_flash(twopass, frame_offset + 1);
+
+    // Accumulate the effect of prediction quality decay, floored at
+    // MIN_DECAY_FACTOR.
+    if (!flash_detected) {
+      decay_accumulator *= get_prediction_decay_rate(cpi, this_frame);
+      if (decay_accumulator < MIN_DECAY_FACTOR)
+        decay_accumulator = MIN_DECAY_FACTOR;
+    }
+
+    boost_score +=
+        decay_accumulator *
+        calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+  }
+
+  return boost_score;
+}
+
+// Calculates the boost for an arf / gf at stats offset |offset|.
+//   f_frames: number of frames to search forward, starting at |offset|.
+//   b_frames: number of frames to search backward, starting at |offset| - 1.
+//   f_boost:  receives the forward boost score, truncated to int.
+//   b_boost:  receives the backward boost score, truncated to int.
+// Returns the sum of both scores, raised to at least
+// (b_frames + f_frames) * 20 and to at least MIN_ARF_GF_BOOST.
+static int calc_arf_boost(AV1_COMP *cpi, int offset, int f_frames, int b_frames,
+                          int *f_boost, int *b_boost) {
+  const int min_boost_for_length = (b_frames + f_frames) * 20;
+  int arf_boost;
+
+  // Search forward from the proposed arf/next gf position.
+  *f_boost = (int)accumulate_arf_boost_score(cpi, offset, 1, f_frames);
+
+  // Search backward towards the last gf position.
+  *b_boost = (int)accumulate_arf_boost_score(cpi, offset - 1, -1, b_frames);
+
+  arf_boost = *f_boost + *b_boost;
+  if (arf_boost < min_boost_for_length) arf_boost = min_boost_for_length;
+  arf_boost = AOMMAX(arf_boost, MIN_ARF_GF_BOOST);
+
+  return arf_boost;
+}
+
+// Computes the section intra ratio used when setting the max loop filter:
+// summed intra_error over summed coded_error, truncated to int. At most
+// |section_length| stats entries are visited, starting at |begin| and
+// stopping before |end|.
+static int calculate_section_intra_ratio(const FIRSTPASS_STATS *begin,
+                                         const FIRSTPASS_STATS *end,
+                                         int section_length) {
+  double intra_sum = 0.0;
+  double coded_sum = 0.0;
+  const FIRSTPASS_STATS *cur = begin;
+
+  for (int n = 0; cur < end && n < section_length; ++n, ++cur) {
+    intra_sum += cur->intra_error;
+    coded_sum += cur->coded_error;
+  }
+
+  return (int)(intra_sum / DOUBLE_DIVIDE_CHECK(coded_sum));
+}
+
+// Works out the total number of bits to allocate to this GF/ARF group.
+// The group receives the share of the remaining kf group bits given by
+// gf_group_err / kf_group_error_left. The result is clamped to
+// [0, kf_group_bits] and then capped at the per-frame maximum times the
+// baseline gf interval.
+static int64_t calculate_total_gf_group_bits(AV1_COMP *cpi,
+                                             double gf_group_err) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+  int64_t group_bits = 0;
+
+  // Share of the kf group budget; stays zero when there is nothing left.
+  if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+    group_bits = (int64_t)(twopass->kf_group_bits *
+                           (gf_group_err / twopass->kf_group_error_left));
+  }
+
+  // Clamp odd edge cases.
+  if (group_bits < 0) {
+    group_bits = 0;
+  } else if (group_bits > twopass->kf_group_bits) {
+    group_bits = twopass->kf_group_bits;
+  }
+
+  // Clip based on the user supplied data rate variability limit.
+  const int64_t vbr_cap = (int64_t)max_bits * rc->baseline_gf_interval;
+  if (group_bits > vbr_cap) group_bits = vbr_cap;
+
+  return group_bits;
+}
+
+// Works out how many extra bits to assign to the boosted frame(s) in a
+// group. Every frame counts for 100 allocation chunks and the boost adds
+// |boost| more; the boosted share is boost / chunks of |total_group_bits|.
+// Returns 0 for invalid inputs and never returns a negative value.
+static int calculate_boost_bits(int frame_count, int boost,
+                                int64_t total_group_bits) {
+  // Invalid inputs could arise e.g. through rounding errors.
+  if (boost == 0 || total_group_bits <= 0 || frame_count <= 0) return 0;
+
+  int chunks = (frame_count * 100) + boost;
+
+  // Prevent overflow: scale both terms down for large boosts.
+  if (boost > 1023) {
+    const int scale = boost >> 10;
+    boost /= scale;
+    chunks /= scale;
+  }
+
+  // Extra bits for use in the boosted frame or frames.
+  const int extra_bits = (int)(((int64_t)boost * total_group_bits) / chunks);
+  return AOMMAX(extra_bits, 0);
+}
+
+#if USE_SYMM_MULTI_LAYER
+// #define CHCEK_GF_PARAMETER
+#ifdef CHCEK_GF_PARAMETER
+// Debug helper: appends the GF group layout to "GF_PARAMS.txt".
+//   gf_group:    group whose per-frame parameters are dumped.
+//   gf_interval: written as the record header.
+//   frame_nums:  index of the last entry to dump (entries 0..frame_nums
+//                inclusive are written).
+// Does nothing if the file cannot be opened.
+void check_frame_params(GF_GROUP *const gf_group, int gf_interval,
+                        int frame_nums) {
+  static const char *update_type_strings[] = {
+    "KF_UPDATE",          "LF_UPDATE",      "GF_UPDATE",
+    "ARF_UPDATE",         "OVERLAY_UPDATE", "BRF_UPDATE",
+    "LAST_BIPRED_UPDATE", "BIPRED_UPDATE",  "INTNL_OVERLAY_UPDATE",
+    "INTNL_ARF_UPDATE"
+  };
+  FILE *fid = fopen("GF_PARAMS.txt", "a");
+
+  // fopen() returns NULL on failure; writing to it would be undefined.
+  if (fid == NULL) return;
+
+  fprintf(fid, "\n{%d}\n", gf_interval);
+  for (int i = 0; i <= frame_nums; ++i) {
+    fprintf(fid, "%s %d %d %d %d\n",
+            update_type_strings[gf_group->update_type[i]],
+            gf_group->arf_src_offset[i], gf_group->arf_pos_in_gf[i],
+            gf_group->arf_update_idx[i], gf_group->pyramid_level[i]);
+  }
+
+  fprintf(fid, "number of nodes in each level: \n");
+  for (int i = 0; i < MAX_PYRAMID_LVL; ++i) {
+    fprintf(fid, "lvl %d: %d ", i, gf_group->pyramid_lvl_nodes[i]);
+  }
+  fprintf(fid, "\n");
+  fclose(fid);
+}
+#endif // CHCEK_GF_PARAMETER
+// Derives the rf_level for a frame from its update type:
+//   ARF_UPDATE                    -> GF_ARF_STD
+//   BRF_UPDATE, INTNL_ARF_UPDATE  -> GF_ARF_LOW
+//   anything else                 -> INTER_NORMAL
+static int update_type_2_rf_level(FRAME_UPDATE_TYPE update_type) {
+  if (update_type == ARF_UPDATE) return GF_ARF_STD;
+
+  if (update_type == BRF_UPDATE || update_type == INTNL_ARF_UPDATE)
+    return GF_ARF_LOW;
+
+  return INTER_NORMAL;
+}
+
+// Recursively fills the GF group entries for the frames strictly between
+// display positions |l| and |r|.
+//   frame_ind: in/out index of the next gf_group slot to fill.
+//   arf_ind:   arf_update_idx recorded for leaf frames of this span.
+//   level:     pyramid level given to an internal ARF created for this span.
+// Spans shorter than 4 become plain LF_UPDATE leaves. Longer spans get an
+// internal ARF at the midpoint, then the left half, then the matching
+// internal overlay, then the right half.
+static void set_multi_layer_params(GF_GROUP *const gf_group, int l, int r,
+                                   int *frame_ind, int arf_ind, int level) {
+  if (r - l < 4) {
+    // Leaf nodes, not look-ahead frames.
+    for (int pos = l + 1; pos < r; ++pos) {
+      const int slot = *frame_ind;
+
+      gf_group->update_type[slot] = LF_UPDATE;
+      gf_group->arf_src_offset[slot] = 0;
+      gf_group->arf_pos_in_gf[slot] = 0;
+      gf_group->arf_update_idx[slot] = arf_ind;
+      gf_group->pyramid_level[slot] = 0;
+      ++gf_group->pyramid_lvl_nodes[0];
+
+      *frame_ind = slot + 1;
+    }
+    return;
+  }
+
+  const int mid = (l + r) / 2;
+
+  // Internal ARF for the midpoint frame.
+  const int arf_slot = (*frame_ind)++;
+  gf_group->update_type[arf_slot] = INTNL_ARF_UPDATE;
+  gf_group->arf_src_offset[arf_slot] = mid - l - 1;
+  gf_group->arf_pos_in_gf[arf_slot] = 0;
+  gf_group->arf_update_idx[arf_slot] = 1;  // mark all internal ARF 1
+  gf_group->pyramid_level[arf_slot] = level;
+  ++gf_group->pyramid_lvl_nodes[level];
+
+  // Frames displayed before the midpoint frame.
+  set_multi_layer_params(gf_group, l, mid, frame_ind, 1, level - 1);
+
+  // The overlay records the slot of its internal ARF so that bit allocation
+  // can find it.
+  const int overlay_slot = (*frame_ind)++;
+  gf_group->update_type[overlay_slot] = INTNL_OVERLAY_UPDATE;
+  gf_group->arf_src_offset[overlay_slot] = 0;
+  gf_group->arf_pos_in_gf[overlay_slot] = arf_slot;
+  gf_group->arf_update_idx[overlay_slot] = 1;
+  gf_group->pyramid_level[overlay_slot] = 0;
+
+  // Frames displayed after the midpoint frame.
+  set_multi_layer_params(gf_group, mid, r, frame_ind, arf_ind, level - 1);
+}
+
+// Maps a pyramid width (gf interval, expected in [4, 16]) to its height:
+// 4 for widths above 12, 3 for widths above 6, otherwise 2.
+static INLINE unsigned char get_pyramid_height(int pyramid_width) {
+  assert(pyramid_width <= 16 && pyramid_width >= 4 &&
+         "invalid gf interval for pyramid structure");
+
+  if (pyramid_width > 12) return 4;
+  if (pyramid_width > 6) return 3;
+  return 2;
+}
+
+// Builds the symmetric multi-layer GF group layout for |gf_interval| frames:
+// slot 0 is the leading frame, slot 1 is the main ARF (ALT0), and the
+// remaining slots are filled recursively by set_multi_layer_params().
+// Also sets gf_group->pyramid_height and resets the per-level node counts.
+// Returns the number of slots filled.
+static int construct_multi_layer_gf_structure(GF_GROUP *const gf_group,
+                                              const int gf_interval) {
+  int next_slot = 2;  // Slots 0 and 1 are filled directly below.
+
+  gf_group->pyramid_height = get_pyramid_height(gf_interval);
+  assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL);
+
+  av1_zero_array(gf_group->pyramid_lvl_nodes, MAX_PYRAMID_LVL);
+
+  // Slot 0: each GF group begins with a key or overlay frame.
+  gf_group->update_type[0] = OVERLAY_UPDATE;
+  gf_group->arf_src_offset[0] = 0;
+  gf_group->arf_pos_in_gf[0] = 0;
+  gf_group->arf_update_idx[0] = 0;
+  gf_group->pyramid_level[0] = 0;
+
+  // Slot 1: ALT0, sitting at the top level of the pyramid.
+  gf_group->update_type[1] = ARF_UPDATE;
+  gf_group->arf_src_offset[1] = gf_interval - 1;
+  gf_group->arf_pos_in_gf[1] = 0;
+  gf_group->arf_update_idx[1] = 0;
+  gf_group->pyramid_level[1] = gf_group->pyramid_height;
+
+  // Everything in between.
+  set_multi_layer_params(gf_group, 0, gf_interval, &next_slot, 0,
+                         gf_group->pyramid_height - 1);
+  return next_slot;
+}
+
+// Defines the GF group using the symmetric multi-layer structure. Requires
+// rc->baseline_gf_interval in [4, MAX_PYRAMID_SIZE]. On return the gf_group
+// slots carry their update type and rf level, cpi->num_extra_arfs holds the
+// number of internal ARFs, and the slot just past the group describes the
+// first frame of the next group.
+void define_customized_gf_group_structure(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval >= 4 &&
+         rc->baseline_gf_interval <= MAX_PYRAMID_SIZE);
+
+  // The constructor always fills at least slots 0 and 1.
+  const int num_slots =
+      construct_multi_layer_gf_structure(gf_group, rc->baseline_gf_interval);
+
+  cpi->num_extra_arfs = 0;
+
+  // Slot 0 gets its update type from the encoder state rather than from the
+  // constructed layout. For key frames the frame target rate is already set
+  // and it is also the golden frame, so no rf level is derived.
+  gf_group->bidir_pred_enabled[0] = 0;
+  gf_group->brf_src_offset[0] = 0;
+  if (is_key) {
+    gf_group->update_type[0] = KF_UPDATE;
+  } else {
+    gf_group->update_type[0] =
+        rc->source_alt_ref_active ? OVERLAY_UPDATE : GF_UPDATE;
+    gf_group->rf_level[0] = update_type_2_rf_level(gf_group->update_type[0]);
+  }
+
+  // Remaining slots keep the constructed update type.
+  for (int slot = 1; slot < num_slots; ++slot) {
+    // Set unused variables to default values.
+    gf_group->bidir_pred_enabled[slot] = 0;
+    gf_group->brf_src_offset[slot] = 0;
+
+    if (gf_group->update_type[slot] == INTNL_ARF_UPDATE) ++cpi->num_extra_arfs;
+
+    gf_group->rf_level[slot] =
+        update_type_2_rf_level(gf_group->update_type[slot]);
+  }
+
+  // NOTE: The slot at the end of the sequence + 1 must be configured as it
+  // will be the start frame for the next group. Otherwise the data would be
+  // undefined prior to the call to av1_rc_get_second_pass_params().
+  const int next_group_slot = num_slots;
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[next_group_slot] = OVERLAY_UPDATE;
+    gf_group->rf_level[next_group_slot] = INTER_NORMAL;
+  } else {
+    gf_group->update_type[next_group_slot] = GF_UPDATE;
+    gf_group->rf_level[next_group_slot] = GF_ARF_STD;
+  }
+
+  gf_group->bidir_pred_enabled[next_group_slot] = 0;
+  gf_group->brf_src_offset[next_group_slot] = 0;
+  gf_group->arf_update_idx[next_group_slot] = 0;
+  // This value is only used for INTNL_OVERLAY_UPDATE
+  gf_group->arf_pos_in_gf[next_group_slot] = 0;
+
+  // This parameter is useless?
+  gf_group->arf_ref_idx[next_group_slot] = 0;
+#ifdef CHCEK_GF_PARAMETER
+  check_frame_params(gf_group, rc->baseline_gf_interval, num_slots);
+#endif
+}
+
+// This is an example of how to define a GF structure manually. The function
+// will result in exactly the same GF group structure as
+// define_customized_gf_group_structure() when rc->baseline_gf_interval == 4.
+#if USE_MANUAL_GF4_STRUCT
+#define GF_INTERVAL_4 4
+// Hand-written GF group layout for a gf interval of 4, one row per
+// gf_group index. Columns, in order:
+//   update_type, arf_src_offset, arf_pos_in_gf, arf_update_idx
+static const unsigned char gf4_multi_layer_params[][GF_FRAME_PARAMS] = {
+  // gf_group->index == 0 (Frame 0). The update_type is only a default value:
+  // the frame can also be a KEY frame, and the proper value is assigned when
+  // the gf group structure is defined.
+  { OVERLAY_UPDATE, 0, 0, 0 },
+
+  // gf_group->index == 1 (Frame 4)
+  { ARF_UPDATE, GF_INTERVAL_4 - 1, 0, 0 },
+
+  // gf_group->index == 2 (Frame 2)
+  { INTNL_ARF_UPDATE, (GF_INTERVAL_4 >> 1) - 1, 0, 0 },
+
+  // gf_group->index == 3 (Frame 1)
+  { LAST_BIPRED_UPDATE, 0, 0, 0 },
+
+  // gf_group->index == 4 (Frame 2 - OVERLAY)
+  { INTNL_OVERLAY_UPDATE, 0, 2, 0 },
+
+  // gf_group->index == 5 (Frame 3)
+  { LF_UPDATE, 0, 0, 1 }
+};
+
+// Defines the GF group from the hand-written gf4_multi_layer_params table.
+// Requires rc->baseline_gf_interval == GF_INTERVAL_4. Table columns are, in
+// order: update_type, arf_src_offset, arf_pos_in_gf, arf_update_idx.
+// Returns the number of slots filled (the gf interval + 2); the slot after
+// those is set up as the first frame of the next group.
+static int define_gf_group_structure_4(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  assert(rc->baseline_gf_interval == GF_INTERVAL_4);
+
+  const int gf_update_frames = rc->baseline_gf_interval + 2;
+  int frame_index;
+
+  for (frame_index = 0; frame_index < gf_update_frames; ++frame_index) {
+    const unsigned char *const params = gf4_multi_layer_params[frame_index];
+
+    // Bi-predictive fields are not used by this structure.
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+
+    if (frame_index == 0) {
+      // For key frames the frame target rate is already set and it
+      // is also the golden frame.
+      if (key_frame) continue;
+
+      // The table entry is only a default; the real type of the first frame
+      // depends on whether an alt ref is currently active.
+      if (rc->source_alt_ref_active) {
+        gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      } else {
+        gf_group->update_type[frame_index] = GF_UPDATE;
+      }
+    } else {
+      gf_group->update_type[frame_index] = params[0];
+    }
+
+    // setup other parameters
+    gf_group->rf_level[frame_index] =
+        update_type_2_rf_level(gf_group->update_type[frame_index]);
+
+    // Every row is read with the same column indices. Frame 0 previously
+    // skipped one column, which misaligned these reads and ran past the
+    // four listed values.
+    gf_group->arf_src_offset[frame_index] = params[1];
+    gf_group->arf_pos_in_gf[frame_index] = params[2];
+
+    // The last column is arf_update_idx; it used to be stored into
+    // brf_src_offset, leaving arf_update_idx unset.
+    gf_group->arf_update_idx[frame_index] = params[3];
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  // will be the start frame for the next group. Otherwise prior to the
+  // call to av1_rc_get_second_pass_params() the data will be undefined.
+  gf_group->arf_update_idx[frame_index] = 0;
+  gf_group->arf_ref_idx[frame_index] = 0;
+
+  if (rc->source_alt_ref_pending) {
+    gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+    gf_group->rf_level[frame_index] = INTER_NORMAL;
+  } else {
+    gf_group->update_type[frame_index] = GF_UPDATE;
+    gf_group->rf_level[frame_index] = GF_ARF_STD;
+  }
+
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+
+  // This value is only used for INTNL_OVERLAY_UPDATE
+  gf_group->arf_pos_in_gf[frame_index] = 0;
+
+  return gf_update_frames;
+}
+#endif // USE_MANUAL_GF4_STRUCT
+#endif // USE_SYMM_MULTI_LAYER
+
+// Lays out the frame update structure of the current GF/ARF group in
+// cpi->twopass.gf_group. For each slot it sets update_type, rf_level and the
+// ARF/BRF bookkeeping fields, and it also configures one extra slot that
+// describes the first frame of the next group.
+// With USE_SYMM_MULTI_LAYER the work is handed to the multi-layer builders
+// when the interval and ARF conditions checked below hold;
+// cpi->new_bwdref_update_rule records which path was taken.
+static void define_gf_group_structure(AV1_COMP *cpi) {
+ RATE_CONTROL *const rc = &cpi->rc;
+
+#if USE_SYMM_MULTI_LAYER
+ const int valid_customized_gf_length =
+ rc->baseline_gf_interval >= 4 &&
+ rc->baseline_gf_interval <= MAX_PYRAMID_SIZE;
+ // used the new structure only if extra_arf is allowed
+ if (valid_customized_gf_length && rc->source_alt_ref_pending &&
+ cpi->extra_arf_allowed > 0) {
+#if USE_MANUAL_GF4_STRUCT
+ if (rc->baseline_gf_interval == 4)
+ define_gf_group_structure_4(cpi);
+ else
+#endif
+ define_customized_gf_group_structure(cpi);
+ cpi->new_bwdref_update_rule = 1;
+ return;
+ } else {
+ cpi->new_bwdref_update_rule = 0;
+ }
+#endif
+
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ int i;
+ int frame_index = 0;
+ const int key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ // The use of bi-predictive frames are only enabled when following 3
+ // conditions are met:
+ // (1) ALTREF is enabled;
+ // (2) The bi-predictive group interval is at least 2; and
+ // (3) The bi-predictive group interval is strictly smaller than the
+ // golden group interval.
+ // NOTE(review): the expression below only requires bipred_group_interval to
+ // be non-zero and compares with "<=" against
+ // (baseline_gf_interval - source_alt_ref_pending), and it additionally
+ // requires cpi->extra_arf_allowed. Confirm whether the wording of (2) and
+ // (3) or the code reflects the intent.
+ const int is_bipred_enabled =
+ cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
+ rc->bipred_group_interval &&
+ rc->bipred_group_interval <=
+ (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+ int bipred_group_end = 0;
+ int bipred_frame_index = 0;
+
+ // Source offset used for every extra ARF: the GF interval divided by the
+ // number of ARFs (main plus extra), minus one.
+ const unsigned char ext_arf_interval =
+ (unsigned char)(rc->baseline_gf_interval / (cpi->num_extra_arfs + 1) - 1);
+ int which_arf = cpi->num_extra_arfs;
+ // Only written inside the source_alt_ref_pending branch below, and only
+ // read where source_alt_ref_pending or num_extra_arfs is tested.
+ int subgroup_interval[MAX_EXT_ARFS + 1];
+ int is_sg_bipred_enabled = is_bipred_enabled;
+ int accumulative_subgroup_interval = 0;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame) {
+ if (rc->source_alt_ref_active) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ frame_index++;
+
+ bipred_frame_index++;
+
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ gf_group->arf_src_offset[frame_index] =
+ (unsigned char)(rc->baseline_gf_interval - 1);
+
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+
+ // Work out the ARFs' positions in this gf group
+ // NOTE(weitinglin): ALT_REFs' are indexed inversely, but coded in display
+ // order (except for the original ARF). In the example of three ALT_REF's,
+ // We index ALTREF's as: KEY ----- ALT2 ----- ALT1 ----- ALT0
+ // but code them in the following order:
+ // KEY-ALT0-ALT2 ----- OVERLAY2-ALT1 ----- OVERLAY1 ----- OVERLAY0
+ //
+ // arf_pos_for_ovrly[]: Position for OVERLAY
+ // arf_pos_in_gf[]: Position for ALTREF
+ cpi->arf_pos_for_ovrly[0] = frame_index + cpi->num_extra_arfs +
+ gf_group->arf_src_offset[frame_index] + 1;
+ for (i = 0; i < cpi->num_extra_arfs; ++i) {
+ cpi->arf_pos_for_ovrly[i + 1] =
+ frame_index + (cpi->num_extra_arfs - i) * (ext_arf_interval + 2);
+ subgroup_interval[i] = cpi->arf_pos_for_ovrly[i] -
+ cpi->arf_pos_for_ovrly[i + 1] - (i == 0 ? 1 : 2);
+ }
+ subgroup_interval[cpi->num_extra_arfs] =
+ cpi->arf_pos_for_ovrly[cpi->num_extra_arfs] - frame_index -
+ (cpi->num_extra_arfs == 0 ? 1 : 2);
+
+ ++frame_index;
+
+ // Insert an extra ARF
+ // === [frame_index == 2] ===
+ if (cpi->num_extra_arfs) {
+ gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ accumulative_subgroup_interval += subgroup_interval[cpi->num_extra_arfs];
+ }
+
+ // One iteration per remaining frame of the group: baseline_gf_interval
+ // reduced by the value of source_alt_ref_pending.
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = which_arf;
+
+ // If we are going to have ARFs, check whether we can have BWDREF in this
+ // subgroup, and further, whether we can have ARF subgroup which contains
+ // the BWDREF subgroup but contained within the GF group:
+ //
+ // GF group --> ARF subgroup --> BWDREF subgroup
+ if (rc->source_alt_ref_pending) {
+ is_sg_bipred_enabled =
+ is_bipred_enabled &&
+ (subgroup_interval[which_arf] > rc->bipred_group_interval);
+ }
+
+ // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+ // frame group interval is strictly smaller than that of the GOLDEN
+ // FRAME group interval.
+ // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+ if (is_sg_bipred_enabled && !bipred_group_end) {
+ const int cur_brf_src_offset = rc->bipred_group_interval - 1;
+
+ if (bipred_frame_index == 1) {
+ // --- BRF_UPDATE ---
+ gf_group->update_type[frame_index] = BRF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
+ } else if (bipred_frame_index == rc->bipred_group_interval) {
+ // --- LAST_BIPRED_UPDATE ---
+ gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->brf_src_offset[frame_index] = 0;
+
+ // Reset the bi-predictive frame index.
+ bipred_frame_index = 0;
+ } else {
+ // --- BIPRED_UPDATE ---
+ gf_group->update_type[frame_index] = BIPRED_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+ gf_group->bidir_pred_enabled[frame_index] = 1;
+
+ bipred_frame_index++;
+ // Check whether the next bi-predictive frame group would entirely be
+ // included within the current golden frame group.
+ // In addition, we need to avoid coding a BRF right before an ARF.
+ if (bipred_frame_index == 1 &&
+ (i + 2 + cur_brf_src_offset) >= accumulative_subgroup_interval) {
+ bipred_group_end = 1;
+ }
+ } else {
+ gf_group->update_type[frame_index] = LF_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+ }
+
+ ++frame_index;
+
+ // Check if we need to update the ARF.
+ if (is_sg_bipred_enabled && cpi->num_extra_arfs && which_arf > 0 &&
+ frame_index > cpi->arf_pos_for_ovrly[which_arf]) {
+ --which_arf;
+ accumulative_subgroup_interval += subgroup_interval[which_arf] + 1;
+
+ // Meet the new subgroup; Reset the bipred_group_end flag.
+ bipred_group_end = 0;
+ // Insert another extra ARF after the overlay frame
+ if (which_arf) {
+ gf_group->update_type[frame_index] = INTNL_ARF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_LOW;
+ gf_group->arf_src_offset[frame_index] = ext_arf_interval;
+
+ gf_group->arf_update_idx[frame_index] = which_arf;
+ gf_group->arf_ref_idx[frame_index] = 0;
+ ++frame_index;
+ }
+ }
+ }
+
+ // NOTE: We need to configure the frame at the end of the sequence + 1 that
+ // will be the start frame for the next group. Otherwise prior to the
+ // call to av1_rc_get_second_pass_params() the data will be undefined.
+ gf_group->arf_update_idx[frame_index] = 0;
+ gf_group->arf_ref_idx[frame_index] = 0;
+
+ if (rc->source_alt_ref_pending) {
+ gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+ gf_group->rf_level[frame_index] = INTER_NORMAL;
+
+ cpi->arf_pos_in_gf[0] = 1;
+ if (cpi->num_extra_arfs) {
+ // Overwrite the update_type for extra-ARF's corresponding internal
+ // OVERLAY's: Change from LF_UPDATE to INTNL_OVERLAY_UPDATE.
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ cpi->arf_pos_in_gf[i] =
+ (i == cpi->num_extra_arfs ? 2 : cpi->arf_pos_for_ovrly[i + 1] + 1);
+
+ gf_group->update_type[cpi->arf_pos_for_ovrly[i]] = INTNL_OVERLAY_UPDATE;
+ gf_group->rf_level[cpi->arf_pos_for_ovrly[i]] = INTER_NORMAL;
+ }
+ }
+ } else {
+ gf_group->update_type[frame_index] = GF_UPDATE;
+ gf_group->rf_level[frame_index] = GF_ARF_STD;
+ }
+
+ gf_group->bidir_pred_enabled[frame_index] = 0;
+ gf_group->brf_src_offset[frame_index] = 0;
+}
+
+#if USE_SYMM_MULTI_LAYER
+// Fraction of a leaf frame's target size that is taken away from it when
+// the multi-level boost is in use.
+#define LEAF_REDUCTION_FACTOR 0.75f
+// Budget split factors used to build the table below.
+#define LVL_3_BOOST_FACTOR 0.8f
+#define LVL_2_BOOST_FACTOR 0.3f
+
+// Budget share table for the multi-level boost. allocate_gf_group_bits()
+// reads it as lvl_budget_factor[pyramid_height - 2][distance of the frame's
+// level from the top level].
+static float_t lvl_budget_factor[MAX_PYRAMID_LVL - 1][MAX_PYRAMID_LVL - 1] = {
+  // First row: the whole budget goes to the first entry.
+  { 1, 0, 0 },
+
+  // Second row: leaking budget works better, so the row does not sum to 1.
+  { LVL_3_BOOST_FACTOR, 0, 0 },
+
+  // Third row: the remainder after the first share is split again by
+  // LVL_2_BOOST_FACTOR.
+  { LVL_3_BOOST_FACTOR,
+    (1 - LVL_3_BOOST_FACTOR) * LVL_2_BOOST_FACTOR,
+    (1 - LVL_3_BOOST_FACTOR) * (1 - LVL_2_BOOST_FACTOR) }
+};
+#endif // USE_SYMM_MULTI_LAYER
+// Builds the GF group structure via define_gf_group_structure() and then
+// fills gf_group->bit_allocation[] for the frames of the group.
+//   gf_group_bits: bit budget for the whole group.
+//   group_error:   denominator used to turn each frame's modified error into
+//                  its fraction of the budget (fraction is 0 when this is
+//                  not positive).
+//   gf_arf_bits:   bits stored for the ARF slot when an ARF is pending, and
+//                  for slot 0 when the frame is neither a key frame nor an
+//                  active-ARF overlay. They are deducted from the budget
+//                  before it is shared among the other frames.
+// First pass stats are consumed through input_stats() as the group is
+// walked; the function stops early when they run out.
+static void allocate_gf_group_bits(AV1_COMP *cpi, int64_t gf_group_bits,
+ double group_error, int gf_arf_bits) {
+ RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ FIRSTPASS_STATS frame_stats;
+ int i;
+ int frame_index = 0;
+ int target_frame_size;
+ int key_frame;
+ const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
+ int64_t total_group_bits = gf_group_bits;
+ double modified_err = 0.0;
+ double err_fraction;
+ // NOTE(review): ext_arf_boost is zeroed below but not read anywhere else in
+ // this function.
+ int ext_arf_boost[MAX_EXT_ARFS];
+
+ define_gf_group_structure(cpi);
+
+ av1_zero_array(ext_arf_boost, MAX_EXT_ARFS);
+
+ key_frame = cpi->common.frame_type == KEY_FRAME;
+
+ // For key frames the frame target rate is already set and it
+ // is also the golden frame.
+ // === [frame_index == 0] ===
+ if (!key_frame) {
+ if (rc->source_alt_ref_active)
+ gf_group->bit_allocation[frame_index] = 0;
+ else
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ // Step over the golden frame / overlay frame
+ if (EOF == input_stats(twopass, &frame_stats)) return;
+ }
+
+ // Deduct the boost bits for arf (or gf if it is not a key frame)
+ // from the group total.
+ if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+ frame_index++;
+
+ // Store the bits to spend on the ARF if there is one.
+ // === [frame_index == 1] ===
+ if (rc->source_alt_ref_pending) {
+ gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+ ++frame_index;
+
+ // Skip all the extra-ARF's right after ARF at the starting segment of
+ // the current GF group.
+ if (cpi->num_extra_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++frame_index;
+ }
+ }
+
+ // Allocate bits to the other frames in the group.
+ for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
+ if (EOF == input_stats(twopass, &frame_stats)) break;
+
+ modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
+
+ if (group_error > 0)
+ err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
+ else
+ err_fraction = 0.0;
+
+ // Baseline target: this frame's error share of the remaining budget,
+ // clamped to [0, min(max_bits, total_group_bits)].
+ target_frame_size = (int)((double)total_group_bits * err_fraction);
+
+ target_frame_size =
+ clamp(target_frame_size, 0, AOMMIN(max_bits, (int)total_group_bits));
+
+ if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+ // Boost up the allocated bits on BWDREF_FRAME
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size + (target_frame_size >> 2);
+ } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+ // Press down the allocated bits on LAST_BIPRED_UPDATE frames
+ gf_group->bit_allocation[frame_index] =
+ target_frame_size - (target_frame_size >> 1);
+ } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+ // TODO(zoeliu): To investigate whether the allocated bits on
+ // BIPRED_UPDATE frames need to be further adjusted.
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if USE_SYMM_MULTI_LAYER
+ } else if (cpi->new_bwdref_update_rule &&
+ gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE) {
+ assert(gf_group->pyramid_height <= MAX_PYRAMID_LVL &&
+ gf_group->pyramid_height >= 0 &&
+ "non-valid height for a pyramid structure");
+
+ // The overlay itself gets no bits; its target size is credited to the
+ // internal ARF slot recorded in arf_pos_in_gf.
+ int arf_pos = gf_group->arf_pos_in_gf[frame_index];
+ gf_group->bit_allocation[frame_index] = 0;
+
+ gf_group->bit_allocation[arf_pos] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ // Extra boost for the internal ARF: the budget is LEAF_REDUCTION_FACTOR
+ // times the number of level-0 nodes, scaled by the table entry for this
+ // pyramid height and level, and divided by the node count of the level.
+ const int pyr_h = gf_group->pyramid_height - 2;
+ const int this_lvl = gf_group->pyramid_level[arf_pos];
+ const int dist2top = gf_group->pyramid_height - 1 - this_lvl;
+
+ const float_t budget =
+ LEAF_REDUCTION_FACTOR * gf_group->pyramid_lvl_nodes[0];
+ const float_t lvl_boost = budget * lvl_budget_factor[pyr_h][dist2top] /
+ gf_group->pyramid_lvl_nodes[this_lvl];
+
+ gf_group->bit_allocation[arf_pos] += (int)(target_frame_size * lvl_boost);
+#endif // MULTI_LVL_BOOST_VBR_CQ
+#endif // USE_SYMM_MULTI_LAYER
+ } else {
+ assert(gf_group->update_type[frame_index] == LF_UPDATE ||
+ gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
+ gf_group->bit_allocation[frame_index] = target_frame_size;
+#if MULTI_LVL_BOOST_VBR_CQ
+ // With the multi-layer rule, these frames give up LEAF_REDUCTION_FACTOR
+ // of their target size.
+ if (cpi->new_bwdref_update_rule) {
+ gf_group->bit_allocation[frame_index] -=
+ (int)(target_frame_size * LEAF_REDUCTION_FACTOR);
+ }
+#endif // MULTI_LVL_BOOST_VBR_CQ
+ }
+
+ ++frame_index;
+
+ // Skip all the extra-ARF's.
+ if (cpi->num_extra_arfs) {
+ while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+ ++frame_index;
+ }
+ }
+
+#if USE_SYMM_MULTI_LAYER
+ if (cpi->new_bwdref_update_rule == 0 && rc->source_alt_ref_pending) {
+#else
+ if (rc->source_alt_ref_pending) {
+#endif
+ if (cpi->num_extra_arfs) {
+ // NOTE: For bit allocation, move the allocated bits associated with
+ // INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
+ // i > 0 for extra-ARF's and i == 0 for ARF:
+ // arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
+ // arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
+ for (i = cpi->num_extra_arfs; i > 0; --i) {
+ assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
+ INTNL_OVERLAY_UPDATE);
+
+ // Encoder's choice:
+ // Set show_existing_frame == 1 for all extra-ARF's, and hence
+ // allocate zero bit for both all internal OVERLAY frames.
+ gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
+ gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
+ gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
+ }
+ }
+ }
+}
+
+// Analyse and define a gf/arf group.
+// Scans the first-pass stats forward from |this_frame| to choose the group
+// length (rc->baseline_gf_interval), decides whether an alt-ref is used
+// (rc->source_alt_ref_pending) and its boost (rc->gfu_boost), sets the number
+// of extra ARFs (cpi->num_extra_arfs) and finally hands the group's bit budget
+// to allocate_gf_group_bits().
+// |*this_frame| is overwritten with later frames' stats during the scan; the
+// stats read position is reset to its value on entry before returning.
+static void define_gf_group(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ FIRSTPASS_STATS next_frame;
+ // Stats read position on entry; restored after each forward scan.
+ const FIRSTPASS_STATS *const start_pos = twopass->stats_in;
+ int i;
+
+ double boost_score = 0.0;
+#if !CONFIG_FIX_GF_LENGTH
+ double old_boost_score = 0.0;
+ double mv_ratio_accumulator_thresh;
+ int active_max_gf_interval;
+ int active_min_gf_interval;
+#endif
+ double gf_group_err = 0.0;
+#if GROUP_ADAPTIVE_MAXQ
+ double gf_group_raw_error = 0.0;
+#endif
+ double gf_group_skip_pct = 0.0;
+ double gf_group_inactive_zone_rows = 0.0;
+ double gf_first_frame_err = 0.0;
+ double mod_frame_err = 0.0;
+
+ double mv_ratio_accumulator = 0.0;
+ double decay_accumulator = 1.0;
+ double zero_motion_accumulator = 1.0;
+
+ double loop_decay_rate = 1.00;
+ double last_loop_decay_rate = 1.00;
+
+ double this_frame_mv_in_out = 0.0;
+ double mv_in_out_accumulator = 0.0;
+ double abs_mv_in_out_accumulator = 0.0;
+
+ unsigned int allow_alt_ref = is_altref_enabled(cpi);
+
+ int f_boost = 0;
+ int b_boost = 0;
+ int flash_detected;
+ int64_t gf_group_bits;
+ double gf_group_error_left;
+ int gf_arf_bits;
+ const int is_key_frame = frame_is_intra_only(cm);
+ const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
+
+ // Extra ARFs are allowed by default; cleared below for "still" groups.
+ cpi->extra_arf_allowed = 1;
+
+ // Reset the GF group data structures unless this is a key
+ // frame in which case it will already have been done.
+ if (is_key_frame == 0) {
+ av1_zero(twopass->gf_group);
+ }
+
+ aom_clear_system_state();
+ av1_zero(next_frame);
+
+ // Load stats for the current frame.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Note the error of the frame at the start of the group. This will be
+ // the GF frame error if we code a normal gf.
+ gf_first_frame_err = mod_frame_err;
+
+ // If this is a key frame or the overlay from a previous arf then
+ // the error score / cost of this frame has already been accounted for.
+ // The accumulators are pre-decremented here because the scan loop below
+ // adds this frame's contribution on its first iteration.
+ if (arf_active_or_kf) {
+ gf_group_err -= gf_first_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error -= this_frame->coded_error;
+#endif
+ gf_group_skip_pct -= this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows -= this_frame->inactive_zone_rows;
+ }
+#if !CONFIG_FIX_GF_LENGTH
+ // Motion breakout threshold for loop below depends on image size.
+ mv_ratio_accumulator_thresh =
+ (cpi->initial_height + cpi->initial_width) / 4.0;
+ // Set a maximum and minimum interval for the GF group.
+ // If the image appears almost completely static we can extend beyond this.
+ {
+ int int_max_q = (int)(av1_convert_qindex_to_q(
+ twopass->active_worst_quality, cpi->common.seq_params.bit_depth));
+ int int_lbq = (int)(av1_convert_qindex_to_q(
+ rc->last_boosted_qindex, cpi->common.seq_params.bit_depth));
+
+ active_min_gf_interval = rc->min_gf_interval + AOMMIN(2, int_max_q / 200);
+ if (active_min_gf_interval > rc->max_gf_interval)
+ active_min_gf_interval = rc->max_gf_interval;
+
+ // The value chosen depends on the active Q range. At low Q we have
+ // bits to spare and are better with a smaller interval and smaller boost.
+ // At high Q when there are few bits to spare we are better with a longer
+ // interval to spread the cost of the GF.
+ active_max_gf_interval = 12 + AOMMIN(4, (int_lbq / 6));
+
+ // We have: active_min_gf_interval <= rc->max_gf_interval
+ if (active_max_gf_interval < active_min_gf_interval)
+ active_max_gf_interval = active_min_gf_interval;
+ else if (active_max_gf_interval > rc->max_gf_interval)
+ active_max_gf_interval = rc->max_gf_interval;
+ }
+#endif // !CONFIG_FIX_GF_LENGTH
+ // Sums over the scanned frames; turned into averages after the loop and
+ // used to detect a "still" group.
+ double avg_sr_coded_error = 0;
+ double avg_raw_err_stdev = 0;
+ int non_zero_stdev_count = 0;
+
+ // Forward scan: |i| counts the frames examined so far.
+ i = 0;
+ while (i < rc->static_scene_max_gf_interval && i < rc->frames_to_key) {
+ ++i;
+
+ // Accumulate error score of frames in this gf group.
+ mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err += mod_frame_err;
+#if GROUP_ADAPTIVE_MAXQ
+ gf_group_raw_error += this_frame->coded_error;
+#endif
+ gf_group_skip_pct += this_frame->intra_skip_pct;
+ gf_group_inactive_zone_rows += this_frame->inactive_zone_rows;
+
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Test for the case where there is a brief flash but the prediction
+ // quality back to an earlier frame is then restored.
+ flash_detected = detect_flash(twopass, 0);
+
+ // Update the motion related elements to the boost calculation.
+ accumulate_frame_motion_stats(
+ &next_frame, &this_frame_mv_in_out, &mv_in_out_accumulator,
+ &abs_mv_in_out_accumulator, &mv_ratio_accumulator);
+ // sum up the metric values of current gf group
+ avg_sr_coded_error += next_frame.sr_coded_error;
+ if (fabs(next_frame.raw_error_stdev) > 0.000001) {
+ non_zero_stdev_count++;
+ avg_raw_err_stdev += next_frame.raw_error_stdev;
+ }
+
+ // Accumulate the effect of prediction quality decay.
+ if (!flash_detected) {
+ last_loop_decay_rate = loop_decay_rate;
+ loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
+
+ decay_accumulator = decay_accumulator * loop_decay_rate;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+
+ // Break clause to detect very still sections after motion. For example,
+ // a static image after a fade or other transition.
+ if (detect_transition_to_still(cpi, i, 5, loop_decay_rate,
+ last_loop_decay_rate)) {
+ allow_alt_ref = 0;
+ break;
+ }
+ }
+
+ // Calculate a boost number for this frame.
+ boost_score +=
+ decay_accumulator *
+ calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST);
+#if CONFIG_FIX_GF_LENGTH
+ if (i == (FIXED_GF_LENGTH + 1)) break;
+#else
+ // Skip breaking condition for CONFIG_FIX_GF_LENGTH
+ // Break out conditions.
+ if (
+ // Break at active_max_gf_interval unless almost totally static.
+ (i >= (active_max_gf_interval + arf_active_or_kf) &&
+ zero_motion_accumulator < 0.995) ||
+ (
+ // Don't break out with a very short interval.
+ (i >= active_min_gf_interval + arf_active_or_kf) &&
+ (!flash_detected) &&
+ ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
+ (abs_mv_in_out_accumulator > 3.0) ||
+ (mv_in_out_accumulator < -2.0) ||
+ ((boost_score - old_boost_score) < BOOST_BREAKOUT)))) {
+ // If GF group interval is < 12, we force it to be 8. Otherwise,
+ // if it is >= 12, we keep it as is.
+ // NOTE: 'i' is 1 more than the GF group interval candidate that is being
+ // checked.
+ if (i == (8 + 1) || i >= (12 + 1)) {
+ boost_score = old_boost_score;
+ break;
+ }
+ }
+ old_boost_score = boost_score;
+#endif // CONFIG_FIX_GF_LENGTH
+ // The frame just read becomes the current frame for the next iteration.
+ *this_frame = next_frame;
+ }
+ twopass->gf_zeromotion_pct = (int)(zero_motion_accumulator * 1000.0);
+
+ // Was the group length constrained by the requirement for a new KF?
+ rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
+
+ const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+ : cpi->common.MBs;
+ assert(num_mbs > 0);
+ // Convert the sums gathered in the scan into averages (guarding against a
+ // zero divisor).
+ if (i) avg_sr_coded_error /= i;
+
+ if (non_zero_stdev_count) avg_raw_err_stdev /= non_zero_stdev_count;
+
+ // Disable extra altrefs and backward refs for "still" gf group:
+ // zero_motion_accumulator: minimum percentage of (0,0) motion;
+ // avg_sr_coded_error: average of the SSE per pixel of each frame;
+ // avg_raw_err_stdev: average of the standard deviation of (0,0)
+ // motion error per block of each frame.
+ const int disable_bwd_extarf =
+ (zero_motion_accumulator > MIN_ZERO_MOTION &&
+ avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+ avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+
+ if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
+// Tuning constants for the REDUCE_LAST_GF_LENGTH / REDUCE_LAST_ALT_BOOST logic
+// below: the two thresholds are compared against the scanned length |i| and
+// the frames remaining to the next key frame; REDUCE_GF_LENGTH_BY is the
+// number of frames the group is shortened by.
+#define REDUCE_GF_LENGTH_THRESH 4
+#define REDUCE_GF_LENGTH_TO_KEY_THRESH 9
+#define REDUCE_GF_LENGTH_BY 1
+ // Offset passed to calc_arf_boost(); becomes negative when the group is
+ // shortened below.
+ int alt_offset = 0;
+#if REDUCE_LAST_GF_LENGTH
+ // TODO(weitinglin): The length reduction strategy was tuned using AOM_Q
+ // mode, and hurts the performance of VBR mode. We need to investigate how
+ // to adjust GF length for other modes.
+
+ int allow_gf_length_reduction =
+ cpi->oxcf.rc_mode == AOM_Q || cpi->extra_arf_allowed == 0;
+
+ // We are going to have an alt ref, but we don't do the adjustment for
+ // lossless mode
+ if (allow_alt_ref && allow_gf_length_reduction &&
+ (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval) &&
+ !is_lossless_requested(&cpi->oxcf)) {
+ // adjust length of this gf group if one of the following condition met
+ // 1: only one overlay frame left and this gf is too long
+ // 2: next gf group is too short to have arf compared to the current gf
+
+ // maximum length of next gf group
+ const int next_gf_len = rc->frames_to_key - i;
+ const int single_overlay_left =
+ next_gf_len == 0 && i > REDUCE_GF_LENGTH_THRESH;
+ // the next gf is probably going to have a ARF but it will be shorter than
+ // this gf
+ const int unbalanced_gf =
+ i > REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 < REDUCE_GF_LENGTH_TO_KEY_THRESH &&
+ next_gf_len + 1 >= rc->min_gf_interval;
+
+ if (single_overlay_left || unbalanced_gf) {
+ // Note: Tried roll_back = DIVIDE_AND_ROUND(i, 8), but it does not work
+ // better in the current setting
+ const int roll_back = REDUCE_GF_LENGTH_BY;
+ alt_offset = -roll_back;
+ i -= roll_back;
+ }
+ }
+#endif
+
+ // Should we use the alternate reference frame.
+ if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
+ (i >= rc->min_gf_interval)) {
+ // Calculate the boost for alt ref.
+ rc->gfu_boost =
+ calc_arf_boost(cpi, alt_offset, (i - 1), (i - 1), &f_boost, &b_boost);
+ rc->source_alt_ref_pending = 1;
+
+ // do not replace ARFs with overlay frames, and keep it as GOLDEN_REF
+ cpi->preserve_arf_as_gld = 1;
+ } else {
+ rc->gfu_boost = AOMMAX((int)boost_score, MIN_ARF_GF_BOOST);
+ rc->source_alt_ref_pending = 0;
+ cpi->preserve_arf_as_gld = 0;
+ }
+
+ // Set the interval until the next gf.
+ // If forward keyframes are enabled, ensure the final gf group obeys the
+ // MIN_FWD_KF_INTERVAL.
+ if (cpi->oxcf.fwd_kf_enabled &&
+ ((twopass->stats_in - i + rc->frames_to_key) < twopass->stats_in_end)) {
+ if (i == rc->frames_to_key) {
+ rc->baseline_gf_interval = i;
+ // if the last gf group will be smaller than MIN_FWD_KF_INTERVAL
+ } else if ((rc->frames_to_key - i <
+ AOMMAX(MIN_FWD_KF_INTERVAL, rc->min_gf_interval)) &&
+ (rc->frames_to_key != i)) {
+ // if possible, merge the last two gf groups
+ if (rc->frames_to_key <= MAX_PYRAMID_SIZE) {
+ rc->baseline_gf_interval = rc->frames_to_key;
+ // if merging the last two gf groups creates a group that is too long,
+ // split them and force the last gf group to be the MIN_FWD_KF_INTERVAL
+ } else {
+ rc->baseline_gf_interval = rc->frames_to_key - MIN_FWD_KF_INTERVAL;
+ }
+ } else {
+ rc->baseline_gf_interval =
+ i - (is_key_frame || rc->source_alt_ref_pending);
+ }
+ } else {
+ rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+ }
+
+#if REDUCE_LAST_ALT_BOOST
+#define LAST_ALR_BOOST_FACTOR 0.2f
+ rc->arf_boost_factor = 1.0;
+ if (rc->source_alt_ref_pending && !is_lossless_requested(&cpi->oxcf)) {
+ // Reduce the boost of altref in the last gf group
+ if (rc->frames_to_key - i == REDUCE_GF_LENGTH_BY ||
+ rc->frames_to_key - i == 0) {
+ rc->arf_boost_factor = LAST_ALR_BOOST_FACTOR;
+ }
+ }
+#endif
+
+ if (!cpi->extra_arf_allowed) {
+ cpi->num_extra_arfs = 0;
+ } else {
+#if USE_SYMM_MULTI_LAYER
+ if (rc->baseline_gf_interval == 4 && rc->source_alt_ref_pending)
+ cpi->num_extra_arfs = 1;
+ else
+ cpi->num_extra_arfs = get_number_of_extra_arfs(
+ rc->baseline_gf_interval, rc->source_alt_ref_pending);
+#else
+ // Compute how many extra alt_refs we can have
+ cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+ rc->source_alt_ref_pending);
+#endif // USE_SYMM_MULTI_LAYER
+ }
+
+#if !USE_SYMM_MULTI_LAYER
+ // Currently at maximum two extra ARFs' are allowed
+ assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+#endif
+
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+ rc->bipred_group_interval = BFG_INTERVAL;
+ // The minimum bi-predictive frame group interval is 2.
+ if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate the bits to be allocated to the gf/arf group as a whole
+ gf_group_bits = calculate_total_gf_group_bits(cpi, gf_group_err);
+
+#if GROUP_ADAPTIVE_MAXQ
+ // Calculate an estimate of the maxq needed for the group.
+ // We are more aggressive about correcting for sections
+ // where there could be significant overshoot than for easier
+ // sections where we do not wish to risk creating an overshoot
+ // of the allocated bit budget.
+ if ((cpi->oxcf.rc_mode != AOM_Q) && (rc->baseline_gf_interval > 1)) {
+ const int vbr_group_bits_per_frame =
+ (int)(gf_group_bits / rc->baseline_gf_interval);
+ const double group_av_err = gf_group_raw_error / rc->baseline_gf_interval;
+ const double group_av_skip_pct =
+ gf_group_skip_pct / rc->baseline_gf_interval;
+ const double group_av_inactive_zone =
+ ((gf_group_inactive_zone_rows * 2) /
+ (rc->baseline_gf_interval * (double)cm->mb_rows));
+
+ int tmp_q;
+ // rc factor is a weight factor that corrects for local rate control drift.
+ double rc_factor = 1.0;
+ if (rc->rate_error_estimate > 0) {
+ rc_factor = AOMMAX(RC_FACTOR_MIN,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ } else {
+ rc_factor = AOMMIN(RC_FACTOR_MAX,
+ (double)(100 - rc->rate_error_estimate) / 100.0);
+ }
+ tmp_q = get_twopass_worst_quality(
+ cpi, group_av_err, (group_av_skip_pct + group_av_inactive_zone),
+ vbr_group_bits_per_frame, twopass->kfgroup_inter_fraction * rc_factor);
+ // Never let the worst quality drop below half of its previous value.
+ twopass->active_worst_quality =
+ AOMMAX(tmp_q, twopass->active_worst_quality >> 1);
+ }
+#endif
+
+ // Calculate the extra bits to be used for boosted frame(s)
+ gf_arf_bits = calculate_boost_bits(rc->baseline_gf_interval, rc->gfu_boost,
+ gf_group_bits);
+
+ // Adjust KF group bits and error remaining.
+ twopass->kf_group_error_left -= (int64_t)gf_group_err;
+
+ // If this is an arf update we want to remove the score for the overlay
+ // frame at the end which will usually be very cheap to code.
+ // The overlay frame has already, in effect, been coded so we want to spread
+ // the remaining bits among the other frames.
+ // For normal GFs remove the score for the GF itself unless this is
+ // also a key frame in which case it has already been accounted for.
+ // (mod_frame_err still holds the error of the last frame accumulated by the
+ // scan loop above.)
+ if (rc->source_alt_ref_pending) {
+ gf_group_error_left = gf_group_err - mod_frame_err;
+ } else if (is_key_frame == 0) {
+ gf_group_error_left = gf_group_err - gf_first_frame_err;
+ } else {
+ gf_group_error_left = gf_group_err;
+ }
+
+ // Allocate bits to each of the frames in the GF group.
+ allocate_gf_group_bits(cpi, gf_group_bits, gf_group_error_left, gf_arf_bits);
+
+ // Reset the file position.
+ reset_fpf_position(twopass, start_pos);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_pos, twopass->stats_in_end, rc->baseline_gf_interval);
+ }
+}
+
+// Threshold for use of the lagging second reference frame. High second ref
+// usage may point to a transient event like a flash or occlusion rather than
+// a real scene cut.
+#define SECOND_REF_USEAGE_THRESH 0.1
+// Minimum % intra coding observed in first pass (1.0 = 100%)
+#define MIN_INTRA_LEVEL 0.25
+// Minimum ratio between the % of intra coding and inter coding in the first
+// pass after discounting neutral blocks (discounting neutral blocks in this
+// way helps catch scene cuts in clips with very flat areas or letter box
+// format clips with image padding).
+#define INTRA_VS_INTER_THRESH 2.0
+// Hard threshold where the first pass chooses intra for almost all blocks.
+// In such a case even if the frame is not a scene cut coding a key frame
+// may be a good option.
+#define VERY_LOW_INTER_THRESH 0.05
+// Maximum threshold for the relative ratio of intra error score vs best
+// inter error score.
+#define KF_II_ERR_THRESHOLD 2.5
+// In real scene cuts there is almost always a sharp change in the intra
+// or inter error score.
+#define ERR_CHANGE_THRESHOLD 0.4
+// For real scene cuts we expect an improvement in the intra inter error
+// ratio in the next frame.
+#define II_IMPROVEMENT_THRESHOLD 3.5
+// Upper clamp applied to the per-frame intra/inter error ratio accumulated
+// in test_candidate_kf().
+#define KF_II_MAX 128.0
+
+// Decides whether |this_frame| is a viable key frame candidate.
+// First applies the primary scene-cut criteria (see the threshold macros
+// above), then looks ahead up to 16 frames to check how well the candidate
+// predicts what follows. Returns 1 when the candidate is accepted. When the
+// look-ahead rejects it, the stats read position is reset to where it was on
+// entry and 0 is returned; when the primary criteria fail, 0 is returned
+// without touching the read position.
+static int test_candidate_kf(TWO_PASS *twopass,
+                             const FIRSTPASS_STATS *last_frame,
+                             const FIRSTPASS_STATS *this_frame,
+                             const FIRSTPASS_STATS *next_frame) {
+  const double pcnt_intra = 1.0 - this_frame->pcnt_inter;
+  const double modified_pcnt_inter =
+      this_frame->pcnt_inter - this_frame->pcnt_neutral;
+
+  // Primary criteria, part 1: neither this frame nor the next may lean on the
+  // second reference frame.
+  if (!(this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH)) return 0;
+  if (!(next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH)) return 0;
+
+  // Primary criteria, part 2: either almost everything was coded intra, or
+  // all of the softer scene-cut indicators agree.
+  if (!(this_frame->pcnt_inter < VERY_LOW_INTER_THRESH)) {
+    if (!(pcnt_intra > MIN_INTRA_LEVEL)) return 0;
+    if (!(pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)))
+      return 0;
+    if (!((this_frame->intra_error /
+           DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) <
+          KF_II_ERR_THRESHOLD))
+      return 0;
+
+    // At least one sharp change in error score (or a clear improvement in
+    // the next frame's intra/inter ratio) is required.
+    if (!((fabs(last_frame->coded_error - this_frame->coded_error) /
+               DOUBLE_DIVIDE_CHECK(this_frame->coded_error) >
+           ERR_CHANGE_THRESHOLD) ||
+          (fabs(last_frame->intra_error - this_frame->intra_error) /
+               DOUBLE_DIVIDE_CHECK(this_frame->intra_error) >
+           ERR_CHANGE_THRESHOLD) ||
+          ((next_frame->intra_error /
+            DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) >
+           II_IMPROVEMENT_THRESHOLD)))
+      return 0;
+  }
+
+  // Look-ahead: examine how well the key frame predicts subsequent frames.
+  const FIRSTPASS_STATS *rewind_pos = twopass->stats_in;
+  FIRSTPASS_STATS probe = *next_frame;
+  double running_boost = 0.0;
+  double prev_boost = 0.0;
+  double decay = 1.0;
+  int frames_ok;
+
+  for (frames_ok = 0; frames_ok < 16; ++frames_ok) {
+    double ii_ratio = (BOOST_FACTOR * probe.intra_error /
+                       DOUBLE_DIVIDE_CHECK(probe.coded_error));
+
+    if (ii_ratio > KF_II_MAX) ii_ratio = KF_II_MAX;
+
+    // Cumulative effect of decay in prediction quality.
+    decay *= (probe.pcnt_inter > 0.85) ? probe.pcnt_inter
+                                       : (0.85 + probe.pcnt_inter) / 2.0;
+
+    // Keep a running total.
+    running_boost += (decay * ii_ratio);
+
+    // Breakout clauses, tested one at a time.
+    if (probe.pcnt_inter < 0.05) break;
+    if (ii_ratio < 1.5) break;
+    if (((probe.pcnt_inter - probe.pcnt_neutral) < 0.20) && (ii_ratio < 3.0))
+      break;
+    if ((running_boost - prev_boost) < 3.0) break;
+    if (probe.intra_error < 200) break;
+
+    prev_boost = running_boost;
+
+    // Get the next frame details
+    if (EOF == input_stats(twopass, &probe)) break;
+  }
+
+  // Accept only if prediction stayed tolerable for more than 3 frames.
+  if (running_boost > 30.0 && (frames_ok > 3)) return 1;
+
+  // Rejected: put the stats read position back and move on.
+  reset_fpf_position(twopass, rewind_pos);
+  return 0;
+}
+
+// Number of most recent per-frame decay rates that find_next_key_frame()
+// keeps in a circular buffer and multiplies together.
+#define FRAMES_TO_CHECK_DECAY 8
+
+// Locates the next key frame and sets up the key-frame group.
+// Starting from |this_frame|, scans the first-pass stats to set
+// rc->frames_to_key; the scan stops at a scene-cut candidate, a transition to
+// a still section, the key_freq limits, or the end of the stats. It then
+// computes the bit budget of the group (twopass->kf_group_bits), the key
+// frame boost (rc->kf_boost) and the bits given to the key frame itself
+// (gf_group->bit_allocation[0]).
+// Side effects visible here: marks the current frame as KEY_FRAME, zeroes the
+// GF group, and overwrites |*this_frame| while scanning. The stats read
+// position is reset to its value on entry before returning.
+static void find_next_key_frame(AV1_COMP *cpi, FIRSTPASS_STATS *this_frame) {
+ int i, j;
+ RATE_CONTROL *const rc = &cpi->rc;
+ TWO_PASS *const twopass = &cpi->twopass;
+ GF_GROUP *const gf_group = &twopass->gf_group;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ // Copy of the first frame's stats, used if the group has to be rescanned.
+ const FIRSTPASS_STATS first_frame = *this_frame;
+ // Stats read position on entry; restored after each forward scan.
+ const FIRSTPASS_STATS *const start_position = twopass->stats_in;
+ FIRSTPASS_STATS next_frame;
+ FIRSTPASS_STATS last_frame;
+ int kf_bits = 0;
+ int loop_decay_counter = 0;
+ double decay_accumulator = 1.0;
+ double av_decay_accumulator = 0.0;
+ double zero_motion_accumulator = 1.0;
+ double boost_score = 0.0;
+ double kf_mod_err = 0.0;
+ double kf_group_err = 0.0;
+ double recent_loop_decay[FRAMES_TO_CHECK_DECAY];
+
+ av1_zero(next_frame);
+
+ cpi->common.frame_type = KEY_FRAME;
+
+ // Reset the GF group data structures.
+ av1_zero(*gf_group);
+
+ // Is this a forced key frame by interval.
+ rc->this_key_frame_forced = rc->next_key_frame_forced;
+
+ // Clear the alt ref active flag and last group multi arf flags as they
+ // can never be set for a key frame.
+ rc->source_alt_ref_active = 0;
+
+ // KF is always a GF so clear frames till next gf counter.
+ rc->frames_till_gf_update_due = 0;
+
+ rc->frames_to_key = 1;
+
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0; // Group modified error score.
+
+ kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Initialize the decay rates for the recent frames to check
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
+
+ // Find the next keyframe.
+ i = 0;
+ while (twopass->stats_in < twopass->stats_in_end &&
+ rc->frames_to_key < cpi->oxcf.key_freq) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+
+ // Load the next frame's stats.
+ // The loop condition guarantees that stats remain, so the return value is
+ // not checked here.
+ last_frame = *this_frame;
+ input_stats(twopass, this_frame);
+
+ // Provided that we are not at the end of the file...
+ if (cpi->oxcf.auto_key && twopass->stats_in < twopass->stats_in_end) {
+ double loop_decay_rate;
+
+ // Check for a scene cut.
+ if (test_candidate_kf(twopass, &last_frame, this_frame,
+ twopass->stats_in))
+ break;
+
+ // How fast is the prediction quality decaying?
+ loop_decay_rate = get_prediction_decay_rate(cpi, twopass->stats_in);
+
+ // We want to know something about the recent past... rather than
+ // as used elsewhere where we are concerned with decay in prediction
+ // quality since the last GF or KF.
+ // recent_loop_decay is used as a circular buffer indexed by |i|.
+ recent_loop_decay[i % FRAMES_TO_CHECK_DECAY] = loop_decay_rate;
+ decay_accumulator = 1.0;
+ for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j)
+ decay_accumulator *= recent_loop_decay[j];
+
+ // Special check for transition or high motion followed by a
+ // static scene.
+ if (detect_transition_to_still(cpi, i, cpi->oxcf.key_freq - i,
+ loop_decay_rate, decay_accumulator))
+ break;
+
+ // Step on to the next frame.
+ ++rc->frames_to_key;
+
+ // If we don't have a real key frame within the next two
+ // key_freq intervals then break out of the loop.
+ if (rc->frames_to_key >= 2 * cpi->oxcf.key_freq) break;
+ } else {
+ ++rc->frames_to_key;
+ }
+ ++i;
+ }
+
+ // If there is a max kf interval set by the user we must obey it.
+ // We already breakout of the loop above at 2x max.
+ // This code centers the extra kf if the actual natural interval
+ // is between 1x and 2x.
+ if (cpi->oxcf.auto_key && rc->frames_to_key > cpi->oxcf.key_freq) {
+ FIRSTPASS_STATS tmp_frame = first_frame;
+
+ rc->frames_to_key /= 2;
+
+ // Reset to the start of the group.
+ reset_fpf_position(twopass, start_position);
+
+ kf_group_err = 0.0;
+
+ // Rescan to get the correct error data for the forced kf group.
+ for (i = 0; i < rc->frames_to_key; ++i) {
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ input_stats(twopass, &tmp_frame);
+ }
+ rc->next_key_frame_forced = 1;
+ } else if (twopass->stats_in == twopass->stats_in_end ||
+ rc->frames_to_key >= cpi->oxcf.key_freq) {
+ rc->next_key_frame_forced = 1;
+ } else {
+ rc->next_key_frame_forced = 0;
+ }
+
+ // Special case for the last key frame of the file.
+ if (twopass->stats_in >= twopass->stats_in_end) {
+ // Accumulate kf group error.
+ kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ }
+
+ // Calculate the number of bits that should be assigned to the kf group.
+ if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ // Maximum number of bits for a single normal frame (not key frame).
+ const int max_bits = frame_max_bits(rc, &cpi->oxcf);
+
+ // Maximum number of bits allocated to the key frame group.
+ int64_t max_grp_bits;
+
+ // Default allocation based on bits left and relative
+ // complexity of the section.
+ twopass->kf_group_bits = (int64_t)(
+ twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+
+ // Clip based on maximum per frame rate defined by the user.
+ max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
+ if (twopass->kf_group_bits > max_grp_bits)
+ twopass->kf_group_bits = max_grp_bits;
+ } else {
+ twopass->kf_group_bits = 0;
+ }
+ twopass->kf_group_bits = AOMMAX(0, twopass->kf_group_bits);
+
+ // Reset the first pass file position.
+ reset_fpf_position(twopass, start_position);
+
+ // Scan through the kf group collating various stats used to determine
+ // how many bits to spend on it.
+ decay_accumulator = 1.0;
+ boost_score = 0.0;
+ // In AOM_Q mode the per-frame boost cap scales with the group length,
+ // clamped to [KF_MIN_FRAME_BOOST, KF_MAX_FRAME_BOOST].
+ const double kf_max_boost =
+ cpi->oxcf.rc_mode == AOM_Q
+ ? AOMMIN(AOMMAX(rc->frames_to_key * 2.0, KF_MIN_FRAME_BOOST),
+ KF_MAX_FRAME_BOOST)
+ : KF_MAX_FRAME_BOOST;
+ for (i = 0; i < (rc->frames_to_key - 1); ++i) {
+ if (EOF == input_stats(twopass, &next_frame)) break;
+
+ // Monitor for static sections.
+ zero_motion_accumulator = AOMMIN(zero_motion_accumulator,
+ get_zero_motion_factor(cpi, &next_frame));
+
+ // Not all frames in the group are necessarily used in calculating boost.
+ if ((i <= rc->max_gf_interval) ||
+ ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) {
+ // NOTE(review): the boost is computed from |this_frame| (whatever the
+ // key-frame search above left in it), not from |next_frame| -- confirm
+ // this is intended.
+ const double frame_boost =
+ calc_frame_boost(cpi, this_frame, 0, kf_max_boost);
+
+ // How fast is prediction quality decaying.
+ if (!detect_flash(twopass, 0)) {
+ const double loop_decay_rate =
+ get_prediction_decay_rate(cpi, &next_frame);
+ decay_accumulator *= loop_decay_rate;
+ decay_accumulator = AOMMAX(decay_accumulator, MIN_DECAY_FACTOR);
+ av_decay_accumulator += decay_accumulator;
+ ++loop_decay_counter;
+ }
+ boost_score += (decay_accumulator * frame_boost);
+ }
+ }
+ if (loop_decay_counter > 0)
+ av_decay_accumulator /= (double)loop_decay_counter;
+
+ reset_fpf_position(twopass, start_position);
+
+ // Store the zero motion percentage
+ twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0);
+
+ // Calculate a section intra ratio used in setting max loop filter.
+ twopass->section_intra_rating = calculate_section_intra_ratio(
+ start_position, twopass->stats_in_end, rc->frames_to_key);
+
+ // Apply various clamps for min and max boost
+ rc->kf_boost = (int)(av_decay_accumulator * boost_score);
+ rc->kf_boost = AOMMAX(rc->kf_boost, (rc->frames_to_key * 3));
+ rc->kf_boost = AOMMAX(rc->kf_boost, MIN_KF_BOOST);
+
+ // Work out how many bits to allocate for the key frame itself.
+ kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
+ twopass->kf_group_bits);
+
+ // Work out the fraction of the kf group bits reserved for the inter frames
+ // within the group after discounting the bits for the kf itself.
+ if (twopass->kf_group_bits) {
+ twopass->kfgroup_inter_fraction =
+ (double)(twopass->kf_group_bits - kf_bits) /
+ (double)twopass->kf_group_bits;
+ } else {
+ twopass->kfgroup_inter_fraction = 1.0;
+ }
+
+ twopass->kf_group_bits -= kf_bits;
+
+ // Save the bits to spend on the key frame.
+ gf_group->bit_allocation[0] = kf_bits;
+ gf_group->update_type[0] = KF_UPDATE;
+ gf_group->rf_level[0] = KF_STD;
+
+ // Note the total error score of the kf group minus the key frame itself.
+ // Converted via int64_t rather than int: define_gf_group() updates this
+ // field with an int64_t cast, and converting a double that does not fit in
+ // an int is undefined behaviour, which a large accumulated group error
+ // could trigger.
+ twopass->kf_group_error_left = (int64_t)(kf_group_err - kf_mod_err);
+
+ // Adjust the count of total modified error left.
+ // The count of bits left is adjusted elsewhere based on real coded frame
+ // sizes.
+ twopass->modified_error_left -= kf_group_err;
+}
+
+// Writes the five reference-buffer refresh flags on |cpi| in one call.
+static void assign_ref_refresh_flags(AV1_COMP *cpi, int last, int golden,
+                                     int bwd, int alt2, int alt) {
+  cpi->refresh_last_frame = last;
+  cpi->refresh_golden_frame = golden;
+  cpi->refresh_bwd_ref_frame = bwd;
+  cpi->refresh_alt2_ref_frame = alt2;
+  cpi->refresh_alt_ref_frame = alt;
+}
+
+// Define the reference buffers that will be updated post encode.
+// The choice is driven by the update type of the current entry of the GF
+// group; the source-type flags in cpi->rc are cleared first and then set
+// again for the update types that need them.
+static void configure_buffer_updates(AV1_COMP *cpi) {
+  RATE_CONTROL *const rate_ctrl = &cpi->rc;
+  const GF_GROUP *const group = &cpi->twopass.gf_group;
+
+  // NOTE(weitinglin): Should we define another function to take care of
+  // cpi->rc.is_$Source_Type to make this function as it is in the comment?
+
+  rate_ctrl->is_src_frame_alt_ref = 0;
+  rate_ctrl->is_bwd_ref_frame = 0;
+  rate_ctrl->is_last_bipred_frame = 0;
+  rate_ctrl->is_bipred_frame = 0;
+  rate_ctrl->is_src_frame_ext_arf = 0;
+
+  // Argument order below: last, golden, bwd, alt2, alt.
+  switch (group->update_type[group->index]) {
+    case KF_UPDATE: assign_ref_refresh_flags(cpi, 1, 1, 1, 1, 1); break;
+
+    case LF_UPDATE: assign_ref_refresh_flags(cpi, 1, 0, 0, 0, 0); break;
+
+    case GF_UPDATE:
+      // TODO(zoeliu): To further investigate whether 'refresh_last_frame' is
+      // needed.
+      assign_ref_refresh_flags(cpi, 1, 1, 0, 0, 0);
+      break;
+
+    case OVERLAY_UPDATE:
+      assign_ref_refresh_flags(cpi, 0, 1, 0, 0, 0);
+      rate_ctrl->is_src_frame_alt_ref = 1;
+      break;
+
+    case ARF_UPDATE:
+      // NOTE: BWDREF does not get updated along with ALTREF_FRAME.
+      assign_ref_refresh_flags(cpi, 0, 0, 0, 0, 1);
+      break;
+
+    case BRF_UPDATE:
+      assign_ref_refresh_flags(cpi, 0, 0, 1, 0, 0);
+      rate_ctrl->is_bwd_ref_frame = 1;
+      break;
+
+    case LAST_BIPRED_UPDATE:
+      assign_ref_refresh_flags(cpi, 1, 0, 0, 0, 0);
+      rate_ctrl->is_last_bipred_frame = 1;
+      break;
+
+    case BIPRED_UPDATE:
+      assign_ref_refresh_flags(cpi, 1, 0, 0, 0, 0);
+      rate_ctrl->is_bipred_frame = 1;
+      break;
+
+    case INTNL_OVERLAY_UPDATE:
+      assign_ref_refresh_flags(cpi, 1, 0, 0, 0, 0);
+      rate_ctrl->is_src_frame_alt_ref = 1;
+      rate_ctrl->is_src_frame_ext_arf = 1;
+      break;
+
+    case INTNL_ARF_UPDATE: {
+      // The internal ARF refreshes BWDREF when the new bwdref update rule is
+      // active, and ALTREF2 otherwise.
+#if USE_SYMM_MULTI_LAYER
+      const int goes_to_bwd = (cpi->new_bwdref_update_rule == 1);
+#else
+      const int goes_to_bwd = 0;
+#endif
+      assign_ref_refresh_flags(cpi, 0, 0, goes_to_bwd, !goes_to_bwd, 0);
+      break;
+    }
+
+    default: assert(0); break;
+  }
+}
+
+// Sets the reference-buffer refresh flags for |update_type|.
+// ARF_UPDATE refreshes only ALTREF, INTNL_ARF_UPDATE only ALTREF2 and
+// BIPRED_UPDATE only BWDREF; every other update type refreshes only LAST.
+// GOLDEN is never refreshed here. The alt-ref source flags in cpi->rc are
+// cleared only for the two ARF update types.
+void av1_configure_buffer_updates_firstpass(AV1_COMP *cpi,
+                                            FRAME_UPDATE_TYPE update_type) {
+  RATE_CONTROL *rate_ctrl = &cpi->rc;
+  const int is_arf = (update_type == ARF_UPDATE);
+  const int is_intnl_arf = (update_type == INTNL_ARF_UPDATE);
+  const int is_bipred = (update_type == BIPRED_UPDATE);
+
+  cpi->refresh_last_frame = !(is_arf || is_intnl_arf || is_bipred);
+  cpi->refresh_golden_frame = 0;
+  cpi->refresh_bwd_ref_frame = is_bipred;
+  cpi->refresh_alt2_ref_frame = is_intnl_arf;
+  cpi->refresh_alt_ref_frame = is_arf;
+
+  rate_ctrl->is_bwd_ref_frame = is_bipred;
+
+  if (is_arf || is_intnl_arf) rate_ctrl->is_src_frame_alt_ref = 0;
+  if (is_intnl_arf) rate_ctrl->is_src_frame_ext_arf = 0;
+}
+
+// Returns 1 when the first-pass stats indicate that the partition search can
+// be skipped for the current frame (the partition size is then assigned
+// according to the variance), and 0 otherwise.
+//
+// The frame qualifies when it is not intra-only and the stats entry at
+// twopass->stats_in, together with the two entries preceding it, each satisfy
+// pcnt_inter - pcnt_motion == 1, i.e. no non-zero motion vector was detected
+// in the first pass.
+static int is_skippable_frame(const AV1_COMP *cpi) {
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *const cur = twopass->stats_in;
+
+  if (frame_is_intra_only(&cpi->common)) return 0;
+
+  // Bounds check on the read position. The distance from the start of the
+  // stats buffer is compared instead of computing (stats_in - 2) up front:
+  // forming a pointer that lies before the start of the array is undefined
+  // behaviour even if it is never dereferenced.
+  if (!(cur - twopass->stats_in_start > 2)) return 0;
+  if (!(cur < twopass->stats_in_end)) return 0;
+
+  return (cur - 1)->pcnt_inter - (cur - 1)->pcnt_motion == 1 &&
+         (cur - 2)->pcnt_inter - (cur - 2)->pcnt_motion == 1 &&
+         cur->pcnt_inter - cur->pcnt_motion == 1;
+}
+
+// Prepares the second-pass rate-control state for the frame about to be
+// encoded: reads the next first-pass stats entry, starts a new key-frame
+// group and/or GF group when due, configures the reference buffer updates
+// and stores the clamped bit target in rc->base_frame_target.
+//
+// Returns early without touching anything when no first-pass stats are
+// available (twopass->stats_in is NULL) or when the stats are exhausted.
+void av1_rc_get_second_pass_params(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  int frames_left;
+  FIRSTPASS_STATS this_frame;
+
+  int target_rate;
+
+  frames_left = (int)(twopass->total_stats.count - cm->current_video_frame);
+
+  if (!twopass->stats_in) return;
+
+  // If this is an arf frame then we don't want to read the stats file or
+  // advance the input pointer as we already have what we need.
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+    configure_buffer_updates(cpi);
+    target_rate = gf_group->bit_allocation[gf_group->index];
+    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+    rc->base_frame_target = target_rate;
+
+    if (cpi->no_show_kf) {
+      assert(gf_group->update_type[gf_group->index] == ARF_UPDATE);
+      cm->frame_type = KEY_FRAME;
+    } else {
+      cm->frame_type = INTER_FRAME;
+    }
+
+    // Do the firstpass stats indicate that this frame is skippable for the
+    // partition search?
+    if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+      cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+    }
+
+    return;
+  }
+
+  aom_clear_system_state();
+
+  if (cpi->oxcf.rc_mode == AOM_Q) {
+    twopass->active_worst_quality = cpi->oxcf.cq_level;
+  } else if (cm->current_video_frame == 0) {
+    // Special case code for first frame.
+    const int section_target_bandwidth =
+        (int)(twopass->bits_left / frames_left);
+    const double section_length = twopass->total_left_stats.count;
+    const double section_error =
+        twopass->total_left_stats.coded_error / section_length;
+    const double section_intra_skip =
+        twopass->total_left_stats.intra_skip_pct / section_length;
+    const double section_inactive_zone =
+        (twopass->total_left_stats.inactive_zone_rows * 2) /
+        ((double)cm->mb_rows * section_length);
+    const int tmp_q = get_twopass_worst_quality(
+        cpi, section_error, section_intra_skip + section_inactive_zone,
+        section_target_bandwidth, DEFAULT_GRP_WEIGHT);
+
+    // Seed the running Q statistics from the estimate for the whole clip.
+    twopass->active_worst_quality = tmp_q;
+    twopass->baseline_active_worst_quality = tmp_q;
+    rc->ni_av_qi = tmp_q;
+    rc->last_q[INTER_FRAME] = tmp_q;
+    rc->avg_q = av1_convert_qindex_to_q(tmp_q, cm->seq_params.bit_depth);
+    rc->avg_frame_qindex[INTER_FRAME] = tmp_q;
+    rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
+    rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
+  }
+
+  av1_zero(this_frame);
+  if (EOF == input_stats(twopass, &this_frame)) return;
+
+  // Set the frame content type flag.
+  if (this_frame.intra_skip_pct >= FC_ANIMATION_THRESH)
+    twopass->fr_content_type = FC_GRAPHICS_ANIMATION;
+  else
+    twopass->fr_content_type = FC_NORMAL;
+
+  // Keyframe and section processing.
+  if (rc->frames_to_key == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY)) {
+    // find_next_key_frame() receives a pointer to this_frame; the stats read
+    // above are saved and restored around the call so the code below still
+    // sees this frame's own stats.
+    FIRSTPASS_STATS this_frame_copy;
+    this_frame_copy = this_frame;
+    // Define next KF group and assign bits to it.
+    find_next_key_frame(cpi, &this_frame);
+    this_frame = this_frame_copy;
+  } else {
+    cm->frame_type = INTER_FRAME;
+  }
+
+  // Define a new GF/ARF group. (Should always enter here for key frames).
+  if (rc->frames_till_gf_update_due == 0) {
+    define_gf_group(cpi, &this_frame);
+
+    rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+
+#if ARF_STATS_OUTPUT
+    {
+      FILE *fpfile;
+      fpfile = fopen("arf.stt", "a");
+      ++arf_count;
+      // fopen() returns NULL on failure (e.g. unwritable working directory);
+      // skip the debug dump instead of handing NULL to fprintf()/fclose().
+      if (fpfile != NULL) {
+        fprintf(fpfile, "%10d %10d %10d %10d %10d\n", cm->current_video_frame,
+                rc->frames_till_gf_update_due, rc->kf_boost, arf_count,
+                rc->gfu_boost);
+
+        fclose(fpfile);
+      }
+    }
+#endif
+  }
+
+  configure_buffer_updates(cpi);
+
+  // Do the firstpass stats indicate that this frame is skippable for the
+  // partition search?
+  if (cpi->sf.allow_partition_search_skip && cpi->oxcf.pass == 2) {
+    cpi->partition_search_skippable_frame = is_skippable_frame(cpi);
+  }
+
+  target_rate = gf_group->bit_allocation[gf_group->index];
+
+  if (cpi->common.frame_type == KEY_FRAME)
+    target_rate = av1_rc_clamp_iframe_target_size(cpi, target_rate);
+  else
+    target_rate = av1_rc_clamp_pframe_target_size(cpi, target_rate);
+
+  rc->base_frame_target = target_rate;
+
+  {
+    // When resizing is enabled the MB count of the initial frame size is
+    // used instead of the current one.
+    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                            ? cpi->initial_mbs
+                            : cpi->common.MBs;
+    // Log of the per-MB average first-pass intra error and wavelet energy;
+    // the +1.0 keeps the argument of log() at or above 1.
+    twopass->mb_av_energy = log((this_frame.intra_error / num_mbs) + 1.0);
+    twopass->frame_avg_haar_energy =
+        log((this_frame.frame_avg_wavelet_energy / num_mbs) + 1.0);
+  }
+
+  // Update the total stats remaining structure.
+  subtract_stats(&twopass->total_left_stats, &this_frame);
+}
+
+// Upper bounds used when clamping twopass->extend_minq below; the _CQ value
+// is used when the rate-control mode is AOM_CQ.
+#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
+// A frame whose projected size is below base_frame_target divided by this
+// ratio is treated as a large undershoot (see fast_extra_thresh below).
+#define HIGH_UNDERSHOOT_RATIO 2
+// Post-encode update of the two-pass rate-control state: charges the frame's
+// bit target against the remaining budgets, recomputes the rate error
+// estimate and adapts the min/max Q extension counters.
+void av1_twopass_postencode_update(AV1_COMP *cpi) {
+ TWO_PASS *const twopass = &cpi->twopass;
+ RATE_CONTROL *const rc = &cpi->rc;
+ // Note: the budget is charged with the frame's target, not with its
+ // projected size.
+ const int bits_used = rc->base_frame_target;
+
+ // VBR correction is done through rc->vbr_bits_off_target. Based on the
+ // sign of this value, a limited % adjustment is made to the target rate
+ // of subsequent frames, to try and push it back towards 0. This method
+ // is designed to prevent extreme behaviour at the end of a clip
+ // or group of frames.
+ rc->vbr_bits_off_target += rc->base_frame_target - rc->projected_frame_size;
+ twopass->bits_left = AOMMAX(twopass->bits_left - bits_used, 0);
+
+ // Calculate the pct rc error.
+ if (rc->total_actual_bits) {
+ rc->rate_error_estimate =
+ (int)((rc->vbr_bits_off_target * 100) / rc->total_actual_bits);
+ rc->rate_error_estimate = clamp(rc->rate_error_estimate, -100, 100);
+ } else {
+ rc->rate_error_estimate = 0;
+ }
+
+ // Key frames do not draw from the key-frame group budget here; the budget
+ // is floored at zero either way.
+ if (cpi->common.frame_type != KEY_FRAME) {
+ twopass->kf_group_bits -= bits_used;
+ twopass->last_kfgroup_zeromotion_pct = twopass->kf_zeromotion_pct;
+ }
+ twopass->kf_group_bits = AOMMAX(twopass->kf_group_bits, 0);
+
+ // If the rate control is drifting consider adjustment to min or maxq.
+ if ((cpi->oxcf.rc_mode != AOM_Q) &&
+ (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD) &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ const int maxq_adj_limit =
+ rc->worst_quality - twopass->active_worst_quality;
+ const int minq_adj_limit =
+ (cpi->oxcf.rc_mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
+
+ // Undershoot.
+ if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
+ --twopass->extend_maxq;
+ if (rc->rolling_target_bits >= rc->rolling_actual_bits)
+ ++twopass->extend_minq;
+ // Overshoot.
+ } else if (rc->rate_error_estimate < -cpi->oxcf.over_shoot_pct) {
+ --twopass->extend_minq;
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ ++twopass->extend_maxq;
+ } else {
+ // Adjustment for extreme local overshoot.
+ if (rc->projected_frame_size > (2 * rc->base_frame_target) &&
+ rc->projected_frame_size > (2 * rc->avg_frame_bandwidth))
+ ++twopass->extend_maxq;
+
+ // Unwind undershoot or overshoot adjustment.
+ if (rc->rolling_target_bits < rc->rolling_actual_bits)
+ --twopass->extend_minq;
+ else if (rc->rolling_target_bits > rc->rolling_actual_bits)
+ --twopass->extend_maxq;
+ }
+
+ // Keep both extension counters inside [0, limit].
+ twopass->extend_minq = clamp(twopass->extend_minq, 0, minq_adj_limit);
+ twopass->extend_maxq = clamp(twopass->extend_maxq, 0, maxq_adj_limit);
+
+ // If there is a big and unexpected undershoot then feed the extra
+ // bits back in quickly. One situation where this may happen is if a
+ // frame is unexpectedly almost perfectly predicted by the ARF or GF
+ // but not very well predicted by the previous frame.
+ if (!frame_is_kf_gf_arf(cpi) && !cpi->rc.is_src_frame_alt_ref) {
+ int fast_extra_thresh = rc->base_frame_target / HIGH_UNDERSHOOT_RATIO;
+ if (rc->projected_frame_size < fast_extra_thresh) {
+ // Accumulate the shortfall, capped at four average frames' worth.
+ rc->vbr_bits_off_target_fast +=
+ fast_extra_thresh - rc->projected_frame_size;
+ rc->vbr_bits_off_target_fast =
+ AOMMIN(rc->vbr_bits_off_target_fast, (4 * rc->avg_frame_bandwidth));
+
+ // Fast adaptation of minQ if necessary to use up the extra bits.
+ if (rc->avg_frame_bandwidth) {
+ twopass->extend_minq_fast =
+ (int)(rc->vbr_bits_off_target_fast * 8 / rc->avg_frame_bandwidth);
+ }
+ // extend_minq + extend_minq_fast must not exceed minq_adj_limit.
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else if (rc->vbr_bits_off_target_fast) {
+ twopass->extend_minq_fast = AOMMIN(
+ twopass->extend_minq_fast, minq_adj_limit - twopass->extend_minq);
+ } else {
+ twopass->extend_minq_fast = 0;
+ }
+ }
+ }
+}
diff --git a/third_party/aom/av1/encoder/firstpass.h b/third_party/aom/av1/encoder/firstpass.h
new file mode 100644
index 0000000000..4b7325ae21
--- /dev/null
+++ b/third_party/aom/av1/encoder/firstpass.h
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_FIRSTPASS_H_
+#define AOM_AV1_ENCODER_FIRSTPASS_H_
+
+#include "av1/common/enums.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/encoder/lookahead.h"
+#include "av1/encoder/ratectrl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Per-macroblock first-pass statistics; only compiled when
+// CONFIG_FP_MB_STATS is set.
+#if CONFIG_FP_MB_STATS
+
+// Each FPMB_*_MASK below is a distinct single bit (0x01 .. 0x80).
+// NOTE(review): presumably they are OR-ed into one uint8_t per macroblock;
+// confirm against the code that writes the mb stats.
+#define FPMB_DCINTRA_MASK 0x01
+
+#define FPMB_MOTION_ZERO_MASK 0x02
+#define FPMB_MOTION_LEFT_MASK 0x04
+#define FPMB_MOTION_RIGHT_MASK 0x08
+#define FPMB_MOTION_UP_MASK 0x10
+#define FPMB_MOTION_DOWN_MASK 0x20
+
+#define FPMB_ERROR_SMALL_MASK 0x40
+#define FPMB_ERROR_LARGE_MASK 0x80
+// NOTE(review): presumably the error thresholds that select the
+// SMALL/LARGE error bits above; their use is not visible in this header.
+#define FPMB_ERROR_SMALL_TH 2000
+#define FPMB_ERROR_LARGE_TH 48000
+
+// Start and end pointers of a byte buffer of macroblock stats.
+typedef struct {
+ uint8_t *mb_stats_start;
+ uint8_t *mb_stats_end;
+} FIRSTPASS_MB_STATS;
+#endif
+
+// Length of the bi-predictive frame group (BFG)
+// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+// number of bi-predictive frames.
+#define BFG_INTERVAL 2
+// The maximum number of extra ALTREF's except ALTREF_FRAME
+#define MAX_EXT_ARFS (REF_FRAMES - BWDREF_FRAME - 1)
+
+// Interval unit used by get_number_of_extra_arfs(): the number of extra ARFs
+// is chosen by comparing the interval against multiples of this value.
+#define MIN_EXT_ARF_INTERVAL 4
+
+// NOTE(review): the four thresholds below are not referenced in this header;
+// see their users in the encoder sources for the exact meaning.
+#define MIN_ZERO_MOTION 0.95
+#define MAX_SR_CODED_ERROR 40
+#define MAX_RAW_ERR_VAR 2000
+#define MIN_MV_IN_OUT 0.4
+
+// av1_twopass_postencode_update() only considers min/max Q adjustment while
+// twopass.gf_zeromotion_pct is below this value.
+#define VLOW_MOTION_THRESHOLD 950
+
+// One record of first-pass statistics. The same layout is used both for a
+// single frame and for accumulated totals (see TWO_PASS.total_stats and
+// TWO_PASS.total_left_stats). All fields are doubles.
+typedef struct {
+ double frame;
+ double weight;
+ // Divided by the MB count and passed through log() to derive
+ // TWO_PASS.mb_av_energy in av1_rc_get_second_pass_params().
+ double intra_error;
+ // Same treatment as intra_error; yields TWO_PASS.frame_avg_haar_energy.
+ double frame_avg_wavelet_energy;
+ double coded_error;
+ double sr_coded_error;
+ // is_skippable_frame() treats pcnt_inter - pcnt_motion == 1 as "no
+ // non-zero motion vector detected".
+ double pcnt_inter;
+ double pcnt_motion;
+ double pcnt_second_ref;
+ double pcnt_neutral;
+ // Compared against FC_ANIMATION_THRESH to classify the frame content.
+ double intra_skip_pct;
+ double inactive_zone_rows; // Image mask rows top and bottom.
+ double inactive_zone_cols; // Image mask columns at left and right edges.
+ double MVr;
+ double mvr_abs;
+ double MVc;
+ double mvc_abs;
+ double MVrv;
+ double MVcv;
+ double mv_in_out_count;
+ double new_mv_count;
+ double duration;
+ // In accumulated records this is used as a frame count (e.g.
+ // total_stats.count - current_video_frame gives the frames left).
+ double count;
+ // standard deviation for (0, 0) motion prediction error
+ double raw_error_stdev;
+} FIRSTPASS_STATS;
+
+// Update type of a frame within a GF group (stored per frame in
+// GF_GROUP.update_type[]). The buffer-update configuration functions switch
+// on this value to decide which reference buffers the frame refreshes.
+typedef enum {
+ KF_UPDATE = 0,
+ LF_UPDATE = 1,
+ GF_UPDATE = 2,
+ ARF_UPDATE = 3,
+ OVERLAY_UPDATE = 4,
+ BRF_UPDATE = 5, // Backward Reference Frame
+ LAST_BIPRED_UPDATE = 6, // Last Bi-predictive Frame
+ BIPRED_UPDATE = 7, // Bi-predictive Frame, but not the last one
+ INTNL_OVERLAY_UPDATE = 8, // Internal Overlay Frame
+ INTNL_ARF_UPDATE = 9, // Internal Altref Frame (candidate for ALTREF2)
+ FRAME_UPDATE_TYPES = 10 // Number of update types; not a type itself
+} FRAME_UPDATE_TYPE;
+
+// A frame whose first-pass intra_skip_pct is at or above this threshold is
+// classified as FC_GRAPHICS_ANIMATION; otherwise it is FC_NORMAL.
+#define FC_ANIMATION_THRESH 0.15
+// Content classification of the current frame (TWO_PASS.fr_content_type).
+typedef enum {
+ FC_NORMAL = 0,
+ FC_GRAPHICS_ANIMATION = 1,
+ FRAME_CONTENT_TYPES = 2 // Number of content types; not a type itself
+} FRAME_CONTENT_TYPE;
+
+// Per-frame layout of a golden-frame (GF/ARF) group. Every per-frame array
+// holds (MAX_LAG_BUFFERS * 2) + 1 entries; `index` selects the entry of the
+// frame currently being processed.
+typedef struct {
+ // Position of the current frame within the per-frame arrays below.
+ unsigned char index;
+ RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
+ // Update type of each frame; drives the buffer refresh configuration.
+ FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if USE_SYMM_MULTI_LAYER
+ // Only present when the symmetric multi-layer structure is compiled in.
+ unsigned char arf_pos_in_gf[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_level[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char pyramid_height;
+ unsigned char pyramid_lvl_nodes[MAX_PYRAMID_LVL];
+#endif
+ unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char ref_fb_idx_map[(MAX_LAG_BUFFERS * 2) + 1][REF_FRAMES];
+ unsigned char refresh_idx[(MAX_LAG_BUFFERS * 2) + 1];
+ unsigned char refresh_flag[(MAX_LAG_BUFFERS * 2) + 1];
+ // Bits allocated to each frame; av1_rc_get_second_pass_params() clamps
+ // bit_allocation[index] to obtain the frame's base target.
+ int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+} GF_GROUP;
+
+// State of the two-pass rate control: the first-pass stats stream, the
+// remaining bit budgets and the adaptive Q extension counters.
+typedef struct {
+ unsigned int section_intra_rating;
+ // Accumulated stats; total_stats.count is used as the total frame count.
+ FIRSTPASS_STATS total_stats;
+ FIRSTPASS_STATS this_frame_stats;
+ // Current read position in the first-pass stats, bounded by
+ // stats_in_start and stats_in_end. A NULL stats_in makes
+ // av1_rc_get_second_pass_params() return immediately.
+ const FIRSTPASS_STATS *stats_in;
+ const FIRSTPASS_STATS *stats_in_start;
+ const FIRSTPASS_STATS *stats_in_end;
+ // Stats still to be consumed; each frame's stats are subtracted from it
+ // at the end of av1_rc_get_second_pass_params().
+ FIRSTPASS_STATS total_left_stats;
+ int first_pass_done;
+ // Remaining bit budget; reduced by each frame's base target after
+ // encoding and floored at 0.
+ int64_t bits_left;
+ double modified_error_min;
+ double modified_error_max;
+ double modified_error_left;
+ // log(intra_error / num_mbs + 1.0) of the current frame.
+ double mb_av_energy;
+ // log(frame_avg_wavelet_energy / num_mbs + 1.0) of the current frame.
+ double frame_avg_haar_energy;
+
+#if CONFIG_FP_MB_STATS
+ uint8_t *frame_mb_stats_buf;
+ uint8_t *this_frame_mb_stats;
+ FIRSTPASS_MB_STATS firstpass_mb_stats;
+#endif
+ // An indication of the content type of the current frame
+ FRAME_CONTENT_TYPE fr_content_type;
+
+ // Projected total bits available for a key frame group of frames
+ // (reduced by each non-key frame's base target, floored at 0).
+ int64_t kf_group_bits;
+
+ // Error score of frames still to be coded in kf group
+ int64_t kf_group_error_left;
+
+ // The fraction for a kf groups total bits allocated to the inter frames
+ double kfgroup_inter_fraction;
+
+ int sr_update_lag;
+
+ int kf_zeromotion_pct;
+ // Copy of kf_zeromotion_pct taken after each non-key frame is encoded.
+ int last_kfgroup_zeromotion_pct;
+ // Q adaptation is skipped once this reaches VLOW_MOTION_THRESHOLD.
+ int gf_zeromotion_pct;
+ // Set to the configured cq_level in AOM_Q mode, or to the estimated
+ // worst quality on the first frame otherwise.
+ int active_worst_quality;
+ int baseline_active_worst_quality;
+ // Q range extension counters maintained by
+ // av1_twopass_postencode_update(); extend_minq and extend_maxq are
+ // clamped to [0, limit] there.
+ int extend_minq;
+ int extend_maxq;
+ int extend_minq_fast;
+
+ GF_GROUP gf_group;
+} TWO_PASS;
+
+// Forward declaration: the prototypes below only need a pointer to it.
+struct AV1_COMP;
+
+void av1_init_first_pass(struct AV1_COMP *cpi);
+void av1_rc_get_first_pass_params(struct AV1_COMP *cpi);
+void av1_first_pass(struct AV1_COMP *cpi, const struct lookahead_entry *source);
+void av1_end_first_pass(struct AV1_COMP *cpi);
+
+void av1_init_second_pass(struct AV1_COMP *cpi);
+// Sets up frame type, buffer updates and the bit target (in
+// rc.base_frame_target) for the next frame from the first-pass stats.
+void av1_rc_get_second_pass_params(struct AV1_COMP *cpi);
+// Sets the reference buffer refresh flags for a first-pass frame of the
+// given update type.
+void av1_configure_buffer_updates_firstpass(struct AV1_COMP *cpi,
+ FRAME_UPDATE_TYPE update_type);
+
+// Post encode update of the rate control parameters for 2-pass
+void av1_twopass_postencode_update(struct AV1_COMP *cpi);
+
+// Returns how many extra ARFs to use for a group of `interval` frames.
+//
+// The result is 0 unless an ARF is pending and MAX_EXT_ARFS is positive.
+// Otherwise it is MAX_EXT_ARFS when the interval spans at least
+// (MAX_EXT_ARFS + 1) minimum ARF intervals, MAX_EXT_ARFS - 1 when it spans at
+// least MAX_EXT_ARFS of them, and 0 for anything shorter.
+static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+  if (!(arf_pending && MAX_EXT_ARFS > 0)) return 0;
+
+  if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1)) {
+    return MAX_EXT_ARFS;
+  }
+  if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS) {
+    return MAX_EXT_ARFS - 1;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_FIRSTPASS_H_
diff --git a/third_party/aom/av1/encoder/global_motion.c b/third_party/aom/av1/encoder/global_motion.c
new file mode 100644
index 0000000000..e9f8b0bb47
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.c
@@ -0,0 +1,298 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "av1/encoder/global_motion.h"
+
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/corner_detect.h"
+#include "av1/encoder/corner_match.h"
+#include "av1/encoder/ransac.h"
+
+// Maximum number of corners requested from the corner detector per frame;
+// also sizes the on-stack corner arrays (2 ints per corner).
+#define MAX_CORNERS 4096
+// A motion whose inlier count is below this fraction of the number of
+// correspondences has its inlier count reset to 0 (i.e. it is discarded).
+#define MIN_INLIER_PROB 0.1
+
+// Pure-translation models with both components below this magnitude are
+// snapped to zero in convert_to_params().
+#define MIN_TRANS_THRESH (1 * GM_TRANS_DECODE_FACTOR)
+
+// Border over which to compute the global motion
+#define ERRORADV_BORDER 0
+
+// Thresholds indexed by erroradv_type in is_enough_erroradvantage():
+// erroradv_tr bounds the error advantage itself, erroradv_prod_tr bounds the
+// product of the error advantage and the parameter cost.
+static const double erroradv_tr[] = { 0.65, 0.60, 0.55 };
+static const double erroradv_prod_tr[] = { 20000, 18000, 16000 };
+
+// Returns 1 when a global motion model is worth using, 0 otherwise.
+//
+// Both conditions must hold for the thresholds selected by erroradv_type:
+// the error advantage is below erroradv_tr[], and the error advantage
+// multiplied by the cost of the parameters is below erroradv_prod_tr[].
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+                             int erroradv_type) {
+  assert(erroradv_type < GM_ERRORADV_TR_TYPES);
+
+  if (!(best_erroradvantage < erroradv_tr[erroradv_type])) return 0;
+
+  return best_erroradvantage * params_cost < erroradv_prod_tr[erroradv_type];
+}
+
+// Quantizes the 8 floating-point model parameters in `params` into the
+// integer representation written to `model`.
+//
+// Each parameter is rounded to its precision (translation, alpha or row-3
+// homography), clamped to its allowed range and multiplied by the matching
+// decode factor. The two diagonal alpha terms (indices 2 and 5) are clamped
+// relative to 1.0. If no non-translational term survives quantization and
+// both translations are below MIN_TRANS_THRESH, the translation is zeroed.
+static void convert_to_params(const double *params, int32_t *model) {
+  int has_non_translation = 0;
+  int idx;
+
+  // Translation terms.
+  for (idx = 0; idx < 2; ++idx) {
+    const int32_t rounded =
+        (int32_t)floor(params[idx] * (1 << GM_TRANS_PREC_BITS) + 0.5);
+    model[idx] = (int32_t)clamp(rounded, GM_TRANS_MIN, GM_TRANS_MAX) *
+                 GM_TRANS_DECODE_FACTOR;
+  }
+
+  // Affine (alpha) terms; the diagonal ones are centred on 1.0.
+  for (idx = 2; idx < 6; ++idx) {
+    const int on_diagonal = (idx == 2 || idx == 5);
+    const int diag_value = on_diagonal ? (1 << GM_ALPHA_PREC_BITS) : 0;
+    const int32_t rounded =
+        (int32_t)floor(params[idx] * (1 << GM_ALPHA_PREC_BITS) + 0.5);
+    const int32_t centred =
+        (int32_t)clamp(rounded - diag_value, GM_ALPHA_MIN, GM_ALPHA_MAX);
+
+    has_non_translation |= (centred != 0);
+    model[idx] = (centred + diag_value) * GM_ALPHA_DECODE_FACTOR;
+  }
+
+  // Row-3 homography terms.
+  for (idx = 6; idx < 8; ++idx) {
+    const int32_t rounded =
+        (int32_t)floor(params[idx] * (1 << GM_ROW3HOMO_PREC_BITS) + 0.5);
+    const int32_t scaled =
+        (int32_t)clamp(rounded, GM_ROW3HOMO_MIN, GM_ROW3HOMO_MAX) *
+        GM_ROW3HOMO_DECODE_FACTOR;
+
+    model[idx] = scaled;
+    has_non_translation |= (scaled != 0);
+  }
+
+  // A pure translation that is smaller than the threshold is dropped.
+  if (!has_non_translation && abs(model[0]) < MIN_TRANS_THRESH &&
+      abs(model[1]) < MIN_TRANS_THRESH) {
+    model[0] = 0;
+    model[1] = 0;
+  }
+}
+
+// Fills `model` from the floating-point parameters in `params`: quantizes
+// them into model->wmmat, derives the transformation type from the quantized
+// matrix and marks the model as valid.
+void convert_model_to_params(const double *params, WarpedMotionParams *model) {
+  // The matrix must be written first: the type is derived from it.
+  convert_to_params(params, model->wmmat);
+
+  model->wmtype = get_gmtype(model);
+  model->invalid = 0;
+}
+
+// Returns `param_value` moved by `offset` steps of the bitstream precision of
+// parameter number `param_index`.
+//
+// The value is first brought from warped-model precision down to bitstream
+// precision (and made zero-centred for the two diagonal terms, indices 2 and
+// 5), then offset, clamped to the symmetric range allowed for its parameter
+// class, and finally converted back to warped-model precision.
+static int32_t add_param_offset(int param_index, int32_t param_value,
+                                int32_t offset) {
+  // Precision difference and clamp limit for the parameter's class:
+  // indices 0-1 are translation, 2-5 affine, 6-7 homography.
+  int prec_shift;
+  int limit;
+  if (param_index < 2) {
+    prec_shift = GM_TRANS_PREC_DIFF;
+    limit = GM_TRANS_MAX;
+  } else if (param_index < 6) {
+    prec_shift = GM_ALPHA_PREC_DIFF;
+    limit = GM_ALPHA_MAX;
+  } else {
+    prec_shift = GM_ROW3HOMO_PREC_DIFF;
+    limit = GM_ROW3HOMO_MAX;
+  }
+
+  // Diagonal terms are stored centred on 1.0 in warped-model precision.
+  const int is_one_centered = (param_index == 2 || param_index == 5);
+  const int centre = is_one_centered << WARPEDMODEL_PREC_BITS;
+
+  // Zero-centre and drop to bitstream precision.
+  int32_t reduced = (param_value - centre) >> prec_shift;
+  // Apply the requested step.
+  reduced += offset;
+  // Stay within the number of bits allotted in the bitstream.
+  reduced = (int32_t)clamp(reduced, -limit, limit);
+  // Back to warped-model precision.
+  reduced *= (1 << prec_shift);
+
+  // Undo the zero-centring.
+  return reduced + centre;
+}
+
+// Constrains the matrix of `wm` so that it is representable by the given
+// transformation type, then records that type in wm->wmtype.
+//
+// The constraints are cumulative: each simpler type applies its own
+// restriction plus those of every more general type listed after it
+// (IDENTITY -> TRANSLATION -> ROTZOOM -> AFFINE). Any other type trips an
+// assertion.
+static void force_wmtype(WarpedMotionParams *wm, TransformationType wmtype) {
+  const int at_most_identity = (wmtype == IDENTITY);
+  const int at_most_translation = at_most_identity || (wmtype == TRANSLATION);
+  const int at_most_rotzoom = at_most_translation || (wmtype == ROTZOOM);
+  const int at_most_affine = at_most_rotzoom || (wmtype == AFFINE);
+
+  if (at_most_identity) {
+    // No translation.
+    wm->wmmat[0] = 0;
+    wm->wmmat[1] = 0;
+  }
+  if (at_most_translation) {
+    // Unit scale, no rotation.
+    wm->wmmat[2] = 1 << WARPEDMODEL_PREC_BITS;
+    wm->wmmat[3] = 0;
+  }
+  if (at_most_rotzoom) {
+    // Second row is fully determined by the first.
+    wm->wmmat[4] = -wm->wmmat[3];
+    wm->wmmat[5] = wm->wmmat[2];
+  }
+  if (at_most_affine) {
+    // No perspective terms.
+    wm->wmmat[7] = 0;
+    wm->wmmat[6] = 0;
+  } else {
+    assert(0);
+  }
+
+  wm->wmtype = wmtype;
+}
+
+// Refines the integer parameters of `wm` in place by a coordinate-wise
+// search that minimises av1_warp_error() between `dst` and the warped `ref`,
+// and returns the best error found (never more than best_frame_error).
+//
+// `wmtype` selects how many leading parameters are searched (0, 2, 4 or 6)
+// and the matrix is constrained to that type before and after the search.
+// The step size starts at 1 << (n_refinements - 1) and is halved on each of
+// the n_refinements rounds, so n_refinements must be at least 1 for the
+// initial shift to be well defined.
+int64_t refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements,
+ int64_t best_frame_error) {
+ // Number of searchable parameters for each transformation type.
+ static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+ const int border = ERRORADV_BORDER;
+ int i = 0, p;
+ int n_params = max_trans_model_params[wmtype];
+ int32_t *param_mat = wm->wmmat;
+ int64_t step_error, best_error;
+ int32_t step;
+ int32_t *param;
+ int32_t curr_param;
+ int32_t best_param;
+
+ force_wmtype(wm, wmtype);
+ // Error of the unrefined model, capped by the caller-supplied bound.
+ best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border,
+ d_stride, 0, 0, best_frame_error);
+ best_error = AOMMIN(best_error, best_frame_error);
+ step = 1 << (n_refinements - 1);
+ for (i = 0; i < n_refinements; i++, step >>= 1) {
+ for (p = 0; p < n_params; ++p) {
+ int step_dir = 0;
+ // Skip searches for parameters that are forced to be 0
+ // NOTE(review): no skip is performed in the code below; every one of the
+ // n_params parameters is searched.
+ param = param_mat + p;
+ curr_param = *param;
+ best_param = curr_param;
+ // look to the left
+ *param = add_param_offset(p, curr_param, -step);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = -1;
+ }
+
+ // look to the right
+ *param = add_param_offset(p, curr_param, step);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border, d_stride,
+ 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ step_dir = 1;
+ }
+ // Commit the best of {unchanged, left, right} before continuing.
+ *param = best_param;
+
+ // look to the direction chosen above repeatedly until error increases
+ // for the biggest step size
+ while (step_dir) {
+ *param = add_param_offset(p, best_param, step * step_dir);
+ step_error =
+ av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
+ dst + border * d_stride + border, border, border,
+ d_width - 2 * border, d_height - 2 * border,
+ d_stride, 0, 0, best_error);
+ if (step_error < best_error) {
+ best_error = step_error;
+ best_param = *param;
+ } else {
+ // No further improvement: restore the best value and stop.
+ *param = best_param;
+ step_dir = 0;
+ }
+ }
+ }
+ }
+ // Re-apply the type constraints, then record the type actually described
+ // by the refined matrix.
+ force_wmtype(wm, wmtype);
+ wm->wmtype = get_gmtype(wm);
+ return best_error;
+}
+
+// Maps a transformation type to the RANSAC estimator that fits it.
+// Only AFFINE, ROTZOOM and TRANSLATION are supported; any other type trips
+// an assertion and yields NULL.
+static INLINE RansacFunc get_ransac_type(TransformationType type) {
+  if (type == AFFINE) return ransac_affine;
+  if (type == ROTZOOM) return ransac_rotzoom;
+  if (type == TRANSLATION) return ransac_translation;
+
+  assert(0);
+  return NULL;
+}
+
+// Returns the 8-bit version of the luma plane of a 16-bit frame.
+//
+// The conversion (a right shift by bit_depth - 8 of every sample) is written
+// into frm->y_buffer_8bit, which must already be allocated. It is performed
+// only while frm->buf_8bit_valid is clear; the flag is then set so later
+// calls reuse the converted plane.
+static unsigned char *downconvert_frame(YV12_BUFFER_CONFIG *frm,
+                                        int bit_depth) {
+  uint16_t *src16 = CONVERT_TO_SHORTPTR(frm->y_buffer);
+  uint8_t *dst8 = frm->y_buffer_8bit;
+  int row, col;
+
+  assert(dst8);
+
+  // Already converted: hand back the cached plane.
+  if (frm->buf_8bit_valid) return dst8;
+
+  for (row = 0; row < frm->y_height; ++row) {
+    for (col = 0; col < frm->y_width; ++col) {
+      const int pos = row * frm->y_stride + col;
+      dst8[pos] = src16[pos] >> (bit_depth - 8);
+    }
+  }
+  frm->buf_8bit_valid = 1;
+
+  return dst8;
+}
+
+// Estimates up to `num_motions` global motion models of the given type
+// between the luma planes of `frm` and `ref`, using corner features.
+//
+// Corners are detected in both frames, matched into correspondences, and the
+// RANSAC estimator for `type` fits the models. On return
+// num_inliers_by_motion[i] holds the inlier count of motion i, or 0 when the
+// motion has fewer than MIN_INLIER_PROB * (number of correspondences)
+// inliers; params_by_motion receives the parameters written by the estimator.
+//
+// High bit depth frames are first converted to 8 bits (the result is cached
+// on the frame buffer).
+//
+// Returns 1 if at least one motion has inliers, 0 otherwise. If the
+// correspondence buffer cannot be allocated, every inlier count is set to 0
+// and 0 is returned.
+int compute_global_motion_feature_based(TransformationType type,
+                                        YV12_BUFFER_CONFIG *frm,
+                                        YV12_BUFFER_CONFIG *ref, int bit_depth,
+                                        int *num_inliers_by_motion,
+                                        double *params_by_motion,
+                                        int num_motions) {
+  int i;
+  int num_frm_corners, num_ref_corners;
+  int num_correspondences;
+  int *correspondences;
+  int frm_corners[2 * MAX_CORNERS], ref_corners[2 * MAX_CORNERS];
+  unsigned char *frm_buffer = frm->y_buffer;
+  unsigned char *ref_buffer = ref->y_buffer;
+  RansacFunc ransac = get_ransac_type(type);
+
+  if (frm->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // The frame buffer is 16-bit, so we need to convert to 8 bits for the
+    // following code. We cache the result until the frame is released.
+    frm_buffer = downconvert_frame(frm, bit_depth);
+  }
+  if (ref->flags & YV12_FLAG_HIGHBITDEPTH) {
+    ref_buffer = downconvert_frame(ref, bit_depth);
+  }
+
+  // compute interest points in images using FAST features
+  num_frm_corners = fast_corner_detect(frm_buffer, frm->y_width, frm->y_height,
+                                       frm->y_stride, frm_corners, MAX_CORNERS);
+  num_ref_corners = fast_corner_detect(ref_buffer, ref->y_width, ref->y_height,
+                                       ref->y_stride, ref_corners, MAX_CORNERS);
+
+  // find correspondences between the two images
+  // (4 ints are reserved per corner detected in `frm`).
+  correspondences =
+      (int *)malloc(num_frm_corners * 4 * sizeof(*correspondences));
+  if (correspondences == NULL) {
+    // Without the buffer nothing can be matched: report "no usable motion"
+    // instead of letting the matcher write through a null pointer.
+    for (i = 0; i < num_motions; ++i) {
+      num_inliers_by_motion[i] = 0;
+    }
+    return 0;
+  }
+  num_correspondences = determine_correspondence(
+      frm_buffer, (int *)frm_corners, num_frm_corners, ref_buffer,
+      (int *)ref_corners, num_ref_corners, frm->y_width, frm->y_height,
+      frm->y_stride, ref->y_stride, correspondences);
+
+  ransac(correspondences, num_correspondences, num_inliers_by_motion,
+         params_by_motion, num_motions);
+
+  free(correspondences);
+
+  // Set num_inliers = 0 for motions with too few inliers so they are ignored.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] < MIN_INLIER_PROB * num_correspondences) {
+      num_inliers_by_motion[i] = 0;
+    }
+  }
+
+  // Return true if any one of the motions has inliers.
+  for (i = 0; i < num_motions; ++i) {
+    if (num_inliers_by_motion[i] > 0) return 1;
+  }
+  return 0;
+}
diff --git a/third_party/aom/av1/encoder/global_motion.h b/third_party/aom/av1/encoder/global_motion.h
new file mode 100644
index 0000000000..c7c016c430
--- /dev/null
+++ b/third_party/aom/av1/encoder/global_motion.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+#define AOM_AV1_ENCODER_GLOBAL_MOTION_H_
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "av1/common/mv.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RANSAC_NUM_MOTIONS 1
+
+void convert_model_to_params(const double *params, WarpedMotionParams *model);
+
+int is_enough_erroradvantage(double best_erroradvantage, int params_cost,
+ int erroradv_type);
+
+// Returns the av1_warp_error between "dst" and the result of applying the
+// motion params that result from fine-tuning "wm" to "ref". Note that "wm" is
+// modified in place.
+int64_t refine_integerized_param(WarpedMotionParams *wm,
+ TransformationType wmtype, int use_hbd, int bd,
+ uint8_t *ref, int r_width, int r_height,
+ int r_stride, uint8_t *dst, int d_width,
+ int d_height, int d_stride, int n_refinements,
+ int64_t best_frame_error);
+
+/*
+ Computes "num_motions" candidate global motion parameters between two frames.
+ The array "params_by_motion" should be length 8 * "num_motions". The ordering
+ of each set of parameters is best described by the homography:
+
+      [x'     (m2 m3 m0   [x
+  z .  y'  =   m4 m5 m1 *  y
+       1]      m6 m7 1)    1]
+
+ where m{i} represents the ith value in any given set of parameters.
+
+ "num_inliers_by_motion" should be length "num_motions", and will be populated
+ with the number of inlier feature points for each motion. Params for which
+ the num_inliers_by_motion entry is 0 should be ignored by the caller.
+*/
+int compute_global_motion_feature_based(TransformationType type,
+ YV12_BUFFER_CONFIG *frm,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ int *num_inliers_by_motion,
+ double *params_by_motion,
+ int num_motions);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AV1_ENCODER_GLOBAL_MOTION_H_
diff --git a/third_party/aom/av1/encoder/grain_test_vectors.h b/third_party/aom/av1/encoder/grain_test_vectors.h
new file mode 100644
index 0000000000..945dc37331
--- /dev/null
+++ b/third_party/aom/av1/encoder/grain_test_vectors.h
@@ -0,0 +1,781 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+#define AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
+
+/* Test vectors for emulation of different film grain types.
+ * Note that bit depth would be derived from the bitstream and
+ * not signaled in film grain metadata. The parameters are valid
+ * for any bit depth.
+ *
+ * Each of the 16 entries is initialized positionally, so the order of the
+ * values below has to match the field order of aom_film_grain_t (declared
+ * elsewhere). The trailing comment after each scalar names the field it
+ * fills. The order used by every entry is:
+ *   apply_grain, update_parameters,
+ *   a list of two-element luma scaling points, then num_points_y,
+ *   a list of two-element Cb scaling points, then num_cb_points,
+ *   a list of two-element Cr scaling points, then num_cr_points,
+ *   scaling_shift, ar_coeff_lag,
+ *   three unlabeled coefficient lists (presumably the luma, Cb and Cr
+ *   auto-regression coefficients -- confirm against aom_film_grain_t),
+ *   ar_coeff_shift, cb_mult, cb_luma_mult, cb_offset, cr_mult, cr_luma_mult,
+ *   cr_offset, overlap_flag, clip_to_restricted_range, bit_depth,
+ *   chroma_scaling_from_luma, grain_scale_shift, random_seed.
+ *
+ * In these vectors the three coefficient lists hold 12, 13 and 13 values
+ * when ar_coeff_lag is 2, and 24, 25 and 25 values when ar_coeff_lag is 3.
+ *
+ * NOTE(review): a few entries list more scaling points than the count that
+ * follows them declares (e.g. Tests 13, 14 and 15); the extra points are
+ * presumably ignored by the consumer -- confirm before relying on them.
+ */
+static aom_film_grain_t film_grain_test_vectors[16] = {
+ /* Test 1 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 16, 0 },
+ { 25, 136 },
+ { 33, 144 },
+ { 41, 160 },
+ { 48, 168 },
+ { 56, 136 },
+ { 67, 128 },
+ { 82, 144 },
+ { 97, 152 },
+ { 113, 144 },
+ { 128, 176 },
+ { 143, 168 },
+ { 158, 176 },
+ { 178, 184 } },
+ 14 /* num_points_y */,
+ { { 16, 0 },
+ { 20, 64 },
+ { 28, 88 },
+ { 60, 104 },
+ { 90, 136 },
+ { 105, 160 },
+ { 134, 168 },
+ { 168, 208 } },
+ 8 /* num_cb_points */,
+ { { 16, 0 },
+ { 28, 96 },
+ { 56, 80 },
+ { 66, 96 },
+ { 80, 104 },
+ { 108, 96 },
+ { 122, 112 },
+ { 137, 112 },
+ { 169, 176 } },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 247 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 18 /* cb_offset */,
+ 229 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 54 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /* chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 2 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cb_points */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 3 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 192 }, { 255, 192 } },
+ 2 /* num_points_y */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cb_points */,
+ { { 0, 128 }, { 255, 128 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ {
+ 4, -7, 2, 4, 12, -12, 5, -8, 6, 8, -19, -16, 19,
+ -10, -2, 17, -42, 58, -2, -13, 9, 14, -36, 67, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 1 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 4 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 137 },
+ { 53, 146 },
+ { 63, 155 },
+ { 78, 155 },
+ { 107, 150 },
+ { 122, 147 },
+ { 136, 147 },
+ { 166, 153 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 72 },
+ { 27, 82 },
+ { 33, 91 },
+ { 69, 121 },
+ { 95, 143 },
+ { 108, 154 },
+ { 134, 169 },
+ { 147, 177 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 24, 95 },
+ { 54, 93 },
+ { 65, 94 },
+ { 79, 98 },
+ { 109, 107 },
+ { 124, 119 },
+ { 139, 136 },
+ { 169, 170 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 5 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 64 }, { 255, 64 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 96 },
+ { 32, 90 },
+ { 64, 83 },
+ { 96, 76 },
+ { 128, 68 },
+ { 159, 59 },
+ { 191, 48 },
+ { 223, 34 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 34 },
+ { 64, 48 },
+ { 96, 59 },
+ { 128, 68 },
+ { 159, 76 },
+ { 191, 83 },
+ { 223, 90 },
+ { 255, 96 },
+ },
+ 9 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ -2, 2, -5, 7, -6, 4, -2, -1, 1, -2, 0, -2, 2,
+ -3, -5, 13, -13, 6, -14, 8, -1, 18, -36, 58, 0,
+ },
+ {
+ -2, -1, -3, 14, -4, -1, -3, 0, -1, 7, -31, 7, 2,
+ 0, 1, 0, -7, 50, -8, -2, 2, 2, 2, -4, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1063 /* random_seed */
+ },
+ /* Test 6 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 96 },
+ { 20, 92 },
+ { 39, 88 },
+ { 59, 84 },
+ { 78, 80 },
+ { 98, 75 },
+ { 118, 70 },
+ { 137, 65 },
+ { 157, 60 },
+ { 177, 53 },
+ { 196, 46 },
+ { 216, 38 },
+ { 235, 27 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 2754 /* random_seed */
+ },
+ /* Test 7 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 27 },
+ { 39, 38 },
+ { 59, 46 },
+ { 78, 53 },
+ { 98, 60 },
+ { 118, 65 },
+ { 137, 70 },
+ { 157, 75 },
+ { 177, 80 },
+ { 196, 84 },
+ { 216, 88 },
+ { 235, 92 },
+ { 255, 96 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 8 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 2 /* num_points_y */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cb_points */,
+ { { 0, 62 }, { 255, 62 } },
+ 2 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 9 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 127 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 10 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 48 }, { 255, 48 } },
+ 2 /* num_points_y */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cb_points */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ { -7, -6, -48, -22, 2, -3, -45, 73, -11, -26, -52, 76, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 11 */
+ {
+ 1 /* apply_grain */,
+ 0 /* update_parameters */,
+ { { 0, 32 }, { 255, 32 } },
+ 2 /* num_points_y */,
+ {
+ { 0, 48 },
+ { 32, 45 },
+ { 64, 42 },
+ { 96, 38 },
+ { 128, 34 },
+ { 159, 29 },
+ { 191, 24 },
+ { 223, 17 },
+ { 255, 0 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 0, 0 },
+ { 32, 17 },
+ { 64, 24 },
+ { 96, 29 },
+ { 128, 34 },
+ { 159, 38 },
+ { 191, 42 },
+ { 223, 45 },
+ { 255, 48 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 7, -9, 2, 4, 7, -12, 7, -18, 18, -30, -27, -42,
+ 13, -20, 7, -18, 6, 107, 55, -2, -4, -9, -22, 113,
+ },
+ {
+ -3, -1, -4, 3, -6, -2, 3, 1, -4, -10, -10, -5, -5,
+ -3, -1, -13, -28, -25, -31, -6, -4, 14, -64, 66, 0,
+ },
+ {
+ 0, 4, -3, 13, 0, 1, -3, 0, -3, -10, -68, -4, -2,
+ -5, 2, -3, -20, 62, -31, 0, -4, -1, -8, -29, 0,
+ },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 1357 /* random_seed */
+ },
+ /* Test 12 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 24, 49 },
+ { 39, 69 },
+ { 46, 84 },
+ { 53, 91 },
+ { 63, 100 },
+ { 78, 114 },
+ { 92, 134 },
+ { 164, 139 },
+ },
+ 9 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 20, 31 },
+ { 26, 42 },
+ { 33, 54 },
+ { 40, 65 },
+ { 47, 72 },
+ { 56, 85 },
+ { 84, 123 },
+ { 152, 157 },
+ },
+ 9 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 25, 14 },
+ { 39, 33 },
+ { 47, 40 },
+ { 54, 47 },
+ { 64, 62 },
+ { 79, 76 },
+ { 94, 83 },
+ { 167, 101 },
+ },
+ 9 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 0, 0, -58, 0, 0, 0, -76, 100, -43, 0, -51, 82 },
+ { 0, 0, -49, 0, 0, 0, -36, 22, -30, 0, -38, 7, 39 },
+ { 0, 0, -47, 0, 0, 0, -31, 31, -25, 0, -32, 13, -100 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 0 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 13 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 48 },
+ { 20, 46 },
+ { 39, 44 },
+ { 59, 42 },
+ { 78, 40 },
+ { 98, 38 },
+ { 118, 35 },
+ { 137, 33 },
+ { 157, 30 },
+ { 177, 27 },
+ { 196, 23 },
+ { 216, 19 },
+ { 235, 13 },
+ { 255, 0 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 14 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 0, 0 },
+ { 20, 13 },
+ { 39, 19 },
+ { 59, 23 },
+ { 78, 27 },
+ { 98, 30 },
+ { 118, 33 },
+ { 137, 35 },
+ { 157, 38 },
+ { 177, 40 },
+ { 196, 42 },
+ { 216, 44 },
+ { 235, 46 },
+ { 255, 48 },
+ },
+ 14 /* num_points_y */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cb_points */,
+ { { 0, 0 }, { 255, 0 } },
+ 0 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 10, -30, -20, -39, 1, -24, 12, 103, 60, -9, -24, 113 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ 8 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 1 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 15 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ { { 0, 96 }, { 255, 96 } },
+ 1 /* num_points_y */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cb_points */,
+ { { 0, 96 }, { 255, 96 } },
+ 0 /* num_cr_points */,
+ 11 /* scaling_shift */,
+ 2 /* ar_coeff_lag */,
+ { 5, -15, -10, -19, 0, -12, 6, 51, 30, -5, -12, 56 },
+ { 2, 2, -24, -5, 1, 1, -18, 37, -2, 0, -15, 39, -70 },
+ { 2, 3, -24, -5, -1, 0, -18, 38, -2, 0, -15, 39, -55 },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 1 /*chroma_scaling_from_luma*/,
+ 0 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+ /* Test 16 */
+ {
+ 1 /* apply_grain */,
+ 1 /* update_parameters */,
+ {
+ { 16, 0 },
+ { 58, 126 },
+ { 87, 120 },
+ { 97, 122 },
+ { 112, 125 },
+ { 126, 131 },
+ { 141, 139 },
+ { 199, 153 },
+ },
+ 8 /* num_points_y */,
+ {
+ { 16, 0 },
+ { 59, 68 },
+ { 66, 76 },
+ { 73, 82 },
+ { 79, 85 },
+ { 86, 86 },
+ { 151, 95 },
+ { 192, 101 },
+ },
+ 8 /* num_cb_points */,
+ {
+ { 16, 0 },
+ { 59, 64 },
+ { 89, 80 },
+ { 99, 86 },
+ { 114, 90 },
+ { 129, 93 },
+ { 144, 97 },
+ { 203, 85 },
+ },
+ 8 /* num_cr_points */,
+ 10 /* scaling_shift */,
+ 3 /* ar_coeff_lag */,
+ {
+ 4, 1, 3, 0, 1, -3, 8, -3, 7, -23, 1, -25,
+ 0, -10, 6, -17, -4, 53, 36, 5, -5, -17, 8, 66,
+ },
+ {
+ 0, -2, -2, 8, 5, -1, 1, -1, 5, 16, -33, -9, 6,
+ -1, -3, 10, -47, 63, 0, -15, 3, 11, -42, 75, -69,
+ },
+ {
+ 1, -1, -1, 9, 5, 0, 1, -1, 5, 15, -32, -10, 8,
+ -2, -4, 11, -46, 62, 1, -16, 3, 13, -43, 75, -55,
+ },
+ 7 /* ar_coeff_shift */,
+ 128 /* cb_mult */,
+ 192 /* cb_luma_mult */,
+ 256 /* cb_offset */,
+ 128 /* cr_mult */,
+ 192 /* cr_luma_mult */,
+ 256 /* cr_offset */,
+ 1 /* overlap_flag */,
+ 0 /* clip_to_restricted_range */,
+ 8 /* bit_depth */,
+ 0 /*chroma_scaling_from_luma*/,
+ 2 /* grain_scale_shift*/,
+ 45231 /* random_seed */
+ },
+};
+#endif // AOM_AV1_ENCODER_GRAIN_TEST_VECTORS_H_
diff --git a/third_party/aom/av1/encoder/hash.c b/third_party/aom/av1/encoder/hash.c
new file mode 100644
index 0000000000..180115d9f2
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/hash.h"
+
+// Folds dataLength bytes starting at pData into p_crc_calculator->remainder,
+// one byte per step, using the lookup table built by
+// crc_calculator_init_table(). The remainder is neither reset before nor
+// masked after; callers do that separately.
+// Assumes p_crc_calculator->bits >= 8 so the shift count below is valid.
+static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator,
+                                        uint8_t *pData, uint32_t dataLength) {
+  const uint8_t *cur = pData;
+  for (uint32_t left = dataLength; left > 0; --left) {
+    // Top byte of the running remainder combined with the next input byte;
+    // the narrowing to 8 bits is intentional and selects the table entry.
+    const uint8_t index = (uint8_t)(
+        (p_crc_calculator->remainder >> (p_crc_calculator->bits - 8)) ^
+        *cur++);
+    p_crc_calculator->remainder =
+        (p_crc_calculator->remainder << 8) ^ p_crc_calculator->table[index];
+  }
+}
+
+// Clears the running remainder so a new CRC computation starts from zero.
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+  p_crc_calculator->remainder = 0u;
+}
+
+// Returns the current remainder reduced to the configured CRC width by
+// masking it with final_result_mask.
+static uint32_t crc_calculator_get_crc(CRC_CALCULATOR *p_crc_calculator) {
+  const uint32_t mask = p_crc_calculator->final_result_mask;
+  return p_crc_calculator->remainder & mask;
+}
+
+// Fills p_crc_calculator->table with one entry per possible input byte.
+// Each entry is obtained by pushing the byte through the CRC shift register
+// bit by bit, most significant bit first, XORing in trunc_poly whenever the
+// top bit (bit `bits - 1`) is set before the shift.
+// Requires p_crc_calculator->bits and ->trunc_poly to be set already, with
+// 1 <= bits <= 32.
+static void crc_calculator_init_table(CRC_CALCULATOR *p_crc_calculator) {
+  // Shift an unsigned 1: `1 << 31` on a signed int is undefined behavior, so
+  // the original expression was invalid for a 32-bit CRC.
+  const uint32_t high_bit = (uint32_t)1 << (p_crc_calculator->bits - 1);
+  const uint8_t byte_high_bit = (uint8_t)(1u << (8 - 1));
+
+  for (uint32_t value = 0; value < 256; value++) {
+    uint32_t remainder = 0;
+    for (uint8_t mask = byte_high_bit; mask != 0; mask >>= 1) {
+      if (value & mask) {
+        remainder ^= high_bit;
+      }
+
+      // Remember whether the top bit is set before it is shifted out.
+      const int top_bit_set = (remainder & high_bit) != 0;
+      remainder <<= 1;
+      if (top_bit_set) {
+        remainder ^= p_crc_calculator->trunc_poly;
+      }
+    }
+    p_crc_calculator->table[value] = remainder;
+  }
+}
+
+// Configures a CRC calculator for a `bits`-wide CRC with generator polynomial
+// `truncPoly` and builds its lookup table.
+//   p_crc_calculator: calculator to initialize; every field is overwritten.
+//   bits: CRC width in bits. The table-driven update shifts by `bits - 8`,
+//         so this is expected to be in [8, 32].
+//   truncPoly: generator polynomial, stored as trunc_poly.
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+                             uint32_t truncPoly) {
+  p_crc_calculator->remainder = 0;
+  p_crc_calculator->bits = bits;
+  p_crc_calculator->trunc_poly = truncPoly;
+  // `(1 << bits) - 1` is undefined for bits >= 31 (signed overflow / shift by
+  // the full width), so build the mask in unsigned arithmetic and special-case
+  // the full 32-bit width.
+  p_crc_calculator->final_result_mask =
+      (bits >= 32) ? 0xffffffffu : (((uint32_t)1 << bits) - 1);
+  crc_calculator_init_table(p_crc_calculator);
+}
+
+// Computes the CRC of `length` bytes starting at `p`.
+//   crc_calculator: a CRC_CALCULATOR previously set up with
+//                   av1_crc_calculator_init(); its remainder is reset first,
+//                   so earlier calls do not influence the result.
+//   p: input bytes.
+//   length: number of bytes; values <= 0 are treated as an empty input.
+// Returns the remainder masked to the calculator's configured width.
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length) {
+  CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
+  crc_calculator_reset(p_crc_calculator);
+  // A negative length would otherwise convert implicitly to a huge uint32_t
+  // byte count and read far past the buffer.
+  if (length > 0) {
+    crc_calculator_process_data(p_crc_calculator, p, (uint32_t)length);
+  }
+  return crc_calculator_get_crc(p_crc_calculator);
+}
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Construct the tables for the software CRC-32C calculation.
+   table[0][n] is the CRC of the single byte n, obtained with eight
+   shift-and-conditionally-XOR steps using the reflected polynomial POLY.
+   table[k][n] for k = 1..7 is table[k - 1][n] advanced by one further zero
+   byte, which is what lets av1_get_crc32c_value_c() consume eight input
+   bytes per step. */
+void av1_crc32c_calculator_init(CRC32C *p_crc32c) {
+  for (int n = 0; n < 256; n++) {
+    uint32_t entry = (uint32_t)n;
+    for (int bit = 0; bit < 8; bit++) {
+      const uint32_t lsb = entry & 1;
+      entry >>= 1;
+      if (lsb) entry ^= POLY;
+    }
+    p_crc32c->table[0][n] = entry;
+  }
+
+  // table[0] must be complete before the derived tables are built, because
+  // each step looks up an arbitrary table[0] entry.
+  for (int n = 0; n < 256; n++) {
+    uint32_t entry = p_crc32c->table[0][n];
+    for (int k = 1; k < 8; k++) {
+      entry = p_crc32c->table[0][entry & 0xff] ^ (entry >> 8);
+      p_crc32c->table[k][n] = entry;
+    }
+  }
+}
+
+/* Table-driven software version as a fall-back. This is about 15 times slower
+   than using the hardware instructions.
+
+   p:   tables prepared by av1_crc32c_calculator_init().
+   buf: input bytes (not modified).
+   len: number of bytes in buf.
+   Returns the CRC-32C of the input (initial value and final XOR are both
+   0xffffffff).
+
+   The eight-byte step assembles its 64-bit word from individual bytes in
+   little-endian order. Unlike the previous `*(uint64_t *)next` load this
+   does not cast away const, does not violate strict aliasing, and produces
+   the same result on big-endian hosts. */
+uint32_t av1_get_crc32c_value_c(CRC32C *p, uint8_t *buf, size_t len) {
+  const uint8_t *next = (const uint8_t *)(buf);
+  uint64_t crc;
+
+  crc = 0 ^ 0xffffffff;
+  // Byte-at-a-time until the read pointer is 8-byte aligned.
+  while (len && ((uintptr_t)next & 7) != 0) {
+    crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+    len--;
+  }
+  // Eight bytes per step.
+  while (len >= 8) {
+    uint64_t word = 0;
+    for (int k = 0; k < 8; k++) {
+      word |= (uint64_t)next[k] << (8 * k);
+    }
+    crc ^= word;
+    crc = p->table[7][crc & 0xff] ^ p->table[6][(crc >> 8) & 0xff] ^
+          p->table[5][(crc >> 16) & 0xff] ^ p->table[4][(crc >> 24) & 0xff] ^
+          p->table[3][(crc >> 32) & 0xff] ^ p->table[2][(crc >> 40) & 0xff] ^
+          p->table[1][(crc >> 48) & 0xff] ^ p->table[0][crc >> 56];
+    next += 8;
+    len -= 8;
+  }
+  // Remaining tail of fewer than eight bytes.
+  while (len) {
+    crc = p->table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+    len--;
+  }
+  return (uint32_t)crc ^ 0xffffffff;
+}
diff --git a/third_party/aom/av1/encoder/hash.h b/third_party/aom/av1/encoder/hash.h
new file mode 100644
index 0000000000..826c004d6f
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_H_
+#define AOM_AV1_ENCODER_HASH_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _crc_calculator {  // State for a table-driven CRC of configurable width.
+ uint32_t remainder;  // Running CRC register; zeroed at the start of each av1_get_crc_value() call.
+ uint32_t trunc_poly;  // Generator polynomial passed to av1_crc_calculator_init().
+ uint32_t bits;  // CRC width in bits passed to av1_crc_calculator_init().
+ uint32_t table[256];  // One precomputed entry per input byte value.
+ uint32_t final_result_mask;  // (1 << bits) - 1; ANDed with remainder to form the returned CRC.
+} CRC_CALCULATOR;
+
+// Initialize the crc calculator. It must be executed at least once before
+// calling av1_get_crc_value().
+void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
+ uint32_t truncPoly);
+uint32_t av1_get_crc_value(void *crc_calculator, uint8_t *p, int length);
+
+// CRC32C: POLY = 0x82f63b78; lookup tables for the software CRC-32C fallback.
+typedef struct _CRC32C {
+ /* Table for a quadword-at-a-time software crc. table[0] is the plain
+    byte-at-a-time table; table[1] to table[7] are derived from it by
+    av1_crc32c_calculator_init(). */
+ uint32_t table[8][256];
+} CRC32C;
+
+// init table for software version crc32c
+void av1_crc32c_calculator_init(CRC32C *p_crc32c);
+
+#define AOM_BUFFER_SIZE_FOR_BLOCK_HASH (4096)
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_H_
diff --git a/third_party/aom/av1/encoder/hash_motion.c b/third_party/aom/av1/encoder/hash_motion.c
new file mode 100644
index 0000000000..e85a516e85
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/hash.h"
+#include "av1/encoder/hash_motion.h"
+
+static const int crc_bits = 16;
+static const int block_size_bits = 3;
+
+// Destroys and frees every bucket vector in the lookup table and resets the
+// bucket pointers to NULL. The bucket array itself is kept. Does nothing when
+// the table has not been allocated.
+static void hash_table_clear_all(hash_table *p_hash_table) {
+  Vector **const table = p_hash_table->p_lookup_table;
+  if (table == NULL) return;
+
+  const int num_buckets = 1 << (crc_bits + block_size_bits);
+  for (int i = 0; i < num_buckets; i++) {
+    if (table[i] == NULL) continue;
+    aom_vector_destroy(table[i]);
+    aom_free(table[i]);
+    table[i] = NULL;
+  }
+}
+
+// TODO(youzhou@microsoft.com): is higher than 8 bits screen content supported?
+// If yes, fix this function
+//
+// Copies the 2x2 block of 8-bit samples whose top-left sample is y_src[0]
+// into p_pixels_in1D in raster order: top-left, top-right, bottom-left,
+// bottom-right. `stride` is the distance in samples between the two rows.
+static void get_pixels_in_1D_char_array_by_block_2x2(uint8_t *y_src, int stride,
+                                                     uint8_t *p_pixels_in1D) {
+  const uint8_t *const top = y_src;
+  const uint8_t *const bottom = y_src + stride;
+  p_pixels_in1D[0] = top[0];
+  p_pixels_in1D[1] = top[1];
+  p_pixels_in1D[2] = bottom[0];
+  p_pixels_in1D[3] = bottom[1];
+}
+
+// 16-bit counterpart of get_pixels_in_1D_char_array_by_block_2x2(): copies the
+// 2x2 block of samples starting at y_src[0] into p_pixels_in1D in raster
+// order. `stride` is the distance in samples between the two rows.
+static void get_pixels_in_1D_short_array_by_block_2x2(uint16_t *y_src,
+                                                      int stride,
+                                                      uint16_t *p_pixels_in1D) {
+  const uint16_t *const top = y_src;
+  const uint16_t *const bottom = y_src + stride;
+  p_pixels_in1D[0] = top[0];
+  p_pixels_in1D[1] = top[1];
+  p_pixels_in1D[2] = bottom[0];
+  p_pixels_in1D[3] = bottom[1];
+}
+
+// Returns 1 when each row of the raster-ordered 2x2 block p is constant
+// (p[0] == p[1] and p[2] == p[3]), otherwise 0.
+static int is_block_2x2_row_same_value(uint8_t *p) {
+  return p[0] == p[1] && p[2] == p[3];
+}
+
+// 16-bit variant: returns 1 when each row of the raster-ordered 2x2 block p
+// is constant (p[0] == p[1] and p[2] == p[3]), otherwise 0.
+static int is_block16_2x2_row_same_value(uint16_t *p) {
+  return p[0] == p[1] && p[2] == p[3];
+}
+
+// Returns 1 when each column of the raster-ordered 2x2 block p is constant
+// (p[0] == p[2] and p[1] == p[3]), otherwise 0.
+static int is_block_2x2_col_same_value(uint8_t *p) {
+  return p[0] == p[2] && p[1] == p[3];
+}
+
+// 16-bit variant: returns 1 when each column of the raster-ordered 2x2 block
+// p is constant (p[0] == p[2] and p[1] == p[3]), otherwise 0.
+static int is_block16_2x2_col_same_value(uint16_t *p) {
+  return p[0] == p[2] && p[1] == p[3];
+}
+
+// The hash value (hash_value1) consists of two parts: the first 3 bits relate
+// to the block size and the remaining 16 bits are the crc values. This
+// function is used to get the first 3 bits.
+// Maps block sizes 4, 8, 16, 32, 64 and 128 to 0..5 respectively and returns
+// -1 for any other size.
+static int hash_block_size_to_index(int block_size) {
+  int index = 0;
+  for (int size = 4; size <= 128; size <<= 1, index++) {
+    if (size == block_size) return index;
+  }
+  return -1;
+}
+
+// Prepares a hash table for first use: marks it as having no lookup table
+// and, the first time it is called for a given MACROBLOCK, sets up that
+// block's two 24-bit CRC calculators (polynomials 0x5D6DCB and 0x864CFB).
+// The x->g_crc_initialized flag ensures the CRC setup runs only once per
+// MACROBLOCK.
+void av1_hash_table_init(hash_table *p_hash_table, MACROBLOCK *x) {
+  if (!x->g_crc_initialized) {
+    av1_crc_calculator_init(&x->crc_calculator1, 24, 0x5D6DCB);
+    av1_crc_calculator_init(&x->crc_calculator2, 24, 0x864CFB);
+    x->g_crc_initialized = 1;
+  }
+
+  p_hash_table->p_lookup_table = NULL;
+}
+
+// Releases everything owned by the hash table: first every bucket vector,
+// then the bucket array itself. The table pointer is reset to NULL afterwards.
+void av1_hash_table_destroy(hash_table *p_hash_table) {
+  // Buckets must be released before the array that holds their pointers.
+  hash_table_clear_all(p_hash_table);
+
+  aom_free(p_hash_table->p_lookup_table);
+  p_hash_table->p_lookup_table = NULL;
+}
+
+// Makes the hash table ready to receive entries.
+// If a lookup table already exists, its buckets are emptied and the bucket
+// array is reused. Otherwise a zero-filled array of
+// 1 << (crc_bits + block_size_bits) bucket pointers is allocated.
+//
+// If the allocation fails, p_lookup_table is left NULL instead of being
+// passed to memset(). hash_table_clear_all() checks for a NULL table, but
+// the other accessors in this file index the table unconditionally.
+// NOTE(review): the void return type gives callers no failure signal other
+// than inspecting p_lookup_table.
+void av1_hash_table_create(hash_table *p_hash_table) {
+  if (p_hash_table->p_lookup_table != NULL) {
+    hash_table_clear_all(p_hash_table);
+    return;
+  }
+  const int max_addr = 1 << (crc_bits + block_size_bits);
+  const size_t table_bytes =
+      sizeof(p_hash_table->p_lookup_table[0]) * max_addr;
+  Vector **const table = (Vector **)aom_malloc(table_bytes);
+  if (table != NULL) {
+    memset(table, 0, table_bytes);
+  }
+  p_hash_table->p_lookup_table = table;
+}
+
+// Appends a copy of *curr_block_hash to the bucket selected by hash_value,
+// creating the bucket vector (initial capacity 10) on first use.
+//   p_hash_table: table whose lookup array has already been allocated.
+//   hash_value: bucket index; must be below 1 << (crc_bits + block_size_bits).
+//   curr_block_hash: entry to append.
+// If the bucket vector cannot be allocated the entry is dropped and the
+// bucket stays NULL, rather than handing a NULL vector to aom_vector_setup().
+static void hash_table_add_to_table(hash_table *p_hash_table,
+                                    uint32_t hash_value,
+                                    block_hash *curr_block_hash) {
+  Vector **const bucket = &p_hash_table->p_lookup_table[hash_value];
+  if (*bucket == NULL) {
+    Vector *const fresh = aom_malloc(sizeof(*fresh));
+    if (fresh == NULL) {
+      return;
+    }
+    aom_vector_setup(fresh, 10, sizeof(curr_block_hash[0]));
+    *bucket = fresh;
+  }
+  aom_vector_push_back(*bucket, curr_block_hash);
+}
+
+// Returns the number of entries stored in the bucket selected by hash_value,
+// or 0 when that bucket has never been created.
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value) {
+  const Vector *const bucket = p_hash_table->p_lookup_table[hash_value];
+  return bucket == NULL ? 0 : (int32_t)(bucket->size);
+}
+
+// Returns an iterator positioned at the first entry of the bucket selected by
+// hash_value. The bucket must be non-empty; this is only checked by an
+// assert.
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value) {
+  assert(av1_hash_table_count(p_hash_table, hash_value) > 0);
+  Vector *const bucket = p_hash_table->p_lookup_table[hash_value];
+  return aom_vector_begin(bucket);
+}
+
+// Returns 1 if the bucket selected by hash_value1 contains an entry whose
+// hash_value2 field equals hash_value2, otherwise 0. A bucket that was never
+// created counts as having no match.
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2) {
+  Vector *const bucket = p_hash_table->p_lookup_table[hash_value1];
+  if (bucket == NULL) return 0;
+
+  Iterator it = aom_vector_begin(bucket);
+  Iterator end = aom_vector_end(bucket);
+  while (!iterator_equals(&it, &end)) {
+    const block_hash *const candidate = (block_hash *)iterator_get(&it);
+    if (candidate->hash_value2 == hash_value2) return 1;
+    iterator_increment(&it);
+  }
+  return 0;
+}
+
+// Computes two CRC hashes and two uniformity flags for every 2x2 luma block
+// that fits inside the cropped picture.
+//
+// Blocks are visited in raster order. `pos` advances by one per block and by
+// one extra at the end of each row, so the block at (x_pos, y_pos) is stored
+// at index y_pos * y_crop_width + x_pos. For each block:
+//   pic_block_hash[0][pos]      CRC of the four samples from crc_calculator1
+//   pic_block_hash[1][pos]      CRC of the four samples from crc_calculator2
+//   pic_block_same_info[0][pos] 1 if both rows of the block are constant
+//   pic_block_same_info[1][pos] 1 if both columns of the block are constant
+// pic_block_same_info[2] is not written here.
+//
+// When YV12_FLAG_HIGHBITDEPTH is set in picture->flags the samples are read
+// as 16-bit values and hashed as their raw bytes; otherwise as 8-bit values.
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3],
+                                       MACROBLOCK *x) {
+  const int width = 2;
+  const int height = 2;
+  const int x_end = picture->y_crop_width - width + 1;
+  const int y_end = picture->y_crop_height - height + 1;
+
+  // Number of samples in one 2x2 block.
+  const int length = width * 2;
+  int pos = 0;
+
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint16_t pixels[4];
+    const int num_bytes = (int)(length * sizeof(pixels[0]));
+    for (int y_pos = 0; y_pos < y_end; y_pos++) {
+      for (int x_pos = 0; x_pos < x_end; x_pos++, pos++) {
+        get_pixels_in_1D_short_array_by_block_2x2(
+            CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride +
+                x_pos,
+            picture->y_stride, pixels);
+        pic_block_same_info[0][pos] = is_block16_2x2_row_same_value(pixels);
+        pic_block_same_info[1][pos] = is_block16_2x2_col_same_value(pixels);
+
+        pic_block_hash[0][pos] = av1_get_crc_value(
+            &x->crc_calculator1, (uint8_t *)pixels, num_bytes);
+        pic_block_hash[1][pos] = av1_get_crc_value(
+            &x->crc_calculator2, (uint8_t *)pixels, num_bytes);
+      }
+      // Skip the last column of the row, where no 2x2 block starts.
+      pos += width - 1;
+    }
+    return;
+  }
+
+  uint8_t pixels[4];
+  const int num_bytes = (int)(length * sizeof(pixels[0]));
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++, pos++) {
+      get_pixels_in_1D_char_array_by_block_2x2(
+          picture->y_buffer + y_pos * picture->y_stride + x_pos,
+          picture->y_stride, pixels);
+      pic_block_same_info[0][pos] = is_block_2x2_row_same_value(pixels);
+      pic_block_same_info[1][pos] = is_block_2x2_col_same_value(pixels);
+
+      pic_block_hash[0][pos] =
+          av1_get_crc_value(&x->crc_calculator1, pixels, num_bytes);
+      pic_block_hash[1][pos] =
+          av1_get_crc_value(&x->crc_calculator2, pixels, num_bytes);
+    }
+    // Skip the last column of the row, where no 2x2 block starts.
+    pos += width - 1;
+  }
+}
+
+// Derives the hashes and uniformity flags of block_size x block_size blocks
+// from the already computed data of blocks half that size.
+//
+// All arrays are indexed by y_pos * y_crop_width + x_pos of a block's
+// top-left sample. For every position where a full block fits:
+//   dst_pic_block_hash[k][pos] (k = 0, 1) is the CRC, from crc_calculator1
+//     and crc_calculator2 respectively, of the four src_pic_block_hash[k]
+//     values of the block's four half-size quadrants.
+//   dst_pic_block_same_info[0][pos] is the AND of six
+//     src_pic_block_same_info[0] flags sampled at horizontal offsets 0,
+//     block_size / 4 and block_size / 2, on the top and the middle row.
+//   dst_pic_block_same_info[1][pos] is the corresponding AND of six
+//     src_pic_block_same_info[1] flags sampled at vertical offsets 0,
+//     block_size / 4 and block_size / 2, on the left and the middle column.
+//   dst_pic_block_same_info[2][pos] (only written when block_size >= 4) is 1
+//     when both flags above are 0, or when x_pos and y_pos both have no bits
+//     in common with block_size - 1.
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size,
+                                   uint32_t *src_pic_block_hash[2],
+                                   uint32_t *dst_pic_block_hash[2],
+                                   int8_t *src_pic_block_same_info[3],
+                                   int8_t *dst_pic_block_same_info[3],
+                                   MACROBLOCK *x) {
+  const int pic_width = picture->y_crop_width;
+  const int x_end = picture->y_crop_width - block_size + 1;
+  const int y_end = picture->y_crop_height - block_size + 1;
+
+  // Offsets from a block's top-left position to its sub-block positions.
+  const int half = block_size >> 1;
+  const int quarter = block_size >> 2;
+  const int half_down = half * pic_width;
+  const int quarter_down = quarter * pic_width;
+
+  uint32_t quad[4];
+  const int length = sizeof(quad);
+
+  int pos = 0;
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++, pos++) {
+      quad[0] = src_pic_block_hash[0][pos];
+      quad[1] = src_pic_block_hash[0][pos + half];
+      quad[2] = src_pic_block_hash[0][pos + half_down];
+      quad[3] = src_pic_block_hash[0][pos + half_down + half];
+      dst_pic_block_hash[0][pos] =
+          av1_get_crc_value(&x->crc_calculator1, (uint8_t *)quad, length);
+
+      quad[0] = src_pic_block_hash[1][pos];
+      quad[1] = src_pic_block_hash[1][pos + half];
+      quad[2] = src_pic_block_hash[1][pos + half_down];
+      quad[3] = src_pic_block_hash[1][pos + half_down + half];
+      dst_pic_block_hash[1][pos] =
+          av1_get_crc_value(&x->crc_calculator2, (uint8_t *)quad, length);
+
+      dst_pic_block_same_info[0][pos] =
+          src_pic_block_same_info[0][pos] &&
+          src_pic_block_same_info[0][pos + quarter] &&
+          src_pic_block_same_info[0][pos + half] &&
+          src_pic_block_same_info[0][pos + half_down] &&
+          src_pic_block_same_info[0][pos + half_down + quarter] &&
+          src_pic_block_same_info[0][pos + half_down + half];
+
+      dst_pic_block_same_info[1][pos] =
+          src_pic_block_same_info[1][pos] &&
+          src_pic_block_same_info[1][pos + half] &&
+          src_pic_block_same_info[1][pos + quarter_down] &&
+          src_pic_block_same_info[1][pos + quarter_down + half] &&
+          src_pic_block_same_info[1][pos + half_down] &&
+          src_pic_block_same_info[1][pos + half_down + half];
+    }
+    // Skip the trailing columns of the row where no full block starts.
+    pos += block_size - 1;
+  }
+
+  if (block_size < 4) return;
+
+  const int size_minus_1 = block_size - 1;
+  pos = 0;
+  for (int y_pos = 0; y_pos < y_end; y_pos++) {
+    for (int x_pos = 0; x_pos < x_end; x_pos++, pos++) {
+      const int on_block_grid =
+          ((x_pos & size_minus_1) == 0) && ((y_pos & size_minus_1) == 0);
+      dst_pic_block_same_info[2][pos] = (!dst_pic_block_same_info[0][pos] &&
+                                         !dst_pic_block_same_info[1][pos]) ||
+                                        on_block_grid;
+    }
+    pos += block_size - 1;
+  }
+}
+
+// Inserts every flagged block position of a picture into the hash table.
+//
+// pic_hash[0] supplies the lookup key (its low crc_bits bits, with the
+// block-size index placed above them); pic_hash[1] is stored in the entry as
+// hash_value2. Only positions whose pic_is_same[] entry is non-zero are
+// added. Positions are visited column by column (x outer, y inner), which
+// fixes the insertion order.
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size) {
+  const int x_limit = pic_width - block_size + 1;
+  const int y_limit = pic_height - block_size + 1;
+
+  const uint32_t *const key_hash = pic_hash[0];
+  const uint32_t *const check_hash = pic_hash[1];
+
+  int size_tag = hash_block_size_to_index(block_size);
+  assert(size_tag >= 0);
+  size_tag <<= crc_bits;
+  const int low_bits = (1 << crc_bits) - 1;
+
+  for (int px = 0; px < x_limit; px++) {
+    for (int py = 0; py < y_limit; py++) {
+      const int pos = py * pic_width + px;
+      // Skip positions that were not marked for insertion.
+      if (!pic_is_same[pos]) continue;
+
+      block_hash entry;
+      entry.x = px;
+      entry.y = py;
+      entry.hash_value2 = check_hash[pos];
+
+      const uint32_t key = (key_hash[pos] & low_bits) + size_tag;
+      hash_table_add_to_table(p_hash_table, key, &entry);
+    }
+  }
+}
+
+// Returns 1 when, in the block_size x block_size luma block whose top-left
+// pixel is (x_start, y_start), every pixel of each row equals the first pixel
+// of that row; returns 0 otherwise. Different rows may hold different values.
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *row8 = picture->y_buffer + y_start * stride + x_start;
+
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bit depth: samples are 16 bits wide.
+    const uint16_t *row16 = CONVERT_TO_SHORTPTR(row8);
+    for (int r = 0; r < block_size; ++r, row16 += stride) {
+      for (int c = 1; c < block_size; ++c) {
+        if (row16[c] != row16[0]) return 0;
+      }
+    }
+    return 1;
+  }
+
+  for (int r = 0; r < block_size; ++r, row8 += stride) {
+    for (int c = 1; c < block_size; ++c) {
+      if (row8[c] != row8[0]) return 0;
+    }
+  }
+  return 1;
+}
+
+// Returns 1 when, in the block_size x block_size luma block whose top-left
+// pixel is (x_start, y_start), every pixel of each column equals the top
+// pixel of that column; returns 0 otherwise.
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start) {
+  const int stride = picture->y_stride;
+  const uint8_t *top8 = picture->y_buffer + y_start * stride + x_start;
+
+  if (picture->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bit depth: samples are 16 bits wide.
+    const uint16_t *top16 = CONVERT_TO_SHORTPTR(top8);
+    for (int col = 0; col < block_size; ++col) {
+      for (int row = 1; row < block_size; ++row) {
+        if (top16[row * stride + col] != top16[col]) return 0;
+      }
+    }
+    return 1;
+  }
+
+  for (int col = 0; col < block_size; ++col) {
+    for (int row = 1; row < block_size; ++row) {
+      if (top8[row * stride + col] != top8[col]) return 0;
+    }
+  }
+  return 1;
+}
+
+// Computes the two hash values of one block_size x block_size block.
+//
+// y_src / stride:    top-left sample of the block and its row pitch. When
+//                    use_highbitdepth is non-zero the pointer is converted
+//                    with CONVERT_TO_SHORTPTR and samples are read as
+//                    uint16_t.
+// hash_value1 (out): low crc_bits bits of the first CRC plus the block-size
+//                    index placed above them.
+// hash_value2 (out): the full second CRC.
+// x:                 supplies the CRC calculators and the scratch
+//                    hash_value_buffer.
+//
+// The hash is built bottom-up: every 2x2 group of samples is hashed first,
+// then groups of four neighbouring hashes are hashed again, doubling the
+// covered width each pass until the whole block is covered.
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+                              uint32_t *hash_value1, uint32_t *hash_value2,
+                              int use_highbitdepth, MACROBLOCK *x) {
+  uint32_t to_hash[4];
+  // Check the size index before shifting it: left-shifting a negative value
+  // is undefined, so the assert has to see the unshifted index (this is also
+  // the order used by av1_add_to_hash_map_by_row_with_precal_data).
+  const int size_index = hash_block_size_to_index(block_size);
+  assert(size_index >= 0);
+  const int add_value = size_index << crc_bits;
+  const int crc_mask = (1 << crc_bits) - 1;
+
+  // 2x2 subblock hash values in current CU
+  int sub_block_in_width = (block_size >> 1);
+  if (use_highbitdepth) {
+    uint16_t pixel_to_hash[4];
+    uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src);
+    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+        get_pixels_in_1D_short_array_by_block_2x2(
+            y16_src + y_pos * stride + x_pos, stride, pixel_to_hash);
+        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        x->hash_value_buffer[0][0][pos] =
+            av1_get_crc_value(&x->crc_calculator1, (uint8_t *)pixel_to_hash,
+                              sizeof(pixel_to_hash));
+        x->hash_value_buffer[1][0][pos] =
+            av1_get_crc_value(&x->crc_calculator2, (uint8_t *)pixel_to_hash,
+                              sizeof(pixel_to_hash));
+      }
+    }
+  } else {
+    uint8_t pixel_to_hash[4];
+    for (int y_pos = 0; y_pos < block_size; y_pos += 2) {
+      for (int x_pos = 0; x_pos < block_size; x_pos += 2) {
+        int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1);
+        get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos,
+                                                 stride, pixel_to_hash);
+        assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        x->hash_value_buffer[0][0][pos] = av1_get_crc_value(
+            &x->crc_calculator1, pixel_to_hash, sizeof(pixel_to_hash));
+        x->hash_value_buffer[1][0][pos] = av1_get_crc_value(
+            &x->crc_calculator2, pixel_to_hash, sizeof(pixel_to_hash));
+      }
+    }
+  }
+
+  int src_sub_block_in_width = sub_block_in_width;
+  sub_block_in_width >>= 1;
+
+  // The two scratch planes are used in ping-pong fashion. The 2x2 hashes
+  // were written to plane 0, so the first pass below flips to src = 0,
+  // dst = 1.
+  int src_idx = 1;
+  int dst_idx = 0;
+
+  // 4x4 subblock hash values to current block hash values
+  for (int sub_width = 4; sub_width <= block_size; sub_width *= 2) {
+    src_idx = 1 - src_idx;
+    dst_idx = 1 - dst_idx;
+
+    int dst_pos = 0;
+    for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) {
+      for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) {
+        int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1);
+
+        assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(srcPos + src_sub_block_in_width + 1 <
+               AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH);
+        to_hash[0] = x->hash_value_buffer[0][src_idx][srcPos];
+        to_hash[1] = x->hash_value_buffer[0][src_idx][srcPos + 1];
+        to_hash[2] =
+            x->hash_value_buffer[0][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = x->hash_value_buffer[0][src_idx]
+                                         [srcPos + src_sub_block_in_width + 1];
+
+        x->hash_value_buffer[0][dst_idx][dst_pos] = av1_get_crc_value(
+            &x->crc_calculator1, (uint8_t *)to_hash, sizeof(to_hash));
+
+        to_hash[0] = x->hash_value_buffer[1][src_idx][srcPos];
+        to_hash[1] = x->hash_value_buffer[1][src_idx][srcPos + 1];
+        to_hash[2] =
+            x->hash_value_buffer[1][src_idx][srcPos + src_sub_block_in_width];
+        to_hash[3] = x->hash_value_buffer[1][src_idx]
+                                         [srcPos + src_sub_block_in_width + 1];
+        x->hash_value_buffer[1][dst_idx][dst_pos] = av1_get_crc_value(
+            &x->crc_calculator2, (uint8_t *)to_hash, sizeof(to_hash));
+        dst_pos++;
+      }
+    }
+
+    src_sub_block_in_width = sub_block_in_width;
+    sub_block_in_width >>= 1;
+  }
+
+  // Entry 0 of the last destination plane now covers the whole block.
+  *hash_value1 = (x->hash_value_buffer[0][dst_idx][0] & crc_mask) + add_value;
+  *hash_value2 = x->hash_value_buffer[1][dst_idx][0];
+}
diff --git a/third_party/aom/av1/encoder/hash_motion.h b/third_party/aom/av1/encoder/hash_motion.h
new file mode 100644
index 0000000000..df3ec32158
--- /dev/null
+++ b/third_party/aom/av1/encoder/hash_motion.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HASH_MOTION_H_
+#define AOM_AV1_ENCODER_HASH_MOTION_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_scale/yv12config.h"
+#include "third_party/vector/vector.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Stores one block's hash info (one entry of the hash table).
+// x and y are the position of the block's top-left corner, measured from the
+// top left of the picture. They are 16-bit fields, so coordinates are
+// narrowed when assigned from int.
+// hash_value2 is the second hash value; the first hash value is not stored
+// here because it is used as the lookup key.
+typedef struct _block_hash {
+ int16_t x;
+ int16_t y;
+ uint32_t hash_value2;
+} block_hash;
+
+// Hash table handle: an array of pointers to Vector.
+// NOTE(review): presumably one Vector of block_hash entries per first-hash
+// value -- confirm against av1_hash_table_create() in hash_motion.c.
+typedef struct _hash_table {
+ Vector **p_lookup_table;
+} hash_table;
+
+// Forward declaration. Without it, the first mention of struct macroblock
+// inside a parameter list would declare a type whose scope ends with that
+// prototype whenever this header is included before the struct's definition.
+struct macroblock;
+
+void av1_hash_table_init(hash_table *p_hash_table, struct macroblock *x);
+void av1_hash_table_destroy(hash_table *p_hash_table);
+void av1_hash_table_create(hash_table *p_hash_table);
+int32_t av1_hash_table_count(hash_table *p_hash_table, uint32_t hash_value);
+Iterator av1_hash_get_first_iterator(hash_table *p_hash_table,
+                                     uint32_t hash_value);
+int32_t av1_has_exact_match(hash_table *p_hash_table, uint32_t hash_value1,
+                            uint32_t hash_value2);
+void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                       uint32_t *pic_block_hash[2],
+                                       int8_t *pic_block_same_info[3],
+                                       struct macroblock *x);
+// fill the dst_* maps for blocks of block_size x block_size from the src_*
+// maps; only the luma crop dimensions of picture are used
+void av1_generate_block_hash_value(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size,
+                                   uint32_t *src_pic_block_hash[2],
+                                   uint32_t *dst_pic_block_hash[2],
+                                   int8_t *src_pic_block_same_info[3],
+                                   int8_t *dst_pic_block_same_info[3],
+                                   struct macroblock *x);
+// add every position whose pic_is_same entry is non-zero to the hash table,
+// keyed by pic_hash[0] and carrying pic_hash[1] as the second hash value
+void av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table,
+                                                 uint32_t *pic_hash[2],
+                                                 int8_t *pic_is_same,
+                                                 int pic_width, int pic_height,
+                                                 int block_size);
+
+// check whether, in the block that starts at (x_start, y_start) with the size
+// of block_size x block_size, every row consists of a single repeated value
+// (returns 1 if so, 0 otherwise)
+int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture,
+                                   int block_size, int x_start, int y_start);
+// check whether, in the block that starts at (x_start, y_start) with the size
+// of block_size x block_size, every column consists of a single repeated
+// value (returns 1 if so, 0 otherwise)
+int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture,
+                                 int block_size, int x_start, int y_start);
+// compute both hash values of the block_size x block_size block at y_src;
+// the results are written to *hash_value1 and *hash_value2
+void av1_get_block_hash_value(uint8_t *y_src, int stride, int block_size,
+                              uint32_t *hash_value1, uint32_t *hash_value2,
+                              int use_highbitdepth, struct macroblock *x);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HASH_MOTION_H_
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.c b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000000..67898fd184
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "av1/common/idct.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+
+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
+ pixel.
+
+ input: 4x4 block of residuals, rows `stride` elements apart.
+ output: 16 coefficients, written as a contiguous 4x4 array.
+
+ The transform runs in two passes over `output`; the order of the butterfly
+ statements in each pass matters and must not be rearranged. */
+void av1_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ int i;
+ tran_high_t a1, b1, c1, d1, e1;
+ const int16_t *ip_pass0 = input;
+ const tran_low_t *ip = NULL;
+ tran_low_t *op = output;
+
+ // Pass 0: transform each input column (elements `stride` apart) and store
+ // the four results down one column of output (entries 4 apart).
+ for (i = 0; i < 4; i++) {
+ a1 = ip_pass0[0 * stride];
+ b1 = ip_pass0[1 * stride];
+ c1 = ip_pass0[2 * stride];
+ d1 = ip_pass0[3 * stride];
+
+ a1 += b1;
+ d1 = d1 - c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ // Results are stored in the order a, c, d, b.
+ op[0] = (tran_low_t)a1;
+ op[4] = (tran_low_t)c1;
+ op[8] = (tran_low_t)d1;
+ op[12] = (tran_low_t)b1;
+
+ ip_pass0++;
+ op++;
+ }
+ // Pass 1 reads and writes output in place, one row of four at a time.
+ ip = output;
+ op = output;
+
+ for (i = 0; i < 4; i++) {
+ a1 = ip[0];
+ b1 = ip[1];
+ c1 = ip[2];
+ d1 = ip[3];
+
+ a1 += b1;
+ d1 -= c1;
+ e1 = (a1 - d1) >> 1;
+ b1 = e1 - b1;
+ c1 = e1 - c1;
+ a1 -= c1;
+ d1 += b1;
+ // Same a, c, d, b ordering; the final values are scaled by
+ // UNIT_QUANT_FACTOR.
+ op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR);
+ op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR);
+ op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR);
+ op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR);
+
+ ip += 4;
+ op += 4;
+ }
+}
+
+// High-bit-depth entry point for the 4x4 Walsh-Hadamard transform; it simply
+// forwards to av1_fwht4x4_c with the same arguments.
+void av1_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output,
+ int stride) {
+ av1_fwht4x4_c(input, output, stride);
+}
+
+// 4x4 forward transform. Lossless blocks go through the 4x4 Walsh-Hadamard
+// transform (and must be DCT_DCT); otherwise the 2D transform is used, with
+// the C implementation forced for the 1D and identity transform types.
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  const TX_TYPE type = txfm_param->tx_type;
+  if (txfm_param->lossless) {
+    assert(type == DCT_DCT);
+    av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+
+  int32_t *const out = (int32_t *)coeff;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_4x4_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_4x4(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 4x8 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_4x8_c(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+// 8x4 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_8x4_c(src_diff, (int32_t *)coeff, diff_stride,
+                       txfm_param->tx_type, txfm_param->bd);
+}
+
+// 8x16 forward transform. The 1D and identity transform types are routed to
+// the C implementation; every other type uses the dispatched version.
+static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *const out = (int32_t *)coeff;
+  const TX_TYPE type = txfm_param->tx_type;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_8x16_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_8x16(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 16x8 forward transform. The 1D and identity transform types are routed to
+// the C implementation; every other type uses the dispatched version.
+static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  int32_t *const out = (int32_t *)coeff;
+  const TX_TYPE type = txfm_param->tx_type;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_16x8_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_16x8(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 16x32 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_16x32_c(src_diff, (int32_t *)coeff, diff_stride,
+                         txfm_param->tx_type, txfm_param->bd);
+}
+
+// 32x16 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_32x16_c(src_diff, (int32_t *)coeff, diff_stride,
+                         txfm_param->tx_type, txfm_param->bd);
+}
+
+// 16x4 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_16x4_c(src_diff, (int32_t *)coeff, diff_stride,
+                        txfm_param->tx_type, txfm_param->bd);
+}
+
+// 4x16 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_4x16_c(src_diff, (int32_t *)coeff, diff_stride,
+                        txfm_param->tx_type, txfm_param->bd);
+}
+
+// 32x8 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_32x8_c(src_diff, (int32_t *)coeff, diff_stride,
+                        txfm_param->tx_type, txfm_param->bd);
+}
+
+// 8x32 forward transform; always uses the C implementation.
+static void highbd_fwd_txfm_8x32(const int16_t *src_diff, tran_low_t *coeff,
+                                 int diff_stride, TxfmParam *txfm_param) {
+  av1_fwd_txfm2d_8x32_c(src_diff, (int32_t *)coeff, diff_stride,
+                        txfm_param->tx_type, txfm_param->bd);
+}
+
+// 8x8 forward transform. The 1D and identity transform types are routed to
+// the C implementation; every other type uses the dispatched version.
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TxfmParam *txfm_param) {
+  int32_t *const out = (int32_t *)coeff;
+  const TX_TYPE type = txfm_param->tx_type;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_8x8_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_8x8(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 16x16 forward transform. The 1D and identity transform types are routed to
+// the C implementation; every other type uses the dispatched version.
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  int32_t *const out = (int32_t *)coeff;
+  const TX_TYPE type = txfm_param->tx_type;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_16x16_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_16x16(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 32x32 forward transform. The 1D and identity transform types are routed to
+// the C implementation; every other type uses the dispatched version.
+static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  int32_t *const out = (int32_t *)coeff;
+  const TX_TYPE type = txfm_param->tx_type;
+  const int bit_depth = txfm_param->bd;
+  switch (type) {
+    // use the c version for anything including identity for now
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+    case IDTX:
+      av1_fwd_txfm2d_32x32_c(src_diff, out, diff_stride, type, bit_depth);
+      return;
+    default:
+      av1_fwd_txfm2d_32x32(src_diff, out, diff_stride, type, bit_depth);
+      return;
+  }
+}
+
+// 32x64 forward transform (C implementation). Only DCT_DCT is accepted.
+static void highbd_fwd_txfm_32x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_32x64_c(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                         txfm_param->bd);
+}
+
+// 64x32 forward transform (C implementation). Only DCT_DCT is accepted.
+static void highbd_fwd_txfm_64x32(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x32_c(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                         txfm_param->bd);
+}
+
+// 16x64 forward transform (C implementation). Only DCT_DCT is accepted.
+static void highbd_fwd_txfm_16x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_16x64_c(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                         txfm_param->bd);
+}
+
+// 64x16 forward transform (C implementation). Only DCT_DCT is accepted.
+static void highbd_fwd_txfm_64x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x16_c(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                         txfm_param->bd);
+}
+
+// 64x64 forward transform (dispatched implementation). Only DCT_DCT is
+// accepted.
+static void highbd_fwd_txfm_64x64(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TxfmParam *txfm_param) {
+  assert(txfm_param->tx_type == DCT_DCT);
+  av1_fwd_txfm2d_64x64(src_diff, (int32_t *)coeff, diff_stride, DCT_DCT,
+                       txfm_param->bd);
+}
+
+// Forward transform entry point: 8-bit content takes the low-bit-depth path,
+// every other bit depth takes the high-bit-depth path.
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+                  TxfmParam *txfm_param) {
+  if (txfm_param->bd != 8) {
+    av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+    return;
+  }
+  av1_lowbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+// C version of the low-bit-depth forward transform; it forwards to
+// av1_highbd_fwd_txfm with the same arguments.
+void av1_lowbd_fwd_txfm_c(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param) {
+ av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, txfm_param);
+}
+
+// Dispatches the forward transform to the helper matching
+// txfm_param->tx_size. The transform type must be allowed by the transform
+// set in txfm_param (checked by assert); an unknown size also asserts.
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TxfmParam *txfm_param) {
+  assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
+  switch (txfm_param->tx_size) {
+    // Square sizes.
+    case TX_4X4:
+      highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X8:
+      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X16:
+      highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X32:
+      highbd_fwd_txfm_32x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X64:
+      highbd_fwd_txfm_64x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    // 1:2 and 2:1 rectangles.
+    case TX_4X8:
+      highbd_fwd_txfm_4x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X4:
+      highbd_fwd_txfm_8x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X16:
+      highbd_fwd_txfm_8x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X8:
+      highbd_fwd_txfm_16x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X32:
+      highbd_fwd_txfm_16x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X16:
+      highbd_fwd_txfm_32x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X64:
+      highbd_fwd_txfm_32x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X32:
+      highbd_fwd_txfm_64x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    // 1:4 and 4:1 rectangles.
+    case TX_4X16:
+      highbd_fwd_txfm_4x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X4:
+      highbd_fwd_txfm_16x4(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_8X32:
+      highbd_fwd_txfm_8x32(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_32X8:
+      highbd_fwd_txfm_32x8(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_16X64:
+      highbd_fwd_txfm_16x64(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    case TX_64X16:
+      highbd_fwd_txfm_64x16(src_diff, coeff, diff_stride, txfm_param);
+      return;
+    default: assert(0); return;
+  }
+}
diff --git a/third_party/aom/av1/encoder/hybrid_fwd_txfm.h b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000000..daabc7119a
--- /dev/null
+++ b/third_party/aom/av1/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+#define AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "config/aom_config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward-transforms the residual block src_diff (row pitch diff_stride) into
+// coeff. Picks the low- or high-bit-depth path from txfm_param->bd.
+void av1_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+ TxfmParam *txfm_param);
+
+// Forward-transforms src_diff into coeff using the helper selected by
+// txfm_param->tx_size.
+void av1_highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+ int diff_stride, TxfmParam *txfm_param);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/third_party/aom/av1/encoder/k_means_template.h b/third_party/aom/av1/encoder/k_means_template.h
new file mode 100644
index 0000000000..9e526b88b7
--- /dev/null
+++ b/third_party/aom/av1/encoder/k_means_template.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#ifndef AV1_K_MEANS_DIM
+#error "This template requires AV1_K_MEANS_DIM to be defined"
+#endif
+
+#define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y)
+#define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM)
+
+// Squared Euclidean distance between two AV1_K_MEANS_DIM-dimensional points.
+static int RENAME(calc_dist)(const int *p1, const int *p2) {
+  int sum_sq = 0;
+  for (int d = 0; d < AV1_K_MEANS_DIM; ++d) {
+    const int delta = p1[d] - p2[d];
+    sum_sq += delta * delta;
+  }
+  return sum_sq;
+}
+
+// For each of the n points in data, stores in indices[] the index of the
+// nearest of the k centroids. Ties keep the lowest centroid index.
+void RENAME(av1_calc_indices)(const int *data, const int *centroids,
+                              uint8_t *indices, int n, int k) {
+  for (int i = 0; i < n; ++i) {
+    const int *const point = data + i * AV1_K_MEANS_DIM;
+    int best = 0;
+    int best_dist = RENAME(calc_dist)(point, centroids);
+    for (int j = 1; j < k; ++j) {
+      const int cand_dist =
+          RENAME(calc_dist)(point, centroids + j * AV1_K_MEANS_DIM);
+      if (cand_dist < best_dist) {
+        best_dist = cand_dist;
+        best = j;
+      }
+    }
+    indices[i] = best;
+  }
+}
+
+// Recomputes the k centroids as the rounded mean of the points assigned to
+// them by indices[]. A centroid with no assigned point is replaced by a
+// pseudo-randomly chosen data point (the generator is seeded from data[0]).
+static void RENAME(calc_centroids)(const int *data, int *centroids,
+                                   const uint8_t *indices, int n, int k) {
+  int members[PALETTE_MAX_SIZE] = { 0 };
+  unsigned int seed = (unsigned int)data[0];
+  assert(n <= 32768);
+  memset(centroids, 0, sizeof(centroids[0]) * k * AV1_K_MEANS_DIM);
+
+  // Accumulate per-cluster sums and member counts.
+  for (int i = 0; i < n; ++i) {
+    const int cluster = indices[i];
+    assert(cluster < k);
+    ++members[cluster];
+    int *const sum = centroids + cluster * AV1_K_MEANS_DIM;
+    const int *const point = data + i * AV1_K_MEANS_DIM;
+    for (int d = 0; d < AV1_K_MEANS_DIM; ++d) sum[d] += point[d];
+  }
+
+  // Turn sums into means, or reseed clusters that ended up empty.
+  for (int c = 0; c < k; ++c) {
+    int *const centroid = centroids + c * AV1_K_MEANS_DIM;
+    if (members[c] == 0) {
+      memcpy(centroid, data + (lcg_rand16(&seed) % n) * AV1_K_MEANS_DIM,
+             sizeof(centroids[0]) * AV1_K_MEANS_DIM);
+      continue;
+    }
+    for (int d = 0; d < AV1_K_MEANS_DIM; ++d) {
+      centroid[d] = DIVIDE_AND_ROUND(centroid[d], members[c]);
+    }
+  }
+}
+
+// Sum, over all n points, of the squared distance between each point and the
+// centroid it is assigned to. k is unused.
+static int64_t RENAME(calc_total_dist)(const int *data, const int *centroids,
+                                       const uint8_t *indices, int n, int k) {
+  (void)k;
+  int64_t total = 0;
+  for (int i = 0; i < n; ++i) {
+    const int *const point = data + i * AV1_K_MEANS_DIM;
+    const int *const centroid = centroids + indices[i] * AV1_K_MEANS_DIM;
+    total += RENAME(calc_dist)(point, centroid);
+  }
+  return total;
+}
+
+// Runs at most max_itr refinement rounds of k-means on n points.
+//
+// centroids holds the initial centroids on entry and the refined ones on
+// return; indices receives the cluster index of every point. Iteration stops
+// early when a round makes the total distance worse (that round is undone)
+// or when the centroids no longer change.
+// NOTE(review): the scratch copies are sized 2 * PALETTE_MAX_SIZE ints and
+// MAX_SB_SQUARE bytes, so callers presumably keep k * AV1_K_MEANS_DIM and n
+// within those bounds -- confirm.
+void RENAME(av1_k_means)(const int *data, int *centroids, uint8_t *indices,
+                         int n, int k, int max_itr) {
+  int saved_centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t saved_indices[MAX_SB_SQUARE];
+  const size_t centroid_bytes =
+      sizeof(saved_centroids[0]) * k * AV1_K_MEANS_DIM;
+  const size_t index_bytes = sizeof(saved_indices[0]) * n;
+
+  RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+  int64_t cur_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+  for (int itr = 0; itr < max_itr; ++itr) {
+    // Snapshot the current solution so a worse round can be rolled back.
+    const int64_t prev_dist = cur_dist;
+    memcpy(saved_centroids, centroids, centroid_bytes);
+    memcpy(saved_indices, indices, index_bytes);
+
+    RENAME(calc_centroids)(data, centroids, indices, n, k);
+    RENAME(av1_calc_indices)(data, centroids, indices, n, k);
+    cur_dist = RENAME(calc_total_dist)(data, centroids, indices, n, k);
+
+    if (cur_dist > prev_dist) {
+      memcpy(centroids, saved_centroids, centroid_bytes);
+      memcpy(indices, saved_indices, index_bytes);
+      break;
+    }
+    if (memcmp(centroids, saved_centroids, centroid_bytes) == 0) break;
+  }
+}
+#undef RENAME_
+#undef RENAME
diff --git a/third_party/aom/av1/encoder/lookahead.c b/third_party/aom/av1/encoder/lookahead.c
new file mode 100644
index 0000000000..1bf8ecbacb
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/common.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/lookahead.h"
+
+/* Return the entry at absolute index *idx and advance *idx by one, wrapping
+ * back to 0 when it reaches ctx->max_sz. */
+static struct lookahead_entry *pop(struct lookahead_ctx *ctx, int *idx) {
+  const int cur = *idx;
+  struct lookahead_entry *const entry = ctx->buf + cur;
+
+  assert(cur < ctx->max_sz);
+  const int next = cur + 1;
+  *idx = (next >= ctx->max_sz) ? next - ctx->max_sz : next;
+  return entry;
+}
+
+/* Release every frame buffer owned by the queue, then the entry array and
+ * the context itself. A NULL ctx is accepted and ignored. */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx) {
+  if (!ctx) return;
+
+  if (ctx->buf) {
+    for (int i = 0; i < ctx->max_sz; i++) {
+      aom_free_frame_buffer(&ctx->buf[i].img);
+    }
+    free(ctx->buf);
+  }
+  free(ctx);
+}
+
+/* Allocate a lookahead queue. depth is clamped to [1, MAX_LAG_BUFFERS] and
+ * MAX_PRE_FRAMES extra slots are added to keep previous source frames
+ * available. Every slot gets a frame buffer of the requested geometry.
+ * Returns NULL if any allocation fails (nothing is leaked). */
+struct lookahead_ctx *av1_lookahead_init(
+    unsigned int width, unsigned int height, unsigned int subsampling_x,
+    unsigned int subsampling_y, int use_highbitdepth, unsigned int depth) {
+  const int legacy_byte_alignment = 0;
+
+  // Clamp the lookahead queue depth
+  depth = clamp(depth, 1, MAX_LAG_BUFFERS);
+
+  // Allocate memory to keep previous source frames available.
+  depth += MAX_PRE_FRAMES;
+
+  // Allocate the lookahead structures
+  struct lookahead_ctx *ctx = calloc(1, sizeof(*ctx));
+  if (!ctx) return NULL;
+
+  ctx->max_sz = depth;
+  ctx->buf = calloc(depth, sizeof(*ctx->buf));
+  if (!ctx->buf) {
+    av1_lookahead_destroy(ctx);
+    return NULL;
+  }
+
+  for (unsigned int i = 0; i < depth; i++) {
+    if (aom_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x,
+                               subsampling_y, use_highbitdepth,
+                               AOM_BORDER_IN_PIXELS, legacy_byte_alignment)) {
+      av1_lookahead_destroy(ctx);
+      return NULL;
+    }
+  }
+  return ctx;
+}
+
+#define USE_PARTIAL_COPY 0
+
+/* Copy src into the next free slot of the queue and record its timestamps
+ * and flags.
+ *
+ * Returns 0 on success. Returns 1 when the queue is full, or when the slot's
+ * frame buffer is too small for src and a larger one cannot be allocated; in
+ * both failure cases the queue is left unchanged.
+ *
+ * If src has different crop dimensions than the slot but still fits in its
+ * allocation, only the slot's crop/subsampling fields are updated. */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+                       int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+                       aom_enc_frame_flags_t flags) {
+  struct lookahead_entry *buf;
+#if USE_PARTIAL_COPY
+  int row, col, active_end;
+  int mb_rows = (src->y_height + 15) >> 4;
+  int mb_cols = (src->y_width + 15) >> 4;
+#endif
+  int width = src->y_crop_width;
+  int height = src->y_crop_height;
+  int uv_width = src->uv_crop_width;
+  int uv_height = src->uv_crop_height;
+  int subsampling_x = src->subsampling_x;
+  int subsampling_y = src->subsampling_y;
+  int larger_dimensions, new_dimensions;
+  int prev_write_idx;
+
+  if (ctx->sz + 1 + MAX_PRE_FRAMES > ctx->max_sz) return 1;
+  // Remember the write position so the reservation can be undone if the
+  // slot has to be reallocated and that allocation fails.
+  prev_write_idx = ctx->write_idx;
+  ctx->sz++;
+  buf = pop(ctx, &ctx->write_idx);
+
+  new_dimensions = width != buf->img.y_crop_width ||
+                   height != buf->img.y_crop_height ||
+                   uv_width != buf->img.uv_crop_width ||
+                   uv_height != buf->img.uv_crop_height;
+  larger_dimensions = width > buf->img.y_width || height > buf->img.y_height ||
+                      uv_width > buf->img.uv_width ||
+                      uv_height > buf->img.uv_height;
+  assert(!larger_dimensions || new_dimensions);
+
+#if USE_PARTIAL_COPY
+  // TODO(jkoleszar): This is disabled for now, as
+  // av1_copy_and_extend_frame_with_rect is not subsampling/alpha aware.
+
+  // Only do this partial copy if the following conditions are all met:
+  // 1. Lookahead queue has size of 1.
+  // 2. Active map is provided.
+  // 3. This is not a key frame, golden nor altref frame.
+  if (!new_dimensions && ctx->max_sz == 1 && active_map && !flags) {
+    for (row = 0; row < mb_rows; ++row) {
+      col = 0;
+
+      while (1) {
+        // Find the first active macroblock in this row.
+        for (; col < mb_cols; ++col) {
+          if (active_map[col]) break;
+        }
+
+        // No more active macroblock in this row.
+        if (col == mb_cols) break;
+
+        // Find the end of active region in this row.
+        active_end = col;
+
+        for (; active_end < mb_cols; ++active_end) {
+          if (!active_map[active_end]) break;
+        }
+
+        // Only copy this active region.
+        av1_copy_and_extend_frame_with_rect(src, &buf->img, row << 4, col << 4,
+                                            16, (active_end - col) << 4);
+
+        // Start again from the end of this active region.
+        col = active_end;
+      }
+
+      active_map += mb_cols;
+    }
+  } else {
+#endif
+    if (larger_dimensions) {
+      YV12_BUFFER_CONFIG new_img;
+      memset(&new_img, 0, sizeof(new_img));
+      if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x,
+                                 subsampling_y, use_highbitdepth,
+                                 AOM_BORDER_IN_PIXELS, 0)) {
+        // Undo the reservation: nothing was stored in the slot, so it must
+        // not be counted as queued.
+        ctx->write_idx = prev_write_idx;
+        ctx->sz--;
+        return 1;
+      }
+      aom_free_frame_buffer(&buf->img);
+      buf->img = new_img;
+    } else if (new_dimensions) {
+      buf->img.y_crop_width = src->y_crop_width;
+      buf->img.y_crop_height = src->y_crop_height;
+      buf->img.uv_crop_width = src->uv_crop_width;
+      buf->img.uv_crop_height = src->uv_crop_height;
+      buf->img.subsampling_x = src->subsampling_x;
+      buf->img.subsampling_y = src->subsampling_y;
+    }
+    // Partial copy not implemented yet
+    av1_copy_and_extend_frame(src, &buf->img);
+#if USE_PARTIAL_COPY
+  }
+#endif
+
+  buf->ts_start = ts_start;
+  buf->ts_end = ts_end;
+  buf->flags = flags;
+  return 0;
+}
+
+/* Dequeue the oldest entry. Returns NULL when ctx is NULL, when the queue is
+ * empty, or -- unless drain is non-zero -- when the queue does not hold
+ * exactly max_sz - MAX_PRE_FRAMES entries. */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx,
+                                          int drain) {
+  if (!ctx || !ctx->sz) return NULL;
+  if (!drain && ctx->sz != ctx->max_sz - MAX_PRE_FRAMES) return NULL;
+
+  struct lookahead_entry *const entry = pop(ctx, &ctx->read_idx);
+  ctx->sz--;
+  return entry;
+}
+
+/* Return the entry `index` positions away from the read position without
+ * dequeuing it. index >= 0 looks forward and must be below ctx->sz; a
+ * negative index looks backward by at most MAX_PRE_FRAMES entries. Returns
+ * NULL when index is out of range. */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+                                           int index) {
+  if (index >= 0) {
+    // Forward peek
+    if (index >= ctx->sz) return NULL;
+    int slot = index + ctx->read_idx;
+    if (slot >= ctx->max_sz) slot -= ctx->max_sz;
+    return ctx->buf + slot;
+  }
+
+  // Backward peek
+  if (-index > MAX_PRE_FRAMES) return NULL;
+  int slot = index + (int)(ctx->read_idx);
+  if (slot < 0) slot += (int)(ctx->max_sz);
+  return ctx->buf + slot;
+}
+
+/* Return the number of entries currently in the queue (ctx->sz). ctx must
+ * not be NULL: it is dereferenced without a check. */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx) { return ctx->sz; }
diff --git a/third_party/aom/av1/encoder/lookahead.h b/third_party/aom/av1/encoder/lookahead.h
new file mode 100644
index 0000000000..e55224cf75
--- /dev/null
+++ b/third_party/aom/av1/encoder/lookahead.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_LOOKAHEAD_H_
+#define AOM_AV1_ENCODER_LOOKAHEAD_H_
+
+#include "aom_scale/yv12config.h"
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_LAG_BUFFERS 25
+
+// One slot of the lookahead queue: a frame image together with the timing and
+// flag values supplied when it was pushed.
+struct lookahead_entry {
+ // Frame buffer holding the copied (and border-extended) source image.
+ YV12_BUFFER_CONFIG img;
+ // Timestamp for the start of this frame.
+ int64_t ts_start;
+ // Timestamp for the end of this frame.
+ int64_t ts_end;
+ // Flags set on this frame by the caller of av1_lookahead_push().
+ aom_enc_frame_flags_t flags;
+};
+
+// The max of past frames we want to keep in the queue.
+#define MAX_PRE_FRAMES 1
+
+// State of the lookahead queue. Entries live in `buf`; slot indices derived
+// from read_idx wrap around at max_sz.
+struct lookahead_ctx {
+ int max_sz; /* Absolute size of the queue */
+ int sz; /* Number of buffers currently in the queue */
+ int read_idx; /* Read index */
+ int write_idx; /* Write index */
+ struct lookahead_entry *buf; /* Buffer list */
+};
+
+/**\brief Initializes the lookahead stage
+ *
+ * The lookahead stage is a queue of frame buffers on which some analysis
+ * may be done when buffers are enqueued.
+ */
+struct lookahead_ctx *av1_lookahead_init(
+ unsigned int width, unsigned int height, unsigned int subsampling_x,
+ unsigned int subsampling_y, int use_highbitdepth, unsigned int depth);
+
+/**\brief Destroys the lookahead stage
+ */
+void av1_lookahead_destroy(struct lookahead_ctx *ctx);
+
+/**\brief Enqueue a source buffer
+ *
+ * This function will copy the source image into a new framebuffer with
+ * the expected stride/border.
+ *
+ * NOTE(review): this comment previously described an active_map argument
+ * (copying only active macroblocks when a single frame is queued). The
+ * prototype below takes no such parameter, so that text did not apply.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] src Pointer to the image to enqueue
+ * \param[in] ts_start Timestamp for the start of this frame
+ * \param[in] ts_end Timestamp for the end of this frame
+ * \param[in] use_highbitdepth Forwarded to frame buffer allocation when the
+ * queue slot has to be reallocated
+ * \param[in] flags Flags set on this frame
+ *
+ * \return 0 on success, nonzero on failure (e.g. frame buffer allocation
+ * failure)
+ */
+int av1_lookahead_push(struct lookahead_ctx *ctx, YV12_BUFFER_CONFIG *src,
+ int64_t ts_start, int64_t ts_end, int use_highbitdepth,
+ aom_enc_frame_flags_t flags);
+
+/**\brief Get the next source buffer to encode
+ *
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] drain Flag indicating the buffer should be drained
+ * (return a buffer regardless of the current queue depth)
+ *
+ * \retval NULL, if drain set and queue is empty
+ * \retval NULL, if drain not set and queue not of the configured depth
+ */
+struct lookahead_entry *av1_lookahead_pop(struct lookahead_ctx *ctx, int drain);
+
+/**\brief Get a future source buffer to encode
+ *
+ * A negative index peeks backward from the read position instead; at most
+ * MAX_PRE_FRAMES slots behind it can be addressed.
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ * \param[in] index Index of the frame to be returned, 0 == next frame
+ *
+ * \retval NULL, if no buffer exists at the specified index
+ */
+struct lookahead_entry *av1_lookahead_peek(struct lookahead_ctx *ctx,
+ int index);
+
+/**\brief Get the number of frames currently in the lookahead queue
+ *
+ * \param[in] ctx Pointer to the lookahead context
+ */
+unsigned int av1_lookahead_depth(struct lookahead_ctx *ctx);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_LOOKAHEAD_H_
diff --git a/third_party/aom/av1/encoder/mathutils.h b/third_party/aom/av1/encoder/mathutils.h
new file mode 100644
index 0000000000..64f9361767
--- /dev/null
+++ b/third_party/aom/av1/encoder/mathutils.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MATHUTILS_H_
+#define AOM_AV1_ENCODER_MATHUTILS_H_
+
+#include <memory.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves the n x n linear system A x = b by Gaussian elimination with row
+// swaps followed by back substitution. A (row pitch `stride`) and b are
+// overwritten while eliminating. Returns 1 once x has been filled in, or 0 as
+// soon as a pivot is smaller in magnitude than TINY_NEAR_ZERO.
+static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
+  // Forward elimination
+  for (int col = 0; col < n - 1; col++) {
+    // Bubble the largest-magnitude entry of this column up to the pivot row.
+    for (int row = n - 1; row > col; row--) {
+      double *const upper = &A[(row - 1) * stride];
+      double *const lower = &A[row * stride];
+      if (fabs(upper[col]) < fabs(lower[col])) {
+        for (int j = 0; j < n; j++) {
+          const double tmp = lower[j];
+          lower[j] = upper[j];
+          upper[j] = tmp;
+        }
+        const double tmp_b = b[row];
+        b[row] = b[row - 1];
+        b[row - 1] = tmp_b;
+      }
+    }
+    // Eliminate the pivot column from every row below the pivot.
+    const double *const pivot_row = &A[col * stride];
+    for (int row = col; row < n - 1; row++) {
+      if (fabs(pivot_row[col]) < TINY_NEAR_ZERO) return 0;
+      double *const target = &A[(row + 1) * stride];
+      const double factor = target[col] / pivot_row[col];
+      for (int j = 0; j < n; j++) target[j] -= factor * pivot_row[j];
+      b[row + 1] -= factor * b[col];
+    }
+  }
+  // Backward substitution
+  for (int row = n - 1; row >= 0; row--) {
+    const double *const cur = &A[row * stride];
+    if (fabs(cur[row]) < TINY_NEAR_ZERO) return 0;
+    double acc = 0;
+    for (int j = row + 1; j <= n - 1; j++) acc += cur[j] * x[j];
+    x[row] = (b[row] - acc) / cur[row];
+  }
+
+  return 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for n-dim x in a least squares sense to minimize |Ax - b|^2
+// The solution is simply x = (A'A)^-1 A'b or simply the solution for
+// the system: A'A x = A'b
+//
+// A holds `rows` rows of n coefficients with row pitch `stride`; b holds
+// `rows` entries. `scratch` must provide room for n * (n + 1) doubles; when it
+// is NULL a temporary buffer is allocated and released internally.
+// Returns 1 on success and 0 when the normal equations cannot be solved or the
+// temporary buffer cannot be allocated.
+static INLINE int least_squares(int n, double *A, int rows, int stride,
+                                double *b, double *scratch, double *x) {
+  int i, j, k;
+  double *scratch_ = NULL;
+  double *AtA, *Atb;
+  if (!scratch) {
+    scratch_ = (double *)aom_malloc(sizeof(*scratch) * n * (n + 1));
+    // Report allocation failure the same way as an unsolvable system instead
+    // of writing through a NULL pointer below.
+    if (!scratch_) return 0;
+    scratch = scratch_;
+  }
+  AtA = scratch;
+  Atb = scratch + n * n;
+
+  // Build the symmetric matrix A'A (upper triangle mirrored) and vector A'b.
+  for (i = 0; i < n; ++i) {
+    for (j = i; j < n; ++j) {
+      AtA[i * n + j] = 0.0;
+      for (k = 0; k < rows; ++k)
+        AtA[i * n + j] += A[k * stride + i] * A[k * stride + j];
+      AtA[j * n + i] = AtA[i * n + j];
+    }
+    Atb[i] = 0;
+    for (k = 0; k < rows; ++k) Atb[i] += A[k * stride + i] * b[k];
+  }
+  int ret = linsolve(n, AtA, n, Atb, x);
+  if (scratch_) aom_free(scratch_);
+  return ret;
+}
+
+// Matrix multiply: res = m1 * m2 for row-major inputs, where m1 is
+// m1_rows x inner_dim, m2 is inner_dim x m2_cols and res receives
+// m1_rows x m2_cols values.
+static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
+                                const int m1_rows, const int inner_dim,
+                                const int m2_cols) {
+  for (int r = 0; r < m1_rows; ++r) {
+    for (int c = 0; c < m2_cols; ++c) {
+      double acc = 0;
+      for (int t = 0; t < inner_dim; ++t) {
+        acc += m1[r * inner_dim + t] * m2[t * m2_cols + c];
+      }
+      res[r * m2_cols + c] = acc;
+    }
+  }
+}
+
+//
+// The functions below are needed only for homography computation
+// Remove if the homography models are not used.
+//
+///////////////////////////////////////////////////////////////////////////////
+// svdcmp
+// Adopted from Numerical Recipes in C
+
+// Returns |a| with the sign of b; b >= 0 (including zero) yields +|a|.
+static INLINE double sign(double a, double b) {
+  const double magnitude = fabs(a);
+  if (b >= 0) return magnitude;
+  return -magnitude;
+}
+
+// Computes sqrt(a^2 + b^2), factoring out the larger of |a| and |b| before
+// taking the square root. Returns 0 when both inputs are zero.
+static INLINE double pythag(double a, double b) {
+  const double mag_a = fabs(a);
+  const double mag_b = fabs(b);
+
+  if (mag_a > mag_b) {
+    const double ratio = mag_b / mag_a;
+    return mag_a * sqrt(1.0 + ratio * ratio);
+  }
+
+  const double ratio = mag_a / mag_b;
+  if (mag_b == 0) return 0;
+  return mag_b * sqrt(1.0 + ratio * ratio);
+}
+
+// Singular value decomposition (adopted from Numerical Recipes in C).
+// `u` is an array of m row pointers, each to n doubles; it holds the input
+// matrix on entry and is overwritten in place. `w` receives n values and `v`
+// is an array of n row pointers, each to n doubles, that is filled in.
+// Returns 0 on success. Returns 1 if the iteration for some singular value
+// does not finish within max_its sweeps, or if the temporary work vector
+// cannot be allocated.
+static INLINE int svdcmp(double **u, int m, int n, double w[], double **v) {
+  const int max_its = 30;
+  int flag, i, its, j, jj, k, l, nm;
+  double anorm, c, f, g, h, s, scale, x, y, z;
+  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
+  // Bail out before the first write to rv1 if the allocation failed.
+  if (!rv1) return 1;
+  g = scale = anorm = 0.0;
+  for (i = 0; i < n; i++) {
+    l = i + 1;
+    rv1[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m) {
+      for (k = i; k < m; k++) scale += fabs(u[k][i]);
+      if (scale != 0.) {
+        for (k = i; k < m; k++) {
+          u[k][i] /= scale;
+          s += u[k][i] * u[k][i];
+        }
+        f = u[i][i];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][i] = f - g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
+          f = s / h;
+          for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+        }
+        for (k = i; k < m; k++) u[k][i] *= scale;
+      }
+    }
+    w[i] = scale * g;
+    g = s = scale = 0.0;
+    if (i < m && i != n - 1) {
+      for (k = l; k < n; k++) scale += fabs(u[i][k]);
+      if (scale != 0.) {
+        for (k = l; k < n; k++) {
+          u[i][k] /= scale;
+          s += u[i][k] * u[i][k];
+        }
+        f = u[i][l];
+        g = -sign(sqrt(s), f);
+        h = f * g - s;
+        u[i][l] = f - g;
+        for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
+        for (j = l; j < m; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
+          for (k = l; k < n; k++) u[j][k] += s * rv1[k];
+        }
+        for (k = l; k < n; k++) u[i][k] *= scale;
+      }
+    }
+    anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
+  }
+
+  // Fill in v, walking the columns from last to first.
+  for (i = n - 1; i >= 0; i--) {
+    if (i < n - 1) {
+      if (g != 0.) {
+        for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
+        for (j = l; j < n; j++) {
+          for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
+          for (k = l; k < n; k++) v[k][j] += s * v[k][i];
+        }
+      }
+      for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
+    }
+    v[i][i] = 1.0;
+    g = rv1[i];
+    l = i;
+  }
+  // Rewrite u in place, again from the last usable column to the first.
+  for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
+    l = i + 1;
+    g = w[i];
+    for (j = l; j < n; j++) u[i][j] = 0.0;
+    if (g != 0.) {
+      g = 1.0 / g;
+      for (j = l; j < n; j++) {
+        for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
+        f = (s / u[i][i]) * g;
+        for (k = i; k < m; k++) u[k][j] += f * u[k][i];
+      }
+      for (j = i; j < m; j++) u[j][i] *= g;
+    } else {
+      for (j = i; j < m; j++) u[j][i] = 0.0;
+    }
+    ++u[i][i];
+  }
+  // Iterate on each w[k], giving up after max_its sweeps.
+  for (k = n - 1; k >= 0; k--) {
+    for (its = 0; its < max_its; its++) {
+      flag = 1;
+      for (l = k; l >= 0; l--) {
+        nm = l - 1;
+        if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
+          flag = 0;
+          break;
+        }
+        if ((double)(fabs(w[nm]) + anorm) == anorm) break;
+      }
+      if (flag) {
+        c = 0.0;
+        s = 1.0;
+        for (i = l; i <= k; i++) {
+          f = s * rv1[i];
+          rv1[i] = c * rv1[i];
+          if ((double)(fabs(f) + anorm) == anorm) break;
+          g = w[i];
+          h = pythag(f, g);
+          w[i] = h;
+          h = 1.0 / h;
+          c = g * h;
+          s = -f * h;
+          for (j = 0; j < m; j++) {
+            y = u[j][nm];
+            z = u[j][i];
+            u[j][nm] = y * c + z * s;
+            u[j][i] = z * c - y * s;
+          }
+        }
+      }
+      z = w[k];
+      if (l == k) {
+        // Finished with w[k]: make it non-negative, flipping column k of v.
+        if (z < 0.0) {
+          w[k] = -z;
+          for (j = 0; j < n; j++) v[j][k] = -v[j][k];
+        }
+        break;
+      }
+      if (its == max_its - 1) {
+        aom_free(rv1);
+        return 1;
+      }
+      assert(k > 0);
+      x = w[l];
+      nm = k - 1;
+      y = w[nm];
+      g = rv1[nm];
+      h = rv1[k];
+      f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
+      g = pythag(f, 1.0);
+      f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
+      c = s = 1.0;
+      for (j = l; j <= nm; j++) {
+        i = j + 1;
+        g = rv1[i];
+        y = w[i];
+        h = s * g;
+        g = c * g;
+        z = pythag(f, h);
+        rv1[j] = z;
+        c = f / z;
+        s = h / z;
+        f = x * c + g * s;
+        g = g * c - x * s;
+        h = y * s;
+        y *= c;
+        for (jj = 0; jj < n; jj++) {
+          x = v[jj][j];
+          z = v[jj][i];
+          v[jj][j] = x * c + z * s;
+          v[jj][i] = z * c - x * s;
+        }
+        z = pythag(f, h);
+        w[j] = z;
+        if (z != 0.) {
+          z = 1.0 / z;
+          c = f * z;
+          s = h * z;
+        }
+        f = c * g + s * y;
+        x = c * y - s * g;
+        for (jj = 0; jj < m; jj++) {
+          y = u[jj][j];
+          z = u[jj][i];
+          u[jj][j] = y * c + z * s;
+          u[jj][i] = z * c - y * s;
+        }
+      }
+      rv1[l] = 0.0;
+      rv1[k] = f;
+      w[k] = x;
+    }
+  }
+  aom_free(rv1);
+  return 0;
+}
+
+// Runs svdcmp() on a copy of the row-major M x N matrix `matx`. The copy is
+// placed in U and decomposed there; W and V are filled by svdcmp().
+// Returns 0 on success and 1 if the row-pointer tables cannot be allocated or
+// svdcmp() reports failure.
+static INLINE int SVD(double *U, double *W, double *V, double *matx, int M,
+                      int N) {
+  // Assumes allocation for U is MxN
+  double **nrU = (double **)aom_malloc((M) * sizeof(*nrU));
+  double **nrV = (double **)aom_malloc((N) * sizeof(*nrV));
+
+  if (!nrU || !nrV) {
+    if (nrU) aom_free(nrU);
+    if (nrV) aom_free(nrV);
+    return 1;
+  }
+
+  // Row-pointer views over the flat U and V storage.
+  for (int r = 0; r < M; r++) nrU[r] = &U[r * N];
+  for (int r = 0; r < N; r++) nrV[r] = &V[r * N];
+
+  /* copy from given matx into nrU */
+  for (int r = 0; r < M; r++) {
+    memcpy(&(nrU[r][0]), matx + N * r, N * sizeof(*matx));
+  }
+
+  /* HERE IT IS: do SVD */
+  const int failed = svdcmp(nrU, M, N, W, nrV) ? 1 : 0;
+
+  /* aom_free Numerical Recipes arrays */
+  aom_free(nrU);
+  aom_free(nrV);
+
+  return failed;
+}
+
+#endif // AOM_AV1_ENCODER_MATHUTILS_H_
diff --git a/third_party/aom/av1/encoder/mbgraph.c b/third_party/aom/av1/encoder/mbgraph.c
new file mode 100644
index 0000000000..1a35ff77c7
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "config/av1_rtcd.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/blockd.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+
+// Runs one motion search for the current 16x16 block, seeded with ref_mv:
+// a hex search, then (unless integer MVs are forced for the frame) a
+// fractional refinement through cpi->find_fractional_mv_step. The resulting
+// x->best_mv is stored in xd->mi[0], the inter predictor is built, and the
+// 16x16 SAD between the source block and the prediction is returned.
+// x->mv_limits is modified for the search and restored before returning.
+static unsigned int do_16x16_motion_iteration(AV1_COMP *cpi, const MV *ref_mv,
+ int mb_row, int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ const aom_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+
+ // Saved so the caller's limits can be restored at the end.
+ const MvLimits tmp_mv_limits = x->mv_limits;
+ MV ref_full;
+ int cost_list[5];
+
+ // Further step/diamond searches as necessary
+ int step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, ref_mv);
+
+ // presumably converts ref_mv from 1/8-pel to full-pel units -- TODO confirm
+ ref_full.col = ref_mv->col >> 3;
+ ref_full.row = ref_mv->row >> 3;
+
+ /*cpi->sf.search_method == HEX*/
+ av1_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
+ cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
+
+ // Try sub-pixel MC
+ // if (bestsme > error_thresh && bestsme < INT_MAX)
+ if (cpi->common.cur_frame_force_integer_mv == 1) {
+ // No fractional search: just scale the search result by 8.
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ } else {
+ // distortion and sse are outputs of the call that are not used here.
+ int distortion;
+ unsigned int sse;
+ cpi->find_fractional_mv_step(
+ x, &cpi->common, mb_row, mb_col, ref_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit, &v_fn_ptr, 0,
+ mv_sf->subpel_iters_per_step, cond_cost_list(cpi, cost_list), NULL,
+ NULL, &distortion, &sse, NULL, NULL, 0, 0, 0, 0, 0);
+ }
+
+ if (has_second_ref(xd->mi[0]))
+ xd->mi[0]->mode = NEW_NEWMV;
+ else
+ xd->mi[0]->mode = NEWMV;
+
+ xd->mi[0]->mv[0] = x->best_mv;
+ xd->mi[0]->ref_frame[1] = NONE_FRAME;
+
+ av1_build_inter_predictors_sby(&cpi->common, xd, mb_row, mb_col, NULL,
+ BLOCK_16X16);
+
+ /* restore UMV window */
+ x->mv_limits = tmp_mv_limits;
+
+ return aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+}
+
+// Chooses the lowest-SAD motion vector for the current 16x16 block among:
+// the zero MV, a search seeded with ref_mv, and (only when ref_mv is not
+// (0,0)) a search seeded with the zero MV. The winner is left in x->best_mv
+// and its SAD is returned.
+static int do_16x16_motion_search(AV1_COMP *cpi, const MV *ref_mv, int mb_row,
+                                  int mb_col) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MV winner;
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  unsigned int best_sad =
+      aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+  winner.col = winner.row = 0;
+
+  // Test last reference frame using the previous best mv as the
+  // starting point (best reference) for the search
+  unsigned int cand_sad =
+      do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
+  if (cand_sad < best_sad) {
+    best_sad = cand_sad;
+    winner = x->best_mv.as_mv;
+  }
+
+  // If the current best reference mv is not centered on 0,0 then do a 0,0
+  // based search as well.
+  const int ref_is_zero = (ref_mv->row == 0 && ref_mv->col == 0);
+  if (!ref_is_zero) {
+    MV zero_ref_mv = kZeroMv;
+
+    cand_sad = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
+    if (cand_sad < best_sad) {
+      best_sad = cand_sad;
+      winner = x->best_mv.as_mv;
+    }
+  }
+
+  x->best_mv.as_mv = winner;
+  return best_sad;
+}
+
+// Scores the zero motion vector only: writes (0,0) to *dst_mv and returns the
+// 16x16 SAD between the source block and the block at xd->plane[0].pre[0].
+static int do_16x16_zerozero_search(AV1_COMP *cpi, int_mv *dst_mv) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  // Try zero MV first
+  // FIXME should really use something like near/nearest MV and/or MV prediction
+  const unsigned int zero_mv_sad =
+      aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                   xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
+
+  dst_mv->as_int = 0;
+
+  return zero_mv_sad;
+}
+// Tries every intra mode in [INTRA_MODE_START, INTRA_MODE_END) on the current
+// 16x16 block and returns the lowest SAD between the source and the intra
+// prediction. When pbest_mode is non-NULL it receives the winning mode.
+// Note: xd->mi[0]->mode is left set to the last mode tried, not the best one.
+static int find_best_16x16_intra(AV1_COMP *cpi, PREDICTION_MODE *pbest_mode) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ // -1 is a "nothing chosen yet" placeholder; it survives only if no mode
+ // yields an error below INT_MAX.
+ PREDICTION_MODE best_mode = -1, mode;
+ unsigned int best_err = INT_MAX;
+
+ // calculate SAD for each intra prediction mode;
+ // we're intentionally not doing 4x4, we just want a rough estimate
+ for (mode = INTRA_MODE_START; mode < INTRA_MODE_END; mode++) {
+ unsigned int err;
+
+ xd->mi[0]->mode = mode;
+ av1_predict_intra_block(cm, xd, 16, 16, TX_16X16, mode, 0, 0,
+ FILTER_INTRA_MODES, x->plane[0].src.buf,
+ x->plane[0].src.stride, xd->plane[0].dst.buf,
+ xd->plane[0].dst.stride, 0, 0, 0);
+ err = aom_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+ xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+
+ // find best
+ if (err < best_err) {
+ best_err = err;
+ best_mode = mode;
+ }
+ }
+
+ if (pbest_mode) *pbest_mode = best_mode;
+
+ return best_err;
+}
+
+// Fills `stats` for one 16x16 block located mb_y_offset samples into the Y
+// plane of `buf`:
+// - ref[INTRA_FRAME]: best intra SAD (clamped to at least 1) and its mode;
+// - ref[GOLDEN_FRAME]: SAD and MV from a motion search against golden_ref,
+// or INT_MAX / zero MV when golden_ref is NULL;
+// - ref[ALTREF_FRAME]: SAD of the zero MV against alt_ref, or INT_MAX / zero
+// MV when alt_ref is NULL.
+// The same mb_y_offset is applied to buf, the new frame buffer, golden_ref and
+// alt_ref.
+static void update_mbgraph_mb_stats(AV1_COMP *cpi, MBGRAPH_MB_STATS *stats,
+ YV12_BUFFER_CONFIG *buf, int mb_y_offset,
+ YV12_BUFFER_CONFIG *golden_ref,
+ const MV *prev_golden_ref_mv,
+ YV12_BUFFER_CONFIG *alt_ref, int mb_row,
+ int mb_col) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int intra_error;
+ AV1_COMMON *cm = &cpi->common;
+
+ // FIXME in practice we're completely ignoring chroma here
+ x->plane[0].src.buf = buf->y_buffer + mb_y_offset;
+ x->plane[0].src.stride = buf->y_stride;
+
+ // Predictions are written into the frame's new buffer.
+ xd->plane[0].dst.buf = get_frame_new_buffer(cm)->y_buffer + mb_y_offset;
+ xd->plane[0].dst.stride = get_frame_new_buffer(cm)->y_stride;
+
+ // do intra 16x16 prediction
+ intra_error = find_best_16x16_intra(cpi, &stats->ref[INTRA_FRAME].m.mode);
+ // Keep the stored intra error strictly positive.
+ if (intra_error <= 0) intra_error = 1;
+ stats->ref[INTRA_FRAME].err = intra_error;
+
+ // Golden frame MV search, if it exists and is different than last frame
+ if (golden_ref) {
+ int g_motion_error;
+ xd->plane[0].pre[0].buf = golden_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = golden_ref->y_stride;
+ g_motion_error =
+ do_16x16_motion_search(cpi, prev_golden_ref_mv, mb_row, mb_col);
+ stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
+ stats->ref[GOLDEN_FRAME].err = g_motion_error;
+ } else {
+ stats->ref[GOLDEN_FRAME].err = INT_MAX;
+ stats->ref[GOLDEN_FRAME].m.mv.as_int = 0;
+ }
+
+ // Do an Alt-ref frame MV search, if it exists and is different than
+ // last/golden frame.
+ if (alt_ref) {
+ int a_motion_error;
+ xd->plane[0].pre[0].buf = alt_ref->y_buffer + mb_y_offset;
+ xd->plane[0].pre[0].stride = alt_ref->y_stride;
+ // Only the zero MV is evaluated against the alt-ref.
+ a_motion_error =
+ do_16x16_zerozero_search(cpi, &stats->ref[ALTREF_FRAME].m.mv);
+
+ stats->ref[ALTREF_FRAME].err = a_motion_error;
+ } else {
+ stats->ref[ALTREF_FRAME].err = INT_MAX;
+ stats->ref[ALTREF_FRAME].m.mv.as_int = 0;
+ }
+}
+
+// Collects per-macroblock statistics for one frame `buf`, visiting the
+// cm->mb_rows x cm->mb_cols grid of 16x16 blocks in raster order and storing
+// the result for block (mb_row, mb_col) at
+// stats->mb_stats[mb_row * cm->mb_cols + mb_col].
+// The golden-frame MV found for the block to the left seeds the search of the
+// next block; the first block of a row is seeded with the MV found for the
+// first block of the previous row. golden_ref and alt_ref may be NULL.
+static void update_mbgraph_frame_stats(AV1_COMP *cpi,
+                                       MBGRAPH_FRAME_STATS *stats,
+                                       YV12_BUFFER_CONFIG *buf,
+                                       YV12_BUFFER_CONFIG *golden_ref,
+                                       YV12_BUFFER_CONFIG *alt_ref) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  AV1_COMMON *const cm = &cpi->common;
+
+  int mb_col, mb_row, offset = 0;
+  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  MV gld_top_mv = kZeroMv;
+  MB_MODE_INFO mi_local;
+
+  av1_zero(mi_local);
+  // Set up limit values for motion vectors to prevent them extending outside
+  // the UMV borders.
+  x->mv_limits.row_min = -BORDER_MV_PIXELS_B16;
+  x->mv_limits.row_max = (cm->mb_rows - 1) * 8 + BORDER_MV_PIXELS_B16;
+  xd->up_available = 0;
+  xd->plane[0].dst.stride = buf->y_stride;
+  xd->plane[0].pre[0].stride = buf->y_stride;
+  xd->plane[1].dst.stride = buf->uv_stride;
+  // All blocks share one local mode-info record.
+  xd->mi[0] = &mi_local;
+  mi_local.sb_type = BLOCK_16X16;
+  mi_local.ref_frame[0] = LAST_FRAME;
+  mi_local.ref_frame[1] = NONE_FRAME;
+
+  for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+    MV gld_left_mv = gld_top_mv;
+    int mb_y_in_offset = mb_y_offset;
+    int arf_y_in_offset = arf_y_offset;
+    int gld_y_in_offset = gld_y_offset;
+
+    // Set up limit values for motion vectors to prevent them extending outside
+    // the UMV borders.
+    x->mv_limits.col_min = -BORDER_MV_PIXELS_B16;
+    x->mv_limits.col_max = (cm->mb_cols - 1) * 8 + BORDER_MV_PIXELS_B16;
+    xd->left_available = 0;
+
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+      MBGRAPH_MB_STATS *mb_stats = &stats->mb_stats[offset + mb_col];
+
+      update_mbgraph_mb_stats(cpi, mb_stats, buf, mb_y_in_offset, golden_ref,
+                              &gld_left_mv, alt_ref, mb_row, mb_col);
+      gld_left_mv = mb_stats->ref[GOLDEN_FRAME].m.mv.as_mv;
+      if (mb_col == 0) {
+        gld_top_mv = gld_left_mv;
+      }
+      xd->left_available = 1;
+      mb_y_in_offset += 16;
+      gld_y_in_offset += 16;
+      arf_y_in_offset += 16;
+      x->mv_limits.col_min -= 16;
+      x->mv_limits.col_max -= 16;
+    }
+    xd->up_available = 1;
+    mb_y_offset += buf->y_stride * 16;
+    // golden_ref is treated as optional by update_mbgraph_mb_stats(), so it
+    // must not be dereferenced unconditionally here either.
+    if (golden_ref) gld_y_offset += golden_ref->y_stride * 16;
+    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
+    x->mv_limits.row_min -= 16;
+    x->mv_limits.row_max -= 16;
+    offset += cm->mb_cols;
+  }
+}
+
+// void separate_arf_mbs_byzz
+// Builds cpi->segmentation_map from the collected mbgraph stats. A macroblock
+// is counted as "not zero-motion alt-ref" if, in any of the considered frames,
+// its alt-ref error exceeds 1000, its intra error, or its golden error. MI
+// units belonging to such macroblocks get segment 0, all others segment 1.
+// Also sets cpi->static_mb_pct and enables segmentation.
+static void separate_arf_mbs(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ int mb_col, mb_row, offset, i;
+ int mi_row, mi_col;
+ int ncnt[4] = { 0 };
+ int n_frames = cpi->mbgraph_n_frames;
+
+ // Per-macroblock count of frames in which the alt-ref test failed.
+ int *arf_not_zz;
+
+ CHECK_MEM_ERROR(
+ cm, arf_not_zz,
+ aom_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1));
+
+ // We are not interested in results beyond the alt ref itself.
+ if (n_frames > cpi->rc.frames_till_gf_update_due)
+ n_frames = cpi->rc.frames_till_gf_update_due;
+
+ // defer cost to reference frames
+ for (i = n_frames - 1; i >= 0; i--) {
+ MBGRAPH_FRAME_STATS *frame_stats = &cpi->mbgraph_stats[i];
+
+ for (offset = 0, mb_row = 0; mb_row < cm->mb_rows;
+ offset += cm->mb_cols, mb_row++) {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ MBGRAPH_MB_STATS *mb_stats = &frame_stats->mb_stats[offset + mb_col];
+
+ int altref_err = mb_stats->ref[ALTREF_FRAME].err;
+ int intra_err = mb_stats->ref[INTRA_FRAME].err;
+ int golden_err = mb_stats->ref[GOLDEN_FRAME].err;
+
+ // Test for altref vs intra and gf and that its mv was 0,0.
+ if (altref_err > 1000 || altref_err > intra_err ||
+ altref_err > golden_err) {
+ arf_not_zz[offset + mb_col]++;
+ }
+ }
+ }
+ }
+
+ // arf_not_zz is indexed by MB, but this loop is indexed by MI to avoid out
+ // of bound access in segmentation_map
+ // NOTE(review): the "/ 2" below assumes two MI units per macroblock in each
+ // dimension. arf_not_zz holds only mb_rows * mb_cols entries, so confirm
+ // that mi_rows <= 2 * mb_rows and mi_cols <= 2 * mb_cols for this build;
+ // otherwise this read goes past the end of the array.
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) {
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) {
+ // If any of the blocks in the sequence failed then the MB
+ // goes in segment 0
+ if (arf_not_zz[mi_row / 2 * cm->mb_cols + mi_col / 2]) {
+ ncnt[0]++;
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 0;
+ } else {
+ cpi->segmentation_map[mi_row * cm->mi_cols + mi_col] = 1;
+ ncnt[1]++;
+ }
+ }
+ }
+
+ // Only bother with segmentation if over 10% of the MBs in static segment
+ // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) )
+ // The threshold test above is disabled: the condition is hard-wired to 1, so
+ // the else branch below is never taken.
+ if (1) {
+ // Note % of blocks that are marked as static
+ if (cm->MBs)
+ cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols);
+
+ // This error case should not be reachable as this function should
+ // never be called with the common data structure uninitialized.
+ else
+ cpi->static_mb_pct = 0;
+
+ av1_enable_segmentation(&cm->seg);
+ } else {
+ cpi->static_mb_pct = 0;
+ av1_disable_segmentation(&cm->seg);
+ }
+
+ // Free locally allocated storage
+ aom_free(arf_not_zz);
+}
+
+// Refreshes cpi->mbgraph_stats from the frames in the lookahead queue
+// (at most MAX_LAG_BUFFERS of them) and then derives the segmentation map
+// via separate_arf_mbs(). Does nothing unless the lookahead holds more frames
+// than cpi->rc.frames_till_gf_update_due.
+void av1_update_mbgraph_stats(AV1_COMP *cpi) {
+  AV1_COMMON *const cm = &cpi->common;
+  int n_frames = av1_lookahead_depth(cpi->lookahead);
+  YV12_BUFFER_CONFIG *golden_ref = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+
+  assert(golden_ref != NULL);
+
+  // we need to look ahead beyond where the ARF transitions into
+  // being a GF - so exit if we don't look ahead beyond that
+  if (n_frames <= cpi->rc.frames_till_gf_update_due) return;
+
+  n_frames = AOMMIN(n_frames, MAX_LAG_BUFFERS);
+  cpi->mbgraph_n_frames = n_frames;
+
+  // Clear the per-macroblock stats of every frame that will be analysed.
+  for (int idx = 0; idx < n_frames; idx++) {
+    MBGRAPH_FRAME_STATS *const cleared = &cpi->mbgraph_stats[idx];
+    memset(cleared->mb_stats, 0,
+           cm->mb_rows * cm->mb_cols *
+               sizeof(*cpi->mbgraph_stats[idx].mb_stats));
+  }
+
+  // do motion search to find contribution of each reference to data
+  // later on in this GF group
+  // FIXME really, the GF/last MC search should be done forward, and
+  // the ARF MC search backwards, to get optimal results for MV caching
+  for (int idx = 0; idx < n_frames; idx++) {
+    struct lookahead_entry *const queued =
+        av1_lookahead_peek(cpi->lookahead, idx);
+
+    assert(queued != NULL);
+
+    update_mbgraph_frame_stats(cpi, &cpi->mbgraph_stats[idx], &queued->img,
+                               golden_ref, cpi->source);
+  }
+
+  aom_clear_system_state();
+
+  separate_arf_mbs(cpi);
+}
diff --git a/third_party/aom/av1/encoder/mbgraph.h b/third_party/aom/av1/encoder/mbgraph.h
new file mode 100644
index 0000000000..ba08476f7a
--- /dev/null
+++ b/third_party/aom/av1/encoder/mbgraph.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MBGRAPH_H_
+#define AOM_AV1_ENCODER_MBGRAPH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Per-macroblock statistics, one slot per reference frame index.
+typedef struct {
+ struct {
+ // Prediction error for this reference; INT_MAX is stored when the
+ // reference is unavailable.
+ int err;
+ // Either the motion vector (inter references) or the prediction mode
+ // (the INTRA_FRAME slot) that produced `err`.
+ union {
+ int_mv mv;
+ PREDICTION_MODE mode;
+ } m;
+ } ref[REF_FRAMES];
+} MBGRAPH_MB_STATS;
+
+// Statistics for one frame: an array of per-macroblock entries.
+// presumably indexed as mb_row * mb_cols + mb_col -- confirm against the users
+// in mbgraph.c.
+typedef struct {
+ MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
+
+struct AV1_COMP;
+
+void av1_update_mbgraph_stats(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MBGRAPH_H_
diff --git a/third_party/aom/av1/encoder/mcomp.c b/third_party/aom/av1/encoder/mcomp.c
new file mode 100644
index 0000000000..8f6de9b532
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.c
@@ -0,0 +1,2885 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/common.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+
+// #define NEW_DIAMOND_SEARCH
+
+// Returns the address inside buf's pixel data that is displaced from the
+// buffer origin by mv: mv->row lines of buf->stride plus mv->col columns.
+static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
+                                             const MV *mv) {
+  const int displacement = mv->row * buf->stride + mv->col;
+  return buf->buf + displacement;
+}
+
+// Shrinks *mv_limits to its intersection with the window of full-pel
+// positions that lie within MAX_FULL_PEL_VAL of mv (mv is in 1/8-pel units)
+// and inside the representable range derived from MV_LOW / MV_UPP. Limits are
+// only ever tightened, never widened.
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv) {
+  // Full-pel part of mv; the lower bounds move up by one when mv carries a
+  // fractional part.
+  const int center_col = mv->col >> 3;
+  const int center_row = mv->row >> 3;
+  const int col_round_up = (mv->col & 7) != 0;
+  const int row_round_up = (mv->row & 7) != 0;
+  const int lowest = (MV_LOW >> 3) + 1;
+  const int highest = (MV_UPP >> 3) - 1;
+
+  const int lo_col =
+      AOMMAX(center_col - MAX_FULL_PEL_VAL + col_round_up, lowest);
+  const int lo_row =
+      AOMMAX(center_row - MAX_FULL_PEL_VAL + row_round_up, lowest);
+  const int hi_col = AOMMIN(center_col + MAX_FULL_PEL_VAL, highest);
+  const int hi_row = AOMMIN(center_row + MAX_FULL_PEL_VAL, highest);
+
+  // Get intersection of UMV window and valid MV window to reduce # of checks
+  // in diamond search.
+  mv_limits->col_min = AOMMAX(mv_limits->col_min, lo_col);
+  mv_limits->col_max = AOMMIN(mv_limits->col_max, hi_col);
+  mv_limits->row_min = AOMMAX(mv_limits->row_min, lo_row);
+  mv_limits->row_max = AOMMIN(mv_limits->row_max, hi_row);
+}
+
+// Computes the sub-pel search window in 1/8-pel units. Each bound is the
+// tightest of three constraints: the full-pel limits in mv_limits scaled by 8,
+// a reach of MAX_FULL_PEL_VAL * 8 around ref_mv, and the range
+// (MV_LOW, MV_UPP) exclusive.
+static void set_subpel_mv_search_range(const MvLimits *mv_limits, int *col_min,
+                                       int *col_max, int *row_min, int *row_max,
+                                       const MV *ref_mv) {
+  const int reach = MAX_FULL_PEL_VAL * 8;
+  int lo_col = ref_mv->col - reach;
+  int hi_col = ref_mv->col + reach;
+  int lo_row = ref_mv->row - reach;
+  int hi_row = ref_mv->row + reach;
+
+  // Clamp to the caller's full-pel limits, expressed in 1/8 pel.
+  if (lo_col < mv_limits->col_min * 8) lo_col = mv_limits->col_min * 8;
+  if (hi_col > mv_limits->col_max * 8) hi_col = mv_limits->col_max * 8;
+  if (lo_row < mv_limits->row_min * 8) lo_row = mv_limits->row_min * 8;
+  if (hi_row > mv_limits->row_max * 8) hi_row = mv_limits->row_max * 8;
+
+  // Clamp to the representable motion vector range.
+  if (lo_col < MV_LOW + 1) lo_col = MV_LOW + 1;
+  if (hi_col > MV_UPP - 1) hi_col = MV_UPP - 1;
+  if (lo_row < MV_LOW + 1) lo_row = MV_LOW + 1;
+  if (hi_row > MV_UPP - 1) hi_row = MV_UPP - 1;
+
+  *col_min = lo_col;
+  *col_max = hi_col;
+  *row_min = lo_row;
+  *row_max = hi_row;
+}
+
+// Returns the smallest shift sr for which (size << sr) reaches
+// MAX_FULL_PEL_VAL, capped at MAX_MVSEARCH_STEPS - 2. Sizes below 16 are
+// treated as 16.
+int av1_init_search_range(int size) {
+  // Minimum search size no matter what the passed in value.
+  const int clamped = AOMMAX(16, size);
+  int sr;
+
+  for (sr = 0; (clamped << sr) < MAX_FULL_PEL_VAL; ++sr) {
+  }
+
+  return AOMMIN(sr, MAX_MVSEARCH_STEPS - 2);
+}
+
+// Looks up the cost of mv: the joint-class cost plus the per-component costs
+// (comp_cost[0] is indexed by the row, comp_cost[1] by the column).
+static INLINE int mv_cost(const MV *mv, const int *joint_cost,
+                          int *const comp_cost[2]) {
+  const int joint = joint_cost[av1_get_mv_joint(mv)];
+  const int row_cost = comp_cost[0][mv->row];
+  const int col_cost = comp_cost[1][mv->col];
+  return joint + row_cost + col_cost;
+}
+
+// Cost of coding mv relative to ref: the table cost of the difference,
+// multiplied by weight and rounded down by 7 bits.
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                    int *mvcost[2], int weight) {
+  const MV delta = { mv->row - ref->row, mv->col - ref->col };
+  const int weighted = mv_cost(&delta, mvjcost, mvcost) * weight;
+  return ROUND_POWER_OF_TWO(weighted, 7);
+}
+
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+// Converts the cost of coding mv relative to ref into error units: the table
+// cost times error_per_bit, rounded and shifted down. Returns 0 when no
+// component cost tables are supplied (mvcost == NULL).
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
+  const int shift = RDDIV_BITS + AV1_PROB_COST_SHIFT - RD_EPB_SHIFT +
+                    PIXEL_TRANSFORM_ERROR_SCALE;
+  MV delta;
+  int64_t scaled;
+
+  if (!mvcost) return 0;
+
+  delta.row = mv->row - ref->row;
+  delta.col = mv->col - ref->col;
+  // Widen before multiplying so the product cannot overflow an int.
+  scaled = (int64_t)mv_cost(&delta, mvjcost, mvcost) * error_per_bit;
+  return (int)ROUND_POWER_OF_TWO_64(scaled, shift);
+}
+
+// Cost of a full-pel vector mv relative to the full-pel centre ref, for use
+// alongside SAD scores: the difference is scaled by 8 before the table
+// lookup, weighted by sad_per_bit and rounded by AV1_PROB_COST_SHIFT bits.
+static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
+                          int sad_per_bit) {
+  const MV delta = { (mv->row - ref->row) * 8, (mv->col - ref->col) * 8 };
+  const unsigned int weighted =
+      (unsigned)mv_cost(&delta, x->nmvjointcost, x->mvcost) * sad_per_bit;
+  return ROUND_POWER_OF_TWO(weighted, AV1_PROB_COST_SHIFT);
+}
+
+// Fills cfg with a 4-point search pattern. Site 0 is the origin; after it,
+// for every step length MAX_FIRST_STEP, MAX_FIRST_STEP / 2, ..., 1 come the
+// (row, col) offsets (-len, 0), (len, 0), (0, -len), (0, len). Each site also
+// stores the matching buffer offset for the given stride.
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride) {
+  // Unit (row, col) directions, scaled by the step length below.
+  static const int unit_dirs[4][2] = { { -1, 0 }, { 1, 0 }, { 0, -1 },
+                                       { 0, 1 } };
+  int step, dir;
+  int next = 0;
+
+  cfg->ss[next].mv.row = 0;
+  cfg->ss[next].mv.col = 0;
+  cfg->ss[next].offset = 0;
+  ++next;
+
+  for (step = MAX_FIRST_STEP; step > 0; step /= 2) {
+    for (dir = 0; dir < 4; ++dir) {
+      search_site *const site = &cfg->ss[next++];
+      site->mv.row = unit_dirs[dir][0] * step;
+      site->mv.col = unit_dirs[dir][1] * step;
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = next;
+  cfg->searches_per_step = 4;
+}
+
+// Fills cfg with an 8-point search pattern. Site 0 is the origin; after it,
+// for every step length MAX_FIRST_STEP, MAX_FIRST_STEP / 2, ..., 1 come the
+// four axis offsets followed by the four diagonal offsets at that length.
+// Each site also stores the matching buffer offset for the given stride.
+void av1_init3smotion_compensation(search_site_config *cfg, int stride) {
+  // Unit (row, col) directions, scaled by the step length below.
+  static const int unit_dirs[8][2] = { { -1, 0 },  { 1, 0 },  { 0, -1 },
+                                       { 0, 1 },   { -1, -1 }, { -1, 1 },
+                                       { 1, -1 },  { 1, 1 } };
+  int step, dir;
+  int next = 0;
+
+  cfg->ss[next].mv.row = 0;
+  cfg->ss[next].mv.col = 0;
+  cfg->ss[next].offset = 0;
+  ++next;
+
+  for (step = MAX_FIRST_STEP; step > 0; step /= 2) {
+    for (dir = 0; dir < 8; ++dir) {
+      search_site *const site = &cfg->ss[next++];
+      site->mv.row = unit_dirs[dir][0] * step;
+      site->mv.col = unit_dirs[dir][1] * step;
+      site->offset = site->mv.row * stride + site->mv.col;
+    }
+  }
+
+  cfg->ss_count = next;
+  cfg->searches_per_step = 8;
+}
+
+/*
+ * To avoid the penalty for crossing cache-line read, preload the reference
+ * area in a small buffer, which is aligned to make sure there won't be crossing
+ * cache-line read while reading from this buffer. This reduced the cpu
+ * cycles spent on reading ref data in sub-pixel filter functions.
+ * TODO: Currently, since sub-pixel search range here is -3 ~ 3, copy 22 rows x
+ * 32 cols area that is enough for 16x16 macroblock. Later, for SPLITMV, we
+ * could reduce the area.
+ */
+
+// Extracts the sub-pel phase (low three bits, i.e. the 1/8-pel fraction) of a
+// motion vector component, as passed to the sv[a]f variance functions.
+static INLINE int sp(int x) {
+  const int phase_mask = 7;
+  return x & phase_mask;
+}
+
+// Returns the address in buf of the whole-pel position addressed by the
+// 1/8-pel coordinates (r, c): both are shifted down by 3 before indexing.
+static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
+  const int whole_row = r >> 3;
+  const int whole_col = c >> 3;
+  return &buf[whole_row * stride + whole_col];
+}
+
+/* Checks whether (r, c) has a better score than the previous best.
+ *
+ * (r, c) is a candidate position in the same units as br/bc. If it lies
+ * inside [minr, maxr] x [minc, maxc], v is set to the motion vector cost from
+ * mv_err_cost() plus the sub-pixel variance returned by one of vfp->svf,
+ * vfp->msvf, vfp->jsvaf or vfp->svaf (chosen by second_pred, mask and
+ * xd->jcp_param.use_jnt_comp_avg). When v is below besterr the macro updates
+ * besterr, br, bc, *distortion and *sse1. For a candidate outside the window
+ * v is set to INT_MAX and nothing else changes.
+ *
+ * Apart from v, r and c, every name used in the body (minc, maxc, minr, maxr,
+ * ref_mv, mvjcost, mvcost, error_per_bit, second_pred, mask, mask_stride,
+ * invert_mask, vfp, xd, y, y_stride, src_address, src_stride, thismse, sse,
+ * besterr, br, bc, distortion, sse1) is taken from the expansion site.
+ * r and c are expanded several times and are not parenthesised, so callers
+ * should pass simple, side-effect-free expressions.
+ */
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ if (second_pred == NULL) { \
+ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse); \
+ } else if (mask) { \
+ thismse = vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, second_pred, mask, \
+ mask_stride, invert_mask, &sse); \
+ } else { \
+ if (xd->jcp_param.use_jnt_comp_avg) \
+ thismse = vfp->jsvaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred, \
+ &xd->jcp_param); \
+ else \
+ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
+ src_address, src_stride, &sse, second_pred); \
+ } \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c) /* alias so CHECK_BETTER##k resolves for k == 0 */
+
+/* Checks whether (r, c) has a better score than the previous best.
+ *
+ * Same contract as CHECK_BETTER, but the error term comes from
+ * upsampled_pref_error() instead of the vfp sub-pixel variance functions.
+ * In addition to the names CHECK_BETTER needs, the expansion site must
+ * provide cm, mi_row, mi_col, w, h and use_accurate_subpel_search.
+ * This is the variant selected by CHECK_BETTER##k with k == 1.
+ */
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_pref_error( \
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride, \
+ pre(y, y_stride, r, c), y_stride, sp(c), sp(r), second_pred, mask, \
+ mask_stride, invert_mask, w, h, &sse, use_accurate_subpel_search); \
+ v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+ v += thismse; \
+ if (v < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+/*
+ * First refinement pass around (tr, tc) with step hstep.
+ * Scores the left, right, up and down neighbours with CHECK_BETTER, records
+ * in whichdir which horizontal neighbour (bit 0: 0 = left, 1 = right) and
+ * which vertical neighbour (bit 1: 0 = up, 2 = down) scored lower, and then
+ * scores the single diagonal lying between those two. Requires tr, tc, hstep
+ * and whichdir, plus everything CHECK_BETTER needs, at the expansion site.
+ */
+#define FIRST_LEVEL_CHECKS \
+ { \
+ unsigned int left, right, up, down, diag; \
+ CHECK_BETTER(left, tr, tc - hstep); \
+ CHECK_BETTER(right, tr, tc + hstep); \
+ CHECK_BETTER(up, tr - hstep, tc); \
+ CHECK_BETTER(down, tr + hstep, tc); \
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); \
+ switch (whichdir) { \
+ case 0: CHECK_BETTER(diag, tr - hstep, tc - hstep); break; \
+ case 1: CHECK_BETTER(diag, tr - hstep, tc + hstep); break; \
+ case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
+ case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
+ } \
+ }
+/*
+ * Second refinement pass, run after FIRST_LEVEL_CHECKS. It looks at how the
+ * best position (br, bc) moved away from the pass centre (tr, tc):
+ *  - both coordinates changed: two points one step further along each axis
+ *    of the diagonal move are scored;
+ *  - only the column changed: the two points at tc + 2 * kc, one step above
+ *    and below, are scored, plus one point at tc + kc chosen by whichdir;
+ *  - only the row changed: the mirrored set of points is scored.
+ * Nothing is scored when the best position did not move. kr and kc are
+ * computed once, before the first CHECK_BETTER of the branch, so later
+ * updates of br/bc inside the branch do not change the probed points.
+ */
+#define SECOND_LEVEL_CHECKS \
+ { \
+ int kr, kc; \
+ unsigned int second; \
+ if (tr != br && tc != bc) { \
+ kr = br - tr; \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + kr, tc + 2 * kc); \
+ CHECK_BETTER(second, tr + 2 * kr, tc + kc); \
+ } else if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ CHECK_BETTER(second, tr + hstep, tc + 2 * kc); \
+ CHECK_BETTER(second, tr - hstep, tc + 2 * kc); \
+ switch (whichdir) { \
+ case 0: \
+ case 1: CHECK_BETTER(second, tr + hstep, tc + kc); break; \
+ case 2: \
+ case 3: CHECK_BETTER(second, tr - hstep, tc + kc); break; \
+ } \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ CHECK_BETTER(second, tr + 2 * kr, tc + hstep); \
+ CHECK_BETTER(second, tr + 2 * kr, tc - hstep); \
+ switch (whichdir) { \
+ case 0: \
+ case 2: CHECK_BETTER(second, tr + kr, tc + hstep); break; \
+ case 1: \
+ case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
+ } \
+ } \
+ }
+
+// TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST is a rewrite of
+// SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
+/* later in the same way.
+ *
+ * Second refinement pass centred on the current best position. k selects the
+ * scoring macro by token pasting (CHECK_BETTER0 or CHECK_BETTER1).
+ * kr and kc are NOT declared here: they come from the expansion site and are
+ * only overwritten when the best position differs from (tr, tc) in exactly
+ * one coordinate. The best position is snapshotted as (br0, bc0), the
+ * vertical and horizontal neighbours (br0 + kr, bc0) and (br0, bc0 + kc) are
+ * scored, and the diagonal (br0 + kr, bc0 + kc) is scored only if one of
+ * those two improved on the snapshot. The assert documents the precondition
+ * that (tr, tc) shares at least one coordinate with the best position.
+ */
+#define SECOND_LEVEL_CHECKS_BEST(k) \
+ { \
+ unsigned int second; \
+ int br0 = br; \
+ int bc0 = bc; \
+ assert(tr == br || tc == bc); \
+ if (tr == br && tc != bc) { \
+ kc = bc - tc; \
+ } else if (tr != br && tc == bc) { \
+ kr = br - tr; \
+ } \
+ CHECK_BETTER##k(second, br0 + kr, bc0); \
+ CHECK_BETTER##k(second, br0, bc0 + kc); \
+ if (br0 != br || bc0 != bc) { \
+ CHECK_BETTER##k(second, br0 + kr, bc0 + kc); \
+ } \
+ }
+/*
+ * Declares and initialises the locals shared by the pruned sub-pel search
+ * functions and used by the CHECK_BETTER family. It must be the first thing
+ * in the function body because it mixes declarations with statements.
+ * Reads x, ref_mv and iters_per_step from the expansion site.
+ *  - offset is computed from the full-pel bestmv BEFORE bestmv is rescaled;
+ *  - br/bc and tr/tc start at the full-pel bestmv multiplied by 8;
+ *  - hstep starts at 4;
+ *  - halfiters, quarteriters and eighthiters are all iters_per_step;
+ *  - the search window comes from set_subpel_mv_search_range();
+ *  - finally bestmv itself (x->best_mv.as_mv) is multiplied by 8 in place.
+ */
+#define SETUP_SUBPEL_SEARCH \
+ const uint8_t *const src_address = x->plane[0].src.buf; \
+ const int src_stride = x->plane[0].src.stride; \
+ const MACROBLOCKD *xd = &x->e_mbd; \
+ unsigned int besterr = INT_MAX; \
+ unsigned int sse; \
+ unsigned int whichdir; \
+ int thismse; \
+ MV *bestmv = &x->best_mv.as_mv; \
+ const unsigned int halfiters = iters_per_step; \
+ const unsigned int quarteriters = iters_per_step; \
+ const unsigned int eighthiters = iters_per_step; \
+ const int y_stride = xd->plane[0].pre[0].stride; \
+ const int offset = bestmv->row * y_stride + bestmv->col; \
+ const uint8_t *const y = xd->plane[0].pre[0].buf; \
+ \
+ int br = bestmv->row * 8; \
+ int bc = bestmv->col * 8; \
+ int hstep = 4; \
+ int minc, maxc, minr, maxr; \
+ int tr = br; \
+ int tc = bc; \
+ \
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, \
+ ref_mv); \
+ \
+ bestmv->row *= 8; \
+ bestmv->col *= 8;
+
+static unsigned int setup_center_error(
+ const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+ int error_per_bit, const aom_variance_fn_ptr_t *vfp,
+ const uint8_t *const src, const int src_stride, const uint8_t *const y,
+ int y_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h, int offset, int *mvjcost,
+ int *mvcost[2], unsigned int *sse1, int *distortion) {
+ unsigned int besterr;
+ if (second_pred != NULL) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
+ uint8_t *comp_pred = CONVERT_TO_BYTEPTR(comp_pred16);
+ if (mask) {
+ aom_highbd_comp_mask_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, mask, mask_stride, invert_mask);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_highbd_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
+ else
+ aom_highbd_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ } else {
+ DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
+ if (mask) {
+ aom_comp_mask_pred(comp_pred, second_pred, w, h, y + offset, y_stride,
+ mask, mask_stride, invert_mask);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ aom_jnt_comp_avg_pred(comp_pred, second_pred, w, h, y + offset,
+ y_stride, &xd->jcp_param);
+ else
+ aom_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+ }
+ besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
+ }
+ } else {
+ besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
+ }
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+ return besterr;
+}
+
+// Integer division n / d rounded to the nearest integer; half of d is added
+// to or subtracted from n first, depending on whether the signs differ.
+static INLINE int divide_and_round(int n, int d) {
+  const int half = d / 2;
+  const int signs_differ = (n < 0) != (d < 0);
+  if (signs_differ) return (n - half) / d;
+  return (n + half) / d;
+}
+
+// Returns 1 when the centre cost (cost_list[0]) is strictly smaller than each
+// of the four neighbour costs (cost_list[1..4]), and 0 otherwise.
+static INLINE int is_cost_list_wellbehaved(int *cost_list) {
+  int i;
+  for (i = 1; i <= 4; ++i) {
+    if (cost_list[0] >= cost_list[i]) return 0;
+  }
+  return 1;
+}
+
+// Estimates where the cost surface has its minimum, in units of 1/2^bits.
+// The surface is modelled as S = A(x - x0)^2 + B(y - y0)^2 + C. With costs
+// S0..S4 sampled at (y, x) = (0, 0), (0, -1), (1, 0), (0, 1) and (-1, 0),
+// the minimum lies at
+//   x0 = 1/2 (S1 - S3) / (S1 + S3 - 2*S0),
+//   y0 = 1/2 (S4 - S2) / (S4 + S2 - 2*S0).
+// This is the integer version: *ic receives x0 and *ir receives y0.
+// The denominators are non-zero whenever is_cost_list_wellbehaved() holds.
+static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
+  const int scale = 1 << (bits - 1);
+  int num, den;
+
+  num = (cost_list[1] - cost_list[3]) * scale;
+  den = cost_list[1] - 2 * cost_list[0] + cost_list[3];
+  *ic = divide_and_round(num, den);
+
+  num = (cost_list[4] - cost_list[2]) * scale;
+  den = cost_list[4] - 2 * cost_list[0] + cost_list[2];
+  *ir = divide_and_round(num, den);
+}
+/*
+ * Sub-pixel motion vector refinement, most aggressively pruned variant.
+ *
+ * On entry x->best_mv.as_mv holds a full-pel vector; SETUP_SUBPEL_SEARCH
+ * rescales it to 1/8-pel units and on exit it holds the refined vector.
+ * Returns the best total cost (error plus motion vector cost) and stores the
+ * matching error in *distortion and SSE in *sse1.
+ *
+ * If cost_list has five entries that are all different from INT_MAX and the
+ * centre is the strict minimum, a single point predicted by
+ * get_cost_surf_min() (quarter-pel resolution) is scored. Otherwise a
+ * half-pel pass is run and, unless forced_stop == 2, a quarter-pel pass.
+ * An extra pass runs afterwards when allow_hp is set and forced_stop == 0.
+ * iters_per_step > 1 enables SECOND_LEVEL_CHECKS after each pass.
+ *
+ * cm, mi_row, mi_col and use_accurate_subpel_search are unused.
+ * NOTE(review): on the cost_list path hstep is still 4 when the final pass
+ * halves it, so that pass steps by 2 rather than 1 -- confirm this is
+ * intended.
+ */
+int av1_find_best_sub_pixel_tree_pruned_evenmore(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ (void)halfiters;
+ (void)quarteriters;
+ (void)eighthiters;
+ (void)whichdir;
+ (void)allow_hp;
+ (void)forced_stop;
+ (void)hstep;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ int ir, ic;
+ unsigned int minpt;
+ get_cost_surf_min(cost_list, &ir, &ic, 2); // ir/ic in quarter-pel units
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); // x2: quarter -> 1/8 pel
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each later pass is centred on the best point of the previous pass, so
+ // it shares at least one scored point with it (two if a diagonal won).
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+/*
+ * Sub-pixel motion vector refinement, "pruned more" variant.
+ *
+ * On entry x->best_mv.as_mv holds a full-pel vector; SETUP_SUBPEL_SEARCH
+ * rescales it to 1/8-pel units and on exit it holds the refined vector.
+ * Returns the best total cost (error plus motion vector cost) and stores the
+ * matching error in *distortion and SSE in *sse1.
+ *
+ * Half-pel stage: if cost_list has five entries different from INT_MAX with
+ * the centre as strict minimum, only the point predicted by
+ * get_cost_surf_min() (half-pel resolution) is scored; otherwise the regular
+ * FIRST_LEVEL_CHECKS / SECOND_LEVEL_CHECKS pass runs. A quarter-pel pass
+ * follows unless forced_stop == 2, and an eighth-pel pass when allow_hp is
+ * set and forced_stop == 0. iters_per_step > 1 enables the second-level
+ * checks. cm, mi_row, mi_col and use_accurate_subpel_search are unused.
+ */
+int av1_find_best_sub_pixel_tree_pruned_more(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
+ unsigned int minpt;
+ int ir, ic;
+ get_cost_surf_min(cost_list, &ir, &ic, 1); // ir/ic in half-pel units
+ if (ir != 0 || ic != 0) {
+ CHECK_BETTER(minpt, tr + ir * hstep, tc + ic * hstep); // hstep == 4 here
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ // Each later pass is centred on the best point of the previous pass, so it
+ // shares at least one scored point with it (two if a diagonal won).
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ tr = br;
+ tc = bc;
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+/*
+ * Sub-pixel motion vector refinement, "pruned" variant.
+ *
+ * On entry x->best_mv.as_mv holds a full-pel vector; SETUP_SUBPEL_SEARCH
+ * rescales it to 1/8-pel units and on exit it holds the refined vector.
+ * Returns the best total cost (error plus motion vector cost) and stores the
+ * matching error in *distortion and SSE in *sse1.
+ *
+ * Half-pel stage: if cost_list has five entries different from INT_MAX, the
+ * cheaper horizontal and the cheaper vertical full-pel neighbour select one
+ * quadrant, and only the three half-pel points of that quadrant are scored
+ * (no second-level checks on this path). Otherwise the regular
+ * FIRST_LEVEL_CHECKS / SECOND_LEVEL_CHECKS pass runs. A quarter-pel pass
+ * follows unless forced_stop == 2, and an eighth-pel pass when allow_hp is
+ * set and forced_stop == 0. cm, mi_row, mi_col and
+ * use_accurate_subpel_search are unused.
+ */
+int av1_find_best_sub_pixel_tree_pruned(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ SETUP_SUBPEL_SEARCH;
+ (void)use_accurate_subpel_search;
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+ if (cost_list && cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
+ cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
+ cost_list[4] != INT_MAX) {
+ unsigned int left, right, up, down, diag;
+ whichdir = (cost_list[1] < cost_list[3] ? 0 : 1) + // bit 0 set: col + 1 is not worse than col - 1
+ (cost_list[2] < cost_list[4] ? 0 : 2); // bit 1 set: row - 1 is not worse than row + 1
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(down, tr + hstep, tc);
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ }
+ } else {
+ FIRST_LEVEL_CHECKS;
+ if (halfiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ // Each later pass is centred on the best point of the previous pass, so it
+ // shares at least one scored point with it (two if a diagonal won).
+
+ // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+ if (forced_stop != 2) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (quarteriters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+
+ if (allow_hp && forced_stop == 0) {
+ hstep >>= 1;
+ FIRST_LEVEL_CHECKS;
+ if (eighthiters > 1) {
+ SECOND_LEVEL_CHECKS;
+ }
+ tr = br;
+ tc = bc;
+ }
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+/* clang-format off */
+static const MV search_step_table[12] = {
+ /* Each entry is a { row, col } offset in 1/8-pel units; each line lists
+  * left, right, up, down for one step size (4, then 2, then 1).
+  * av1_find_best_sub_pixel_tree() advances through the table four entries
+  * at a time, one line per refinement stage. */
+ { 0, -4 }, { 0, 4 }, { -4, 0 }, { 4, 0 },
+ { 0, -2 }, { 0, 2 }, { -2, 0 }, { 2, 0 },
+ { 0, -1 }, { 0, 1 }, { -1, 0 }, { 1, 0 }
+};
+/* clang-format on */
+
+// Builds the upsampled prediction for the sub-pel position
+// (subpel_x_q3, subpel_y_q3) relative to y into a scratch buffer and returns
+// its variance (vfp->vf) against the source block; *sse receives the SSE.
+// The prediction routine is chosen by bit depth (YV12_FLAG_HIGHBITDEPTH) and
+// by the compound mode: none (second_pred == NULL), masked, joint weighted
+// average (xd->jcp_param.use_jnt_comp_avg) or plain average.
+static int upsampled_pref_error(MACROBLOCKD *xd, const AV1_COMMON *const cm,
+                                int mi_row, int mi_col, const MV *const mv,
+                                const aom_variance_fn_ptr_t *vfp,
+                                const uint8_t *const src, const int src_stride,
+                                const uint8_t *const y, int y_stride,
+                                int subpel_x_q3, int subpel_y_q3,
+                                const uint8_t *second_pred, const uint8_t *mask,
+                                int mask_stride, int invert_mask, int w, int h,
+                                unsigned int *sse, int subpel_search) {
+  const int is_compound = (second_pred != NULL);
+  unsigned int pred_err;
+
+  if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    if (!is_compound) {
+      aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                         subpel_y_q3, y, y_stride, subpel_search);
+    } else if (mask) {
+      aom_comp_mask_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+                                   second_pred, w, h, subpel_x_q3,
+                                   subpel_y_q3, y, y_stride, mask,
+                                   mask_stride, invert_mask, subpel_search);
+    } else if (xd->jcp_param.use_jnt_comp_avg) {
+      aom_jnt_comp_avg_upsampled_pred(
+          xd, cm, mi_row, mi_col, mv, pred, second_pred, w, h, subpel_x_q3,
+          subpel_y_q3, y, y_stride, &xd->jcp_param, subpel_search);
+    } else {
+      aom_comp_avg_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred,
+                                  second_pred, w, h, subpel_x_q3,
+                                  subpel_y_q3, y, y_stride, subpel_search);
+    }
+
+    pred_err = vfp->vf(pred, w, src, src_stride, sse);
+  } else {
+    // High bit depth: predict into a 16-bit buffer handed around as a byte
+    // pointer.
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    uint8_t *pred8 = CONVERT_TO_BYTEPTR(pred16);
+    if (!is_compound) {
+      aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred8, w, h,
+                                subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                                subpel_search);
+    } else if (mask) {
+      aom_highbd_comp_mask_upsampled_pred(
+          xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+          subpel_y_q3, y, y_stride, mask, mask_stride, invert_mask, xd->bd,
+          subpel_search);
+    } else if (xd->jcp_param.use_jnt_comp_avg) {
+      aom_highbd_jnt_comp_avg_upsampled_pred(
+          xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+          subpel_y_q3, y, y_stride, xd->bd, &xd->jcp_param, subpel_search);
+    } else {
+      aom_highbd_comp_avg_upsampled_pred(
+          xd, cm, mi_row, mi_col, mv, pred8, second_pred, w, h, subpel_x_q3,
+          subpel_y_q3, y, y_stride, xd->bd, subpel_search);
+    }
+    pred_err = vfp->vf(pred8, w, src, src_stride, sse);
+  }
+  return pred_err;
+}
+
+// Scores the search centre with upsampled_pref_error() at sub-pel phase
+// (0, 0). *distortion receives the prediction error alone; the return value
+// adds the cost of bestmv relative to ref_mv.
+static unsigned int upsampled_setup_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+    const int src_stride, const uint8_t *const y, int y_stride,
+    const uint8_t *second_pred, const uint8_t *mask, int mask_stride,
+    int invert_mask, int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion, int subpel_search) {
+  const uint8_t *const center = y + offset;
+  unsigned int center_err = upsampled_pref_error(
+      xd, cm, mi_row, mi_col, bestmv, vfp, src, src_stride, center, y_stride,
+      0, 0, second_pred, mask, mask_stride, invert_mask, w, h, sse1,
+      subpel_search);
+
+  *distortion = center_err;
+  center_err += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return center_err;
+}
+
+// when use_accurate_subpel_search == 0
+static INLINE unsigned int estimate_upsampled_pref_error(
+ MACROBLOCKD *xd, const aom_variance_fn_ptr_t *vfp, const uint8_t *const src,
+ const int src_stride, const uint8_t *const pre, int y_stride,
+ int subpel_x_q3, int subpel_y_q3, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride, int invert_mask, unsigned int *sse) {
+ if (second_pred == NULL) {
+ return vfp->svf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse);
+ } else if (mask) {
+ return vfp->msvf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ second_pred, mask, mask_stride, invert_mask, sse);
+ } else {
+ if (xd->jcp_param.use_jnt_comp_avg)
+ return vfp->jsvaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src,
+ src_stride, sse, second_pred, &xd->jcp_param);
+ else
+ return vfp->svaf(pre, y_stride, subpel_x_q3, subpel_y_q3, src, src_stride,
+ sse, second_pred);
+ }
+}
+/*
+ * Sub-pixel motion vector refinement by staged neighbour search.
+ *
+ * On entry x->best_mv.as_mv holds a full-pel vector; it is rescaled to
+ * 1/8-pel units and on exit holds the refined vector. The search runs
+ * (3 - forced_stop) stages, reduced from 3 to 2 when allow_hp is 0, with
+ * step sizes 4, 2 and 1. Each stage scores the four axis neighbours of the
+ * current best position, then the one diagonal lying between the cheaper
+ * horizontal and the cheaper vertical neighbour, moves to the winner, and --
+ * when iters_per_step > 1 and something improved -- runs
+ * SECOND_LEVEL_CHECKS_BEST.
+ *
+ * The candidate error comes from upsampled_pref_error() when
+ * use_accurate_subpel_search is non-zero and from
+ * estimate_upsampled_pref_error() otherwise; the motion vector cost from
+ * mv_err_cost() is added in both cases. Candidates outside the window
+ * computed by set_subpel_mv_search_range() are not scored.
+ *
+ * Returns the best total cost; *distortion and *sse1 receive the error and
+ * SSE of the winning position. cost_list is unused.
+ */
+int av1_find_best_sub_pixel_tree(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2], int *distortion,
+ unsigned int *sse1, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ const uint8_t *const src_address = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ MV *bestmv = &x->best_mv.as_mv;
+ const int offset = bestmv->row * y_stride + bestmv->col; // uses the full-pel vector, before rescaling
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+
+ int br = bestmv->row * 8;
+ int bc = bestmv->col * 8;
+ int hstep = 4;
+ int iter, round = 3 - forced_stop; // round: number of refinement stages
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5]; // [0..3]: left, right, up, down; [4]: diagonal
+ int kr, kc;
+ int minc, maxc, minr, maxr;
+
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
+
+ if (!allow_hp)
+ if (round == 3) round = 2; // without high precision, skip the step-1 stage
+
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+
+ if (use_accurate_subpel_search)
+ besterr = upsampled_setup_center_error(
+ xd, cm, mi_row, mi_col, bestmv, ref_mv, error_per_bit, vfp, src_address,
+ src_stride, y, y_stride, second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
+ else
+ besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+ src_address, src_stride, y, y_stride,
+ second_pred, mask, mask_stride, invert_mask, w,
+ h, offset, mvjcost, mvcost, sse1, distortion);
+
+ (void)cost_list; // to silence compiler warning
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = estimate_upsampled_pref_error(
+ xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+ y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+ invert_mask, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, vfp, src_address, src_stride,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), second_pred,
+ mask, mask_stride, invert_mask, w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = estimate_upsampled_pref_error(
+ xd, vfp, src_address, src_stride, pre(y, y_stride, tr, tc),
+ y_stride, sp(tc), sp(tr), second_pred, mask, mask_stride,
+ invert_mask, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX; // idx == 4 here (the loop above has finished): the diagonal slot
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr; // (tr, tc) still holds the diagonal candidate
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_accurate_subpel_search) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ search_step += 4; // next four table entries: the next, smaller step size
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef PRE
+#undef CHECK_BETTER
+
+// Builds the luma inter prediction for the current block into the plane 0
+// destination buffer and returns its variance against the source, plus the
+// cost of this_mv relative to the block's reference motion vector.
+unsigned int av1_compute_motion_cost(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                     BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                     const MV *this_mv) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct buf_2d *const src_plane = &x->plane[0].src;
+  uint8_t *const pred_buf = xd->plane[0].dst.buf;
+  const int pred_stride = xd->plane[0].dst.stride;
+  const aom_variance_fn_ptr_t *vfp = &cpi->fn_ptr[bsize];
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+  unsigned int variance;
+  unsigned int sse;
+
+  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+  variance =
+      vfp->vf(pred_buf, pred_stride, src_plane->buf, src_plane->stride, &sse);
+  return variance + mv_err_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
+                                x->mvcost, x->errorperbit);
+}
+
+// Refine MV in a small range.
+// Runs up to two passes over the four axis neighbours of the block's current
+// motion vector (offset 1 when cm->allow_high_precision_mv is set, offset 2
+// otherwise). For each in-range neighbour the warp model is re-fitted with
+// find_projection() and, if that succeeds, the candidate is scored with
+// av1_compute_motion_cost(). The block's mv, wm_params and num_proj_ref are
+// modified while candidates are tried and are set to the best combination
+// found before returning. Returns the best score.
+unsigned int av1_refine_warped_mv(const AV1_COMP *cpi, MACROBLOCK *const x,
+                                  BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                  int *pts0, int *pts_inref0,
+                                  int total_samples) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const MV neighbors[8] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 },
+                            { 0, -2 }, { 2, 0 }, { 0, 2 }, { -2, 0 } };
+  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+  // The candidate is written straight into the block's mode info because the
+  // cost computation builds its prediction from there.
+  MV *const cur_mv = &mbmi->mv[0].as_mv;
+  int16_t best_row = cur_mv->row;
+  int16_t best_col = cur_mv->col;
+  WarpedMotionParams best_wm_params = mbmi->wm_params;
+  int best_num_proj_ref = mbmi->num_proj_ref;
+  unsigned int bestmse;
+  int minc, maxc, minr, maxr;
+  const int first = cm->allow_high_precision_mv ? 0 : 4;
+  int pass;
+
+  set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr,
+                             &ref_mv.as_mv);
+
+  // Calculate the center position's error
+  assert(best_col >= minc && best_col <= maxc && best_row >= minr &&
+         best_row <= maxr);
+  bestmse = av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, cur_mv);
+
+  // MV search
+  for (pass = 0; pass < 2; ++pass) {
+    int winner = -1;
+    int idx;
+
+    for (idx = first; idx < first + 4; ++idx) {
+      int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+      MV this_mv;
+      unsigned int thismse;
+
+      cur_mv->row = best_row + neighbors[idx].row;
+      cur_mv->col = best_col + neighbors[idx].col;
+
+      if (cur_mv->col < minc || cur_mv->col > maxc || cur_mv->row < minr ||
+          cur_mv->row > maxr) {
+        continue;
+      }
+
+      this_mv = *cur_mv;
+
+      // Work on copies: sample selection reorders the arrays in place.
+      memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));
+      memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+      if (total_samples > 1)
+        mbmi->num_proj_ref =
+            selectSamples(&this_mv, pts, pts_inref, total_samples, bsize);
+
+      // A non-zero return means no usable model for this candidate.
+      if (find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+                          cur_mv->row, cur_mv->col, &mbmi->wm_params, mi_row,
+                          mi_col)) {
+        continue;
+      }
+
+      thismse =
+          av1_compute_motion_cost(cpi, x, bsize, mi_row, mi_col, &this_mv);
+
+      if (thismse < bestmse) {
+        winner = idx;
+        best_wm_params = mbmi->wm_params;
+        best_num_proj_ref = mbmi->num_proj_ref;
+        bestmse = thismse;
+      }
+    }
+
+    // No neighbour improved on the centre: stop early.
+    if (winner < 0) break;
+
+    best_row += neighbors[winner].row;
+    best_col += neighbors[winner].col;
+  }
+
+  cur_mv->row = best_row;
+  cur_mv->col = best_col;
+  mbmi->wm_params = best_wm_params;
+  mbmi->num_proj_ref = best_num_proj_ref;
+  return bestmse;
+}
+
+// Returns 1 when (row, col) can be moved by `range` in every direction and
+// still lie within mv_limits (limits are inclusive), 0 otherwise.
+static INLINE int check_bounds(const MvLimits *mv_limits, int row, int col,
+                               int range) {
+  const int min_row_ok = (row - range) >= mv_limits->row_min;
+  const int max_row_ok = (row + range) <= mv_limits->row_max;
+  const int min_col_ok = (col - range) >= mv_limits->col_min;
+  const int max_col_ok = (col + range) <= mv_limits->col_max;
+  return min_row_ok & max_row_ok & min_col_ok & max_col_ok;
+}
+
+// Returns non-zero when mv lies inside mv_limits (limits are inclusive).
+static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) {
+  const int col_inside =
+      (mv->col >= mv_limits->col_min) && (mv->col <= mv_limits->col_max);
+  const int row_inside =
+      (mv->row >= mv_limits->row_min) && (mv->row <= mv_limits->row_max);
+  return col_inside && row_inside;
+}
+
+// Candidate-acceptance step shared by the pattern searches below.
+//
+// The macro takes no arguments; it relies on these names being in scope at
+// the expansion site: thissad, bestsad, best_site, i, this_mv, fcenter_mv,
+// use_mvcost, x and sad_per_bit.
+//
+// If thissad already beats bestsad, the MV cost is added (only when
+// use_mvcost is set) and the comparison is repeated; on success bestsad is
+// updated and best_site is set to the current loop index i.
+//
+// It expands to a braced block and is written without a trailing semicolon
+// at its call sites.
+#define CHECK_BETTER \
+ { \
+ if (thissad < bestsad) { \
+ if (use_mvcost) \
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); \
+ if (thissad < bestsad) { \
+ bestsad = thissad; \
+ best_site = i; \
+ } \
+ } \
+ }
+
+// Dimensions of the candidate tables handed to pattern_search().
+#define MAX_PATTERN_SCALES 11
+#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale
+#define PATTERN_CANDIDATES_REF 3 // number of refinement candidates
+
+// Fills cost_list[0..4] with variance-based costs around an integer best pel.
+//
+// cost_list[0] is the variance at best_mv plus mvsad_err_cost(). Entries
+// 1..4 cover the one-away neighbours in the order left, bottom, right, top
+// and hold the variance plus mv_err_cost(). A neighbour that lies outside
+// x->mv_limits gets INT_MAX instead.
+static INLINE void calc_int_cost_list(const MACROBLOCK *x,
+                                      const MV *const ref_mv, int sadpb,
+                                      const aom_variance_fn_ptr_t *fn_ptr,
+                                      const MV *best_mv, int *cost_list) {
+  static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+  const MV center = { br, bc };
+  unsigned int sse;
+
+  cost_list[0] = fn_ptr->vf(src->buf, src->stride,
+                            get_buf_from_mv(ref, &center), ref->stride, &sse) +
+                 mvsad_err_cost(x, &center, &fcenter_mv, sadpb);
+
+  // If the whole one-away cross fits inside the limits, the per-neighbour
+  // range test can be skipped.
+  const int all_in = check_bounds(&x->mv_limits, br, bc, 1);
+
+  for (int n = 0; n < 4; n++) {
+    const MV neighbor_mv = { br + neighbors[n].row, bc + neighbors[n].col };
+
+    if (!all_in && !is_mv_in(&x->mv_limits, &neighbor_mv)) {
+      cost_list[n + 1] = INT_MAX;
+      continue;
+    }
+
+    cost_list[n + 1] =
+        fn_ptr->vf(src->buf, src->stride, get_buf_from_mv(ref, &neighbor_mv),
+                   ref->stride, &sse) +
+        mv_err_cost(&neighbor_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
+                    x->errorperbit);
+  }
+}
+
+// Completes the SAD list around an integer best pel.
+//
+// cost_list[0] == INT_MAX is taken to mean the list is still empty: entry 0
+// is then set to bestsad and entries 1..4 to the raw SAD of the one-away
+// neighbours (left, bottom, right, top), or INT_MAX for neighbours outside
+// x->mv_limits. No MV cost is added in that case.
+//
+// Otherwise the list is assumed to be populated already and, when
+// use_mvcost is set, mvsad_err_cost() is added to every neighbour entry
+// that is not INT_MAX.
+static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+                                     const MV *const ref_mv, int sadpb,
+                                     const aom_variance_fn_ptr_t *fn_ptr,
+                                     const MV *best_mv, int *cost_list,
+                                     const int use_mvcost, const int bestsad) {
+  static const MV neighbors[4] = { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } };
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+
+  if (cost_list[0] != INT_MAX) {
+    // The SADs are already present; only the MV cost may need adding.
+    if (!use_mvcost) return;
+
+    for (int n = 0; n < 4; n++) {
+      const MV this_mv = { br + neighbors[n].row, bc + neighbors[n].col };
+      if (cost_list[n + 1] == INT_MAX) continue;
+      cost_list[n + 1] += mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+    }
+    return;
+  }
+
+  cost_list[0] = bestsad;
+
+  // If the whole one-away cross fits inside the limits, the per-neighbour
+  // range test can be skipped.
+  const int all_in = check_bounds(&x->mv_limits, br, bc, 1);
+
+  for (int n = 0; n < 4; n++) {
+    const MV this_mv = { br + neighbors[n].row, bc + neighbors[n].col };
+
+    if (!all_in && !is_mv_in(&x->mv_limits, &this_mv)) {
+      cost_list[n + 1] = INT_MAX;
+      continue;
+    }
+
+    cost_list[n + 1] = fn_ptr->sdf(src->buf, src->stride,
+                                   get_buf_from_mv(ref, &this_mv), ref->stride);
+  }
+}
+
+// Generic pattern search function that searches over multiple scales.
+// Each scale can have a different number of candidates and shape of
+// candidates as indicated in the num_candidates and candidates arrays
+// passed into this function
+//
+// Parameters:
+//   start_mv       search start point; it is clamped in place to
+//                  x->mv_limits before use.
+//   search_param   index into search_param_to_steps[], selecting the largest
+//                  scale to use; must be < MAX_MVSEARCH_STEPS.
+//   sad_per_bit    weight passed to mvsad_err_cost().
+//   do_init_search when non-zero, every scale from 0 up to the selected one
+//                  is first tried around the start point and the scale of
+//                  the best point becomes the starting scale.
+//   cost_list      optional 5-entry output (centre + 4 one-away neighbours);
+//                  may be NULL.
+//   use_mvcost     when non-zero, CHECK_BETTER adds the MV cost before
+//                  accepting a candidate.
+//   center_mv      MV the cost is measured against; shifted right by 3
+//                  before use.
+//
+// The best position found is stored in x->best_mv.as_mv and its SAD-based
+// cost (bestsad) is returned.
+static int pattern_search(
+ MACROBLOCK *x, MV *start_mv, int search_param, int sad_per_bit,
+ int do_init_search, int *cost_list, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost, const MV *center_mv,
+ const int num_candidates[MAX_PATTERN_SCALES],
+ const MV candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES]) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
+ 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ };
+ int i, s, t;
+ const struct buf_2d *const what = &x->plane[0].src;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+ // True when the scale-0 pattern is the 4-point cross, in which case the
+ // SADs computed at scale 0 can be reused directly as the cost list.
+ const int last_is_4 = num_candidates[0] == 4;
+ int br, bc;
+ int bestsad = INT_MAX;
+ int thissad;
+ // Index (within the current scale) of the most recently accepted candidate.
+ int k = -1;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ assert(search_param < MAX_MVSEARCH_STEPS);
+ int best_init_s = search_param_to_steps[search_param];
+ // adjust ref_mv to make sure it is within MV range
+ clamp_mv(start_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ br = start_mv->row;
+ bc = start_mv->col;
+ if (cost_list != NULL) {
+ cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
+ INT_MAX;
+ }
+
+ // Work out the start point for the search
+ bestsad = vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, start_mv), in_what->stride) +
+ mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
+
+ // Search all possible scales upto the search param around the center point
+ // pick the scale of the point that is best as the starting scale of
+ // further steps around it.
+ if (do_init_search) {
+ s = best_init_s;
+ best_init_s = -1;
+ for (t = 0; t <= s; ++t) {
+ int best_site = -1;
+ // When the whole pattern at this scale fits inside the limits the
+ // per-candidate range check is skipped.
+ if (check_bounds(&x->mv_limits, br, bc, 1 << t)) {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[t]; i++) {
+ const MV this_mv = { br + candidates[t][i].row,
+ bc + candidates[t][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+ if (best_site == -1) {
+ continue;
+ } else {
+ best_init_s = t;
+ k = best_site;
+ }
+ }
+ // Move to the best point found; best_init_s stays -1 if the start point
+ // was never beaten.
+ if (best_init_s != -1) {
+ br += candidates[best_init_s][k].row;
+ bc += candidates[best_init_s][k].col;
+ }
+ }
+
+ // If the center point is still the best, just skip this and move to
+ // the refinement step.
+ if (best_init_s != -1) {
+ // With a 4-point scale-0 pattern and a cost list requested, scale 0 is
+ // left to the dedicated block further down, which also records the SADs.
+ const int last_s = (last_is_4 && cost_list != NULL);
+ int best_site = -1;
+ s = best_init_s;
+
+ for (; s >= last_s; s--) {
+ // No need to search all points the 1st time if initial search was used
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site == -1) {
+ continue;
+ } else {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+
+ // Keep stepping at this scale, testing only the last accepted direction
+ // k and its two neighbours in the candidate ring (with wrap-around),
+ // until no further improvement is found.
+ do {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ } while (best_site != -1);
+ }
+
+ // Note: If we enter the if below, then cost_list must be non-NULL.
+ // (The loop above exits with s == last_s - 1, so s == 0 here only when
+ // last_s was 1.)
+ if (s == 0) {
+ cost_list[0] = bestsad;
+ if (!do_init_search || s != best_init_s) {
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < num_candidates[s]; i++) {
+ const MV this_mv = { br + candidates[s][i].row,
+ bc + candidates[s][i].col };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) continue;
+ cost_list[i + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ br += candidates[s][best_site].row;
+ bc += candidates[s][best_site].col;
+ k = best_site;
+ }
+ }
+ while (best_site != -1) {
+ int next_chkpts_indices[PATTERN_CANDIDATES_REF];
+ best_site = -1;
+ next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
+ next_chkpts_indices[1] = k;
+ next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
+ // The centre has moved in direction k: the previous centre is now the
+ // neighbour in the opposite direction, so its value is kept there and
+ // the other three entries are recomputed below.
+ cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] = INT_MAX;
+ cost_list[((k + 2) % 4) + 1] = cost_list[0];
+ cost_list[0] = bestsad;
+
+ if (check_bounds(&x->mv_limits, br, bc, 1 << s)) {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ } else {
+ for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
+ const MV this_mv = {
+ br + candidates[s][next_chkpts_indices[i]].row,
+ bc + candidates[s][next_chkpts_indices[i]].col
+ };
+ if (!is_mv_in(&x->mv_limits, &this_mv)) {
+ cost_list[next_chkpts_indices[i] + 1] = INT_MAX;
+ continue;
+ }
+ cost_list[next_chkpts_indices[i] + 1] = thissad =
+ vfp->sdf(what->buf, what->stride,
+ get_buf_from_mv(in_what, &this_mv), in_what->stride);
+ CHECK_BETTER
+ }
+ }
+
+ if (best_site != -1) {
+ k = next_chkpts_indices[best_site];
+ br += candidates[s][k].row;
+ bc += candidates[s][k].col;
+ }
+ }
+ }
+ }
+
+ // Returns the one-away integer pel cost/sad around the best as follows:
+ // cost_list[0]: cost/sad at the best integer pel
+ // cost_list[1]: cost/sad at delta {0, -1} (left) from the best integer pel
+ // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+ // cost_list[3]: cost/sad at delta { 0, 1} (right) from the best integer pel
+ // cost_list[4]: cost/sad at delta {-1, 0} (top) from the best integer pel
+ if (cost_list) {
+ const MV best_int_mv = { br, bc };
+ if (last_is_4) {
+ calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_int_mv, cost_list,
+ use_mvcost, bestsad);
+ } else {
+ calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_int_mv,
+ cost_list);
+ }
+ }
+ x->best_mv.as_mv.row = br;
+ x->best_mv.as_mv.col = bc;
+ return bestsad;
+}
+
+// Returns the variance between the source block and the reference block
+// addressed by best_mv, plus the MV rate cost relative to center_mv when
+// use_mvcost is non-zero.
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+                       const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+                       int use_mvcost) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  // mv_err_cost() is given best_mv scaled by 8 (presumably converting
+  // full-pel units to the 1/8-pel units of center_mv).
+  const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+  unsigned int sse_unused;
+  const unsigned int var =
+      vfp->vf(src->buf, src->stride, get_buf_from_mv(ref, best_mv),
+              ref->stride, &sse_unused);
+  int rate_cost = 0;
+
+  if (use_mvcost) {
+    rate_cost = mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                            x->errorperbit);
+  }
+  return var + rate_cost;
+}
+
+// Compound-prediction variant of av1_get_mvpred_var(): the reference block
+// at best_mv is combined with second_pred before the variance against the
+// source is taken. vfp->jsvaf is used (with xd->jcp_param) when
+// xd->jcp_param.use_jnt_comp_avg is set, vfp->svaf otherwise; both are
+// called with zero sub-pel offsets. The MV rate cost is added when
+// use_mvcost is non-zero.
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+                          const MV *center_mv, const uint8_t *second_pred,
+                          const aom_variance_fn_ptr_t *vfp, int use_mvcost) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &xd->plane[0].pre[0];
+  const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+  unsigned int sse_unused;
+  int rate_cost = 0;
+
+  if (use_mvcost) {
+    rate_cost = mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                            x->errorperbit);
+  }
+
+  if (xd->jcp_param.use_jnt_comp_avg) {
+    return vfp->jsvaf(get_buf_from_mv(ref, best_mv), ref->stride, 0, 0,
+                      src->buf, src->stride, &sse_unused, second_pred,
+                      &xd->jcp_param) +
+           rate_cost;
+  }
+
+  return vfp->svaf(get_buf_from_mv(ref, best_mv), ref->stride, 0, 0, src->buf,
+                   src->stride, &sse_unused, second_pred) +
+         rate_cost;
+}
+
+// Masked-compound variant of av1_get_mvpred_var(): vfp->msvf is called with
+// zero sub-pel offsets on the reference block at best_mv together with
+// second_pred, mask, mask_stride and invert_mask. The MV rate cost is added
+// when use_mvcost is non-zero.
+int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
+                            const MV *center_mv, const uint8_t *second_pred,
+                            const uint8_t *mask, int mask_stride,
+                            int invert_mask, const aom_variance_fn_ptr_t *vfp,
+                            int use_mvcost) {
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[0];
+  const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+  unsigned int sse_unused;
+  int rate_cost = 0;
+
+  if (use_mvcost) {
+    rate_cost = mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                            x->errorperbit);
+  }
+
+  return vfp->msvf(src->buf, src->stride, 0, 0, get_buf_from_mv(ref, best_mv),
+                   ref->stride, second_pred, mask, mask_stride, invert_mask,
+                   &sse_unused) +
+         rate_cost;
+}
+
+// Hexagon-pattern integer motion search.
+//
+// Thin wrapper that runs pattern_search() with the hexagonal candidate
+// tables defined below; all arguments are forwarded unchanged and the
+// return value is that of pattern_search().
+//
+// Rows with only 6 listed candidates leave the remaining table slots
+// zero-initialised; pattern_search() never indexes past num_candidates[s].
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // First scale has 8-closest points, the rest have 6 points in hex shape
+ // at increasing scales
+ static const int hex_num_candidates[MAX_PATTERN_SCALES] = { 8, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6 };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV hex_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 }, { -1, 1 },
+ { -1, 0 } },
+ { { -1, -2 }, { 1, -2 }, { 2, 0 }, { 1, 2 }, { -1, 2 }, { -2, 0 } },
+ { { -2, -4 }, { 2, -4 }, { 4, 0 }, { 2, 4 }, { -2, 4 }, { -4, 0 } },
+ { { -4, -8 }, { 4, -8 }, { 8, 0 }, { 4, 8 }, { -4, 8 }, { -8, 0 } },
+ { { -8, -16 }, { 8, -16 }, { 16, 0 }, { 8, 16 }, { -8, 16 }, { -16, 0 } },
+ { { -16, -32 }, { 16, -32 }, { 32, 0 }, { 16, 32 }, { -16, 32 },
+ { -32, 0 } },
+ { { -32, -64 }, { 32, -64 }, { 64, 0 }, { 32, 64 }, { -32, 64 },
+ { -64, 0 } },
+ { { -64, -128 }, { 64, -128 }, { 128, 0 }, { 64, 128 }, { -64, 128 },
+ { -128, 0 } },
+ { { -128, -256 }, { 128, -256 }, { 256, 0 }, { 128, 256 }, { -128, 256 },
+ { -256, 0 } },
+ { { -256, -512 }, { 256, -512 }, { 512, 0 }, { 256, 512 }, { -256, 512 },
+ { -512, 0 } },
+ { { -512, -1024 }, { 512, -1024 }, { 1024, 0 }, { 512, 1024 },
+ { -512, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ hex_num_candidates, hex_candidates);
+}
+
+// Big-diamond-pattern integer motion search.
+//
+// Thin wrapper that runs pattern_search() with the diamond candidate tables
+// defined below; all arguments are forwarded unchanged and the return value
+// is that of pattern_search().
+//
+// Scale 0 is the 4-point cross, which is the case pattern_search() detects
+// via num_candidates[0] == 4 to reuse its scale-0 SADs for cost_list.
+static int bigdia_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // First scale has 4-closest points, the rest have 8 points in diamond
+ // shape at increasing scales
+ static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
+ 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ bigdia_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { 0, -1 }, { 1, 0 }, { 0, 1 }, { -1, 0 } },
+ { { -1, -1 }, { 0, -2 }, { 1, -1 }, { 2, 0 }, { 1, 1 }, { 0, 2 },
+ { -1, 1 }, { -2, 0 } },
+ { { -2, -2 }, { 0, -4 }, { 2, -2 }, { 4, 0 }, { 2, 2 }, { 0, 4 },
+ { -2, 2 }, { -4, 0 } },
+ { { -4, -4 }, { 0, -8 }, { 4, -4 }, { 8, 0 }, { 4, 4 }, { 0, 8 },
+ { -4, 4 }, { -8, 0 } },
+ { { -8, -8 }, { 0, -16 }, { 8, -8 }, { 16, 0 }, { 8, 8 }, { 0, 16 },
+ { -8, 8 }, { -16, 0 } },
+ { { -16, -16 }, { 0, -32 }, { 16, -16 }, { 32, 0 }, { 16, 16 },
+ { 0, 32 }, { -16, 16 }, { -32, 0 } },
+ { { -32, -32 }, { 0, -64 }, { 32, -32 }, { 64, 0 }, { 32, 32 },
+ { 0, 64 }, { -32, 32 }, { -64, 0 } },
+ { { -64, -64 }, { 0, -128 }, { 64, -64 }, { 128, 0 }, { 64, 64 },
+ { 0, 128 }, { -64, 64 }, { -128, 0 } },
+ { { -128, -128 }, { 0, -256 }, { 128, -128 }, { 256, 0 }, { 128, 128 },
+ { 0, 256 }, { -128, 128 }, { -256, 0 } },
+ { { -256, -256 }, { 0, -512 }, { 256, -256 }, { 512, 0 }, { 256, 256 },
+ { 0, 512 }, { -256, 256 }, { -512, 0 } },
+ { { -512, -512 }, { 0, -1024 }, { 512, -512 }, { 1024, 0 },
+ { 512, 512 }, { 0, 1024 }, { -512, 512 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ bigdia_num_candidates, bigdia_candidates);
+}
+
+// Square-pattern integer motion search.
+//
+// Thin wrapper that runs pattern_search() with the square candidate tables
+// defined below; all arguments are forwarded unchanged and the return value
+// is that of pattern_search().
+static int square_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv) {
+ // All scales have 8 closest points in square shape
+ static const int square_num_candidates[MAX_PATTERN_SCALES] = {
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ };
+ // Note that the largest candidate step at each scale is 2^scale
+ /* clang-format off */
+ static const MV
+ square_candidates[MAX_PATTERN_SCALES][MAX_PATTERN_CANDIDATES] = {
+ { { -1, -1 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, { 0, 1 },
+ { -1, 1 }, { -1, 0 } },
+ { { -2, -2 }, { 0, -2 }, { 2, -2 }, { 2, 0 }, { 2, 2 }, { 0, 2 },
+ { -2, 2 }, { -2, 0 } },
+ { { -4, -4 }, { 0, -4 }, { 4, -4 }, { 4, 0 }, { 4, 4 }, { 0, 4 },
+ { -4, 4 }, { -4, 0 } },
+ { { -8, -8 }, { 0, -8 }, { 8, -8 }, { 8, 0 }, { 8, 8 }, { 0, 8 },
+ { -8, 8 }, { -8, 0 } },
+ { { -16, -16 }, { 0, -16 }, { 16, -16 }, { 16, 0 }, { 16, 16 },
+ { 0, 16 }, { -16, 16 }, { -16, 0 } },
+ { { -32, -32 }, { 0, -32 }, { 32, -32 }, { 32, 0 }, { 32, 32 },
+ { 0, 32 }, { -32, 32 }, { -32, 0 } },
+ { { -64, -64 }, { 0, -64 }, { 64, -64 }, { 64, 0 }, { 64, 64 },
+ { 0, 64 }, { -64, 64 }, { -64, 0 } },
+ { { -128, -128 }, { 0, -128 }, { 128, -128 }, { 128, 0 }, { 128, 128 },
+ { 0, 128 }, { -128, 128 }, { -128, 0 } },
+ { { -256, -256 }, { 0, -256 }, { 256, -256 }, { 256, 0 }, { 256, 256 },
+ { 0, 256 }, { -256, 256 }, { -256, 0 } },
+ { { -512, -512 }, { 0, -512 }, { 512, -512 }, { 512, 0 }, { 512, 512 },
+ { 0, 512 }, { -512, 512 }, { -512, 0 } },
+ { { -1024, -1024 }, { 0, -1024 }, { 1024, -1024 }, { 1024, 0 },
+ { 1024, 1024 }, { 0, 1024 }, { -1024, 1024 }, { -1024, 0 } },
+ };
+ /* clang-format on */
+ return pattern_search(x, start_mv, search_param, sad_per_bit, do_init_search,
+ cost_list, vfp, use_mvcost, center_mv,
+ square_num_candidates, square_candidates);
+}
+
+// Reduced-effort hexagon search: identical to av1_hex_search() except that
+// search_param is raised to at least MAX_MVSEARCH_STEPS - 2 before the call.
+static int fast_hex_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+                           int sad_per_bit,
+                           int do_init_search,  // must be zero for fast_hex
+                           int *cost_list, const aom_variance_fn_ptr_t *vfp,
+                           int use_mvcost, const MV *center_mv) {
+  const int raised_param = AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param);
+
+  return av1_hex_search(x, ref_mv, raised_param, sad_per_bit, do_init_search,
+                        cost_list, vfp, use_mvcost, center_mv);
+}
+
+// Reduced-effort diamond search: identical to bigdia_search() except that
+// search_param is raised to at least MAX_MVSEARCH_STEPS - 2 before the call.
+static int fast_dia_search(MACROBLOCK *x, MV *ref_mv, int search_param,
+                           int sad_per_bit, int do_init_search, int *cost_list,
+                           const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+                           const MV *center_mv) {
+  const int raised_param = AOMMAX(MAX_MVSEARCH_STEPS - 2, search_param);
+
+  return bigdia_search(x, ref_mv, raised_param, sad_per_bit, do_init_search,
+                       cost_list, vfp, use_mvcost, center_mv);
+}
+
+#undef CHECK_BETTER
+
+// Exhaustive motion search around a given centre position with a given
+// step size.
+//
+// Parameters:
+//   ref_mv      MV the rate cost is measured against (handed to
+//               mvsad_err_cost()).
+//   best_mv     output: best position found. It may alias center_mv, since
+//               center_mv is copied before best_mv is written.
+//   range       half-width of the search window around the centre; the
+//               window is additionally clipped to x->mv_limits.
+//   step        grid spacing (>= 1). With step == 1 every position is
+//               visited, four columns at a time through fn_ptr->sdx4df.
+//   sad_per_bit weight passed to mvsad_err_cost().
+//   center_mv   centre of the search; clamped to x->mv_limits on a copy.
+//
+// Whenever the best position changes, the previous best is saved in
+// x->second_best_mv. Returns the best SAD-plus-MV-cost found.
+static int exhuastive_mesh_search(MACROBLOCK *x, MV *ref_mv, MV *best_mv,
+                                  int range, int step, int sad_per_bit,
+                                  const aom_variance_fn_ptr_t *fn_ptr,
+                                  const MV *center_mv) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  MV fcenter_mv = { center_mv->row, center_mv->col };
+  unsigned int best_sad = INT_MAX;
+  int r, c, i;
+  int start_col, end_col, start_row, end_row;
+  // With step == 1 columns are consumed in groups of four per iteration.
+  int col_step = (step > 1) ? step : 4;
+
+  assert(step >= 1);
+
+  clamp_mv(&fcenter_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  *best_mv = fcenter_mv;
+  best_sad =
+      fn_ptr->sdf(what->buf, what->stride,
+                  get_buf_from_mv(in_what, &fcenter_mv), in_what->stride) +
+      mvsad_err_cost(x, &fcenter_mv, ref_mv, sad_per_bit);
+  // Search window relative to the centre; all four bounds are inclusive.
+  start_row = AOMMAX(-range, x->mv_limits.row_min - fcenter_mv.row);
+  start_col = AOMMAX(-range, x->mv_limits.col_min - fcenter_mv.col);
+  end_row = AOMMIN(range, x->mv_limits.row_max - fcenter_mv.row);
+  end_col = AOMMIN(range, x->mv_limits.col_max - fcenter_mv.col);
+
+  for (r = start_row; r <= end_row; r += step) {
+    for (c = start_col; c <= end_col; c += col_step) {
+      // Step > 1 means we are not checking every location in this pass.
+      if (step > 1) {
+        const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c };
+        unsigned int sad =
+            fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
+                        in_what->stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            x->second_best_mv.as_mv = *best_mv;
+            *best_mv = mv;
+          }
+        }
+      } else {
+        // 4 sads in a single call if we are checking every location
+        if (c + 3 <= end_col) {
+          unsigned int sads[4];
+          const uint8_t *addrs[4];
+          for (i = 0; i < 4; ++i) {
+            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+            addrs[i] = get_buf_from_mv(in_what, &mv);
+          }
+          fn_ptr->sdx4df(what->buf, what->stride, addrs, in_what->stride, sads);
+
+          for (i = 0; i < 4; ++i) {
+            if (sads[i] < best_sad) {
+              const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+              const unsigned int sad =
+                  sads[i] + mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                x->second_best_mv.as_mv = *best_mv;
+                *best_mv = mv;
+              }
+            }
+          }
+        } else {
+          // Fewer than four columns remain in this row: columns c..end_col.
+          // end_col is inclusive (as in the loop condition and the 4-wide
+          // test above), so the bound is `<=`; the previous `<` skipped the
+          // last column of the window.
+          for (i = 0; i <= end_col - c; ++i) {
+            const MV mv = { fcenter_mv.row + r, fcenter_mv.col + c + i };
+            unsigned int sad =
+                fn_ptr->sdf(what->buf, what->stride,
+                            get_buf_from_mv(in_what, &mv), in_what->stride);
+            if (sad < best_sad) {
+              sad += mvsad_err_cost(x, &mv, ref_mv, sad_per_bit);
+              if (sad < best_sad) {
+                best_sad = sad;
+                x->second_best_mv.as_mv = *best_mv;
+                *best_mv = mv;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return best_sad;
+}
+
+// Multi-step diamond integer motion search driven by a search-site table.
+//
+// Parameters:
+//   cfg          search-site configuration (cfg->ss, cfg->ss_count,
+//                cfg->searches_per_step).
+//   ref_mv       start position; clamped in place to x->mv_limits.
+//   best_mv      output: best position found.
+//   search_param number of leading steps of the site table to skip.
+//   sad_per_bit  weight passed to mvsad_err_cost().
+//   num00        output: number of steps that found no better site while
+//                the best position was still the start position.
+//   center_mv    MV the cost is measured against; shifted right by 3 before
+//                use.
+//
+// x->second_best_mv is set to the previous best whenever the best position
+// moves. Returns the best SAD-plus-MV-cost found.
+int av1_diamond_search_sad_c(MACROBLOCK *x, const search_site_config *cfg,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ int i, j, step;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ const uint8_t *in_what;
+ const int in_what_stride = xd->plane[0].pre[0].stride;
+ const uint8_t *best_address;
+
+ unsigned int bestsad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+
+ int ref_row;
+ int ref_col;
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const search_site *ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ ref_row = ref_mv->row;
+ ref_col = ref_mv->col;
+ *num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
+ // Work out the start point for the search
+ in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+ best_address = in_what;
+
+ // Check the starting position
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ // Site indexing starts at 1 (presumably ss[0] is the centre site; confirm
+ // against the search-site table initialisation). i keeps advancing across
+ // steps, searches_per_step sites per step.
+ i = 1;
+
+ for (step = 0; step < tot_steps; step++) {
+ int all_in = 1, t;
+
+ // All_in is true if every one of the points we are checking are within
+ // the bounds of the image.
+ // NOTE(review): this assumes the first four sites of a step are its
+ // extreme up/down/left/right offsets, in that order - confirm against the
+ // site table.
+ all_in &= ((best_mv->row + ss[i].mv.row) > x->mv_limits.row_min);
+ all_in &= ((best_mv->row + ss[i + 1].mv.row) < x->mv_limits.row_max);
+ all_in &= ((best_mv->col + ss[i + 2].mv.col) > x->mv_limits.col_min);
+ all_in &= ((best_mv->col + ss[i + 3].mv.col) < x->mv_limits.col_max);
+
+ // If all the pixels are within the bounds we don't check whether the
+ // search point is valid in this loop, otherwise we check each point
+ // for validity..
+ if (all_in) {
+ unsigned int sad_array[4];
+
+ // Sites are evaluated four at a time through the 4-way SAD function.
+ for (j = 0; j < cfg->searches_per_step; j += 4) {
+ unsigned char const *block_offset[4];
+
+ for (t = 0; t < 4; t++)
+ block_offset[t] = ss[i + t].offset + best_address;
+
+ fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride,
+ sad_array);
+
+ for (t = 0; t < 4; t++, i++) {
+ if (sad_array[t] < bestsad) {
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ sad_array[t] +=
+ mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad_array[t] < bestsad) {
+ bestsad = sad_array[t];
+ best_site = i;
+ }
+ }
+ }
+ }
+ } else {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ // Trap illegal vectors
+ const MV this_mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[i].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = i;
+ }
+ }
+ }
+ i++;
+ }
+ }
+ // best_site differs from last_site only if this step found a better
+ // site; in that case move the best position (MV and buffer address).
+ if (best_site != last_site) {
+ x->second_best_mv.as_mv = *best_mv;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ // Optional extension: keep moving in the same direction for as long as
+ // that keeps lowering the cost.
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ const uint8_t *const check_here = ss[best_site].offset + best_address;
+ unsigned int thissad =
+ fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
+ if (thissad < bestsad) {
+ thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what) {
+ // Nothing better was found and the best is still the start position.
+ (*num00)++;
+ }
+ }
+ return bestsad;
+}
+
+/* Full-pel n-step diamond search.
+
+   Runs cpi->diamond_search_sad() once with step_param and then repeatedly
+   with step_param + 1, step_param + 2, ... until further_steps is reached.
+   Each result is re-scored with av1_get_mvpred_var() and the overall best MV
+   is kept in x->best_mv.
+
+   do_refine: If last step (1-away) of n-step search doesn't pick the center
+              point as the best match, we will do a final 1-away diamond
+              refining search.
+   cost_list: optional; when non-NULL it is filled by calc_int_cost_list()
+              around the final best MV.
+
+   Returns the best (variance-based) cost found. */
+static int full_pixel_diamond(const AV1_COMP *const cpi, MACROBLOCK *x,
+                              MV *mvp_full, int step_param, int sadpb,
+                              int further_steps, int do_refine, int *cost_list,
+                              const aom_variance_fn_ptr_t *fn_ptr,
+                              const MV *ref_mv) {
+  MV temp_mv;
+  int thissme;
+  int num00 = 0;
+  // The first search writes its num00 result into n, which then serves as
+  // the step counter of the loop below.
+  int n;
+  int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                                        step_param, sadpb, &n, fn_ptr, ref_mv);
+
+  if (bestsme < INT_MAX) {
+    bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  }
+  x->best_mv.as_mv = temp_mv;
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  if (n > further_steps) do_refine = 0;
+
+  while (n < further_steps) {
+    ++n;
+
+    // Steps already accounted for by the previous search's num00 are skipped.
+    if (num00) {
+      --num00;
+      continue;
+    }
+
+    thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
+                                      step_param + n, sadpb, &num00, fn_ptr,
+                                      ref_mv);
+    if (thissme < INT_MAX) {
+      thissme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+    }
+
+    // check to see if refining search is needed.
+    if (num00 > further_steps - n) do_refine = 0;
+
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      x->best_mv.as_mv = temp_mv;
+    }
+  }
+
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV refined_mv = x->best_mv.as_mv;
+
+    thissme = av1_refining_search_sad(x, &refined_mv, sadpb, search_range,
+                                      fn_ptr, ref_mv);
+    if (thissme < INT_MAX) {
+      thissme = av1_get_mvpred_var(x, &refined_mv, ref_mv, fn_ptr, 1);
+    }
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      x->best_mv.as_mv = refined_mv;
+    }
+  }
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
+  }
+  return bestsme;
+}
+
+#define MIN_RANGE 7
+#define MAX_RANGE 256
+#define MIN_INTERVAL 1
+// Runs a limited-range exhaustive mesh search whose pattern
+// (cpi->sf.mesh_patterns) is set according to the encode speed profile.
+//
+// The first mesh pass is centred on centre_mv_full; its range is widened
+// for large start vectors (capped at MAX_RANGE) and its interval scaled to
+// match. If that pass was coarse, the remaining mesh patterns are run in
+// turn, each centred on the previous result, stopping after the first
+// pattern with interval 1.
+//
+// The best MV is written to dst_mv and its variance-based cost returned.
+// INT_MAX is returned, with dst_mv untouched, when the first mesh pattern
+// has an illegal range or interval. cost_list is optional.
+static int full_pixel_exhaustive(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                 const MV *centre_mv_full, int sadpb,
+                                 int *cost_list,
+                                 const aom_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int first_interval = sf->mesh_patterns[0].interval;
+  const int first_range = sf->mesh_patterns[0].range;
+  MV temp_mv = { centre_mv_full->row, centre_mv_full->col };
+  MV f_ref_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
+
+  // Keep track of number of exhaustive calls (this frame in this thread).
+  ++(*x->ex_search_count_ptr);
+
+  // Trap illegal values for interval and range for this function.
+  if (first_range < MIN_RANGE || first_range > MAX_RANGE ||
+      first_interval < MIN_INTERVAL || first_interval > first_range) {
+    return INT_MAX;
+  }
+
+  const int baseline_interval_divisor = first_range / first_interval;
+
+  // Check size of proposed first range against magnitude of the centre
+  // value used as a starting point.
+  const int max_abs_component = AOMMAX(abs(temp_mv.row), abs(temp_mv.col));
+  int range = AOMMAX(first_range, (5 * max_abs_component) / 4);
+  range = AOMMIN(range, MAX_RANGE);
+  const int interval =
+      AOMMAX(first_interval, range / baseline_interval_divisor);
+
+  // initial search
+  int bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv, range, interval,
+                                       sadpb, fn_ptr, &temp_mv);
+
+  if (interval > MIN_INTERVAL && range > MIN_RANGE) {
+    // Progressive searches with range and step size decreasing each time
+    // till we reach a step size of 1. Then break out.
+    for (int pass = 1; pass < MAX_MESH_STEP; ++pass) {
+      const int pass_interval = sf->mesh_patterns[pass].interval;
+
+      bestsme = exhuastive_mesh_search(x, &f_ref_mv, &temp_mv,
+                                       sf->mesh_patterns[pass].range,
+                                       pass_interval, sadpb, fn_ptr, &temp_mv);
+
+      if (pass_interval == 1) break;
+    }
+  }
+
+  if (bestsme < INT_MAX) {
+    bestsme = av1_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
+  }
+  *dst_mv = temp_mv;
+
+  // Return cost list.
+  if (cost_list) {
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+  }
+  return bestsme;
+}
+
+// Greedy 1-pel refinement of *ref_mv over its four axis-aligned neighbours.
+// Runs at most search_range passes; each pass moves *ref_mv to the cheapest
+// neighbour (SAD plus mvsad_err_cost), saving the old position in
+// x->second_best_mv, and stops once none improves. Returns the best cost.
+int av1_refining_search_sad(MACROBLOCK *x, MV *ref_mv, int error_per_bit,
+                            int search_range,
+                            const aom_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv) {
+  const MV steps[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &xd->plane[0].pre[0];
+  const MV full_center = { center_mv->row >> 3, center_mv->col >> 3 };
+  const uint8_t *cur_ptr = get_buf_from_mv(ref, ref_mv);
+  unsigned int best_sad =
+      fn_ptr->sdf(src->buf, src->stride, cur_ptr, ref->stride) +
+      mvsad_err_cost(x, ref_mv, &full_center, error_per_bit);
+
+  for (int pass = 0; pass < search_range; ++pass) {
+    int winner = -1;
+    // Nonzero only when all four neighbours are strictly inside the limits.
+    const int interior = ((ref_mv->row - 1) > x->mv_limits.row_min) &
+                         ((ref_mv->row + 1) < x->mv_limits.row_max) &
+                         ((ref_mv->col - 1) > x->mv_limits.col_min) &
+                         ((ref_mv->col + 1) < x->mv_limits.col_max);
+
+    if (interior) {
+      // Score all four candidates with a single batched SAD call.
+      unsigned int cand_sad[4];
+      const uint8_t *const cand_ptr[4] = { cur_ptr - ref->stride, cur_ptr - 1,
+                                           cur_ptr + 1,
+                                           cur_ptr + ref->stride };
+      fn_ptr->sdx4df(src->buf, src->stride, cand_ptr, ref->stride, cand_sad);
+
+      for (int k = 0; k < 4; ++k) {
+        // Only add the MV cost when the raw SAD is already competitive.
+        if (cand_sad[k] >= best_sad) continue;
+        const MV cand = { ref_mv->row + steps[k].row,
+                          ref_mv->col + steps[k].col };
+        cand_sad[k] += mvsad_err_cost(x, &cand, &full_center, error_per_bit);
+        if (cand_sad[k] < best_sad) {
+          best_sad = cand_sad[k];
+          winner = k;
+        }
+      }
+    } else {
+      // Near a limit: score candidates one by one, skipping any outside.
+      for (int k = 0; k < 4; ++k) {
+        const MV cand = { ref_mv->row + steps[k].row,
+                          ref_mv->col + steps[k].col };
+        if (!is_mv_in(&x->mv_limits, &cand)) continue;
+        unsigned int sad =
+            fn_ptr->sdf(src->buf, src->stride, get_buf_from_mv(ref, &cand),
+                        ref->stride);
+        if (sad >= best_sad) continue;
+        sad += mvsad_err_cost(x, &cand, &full_center, error_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          winner = k;
+        }
+      }
+    }
+
+    if (winner < 0) break;
+
+    x->second_best_mv.as_mv = *ref_mv;
+    ref_mv->row += steps[winner].row;
+    ref_mv->col += steps[winner].col;
+    cur_ptr = get_buf_from_mv(ref, ref_mv);
+  }
+
+  return best_sad;
+}
+
+// Compound-prediction SAD of the reference block at *mv against the source
+// block: fn_ptr->msdf when mask is non-NULL, otherwise fn_ptr->jsdaf or
+// fn_ptr->sdaf according to xd->jcp_param.use_jnt_comp_avg.
+static unsigned int refining_8p_sad(const MACROBLOCKD *xd,
+                                    const struct buf_2d *src,
+                                    const struct buf_2d *ref, const MV *mv,
+                                    const aom_variance_fn_ptr_t *fn_ptr,
+                                    const uint8_t *mask, int mask_stride,
+                                    int invert_mask,
+                                    const uint8_t *second_pred) {
+  const uint8_t *const ref_ptr = get_buf_from_mv(ref, mv);
+
+  if (mask) {
+    return fn_ptr->msdf(src->buf, src->stride, ref_ptr, ref->stride,
+                        second_pred, mask, mask_stride, invert_mask);
+  }
+  if (xd->jcp_param.use_jnt_comp_avg) {
+    return fn_ptr->jsdaf(src->buf, src->stride, ref_ptr, ref->stride,
+                         second_pred, &xd->jcp_param);
+  }
+  return fn_ptr->sdaf(src->buf, src->stride, ref_ptr, ref->stride,
+                      second_pred);
+}
+
+// This function is called when we do joint motion search in comp_inter_inter
+// mode, or when searching for one component of an ext-inter compound mode.
+// It refines x->best_mv in place over its eight neighbours for up to
+// search_range passes and returns the best cost. Positions already scored are
+// tracked in a (2 * SEARCH_RANGE_8P + 1)^2 grid so they are not scored twice.
+// NOTE(review): the grid appears to assume search_range <= SEARCH_RANGE_8P;
+// confirm against callers.
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+                             const aom_variance_fn_ptr_t *fn_ptr,
+                             const uint8_t *mask, int mask_stride,
+                             int invert_mask, const MV *center_mv,
+                             const uint8_t *second_pred) {
+  // { row, col } step and the matching offset in the visited grid.
+  static const search_neighbors dirs[8] = {
+    { { -1, 0 }, -1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { 0, -1 }, 0 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 0, 1 }, 0 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 0 }, 1 * SEARCH_GRID_STRIDE_8P + 0 },
+    { { -1, -1 }, -1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { 1, -1 }, 1 * SEARCH_GRID_STRIDE_8P - 1 },
+    { { -1, 1 }, -1 * SEARCH_GRID_STRIDE_8P + 1 },
+    { { 1, 1 }, 1 * SEARCH_GRID_STRIDE_8P + 1 }
+  };
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const struct buf_2d *const ref = &xd->plane[0].pre[0];
+  const MV full_center = { center_mv->row >> 3, center_mv->col >> 3 };
+  MV *const cur_mv = &x->best_mv.as_mv;
+  // visited[] flags grid cells that have already been scored.
+  uint8_t visited[SEARCH_GRID_STRIDE_8P * SEARCH_GRID_STRIDE_8P] = { 0 };
+  int center_cell = SEARCH_GRID_CENTER_8P;
+  unsigned int best_sad;
+
+  // Score the clamped starting vector and mark its cell as visited.
+  clamp_mv(cur_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  best_sad = refining_8p_sad(xd, src, ref, cur_mv, fn_ptr, mask, mask_stride,
+                             invert_mask, second_pred) +
+             mvsad_err_cost(x, cur_mv, &full_center, error_per_bit);
+  visited[center_cell] = 1;
+
+  // Each pass scores the unvisited neighbours of the current best vector.
+  for (int pass = 0; pass < search_range; ++pass) {
+    int winner = -1;
+
+    for (int k = 0; k < 8; ++k) {
+      const int cell = center_cell + dirs[k].coord_offset;
+      if (visited[cell]) continue;
+      visited[cell] = 1;
+
+      const MV cand = { cur_mv->row + dirs[k].coord.row,
+                        cur_mv->col + dirs[k].coord.col };
+      if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+      unsigned int sad = refining_8p_sad(xd, src, ref, &cand, fn_ptr, mask,
+                                         mask_stride, invert_mask, second_pred);
+      // Only add the MV cost when the raw SAD is already competitive.
+      if (sad >= best_sad) continue;
+      sad += mvsad_err_cost(x, &cand, &full_center, error_per_bit);
+      if (sad < best_sad) {
+        best_sad = sad;
+        winner = k;
+      }
+    }
+
+    if (winner < 0) break;
+
+    // Step to the winner; the grid centre follows the vector.
+    cur_mv->row += dirs[winner].coord.row;
+    cur_mv->col += dirs[winner].coord.col;
+    center_cell += dirs[winner].coord_offset;
+  }
+
+  return best_sad;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
+// Returns nonzero when a follow-on exhaustive search is currently permitted.
+static int is_exhaustive_allowed(const AV1_COMP *const cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int budget =
+      AOMMAX(MIN_EX_SEARCH_LIMIT,
+             (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+  if (!sf->allow_exhaustive_searches) return 0;
+  if (!(sf->exhaustive_searches_thresh < INT_MAX)) return 0;
+  return (*x->ex_search_count_ptr <= budget) && !cpi->rc.is_src_frame_alt_ref;
+}
+// Full-pel motion search: runs `method`, an optional mesh search, then hash ME.
+int av1_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ MV *mvp_full, int step_param, int method,
+ int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const aom_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+ int var = 0;
+
+ if (cost_list) {
+ cost_list[0] = INT_MAX;
+ cost_list[1] = INT_MAX;
+ cost_list[2] = INT_MAX;
+ cost_list[3] = INT_MAX;
+ cost_list[4] = INT_MAX;
+ }
+
+ // Keep track of number of searches (this frame in this thread).
+ ++(*x->m_search_count_ptr);
+
+ switch (method) {
+ case FAST_DIAMOND:
+ var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case FAST_HEX:
+ var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+ cost_list, fn_ptr, 1, ref_mv);
+ break;
+ case HEX:
+ var = av1_hex_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case SQUARE:
+ var = square_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case BIGDIA:
+ var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
+ fn_ptr, 1, ref_mv);
+ break;
+ case NSTEP:
+ var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+ MAX_MVSEARCH_STEPS - 1 - step_param, 1,
+ cost_list, fn_ptr, ref_mv);
+
+ // Should we allow a follow on exhaustive search?
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex =
+ full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+ }
+ break;
+ default: assert(0 && "Invalid search method.");
+ }
+ // NOTE(review): for NSTEP the code below can run the exhaustive search again.
+ // Should we allow a follow on exhaustive search?
+ if (!run_mesh_search) {
+ if (method == NSTEP) {
+ if (is_exhaustive_allowed(cpi, x)) {
+ int exhuastive_thr = sf->exhaustive_searches_thresh;
+ exhuastive_thr >>=
+ 10 - (mi_size_wide_log2[bsize] + mi_size_high_log2[bsize]);
+ // Threshold variance for an exhaustive full search.
+ if (var > exhuastive_thr) run_mesh_search = 1;
+ }
+ }
+ }
+
+ if (run_mesh_search) {
+ int var_ex;
+ MV tmp_mv_ex;
+ var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv, error_per_bit,
+ cost_list, fn_ptr, ref_mv, &tmp_mv_ex);
+ if (var_ex < var) {
+ var = var_ex;
+ x->best_mv.as_mv = tmp_mv_ex;
+ }
+ }
+
+ if (method != NSTEP && rd && var < var_max)
+ var = av1_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+ do {  // single-pass block: `break` below skips the hash-based search
+ if (!intra || !av1_use_hash_me(&cpi->common)) break;
+
+ // already single ME
+ // get block size and original buffer of current block
+ const int block_height = block_size_high[bsize];
+ const int block_width = block_size_wide[bsize];
+ if (block_height == block_width && x_pos >= 0 && y_pos >= 0) {
+ if (block_width == 4 || block_width == 8 || block_width == 16 ||
+ block_width == 32 || block_width == 64 || block_width == 128) {
+ uint8_t *what = x->plane[0].src.buf;
+ const int what_stride = x->plane[0].src.stride;
+ uint32_t hash_value1, hash_value2;
+ MV best_hash_mv;
+ int best_hash_cost = INT_MAX;
+
+ // Hash table to search; intra is nonzero here, so the first arm is used.
+ hash_table *ref_frame_hash =
+ intra
+ ? &cpi->common.cur_frame->hash_table
+ : av1_get_ref_frame_hash_map(cpi, x->e_mbd.mi[0]->ref_frame[0]);
+
+ av1_get_block_hash_value(
+ what, what_stride, block_width, &hash_value1, &hash_value2,
+ x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH, x);
+
+ const int count = av1_hash_table_count(ref_frame_hash, hash_value1);
+ // For intra, at least one match (the block itself) can always be found.
+ if (count <= (intra ? 1 : 0)) {
+ break;
+ }
+
+ Iterator iterator =
+ av1_hash_get_first_iterator(ref_frame_hash, hash_value1);
+ for (int i = 0; i < count; i++, iterator_increment(&iterator)) {
+ block_hash ref_block_hash = *(block_hash *)(iterator_get(&iterator));
+ if (hash_value2 == ref_block_hash.hash_value2) {
+ // For intra, make sure the prediction is from valid area.
+ if (intra) {
+ const int mi_col = x_pos / MI_SIZE;
+ const int mi_row = y_pos / MI_SIZE;
+ const MV dv = { 8 * (ref_block_hash.y - y_pos),
+ 8 * (ref_block_hash.x - x_pos) };
+ if (!av1_is_dv_valid(dv, &cpi->common, &x->e_mbd, mi_row, mi_col,
+ bsize, cpi->common.seq_params.mib_size_log2))
+ continue;
+ }
+ MV hash_mv;
+ hash_mv.col = ref_block_hash.x - x_pos;
+ hash_mv.row = ref_block_hash.y - y_pos;
+ if (!is_mv_in(&x->mv_limits, &hash_mv)) continue;
+ const int refCost =
+ av1_get_mvpred_var(x, &hash_mv, ref_mv, fn_ptr, 1);
+ if (refCost < best_hash_cost) {
+ best_hash_cost = refCost;
+ best_hash_mv = hash_mv;
+ }
+ }
+ }
+ if (best_hash_cost < var) {
+ x->second_best_mv = x->best_mv;
+ x->best_mv.as_mv = best_hash_mv;
+ var = best_hash_cost;
+ }
+ }
+ }
+ } while (0);
+
+ return var;
+}
+
+/* Evaluates vfp->osvf at sub-pixel position (r, c); also writes sse. */
+#define DIST(r, c) \
+ vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
+
+/* MV cost of (r, c) relative to (rr, rc): (costs * error_per_bit + 4096) >> 13; 0 if mvcost is NULL. */
+#define MVC(r, c) \
+ (unsigned int)(mvcost \
+ ? ((mvjcost[((r) != rr) * 2 + ((c) != rc)] + \
+ mvcost[0][((r)-rr)] + (int64_t)mvcost[1][((c)-rc)]) * \
+ error_per_bit + \
+ 4096) >> \
+ 13 \
+ : 0)
+/* Scores (r, c) if it is inside the limits and keeps it when it beats besterr; otherwise v = INT_MAX. */
+#define CHECK_BETTER(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ thismse = (DIST(r, c)); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+/* Same as CHECK_BETTER, but the error comes from upsampled_obmc_pref_error(). */
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \
+ MV this_mv = { r, c }; \
+ thismse = upsampled_obmc_pref_error(xd, cm, mi_row, mi_col, &this_mv, \
+ mask, vfp, z, pre(y, y_stride, r, c), \
+ y_stride, sp(c), sp(r), w, h, &sse, \
+ use_accurate_subpel_search); \
+ if ((v = MVC(r, c) + thismse) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ } else { \
+ v = INT_MAX; \
+ }
+
+// Scores the starting position: vfp->ovf at y + offset (also stored in
+// *distortion, with the SSE in *sse1) plus the cost of coding bestmv.
+static unsigned int setup_obmc_center_error(
+    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+    const uint8_t *const y, int y_stride, int offset, int *mvjcost,
+    int *mvcost[2], unsigned int *sse1, int *distortion) {
+  const unsigned int var = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
+  *distortion = var;
+  return var + mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+
+// Builds the upsampled prediction for *mv in a local buffer (using the high
+// bit-depth path when the current frame is flagged as such) and returns
+// vfp->ovf of that prediction against wsrc and mask.
+static int upsampled_obmc_pref_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const MV *const mv, const int32_t *mask, const aom_variance_fn_ptr_t *vfp,
+    const int32_t *const wsrc, const uint8_t *const y, int y_stride,
+    int subpel_x_q3, int subpel_y_q3, int w, int h, unsigned int *sse,
+    int subpel_search) {
+  DECLARE_ALIGNED(16, uint8_t, pred[2 * MAX_SB_SQUARE]);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    uint8_t *const hbd_pred = CONVERT_TO_BYTEPTR(pred);
+    aom_highbd_upsampled_pred(xd, cm, mi_row, mi_col, mv, hbd_pred, w, h,
+                              subpel_x_q3, subpel_y_q3, y, y_stride, xd->bd,
+                              subpel_search);
+    return vfp->ovf(hbd_pred, w, wsrc, mask, sse);
+  }
+
+  aom_upsampled_pred(xd, cm, mi_row, mi_col, mv, pred, w, h, subpel_x_q3,
+                     subpel_y_q3, y, y_stride, subpel_search);
+  return vfp->ovf(pred, w, wsrc, mask, sse);
+}
+
+// Scores bestmv with zero sub-pel offset via upsampled_obmc_pref_error().
+static unsigned int upsampled_setup_obmc_center_error(
+    MACROBLOCKD *xd, const AV1_COMMON *const cm, int mi_row, int mi_col,
+    const int32_t *mask, const MV *bestmv, const MV *ref_mv, int error_per_bit,
+    const aom_variance_fn_ptr_t *vfp, const int32_t *const wsrc,
+    const uint8_t *const y, int y_stride, int w, int h, int offset,
+    int *mvjcost, int *mvcost[2], unsigned int *sse1, int *distortion,
+    int subpel_search) {
+  const unsigned int var = upsampled_obmc_pref_error(
+      xd, cm, mi_row, mi_col, bestmv, mask, vfp, wsrc, y + offset, y_stride, 0,
+      0, w, h, sse1, subpel_search);
+  *distortion = var;
+  return var + mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+}
+// OBMC sub-pel refinement of *bestmv (in place), up to 3 - forced_stop rounds.
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_accurate_subpel_search) {
+ const int32_t *wsrc = x->wsrc_buf;
+ const int32_t *mask = x->mask_buf;
+ const int *const z = wsrc;
+ const int *const src_address = z;
+ MACROBLOCKD *xd = &x->e_mbd;
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ unsigned int besterr = INT_MAX;
+ unsigned int sse;
+ unsigned int thismse;
+
+ int rr = ref_mv->row;
+ int rc = ref_mv->col;
+ int br = bestmv->row * 8;  // best row so far, in units of bestmv x 8
+ int bc = bestmv->col * 8;  // best column so far, same units
+ int hstep = 4;  // diagonal step; halved after every round
+ int iter;
+ int round = 3 - forced_stop;
+ int tr = br;
+ int tc = bc;
+ const MV *search_step = search_step_table;
+ int idx, best_idx = -1;
+ unsigned int cost_array[5];
+ int kr, kc;
+ const int w = block_size_wide[mbmi->sb_type];
+ const int h = block_size_high[mbmi->sb_type];
+ int offset;
+ int y_stride;
+ const uint8_t *y;
+
+ int minc, maxc, minr, maxr;
+
+ set_subpel_mv_search_range(&x->mv_limits, &minc, &maxc, &minr, &maxr, ref_mv);
+
+ y = pd->pre[is_second].buf;
+ y_stride = pd->pre[is_second].stride;
+ offset = bestmv->row * y_stride + bestmv->col;
+
+ if (!allow_hp)
+ if (round == 3) round = 2;  // without allow_hp, never run a third round
+ // Scale bestmv by 8 so it matches br/bc computed above.
+ bestmv->row *= 8;
+ bestmv->col *= 8;
+ // use_accurate_subpel_search can be 0 or 1 or 2
+ if (use_accurate_subpel_search)
+ besterr = upsampled_setup_obmc_center_error(
+ xd, cm, mi_row, mi_col, mask, bestmv, ref_mv, error_per_bit, vfp, z, y,
+ y_stride, w, h, offset, mvjcost, mvcost, sse1, distortion,
+ use_accurate_subpel_search);
+ else
+ besterr = setup_obmc_center_error(mask, bestmv, ref_mv, error_per_bit, vfp,
+ z, y, y_stride, offset, mvjcost, mvcost,
+ sse1, distortion);
+
+ for (iter = 0; iter < round; ++iter) {
+ // Check vertical and horizontal sub-pixel positions.
+ for (idx = 0; idx < 4; ++idx) {
+ tr = br + search_step[idx].row;
+ tc = bc + search_step[idx].col;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc),
+ sp(tr), src_address, mask, &sse);
+ }
+
+ cost_array[idx] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost,
+ mvcost, error_per_bit);
+ if (cost_array[idx] < besterr) {
+ best_idx = idx;
+ besterr = cost_array[idx];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;
+ }
+ }
+
+ // Check diagonal sub-pixel position
+ kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+ kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+ tc = bc + kc;
+ tr = br + kr;
+ if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+ MV this_mv = { tr, tc };
+
+ if (use_accurate_subpel_search) {
+ thismse = upsampled_obmc_pref_error(
+ xd, cm, mi_row, mi_col, &this_mv, mask, vfp, src_address,
+ pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr), w, h, &sse,
+ use_accurate_subpel_search);
+ } else {
+ thismse = vfp->osvf(pre(y, y_stride, tr, tc), y_stride, sp(tc), sp(tr),
+ src_address, mask, &sse);
+ }
+
+ cost_array[4] = thismse + mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost,
+ error_per_bit);
+
+ if (cost_array[4] < besterr) {
+ best_idx = 4;
+ besterr = cost_array[4];
+ *distortion = thismse;
+ *sse1 = sse;
+ }
+ } else {
+ cost_array[idx] = INT_MAX;  // idx == 4 here: the loop above ran to completion
+ }
+
+ if (best_idx < 4 && best_idx >= 0) {
+ br += search_step[best_idx].row;
+ bc += search_step[best_idx].col;
+ } else if (best_idx == 4) {
+ br = tr;
+ bc = tc;
+ }
+
+ if (iters_per_step > 1 && best_idx != -1) {
+ if (use_accurate_subpel_search) {
+ SECOND_LEVEL_CHECKS_BEST(1);
+ } else {
+ SECOND_LEVEL_CHECKS_BEST(0);
+ }
+ }
+
+ tr = br;
+ tc = bc;
+
+ search_step += 4;
+ hstep >>= 1;
+ best_idx = -1;
+ }
+
+ // These lines ensure static analysis doesn't warn that
+ // tr and tc aren't used after the above point.
+ (void)tr;
+ (void)tc;
+
+ bestmv->row = br;
+ bestmv->col = bc;
+
+ return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
+// vfp->ovf error at best_mv, plus the cost of (8 * best_mv) if use_mvcost.
+static int get_obmc_mvpred_var(const MACROBLOCK *x, const int32_t *wsrc,
+                               const int32_t *mask, const MV *best_mv,
+                               const MV *center_mv,
+                               const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+                               int is_second) {
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[is_second];
+  const MV scaled_mv = { best_mv->row * 8, best_mv->col * 8 };
+  unsigned int sse_unused;
+  unsigned int err = vfp->ovf(get_buf_from_mv(ref, best_mv), ref->stride, wsrc,
+                              mask, &sse_unused);
+  if (use_mvcost)
+    err += mv_err_cost(&scaled_mv, center_mv, x->nmvjointcost, x->mvcost,
+                       x->errorperbit);
+  return err;
+}
+
+// OBMC version of the 1-away refinement: for up to search_range passes, moves
+// *ref_mv to the in-limits axis neighbour with the lowest fn_ptr->osdf SAD plus
+// MV cost, stopping when no neighbour improves. Returns the best cost.
+int obmc_refining_search_sad(const MACROBLOCK *x, const int32_t *wsrc,
+                             const int32_t *mask, MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const aom_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, int is_second) {
+  const MV steps[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } };
+  const struct buf_2d *const ref = &x->e_mbd.plane[0].pre[is_second];
+  const MV full_center = { center_mv->row >> 3, center_mv->col >> 3 };
+  unsigned int best_sad =
+      fn_ptr->osdf(get_buf_from_mv(ref, ref_mv), ref->stride, wsrc, mask) +
+      mvsad_err_cost(x, ref_mv, &full_center, error_per_bit);
+
+  for (int pass = 0; pass < search_range; ++pass) {
+    int winner = -1;
+
+    for (int k = 0; k < 4; ++k) {
+      const MV cand = { ref_mv->row + steps[k].row,
+                        ref_mv->col + steps[k].col };
+      if (!is_mv_in(&x->mv_limits, &cand)) continue;
+
+      unsigned int sad =
+          fn_ptr->osdf(get_buf_from_mv(ref, &cand), ref->stride, wsrc, mask);
+      // Only add the MV cost when the raw SAD is already competitive.
+      if (sad >= best_sad) continue;
+      sad += mvsad_err_cost(x, &cand, &full_center, error_per_bit);
+      if (sad < best_sad) {
+        best_sad = sad;
+        winner = k;
+      }
+    }
+
+    if (winner < 0) break;
+
+    ref_mv->row += steps[winner].row;
+    ref_mv->col += steps[winner].col;
+  }
+
+  return best_sad;
+}
+// OBMC diamond search from *ref_mv (clamped in place); best vector -> *best_mv.
+int obmc_diamond_search_sad(const MACROBLOCK *x, const search_site_config *cfg,
+ const int32_t *wsrc, const int32_t *mask,
+ MV *ref_mv, MV *best_mv, int search_param,
+ int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, int is_second) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+ // search_param determines the length of the initial step and hence the number
+ // of iterations
+ // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 =
+ // (MAX_FIRST_STEP/4) pel... etc.
+ const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+ const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+ const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 };
+ const uint8_t *best_address, *in_what_ref;
+ int best_sad = INT_MAX;
+ int best_site = 0;
+ int last_site = 0;
+ int i, j, step;
+
+ clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+ x->mv_limits.row_min, x->mv_limits.row_max);
+ in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
+ best_address = in_what_ref;
+ *num00 = 0;
+ *best_mv = *ref_mv;
+
+ // Check the starting position
+ best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) +
+ mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+ i = 1;  // ss[0] is not tested; i keeps increasing across all steps
+
+ for (step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j++) {
+ const MV mv = { best_mv->row + ss[i].mv.row,
+ best_mv->col + ss[i].mv.col };
+ if (is_mv_in(&x->mv_limits, &mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+ wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_site = i;
+ }
+ }
+ }
+
+ i++;
+ }
+
+ if (best_site != last_site) {
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+ while (1) {
+ const MV this_mv = { best_mv->row + ss[best_site].mv.row,
+ best_mv->col + ss[best_site].mv.col };
+ if (is_mv_in(&x->mv_limits, &this_mv)) {
+ int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
+ in_what->stride, wsrc, mask);
+ if (sad < best_sad) {
+ sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+ if (sad < best_sad) {
+ best_sad = sad;
+ best_mv->row += ss[best_site].mv.row;
+ best_mv->col += ss[best_site].mv.col;
+ best_address += ss[best_site].offset;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+#endif
+ } else if (best_address == in_what_ref) {
+ (*num00)++;  // counts steps in which the best position is still the start
+ }
+ }
+ return best_sad;
+}
+
+// OBMC full-pel search: an initial diamond search from mvp_full, up to
+// further_steps extra diamond searches with larger step_param, and an optional
+// final 1-away refinement. Best vector goes to *dst_mv; returns its score.
+static int obmc_full_pixel_diamond(const AV1_COMP *cpi, MACROBLOCK *x,
+                                   MV *mvp_full, int step_param, int sadpb,
+                                   int further_steps, int do_refine,
+                                   const aom_variance_fn_ptr_t *fn_ptr,
+                                   const MV *ref_mv, MV *dst_mv,
+                                   int is_second) {
+  const int32_t *const wsrc = x->wsrc_buf;
+  const int32_t *const mask = x->mask_buf;
+  MV cand_mv;
+  int steps_done, skips = 0;
+  int best_err = obmc_diamond_search_sad(
+      x, &cpi->ss_cfg, wsrc, mask, mvp_full, &cand_mv, step_param, sadpb,
+      &steps_done, fn_ptr, ref_mv, is_second);
+  if (best_err < INT_MAX)
+    best_err = get_obmc_mvpred_var(x, wsrc, mask, &cand_mv, ref_mv, fn_ptr, 1,
+                                   is_second);
+  *dst_mv = cand_mv;
+
+  // If there won't be more n-step search, check whether refining is needed.
+  if (steps_done > further_steps) do_refine = 0;
+
+  while (steps_done < further_steps) {
+    ++steps_done;
+    // Steps the previous search flagged (via its num00 output) are skipped.
+    if (skips) {
+      --skips;
+      continue;
+    }
+    int err = obmc_diamond_search_sad(
+        x, &cpi->ss_cfg, wsrc, mask, mvp_full, &cand_mv,
+        step_param + steps_done, sadpb, &skips, fn_ptr, ref_mv, is_second);
+    if (err < INT_MAX)
+      err = get_obmc_mvpred_var(x, wsrc, mask, &cand_mv, ref_mv, fn_ptr, 1,
+                                is_second);
+    // Check whether the refining search is still needed.
+    if (skips > further_steps - steps_done) do_refine = 0;
+    if (err < best_err) {
+      best_err = err;
+      *dst_mv = cand_mv;
+    }
+  }
+
+  // Final 1-away diamond refining search.
+  if (do_refine) {
+    const int search_range = 8;
+    MV refined_mv = *dst_mv;
+    int err = obmc_refining_search_sad(x, wsrc, mask, &refined_mv, sadpb,
+                                       search_range, fn_ptr, ref_mv, is_second);
+    if (err < INT_MAX)
+      err = get_obmc_mvpred_var(x, wsrc, mask, &refined_mv, ref_mv, fn_ptr, 1,
+                                is_second);
+    if (err < best_err) {
+      best_err = err;
+      *dst_mv = refined_mv;
+    }
+  }
+  return best_err;
+}
+
+// OBMC full-pel search entry point: the diamond search when the speed feature
+// obmc_full_pixel_search_level is 0, else only a 1-away refinement of mvp_full.
+int av1_obmc_full_pixel_search(const AV1_COMP *cpi, MACROBLOCK *x, MV *mvp_full,
+                               int step_param, int sadpb, int further_steps,
+                               int do_refine,
+                               const aom_variance_fn_ptr_t *fn_ptr,
+                               const MV *ref_mv, MV *dst_mv, int is_second) {
+  if (cpi->sf.obmc_full_pixel_search_level == 0) {
+    return obmc_full_pixel_diamond(cpi, x, mvp_full, step_param, sadpb,
+                                   further_steps, do_refine, fn_ptr, ref_mv,
+                                   dst_mv, is_second);
+  }
+
+  const int search_range = 8;
+  *dst_mv = *mvp_full;
+  clamp_mv(dst_mv, x->mv_limits.col_min, x->mv_limits.col_max,
+           x->mv_limits.row_min, x->mv_limits.row_max);
+  int err = obmc_refining_search_sad(x, x->wsrc_buf, x->mask_buf, dst_mv, sadpb,
+                                     search_range, fn_ptr, ref_mv, is_second);
+  if (err < INT_MAX)
+    err = get_obmc_mvpred_var(x, x->wsrc_buf, x->mask_buf, dst_mv, ref_mv,
+                              fn_ptr, 1, is_second);
+  return err;
+}
+
+// Note(yunqingwang): The two functions below are used only by the motion
+// vector unit test; they return the extreme motion vectors allowed by the MV
+// limits. COMMON_MV_TEST runs SETUP_SUBPEL_SEARCH and void-casts the names below.
+#define COMMON_MV_TEST \
+ SETUP_SUBPEL_SEARCH; \
+ \
+ (void)error_per_bit; \
+ (void)vfp; \
+ (void)src_address; \
+ (void)src_stride; \
+ (void)y; \
+ (void)y_stride; \
+ (void)second_pred; \
+ (void)w; \
+ (void)h; \
+ (void)use_accurate_subpel_search; \
+ (void)offset; \
+ (void)mvjcost; \
+ (void)mvcost; \
+ (void)sse1; \
+ (void)distortion; \
+ \
+ (void)halfiters; \
+ (void)quarteriters; \
+ (void)eighthiters; \
+ (void)whichdir; \
+ (void)forced_stop; \
+ (void)hstep; \
+ \
+ (void)tr; \
+ (void)tc; \
+ (void)sse; \
+ (void)thismse; \
+ (void)cost_list;
+// Return the maximum MV: sets bestmv to (maxr, maxc) and returns 0.
+int av1_return_max_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *ref_mv,
+ int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ COMMON_MV_TEST;  // presumably declares bestmv, besterr, minr/maxr, minc/maxc - confirm
+ (void)mask;
+ (void)mask_stride;
+ (void)invert_mask;
+ (void)minr;
+ (void)minc;
+
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ bestmv->row = maxr;
+ bestmv->col = maxc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
+// Return the minimum MV: sets bestmv to (minr, minc) and returns 0.
+int av1_return_min_sub_pixel_mv(MACROBLOCK *x, const AV1_COMMON *const cm,
+ int mi_row, int mi_col, const MV *ref_mv,
+ int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, int iters_per_step,
+ int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search) {
+ COMMON_MV_TEST;  // presumably declares bestmv, besterr, minr/maxr, minc/maxc - confirm
+ (void)maxr;
+ (void)maxc;
+ (void)mask;
+ (void)mask_stride;
+ (void)invert_mask;
+
+ (void)cm;
+ (void)mi_row;
+ (void)mi_col;
+
+ bestmv->row = minr;
+ bestmv->col = minc;
+ besterr = 0;
+ // In the sub-pel motion search, if hp is not used, then the last bit of mv
+ // has to be 0.
+ lower_mv_precision(bestmv, allow_hp, 0);
+ return besterr;
+}
diff --git a/third_party/aom/av1/encoder/mcomp.h b/third_party/aom/av1/encoder/mcomp.h
new file mode 100644
index 0000000000..a975218b09
--- /dev/null
+++ b/third_party/aom/av1/encoder/mcomp.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_MCOMP_H_
+#define AOM_AV1_ENCODER_MCOMP_H_
+
+#include "av1/encoder/block.h"
+#include "aom_dsp/variance.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The maximum number of steps in a step search given the largest
+// allowed initial step
+#define MAX_MVSEARCH_STEPS 11
+// Max full pel mv specified in the unit of full pixel
+// Enable the use of motion vector in range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
+// Maximum size of the first step in full pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1))
+// Allowed motion vector pixel distance outside image border
+// for Block_16x16
+#define BORDER_MV_PIXELS_B16 (16 + AOM_INTERP_EXTEND)
+
+// Geometry constants with the _8P suffix: a range of 3 gives a stride of
+// 2 * 3 + 1 = 7 and a center index of 3 * 7 + 3 = 24.
+// NOTE(review): presumably these describe a row-major 7x7 grid of candidate
+// positions around the current MV -- confirm against the users in mcomp.c.
+#define SEARCH_RANGE_8P 3
+#define SEARCH_GRID_STRIDE_8P (2 * SEARCH_RANGE_8P + 1)
+#define SEARCH_GRID_CENTER_8P \
+ (SEARCH_RANGE_8P * SEARCH_GRID_STRIDE_8P + SEARCH_RANGE_8P)
+
+// motion search site
+// One candidate position: a motion vector paired with an integer offset.
+// NOTE(review): 'offset' is presumably the buffer offset matching 'mv' for the
+// stride given to av1_init_*motion_compensation() -- confirm.
+typedef struct search_site {
+ MV mv;
+ int offset;
+} search_site;
+
+// Table of search sites, sized for 8 sites per step over MAX_MVSEARCH_STEPS
+// steps plus one extra entry.
+// NOTE(review): ss_count and searches_per_step presumably hold the number of
+// valid entries in 'ss' and the number of sites visited per step -- confirm
+// against av1_init_dsmotion_compensation()/av1_init3smotion_compensation().
+typedef struct search_site_config {
+ search_site ss[8 * MAX_MVSEARCH_STEPS + 1];
+ int ss_count;
+ int searches_per_step;
+} search_site_config;
+
+// A neighbor position expressed both as an MV ('coord') and as an integer
+// offset ('coord_offset').
+// NOTE(review): the offset is presumably an index into a search grid or
+// buffer -- confirm against the users in mcomp.c.
+typedef struct {
+ MV coord;
+ int coord_offset;
+} search_neighbors;
+
+void av1_init_dsmotion_compensation(search_site_config *cfg, int stride);
+void av1_init3smotion_compensation(search_site_config *cfg, int stride);
+
+void av1_set_mv_search_range(MvLimits *mv_limits, const MV *mv);
+
+int av1_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost,
+ int *mvcost[2], int weight);
+
+// Utility to compute variance + MV rate cost for a given MV
+int av1_get_mvpred_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+int av1_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost);
+int av1_get_mvpred_mask_var(const MACROBLOCK *x, const MV *best_mv,
+ const MV *center_mv, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const aom_variance_fn_ptr_t *vfp,
+ int use_mvcost);
+
+struct AV1_COMP;
+struct SPEED_FEATURES;
+
+int av1_init_search_range(int size);
+
+int av1_refining_search_sad(struct macroblock *x, MV *ref_mv, int sad_per_bit,
+ int distance, const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv);
+
+// Runs sequence of diamond searches in smaller steps for RD.
+int av1_full_pixel_diamond(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine, int *cost_list,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv);
+
+int av1_hex_search(MACROBLOCK *x, MV *start_mv, int search_param,
+ int sad_per_bit, int do_init_search, int *cost_list,
+ const aom_variance_fn_ptr_t *vfp, int use_mvcost,
+ const MV *center_mv);
+
+// Function type shared by the sub-pixel motion search routines declared
+// right below (av1_find_best_sub_pixel_tree* and av1_return_{max,min}_
+// sub_pixel_mv). Note this is a function type, not a pointer type; the
+// 'extern' declarations that follow use it to declare the functions
+// themselves.
+typedef int(fractional_mv_step_fp)(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp,
+ int forced_stop, // 0 - full, 1 - qtr only, 2 - half only
+ int iters_per_step, int *cost_list, int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1, const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride, int invert_mask, int w, int h,
+ int use_accurate_subpel_search);
+
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_more;
+extern fractional_mv_step_fp av1_find_best_sub_pixel_tree_pruned_evenmore;
+extern fractional_mv_step_fp av1_return_max_sub_pixel_mv;
+extern fractional_mv_step_fp av1_return_min_sub_pixel_mv;
+
+typedef int (*av1_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
+ int sad_per_bit, int distance,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv, MV *best_mv);
+
+typedef int (*av1_diamond_search_fn_t)(
+ MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
+ int search_param, int sad_per_bit, int *num00,
+ const aom_variance_fn_ptr_t *fn_ptr, const MV *center_mv);
+
+int av1_refining_search_8p_c(MACROBLOCK *x, int error_per_bit, int search_range,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask, const MV *center_mv,
+ const uint8_t *second_pred);
+
+int av1_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *mvp_full, int step_param,
+ int method, int run_mesh_search, int error_per_bit,
+ int *cost_list, const MV *ref_mv, int var_max, int rd,
+ int x_pos, int y_pos, int intra);
+
+int av1_obmc_full_pixel_search(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ MV *mvp_full, int step_param, int sadpb,
+ int further_steps, int do_refine,
+ const aom_variance_fn_ptr_t *fn_ptr,
+ const MV *ref_mv, MV *dst_mv, int is_second);
+int av1_find_best_obmc_sub_pixel_tree_up(
+ MACROBLOCK *x, const AV1_COMMON *const cm, int mi_row, int mi_col,
+ MV *bestmv, const MV *ref_mv, int allow_hp, int error_per_bit,
+ const aom_variance_fn_ptr_t *vfp, int forced_stop, int iters_per_step,
+ int *mvjcost, int *mvcost[2], int *distortion, unsigned int *sse1,
+ int is_second, int use_accurate_subpel_search);
+
+unsigned int av1_compute_motion_cost(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const MV *this_mv);
+unsigned int av1_refine_warped_mv(const struct AV1_COMP *cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, int *pts0,
+ int *pts_inref0, int total_samples);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_MCOMP_H_
diff --git a/third_party/aom/av1/encoder/mips/msa/error_msa.c b/third_party/aom/av1/encoder/mips/msa/error_msa.c
new file mode 100644
index 0000000000..2e86dee430
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/error_msa.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+// Generates a static function block_error_<BSize>size_msa(coeff_ptr,
+// dq_coeff_ptr, ssz) operating on BSize 16-bit coefficients.
+// The first 16 coefficients are handled before the loop (two loads at +0 and
+// +8); the loop then runs (BSize >> 4) - 1 more times, advancing both pointers
+// by 16 per iteration. Two pairs of 64-bit vector accumulators are kept:
+// sq_coeff_r/sq_coeff_l (products of the widened coefficients with
+// themselves) and err0/err1 (products of the coeff/dq_coeff differences with
+// themselves). Their lanes are summed at the end; the first sum is stored to
+// *ssz and the second is returned.
+// NOTE(review): the exact lane arithmetic lives in the MSA helper macros
+// (LD_SH, UNPCK_SH_SW, ILVRL_H2_SH, HSUB_UH2_SW, DOTP_SW2_SD, DPADD_SD2_SD),
+// which are defined outside this file -- the description above follows their
+// names and should be confirmed against macros_msa.h.
+#define BLOCK_ERROR_BLOCKSIZE_MSA(BSize) \
+ static int64_t block_error_##BSize##size_msa( \
+ const int16_t *coeff_ptr, const int16_t *dq_coeff_ptr, int64_t *ssz) { \
+ int64_t err = 0; \
+ uint32_t loop_cnt; \
+ v8i16 coeff, dq_coeff, coeff_r_h, coeff_l_h; \
+ v4i32 diff_r, diff_l, coeff_r_w, coeff_l_w; \
+ v2i64 sq_coeff_r, sq_coeff_l; \
+ v2i64 err0, err_dup0, err1, err_dup1; \
+ \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DOTP_SW2_SD(coeff_r_w, coeff_l_w, coeff_r_w, coeff_l_w, sq_coeff_r, \
+ sq_coeff_l); \
+ DOTP_SW2_SD(diff_r, diff_l, diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ \
+ for (loop_cnt = ((BSize >> 4) - 1); loop_cnt--;) { \
+ coeff = LD_SH(coeff_ptr); \
+ dq_coeff = LD_SH(dq_coeff_ptr); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff = LD_SH(coeff_ptr + 8); \
+ dq_coeff = LD_SH(dq_coeff_ptr + 8); \
+ UNPCK_SH_SW(coeff, coeff_r_w, coeff_l_w); \
+ ILVRL_H2_SH(coeff, dq_coeff, coeff_r_h, coeff_l_h); \
+ HSUB_UH2_SW(coeff_r_h, coeff_l_h, diff_r, diff_l); \
+ DPADD_SD2_SD(coeff_r_w, coeff_l_w, sq_coeff_r, sq_coeff_l); \
+ DPADD_SD2_SD(diff_r, diff_l, err0, err1); \
+ \
+ coeff_ptr += 16; \
+ dq_coeff_ptr += 16; \
+ } \
+ \
+ err_dup0 = __msa_splati_d(sq_coeff_r, 1); \
+ err_dup1 = __msa_splati_d(sq_coeff_l, 1); \
+ sq_coeff_r += err_dup0; \
+ sq_coeff_l += err_dup1; \
+ *ssz = __msa_copy_s_d(sq_coeff_r, 0); \
+ *ssz += __msa_copy_s_d(sq_coeff_l, 0); \
+ \
+ err_dup0 = __msa_splati_d(err0, 1); \
+ err_dup1 = __msa_splati_d(err1, 1); \
+ err0 += err_dup0; \
+ err1 += err_dup1; \
+ err = __msa_copy_s_d(err0, 0); \
+ err += __msa_copy_s_d(err1, 0); \
+ \
+ return err; \
+ }
+
+// Instantiate the kernels for 16, 64, 256 and 1024 coefficients; these are the
+// sizes av1_block_error_msa() dispatches on.
+/* clang-format off */
+BLOCK_ERROR_BLOCKSIZE_MSA(16)
+BLOCK_ERROR_BLOCKSIZE_MSA(64)
+BLOCK_ERROR_BLOCKSIZE_MSA(256)
+BLOCK_ERROR_BLOCKSIZE_MSA(1024)
+/* clang-format on */
+
+// MSA entry point for the block error computation.
+// Block sizes of 16, 64, 256 and 1024 coefficients are routed to the matching
+// block_error_<N>size_msa kernel; every other size is forwarded unchanged to
+// av1_block_error_c(). The kernel (or fallback) result is returned and *ssz is
+// written by the callee.
+// NOTE(review): the kernels read the buffers through int16_t pointers, so the
+// casts below are only valid if tran_low_t is a 16-bit type -- confirm against
+// the tran_low_t definition.
+int64_t av1_block_error_msa(const tran_low_t *coeff_ptr,
+                            const tran_low_t *dq_coeff_ptr, intptr_t blk_size,
+                            int64_t *ssz) {
+  const int16_t *const coeff16 = (const int16_t *)coeff_ptr;
+  const int16_t *const dq_coeff16 = (const int16_t *)dq_coeff_ptr;
+
+  switch (blk_size) {
+    case 16: return block_error_16size_msa(coeff16, dq_coeff16, ssz);
+    case 64: return block_error_64size_msa(coeff16, dq_coeff16, ssz);
+    case 256: return block_error_256size_msa(coeff16, dq_coeff16, ssz);
+    case 1024: return block_error_1024size_msa(coeff16, dq_coeff16, ssz);
+    default: return av1_block_error_c(coeff_ptr, dq_coeff_ptr, blk_size, ssz);
+  }
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
new file mode 100644
index 0000000000..085c08bfb8
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/fdct4x4_msa.c
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "av1/common/enums.h"
+
+// 4x4 transform on 16-bit input using MSA vectors (the name suggests a
+// forward Walsh-Hadamard transform).
+// input:      source rows, src_stride elements apart (four rows are loaded).
+// output:     destination, written as four groups at offsets 0, 4, 8 and 12.
+// Structure: one add/subtract butterfly pass, a 4x4 transpose, a second
+// butterfly pass, a scale step (SLLI_4V with 2), and a final transpose before
+// the stores.
+// NOTE(review): LD_SH4, SUB2, TRANSPOSE4x4_SH_SH, SLLI_4V and ST4x2_UB are
+// helper macros defined outside this file; their exact semantics should be
+// confirmed there.
+void av1_fwht4x4_msa(const int16_t *input, int16_t *output,
+ int32_t src_stride) {
+ v8i16 in0, in1, in2, in3, in4;
+
+ LD_SH4(input, src_stride, in0, in1, in2, in3);
+
+ // First butterfly pass.
+ in0 += in1;
+ in3 -= in2;
+ in4 = (in0 - in3) >> 1;
+ SUB2(in4, in1, in4, in2, in1, in2);
+ in0 -= in2;
+ in3 += in1;
+
+ TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1);
+
+ // Second butterfly pass on the transposed data.
+ in0 += in2;
+ in1 -= in3;
+ in4 = (in0 - in1) >> 1;
+ SUB2(in4, in2, in4, in3, in2, in3);
+ in0 -= in3;
+ in1 += in2;
+
+ SLLI_4V(in0, in1, in2, in3, 2);
+
+ TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2);
+
+ // Note the register order: in0, in3, in1, in2.
+ ST4x2_UB(in0, output, 4);
+ ST4x2_UB(in3, output + 4, 4);
+ ST4x2_UB(in1, output + 8, 4);
+ ST4x2_UB(in2, output + 12, 4);
+}
diff --git a/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
new file mode 100644
index 0000000000..531ae090a3
--- /dev/null
+++ b/third_party/aom/av1/encoder/mips/msa/temporal_filter_msa.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/mips/macros_msa.h"
+
+// Temporal filter kernel covering 64 pixels: two loop iterations, each
+// handling two groups of 16 pixels.
+// frm1_ptr: read as four 64-bit words per iteration, 'stride' bytes apart.
+// frm2_ptr: read as 32 contiguous bytes per iteration.
+// filt_sth: broadcast into 'strength' and used as the shift amount in
+//           SRAR_W4_SW.
+// filt_wgt: broadcast into 'filt_wt' and multiplied into every modifier.
+// acc, cnt: running sums updated in place; both advance by 32 elements per
+//           iteration.
+// Per 32-bit lane: mod = diff * diff * 3, shifted by 'strength'; lanes with
+// mod >= 16 become 0, the others become (16 - mod) * filt_wgt. The modifier is
+// added to cnt, and modifier * frm2 pixel is added to acc.
+// NOTE(review): 'diff' is presumably frm1 - frm2 per pixel, and the rounding
+// behaviour of SRAR_W4_SW is defined by the MSA helper macros outside this
+// file -- confirm there.
+static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr, uint32_t stride,
+ uint8_t *frm2_ptr, int32_t filt_sth,
+ int32_t filt_wgt, uint32_t *acc,
+ uint16_t *cnt) {
+ uint32_t row;
+ uint64_t f0, f1, f2, f3;
+ v16i8 frm2, frm1 = { 0 };
+ v16i8 frm4, frm3 = { 0 };
+ v16u8 frm_r, frm_l;
+ v8i16 frm2_r, frm2_l;
+ v8i16 diff0, diff1, mod0_h, mod1_h;
+ v4i32 cnst3, cnst16, filt_wt, strength;
+ v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+ v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+ v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+ v4i32 acc0, acc1, acc2, acc3;
+ v8i16 cnt0, cnt1;
+
+ filt_wt = __msa_fill_w(filt_wgt);
+ strength = __msa_fill_w(filt_sth);
+ cnst3 = __msa_ldi_w(3);
+ cnst16 = __msa_ldi_w(16);
+
+ for (row = 2; row--;) {
+ LD4(frm1_ptr, stride, f0, f1, f2, f3);
+ frm1_ptr += (4 * stride);
+
+ LD_SB2(frm2_ptr, 16, frm2, frm4);
+ frm2_ptr += 32;
+
+ // Accumulators/counters for the first 16 pixels of this iteration.
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ // Pack the four 64-bit words into two 16-byte vectors (frm1 and frm3).
+ INSERT_D2_SB(f0, f1, frm1);
+ INSERT_D2_SB(f2, f3, frm3);
+ ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ // Lane masks: all-ones where the modifier is below 16.
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ // Zero the lanes whose modifier was not below 16.
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+
+ UNPCK_UB_SH(frm2, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+
+ // Second group of 16 pixels (frm3 against frm4); same steps as above.
+ LD_SW2(acc, 4, acc0, acc1);
+ LD_SW2(acc + 8, 4, acc2, acc3);
+ LD_SH2(cnt, 8, cnt0, cnt1);
+
+ ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+ HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+ UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+ UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+ MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+ SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+ diff0_r = (mod0_w < cnst16);
+ diff0_l = (mod1_w < cnst16);
+ diff1_r = (mod2_w < cnst16);
+ diff1_l = (mod3_w < cnst16);
+
+ SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+ mod1_w, mod2_w, mod3_w);
+
+ mod0_w = diff0_r & mod0_w;
+ mod1_w = diff0_l & mod1_w;
+ mod2_w = diff1_r & mod2_w;
+ mod3_w = diff1_l & mod3_w;
+
+ MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+ ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+ ST_SH2(mod0_h, mod1_h, cnt, 8);
+ cnt += 16;
+ UNPCK_UB_SH(frm4, frm2_r, frm2_l);
+ UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+ UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+ MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+ mod0_w, mod1_w, mod2_w, mod3_w);
+ ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+ mod2_w, mod3_w);
+
+ ST_SW2(mod0_w, mod1_w, acc, 4);
+ acc += 8;
+ ST_SW2(mod2_w, mod3_w, acc, 4);
+ acc += 8;
+ }
+}
+
+// Temporal filter kernel covering 256 pixels: eight loop iterations, each
+// handling two 16-byte rows.
+// frm1_ptr: two 16-byte rows are loaded per iteration, 'stride' bytes apart;
+//           the pointer advances by 2 * stride per iteration.
+// frm2_ptr: 32 contiguous bytes are loaded per iteration.
+// filt_sth: broadcast into 'strength' and used as the shift amount in
+//           SRAR_W4_SW.
+// filt_wgt: broadcast into 'filt_wt' and multiplied into every modifier.
+// acc, cnt: running sums updated in place; both advance by 32 elements per
+//           iteration.
+// Per 32-bit lane: mod = diff * diff * 3, shifted by 'strength'; lanes with
+// mod >= 16 become 0, the others become (16 - mod) * filt_wgt. The modifier is
+// added to cnt, and modifier * frm2 pixel is added to acc.
+// NOTE(review): 'diff' is presumably frm1 - frm2 per pixel, and the rounding
+// behaviour of SRAR_W4_SW is defined by the MSA helper macros outside this
+// file -- confirm there.
+static void temporal_filter_apply_16size_msa(uint8_t *frm1_ptr, uint32_t stride,
+                                             uint8_t *frm2_ptr,
+                                             int32_t filt_sth, int32_t filt_wgt,
+                                             uint32_t *acc, uint16_t *cnt) {
+  uint32_t row;
+  v16i8 frm1, frm2, frm3, frm4;
+  v16u8 frm_r, frm_l;
+  v16i8 zero = { 0 };
+  v8u16 frm2_r, frm2_l;
+  v8i16 diff0, diff1, mod0_h, mod1_h;
+  v4i32 cnst3, cnst16, filt_wt, strength;
+  v4i32 mod0_w, mod1_w, mod2_w, mod3_w;
+  v4i32 diff0_r, diff0_l, diff1_r, diff1_l;
+  v4i32 frm2_rr, frm2_rl, frm2_lr, frm2_ll;
+  v4i32 acc0, acc1, acc2, acc3;
+  v8i16 cnt0, cnt1;
+
+  filt_wt = __msa_fill_w(filt_wgt);
+  strength = __msa_fill_w(filt_sth);
+  cnst3 = __msa_ldi_w(3);
+  cnst16 = __msa_ldi_w(16);
+
+  for (row = 8; row--;) {
+    LD_SB2(frm1_ptr, stride, frm1, frm3);
+    frm1_ptr += stride;
+
+    LD_SB2(frm2_ptr, 16, frm2, frm4);
+    frm2_ptr += 16;
+
+    // Accumulators/counters for the first row. acc2/acc3 must come from
+    // acc + 8: they are summed into mod2_w/mod3_w, which are stored back at
+    // acc + 8 below.
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm1, frm2, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    // Lane masks: all-ones where the modifier is below 16.
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+
+    // Zero the lanes whose modifier was not below 16.
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm2, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    // Second row (frm3 against frm4); same steps as above.
+    LD_SW2(acc, 4, acc0, acc1);
+    LD_SW2(acc + 8, 4, acc2, acc3);
+    LD_SH2(cnt, 8, cnt0, cnt1);
+
+    ILVRL_B2_UB(frm3, frm4, frm_r, frm_l);
+    HSUB_UB2_SH(frm_r, frm_l, diff0, diff1);
+    UNPCK_SH_SW(diff0, diff0_r, diff0_l);
+    UNPCK_SH_SW(diff1, diff1_r, diff1_l);
+    MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    MUL4(mod0_w, cnst3, mod1_w, cnst3, mod2_w, cnst3, mod3_w, cnst3, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+    SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength);
+
+    diff0_r = (mod0_w < cnst16);
+    diff0_l = (mod1_w < cnst16);
+    diff1_r = (mod2_w < cnst16);
+    diff1_l = (mod3_w < cnst16);
+
+    SUB4(cnst16, mod0_w, cnst16, mod1_w, cnst16, mod2_w, cnst16, mod3_w, mod0_w,
+         mod1_w, mod2_w, mod3_w);
+
+    mod0_w = diff0_r & mod0_w;
+    mod1_w = diff0_l & mod1_w;
+    mod2_w = diff1_r & mod2_w;
+    mod3_w = diff1_l & mod3_w;
+
+    MUL4(mod0_w, filt_wt, mod1_w, filt_wt, mod2_w, filt_wt, mod3_w, filt_wt,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h);
+    ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h);
+    ST_SH2(mod0_h, mod1_h, cnt, 8);
+    cnt += 16;
+
+    ILVRL_B2_UH(zero, frm4, frm2_r, frm2_l);
+    UNPCK_SH_SW(frm2_r, frm2_rr, frm2_rl);
+    UNPCK_SH_SW(frm2_l, frm2_lr, frm2_ll);
+    MUL4(mod0_w, frm2_rr, mod1_w, frm2_rl, mod2_w, frm2_lr, mod3_w, frm2_ll,
+         mod0_w, mod1_w, mod2_w, mod3_w);
+    ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w,
+         mod2_w, mod3_w);
+    ST_SW2(mod0_w, mod1_w, acc, 4);
+    acc += 8;
+    ST_SW2(mod2_w, mod3_w, acc, 4);
+    acc += 8;
+
+    // Step past the second row that was consumed in this iteration.
+    frm1_ptr += stride;
+    frm2_ptr += 16;
+  }
+}
+
+// MSA entry point for the temporal filter.
+// Dispatches on the product blk_w * blk_h: 8 selects the 8-size kernel, 16
+// selects the 16-size kernel, and anything else is forwarded to
+// av1_temporal_filter_apply_c() with the original arguments.
+// NOTE(review): the kernels consume 64 and 256 pixels respectively, so keying
+// on an area of 8 or 16 looks unintended (8x8 and 16x16 blocks take the C
+// path). Left as is because changing it would alter which implementation
+// runs -- confirm with the encoder owners.
+void av1_temporal_filter_apply_msa(uint8_t *frame1_ptr, uint32_t stride,
+                                   uint8_t *frame2_ptr, uint32_t blk_w,
+                                   uint32_t blk_h, int32_t strength,
+                                   int32_t filt_wgt, uint32_t *accu,
+                                   uint16_t *cnt) {
+  switch (blk_w * blk_h) {
+    case 8:
+      temporal_filter_apply_8size_msa(frame1_ptr, stride, frame2_ptr, strength,
+                                      filt_wgt, accu, cnt);
+      break;
+    case 16:
+      temporal_filter_apply_16size_msa(frame1_ptr, stride, frame2_ptr,
+                                       strength, filt_wgt, accu, cnt);
+      break;
+    default:
+      av1_temporal_filter_apply_c(frame1_ptr, stride, frame2_ptr, blk_w, blk_h,
+                                  strength, filt_wgt, accu, cnt);
+      break;
+  }
+}
diff --git a/third_party/aom/av1/encoder/ml.c b/third_party/aom/av1/encoder/ml.c
new file mode 100644
index 0000000000..d21def43a8
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/encoder/ml.h"
+
+// Runs a fully connected feed-forward network described by nn_config.
+// features:  nn_config->num_inputs input values.
+// nn_config: layer sizes plus per-layer weight and bias arrays. For each
+//            layer the weights are consumed node by node, num_input_nodes
+//            values per output node.
+// output:    receives nn_config->num_outputs values.
+// Hidden layers apply ReLU; the final layer is linear. Hidden activations
+// ping-pong between two stack buffers of NN_MAX_NODES_PER_LAYER floats, so a
+// hidden layer may hold at most NN_MAX_NODES_PER_LAYER nodes.
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+                    float *output) {
+  int num_input_nodes = nn_config->num_inputs;
+  int buf_index = 0;
+  float buf[2][NN_MAX_NODES_PER_LAYER];
+  const float *input_nodes = features;
+
+  // Propagate hidden layers.
+  const int num_layers = nn_config->num_hidden_layers;
+  assert(num_layers <= NN_MAX_HIDDEN_LAYERS);
+  for (int layer = 0; layer < num_layers; ++layer) {
+    const float *weights = nn_config->weights[layer];
+    const float *bias = nn_config->bias[layer];
+    float *output_nodes = buf[buf_index];
+    const int num_output_nodes = nn_config->num_hidden_nodes[layer];
+    // A layer that exactly fills the buffer is valid.
+    assert(num_output_nodes <= NN_MAX_NODES_PER_LAYER);
+    for (int node = 0; node < num_output_nodes; ++node) {
+      float val = 0.0f;
+      for (int i = 0; i < num_input_nodes; ++i)
+        val += weights[i] * input_nodes[i];
+      val += bias[node];
+      // ReLU as activation function.
+      val = val > 0.0f ? val : 0.0f;  // Could use AOMMAX().
+      output_nodes[node] = val;
+      weights += num_input_nodes;
+    }
+    // This layer's outputs become the next layer's inputs; switch buffers so
+    // the next layer does not overwrite what it is reading.
+    num_input_nodes = num_output_nodes;
+    input_nodes = output_nodes;
+    buf_index = 1 - buf_index;
+  }
+
+  // Final output layer.
+  const float *weights = nn_config->weights[num_layers];
+  const float *bias = nn_config->bias[num_layers];
+  for (int node = 0; node < nn_config->num_outputs; ++node) {
+    float val = 0.0f;
+    for (int i = 0; i < num_input_nodes; ++i)
+      val += weights[i] * input_nodes[i];
+    output[node] = val + bias[node];
+    weights += num_input_nodes;
+  }
+}
+
+// Writes the softmax of input[0..n) into output[0..n):
+//   output[i] = exp(input[i]) / sum_k exp(input[k])
+// Does nothing when n <= 0, so an empty input is never dereferenced.
+void av1_nn_softmax(const float *input, float *output, int n) {
+  if (n <= 0) return;
+  // Softmax function is invariant to adding the same constant
+  // to all input values, so we subtract the maximum input to avoid
+  // possible overflow.
+  float max_inp = input[0];
+  for (int i = 1; i < n; i++) {
+    max_inp = (max_inp > input[i]) ? max_inp : input[i];
+  }
+  float sum_out = 0.0f;
+  for (int i = 0; i < n; i++) {
+    output[i] = (float)exp(input[i] - max_inp);
+    sum_out += output[i];
+  }
+  for (int i = 0; i < n; i++) output[i] /= sum_out;
+}
diff --git a/third_party/aom/av1/encoder/ml.h b/third_party/aom/av1/encoder/ml.h
new file mode 100644
index 0000000000..cb8ef2871b
--- /dev/null
+++ b/third_party/aom/av1/encoder/ml.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_ML_H_
+#define AOM_AV1_ENCODER_ML_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Upper bound on the number of hidden layers; sizes the per-layer arrays in
+// NN_CONFIG.
+#define NN_MAX_HIDDEN_LAYERS 10
+// Upper bound on the number of nodes in one hidden layer; sizes the scratch
+// buffers used by av1_nn_predict().
+#define NN_MAX_NODES_PER_LAYER 128
+
+// Description of a fully connected feed-forward network consumed by
+// av1_nn_predict(). The weights and bias arrays have one extra entry
+// (NN_MAX_HIDDEN_LAYERS + 1); av1_nn_predict() reads index num_hidden_layers
+// for the output layer.
+typedef struct {
+ int num_inputs; // Number of input nodes, i.e. features.
+ int num_outputs; // Number of output nodes.
+ int num_hidden_layers; // Number of hidden layers, maximum 10.
+ // Number of nodes for each hidden layer.
+ int num_hidden_nodes[NN_MAX_HIDDEN_LAYERS];
+ // Weight parameters, indexed by layer.
+ const float *weights[NN_MAX_HIDDEN_LAYERS + 1];
+ // Bias parameters, indexed by layer.
+ const float *bias[NN_MAX_HIDDEN_LAYERS + 1];
+} NN_CONFIG;
+
+// Calculate prediction based on the given input features and neural net config.
+// Assume there are no more than NN_MAX_NODES_PER_LAYER nodes in each hidden
+// layer.
+void av1_nn_predict(const float *features, const NN_CONFIG *nn_config,
+ float *output);
+
+// Applies the softmax normalization function to the input
+// to get a valid probability distribution in the output:
+// output[i] = exp(input[i]) / sum_{k \in [0,n)}(exp(input[k]))
+void av1_nn_softmax(const float *input, float *output, int n);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_ML_H_
diff --git a/third_party/aom/av1/encoder/palette.c b/third_party/aom/av1/encoder/palette.c
new file mode 100644
index 0000000000..e61cd02ce4
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/random.h"
+
+#define AV1_K_MEANS_DIM 1
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+#define AV1_K_MEANS_DIM 2
+#include "av1/encoder/k_means_template.h"
+#undef AV1_K_MEANS_DIM
+
+// qsort() comparator ordering ints ascending.
+// Returns -1, 0 or 1. Uses comparisons instead of subtraction so that
+// operands of large magnitude and opposite sign cannot overflow.
+static int int_comparer(const void *a, const void *b) {
+  const int lhs = *(const int *)a;
+  const int rhs = *(const int *)b;
+  return (lhs > rhs) - (lhs < rhs);
+}
+
+// Sorts 'centroids' ascending in place and compacts it so that the first
+// entries hold the distinct values.
+// Returns the number of distinct values, or 0 when num_centroids <= 0 (the
+// array is not touched in that case).
+int av1_remove_duplicates(int *centroids, int num_centroids) {
+  if (num_centroids <= 0) return 0;
+  qsort(centroids, (size_t)num_centroids, sizeof(*centroids), int_comparer);
+  // Remove duplicates: after sorting, equal values are adjacent.
+  int num_unique = 1;
+  for (int i = 1; i < num_centroids; ++i) {
+    if (centroids[i] != centroids[i - 1]) {  // found a new unique centroid
+      centroids[num_unique++] = centroids[i];
+    }
+  }
+  return num_unique;
+}
+
+// Bit count for coding 'num' colors as a first value plus successive deltas.
+// colors:    the values; every delta colors[i] - colors[i - 1] is asserted to
+//            be at least min_val.
+// Returns 0 for num <= 0 and bit_depth for a single color. Otherwise the cost
+// is bit_depth + 2 plus one delta field per remaining color, where the field
+// width starts at max(ceil_log2(max_delta + 1 - min_val), bit_depth - 3) and
+// is clamped after each delta to ceil_log2 of the remaining range.
+static int delta_encode_cost(const int *colors, int num, int bit_depth,
+                             int min_val) {
+  if (num <= 0) return 0;
+  if (num == 1) return bit_depth;
+
+  // First pass: find the largest delta.
+  int largest = 0;
+  for (int idx = 1; idx < num; ++idx) {
+    const int step = colors[idx] - colors[idx - 1];
+    assert(step >= min_val);
+    largest = AOMMAX(largest, step);
+  }
+
+  const int floor_bits = bit_depth - 3;
+  int delta_bits = AOMMAX(av1_ceil_log2(largest + 1 - min_val), floor_bits);
+  assert(delta_bits <= bit_depth);
+
+  // Second pass: charge each delta, shrinking the width as the range narrows.
+  int total = bit_depth + 2;
+  int remaining = (1 << bit_depth) - colors[0] - min_val;
+  for (int idx = 1; idx < num; ++idx) {
+    total += delta_bits;
+    remaining -= colors[idx] - colors[idx - 1];
+    delta_bits = AOMMIN(delta_bits, av1_ceil_log2(remaining));
+  }
+  return total;
+}
+
+// Splits 'colors' into those present in 'color_cache' and those that are not.
+// cache_color_found: set to 1 for each cache entry that matched a color and 0
+//                    otherwise (n_cache entries; untouched when n_cache <= 0).
+// out_cache_colors:  receives, in their original order, the colors that were
+//                    not matched.
+// Returns the number of colors written to out_cache_colors. With an empty
+// cache all n_colors are copied.
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+                          const uint16_t *colors, int n_colors,
+                          uint8_t *cache_color_found, int *out_cache_colors) {
+  if (n_cache <= 0) {
+    for (int c = 0; c < n_colors; ++c) out_cache_colors[c] = colors[c];
+    return n_colors;
+  }
+
+  memset(cache_color_found, 0, n_cache * sizeof(*cache_color_found));
+  int matched[PALETTE_MAX_SIZE] = { 0 };
+  int n_matched = 0;
+  // Stop scanning the cache once every color has been matched.
+  for (int slot = 0; slot < n_cache && n_matched < n_colors; ++slot) {
+    // Find the first color equal to this cache entry, if any.
+    int hit = 0;
+    while (hit < n_colors && colors[hit] != color_cache[slot]) ++hit;
+    if (hit < n_colors) {
+      matched[hit] = 1;
+      cache_color_found[slot] = 1;
+      ++n_matched;
+    }
+  }
+
+  int n_out = 0;
+  for (int c = 0; c < n_colors; ++c) {
+    if (!matched[c]) out_cache_colors[n_out++] = colors[c];
+  }
+  assert(n_out == n_colors - n_matched);
+  return n_out;
+}
+
+// Computes the per-delta bit width for the palette entries stored at
+// palette_colors[2 * PALETTE_MAX_SIZE ...] (palette_size[1] entries).
+// For each pair of neighbours the distance is the smaller of |delta| and
+// (1 << bit_depth) - |delta|.
+// zero_count: receives the number of neighbour pairs with distance 0.
+// min_bits:   receives bit_depth - 4.
+// Returns max(ceil_log2(largest distance + 1), *min_bits).
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+                                 int bit_depth, int *zero_count,
+                                 int *min_bits) {
+  const int count = pmi->palette_size[1];
+  const int wrap = 1 << bit_depth;
+  int largest = 0;
+  int zeros = 0;
+  for (int idx = 1; idx < count; ++idx) {
+    const int magnitude = abs(pmi->palette_colors[2 * PALETTE_MAX_SIZE + idx] -
+                              pmi->palette_colors[2 * PALETTE_MAX_SIZE + idx - 1]);
+    const int dist = AOMMIN(magnitude, wrap - magnitude);
+    largest = AOMMAX(largest, dist);
+    if (dist == 0) ++zeros;
+  }
+  *min_bits = bit_depth - 4;
+  *zero_count = zeros;
+  return AOMMAX(av1_ceil_log2(largest + 1), *min_bits);
+}
+
+// Rate cost of the first palette (palette_size[0] colors starting at
+// palette_colors[0]).
+// The bit count is n_cache plus the delta-coding cost (minimum delta 1) of the
+// colors not found in color_cache; the total is converted with
+// av1_cost_literal().
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+                             uint16_t *color_cache, int n_cache,
+                             int bit_depth) {
+  int uncached[PALETTE_MAX_SIZE];
+  uint8_t cache_hits[2 * PALETTE_MAX_SIZE];
+  const int n_uncached =
+      av1_index_color_cache(color_cache, n_cache, pmi->palette_colors,
+                            pmi->palette_size[0], cache_hits, uncached);
+  const int delta_bits = delta_encode_cost(uncached, n_uncached, bit_depth, 1);
+  return av1_cost_literal(n_cache + delta_bits);
+}
+
+// Rate cost of the two chroma palettes (palette_size[1] colors each).
+// U part: n_cache bits plus the delta-coding cost (minimum delta 0) of the
+//         colors at palette_colors[PALETTE_MAX_SIZE ...] that are not found in
+//         color_cache.
+// V part: one bit plus the cheaper of raw coding (bit_depth bits per color)
+//         and delta coding as sized by av1_get_palette_delta_bits_v().
+// The total bit count is converted with av1_cost_literal().
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+                              uint16_t *color_cache, int n_cache,
+                              int bit_depth) {
+  const int count = pmi->palette_size[1];
+
+  // U channel palette color cost.
+  int uncached[PALETTE_MAX_SIZE];
+  uint8_t cache_hits[2 * PALETTE_MAX_SIZE];
+  const int n_uncached = av1_index_color_cache(
+      color_cache, n_cache, pmi->palette_colors + PALETTE_MAX_SIZE, count,
+      cache_hits, uncached);
+  const int u_bits =
+      n_cache + delta_encode_cost(uncached, n_uncached, bit_depth, 0);
+
+  // V channel palette color cost.
+  int zero_count = 0, min_bits_v = 0;
+  const int bits_v =
+      av1_get_palette_delta_bits_v(pmi, bit_depth, &zero_count, &min_bits_v);
+  const int delta_coded =
+      2 + bit_depth + (bits_v + 1) * (count - 1) - zero_count;
+  const int raw_coded = bit_depth * count;
+  const int v_bits = 1 + AOMMIN(delta_coded, raw_coded);
+
+  return av1_cost_literal(u_bits + v_bits);
+}
diff --git a/third_party/aom/av1/encoder/palette.h b/third_party/aom/av1/encoder/palette.h
new file mode 100644
index 0000000000..8b88c4755c
--- /dev/null
+++ b/third_party/aom/av1/encoder/palette.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PALETTE_H_
+#define AOM_AV1_ENCODER_PALETTE_H_
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Pastes a "_dim<dim>" suffix onto 'func'; for example
+// AV1_K_MEANS_RENAME(av1_k_means, 2) expands to av1_k_means_dim2.
+#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim
+
+// Dimension-specific variants (suffixes _dim1 and _dim2) of the index
+// calculation and k-means routines. They take the same arguments as the
+// av1_calc_indices() / av1_k_means() wrappers in this header, minus 'dim'.
+// Only the declarations live here; the definitions are in another file.
+void AV1_K_MEANS_RENAME(av1_calc_indices, 1)(const int *data,
+ const int *centroids,
+ uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_calc_indices, 2)(const int *data,
+ const int *centroids,
+ uint8_t *indices, int n, int k);
+void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int *data, int *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+void AV1_K_MEANS_RENAME(av1_k_means, 2)(const int *data, int *centroids,
+ uint8_t *indices, int n, int k,
+ int max_itr);
+
+// Computes the centroid 'indices' for 'n' 'data' points against 'k'
+// 'centroids', where every point and centroid has dimension 'dim'.
+// Dispatches to the dimension-specific variant; only 'dim' 1 and 2 are
+// handled, any other value trips the assert and no variant is called.
+static INLINE void av1_calc_indices(const int *data, const int *centroids,
+                                    uint8_t *indices, int n, int k, int dim) {
+  switch (dim) {
+    case 1:
+      AV1_K_MEANS_RENAME(av1_calc_indices, 1)(data, centroids, indices, n, k);
+      break;
+    case 2:
+      AV1_K_MEANS_RENAME(av1_calc_indices, 2)(data, centroids, indices, n, k);
+      break;
+    default: assert(0 && "Untemplated k means dimension"); break;
+  }
+}
+
+// Runs up to 'max_itr' iterations of the k-means algorithm on 'n' 'data'
+// points, starting from an initial guess of 'k' 'centroids', each of
+// dimension 'dim'. On return 'centroids' holds the updated centroids and
+// 'indices' the centroid index of every element in 'data'.
+// Note: the output centroids are rounded off to nearest integers.
+// Only 'dim' 1 and 2 are handled; any other value trips the assert and no
+// variant is called.
+static INLINE void av1_k_means(const int *data, int *centroids,
+                               uint8_t *indices, int n, int k, int dim,
+                               int max_itr) {
+  switch (dim) {
+    case 1:
+      AV1_K_MEANS_RENAME(av1_k_means, 1)
+      (data, centroids, indices, n, k, max_itr);
+      break;
+    case 2:
+      AV1_K_MEANS_RENAME(av1_k_means, 2)
+      (data, centroids, indices, n, k, max_itr);
+      break;
+    default: assert(0 && "Untemplated k means dimension"); break;
+  }
+}
+
+// Removes duplicates from the 'num_centroids' entries of 'centroids'.
+// Returns the number of unique centroids 'k' and places those unique
+// centroids in the first 'k' slots of the 'centroids' array.
+// Ideally, the centroids should be rounded to integers before calling this
+// function.
+int av1_remove_duplicates(int *centroids, int num_centroids);
+
+// Given a color cache ('color_cache', 'n_cache' entries) and a set of base
+// colors ('colors', 'n_colors' entries), finds whether each cache color is
+// present in the base colors and records the binary results in
+// "cache_color_found". The base colors that are not in the color cache are
+// recorded in "out_cache_colors".
+// NOTE(review): callers use the return value as the number of entries written
+// to "out_cache_colors" -- confirm against the definition.
+int av1_index_color_cache(const uint16_t *color_cache, int n_cache,
+ const uint16_t *colors, int n_colors,
+ uint8_t *cache_color_found, int *out_cache_colors);
+
+// Return the number of bits used to transmit each v palette color delta;
+// assign zero_count with the number of deltas being 0.
+// The returned value is never smaller than '*min_bits'.
+// NOTE(review): whether 'min_bits' is purely an output of this function or
+// must be initialized by the caller should be confirmed in the definition.
+int av1_get_palette_delta_bits_v(const PALETTE_MODE_INFO *const pmi,
+ int bit_depth, int *zero_count, int *min_bits);
+
+// Return the rate cost for transmitting luma palette color values.
+// 'color_cache' / 'n_cache' give the color cache and its number of entries;
+// 'bit_depth' is the sample bit depth used when counting bits.
+int av1_palette_color_cost_y(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache, int bit_depth);
+
+// Return the rate cost for transmitting chroma palette color values (the U
+// channel cost plus the V channel cost).
+// 'color_cache' / 'n_cache' give the color cache and its number of entries;
+// 'bit_depth' is the sample bit depth used when counting bits.
+int av1_palette_color_cost_uv(const PALETTE_MODE_INFO *const pmi,
+ uint16_t *color_cache, int n_cache,
+ int bit_depth);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PALETTE_H_
diff --git a/third_party/aom/av1/encoder/partition_model_weights.h b/third_party/aom/av1/encoder/partition_model_weights.h
new file mode 100644
index 0000000000..437ea43f9e
--- /dev/null
+++ b/third_party/aom/av1/encoder/partition_model_weights.h
@@ -0,0 +1,2448 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// Number of network inputs; used as num_inputs of
+// av1_ab_partition_nnconfig_128 and to size its first-layer weight table.
+#define FEATURE_SIZE 10
+// Number of network outputs; used as num_outputs of
+// av1_ab_partition_nnconfig_128 and to size its last-layer weight/bias tables.
+#define LABEL_SIZE 16
+// Neural-network (nn) model for ab partition pruning, 128x128.
+// Layer-0 weights: FEATURE_SIZE * 64 values, matching the FEATURE_SIZE inputs
+// and the 64 hidden nodes declared in av1_ab_partition_nnconfig_128.
+static const float av1_ab_partition_nn_weights_128_layer0[FEATURE_SIZE * 64] = {
+ -0.715251f, -0.015767f, -0.667353f, -0.345255f, 0.177887f, -0.469759f,
+ 0.426152f, 0.489798f, 0.469865f, 0.773821f, 0.088517f, 0.074585f,
+ 0.838754f, 0.048449f, -0.007584f, 0.638968f, 0.233305f, -0.319236f,
+ -0.257124f, -0.170869f, 0.137180f, 0.114852f, -0.721241f, -0.947962f,
+ -0.411298f, 0.494306f, -0.060435f, -0.648421f, -0.126624f, 0.072686f,
+ -0.143904f, -0.115839f, -0.175527f, -0.117728f, 0.040686f, -0.189925f,
+ 0.134361f, -0.258070f, -0.177558f, 0.158049f, 0.168668f, -0.062919f,
+ 0.341986f, 0.038100f, -0.435577f, -0.321255f, 0.203213f, 0.213061f,
+ 0.533304f, 0.359296f, -0.079558f, 0.004637f, 0.663904f, 0.043779f,
+ 0.383018f, 1.136559f, -0.084155f, 0.333057f, -0.199011f, 0.152059f,
+ -0.078419f, -0.167752f, -0.093651f, 0.083171f, -0.190143f, 0.086195f,
+ -0.280632f, -0.160663f, -0.017298f, 0.122628f, -0.138116f, 0.062927f,
+ 0.222462f, 0.626979f, 0.426928f, 0.117170f, -0.240457f, 0.053750f,
+ 0.038017f, 0.007359f, -0.017595f, 0.101407f, 0.332891f, 0.074933f,
+ 0.306498f, 0.219380f, -0.151638f, -0.247976f, 0.343405f, 0.121256f,
+ 0.049173f, 0.171474f, -0.139608f, -1.016599f, -0.345553f, -0.901138f,
+ 0.243401f, 0.059928f, -0.089396f, -0.195565f, 0.364705f, -0.020400f,
+ -1.383672f, 0.413018f, 0.536950f, -0.020904f, -1.335306f, -0.732290f,
+ 0.102885f, 0.315290f, -0.208521f, -0.081811f, 0.182300f, 0.125712f,
+ -0.593833f, -0.220639f, -0.314155f, 0.188327f, 0.118503f, 0.524427f,
+ -1.083859f, -1.130640f, 0.390352f, -0.045591f, 0.113160f, -0.009149f,
+ -0.096183f, 0.115829f, 0.377752f, 0.318396f, -0.591983f, 0.004797f,
+ -0.497377f, -0.342248f, 0.079546f, -0.025249f, -0.295972f, 0.615501f,
+ -0.464372f, 0.418315f, -0.173556f, 0.105217f, 0.298073f, 0.082478f,
+ 0.033223f, 0.977341f, -0.372982f, -0.052337f, 0.154124f, 0.396787f,
+ 0.536654f, -0.139061f, -0.223702f, 0.229666f, -0.846766f, 0.107723f,
+ 0.563839f, -0.483141f, 0.304813f, -0.765283f, 0.070964f, 0.151101f,
+ 0.275188f, 0.490303f, 1.175892f, 0.085377f, -0.191200f, 0.544532f,
+ -0.365075f, 0.167546f, 0.052183f, -0.220529f, -0.212227f, -0.144988f,
+ -0.273356f, -0.062023f, 0.103993f, -0.238493f, -0.161204f, -0.054611f,
+ -0.166672f, 0.128327f, 0.461751f, -0.545822f, 0.739798f, 0.594386f,
+ -0.163192f, -0.332501f, 0.363834f, -0.065043f, 0.474812f, -0.138811f,
+ 0.170924f, -0.778142f, -0.316474f, -0.508065f, -0.039986f, -0.478001f,
+ 0.340591f, 0.041783f, 0.055419f, 0.015155f, -0.981830f, -1.355237f,
+ 0.347516f, 1.155327f, 0.081319f, 0.274163f, -0.327230f, -0.113478f,
+ 0.556552f, -0.055986f, 0.217318f, -0.445351f, 0.325759f, 0.526547f,
+ -0.657434f, -0.572214f, -0.037087f, 0.081384f, 0.064518f, 0.014892f,
+ 0.215279f, 1.834504f, -0.242107f, 0.079810f, 0.129558f, 0.079588f,
+ -0.035189f, -0.221745f, -0.163414f, 0.043978f, -1.028662f, -0.623609f,
+ 1.130336f, 0.664661f, -0.063975f, -0.415863f, 0.018581f, 0.157758f,
+ 0.200570f, 0.063420f, 0.901039f, -0.746286f, 0.196230f, -0.290592f,
+ 0.042373f, -0.502500f, 0.183638f, 0.103394f, -0.298858f, 0.145436f,
+ 0.196916f, 0.108319f, -0.448572f, -0.881385f, 0.302497f, 0.121679f,
+ -0.021327f, 0.025150f, 0.481306f, -0.359634f, 0.350257f, -0.228647f,
+ -0.669860f, 0.260025f, -0.034182f, 0.619247f, -0.158826f, -0.405864f,
+ 0.674112f, -0.027885f, -0.325274f, -0.241492f, 0.036024f, -0.437685f,
+ -0.091458f, -0.109295f, -0.350676f, 0.044706f, 0.297059f, 0.016290f,
+ 1.121203f, 1.289062f, -1.299476f, -1.129221f, 0.103752f, 0.131302f,
+ -0.263265f, 0.222155f, -0.229908f, 0.013922f, -0.226001f, -0.248383f,
+ -0.004415f, -0.020958f, 0.055634f, 0.086200f, 0.114556f, -0.184061f,
+ -0.096210f, -0.146466f, -0.249618f, -0.195998f, 0.088758f, 0.023781f,
+ -0.264460f, 0.157026f, -0.235228f, -0.102564f, 0.043463f, -0.187823f,
+ -0.257500f, -0.199049f, -0.242210f, 0.030448f, 0.221604f, 0.151804f,
+ -0.100404f, -0.073931f, 0.144749f, -0.001572f, -1.438079f, -0.233716f,
+ 0.733422f, 1.727080f, -0.036397f, 0.027551f, 0.425321f, 0.085703f,
+ 0.031186f, 0.032333f, -0.675130f, 1.437733f, -0.202392f, -0.525003f,
+ 0.087048f, 0.328194f, -0.079989f, -0.391088f, -0.238732f, -0.120660f,
+ -0.139600f, 0.154665f, 0.026202f, -0.233501f, -0.009046f, -0.149187f,
+ -0.199646f, 0.115375f, 0.209762f, -0.014875f, 0.124038f, -0.119985f,
+ 1.079625f, -0.461513f, 0.614114f, 0.021003f, 0.439449f, -0.824834f,
+ -0.299701f, 0.193817f, -0.870551f, -1.262313f, -0.079517f, 0.341570f,
+ 0.305310f, -0.089721f, -0.317314f, -0.075631f, 0.127172f, -0.208635f,
+ 1.191922f, 0.163141f, 0.564285f, 0.286352f, 0.480865f, 0.173094f,
+ -0.094034f, -0.071339f, -0.328992f, -0.006382f, 0.314705f, 0.090258f,
+ -0.016099f, 0.193230f, 0.188061f, 0.398144f, 0.722781f, 0.769949f,
+ 0.025442f, -0.162016f, 0.070192f, -0.056946f, -0.100957f, -0.219934f,
+ -0.203492f, -0.015454f, -0.013272f, -0.098008f, 0.051707f, -0.017493f,
+ 0.527446f, 0.083605f, 0.588318f, 0.878215f, 0.028747f, -0.146479f,
+ -0.345170f, -0.136059f, -0.152005f, -0.203634f, 0.232702f, -0.101340f,
+ -0.027733f, -0.282611f, 0.265366f, 0.082362f, -0.265420f, -0.131124f,
+ 0.166303f, 0.040194f, -0.100710f, 0.579151f, -0.530136f, 0.163422f,
+ -0.998821f, -1.565311f, -1.774785f, -2.493372f, 0.116970f, -0.090302f,
+ 1.723272f, 0.552370f, -0.295954f, -0.439095f, -0.266730f, 0.027936f,
+ 0.539616f, -0.234902f, -0.167601f, -0.149877f, -0.242983f, 0.122353f,
+ -0.121620f, -0.205517f, -0.180144f, -0.264208f, 0.151500f, -0.159378f,
+ 0.029145f, -0.050892f, -0.223407f, -0.246239f, 0.043152f, -0.018460f,
+ 0.169972f, -0.187769f, -0.034670f, -0.238330f, 0.288070f, -0.093243f,
+ -0.437105f, -0.573376f, 0.660073f, 0.285727f, 0.408470f, 0.158475f,
+ 0.032699f, 0.056280f, -0.237176f, -0.083003f, 0.105598f, -0.169522f,
+ -0.260420f, -0.121100f, -0.173983f, -0.195693f, -0.232028f, 0.224940f,
+ 0.029124f, 0.009580f, -0.252034f, 0.103087f, 1.156561f, 0.603848f,
+ -0.562805f, -1.652742f, -0.568288f, -1.829395f, 0.046169f, 0.076095f,
+ 1.490819f, 0.415893f, -0.277788f, -0.115787f, 0.093750f, 0.270726f,
+ -0.395983f, -0.353742f, 0.034605f, 0.005342f, 0.184537f, 0.086445f,
+ 0.156417f, 1.476367f, 0.122587f, 0.002145f, 0.431057f, -0.381184f,
+ -1.646457f, -0.014009f, -0.671224f, 0.193726f, -0.019247f, -0.031267f,
+ -0.046208f, 0.298733f, 0.064734f, 0.616984f, 0.039381f, 0.182722f,
+ -0.116670f, 0.233093f, -1.214374f, -0.817970f, -0.064394f, -0.584783f,
+ 0.077697f, -0.266720f, 0.130875f, -0.235295f, -0.265754f, -0.159999f,
+ -0.250114f, -0.183017f, 0.194403f, -0.105808f, -0.169215f, -0.240866f,
+ -0.026662f, -0.045123f, -0.036175f, -0.167471f, -0.192908f, -0.232602f,
+ -0.267036f, -0.112500f, -0.257944f, -0.111909f, -0.802226f, -0.008800f,
+ 0.881460f, -0.678603f, 0.008666f, -0.252053f, -0.341035f, -0.175290f,
+ 0.183012f, 0.385991f, 0.079888f, -0.014039f, -0.148653f, 0.671778f,
+ -0.130219f, 1.086467f, 0.129267f, -0.040400f, -0.201221f, -0.077005f,
+ 0.015890f, 0.000781f, 0.137764f, 1.389546f, 0.172152f, 0.047279f,
+ -0.042783f, 0.127740f, 0.141467f, -0.335738f, -1.396392f, 0.031496f,
+ 0.357385f, 0.343602f, -0.714553f, 0.311014f, 0.132845f, 0.061149f,
+ 0.006796f, 0.568106f, -0.255949f, 0.104134f, -0.993447f, 0.298135f,
+ -0.406590f, -0.049228f, -0.578570f, -0.188561f, -0.107046f, 0.374095f,
+ 0.068481f, 0.036240f, -0.495801f, 0.180574f, -0.766129f, 0.886967f,
+ -0.568868f, -0.936062f, -0.418886f, -0.058735f, -0.511964f, -0.438596f,
+ 0.019016f, -0.015837f, 0.600197f, 0.429773f, 0.315026f, 0.319667f,
+ 0.214617f, -0.017316f, 0.270257f, -0.040524f, 0.695803f, -0.015223f,
+ -1.554965f, 0.356997f, -1.472428f, 0.024637f, -0.562958f, 0.870351f,
+ 0.193635f, 0.036063f, 0.328638f, 0.200274f, -1.634707f, 0.110534f,
+ 0.420104f, -0.072042f, -0.006404f, 0.171680f,
+};
+
+// Layer-0 biases of the 128x128 ab partition pruning model: 64 values, the
+// same count as the hidden nodes declared in av1_ab_partition_nnconfig_128
+// (presumably one bias per hidden node).
+static const float av1_ab_partition_nn_bias_128_layer0[64] = {
+ 0.643147f, -1.348826f, 0.431627f, 0.000000f, 0.102717f, -0.772628f,
+ -0.034351f, -0.761977f, -0.638397f, 0.541969f, -0.391311f, 0.563076f,
+ 0.148553f, 0.267217f, -0.788092f, 0.544573f, -0.546280f, 0.000000f,
+ -0.446945f, 0.127732f, 0.270624f, -0.219435f, -1.220203f, 0.324584f,
+ 0.110885f, 0.276547f, 0.179726f, -0.375160f, 0.026401f, -0.032595f,
+ 0.000000f, -0.047932f, -0.648602f, -0.512637f, -0.031661f, -0.236761f,
+ 0.476453f, -0.028021f, -0.013673f, -0.015578f, -0.920077f, 0.000000f,
+ 0.915351f, -0.209962f, 0.000000f, -0.025731f, 0.218288f, 0.000000f,
+ 0.047726f, -0.813077f, -1.263281f, 0.239087f, 0.278614f, -0.030753f,
+ 0.000000f, 0.346744f, -0.948543f, -1.174211f, 0.216377f, 0.498913f,
+ 0.853918f, 0.002504f, -0.190403f, 0.452050f,
+};
+
+// Layer-1 weights of the 128x128 ab partition pruning model:
+// 64 * LABEL_SIZE values, matching the 64 hidden nodes and the LABEL_SIZE
+// outputs declared in av1_ab_partition_nnconfig_128.
+static const float av1_ab_partition_nn_weights_128_layer1[64 * LABEL_SIZE] = {
+ 0.179769f, 1.499417f, -0.445135f, -0.142278f, -0.337661f, 0.682064f,
+ -0.203213f, 0.302171f, 0.226877f, -0.422169f, 1.687586f, 0.783773f,
+ 0.220995f, 0.253482f, 0.370435f, -1.342775f, 0.337229f, -0.271473f,
+ 0.291796f, 1.362227f, -1.751397f, -0.086178f, 0.725496f, -0.118597f,
+ 0.227963f, -0.501577f, 0.223849f, -0.122421f, -0.123437f, -0.051045f,
+ -0.020115f, 0.212711f, 0.246025f, 0.088120f, -0.168995f, 1.740190f,
+ -0.195098f, 0.680339f, -0.589572f, -0.075244f, 0.878766f, 0.064092f,
+ -3.548527f, 0.001660f, 0.107926f, -0.169501f, -0.455212f, 0.123045f,
+ -1.836998f, 0.330365f, 1.301475f, 0.454761f, -0.576552f, -0.190761f,
+ 0.208459f, 0.618483f, 1.383364f, 0.970718f, 0.390174f, 0.406252f,
+ -0.564519f, -0.312062f, 1.345712f, -0.151873f, 0.109290f, 0.408847f,
+ 0.391243f, 0.152024f, 0.181764f, -0.036263f, -0.160466f, 0.153595f,
+ 0.049163f, -0.753012f, -1.804062f, 0.347475f, -2.746580f, 0.575618f,
+ 0.261799f, 0.210505f, -0.302054f, -0.109872f, 0.199506f, -1.182971f,
+ 0.723668f, 0.177758f, -0.338202f, 0.254396f, -0.220023f, 0.043504f,
+ 0.669866f, -0.040816f, -0.402730f, 0.017990f, 0.215523f, -0.216816f,
+ 0.454826f, -0.726067f, -0.018750f, -0.928679f, 0.154315f, -0.465641f,
+ 0.144566f, -0.030064f, -0.054667f, -0.154055f, 0.625384f, 1.323795f,
+ -0.159496f, 0.097072f, -0.463197f, -0.057938f, 0.750290f, -0.233061f,
+ 0.412631f, -0.535223f, -0.151423f, -0.154583f, 0.024721f, -0.494448f,
+ 0.230594f, -0.980138f, -0.653968f, 0.126079f, 0.051814f, -0.053219f,
+ -0.421708f, -0.228853f, 0.237885f, 0.888157f, 0.059655f, 0.241295f,
+ 0.210443f, 0.228238f, 0.119127f, -0.051989f, -0.355408f, 0.182215f,
+ 0.244277f, -0.104577f, -0.558035f, -0.023270f, 0.054571f, 0.700646f,
+ -0.223006f, 0.115523f, 0.023391f, 0.437264f, 0.709477f, -0.531212f,
+ -0.094731f, 0.328161f, -0.105418f, -0.133511f, 0.497168f, -0.030948f,
+ -0.407132f, -0.043943f, 0.155505f, 0.251945f, 0.205010f, 0.167160f,
+ 0.083654f, -0.636810f, 0.401315f, -0.398414f, 0.290046f, 0.206846f,
+ 0.042218f, 0.168150f, 0.843181f, -0.671242f, -0.202392f, -0.073301f,
+ 0.142895f, 0.237466f, 0.212145f, -0.091828f, 0.187038f, -0.720841f,
+ -0.616069f, -0.238021f, 0.065365f, 0.434119f, 0.179023f, -0.040107f,
+ -0.430734f, -0.297368f, 0.575954f, 0.382619f, -0.709787f, -0.320810f,
+ 0.242342f, -0.047614f, 0.705216f, 0.098077f, 0.357179f, 0.046017f,
+ 0.115074f, -0.412305f, -0.272304f, 0.048096f, -0.803811f, 0.275000f,
+ 0.642198f, 0.180286f, -0.087178f, -0.112707f, -0.394443f, 0.201989f,
+ 0.241759f, -1.038870f, 0.728124f, 0.800559f, -1.296268f, 0.198612f,
+ -0.053478f, 0.414344f, -0.510529f, 0.124179f, -2.219115f, -0.074583f,
+ -0.143055f, 0.001697f, 0.810811f, -0.657140f, 0.186818f, -0.936414f,
+ 0.539578f, -0.308244f, -0.126624f, -0.204767f, 0.091145f, -0.049340f,
+ 0.252014f, 0.394582f, 0.018764f, -0.060377f, -0.019133f, 0.064083f,
+ 0.069211f, -0.526693f, 0.209850f, -0.481466f, -0.468302f, -0.100407f,
+ 0.241018f, -1.037781f, 0.038539f, -2.113840f, -0.974895f, 0.163187f,
+ 0.425132f, -0.772546f, -1.261254f, -0.217488f, -0.971748f, -0.805640f,
+ -0.745175f, -0.177077f, 0.217658f, 0.381431f, -0.052338f, 0.087176f,
+ -0.165972f, 0.085937f, 0.472564f, -0.796627f, -2.453307f, 0.569664f,
+ -0.233010f, -0.192134f, 0.064339f, -0.111411f, -0.262469f, -0.410022f,
+ 0.519993f, -0.684620f, 0.393460f, -0.277753f, -0.153624f, 0.528984f,
+ -0.415558f, -0.445863f, 0.588512f, -0.142439f, -0.132127f, 0.199776f,
+ -0.579284f, 0.119488f, -0.033590f, -0.503846f, -0.674979f, 0.335125f,
+ 0.020519f, 0.233973f, -0.297998f, -0.051511f, 0.518626f, -0.412782f,
+ -0.074045f, 0.130523f, 0.465751f, -0.117795f, 2.535813f, 0.352108f,
+ -0.499228f, 0.379784f, 0.056699f, 0.173142f, -0.076519f, -0.026666f,
+ 0.017834f, 0.492333f, 0.093364f, 0.037867f, -0.165420f, -0.356429f,
+ -0.562334f, 0.057656f, -0.307544f, 0.085857f, -0.559851f, 0.107230f,
+ -0.398633f, 0.152618f, -0.216835f, -0.024539f, 0.026044f, -0.249519f,
+ -0.563594f, -0.746025f, 0.025265f, -0.298888f, -0.185243f, 0.058794f,
+ 0.233696f, -0.115223f, 0.144617f, -0.864390f, 0.619944f, -0.023980f,
+ 0.019481f, 0.225252f, 0.416552f, -0.115993f, 0.935387f, 0.744386f,
+ 0.053353f, -0.052582f, -0.065650f, 0.228488f, -0.032042f, -0.371252f,
+ -0.003638f, -0.736984f, -0.203776f, 0.030922f, -0.065577f, -0.031643f,
+ -0.049253f, -0.054640f, 0.787134f, 0.545414f, -0.140297f, -0.124274f,
+ -0.110011f, -0.029552f, 0.657005f, 0.214973f, -0.374300f, 0.251642f,
+ 0.276591f, 0.030566f, -0.145470f, 0.350579f, -0.356436f, -0.052694f,
+ -0.063966f, -0.751008f, -1.042392f, 0.328892f, -0.425058f, -0.421571f,
+ -0.571889f, -1.141472f, -0.125216f, 0.212713f, -0.485170f, -0.088791f,
+ 0.124589f, 0.023237f, 0.077635f, 0.020901f, -0.271402f, -0.321424f,
+ -0.513946f, -0.867872f, -0.284593f, 0.106276f, 0.220192f, -0.143532f,
+ -0.014648f, 0.073402f, 0.327256f, -0.139803f, 0.168763f, 0.048199f,
+ -0.122526f, 0.111713f, -0.134257f, 0.810364f, -0.085222f, -0.259221f,
+ -0.239349f, 0.044448f, 0.205031f, 0.413113f, -0.107720f, -0.018816f,
+ -0.247741f, -0.004963f, 0.041170f, -0.158019f, 0.134839f, 0.129502f,
+ 0.800488f, -1.041584f, -0.129336f, 0.170834f, 0.566586f, -0.230443f,
+ 0.437937f, -0.149922f, -0.046665f, -0.094646f, 0.200070f, 0.072943f,
+ -0.076943f, -0.084971f, -0.515843f, -0.146720f, 0.472869f, -0.444731f,
+ -0.100877f, 0.545196f, -1.786626f, -0.482946f, 0.500509f, -0.843257f,
+ 0.200374f, 0.045103f, -0.575718f, -0.164335f, -0.232522f, -0.021825f,
+ -0.139490f, 0.356058f, -0.352075f, 0.061751f, -0.200616f, -1.180921f,
+ -0.181355f, -0.137459f, 0.247574f, 0.181541f, 0.184314f, -0.961482f,
+ 0.493615f, 0.910261f, -2.279238f, 0.648631f, -0.055526f, -0.037137f,
+ 0.038643f, 0.136609f, -0.819373f, -0.040840f, -0.265989f, 0.006877f,
+ 0.454651f, -0.595323f, -0.099500f, -0.263717f, 0.150456f, 0.245077f,
+ -0.268666f, 0.162232f, -0.516451f, -0.024501f, 0.188046f, -0.002262f,
+ 0.261319f, 0.004173f, 0.746982f, 0.174761f, 0.470447f, -0.159558f,
+ -0.385240f, 0.023084f, -0.133520f, -0.220607f, -0.018731f, -0.373558f,
+ -0.707763f, -1.850150f, -0.807404f, -0.168063f, -0.071435f, -0.160740f,
+ -0.478789f, -1.070674f, -0.489740f, -0.255796f, 0.100486f, -0.153361f,
+ 0.334394f, -0.569472f, -0.198118f, 0.255922f, 0.104717f, -0.065179f,
+ 0.111879f, -0.447237f, 1.373623f, -0.190191f, -0.063311f, 0.337529f,
+ -0.138800f, 0.057009f, -0.137006f, 0.641378f, 0.883147f, -0.679655f,
+ 0.267717f, -0.351602f, -0.135225f, 0.229398f, -0.513225f, -1.120345f,
+ 0.528786f, -0.051081f, 0.086653f, 0.140141f, -0.563969f, 0.333402f,
+ -0.174745f, 0.321093f, -0.438641f, -0.005131f, 0.247415f, 0.110120f,
+ -0.076308f, -0.083244f, 0.838944f, -0.113043f, -0.013258f, -0.175028f,
+ -0.179941f, 0.272676f, -0.047946f, -0.088076f, -0.450031f, 0.053929f,
+ -0.083549f, -0.089952f, -0.186253f, 0.257483f, 0.011019f, 0.586435f,
+ 0.060580f, -0.052078f, 0.090277f, -0.780869f, 0.969811f, -0.025349f,
+ -0.281917f, 0.014857f, 0.231863f, -0.228601f, -0.003861f, 0.226550f,
+ 0.141825f, -0.102171f, -0.010387f, 0.220378f, -2.561975f, -0.497071f,
+ -0.315117f, 0.371981f, 0.138247f, 0.625031f, -0.308133f, -0.217876f,
+ 0.005615f, -0.860179f, 0.747491f, 0.006356f, -0.057024f, -0.483189f,
+ 0.055592f, -0.316834f, 0.069858f, 0.218788f, -0.200044f, 0.227588f,
+ 0.215496f, -0.055324f, -0.393147f, -0.394062f, -0.253264f, -0.075619f,
+ -0.152512f, -0.332995f, 0.129053f, 0.178668f, -0.302694f, 0.030678f,
+ 0.925896f, 0.964375f, 0.169021f, -0.218657f, -0.627204f, 0.206437f,
+ -0.521336f, 0.176206f, 0.142733f, 0.139248f, 0.411682f, 0.181544f,
+ 0.224850f, -0.935547f, -0.558208f, 0.348096f, 0.342129f, -0.389340f,
+ -0.236308f, -0.132099f, 0.073642f, 0.089391f, -0.306901f, -0.397842f,
+ 0.444282f, 0.074623f, -0.051075f, -0.106617f, -0.184037f, -0.239046f,
+ -0.138761f, 0.120794f, -0.647577f, -0.336471f, 0.527899f, -0.164234f,
+ -0.028354f, 1.083678f, -0.251534f, -0.145903f, -0.182783f, 0.070976f,
+ -0.199590f, -0.400306f, -0.029763f, -0.548042f, -0.266270f, -0.118084f,
+ -1.152632f, 0.383685f, -0.105895f, -0.096829f, 0.118382f, 0.047447f,
+ -0.019051f, 0.310180f, -0.162793f, -0.029574f, 0.058054f, -0.636017f,
+ 0.490639f, 0.158347f, -0.385701f, -0.147057f, 1.285825f, -1.276083f,
+ -0.021795f, -0.101600f, 0.163254f, 0.267160f, -2.317864f, -0.098598f,
+ -0.296337f, -0.309017f, 0.164127f, -0.270012f, -0.071187f, -0.262270f,
+ 0.075415f, -0.368328f, 0.186728f, -0.158031f, 0.481663f, 0.515950f,
+ -0.162551f, 0.497981f, 0.262196f, 0.168479f, 0.726066f, -0.243856f,
+ -0.058998f, 0.140168f, 0.053242f, -0.624623f, -0.249480f, 0.055197f,
+ -1.376804f, 0.417571f, 0.203784f, 0.174370f, -0.155531f, -0.029400f,
+ -0.491473f, 0.079811f, -0.080123f, 1.345900f, 0.637077f, 0.434862f,
+ -1.787438f, 0.005756f, -0.362706f, 0.179458f, -0.288263f, 0.516788f,
+ -0.921248f, 0.043794f, -0.137729f, -0.196171f, -0.046295f, -0.793781f,
+ -0.156532f, -0.132566f, 0.517989f, -0.154321f, -0.054174f, -0.077900f,
+ -0.373316f, -0.117718f, 0.188986f, -0.476188f, -0.245312f, 0.181439f,
+ -0.161024f, -0.229059f, -3.079907f, -0.225452f, -0.594355f, -0.558027f,
+ -0.135429f, 0.125766f, -0.081314f, -0.350894f, -0.163165f, -1.936507f,
+ -0.205966f, 0.031472f, 0.744446f, -0.006680f, -0.837551f, 0.605862f,
+ -0.854929f, -1.543750f, -0.307704f, -0.240517f, 0.178240f, -0.183586f,
+ -0.010307f, 0.099373f, -0.228278f, 0.175236f, -0.000133f, 0.104491f,
+ -1.540545f, -0.570971f, -0.252885f, 0.483036f, 0.052531f, 0.260214f,
+ -0.515016f, -0.602081f, -0.485690f, -0.730710f, 0.163719f, -1.775975f,
+ -0.298634f, 0.323626f, -0.373579f, -0.872977f, 0.619574f, 0.026862f,
+ -0.122531f, -0.084698f, -2.436297f, 0.483996f, -0.203640f, -0.302157f,
+ -0.150666f, -0.238320f, 0.089250f, 0.236485f, -0.668654f, -0.122863f,
+ 0.491152f, -0.226444f, -0.181248f, 0.120158f, 0.294027f, 0.250056f,
+ 0.307601f, 0.357875f, -1.746455f, -0.175670f, 0.385447f, -0.108808f,
+ -0.090235f, -0.642504f, -0.486004f, -0.055160f, -0.068692f, 0.009736f,
+ 0.607555f, -0.489426f, 0.150624f, 0.598114f, -0.128816f, -0.445793f,
+ -0.066524f, -0.254380f, 0.227106f, -0.406495f, -0.121632f, -0.275960f,
+ -0.136494f, 0.339457f, -1.318132f, -0.417572f, -2.614077f, 0.324603f,
+ -0.001211f, 0.375192f, -0.473448f, -0.162510f, 0.099329f, -0.277965f,
+ 0.101221f, -0.060263f, 0.121867f, -1.042140f, 0.440851f, 0.078898f,
+ -0.209007f, -0.243699f, 0.715197f, -0.093997f, 0.086022f, -0.178203f,
+ -2.275496f, -0.098413f, 0.199352f, -0.526791f, -0.162086f, -0.197806f,
+ -0.231657f, -0.269202f, -0.794294f, -0.223461f, 0.503584f, 0.416236f,
+ 0.064082f, 0.197655f, 0.340871f, -0.186645f, -0.291498f, 0.433938f,
+ -1.110063f, 0.003751f, 0.392738f, 0.069360f, 0.102088f, -0.302128f,
+ -1.518457f, 0.106939f, 0.404527f, -0.306868f, -0.286928f, 0.729276f,
+ -0.531710f, 0.745048f, -0.168837f, -1.953886f, -0.258828f, -0.190252f,
+ 0.241877f, -0.916744f, -0.030326f, -0.070541f, -0.271037f, 0.211303f,
+ -0.489957f, 0.100850f, 0.323999f, -0.802837f, -0.462408f, -0.079350f,
+ -0.029374f, 0.131213f, -0.825032f, 0.040202f, 0.351821f, 0.002869f,
+ -0.132516f, -0.471264f, -0.297002f, 0.263913f, 0.033478f, 0.146161f,
+ 0.533229f, -0.228608f, -0.200639f, -0.170955f, -0.915037f, 0.724491f,
+ 0.005151f, 0.018584f, -0.029771f, -0.396038f, -0.159236f, 0.038691f,
+ -1.197056f, 0.146302f, 0.226840f, -0.852126f, 0.031214f, 0.108880f,
+ 0.562000f, -0.134633f, -0.713343f, -0.342252f, -1.764521f, -0.114653f,
+ 0.515073f, -0.080515f, -0.121155f, -0.865139f, -0.833694f, -0.368553f,
+ 0.347673f, 0.623379f, 0.722067f, -0.492458f, -0.513263f, 0.585167f,
+ 0.721518f, -0.693499f, 0.343725f, -0.273861f, -0.040230f, -0.785664f,
+ -0.157500f, -0.308445f, 0.054062f, 0.600131f, -0.860887f, 0.434470f,
+ -0.191382f, -0.306150f, -0.243965f, 0.705444f, 0.007789f, -0.146154f,
+ -0.054499f, -0.073500f, -1.067364f, 0.404936f, -2.864590f, 0.182323f,
+ 0.326126f, 0.102405f, -0.135800f, 1.128095f, -0.012267f, -0.023996f,
+ -0.264834f, -0.108967f, -1.176746f, -0.926666f, 0.082999f, -0.498361f,
+ 0.083560f, -0.210074f, 0.019225f, -0.201614f, -0.904760f, 0.181421f,
+ 0.586384f, -0.177706f, 0.065471f, 0.168552f, 0.054705f, 0.045241f,
+ 0.048057f, -0.410957f, -2.188854f, -0.169812f, 0.015521f, 0.176856f,
+ -0.179331f, -0.352640f, -0.491735f, -1.743206f, 0.044227f, 0.010454f,
+ 0.823643f, -0.119781f, -0.098359f, 0.093119f,
+};
+
+// Layer-1 biases of the 128x128 ab partition pruning model: LABEL_SIZE
+// values, the same count as the outputs declared in
+// av1_ab_partition_nnconfig_128 (presumably one bias per output).
+static const float av1_ab_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+ -0.433195f, -0.120488f, -0.116721f, 0.112134f, 0.118170f, -0.259769f,
+ -0.077530f, 0.394044f, 0.279167f, -0.317988f, 0.189538f, 0.314776f,
+ 0.325655f, -0.107123f, 0.591049f, 0.358744f,
+};
+
+// Network description of the 128x128 ab partition pruning model:
+// FEATURE_SIZE inputs, one hidden layer of 64 nodes, LABEL_SIZE outputs.
+// NN_CONFIG itself is declared elsewhere (this header includes
+// av1/encoder/ml.h); the roles of the two pointer groups below are assumed
+// from the table names -- confirm against that declaration.
+static const NN_CONFIG av1_ab_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ // Weight tables, in layer order (layer 0, then layer 1).
+ {
+ av1_ab_partition_nn_weights_128_layer0,
+ av1_ab_partition_nn_weights_128_layer1,
+ },
+ // Bias tables, in the same layer order.
+ {
+ av1_ab_partition_nn_bias_128_layer0,
+ av1_ab_partition_nn_bias_128_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 64x64.
+static const float av1_ab_partition_nn_weights_64_layer0[FEATURE_SIZE * 64] = {
+ -0.495347f, -0.049498f, -0.026804f, 0.030474f, -0.289308f, -0.264193f,
+ -0.141121f, -0.072562f, -0.391665f, -0.051491f, -0.234761f, 0.027155f,
+ -0.038217f, 0.014872f, -0.289728f, -0.233577f, -0.415875f, -0.343615f,
+ -0.442543f, -0.482492f, 0.073510f, 0.007503f, 2.162329f, -0.362849f,
+ 2.145915f, -0.883135f, 0.185636f, -0.062859f, -0.465574f, -0.486205f,
+ -0.056710f, -0.330642f, -0.321860f, 0.042321f, -0.348965f, 0.003542f,
+ -0.291365f, -0.078164f, -0.345093f, -0.220272f, -0.471270f, -0.763853f,
+ 0.246622f, 0.199651f, -0.663420f, -0.154152f, -1.220383f, 0.047138f,
+ 0.816811f, 0.083247f, -0.218839f, 0.038143f, -0.063436f, 0.015517f,
+ -0.307320f, -0.166956f, -0.169499f, -0.399005f, -0.234638f, -0.162266f,
+ 0.050425f, -0.221723f, -0.256942f, -0.287285f, 0.144011f, -0.033245f,
+ 0.083649f, 0.119428f, -0.056706f, -0.117805f, 0.021866f, -0.257300f,
+ -0.201378f, -0.217484f, -0.413780f, -0.145793f, 0.082792f, -0.347247f,
+ 0.042539f, -0.302697f, 1.652316f, 0.000701f, -0.482843f, -0.160332f,
+ -0.450099f, 0.212399f, -4.715360f, -5.336774f, -5.375758f, -6.048339f,
+ 0.085956f, -0.037767f, 1.052409f, -0.931924f, -2.221907f, 0.268946f,
+ 0.015512f, 1.237094f, -1.092185f, 0.418247f, -0.082143f, -0.076914f,
+ -0.060749f, -0.325440f, -0.296960f, -0.066815f, -0.158477f, -0.373945f,
+ -0.122322f, -0.113495f, -0.097978f, -0.192816f, -0.270418f, 0.035840f,
+ -0.015458f, -0.121071f, -0.279582f, -0.067683f, 0.097855f, 0.019839f,
+ 0.451127f, 0.004376f, 1.410392f, 3.255835f, -0.344815f, 0.145202f,
+ 0.204132f, 0.171948f, -0.527736f, -0.110353f, 0.901448f, 0.003238f,
+ -3.822090f, 0.235462f, 1.024823f, -0.821244f, 0.876056f, 2.553762f,
+ -3.478597f, -2.076582f, -0.265515f, -0.055923f, -0.156980f, -0.164097f,
+ -0.246040f, 0.039430f, -0.071769f, -0.118847f, -0.304053f, -0.281541f,
+ -0.226021f, -0.263091f, -0.127359f, -0.249410f, -0.051023f, 0.083911f,
+ 0.084721f, 0.168089f, -0.272169f, -0.204998f, -0.008303f, -0.173998f,
+ 0.079376f, -0.197426f, -0.199052f, -0.118794f, -0.063753f, -0.094769f,
+ 0.066176f, -0.175832f, -0.238752f, -0.287960f, -0.134307f, -0.185953f,
+ -0.385845f, 0.119769f, -0.006567f, -0.382126f, -0.214221f, 0.038449f,
+ -0.253484f, -0.282766f, -0.020249f, -0.193929f, 0.016281f, -0.114423f,
+ -0.145940f, -0.281621f, -0.007588f, -0.131470f, -0.189012f, -0.185699f,
+ -0.279011f, -0.008132f, 0.208463f, 0.020569f, -0.206803f, -0.213408f,
+ -0.206131f, -0.290245f, 0.069701f, -0.000371f, -0.307572f, -0.451785f,
+ -0.300838f, -0.453186f, -0.301691f, 0.046327f, -0.312668f, 0.058272f,
+ -0.303131f, -0.376252f, 0.108384f, -0.086623f, -0.100630f, -0.027330f,
+ -0.003969f, 0.089502f, -0.200722f, -0.107889f, 0.061843f, -0.008478f,
+ -0.265057f, -0.271132f, -0.073562f, 0.129337f, -0.283698f, -0.353414f,
+ 0.076420f, -0.244280f, -0.119537f, -0.105366f, -0.184692f, -0.038817f,
+ -0.478507f, -0.118808f, -0.472979f, -0.305884f, -0.462813f, -0.189581f,
+ -0.011932f, -0.585700f, 0.253212f, -1.061900f, -0.205116f, -0.336407f,
+ -0.762199f, 0.577737f, 0.230832f, 0.434440f, -0.096713f, 0.038552f,
+ -0.147800f, -0.213553f, 0.041740f, -0.281907f, -0.026154f, -0.082356f,
+ -0.331871f, -0.408247f, -0.129022f, -0.037550f, -0.310233f, -0.320883f,
+ -0.391963f, -0.467392f, 0.027453f, -0.394761f, -0.045544f, 0.076052f,
+ 0.483985f, 0.067093f, 0.141361f, 0.576772f, 0.859718f, 2.566515f,
+ -0.025476f, 0.769738f, -0.680235f, -1.683309f, -2.394131f, -0.000714f,
+ -0.615021f, -0.195856f, -0.434035f, -0.295010f, -0.668659f, -0.245959f,
+ 0.551148f, 1.777227f, -0.461630f, 0.043093f, 0.012293f, -0.255841f,
+ -0.097070f, -0.371156f, -0.146323f, -0.015508f, -0.103873f, -0.087476f,
+ -0.297266f, -0.128699f, -0.149555f, 0.016534f, -0.375498f, -0.346759f,
+ -0.455156f, -0.147509f, -0.427076f, -0.354431f, -0.158025f, -0.164604f,
+ -0.237038f, -0.010314f, -0.092884f, -0.397084f, -0.217980f, -0.127184f,
+ -0.048421f, -0.144133f, 0.889073f, 0.012606f, 3.007608f, -0.602584f,
+ -1.849480f, -0.373159f, -1.890695f, -3.609938f, 0.811923f, -1.867208f,
+ -0.244326f, -0.018012f, -0.211192f, -0.220196f, 0.169363f, 0.119141f,
+ -0.230715f, 0.083247f, 0.020367f, -0.128629f, -0.217455f, -0.159640f,
+ 1.815952f, -0.369238f, -1.186447f, -0.658753f, -0.511026f, -0.096934f,
+ 0.662971f, 0.486475f, 0.159746f, -0.018932f, 3.692397f, 1.384353f,
+ -0.401984f, -0.248380f, -0.140861f, 0.215248f, -0.023711f, 0.059679f,
+ -0.072260f, 0.004271f, 0.039545f, -0.347971f, -0.081851f, -0.474896f,
+ -0.181572f, 0.066736f, -0.157822f, -0.163760f, -0.171113f, -0.089935f,
+ -0.338281f, -0.421444f, -0.306687f, -0.085283f, -0.377953f, -0.138750f,
+ -0.102701f, -0.312336f, 0.149831f, 0.007229f, -0.155700f, -0.173611f,
+ 4.074261f, 1.342306f, -1.272712f, 1.570899f, -0.545093f, -0.317605f,
+ -0.189440f, -0.133910f, -0.273190f, -0.108020f, -0.166107f, 0.021413f,
+ -0.239130f, -0.067211f, 0.041957f, -0.039234f, -1.003587f, -0.094412f,
+ 0.532512f, -0.870538f, -1.118023f, -1.160983f, -0.736307f, -0.418752f,
+ 0.419466f, 0.492122f, -0.004368f, -0.022096f, -1.115132f, 0.150886f,
+ 2.396852f, 2.660000f, -0.376537f, 0.468628f, 0.149413f, -0.074898f,
+ -0.067154f, 0.021245f, 0.127857f, 0.294189f, 0.508056f, 0.390232f,
+ -3.899177f, -3.414681f, -3.929195f, -4.160545f, -0.274323f, -0.052583f,
+ -0.003545f, -0.433084f, -0.404891f, -0.145051f, -0.312367f, 0.004579f,
+ -0.398724f, -0.372068f, -0.234279f, 0.017799f, -0.424760f, -0.646717f,
+ -0.047568f, 2.924664f, -0.644165f, 0.359349f, -0.294800f, 0.591746f,
+ -0.404710f, -0.092358f, -0.250729f, 0.030829f, -0.147149f, -0.476023f,
+ -0.071803f, -0.482516f, -0.293117f, -0.215923f, -0.373122f, -0.085315f,
+ -0.377052f, -0.449899f, -0.056452f, 0.138081f, -0.085350f, -0.308391f,
+ 0.106661f, 0.176234f, 0.258869f, -0.230172f, -0.233029f, -0.241208f,
+ -0.067509f, -0.223172f, -0.118353f, -0.302478f, -0.579632f, -0.561326f,
+ -0.158114f, -0.223167f, -0.026689f, 0.051863f, 0.212834f, -0.304714f,
+ -0.169071f, -0.193695f, -0.075682f, -0.170860f, -0.241008f, -0.044648f,
+ 0.280815f, -0.002585f, -0.283552f, -0.037701f, -0.681169f, -0.274535f,
+ -0.380595f, 0.109504f, -0.111141f, -0.437685f, -0.094459f, 0.144206f,
+ -0.106139f, -0.211832f, -0.054742f, -0.172813f, -0.295905f, -0.071907f,
+ -0.418429f, -0.183240f, 0.031319f, -0.095785f, -0.315447f, 0.069404f,
+ -0.422910f, -0.029867f, -0.357321f, -0.199976f, -0.337707f, -0.070188f,
+ -0.178198f, 0.177208f, 0.134688f, -0.081933f, -0.229452f, -0.208872f,
+ 0.026287f, -0.364040f, -0.063696f, -0.227443f, -0.234401f, -0.205699f,
+ -0.267238f, -0.494125f, -0.056255f, 0.053715f, -0.487754f, 0.014818f,
+ 0.087383f, -0.077556f, -0.168085f, -0.436851f, -0.276286f, -0.137845f,
+ -0.107606f, -0.103653f, -0.233766f, -0.419083f, 0.169185f, 0.010186f,
+ -0.001587f, 0.086735f, -2.465718f, 1.482185f, 1.621193f, -2.081680f,
+ 1.386553f, -3.204335f, -0.267111f, -0.004508f, 0.164712f, 0.274147f,
+ 1.724306f, -2.273659f, 0.749574f, -0.891905f, 0.105965f, -0.030428f,
+ -0.416018f, -0.300762f, 0.122911f, -0.316908f, -0.292504f, 0.138666f,
+ -0.161327f, -0.042143f, -0.249128f, 0.149210f, -0.088987f, -0.654101f,
+ -1.501843f, 0.216777f, 0.955914f, 0.524158f, -1.642561f, -1.643626f,
+ 0.864797f, -0.425451f, -2.115764f, -0.012502f, 0.065172f, 1.297270f,
+ 0.018845f, 1.167276f, -0.470970f, -0.244995f, 0.374782f, -1.811056f,
+ -0.055430f, -0.024102f, -0.376519f, -0.339640f, -0.119177f, -0.277995f,
+ -0.290095f, -0.081362f, -0.144139f, -0.118037f, -0.180357f, -0.217559f,
+ -0.370683f, 0.172816f, -0.265069f, 0.194321f, -0.273478f, 0.037442f,
+ -0.235552f, -0.078625f, -0.447541f, 0.016836f, -0.271123f, -0.171481f,
+ -0.321477f, -0.184826f, -0.442981f, -0.227273f, -0.370666f, -0.237232f,
+ -0.257493f, -0.225714f, -0.153716f, -0.283487f, -0.155399f, 0.067697f,
+ 0.230343f, -0.034318f, -0.022687f, -0.047090f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer0[64] = {
+ -0.212182f, -0.233725f, -0.758846f, -0.158162f, 0.614743f, -0.150944f,
+ -0.075727f, -0.208414f, 1.054996f, 0.713758f, -0.300051f, -0.151482f,
+ -2.443570f, 0.430590f, -0.129001f, -0.160733f, -0.230547f, -0.143228f,
+ -0.140577f, -0.086812f, -0.212298f, -0.159557f, -0.055647f, -0.211423f,
+ 0.578161f, -0.220318f, -0.210107f, -3.111584f, 0.604419f, -0.232622f,
+ -0.209924f, -0.130794f, -0.084097f, -0.036005f, 0.294594f, -2.535531f,
+ -0.209783f, -0.211189f, -2.766337f, 0.000000f, 0.450177f, -1.754884f,
+ 3.262664f, -0.209691f, -0.614886f, -0.211257f, -0.109096f, -0.190492f,
+ -0.109007f, -0.026910f, -0.136035f, -0.212321f, -0.139320f, -0.212233f,
+ -0.305430f, 0.739171f, 0.991277f, -0.088150f, 0.086313f, -0.023379f,
+ -0.125366f, -0.063576f, -0.212169f, -0.047463f,
+};
+
+static const float av1_ab_partition_nn_weights_64_layer1[64 * LABEL_SIZE] = {
+ -0.036800f, 0.528721f, 0.490767f, 0.144409f, 1.103640f, 0.361910f,
+ -0.180069f, 0.068033f, -14.868382f, 0.359013f, 0.322567f, -0.199212f,
+ 0.906164f, -0.488254f, 0.149653f, -0.216394f, -0.099347f, 0.004936f,
+ -0.111391f, 0.074848f, -0.041709f, 0.147627f, -0.018905f, 0.096116f,
+ 0.184817f, -0.016241f, 0.115739f, 2.376754f, 0.637097f, 0.052954f,
+ 0.136428f, 0.225267f, -0.181873f, -0.142876f, 0.684048f, 0.658791f,
+ 0.105795f, 0.241705f, 1.381114f, -0.209379f, 1.145949f, 0.795293f,
+ -9.361877f, 0.198302f, 0.539600f, 0.092317f, -0.081695f, 0.200777f,
+ 0.102334f, 0.081583f, 0.060948f, -0.025110f, 0.160951f, -0.020170f,
+ 0.234006f, -0.029369f, 0.375036f, 0.270209f, -0.556529f, 1.402949f,
+ 0.101777f, -0.027331f, 0.004502f, -0.153166f, -0.116651f, 0.151573f,
+ -0.022187f, 0.144044f, -0.108719f, -0.129942f, -0.270321f, 0.227363f,
+ 1.892330f, -0.661052f, -0.219398f, -0.229417f, -0.856438f, -1.196988f,
+ -0.081774f, 0.078847f, -0.207057f, -0.048947f, 0.152073f, -0.243056f,
+ -0.233329f, -0.288689f, -0.158333f, -0.141177f, -0.715436f, 0.016947f,
+ -0.093752f, 0.204984f, -1.209782f, 0.155683f, 0.092239f, 0.146495f,
+ 0.813146f, -0.027757f, 0.330982f, 2.173948f, -0.028867f, -0.141815f,
+ 0.292708f, -0.204794f, 0.014496f, 1.032799f, 1.312155f, 0.107020f,
+ 0.824752f, -0.013945f, 0.184829f, -0.041633f, 0.215300f, -0.476088f,
+ -0.053213f, 0.126862f, -0.020777f, 0.082893f, -0.223727f, -0.923063f,
+ 0.466529f, 0.082140f, -0.845758f, -1.140791f, -0.262033f, 0.138491f,
+ 0.151717f, -0.182479f, -0.131128f, 0.055411f, 0.106771f, 0.125552f,
+ 0.297184f, -0.257403f, -0.059884f, -0.274903f, 2.694357f, -0.108244f,
+ 0.025377f, 0.043092f, -0.558317f, 3.517159f, -0.270833f, -0.240676f,
+ 0.205100f, -0.057068f, -0.140445f, -0.193449f, -0.030061f, -0.286762f,
+ -0.467523f, -0.012647f, 0.190564f, 0.022394f, -0.101479f, 0.339684f,
+ -0.902743f, -0.169578f, -0.178029f, -0.041836f, -3.952108f, -0.028298f,
+ -0.221137f, -0.733895f, -0.223895f, 0.039012f, 0.687867f, 0.021423f,
+ 0.113063f, 0.676087f, -0.961000f, -0.064847f, 0.712856f, -0.192765f,
+ -0.001132f, 0.016689f, -0.236020f, -0.766186f, -0.175729f, 0.012879f,
+ -0.251064f, -0.105523f, -0.039212f, -0.347584f, 0.304352f, -0.034174f,
+ -0.364258f, -0.685252f, -0.266115f, -0.247345f, -0.155905f, 0.152283f,
+ -0.156315f, 0.174082f, -0.757654f, 0.102303f, -2.192316f, -0.245815f,
+ 0.119882f, -0.086542f, 1.987246f, -1.353163f, -0.374813f, -0.233504f,
+ -1.980895f, 0.692093f, -0.168351f, 0.172700f, -0.009052f, -0.015734f,
+ 0.106679f, -0.060472f, -0.256813f, -0.074874f, -0.207488f, -0.329515f,
+ -0.418268f, -0.017940f, -0.036081f, 0.064719f, -1.488016f, 0.020591f,
+ -0.176325f, -0.141074f, 0.944494f, 0.150237f, -0.249805f, -0.277280f,
+ 0.012686f, 0.132483f, 0.116123f, 0.013737f, -0.116091f, 0.750340f,
+ 3.251343f, -0.188864f, 1.096992f, 0.058467f, -0.041433f, -0.037937f,
+ -0.133294f, -0.137908f, -0.171132f, 0.106362f, 0.069383f, -0.052662f,
+ -0.177883f, -0.408049f, 0.680221f, -0.117035f, -0.904240f, -1.395228f,
+ 0.154527f, 0.134427f, 0.022767f, -0.158886f, -0.230316f, 0.161096f,
+ 0.362213f, -0.235060f, -0.941620f, 0.055912f, -0.049458f, -0.166632f,
+ 0.481418f, 0.930146f, 0.041108f, 0.033674f, 1.372066f, -1.847709f,
+ 0.003324f, 0.259534f, 0.177014f, -0.202761f, -0.262017f, -0.190852f,
+ -0.102839f, 0.028338f, 0.187193f, -0.041684f, 0.123973f, -0.198576f,
+ -0.110369f, -1.431400f, 0.208369f, -0.302370f, -0.248549f, 0.062985f,
+ 0.673409f, 0.036662f, -0.711340f, -0.120584f, -0.189789f, 0.098812f,
+ 2.947819f, 0.216567f, -0.414472f, -0.181742f, 1.873779f, -0.222726f,
+ -0.782870f, 0.007889f, 0.015062f, -0.554328f, 0.182928f, -0.191430f,
+ 0.123636f, -0.215460f, -0.225245f, 0.251516f, -0.013025f, -1.359595f,
+ -0.750602f, 0.342667f, -0.141899f, -0.687493f, -0.072639f, 0.048018f,
+ -0.242107f, -0.031917f, -0.287472f, -0.046088f, 0.832197f, -0.016576f,
+ -1.553349f, -0.216341f, 0.023077f, -0.410867f, 4.243743f, -0.514878f,
+ -0.066007f, -0.160696f, -0.262678f, -0.648790f, -0.430586f, 0.199940f,
+ -0.202496f, -0.222241f, -0.016406f, -0.121473f, 0.000828f, -0.081584f,
+ -0.152641f, -0.190166f, 0.644400f, 0.040196f, -0.302104f, -1.143654f,
+ -0.160327f, -0.320780f, -0.187006f, 0.037311f, 0.440618f, -0.070733f,
+ -0.117785f, 1.527539f, -0.419310f, 0.001300f, 1.389956f, -0.036366f,
+ -0.269203f, 0.612265f, 2.721897f, -0.086836f, -0.446999f, 0.012525f,
+ -0.078317f, -0.287052f, -0.111188f, -0.085181f, -0.164667f, -0.010466f,
+ -0.569722f, -0.018888f, -0.101663f, -1.147130f, -0.465204f, 0.114524f,
+ -2.192402f, -0.221325f, 0.375748f, 0.206284f, -0.261548f, -0.246257f,
+ -0.143004f, -0.069981f, -0.057306f, -0.116481f, -0.435903f, -0.314970f,
+ 0.013210f, -0.010175f, 4.630571f, -0.473226f, -0.197199f, -0.028204f,
+ 0.122907f, 2.475548f, 0.025011f, -0.092603f, -0.127561f, -0.151330f,
+ -0.077295f, 0.245016f, -0.045005f, 0.183396f, -0.330556f, -0.384887f,
+ 0.356374f, -0.016618f, -0.463353f, -1.291546f, -0.071986f, -0.311599f,
+ 0.072385f, -0.430786f, -2.094788f, 0.202733f, -0.910109f, -1.336543f,
+ -0.086800f, -0.096413f, 1.544383f, 0.031860f, -0.796211f, 0.762786f,
+ 3.250022f, -0.441798f, -0.698537f, 0.062839f, 0.033525f, -0.362996f,
+ 0.027022f, -1.131264f, -0.228926f, 0.053885f, -0.338628f, 0.155037f,
+ -0.046844f, -0.888172f, -0.241767f, 0.084965f, -0.617743f, -0.049896f,
+ -0.036894f, -0.304783f, -0.002639f, 0.137957f, 0.052121f, -0.131161f,
+ -0.117200f, -0.253380f, -0.205561f, -0.302450f, -0.047397f, -0.330518f,
+ 3.613420f, -1.525951f, -0.026738f, 0.209150f, -2.103534f, 2.019689f,
+ -0.366199f, -0.095260f, 0.027417f, -0.242512f, 0.162579f, 0.052113f,
+ -0.293851f, -0.068138f, -0.005799f, -0.344696f, -0.114824f, -0.431107f,
+ -0.120058f, -1.139926f, -1.048379f, 0.036446f, -0.323020f, -0.432945f,
+ 0.454151f, -0.140058f, 0.050649f, -0.094900f, -0.017278f, -0.238719f,
+ 1.193153f, 0.120447f, -0.496061f, 0.917431f, 2.936126f, -0.115521f,
+ -0.347397f, -0.435325f, -0.004383f, -0.211864f, 0.162383f, -1.040726f,
+ 0.089537f, -0.128579f, -0.133505f, 0.107129f, -0.435657f, -0.180388f,
+ 0.043650f, 0.018709f, -0.773242f, -0.687192f, -0.120633f, -0.063626f,
+ 0.029912f, 0.113972f, -0.403502f, -0.127640f, -0.269625f, 0.129794f,
+ -0.188539f, 0.041641f, 0.029769f, -0.198374f, 1.401407f, 0.353887f,
+ -0.219925f, 0.260515f, 1.157034f, -2.992044f, -0.097618f, -0.064417f,
+ -0.203626f, -0.008217f, -0.112339f, -0.227407f, -0.155118f, 0.247705f,
+ -0.012304f, -0.248447f, -0.913463f, -0.064788f, -0.214619f, -0.251761f,
+ -0.386861f, -0.040574f, -0.163219f, -0.100700f, 1.488274f, -0.071684f,
+ -0.033626f, -0.006497f, -0.246945f, -0.145221f, -3.747390f, 0.149609f,
+ -0.263326f, -0.297385f, -1.039896f, -0.083174f, -0.025473f, -0.235586f,
+ -0.001087f, 0.254286f, 0.265106f, 0.007325f, 0.199239f, 0.134103f,
+ -0.578211f, -0.259801f, -0.062373f, 2.368348f, 0.560556f, -0.252260f,
+ 0.889997f, -0.447872f, -0.059218f, -0.095315f, -0.061667f, 0.183580f,
+ -0.157479f, 0.055387f, -0.831734f, 0.007606f, -1.104906f, 0.301180f,
+ -0.117115f, 0.212959f, 4.727223f, -0.243833f, -0.397495f, -0.025021f,
+ -0.367587f, -2.082058f, -0.217699f, 0.148111f, 0.252430f, 0.111088f,
+ -0.260692f, 0.095124f, -0.407774f, -0.322169f, 0.002927f, 0.126169f,
+ -1.272325f, -0.279772f, -0.373680f, -0.485177f, -0.605458f, 0.021225f,
+ -0.092031f, -0.226585f, 1.895162f, 0.037866f, -0.275475f, 1.614360f,
+ -0.014972f, -0.277679f, -3.449082f, -0.092060f, -0.747873f, 0.020716f,
+ 2.776178f, -0.049963f, 0.183999f, -0.295259f, -0.028868f, 0.221895f,
+ 0.001265f, 0.336823f, 0.219372f, 0.112824f, 0.408132f, -0.017940f,
+ -0.311666f, 1.489606f, -0.058093f, -0.305659f, -0.491933f, -0.143847f,
+ 0.166115f, 0.042867f, -0.123447f, -0.087099f, -0.305395f, -0.365079f,
+ -0.755801f, -0.160649f, 0.736260f, -0.008611f, 0.095836f, -0.017345f,
+ 5.697515f, -0.498971f, -0.125280f, 0.199907f, 0.300053f, 0.605026f,
+ -0.228225f, -0.259523f, 0.016384f, 0.146973f, 0.210258f, 0.226766f,
+ -0.075178f, -0.050924f, 0.188496f, -0.415266f, -0.484880f, -0.236384f,
+ 0.071931f, -0.331863f, -0.601243f, -0.232479f, -0.285272f, 0.123789f,
+ -1.341333f, 0.037082f, -0.315202f, -1.587215f, -0.271576f, 0.003216f,
+ -4.437186f, -0.256205f, -0.576589f, -0.114147f, 2.153916f, -0.369618f,
+ 0.271415f, 0.145036f, -0.158731f, -0.240938f, -0.187369f, 0.036325f,
+ 0.254771f, 0.211488f, -0.240297f, 0.098417f, -0.415011f, 2.334793f,
+ -0.127252f, 0.020069f, -0.168755f, -0.448922f, -0.219207f, 0.016232f,
+ -0.221935f, -0.269500f, -0.100636f, 0.102545f, -0.809376f, -0.054979f,
+ 0.360713f, -0.326541f, 0.112933f, 0.138073f, 4.229404f, -0.763801f,
+ -0.305429f, 0.199955f, -1.787713f, 0.272866f, 0.109895f, 0.138466f,
+ -0.250259f, -0.167162f, -0.212588f, -0.217589f, -0.067125f, -0.077490f,
+ -0.208970f, -0.006863f, -0.671146f, -0.298320f, -0.165509f, 0.044597f,
+ -1.408624f, -0.213957f, -0.220947f, 0.129718f, 1.316777f, -0.098928f,
+ -0.008121f, -0.558293f, -0.297290f, -0.218873f, -4.346638f, -0.228174f,
+ -0.204710f, -0.388864f, 2.697919f, 0.025260f, 0.857020f, 0.009921f,
+ 0.036915f, -0.320275f, -0.087937f, 0.022636f, 0.236667f, 0.135496f,
+ -0.059616f, -0.192955f, 0.009470f, 2.139589f, -0.200449f, 0.129818f,
+ 1.017444f, -0.608299f, 0.257914f, -0.134306f, -0.033327f, 0.002855f,
+ -0.338598f, 0.015559f, 0.117362f, -0.166760f, 0.086903f, -0.167666f,
+ 0.193523f, 0.033852f, -1.147686f, 0.489468f, -0.006969f, 0.125630f,
+ 1.557907f, -1.604449f, -0.071114f, 0.096178f, 0.007065f, 0.200013f,
+ 0.213393f, 0.168466f, -0.100568f, -0.117861f, -0.161542f, -0.072561f,
+ -1.069871f, -0.470138f, -0.352578f, -1.503513f, -0.001394f, -0.380109f,
+ 0.065089f, -0.281668f, 0.988953f, -0.002778f, -0.659026f, -0.470692f,
+ -0.407292f, 0.011710f, -1.362085f, 0.184738f, -0.135786f, -1.374241f,
+ 4.487930f, -0.067274f, -0.956404f, -0.233995f, 0.224527f, -0.454556f,
+ 0.037900f, -0.281658f, 0.208224f, -0.254753f, 0.045740f, 0.051444f,
+ -0.388281f, 0.257112f, -0.485030f, -0.082659f, 0.148103f, -1.007456f,
+ -0.022295f, 0.036984f, -0.369401f, -0.076943f, -0.007636f, -0.293022f,
+ 0.470466f, 0.199012f, -2.158182f, 0.036577f, -0.014725f, -0.229516f,
+ 2.236929f, 0.030945f, -0.400045f, 0.109348f, 0.214691f, -0.891516f,
+ -0.251379f, -0.217358f, 0.013733f, 0.205573f, -0.151725f, -0.191782f,
+ -0.339630f, -0.163905f, -0.119191f, -0.032516f, 0.503015f, 0.025772f,
+ 0.029094f, -1.146153f, 0.216723f, -0.330023f, 0.064695f, -0.262521f,
+ 0.425612f, -0.093080f, -0.489648f, 1.051293f, -0.092332f, 0.095557f,
+ -0.874132f, 0.218483f, -0.127648f, -1.605802f, 2.763617f, -0.186734f,
+ -1.243166f, -0.193514f, -0.173748f, 0.337822f, 0.183873f, -0.251594f,
+ -0.211582f, 0.144081f, 0.029620f, -0.024853f, -0.385140f, 0.467341f,
+ -0.928316f, -0.195442f, 0.917783f, 0.357084f, 0.174445f, -0.073659f,
+ -0.012811f, -0.115420f, -0.181147f, -0.364449f, -0.567395f, -0.012969f,
+ -1.680714f, 0.065323f, 0.198063f, -0.244201f, 1.428545f, -0.432539f,
+ -0.208931f, -0.091205f, 0.957125f, 0.813519f, -0.262677f, 0.246852f,
+ 0.015536f, 0.055026f, 0.067054f, 0.262103f, -0.358115f, -0.095206f,
+ -0.267522f, -0.402710f, -0.680397f, -0.123627f, -0.385590f, -1.504680f,
+ -0.169513f, -0.215338f, 0.043633f, -0.079052f, -0.464410f, 0.122894f,
+ -0.278231f, -2.456445f, -0.159917f, -0.015597f, -0.735449f, -0.078854f,
+ -0.400290f, -1.153870f, 3.657228f, -0.287093f, -1.174355f, -0.102001f,
+ -0.288281f, 0.185209f, -0.145228f, -0.200449f, -0.099914f, -0.138354f,
+ 0.254428f, -0.161751f, -0.118206f, 0.296043f, -0.482613f, 0.080932f,
+ 1.097605f, -0.010190f, 0.232439f, 0.447617f, -0.133508f, 0.115763f,
+ -0.388589f, 0.174695f, -0.236014f, 0.006284f, -1.374129f, 0.092015f,
+ -0.241419f, -0.231667f, 2.763950f, -0.922932f, -0.061605f, 0.208740f,
+ -1.597190f, 1.353325f, -0.198528f, 0.250498f, -0.013950f, -0.203861f,
+ -0.254563f, 0.081931f, -0.413369f, 0.011844f, 0.080961f, -0.231161f,
+ -1.234909f, -0.440843f, -0.174980f, -0.315283f, -0.337474f, -0.123243f,
+ -0.310001f, -0.271028f, 0.364179f, 0.022845f, -0.535517f, -0.772936f,
+ -0.188435f, 0.039667f, -0.807463f, 0.266550f, -0.288857f, -1.630789f,
+ 1.280155f, 0.065712f, -0.279960f, -0.300056f, 0.258440f, -0.073781f,
+ 0.213878f, 0.042196f, 0.021360f, 0.211698f, -0.003751f, -0.192673f,
+ -0.137008f, 0.247878f, -0.470604f, 0.073164f, 1.523241f, 0.734755f,
+ -0.114126f, -0.193834f, -0.025759f, 0.263183f,
+};
+
+static const float av1_ab_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.343508f, -0.706936f, -0.160676f, -0.877101f, -0.517567f, -0.253254f,
+ -0.148074f, 0.923430f, -0.364770f, 0.203550f, 0.401216f, 0.938246f,
+ -0.872737f, 0.718723f, 0.703398f, 2.560015f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_64_layer0,
+ av1_ab_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_64_layer0,
+ av1_ab_partition_nn_bias_64_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 32x32.
+static const float av1_ab_partition_nn_weights_32_layer0[FEATURE_SIZE * 64] = {
+ -0.323723f, -0.214013f, -0.007772f, -0.458851f, -0.125542f, -0.123860f,
+ -0.410973f, -0.209389f, -0.087580f, -0.272881f, -0.168500f, -1.130845f,
+ 0.344916f, -0.475017f, -0.362262f, -0.195662f, -0.566124f, 0.782163f,
+ 0.411575f, -0.013378f, -0.318650f, -0.124678f, -0.612909f, -0.315788f,
+ -0.263990f, -0.508783f, -0.048938f, -0.416407f, -0.402648f, -0.156644f,
+ 0.225887f, -0.000493f, 2.682241f, 0.871204f, 0.059014f, 0.803542f,
+ -1.407028f, -1.154669f, 1.388148f, -0.293348f, -0.003669f, -0.009607f,
+ 1.330030f, -0.337841f, 2.118617f, 1.033059f, -0.084788f, 0.212904f,
+ 0.082405f, -0.070579f, -0.494005f, -0.173392f, 0.039546f, -0.463865f,
+ 0.077163f, -0.434066f, 0.030835f, -0.427139f, -0.560520f, -0.031606f,
+ -0.368541f, -0.027458f, 0.370574f, 0.461418f, 1.087682f, -0.572137f,
+ -1.509596f, -0.765697f, -0.499383f, -0.277998f, -0.106492f, -0.129564f,
+ -0.169133f, -0.269834f, -0.114270f, -0.275431f, 0.016339f, -0.156744f,
+ -0.267922f, 0.171216f, 0.110556f, 0.002954f, -0.200327f, -0.187663f,
+ 3.691601f, 1.234152f, 0.186315f, -0.125370f, -0.211235f, -0.554432f,
+ -0.131072f, -0.124982f, -0.130339f, -0.235350f, 0.018903f, 0.012896f,
+ -0.159372f, -0.269571f, -0.025709f, -0.221251f, 0.061919f, 0.016307f,
+ 0.384673f, -0.134525f, -1.599126f, -0.416459f, -0.743052f, 0.670249f,
+ -0.169709f, 0.421681f, -0.033360f, -0.072817f, 0.003647f, -0.110632f,
+ -0.158651f, -0.095136f, 0.223759f, 0.165767f, -0.269129f, -0.196075f,
+ -0.023183f, -0.293420f, 0.014875f, 0.018688f, -0.153407f, -0.172009f,
+ -0.259947f, -0.124015f, 0.173653f, -0.089103f, -0.021001f, -0.334230f,
+ 0.027177f, 0.103371f, -0.183860f, -0.204051f, -0.023721f, -0.192297f,
+ -0.143771f, -0.247106f, 0.218116f, -0.013240f, 2.831783f, 1.483928f,
+ -0.877025f, -0.313462f, -0.411320f, -0.447825f, 0.605977f, 0.234684f,
+ -0.119150f, -0.075182f, -0.330463f, 0.071503f, -0.254924f, -0.360071f,
+ -0.037022f, 0.063261f, -0.148759f, -0.238254f, -0.462018f, -0.027166f,
+ 0.065318f, -0.235743f, -0.257194f, -0.094784f, 0.022423f, 0.055925f,
+ 0.086672f, -0.021010f, 0.009965f, -0.001648f, -0.104917f, -0.387443f,
+ -0.102673f, -0.281706f, 0.145923f, -0.233391f, -0.378365f, -0.145584f,
+ -0.077751f, -0.121166f, 1.134565f, -0.097500f, -0.749202f, -0.544566f,
+ -1.361374f, -0.102494f, 1.089275f, 0.375299f, -0.105091f, 0.037641f,
+ -0.054248f, -0.282691f, -0.377797f, -0.066427f, -0.253815f, -0.329677f,
+ -0.339326f, -0.128217f, -0.282905f, 0.014937f, 1.067185f, -0.171764f,
+ 0.484458f, 0.396706f, -0.557055f, -0.891596f, -0.257839f, -0.720879f,
+ -0.218449f, -0.004755f, 1.572857f, 0.006229f, 1.962895f, -0.029746f,
+ -4.137691f, -2.185991f, -2.763477f, -0.520437f, -0.208708f, 0.006444f,
+ -1.263078f, -0.304560f, 1.072374f, 2.556429f, 0.312850f, 0.257488f,
+ -0.634264f, 0.156769f, -0.188943f, 0.040295f, -0.389915f, 0.085250f,
+ -0.248525f, 0.045667f, -0.776115f, -0.274680f, -0.448145f, -0.566161f,
+ -1.285316f, 0.079060f, 0.389124f, -0.510401f, -0.015299f, -0.664661f,
+ 0.099901f, -0.470694f, -0.051593f, -1.076381f, -0.442104f, -0.197867f,
+ -0.330011f, -0.448523f, -0.301018f, -0.442093f, -0.491953f, -0.582091f,
+ -0.064569f, -0.156516f, 0.543522f, -0.005924f, 0.161432f, 0.974793f,
+ 0.273712f, 1.104850f, -0.290312f, 0.313417f, -0.125370f, 0.136234f,
+ -0.191227f, -0.165054f, 0.011872f, -0.298871f, 0.095740f, 0.142760f,
+ -0.215771f, -0.031437f, 0.101041f, -0.085620f, 0.435387f, 0.002786f,
+ 1.971375f, 0.018392f, -1.771940f, -0.401433f, 0.808263f, -3.350013f,
+ 2.296952f, -1.024403f, -0.041645f, -0.034799f, -0.024078f, -0.347301f,
+ -0.276088f, -0.455907f, 0.266021f, 0.087348f, -0.146566f, 0.040492f,
+ -0.539866f, -0.206851f, -0.387874f, -0.125508f, -0.496676f, -0.373845f,
+ -0.472356f, -0.357082f, -0.081254f, -0.456466f, 0.554713f, 0.002185f,
+ -4.225019f, 0.344025f, 0.728796f, -0.262936f, 1.383924f, 1.577300f,
+ -2.653320f, -2.516156f, -0.301604f, -0.204105f, -0.138252f, -0.587536f,
+ -0.097889f, -0.352414f, -0.288276f, -0.184340f, -0.122741f, -0.243376f,
+ 0.031970f, -0.373402f, -0.396079f, 0.045566f, 0.072595f, -0.222681f,
+ -0.243802f, -0.340129f, -0.258494f, -0.192041f, -0.386112f, -0.240940f,
+ -0.047268f, -0.555802f, -0.032514f, -0.241341f, -0.167463f, -0.478308f,
+ -0.205936f, -0.316275f, 0.103729f, -0.197893f, -0.128029f, -0.218796f,
+ -0.167362f, -0.111814f, -0.126062f, -0.394260f, -0.025357f, -0.402697f,
+ -0.587395f, -0.400385f, -0.259664f, -0.415588f, -0.338503f, -0.399166f,
+ -0.270504f, 0.234505f, 0.272144f, 0.266938f, -0.392395f, -0.011717f,
+ -0.384221f, -0.473446f, -0.038420f, -0.241101f, -0.234402f, -0.275567f,
+ -0.410454f, -0.377599f, -0.179099f, -0.138432f, -0.248083f, -0.543026f,
+ -0.428043f, -0.239895f, -0.333193f, -0.103346f, -0.039038f, -0.171109f,
+ -0.119432f, -0.222351f, 0.000450f, 0.208724f, -0.510526f, -0.144656f,
+ -0.316721f, -0.344846f, -0.244794f, -0.129134f, -0.045634f, -0.400183f,
+ 0.043714f, -0.235414f, 0.115594f, -0.195616f, -0.106693f, -0.124242f,
+ 0.083990f, 0.049110f, -0.196130f, -0.059860f, -0.464235f, -0.516443f,
+ -0.101521f, -0.422379f, -0.413955f, -0.042991f, -0.345263f, -0.129264f,
+ -0.106911f, -0.140156f, -0.457841f, -0.199848f, -0.218954f, -0.329850f,
+ -0.364097f, -0.335262f, -0.312254f, -0.299331f, -0.052710f, -0.251019f,
+ -0.023459f, -0.222538f, 0.028849f, -0.088038f, -0.301550f, -0.273566f,
+ 0.067295f, -0.174608f, -0.445784f, -0.158366f, -0.567275f, -0.557652f,
+ -0.353503f, -0.302092f, -0.302049f, -0.551793f, -0.034535f, -0.225190f,
+ -0.210733f, -0.219377f, -0.057197f, -0.430933f, -0.025185f, -0.388150f,
+ -0.086147f, -0.430088f, 0.058466f, -0.152129f, -0.058411f, -0.236392f,
+ -0.547669f, -0.613849f, -0.893774f, -0.351715f, -0.399227f, -0.454909f,
+ -0.324501f, 0.000490f, -0.282167f, -0.073163f, -0.281452f, 0.047932f,
+ -0.175500f, 0.165220f, -0.276212f, 0.062153f, -0.217054f, -0.255487f,
+ -0.146416f, -0.097718f, -0.173809f, -0.559328f, -0.055695f, -0.391193f,
+ -0.132020f, -0.561184f, -0.308666f, -0.474053f, -0.219149f, -0.246558f,
+ -0.158325f, 0.151907f, -0.266835f, -0.144697f, -0.193960f, -0.046587f,
+ -0.220028f, -0.247355f, 0.135584f, 0.016511f, 0.367705f, -1.855877f,
+ 0.435622f, 0.444710f, -3.372301f, -3.030489f, 1.013267f, 0.380951f,
+ -0.170011f, -0.111415f, -0.456146f, -0.107254f, -0.095220f, -0.053078f,
+ -0.135864f, -0.591949f, -0.252810f, -0.324799f, -0.094796f, -0.260969f,
+ -0.391981f, -0.063170f, -0.336130f, -0.470127f, -0.405168f, -0.433219f,
+ -0.309563f, -0.295462f, -0.552270f, -0.012300f, -0.057793f, -0.034494f,
+ -0.446843f, -0.640160f, -1.188681f, -0.791361f, 0.543271f, 1.189112f,
+ 1.458468f, -0.005876f, -0.927475f, 0.062038f, -1.170818f, 0.338227f,
+ -3.007096f, -4.559296f, -4.045457f, -5.953635f, -0.228386f, -0.266890f,
+ -0.092595f, -0.377440f, -0.044534f, -0.053565f, -0.349268f, -0.415030f,
+ -0.310094f, 0.062721f, 0.251422f, -0.014350f, -1.282910f, 1.619560f,
+ 1.180566f, -0.032163f, -1.322951f, -0.603601f, 1.443710f, 0.654650f,
+ -0.393227f, 0.003536f, 0.029725f, -0.108925f, -0.053911f, 0.133977f,
+ -0.036145f, -0.168438f, 0.046989f, -0.331463f, -0.176983f, -0.311922f,
+ -0.272389f, -0.379592f, -0.399993f, -0.297873f, -0.193425f, -0.177524f,
+ -0.258309f, -0.567312f, -0.260217f, -0.241869f, 0.024010f, -0.032867f,
+ -0.039424f, -0.063670f, 0.193808f, -0.303514f, -0.013376f, -0.057761f,
+ 0.187922f, 0.006938f, 0.031810f, 0.180594f, -1.198427f, 2.820662f,
+ 0.154986f, -0.375518f, 0.116925f, -0.795782f, -0.085139f, -0.079365f,
+ -0.197936f, -0.321468f, -0.205271f, -0.558203f, -0.296235f, -0.151193f,
+ -0.158282f, -0.245402f, -0.208504f, -0.042335f, -0.087426f, -0.557129f,
+ -0.381427f, -0.441551f, -0.541011f, -0.060567f, -0.469305f, -0.032326f,
+ -2.453587f, -0.045568f, -0.296932f, 0.613061f, -0.320284f, 0.191620f,
+ -0.827145f, -0.225277f, 0.275800f, 1.696635f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer0[64] = {
+ -0.176206f, 0.660189f, -0.186156f, -2.481963f, -1.564218f, -0.280424f,
+ 0.732684f, -0.135581f, -2.193132f, -0.172771f, 0.605001f, -0.060392f,
+ -0.067190f, -0.132969f, -1.410812f, -0.298701f, -0.105963f, -0.086173f,
+ 0.632779f, 0.005585f, 1.310169f, 1.392136f, -0.563860f, -0.051053f,
+ 0.660998f, -0.214726f, -1.894342f, -0.128288f, -0.330721f, -0.053988f,
+ -0.177726f, 1.200859f, -0.178902f, -0.172620f, -0.184476f, -0.175559f,
+ 0.538503f, -0.322158f, -0.219080f, -0.058208f, -0.171347f, -0.216060f,
+ -0.174950f, -0.295740f, -0.184820f, -0.213896f, 1.317728f, -0.020116f,
+ -0.208096f, 0.000000f, 1.246166f, -0.225421f, -0.181555f, 0.861761f,
+ 1.172429f, -0.172892f, -0.737092f, -0.189904f, -0.179385f, -0.114618f,
+ -1.384604f, -0.201713f, -0.271948f, 0.372351f,
+};
+
+static const float av1_ab_partition_nn_weights_32_layer1[64 * 16] = {
+ -0.037828f, 1.529029f, 0.004927f, 1.475763f, 0.627172f, 0.325872f,
+ -0.990757f, 0.129476f, 0.889958f, -0.082031f, 0.332133f, 0.074422f,
+ -0.176212f, -0.074355f, 0.774378f, 0.110987f, -0.155469f, 0.253310f,
+ 0.882538f, 0.253605f, 0.332436f, -5.389474f, 0.278470f, 0.168644f,
+ 0.914611f, 0.154165f, 0.809262f, -0.174734f, 0.923673f, 0.064716f,
+ -0.070228f, -0.228735f, 0.002312f, 0.112222f, -0.045502f, -0.046004f,
+ 0.514101f, 0.306480f, 0.021232f, -0.015955f, -0.288260f, 0.189177f,
+ -0.104158f, 0.103273f, 0.096910f, -0.086328f, 1.327289f, -0.154247f,
+ 0.056676f, -0.243327f, -0.646676f, 0.177221f, -0.086761f, 0.729729f,
+ -14.710893f, -0.044881f, 0.339003f, -0.134737f, 0.073621f, -0.162913f,
+ 1.215237f, 0.140723f, 0.138630f, 1.241719f, 0.204092f, -0.463080f,
+ -0.176086f, 1.125868f, 1.034814f, 0.225455f, -0.203421f, -0.078787f,
+ -0.527498f, 0.012491f, -0.563307f, -0.170792f, 0.002679f, 0.116153f,
+ 0.211348f, -0.191900f, -0.212505f, 0.263445f, -0.074679f, -0.081441f,
+ -0.815405f, 2.448215f, 0.781299f, 0.149542f, -1.045162f, 0.043014f,
+ 0.217381f, -0.094500f, -0.090427f, 0.025784f, -0.228906f, -2.741798f,
+ 0.230475f, -0.256112f, -0.103297f, 0.159121f, -0.229793f, -0.014883f,
+ -0.104131f, -0.123816f, 0.164148f, -0.052279f, -0.071845f, -0.041197f,
+ 0.208527f, -0.234197f, -0.542336f, 0.020053f, 0.088870f, 0.014346f,
+ 2.502164f, -0.010244f, -0.267792f, 0.844394f, 2.711486f, -0.015262f,
+ -0.868053f, -0.295704f, 0.222289f, -0.000286f, -0.352098f, -0.079000f,
+ 0.021267f, -0.721739f, -0.240558f, -0.384775f, 0.065974f, -2.161058f,
+ 0.195889f, 0.268966f, -0.009329f, 0.014949f, 0.314943f, 0.235885f,
+ 0.072591f, -0.127120f, 0.150784f, 0.105697f, -1.297403f, -0.207509f,
+ -0.217688f, -0.076752f, 0.170952f, -0.294235f, 0.449973f, -1.712690f,
+ 0.860989f, 0.054757f, -0.812627f, -0.105316f, -0.736230f, -0.133192f,
+ -3.741608f, 0.495660f, -0.288936f, 4.654852f, -0.021305f, -0.308916f,
+ 0.049205f, -0.259996f, 0.114248f, -0.252647f, -0.253180f, -0.449314f,
+ 0.022979f, 0.063281f, -0.196154f, 0.078295f, -0.322317f, -0.145142f,
+ 0.300573f, 0.048385f, -0.254787f, 0.123939f, -1.263088f, -0.228565f,
+ -0.389061f, 0.391084f, 2.322438f, 0.075009f, 0.225743f, -0.198808f,
+ -0.280538f, -0.173939f, -0.120543f, -0.070792f, -0.417187f, -0.781056f,
+ -0.102756f, -1.760965f, 0.019149f, -0.867342f, 0.347141f, 0.031588f,
+ 0.302572f, -0.203573f, -0.357320f, -0.096078f, -0.527528f, 0.046699f,
+ -0.108561f, -0.167077f, -2.851509f, -0.307116f, 0.202720f, -0.160280f,
+ -0.215525f, 0.064355f, -0.427220f, 1.516230f, 0.634453f, 0.099400f,
+ -1.013887f, -0.029740f, -0.093426f, -0.044272f, -1.297636f, -0.237614f,
+ -0.160953f, 0.399036f, -0.030685f, -0.113619f, -0.184704f, 0.040519f,
+ -0.588252f, -0.210235f, -0.067623f, -0.031841f, -0.107261f, -0.192582f,
+ -0.253959f, -0.430821f, -0.103184f, -0.280185f, -0.357723f, 0.197761f,
+ -0.175087f, -0.055171f, 1.642014f, -0.192559f, -0.288147f, 0.610311f,
+ 4.688195f, -0.128728f, -0.914869f, -0.108286f, 0.013789f, 0.092125f,
+ 0.019770f, -0.178386f, 0.074164f, -1.152658f, -0.216738f, -0.277286f,
+ 0.012381f, 0.418259f, -0.680727f, -0.221917f, -0.485946f, 0.101672f,
+ 2.009457f, 0.054302f, 1.019838f, -0.116170f, 0.165134f, -0.112567f,
+ 0.852632f, -0.385796f, -0.108666f, 0.053181f, -0.311797f, -0.372875f,
+ -0.675717f, 2.409268f, -0.514720f, -0.214245f, -0.646596f, 0.009756f,
+ 0.203993f, 0.093617f, -0.301290f, 0.253551f, -0.128909f, -1.448442f,
+ -0.186823f, -0.278001f, -0.294993f, -0.176928f, -0.473605f, 0.062049f,
+ -0.212084f, -0.137326f, 0.012505f, 0.087850f, -0.200413f, -0.394119f,
+ -0.132224f, 0.146917f, 0.155746f, 0.198725f, -0.322541f, 0.196391f,
+ -0.945500f, 0.036736f, -0.155646f, -0.677341f, 1.130545f, -0.339554f,
+ 0.411628f, -0.355813f, -0.249843f, 0.213694f, -2.035607f, 0.055694f,
+ -0.111669f, 0.408696f, -0.067043f, -0.048182f, 0.398110f, -0.067542f,
+ 1.459801f, 0.236833f, -0.178806f, 0.168758f, 0.492387f, 0.099691f,
+ -0.776680f, -0.172865f, 0.204225f, 0.193982f, 0.575685f, -0.062248f,
+ 0.011486f, 0.058571f, -0.493391f, 0.026893f, -0.900467f, 3.793129f,
+ -0.634613f, -0.064660f, -0.048262f, 0.361905f, 0.033641f, 0.245171f,
+ -0.064671f, 0.034954f, 0.204358f, -0.904023f, -0.052714f, -0.250134f,
+ 0.136700f, 0.000734f, -0.371720f, 0.226483f, 0.217958f, 0.060559f,
+ 0.180111f, 0.000970f, 0.079556f, -0.096775f, 0.093855f, -0.026224f,
+ -0.243664f, 0.004290f, 0.123281f, -0.239476f, 1.230374f, -0.107826f,
+ -0.101982f, -0.153917f, 5.464427f, 0.304375f, -0.809957f, 0.090564f,
+ -0.278416f, -0.245555f, -2.078421f, 0.243093f, -0.127666f, 0.052451f,
+ -0.126662f, -0.783505f, 0.025149f, -1.422675f, -0.207769f, -0.362547f,
+ 0.115310f, 0.133390f, 1.264754f, -0.027055f, -0.485312f, -0.240717f,
+ -0.239722f, 0.146818f, -1.265043f, -0.235553f, 0.267104f, -0.021357f,
+ -0.435949f, -0.309371f, 0.049920f, 1.302721f, -0.233978f, -0.097551f,
+ -0.240631f, -0.287821f, -0.378380f, -0.273131f, -3.075169f, 0.226404f,
+ -0.029361f, 2.703590f, -0.430659f, 0.067927f, -0.387520f, -0.370630f,
+ -0.229236f, 0.085653f, -0.370956f, -0.065556f, -0.187859f, 0.068309f,
+ -0.109299f, -0.259898f, -0.103644f, -0.271199f, -0.209350f, 0.140993f,
+ -0.196713f, -0.135508f, -1.423209f, -0.406385f, -0.019956f, -0.864694f,
+ 5.963707f, -0.201157f, 0.726377f, -0.011076f, 0.010553f, -0.102918f,
+ -2.230088f, -0.258098f, -0.039547f, -0.029262f, -0.082324f, -0.860222f,
+ -0.094735f, -1.381839f, 0.587298f, -0.173048f, 0.721360f, 0.241900f,
+ 0.764302f, -0.023609f, -1.173755f, 0.103912f, -0.185363f, 0.078435f,
+ -2.245062f, -0.127269f, 0.202234f, 0.158975f, -0.260909f, 0.098608f,
+ -0.348247f, 1.732502f, -0.412298f, -0.269602f, -0.425771f, -0.146243f,
+ -0.530730f, 0.125716f, -1.004419f, 0.145109f, -0.059289f, 1.096304f,
+ 0.012891f, 0.045033f, -0.306875f, 0.003514f, -0.176110f, 0.037544f,
+ -0.441537f, -0.518921f, -0.262149f, -0.060407f, -0.379419f, -0.141245f,
+ -0.128894f, -0.176537f, -1.161318f, -0.249100f, -0.118330f, 0.042816f,
+ 1.173404f, 0.088312f, -0.393568f, -0.175134f, 6.529819f, -0.326652f,
+ -0.631917f, -0.393476f, 0.057781f, -0.217748f, -1.781139f, -0.012614f,
+ -0.212621f, -0.720322f, -0.218498f, -0.388556f, -0.254796f, -0.248399f,
+ -0.608744f, -0.265146f, 0.238517f, 0.066882f, -2.916806f, 0.054642f,
+ 0.282590f, 0.075248f, 0.010188f, -0.133486f, 0.985945f, -0.045849f,
+ -0.347564f, 0.057320f, -0.417920f, 0.063664f, 0.387062f, -2.692059f,
+ -0.535549f, 0.263736f, 0.327889f, -0.070273f, -0.775254f, 0.147250f,
+ 3.309425f, -0.212191f, -0.067204f, -2.912663f, -0.061496f, 0.084233f,
+ 0.022907f, 0.138421f, -0.112159f, -0.288447f, -0.010799f, 0.056049f,
+ -0.036527f, 0.021525f, 0.106649f, -0.291883f, 0.088424f, -0.057773f,
+ -0.086031f, 0.015277f, -0.318505f, -0.269049f, -1.008913f, -0.224785f,
+ -0.025820f, -0.649037f, 0.706381f, 0.096410f, 0.643776f, -0.046743f,
+ -0.009654f, -0.024246f, 1.469255f, -0.183536f, -0.370046f, -0.048442f,
+ -0.376527f, -0.431264f, -0.245109f, -0.093951f, 0.203683f, -0.099872f,
+ 0.087210f, 0.160692f, -3.527694f, -0.068891f, -0.228994f, -0.231817f,
+ -0.241949f, 0.193613f, 0.979597f, -0.091259f, 0.414424f, -0.047341f,
+ -0.209582f, -0.295134f, -0.016824f, 0.460327f, -0.072671f, 0.246234f,
+ 0.235896f, 0.127238f, -1.068683f, 0.035648f, 2.254888f, 0.180105f,
+ -0.260098f, -2.322120f, -0.184249f, -0.314801f, -0.099969f, -0.272117f,
+ -0.237916f, 0.031103f, -0.274063f, -0.049384f, -0.044917f, 0.102477f,
+ -0.342148f, -0.257558f, -0.346300f, 0.115333f, -0.115456f, 0.208354f,
+ -0.359301f, -0.167395f, 1.146514f, -0.177861f, -0.098658f, -0.444570f,
+ 6.759993f, -0.369772f, -0.831118f, 0.001866f, -0.073298f, -0.072095f,
+ 0.811902f, -0.431997f, -0.286587f, -0.269500f, 0.111492f, -0.525364f,
+ -0.351785f, -2.463474f, -1.852659f, 0.135325f, 0.138267f, 0.100643f,
+ -2.373278f, -0.285514f, -0.395388f, -0.185016f, -0.030249f, -0.005767f,
+ -0.716424f, -0.031674f, 0.011147f, 0.057405f, -0.215873f, -0.094401f,
+ 0.573528f, -1.223820f, 0.414852f, -0.059053f, -0.076488f, -0.287168f,
+ -0.842640f, 0.174084f, -0.567186f, 0.336629f, -0.062514f, 2.075448f,
+ -0.061680f, -0.131529f, -0.098994f, -0.204111f, -0.347865f, 0.108516f,
+ -0.049616f, -0.069212f, -0.273935f, -0.096545f, -0.210784f, -0.284698f,
+ 0.141501f, -0.176924f, -0.361341f, -0.251197f, -0.286694f, 0.245569f,
+ -1.521661f, -0.122639f, -0.015760f, -0.718912f, 5.877828f, 0.146916f,
+ 0.151767f, 0.220785f, -0.032298f, 0.230902f, 0.663943f, -0.252613f,
+ 0.057718f, -0.436038f, -0.323994f, -1.139787f, -0.042489f, -1.326298f,
+ -1.031206f, -0.104136f, 0.389897f, 0.127602f, -2.667789f, -0.212366f,
+ -0.506262f, -0.009115f, -0.213202f, 0.076167f, -1.629405f, 0.055129f,
+ 0.375393f, -0.150272f, -0.241515f, -0.326497f, 0.100069f, 0.410703f,
+ 0.340622f, 0.042437f, -0.349945f, 0.041176f, -1.178950f, 0.030992f,
+ 0.933908f, -0.035844f, -0.098660f, 1.030584f, -0.092043f, -0.355739f,
+ -0.305562f, 0.036161f, -0.049558f, -0.033225f, -0.403856f, -0.088276f,
+ 0.215493f, -0.149105f, -0.013363f, 0.025886f, -0.101306f, -0.205781f,
+ -1.072487f, -0.076019f, 0.077555f, 0.131003f, 1.267763f, -0.008954f,
+ -0.327617f, -0.246539f, 6.664081f, -0.404403f, -1.442489f, 0.191301f,
+ -0.336361f, 0.181156f, 0.833108f, 0.007879f, -0.194464f, -1.029408f,
+ -0.036268f, -0.927110f, -0.379190f, -0.293443f, -1.848579f, -0.242548f,
+ -0.065990f, 0.203160f, -0.291788f, 0.000680f, 0.587011f, -0.241289f,
+ 0.037034f, 0.000552f, 1.072308f, -0.387230f, -0.230050f, 0.292322f,
+ -0.720001f, 0.034109f, -0.467260f, 2.211644f, -1.839191f, -0.048797f,
+ -0.083469f, -0.334686f, -0.269056f, 0.051295f, 1.319904f, -0.035603f,
+ -0.018457f, -0.824915f, -0.212285f, -0.230516f, -0.035093f, -0.400843f,
+ -0.305469f, -0.099011f, 0.014225f, -0.452772f, 0.170331f, -0.389312f,
+ -0.115084f, -0.014770f, -0.429387f, -0.155961f, -0.568200f, -0.037853f,
+ -0.125137f, 0.067228f, -1.329271f, -0.117874f, -0.132499f, -0.218376f,
+ -0.588325f, -0.320024f, 0.085695f, -0.235047f, -0.217790f, 0.103015f,
+ -0.698644f, 0.017766f, -0.058299f, 0.199411f, -0.122485f, -0.563949f,
+ -0.349011f, -0.557045f, -0.131165f, 0.002281f, 0.118559f, -0.210302f,
+ -1.153815f, 0.116738f, -0.236007f, -0.003487f, -0.006885f, -0.244816f,
+ 0.953222f, 0.093748f, 0.266869f, 0.241869f, -0.860832f, -0.387012f,
+ -0.338986f, 2.097515f, -1.942512f, -0.298021f, 0.543911f, -0.043214f,
+ 0.082125f, -0.120242f, 0.712231f, 0.213327f, -0.301687f, -0.544011f,
+ -0.392131f, 0.004302f, 0.004825f, -0.317440f, -0.107518f, -0.293407f,
+ -0.159111f, -0.080367f, 0.132663f, -0.017726f, -0.237521f, -0.190297f,
+ -0.361633f, 0.200518f, -0.538296f, -0.027975f, -0.381704f, -0.016963f,
+ 0.630105f, -0.190997f, -0.287840f, -0.603488f, 3.605598f, -0.276614f,
+ -1.346383f, 0.186912f, -0.047575f, -0.189232f, -1.519072f, 0.097816f,
+ -0.223722f, 0.304924f, -0.213022f, -1.052433f, -0.322283f, -1.706734f,
+ -2.458027f, 0.237976f, 0.171050f, -0.103139f, -0.278689f, 0.329824f,
+ -0.262448f, -0.122916f, -0.236398f, -0.013848f, -0.969160f, -0.374907f,
+ 0.091018f, -0.386471f, -0.723940f, 0.064956f, -0.057652f, 1.321024f,
+ -1.397418f, -0.143136f, 0.272468f, -0.030749f, 0.037324f, 0.069316f,
+ -0.904925f, -0.333693f, -0.117709f, 2.279598f, -0.428065f, -0.131157f,
+ -0.014288f, -0.402862f, -0.666090f, 0.017070f, -0.028333f, 0.002481f,
+ 0.197156f, -0.038120f, -0.271062f, -0.188275f, -0.021370f, -0.070849f,
+ -0.905007f, -0.095886f, -0.093055f, -0.121821f, -1.239812f, -0.411799f,
+ -0.089948f, -0.936827f, 1.437569f, -0.388908f, 0.126170f, 0.186162f,
+ -0.018819f, -0.138364f, -1.066412f, -0.138222f, -0.022186f, 0.107331f,
+ -0.230436f, -1.352605f, -0.161323f, -1.081810f, -0.933825f, -0.136675f,
+ 0.378157f, 0.113377f, -0.850610f, 0.080245f, -0.087305f, -0.002852f,
+ 0.044408f, -0.188172f, -1.891998f, 0.092189f, 0.125325f, -0.105090f,
+ -0.848510f, -0.396308f, -0.384130f, 2.007509f, -1.480787f, -0.126946f,
+ 0.314767f, 0.000195f, -0.285628f, -0.110442f, -0.293948f, 0.258559f,
+ -0.417603f, 1.570705f, 0.092459f, -0.340974f, -0.284754f, -0.007801f,
+ -0.324610f, -0.004734f, -0.207716f, -0.057175f, 0.055467f, -0.210830f,
+ -0.113005f, -0.299177f, 0.068074f, 0.017929f, -2.897598f, -0.260074f,
+ -0.014422f, -0.206467f, 1.246997f, -0.372863f, -0.214160f, -0.114035f,
+ 5.805862f, 0.003611f, -1.340990f, -0.021085f, -0.260431f, -0.002720f,
+ -1.251640f, -0.353531f, -0.304009f, -0.153376f,
+};
+
+static const float av1_ab_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.521497f, -1.061572f, -0.078756f, -0.660662f, -0.403741f, -0.960163f,
+ 0.001427f, 0.523607f, 0.225068f, -0.055273f, 1.019519f, 1.181880f,
+ -0.010198f, 0.130597f, 1.276752f, 2.028188f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_32_layer0,
+ av1_ab_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_32_layer0,
+ av1_ab_partition_nn_bias_32_layer1,
+ },
+};
+
+// nn model for ab partition pruning, 16x16.
+static const float av1_ab_partition_nn_weights_16_layer0[FEATURE_SIZE * 64] = {
+ 0.151902f, 0.007947f, -1.788454f, 0.431869f, -2.971387f, 0.923566f,
+ 1.632542f, -1.665136f, -0.338632f, -5.075884f, 0.398267f, 0.030467f,
+ 2.263534f, -0.045532f, -1.066128f, 0.915139f, -0.560500f, -3.293125f,
+ 2.072793f, -1.011414f, 0.122716f, -0.060169f, -0.388860f, 0.031019f,
+ -0.381861f, 0.001551f, -0.328472f, 0.038296f, -0.060398f, -0.375556f,
+ 0.209226f, 0.014764f, -1.443469f, -0.345486f, 2.409269f, 1.524846f,
+ -0.640666f, 1.322139f, -2.074771f, -0.580944f, -0.203960f, -0.072893f,
+ 0.329701f, 0.115339f, -1.339542f, 0.249024f, -0.421545f, -0.409151f,
+ -0.258293f, 0.836288f, -0.073685f, -0.009624f, 0.895712f, 0.320639f,
+ 0.451002f, -1.544558f, 0.193709f, -1.389012f, 1.305451f, 0.089795f,
+ 0.050338f, -0.017433f, -0.304667f, 0.500729f, 0.504346f, 0.073757f,
+ 0.582649f, -0.993623f, 1.766766f, -3.067265f, -0.415774f, -0.006036f,
+ -1.245281f, 0.253205f, -0.591245f, -0.626238f, 0.551852f, 0.593755f,
+ 0.491023f, 1.099384f, -0.348448f, 0.054564f, -0.451422f, -0.375781f,
+ -0.248390f, -0.052548f, -0.380069f, -0.165391f, -0.297968f, -0.052142f,
+ -0.316381f, -0.045246f, -0.243905f, -0.034169f, -0.247523f, -0.180773f,
+ 0.068066f, -0.374920f, 0.057536f, -0.189748f, 0.058375f, -0.267749f,
+ -0.147286f, -0.246153f, 0.006183f, -0.202029f, -0.059128f, 0.116852f,
+ 0.134719f, -0.126900f, -0.064646f, -0.196458f, -0.182331f, 0.108029f,
+ -0.264499f, 0.155816f, -0.107255f, -0.056983f, -0.209771f, -0.099070f,
+ 0.007313f, -0.254124f, -0.231964f, -0.275972f, 0.032098f, -0.264564f,
+ -0.208743f, 0.155599f, -0.121511f, -0.156145f, -0.162315f, -0.059788f,
+ -0.257073f, -0.076654f, -0.110616f, -0.321675f, -0.051952f, 0.006301f,
+ -0.154114f, 0.017032f, -0.017364f, -0.233247f, 0.009918f, -0.179289f,
+ -0.190722f, 0.147106f, -0.063910f, -0.396872f, -0.263123f, -0.003850f,
+ -0.040718f, -0.324699f, 0.118660f, -0.170727f, -0.316788f, 0.100886f,
+ -0.202842f, 0.045371f, 0.150561f, -0.057054f, -0.308150f, 0.028346f,
+ -0.381473f, -0.195365f, 0.026221f, -0.281795f, 0.087204f, 0.047689f,
+ -0.027643f, -0.104724f, -0.089030f, -0.117661f, -0.349160f, 0.056982f,
+ -0.340273f, 0.048086f, 0.046103f, -0.121527f, 0.021697f, 0.054109f,
+ -0.002768f, -0.008461f, -2.297240f, 0.124651f, 3.621661f, -0.057120f,
+ -1.151656f, 2.296894f, -3.678720f, -0.290240f, 0.087683f, -0.186389f,
+ 0.007656f, -0.090236f, -0.245217f, 0.110389f, -0.251719f, -0.029084f,
+ -0.128203f, -0.100005f, -0.032779f, 0.007281f, -0.366596f, -0.267870f,
+ -0.215620f, 0.047687f, 0.010303f, 0.097980f, -0.191569f, -0.341162f,
+ 0.119249f, 0.026279f, -2.161546f, 0.459591f, 1.290566f, 1.791797f,
+ -0.409835f, 0.127081f, -1.156367f, 0.198286f, 0.099561f, -0.067445f,
+ -0.034352f, 0.017966f, -0.277380f, -0.057220f, -0.174198f, -0.014164f,
+ 0.146090f, -0.357530f, 0.097644f, -0.000932f, 0.446603f, -0.066793f,
+ 2.448620f, 0.937617f, -1.232922f, 0.313183f, 0.816827f, -0.275115f,
+ -0.245205f, -0.126895f, 0.156668f, -0.186977f, -0.273505f, 0.013315f,
+ 0.168629f, -0.089084f, 0.006166f, -0.116107f, -0.199316f, -0.024010f,
+ -0.242303f, 0.011612f, -0.218485f, -0.229661f, -0.123922f, 0.136699f,
+ 0.006732f, -0.148718f, -0.164225f, 0.116063f, 1.587898f, 0.690519f,
+ 0.360566f, 0.009739f, -0.678702f, -0.046003f, 0.126984f, 0.605212f,
+ 1.240663f, -0.000228f, -1.119369f, -0.415589f, -0.721003f, 0.097936f,
+ -1.410586f, -2.358833f, -2.773129f, -3.983361f, -0.087144f, -0.050029f,
+ -0.242255f, 0.137424f, -0.307490f, -0.084637f, -0.023812f, -0.196582f,
+ -0.078695f, 0.038257f, -0.012110f, -0.263521f, 0.009839f, -0.109125f,
+ -0.226036f, 0.060712f, 0.093671f, 0.153143f, 0.039116f, -0.290891f,
+ 0.227057f, -0.204633f, -0.207539f, -0.148242f, 0.046204f, -0.231268f,
+ -0.209315f, -0.307579f, -0.436556f, 0.023475f, 0.131793f, -0.038301f,
+ 1.650584f, 0.392570f, 1.446576f, 1.254380f, -0.516867f, -0.057116f,
+ 0.149320f, 0.414424f, -0.246309f, 0.003877f, -0.480238f, -1.037035f,
+ -0.830779f, -1.122244f, -0.408267f, -0.253956f, 0.382005f, 0.940609f,
+ -1.113370f, -0.018554f, 0.141064f, -0.182504f, 1.270707f, 0.414904f,
+ -0.216036f, 0.203831f, 0.450716f, -0.452909f, 0.139358f, -0.027143f,
+ 1.956892f, 1.643732f, -0.867839f, -0.620520f, -0.334607f, -0.519982f,
+ 0.205023f, 0.661159f, -0.000809f, 0.049033f, -0.348579f, -0.200338f,
+ -0.362144f, -0.346590f, -0.230096f, 0.180746f, -0.149954f, -0.253429f,
+ -0.378170f, -0.040724f, -0.041597f, 0.243659f, -0.472181f, 0.015401f,
+ -0.180376f, 0.153139f, -0.247738f, -0.010485f, -0.157158f, 0.016825f,
+ -0.238925f, -0.265798f, -0.318374f, 0.142352f, -0.210520f, 0.051928f,
+ -0.352190f, -0.179052f, -0.185498f, 0.025540f, -0.111667f, -0.235187f,
+ -0.215454f, 0.010931f, -0.238372f, -0.126659f, 0.075691f, -0.091167f,
+ -2.462379f, -0.007950f, -0.637990f, 0.285554f, -0.051275f, 0.282279f,
+ -0.744083f, -0.570646f, 0.592198f, 1.421332f, -0.256027f, -0.140315f,
+ 0.160247f, -0.063185f, -0.055895f, -0.199864f, -0.287353f, -0.074561f,
+ -0.071228f, 0.055864f, -1.084764f, -0.263409f, 0.779266f, 0.228187f,
+ 0.375013f, 0.121204f, -0.656948f, 0.533561f, 0.272671f, -0.015423f,
+ -0.124180f, -0.009127f, 2.934838f, -0.150998f, 1.163152f, 0.081997f,
+ -4.715939f, -3.676595f, -1.524886f, -0.167593f, 0.281186f, 0.024046f,
+ -1.451709f, 0.332558f, 0.990504f, 0.376290f, -1.466773f, -0.448439f,
+ -2.929108f, -4.255188f, 0.065238f, 0.019950f, 1.372393f, 0.444052f,
+ -2.538772f, 1.579767f, -0.464911f, -1.866114f, 1.053958f, 0.434467f,
+ -0.125964f, 0.034671f, 0.077116f, -0.138466f, -0.413395f, -0.223453f,
+ -0.172127f, -0.251265f, -0.048239f, -0.395519f, 0.023141f, 0.037459f,
+ -0.249593f, -0.062215f, -0.047209f, -0.435189f, -0.164155f, -0.077590f,
+ -0.241164f, -0.126128f, -0.038243f, -0.180888f, 0.198840f, -0.328036f,
+ -0.169790f, 0.036506f, 0.052572f, -0.183570f, -0.073617f, -0.244959f,
+ 0.266498f, 0.032846f, -1.902106f, 0.486078f, 2.414993f, 0.975182f,
+ -0.382875f, 1.647810f, -2.197017f, -0.890107f, 0.221287f, 0.010889f,
+ 3.817042f, 0.572728f, 0.092466f, 0.473337f, -1.634659f, -1.069455f,
+ 1.486776f, -1.023850f, 0.088184f, 0.008842f, 0.518202f, 0.270259f,
+ 1.757191f, -0.121839f, -2.912229f, -1.250866f, -2.381808f, 0.335309f,
+ -0.120079f, -0.061294f, -0.058725f, -0.315169f, -0.262443f, 0.072434f,
+ -0.267836f, -0.319354f, -0.274975f, 0.068970f, -0.406467f, 0.044074f,
+ -0.152311f, -0.333656f, -0.228355f, -0.185613f, 0.017346f, -0.177674f,
+ -0.090675f, -0.102047f, -0.011768f, -0.025280f, -0.271661f, 0.098099f,
+ -0.312272f, -0.222217f, -0.100548f, 0.106260f, -0.034655f, 0.135109f,
+ -0.021276f, 0.018177f, -0.353097f, -0.011128f, 0.061136f, -0.511662f,
+ -0.223236f, -0.308841f, 0.118789f, -0.154628f, -0.053178f, -0.055973f,
+ 0.013175f, -0.368337f, -0.090863f, -0.116920f, 0.178990f, -0.025278f,
+ -0.190553f, -0.238092f, 0.303943f, -0.024944f, 0.719373f, 0.384332f,
+ -0.378480f, -0.423316f, 0.709922f, 0.758514f, -1.559023f, -2.503173f,
+ 0.068652f, -0.234741f, -0.182932f, 0.037878f, 0.020684f, -0.174142f,
+ -0.182300f, -0.052796f, -0.219145f, 0.113028f, -1.041826f, 0.035317f,
+ 0.919904f, -0.676011f, 0.652297f, 1.456447f, -0.166904f, -0.861823f,
+ 0.895827f, 0.429821f, -0.180376f, -0.076587f, -0.273945f, -0.288990f,
+ -0.206692f, -0.080745f, -0.085444f, 0.186953f, -0.050135f, 0.044243f,
+ -0.391706f, -0.160498f, -0.292268f, 0.164060f, 0.412649f, 0.211611f,
+ -0.327294f, -0.919399f, 0.320297f, 0.385284f, -0.088848f, -0.072556f,
+ -0.384813f, -0.176267f, -0.065918f, 0.134724f, -0.231104f, -0.337707f,
+ -0.195442f, -0.263569f, 0.098090f, -0.341411f, -0.189211f, -0.439276f,
+ -0.404046f, 0.262491f, -0.311093f, -0.086454f, -0.013400f, -0.061447f,
+ -0.026945f, -0.112036f, -0.322985f, 0.078500f, -0.230205f, -0.344535f,
+ -0.021087f, 0.110220f, -0.128671f, 0.044219f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer0[64] = {
+ 2.936406f, -0.396539f, -0.110456f, -1.254954f, 0.785350f, 0.516290f,
+ -0.172341f, 0.254386f, -0.192465f, -0.106751f, -0.055518f, -0.094994f,
+ 0.000000f, -0.065018f, -0.004908f, -0.130483f, -0.119580f, -0.142072f,
+ 0.457446f, -0.125051f, -0.107712f, 0.714607f, -0.140809f, -1.788650f,
+ -0.087199f, 0.000000f, -1.290050f, 0.443930f, -0.110634f, -0.109380f,
+ -0.188213f, -1.414179f, 1.193579f, 0.388775f, -0.873193f, -0.110050f,
+ -0.072565f, -0.117050f, -0.119132f, 0.456959f, -0.132069f, 0.131974f,
+ 1.160474f, 1.746465f, 0.442628f, -0.188849f, -0.207794f, -0.108364f,
+ -0.856655f, -2.141620f, 0.335476f, -0.105508f, -0.212162f, -0.109319f,
+ -0.237213f, -0.109980f, -0.291044f, -0.137877f, 0.470191f, -0.023908f,
+ 0.123809f, -0.109797f, 0.200510f, -0.147542f,
+};
+
+static const float av1_ab_partition_nn_weights_16_layer1[64 * LABEL_SIZE] = {
+ -6.823716f, 1.406568f, -0.144009f, 2.228765f, 0.838336f, 0.738107f,
+ -0.319014f, -0.148756f, 0.240862f, -0.111089f, -0.004241f, 0.025758f,
+ -0.193820f, -0.246362f, -0.181363f, -0.201556f, 0.024268f, 0.252994f,
+ -0.289443f, 0.194932f, 0.057467f, 0.724735f, 0.014063f, 1.361352f,
+ 0.025191f, 0.024274f, 0.231462f, -7.227959f, -0.094515f, 0.039946f,
+ 0.412719f, 0.812318f, 3.038903f, -0.286289f, 0.647482f, -0.115114f,
+ 0.053590f, 0.066069f, 0.153134f, 0.996250f, -0.125700f, 0.951365f,
+ -6.243494f, -4.827697f, 0.566320f, 0.239515f, -0.099702f, 0.054546f,
+ 1.847330f, 3.680076f, -3.049829f, -0.127709f, 0.068469f, -0.017794f,
+ 0.223864f, -0.106778f, -0.020425f, -0.040226f, -0.251890f, -0.168673f,
+ -0.552073f, 0.043311f, 0.218668f, 0.033209f, -3.199210f, 0.193079f,
+ 0.321406f, 0.718307f, -0.181418f, -0.459612f, -1.981170f, 0.968496f,
+ -0.029757f, -0.130065f, 0.043782f, 0.072394f, -0.088686f, 0.025322f,
+ 0.129882f, 0.101324f, 0.335707f, 0.072714f, -2.079774f, 0.203997f,
+ 0.239321f, -0.301757f, 0.257845f, 1.288382f, -0.031275f, -0.234194f,
+ 0.310722f, 2.045469f, 0.034716f, 0.135638f, -0.251388f, 0.320071f,
+ -1.065301f, -0.322731f, -0.545028f, 0.226276f, 0.090799f, 0.019289f,
+ 0.048950f, -1.079300f, 0.231938f, 0.083683f, 4.762127f, 0.145037f,
+ -0.145549f, 0.075592f, 0.172336f, 0.108175f, 0.333751f, 1.090501f,
+ 1.056114f, 0.047073f, 0.182052f, -0.081587f, 0.089900f, 0.339286f,
+ 2.049988f, 0.073585f, 0.537355f, -0.243322f, -0.010179f, -0.052601f,
+ -0.174915f, 0.117793f, 2.222990f, -2.520837f, -0.092699f, 1.199887f,
+ 0.138720f, 0.679918f, -0.463155f, -0.659496f, -0.109913f, -0.003398f,
+ 0.114633f, -0.128377f, 0.092970f, -0.107489f, -0.191078f, 0.185182f,
+ 0.216980f, -0.019343f, 3.443133f, 0.287953f, 0.099314f, 0.985958f,
+ 0.157268f, -0.606516f, 0.049418f, -0.221809f, -0.453081f, -0.344796f,
+ -0.003735f, -0.107269f, -0.128541f, -0.259543f, -0.934806f, -0.542456f,
+ -1.011192f, 0.022795f, 0.186363f, -0.076356f, -0.050932f, -0.165098f,
+ 0.168177f, -0.101596f, -5.270886f, 2.553943f, -0.440870f, -0.017494f,
+ 0.215208f, -0.017032f, 1.495915f, -4.304677f, 0.762211f, 0.182937f,
+ 0.254406f, -0.029433f, -0.088364f, -0.110160f, -0.108257f, -0.036538f,
+ 0.737697f, -0.234989f, 0.168095f, 0.245118f, -0.077262f, 0.195718f,
+ 0.753302f, -1.637869f, 0.126227f, 0.982129f, -0.121444f, -0.295570f,
+ -1.215799f, 0.147867f, -0.068496f, 0.132726f, -0.005772f, -0.181774f,
+ 0.126513f, 0.204723f, -0.366123f, 0.103906f, -0.148053f, -0.075272f,
+ 0.243884f, -0.104828f, 0.198988f, 0.501034f, -0.112671f, 0.111421f,
+ 0.167508f, -0.117803f, -0.738624f, 2.046292f, 0.124011f, 0.057983f,
+ -0.359154f, -0.648883f, -0.259462f, -0.459041f, -2.501223f, -0.065138f,
+ 0.122417f, 0.060291f, -0.129033f, -0.843086f, 0.268241f, -0.399927f,
+ 1.585888f, 1.816393f, -0.631427f, 0.127826f, 0.088105f, 0.073488f,
+ 0.717694f, -1.497362f, 2.608528f, 0.066896f, -0.079230f, 0.223436f,
+ -0.010530f, 0.175310f, 1.120365f, 0.034391f, 0.835312f, 0.071652f,
+ -0.080615f, 0.111395f, 0.162742f, 0.079927f, -3.859582f, -0.638431f,
+ -0.167880f, -0.992659f, -0.885355f, -1.276197f, 1.334344f, 0.931940f,
+ -0.078244f, -0.149030f, -0.070974f, -0.133566f, 0.200034f, 0.102793f,
+ -0.048546f, 0.063545f, 0.023864f, -0.190863f, 1.934257f, -0.136286f,
+ -0.107916f, -0.637468f, 0.066449f, 1.089693f, -0.214047f, -0.265780f,
+ 0.899660f, -0.130333f, 0.288311f, -0.049024f, 0.090202f, 0.487969f,
+ 0.339704f, 0.858479f, 0.841253f, -0.184100f, -0.637070f, -0.125071f,
+ -0.077650f, -0.087877f, 0.202268f, -0.027300f, 2.842862f, -0.100698f,
+ -0.259080f, 0.260556f, 0.157912f, -0.070364f, 0.467190f, 1.200037f,
+ 1.419317f, -0.033588f, -0.227824f, 0.292617f, 0.228574f, 0.213839f,
+ -1.091099f, -0.022258f, -1.294681f, 0.136118f, 0.081652f, -0.185359f,
+ -0.039706f, 0.191407f, -2.053219f, -0.261934f, 0.047812f, -0.029536f,
+ -0.823869f, -1.090534f, -0.755890f, 0.441035f, -0.167945f, 0.231441f,
+ -0.135013f, -0.260762f, 0.256872f, 0.130339f, -0.243751f, 0.189760f,
+ -0.288454f, 0.145363f, 0.338490f, 0.403898f, -0.022814f, -1.263598f,
+ -0.101315f, 0.860135f, 0.136511f, 0.028942f, 0.574047f, 2.656370f,
+ 0.037587f, -0.188690f, -0.125312f, 1.100435f, -1.080402f, 0.380905f,
+ 0.004635f, 0.097144f, -0.214309f, 0.085552f, -0.285066f, -0.705134f,
+ -0.054704f, -0.319951f, 5.486626f, 0.958158f, -1.380585f, 0.223340f,
+ -0.169167f, -0.170697f, -0.216748f, 0.324232f, 2.684204f, -0.008490f,
+ -0.211052f, -0.201190f, 0.123466f, -0.000234f, 0.579907f, 0.096938f,
+ -0.042745f, 0.201855f, 0.157195f, -0.261440f, 0.029699f, -0.046599f,
+ 1.618216f, -2.596280f, -0.377420f, -0.526725f, -0.493592f, -0.579615f,
+ 0.579699f, -0.100392f, 0.150694f, 0.061794f, 0.200425f, -0.062515f,
+ -0.179122f, 0.250112f, -0.344675f, -0.118359f, -0.095670f, 0.152311f,
+ 3.662276f, -0.154921f, -0.312991f, 0.972008f, -0.308596f, -0.190426f,
+ 0.133889f, -0.238673f, -0.094726f, 1.683835f, -0.215629f, -0.198890f,
+ -0.035278f, -0.367973f, -0.822435f, 0.240848f, -0.194656f, 0.034655f,
+ -0.079424f, 0.146670f, 0.026646f, -0.034507f, 0.059467f, -0.153109f,
+ -0.431033f, 2.552991f, -1.894091f, -0.180462f, -0.306839f, -0.025648f,
+ 1.026326f, -3.096230f, 1.346935f, 0.033633f, -0.181827f, 0.094376f,
+ 0.001696f, -0.379264f, -1.069503f, -0.140972f, -0.208769f, -0.195239f,
+ 0.281795f, -0.127251f, 0.180776f, 0.067763f, 0.697124f, -1.040779f,
+ 0.111280f, 0.188351f, -0.340234f, -0.207790f, -0.720075f, -0.137409f,
+ -0.070310f, -0.032918f, -0.060787f, 0.131484f, -0.077845f, -0.258652f,
+ 0.056911f, -0.062034f, 0.007663f, -0.185100f, 1.340361f, 0.014096f,
+ -0.124602f, 0.194241f, 0.128383f, 0.360465f, 0.082979f, -0.050475f,
+ -0.519294f, 3.323262f, 0.067014f, 0.221203f, -0.085082f, -0.228606f,
+ -0.916668f, -0.022643f, -1.386737f, -0.131902f, -0.349952f, -0.032874f,
+ -0.189190f, -0.898790f, -0.102394f, -1.017387f, 2.214050f, 1.790253f,
+ -1.913561f, -0.043716f, -0.214924f, -0.194598f, -0.064723f, -1.671793f,
+ 2.251166f, -0.146007f, 0.138527f, -0.003134f, 0.103665f, 0.006928f,
+ -0.240253f, -0.227464f, 0.578437f, -0.214724f, 0.503085f, 0.158093f,
+ 0.033091f, 0.008061f, 4.815371f, 2.132264f, 0.281850f, -2.288560f,
+ -0.145012f, 1.296832f, -0.362401f, -0.403252f, 0.109873f, 0.185746f,
+ 0.244764f, 0.172367f, -0.185588f, 0.139801f, -0.178254f, 0.068629f,
+ 0.358488f, -0.153969f, -6.433524f, 0.225983f, -0.138123f, -0.095971f,
+ -0.036089f, -1.400083f, 0.265908f, 0.257787f, 0.181144f, -1.647228f,
+ -0.136289f, -0.074206f, 0.122988f, -0.088895f, -1.266717f, 0.006010f,
+ 0.536681f, 0.263061f, -0.032207f, -0.155136f, 0.086431f, 0.441950f,
+ -0.060755f, -0.280683f, -0.783475f, -2.567033f, 1.093221f, 0.117667f,
+ -0.000408f, 0.225719f, -2.199698f, 0.141447f, -1.459051f, 0.051315f,
+ 0.203228f, 0.354432f, -0.005775f, -0.028073f, -0.965817f, 0.231083f,
+ -0.666884f, 0.026283f, -0.317486f, 0.210754f, 0.123897f, 0.223827f,
+ 4.214405f, 1.457334f, -0.253945f, -1.306733f, -0.391235f, 0.451154f,
+ -1.553888f, -0.353429f, 0.069533f, 0.159278f, -0.173836f, -0.004952f,
+ -0.137033f, 0.127012f, 0.143600f, 0.051587f, -0.070549f, 0.066509f,
+ -5.776547f, 0.180021f, -0.189183f, -1.288504f, -0.233575f, -1.473873f,
+ 0.140940f, 0.144451f, -0.104534f, 2.089873f, -0.168168f, 0.110726f,
+ 0.132134f, -0.215223f, -1.682754f, 0.157757f, -0.146163f, 0.064882f,
+ 0.117313f, -0.038780f, -0.124720f, -0.501697f, 0.092047f, -0.233992f,
+ 3.324976f, 0.516601f, 1.294202f, 0.119989f, 0.061055f, 0.043420f,
+ -2.750727f, -0.382812f, -0.648496f, -0.115353f, -0.334205f, 0.024354f,
+ -0.282998f, -0.282705f, 0.073798f, 0.169851f, 0.135651f, 0.182677f,
+ -0.040220f, 0.132462f, -0.303120f, -0.230113f, 6.165739f, -0.258596f,
+ 0.024127f, -1.388283f, -0.006042f, 0.572600f, 0.348411f, -0.387376f,
+ -0.075845f, 0.122319f, -0.029616f, 0.077873f, 0.154763f, 0.049073f,
+ 0.018597f, 0.102688f, -0.204165f, 0.020734f, -1.389133f, -0.032854f,
+ -0.147561f, 0.853944f, 0.132100f, -3.259659f, 0.243745f, 0.181529f,
+ -0.738414f, 1.509994f, 0.023470f, -0.005329f, 0.066115f, -1.345081f,
+ -1.455402f, -0.172023f, -0.194625f, 0.071885f, -0.201742f, -0.262402f,
+ 0.077601f, -0.048938f, 0.257993f, -0.504029f, -2.032415f, 1.158880f,
+ 0.448647f, -0.025633f, 0.117586f, -0.072275f, -0.673744f, -3.854342f,
+ -0.983843f, 0.047766f, -0.017193f, -0.215775f, -0.158743f, -0.232042f,
+ -0.509112f, 0.148812f, 0.130122f, 0.006486f, -0.099016f, 0.022514f,
+ -0.486850f, -0.059623f, 4.012731f, 0.025454f, 0.029059f, -0.783546f,
+ -0.295260f, 0.322521f, -0.473201f, -0.172100f, -0.100087f, -0.076516f,
+ -0.258367f, -0.112897f, 0.269364f, -0.065912f, 0.169022f, -0.178783f,
+ -0.095114f, 0.122089f, -2.790099f, -0.100431f, -0.087963f, -0.009431f,
+ -0.087819f, -2.774399f, -0.100757f, 0.013005f, -0.964533f, 3.236665f,
+ -0.354903f, -0.144169f, -0.166869f, -1.396513f, -0.931271f, -0.046261f,
+ -1.799262f, -0.365269f, 0.108611f, 0.037994f, 0.024747f, -1.073639f,
+ -0.203158f, -0.935006f, 1.880891f, 1.578385f, 0.726272f, -0.024546f,
+ -0.011626f, -0.151363f, -1.121716f, -1.787484f, 0.232806f, 0.075451f,
+ 0.182899f, 0.092215f, -0.207347f, -0.030111f, 0.054316f, 0.192481f,
+ 0.594639f, -0.247694f, 0.547471f, -0.032094f, -0.065000f, 0.007198f,
+ 1.605377f, -0.155945f, -0.066200f, -2.343716f, -1.016283f, -0.079321f,
+ 0.919365f, 0.599980f, 0.125545f, 0.265813f, 0.246884f, 0.095385f,
+ -0.260374f, -0.202916f, -0.042770f, 0.234967f, -0.233139f, -0.326994f,
+ -1.375256f, 0.121766f, 0.077433f, -1.103569f, 0.019497f, -1.029185f,
+ 0.253905f, 0.206569f, 0.187334f, -0.237089f, -0.294351f, 0.164137f,
+ 0.149696f, -0.749787f, -0.413433f, 0.976587f, 1.027976f, -0.285264f,
+ 0.209273f, -0.124762f, 0.050884f, 0.250764f, -0.082031f, -0.646520f,
+ 4.116680f, 0.437336f, 0.671684f, 0.129509f, -0.078462f, 0.014072f,
+ -0.678232f, 0.094831f, 1.125624f, 0.207070f, -0.154750f, -0.025780f,
+ -0.103030f, 0.118019f, -0.908186f, -0.263546f, -1.555324f, -0.236887f,
+ -0.217854f, -0.051790f, 0.017915f, 0.171001f, 1.355562f, 0.094603f,
+ -0.233929f, -1.282169f, -0.773183f, -0.161682f, -0.834565f, -0.286776f,
+ -0.298901f, 0.038162f, 0.251899f, 0.039612f, -0.022935f, -0.232308f,
+ -0.043855f, -0.192892f, -0.279009f, -0.182234f, -1.272808f, -0.070344f,
+ -0.092432f, -1.915946f, -0.134373f, -1.405496f, -0.067071f, -0.131922f,
+ 0.185269f, 1.465082f, 0.040240f, 0.112665f, 0.144329f, -0.286112f,
+ -0.617649f, 0.916177f, 0.221044f, -0.079867f, 0.170251f, -0.093638f,
+ -0.212620f, -0.305945f, -0.234356f, -0.482501f, 3.928472f, 1.241179f,
+ 0.355922f, -0.170848f, -0.189168f, 0.080225f, -1.357793f, 0.190890f,
+ 0.976800f, -0.068070f, -0.016295f, -0.088623f, -0.129560f, -0.212267f,
+ -0.071537f, -0.219501f, -0.655198f, -0.225188f, -0.116024f, 0.224174f,
+ -0.049715f, -0.178005f, 3.029985f, -1.141546f, 0.080066f, -1.932316f,
+ -0.641137f, -0.189564f, 0.935080f, 0.136119f, 0.015558f, -0.179331f,
+ 0.204571f, 0.020350f, 0.009362f, 0.108478f, 0.037076f, -0.049009f,
+ 0.081090f, -0.180202f, 1.455561f, -0.081559f, 0.059361f, 0.484971f,
+ 0.160923f, -2.170744f, -0.013204f, 0.126561f, -0.407122f, 1.223661f,
+ 0.044262f, 0.118044f, 0.058274f, -1.747100f, -0.171318f, 0.971374f,
+ 0.306995f, -0.103268f, -0.319443f, -0.333176f, -0.038608f, 0.119674f,
+ -0.106479f, -0.907933f, 1.121231f, 1.673840f, -0.421458f, -0.021146f,
+ -0.254838f, 0.097632f, 0.235109f, -2.901782f, 0.289518f, -0.355459f,
+ -0.068264f, -0.179121f, 0.068560f, -0.047570f, -0.522523f, -0.228963f,
+ -1.037158f, -0.163723f, 0.280563f, -0.000868f, -0.197220f, -0.239329f,
+ 1.985274f, -0.256181f, -0.064341f, -0.822417f, -0.465140f, -0.010942f,
+ -0.792024f, -0.114290f, 0.060969f, 0.104106f, -0.252123f, -0.150400f,
+ -0.133277f, 0.267147f, 0.274413f, 0.223744f, -0.180223f, -0.345415f,
+ -0.104883f, 0.119210f, -0.095041f, -0.301635f, 0.013175f, -2.128121f,
+ -0.147208f, -0.151509f, -0.692013f, 3.418555f, -0.016541f, 0.171511f,
+ 0.107159f, -1.516672f, 0.127408f, 0.687035f, -0.906486f, -0.145463f,
+ -0.169382f, -0.143906f, 0.125091f, -0.960645f, -0.180869f, -0.716908f,
+ 2.840951f, 1.904919f, -0.416268f, -0.425181f, -0.194697f, -0.075932f,
+ -0.950604f, -1.599800f, 0.943671f, -0.022744f, -0.270492f, 0.080843f,
+ -0.372916f, 0.047838f, -0.100300f, -0.026600f, 0.011733f, -0.226051f,
+ 0.172790f, -0.172982f, 0.041258f, -0.299379f,
+};
+
+static const float av1_ab_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.053805f, -1.248639f, 0.520965f, -0.904962f, -0.126425f, -0.118798f,
+ 0.748430f, 0.203096f, 0.059317f, 0.418219f, 0.841294f, 0.402693f,
+ -0.658522f, 0.723479f, 0.544264f, 1.035225f,
+};
+
+static const NN_CONFIG av1_ab_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 64, // num_hidden_nodes
+ },
+ {
+ av1_ab_partition_nn_weights_16_layer0,
+ av1_ab_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_ab_partition_nn_bias_16_layer0,
+ av1_ab_partition_nn_bias_16_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 18
+#define LABEL_SIZE 4
+
+static const float av1_4_partition_nn_weights_16_layer0[FEATURE_SIZE * 24] = {
+ -2.032866f, 0.056691f, 0.495960f, 0.778785f, 0.548153f, -0.806942f,
+ 0.481155f, 0.282298f, 0.584980f, 0.504688f, 0.209648f, 0.234616f,
+ 0.213484f, 0.221969f, 0.205862f, 0.235054f, 0.317863f, 0.257139f,
+ 0.529478f, 0.098122f, -0.657532f, 0.036296f, 0.327728f, 1.323180f,
+ -0.813082f, 0.160216f, -0.702030f, 0.722733f, -0.270576f, -0.347416f,
+ -0.264700f, -0.254248f, 0.159820f, 0.087995f, -0.184163f, 0.117357f,
+ 0.074194f, -0.667369f, 0.498246f, 0.420506f, 0.072409f, -0.121581f,
+ 0.315788f, 0.000525f, 0.414986f, 0.678166f, -0.011230f, 0.188131f,
+ -0.227749f, 0.009564f, 0.108672f, 0.106923f, -0.080695f, -0.279382f,
+ -0.061339f, -0.297835f, -0.134707f, 0.145865f, -0.009655f, -0.000842f,
+ -0.047436f, -0.159149f, -0.320353f, -0.089646f, -0.344765f, 0.313416f,
+ -0.143413f, 0.279668f, 0.000885f, -0.022380f, -0.140194f, -0.310473f,
+ 0.252699f, 0.066204f, 0.477568f, 0.994609f, -0.276000f, 1.213182f,
+ 0.277028f, -0.411570f, -0.211559f, 0.377815f, 0.121488f, -0.100559f,
+ -0.317082f, -0.251039f, -0.335181f, -0.154114f, -0.052726f, -0.332558f,
+ -0.143196f, -0.334035f, 0.162305f, 0.142279f, -0.001210f, -0.135252f,
+ -0.033562f, 0.204307f, -0.039757f, -0.394174f, 0.126617f, -0.128648f,
+ -0.410979f, 0.107641f, -0.117573f, -0.326512f, 0.235166f, 0.084959f,
+ 0.290063f, -0.005838f, 0.459894f, 1.023709f, -0.196145f, 1.100137f,
+ -0.319815f, -0.308526f, -0.443389f, -0.272769f, -0.035259f, -0.026932f,
+ -0.029743f, 0.125113f, -0.131024f, -0.321458f, -0.143996f, 0.008714f,
+ -0.101234f, 0.079706f, -1.128615f, -0.467381f, 0.220563f, -0.409900f,
+ -0.435353f, 0.759499f, -0.465799f, -0.394309f, 0.176282f, -0.086275f,
+ -0.161225f, -0.354814f, 0.562871f, 0.418253f, 0.414361f, 0.445480f,
+ -0.995903f, -0.086632f, -0.230645f, 0.354656f, -0.317576f, 0.079926f,
+ 0.424369f, 0.997232f, -0.304388f, 1.071667f, -0.023540f, 0.029677f,
+ 0.108564f, 0.183581f, -0.201395f, -0.054854f, -0.193039f, -0.049899f,
+ -0.271949f, -0.358483f, 0.304930f, 0.023823f, -0.009319f, -0.214247f,
+ 0.100712f, -0.050162f, 0.327103f, -0.212999f, -0.030496f, 0.316380f,
+ -0.439589f, -0.249959f, 0.229777f, -0.353664f, -0.384559f, 0.114236f,
+ 0.023119f, 0.007927f, 0.618368f, 0.957759f, -0.019780f, -1.002389f,
+ 0.564277f, -0.839531f, 1.040445f, 0.054340f, 0.031908f, -0.032893f,
+ -0.019170f, -0.042011f, 0.568928f, 0.362567f, -0.559999f, -0.605344f,
+ -0.586146f, -0.290778f, 0.195943f, -0.109580f, -0.088898f, -0.113054f,
+ 0.293282f, 0.429019f, 0.306136f, 0.863025f, 0.021234f, 0.125770f,
+ -0.097108f, -0.072659f, -0.137053f, -0.191631f, 0.106281f, 0.064151f,
+ 0.029883f, 0.076287f, 0.757543f, 0.276713f, -2.529775f, -0.351727f,
+ -1.832316f, 0.544780f, -0.944529f, 0.509705f, -0.010236f, -0.016181f,
+ 0.021520f, 0.086417f, 0.041312f, 0.296853f, -0.372378f, 0.354446f,
+ -1.366762f, 0.048875f, 0.464918f, -0.007450f, 0.750013f, -0.360261f,
+ 0.518532f, 0.753776f, 0.641448f, 0.710746f, 0.250866f, 0.257063f,
+ 0.283421f, 0.253585f, 0.170303f, 0.210426f, 0.208842f, 0.158000f,
+ -0.033144f, 0.130748f, 0.907147f, 0.409248f, -0.854301f, -0.981307f,
+ 0.294427f, -0.507137f, 1.079967f, 0.203203f, 0.383890f, 0.368278f,
+ 0.305122f, 0.449288f, -0.044507f, -0.547263f, -0.298245f, -0.497834f,
+ 0.007016f, -0.101982f, -0.073488f, -0.096111f, -0.479418f, -0.045497f,
+ 0.033502f, -0.018578f, -0.231531f, 0.177949f, 0.099564f, -0.010233f,
+ -0.333055f, -0.078586f, -0.417867f, 0.171271f, 0.013662f, -0.143599f,
+ -0.117296f, 0.135382f, 0.048321f, 0.000924f, -0.055024f, -0.405595f,
+ -0.068260f, -0.271011f, -0.436425f, 0.206751f, -0.899890f, 0.605510f,
+ 0.535649f, -0.238919f, -0.037619f, -0.213734f, -0.391360f, -0.132344f,
+ 0.004660f, 0.176644f, -1.008475f, -0.038895f, 0.155429f, -0.095229f,
+ -0.680124f, -0.258063f, -0.261901f, 0.110380f, -0.337649f, -0.505870f,
+ -1.428536f, 0.610629f, 0.254905f, 0.045098f, 0.044109f, 0.172329f,
+ 0.060001f, -0.234009f, -0.184855f, -0.153028f, -0.140897f, -0.152006f,
+ -0.312134f, 0.081261f, 0.160166f, 0.112690f, 0.266081f, 0.030175f,
+ -0.242746f, 0.000754f, -0.341811f, -0.149774f, -0.017484f, -0.301342f,
+ -0.121466f, 0.067300f, 0.342176f, 0.474538f, 0.085441f, -0.263935f,
+ 0.479235f, -0.003713f, -0.784840f, 0.119480f, 0.456632f, -0.640082f,
+ -0.080575f, -0.744403f, 0.259970f, 0.034667f, -0.274641f, -0.257594f,
+ -1.121124f, -0.003745f, -0.420693f, 0.300441f, -0.100976f, -1.049016f,
+ 0.201960f, 0.113054f, 0.187010f, 1.237427f, 0.054803f, -0.028673f,
+ 0.003596f, -0.034724f, 0.117246f, 0.190977f, 0.278915f, 0.224307f,
+ 0.017852f, -0.336233f, -0.372311f, -0.182284f, -0.143510f, 0.331466f,
+ 0.045698f, -0.301095f, 0.184447f, 0.348240f, -0.017021f, -0.145064f,
+ -0.000221f, -0.382256f, -0.302683f, -0.083927f, -0.008070f, 0.217907f,
+ 0.647597f, -0.050490f, -0.572736f, -0.985748f, -0.289943f, 0.041391f,
+ -0.795464f, -0.186680f, -0.354062f, -0.617400f, -0.282783f, -0.170450f,
+ -0.197197f, -0.146496f, -0.173692f, -0.106277f, -0.071004f, -0.124405f,
+ -0.971412f, 0.038542f, 0.705204f, 0.887113f, 0.150430f, -0.243676f,
+ 0.638410f, 0.320953f, 0.776676f, 0.527584f, 0.070389f, 0.051554f,
+ 0.177519f, 0.140451f, 0.128892f, 0.087771f, 0.197660f, 0.194764f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer0[24] = {
+ 0.614063f, -0.384872f, 0.084884f, -0.023980f, -0.378765f, -0.082312f,
+ -0.458271f, 0.189578f, -0.046169f, -0.073308f, -0.372322f, 0.162793f,
+ 0.148803f, 0.829214f, -0.221162f, -0.111157f, -0.017484f, -0.280596f,
+ -0.031905f, -0.143459f, 0.078823f, -0.021940f, 0.026834f, 0.257472f,
+};
+
+static const float av1_4_partition_nn_weights_16_layer1[24 * LABEL_SIZE] = {
+ -0.985391f, 0.587616f, 0.740683f, 0.192066f, 0.447080f, -0.016585f,
+ 0.680449f, 0.028983f, 0.643111f, 0.234338f, 0.107148f, 0.328456f,
+ -0.216394f, 1.106838f, -0.179062f, -0.129108f, -0.121655f, -0.151340f,
+ -0.306017f, -0.350989f, 0.859284f, -0.372831f, -0.954419f, 0.250495f,
+ 1.046732f, 0.287923f, -0.421088f, 0.326613f, -0.314396f, -0.084757f,
+ -0.474228f, 0.687999f, 0.052334f, 0.441708f, -0.630698f, -0.350348f,
+ -0.602067f, -0.434161f, -0.489824f, -0.313193f, 0.315568f, 0.603119f,
+ 0.120245f, 0.182920f, -1.117797f, -0.239594f, -0.296296f, -0.718093f,
+ 0.489497f, -0.527019f, 0.102453f, 0.426731f, 0.034606f, 0.311461f,
+ -0.012723f, -0.229877f, -0.284290f, 0.383227f, 0.065696f, -0.222400f,
+ 1.279248f, -0.862190f, 0.629766f, -0.250011f, -0.325060f, -0.360115f,
+ -0.159540f, -0.291856f, -0.038348f, 0.224639f, 0.600934f, 0.030205f,
+ 1.337615f, -0.286409f, -0.473710f, -0.418995f, -1.035249f, 0.004359f,
+ -0.481860f, 0.563625f, -0.154709f, -0.101198f, -0.758796f, -0.507616f,
+ -0.095253f, -0.711135f, 0.207759f, 0.076313f, -0.056087f, -0.162719f,
+ -0.232918f, -0.128402f, -0.444620f, -0.447344f, 1.126012f, -1.504446f,
+};
+
+static const float av1_4_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+ -0.462133f,
+ 0.465060f,
+ 0.062211f,
+ 0.401786f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_16_layer0,
+ av1_4_partition_nn_weights_16_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_16_layer0,
+ av1_4_partition_nn_bias_16_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_32_layer0[FEATURE_SIZE * 32] = {
+ -0.219494f, -0.428273f, 0.471006f, 0.448210f, -0.152935f, 0.440435f,
+ 0.922857f, -0.074436f, 1.002195f, 0.414176f, -0.327202f, -0.380066f,
+ -0.212346f, 0.061868f, -0.056620f, 0.594134f, 0.617995f, 0.308358f,
+ 0.232484f, 0.129849f, 1.483593f, -0.071460f, 1.984515f, 1.116422f,
+ -1.141762f, -0.306220f, 0.089075f, -0.271845f, 0.187524f, 0.050396f,
+ -0.061025f, 0.030809f, 0.172799f, -0.458151f, -0.318357f, 0.122052f,
+ -0.414329f, 0.089366f, 0.118898f, -0.376213f, -0.206151f, -0.519946f,
+ -0.463252f, -0.206694f, -0.254383f, -0.379487f, 0.093059f, -0.245280f,
+ -0.205044f, -0.280060f, -0.171229f, -0.045389f, -0.179481f, -0.306245f,
+ -0.500856f, 0.003388f, -0.527397f, -0.449330f, -0.174272f, 0.123769f,
+ 0.023005f, 0.157273f, 0.073400f, 0.019099f, -0.113848f, -0.098601f,
+ -0.290946f, -0.046770f, -0.314592f, -0.179914f, -0.391411f, -0.235631f,
+ -1.282604f, 0.048505f, -0.746382f, 0.093740f, -0.706583f, -0.085729f,
+ 0.947382f, -0.002961f, 1.175362f, 1.007309f, 0.141638f, -0.037608f,
+ -0.118807f, -0.021474f, -0.146763f, 0.069363f, -0.074372f, -0.215713f,
+ -0.004134f, -0.114110f, -0.330438f, -0.031136f, 0.111821f, -0.534598f,
+ -0.357759f, -0.455950f, 0.139469f, 0.036582f, -0.384743f, -0.168828f,
+ -0.239250f, 0.003520f, -0.049003f, 0.075702f, -0.025809f, -0.225972f,
+ -0.228905f, -0.412489f, 0.060570f, -0.328819f, -0.206446f, -0.080231f,
+ -0.372008f, -0.218118f, -0.011954f, 0.024155f, 0.156014f, 0.020679f,
+ 0.194398f, -0.283491f, -0.024463f, -0.275099f, 0.028031f, 0.026340f,
+ -0.254668f, 0.103637f, 2.178693f, 0.552284f, 0.109366f, -0.474806f,
+ -0.379286f, -0.026315f, 2.487924f, -0.089466f, 0.206428f, 0.114578f,
+ 0.152248f, 0.184050f, -0.631948f, -0.014793f, -0.283782f, -0.830353f,
+ 0.009343f, -0.021029f, -0.060534f, -0.025164f, 1.841311f, 1.842748f,
+ -1.979708f, 0.450985f, -1.606357f, -0.785454f, -0.212679f, -0.344342f,
+ 0.198991f, -0.258070f, 0.055974f, 0.224069f, 0.453051f, 0.408053f,
+ 0.027873f, -0.180538f, 0.056609f, 0.207654f, 0.104086f, -0.194426f,
+ -0.359789f, -0.381143f, -0.331212f, -0.203973f, -0.324313f, -0.160825f,
+ -0.160439f, -0.044856f, -0.346647f, 0.044859f, 0.231398f, -0.023643f,
+ -0.140316f, -0.260177f, 0.206965f, -0.425386f, -0.420268f, -0.409748f,
+ 0.006971f, 0.066186f, -0.034950f, -0.345518f, 0.018633f, -0.122489f,
+ -0.038506f, -0.330942f, 0.161236f, -0.314119f, -0.050202f, -0.179597f,
+ 0.731897f, -0.184481f, 0.153598f, -0.539501f, -0.301493f, -0.184967f,
+ -0.883754f, -0.586959f, -0.136292f, -1.772065f, -0.196276f, -0.053272f,
+ -0.101083f, -0.064142f, 0.161190f, 0.430826f, 0.355647f, 0.138266f,
+ 0.051114f, -0.028893f, -0.477673f, -0.238663f, -0.354117f, -0.056747f,
+ -0.334273f, -0.497688f, -0.486004f, -0.092033f, -0.241304f, -0.373250f,
+ 0.120193f, 0.011360f, -0.010475f, -0.092739f, -0.159650f, -0.033129f,
+ -0.259893f, -0.073217f, 0.200128f, 0.103407f, -0.229233f, 0.128831f,
+ -0.063450f, -0.241732f, -0.408428f, -0.342239f, -0.264326f, -0.105403f,
+ -0.442879f, -0.310456f, -0.112881f, 0.263696f, -0.205014f, -0.497936f,
+ -0.261734f, -0.382312f, -0.426807f, -0.021995f, -0.152794f, -0.301494f,
+ 0.117232f, -0.577809f, 0.154596f, -0.409522f, -0.413113f, -0.359199f,
+ 0.307294f, -0.008746f, -0.310522f, 0.347620f, -0.384845f, -0.451398f,
+ -0.226199f, 0.054154f, -0.167608f, 0.046836f, -0.013285f, -0.408119f,
+ -0.177973f, -0.248293f, -0.465830f, 0.035827f, -0.222208f, -0.221717f,
+ 0.066392f, -0.349769f, -0.428029f, -0.516692f, 0.022398f, -0.251682f,
+ 0.134746f, 0.011167f, -2.078787f, 0.173592f, -1.948348f, 0.330060f,
+ 1.993785f, -0.052859f, -0.004795f, -3.703177f, 0.013450f, -0.011687f,
+ 0.073079f, 0.034803f, 0.025515f, 0.005994f, 0.101731f, 0.074303f,
+ -0.109962f, -0.270825f, -0.068273f, -0.163268f, -0.252826f, 0.137190f,
+ 0.007667f, -0.358453f, 0.027412f, 0.033492f, 0.021197f, -0.049991f,
+ 0.104468f, -0.012157f, -0.056252f, -0.380756f, -0.338483f, 0.233235f,
+ -0.048631f, -0.441209f, -0.158482f, -0.148108f, -0.263453f, 0.138847f,
+ -0.304073f, -0.336312f, -0.017941f, -0.135563f, 0.075137f, -0.246475f,
+ -0.229144f, -0.087744f, -0.346909f, 0.172611f, 0.004377f, -0.009386f,
+ -0.023104f, 0.008000f, -0.029390f, -0.317842f, 0.549674f, -0.195337f,
+ -0.863979f, 0.160889f, -0.269014f, -0.442104f, -1.799191f, 1.396533f,
+ -0.112837f, 0.881303f, 0.000764f, -0.035415f, -0.141877f, 0.184831f,
+ -0.363566f, -0.178569f, 0.254134f, -0.326893f, 0.127325f, 0.310620f,
+ -0.384621f, 0.146058f, -0.287682f, -0.373447f, 0.026930f, 0.251650f,
+ 0.053817f, 0.227509f, 0.121396f, 0.396514f, -0.278381f, -0.038969f,
+ -1.538756f, -0.002856f, -0.892900f, 0.363426f, -1.257922f, 0.743795f,
+ 0.941177f, 0.219345f, 0.684189f, 1.396858f, 0.026299f, -0.093433f,
+ -0.066182f, 0.057868f, -0.089278f, -0.159680f, -0.262035f, -0.236656f,
+ 0.005349f, -0.031314f, 0.027917f, -0.182113f, -0.212086f, -0.160774f,
+ 0.051468f, 0.036787f, 0.183881f, -0.288205f, -0.349691f, 0.162511f,
+ 0.117878f, -0.294534f, -0.365037f, -0.246313f, 0.073977f, -0.072378f,
+ -0.173579f, -0.584560f, 0.547194f, 0.259853f, -0.405287f, -0.421146f,
+ 0.165788f, -0.146964f, 0.257415f, 0.772394f, -0.475302f, -0.310906f,
+ 0.058723f, 0.276833f, 0.586842f, 0.248998f, -0.061135f, 0.255779f,
+ 0.152158f, -0.024781f, 2.821834f, 1.365141f, 0.914744f, 0.165752f,
+ -1.048304f, -0.333891f, 1.804087f, -0.437028f, -0.120211f, -0.020443f,
+ 0.040077f, 0.258600f, -0.598893f, -0.494579f, -0.281054f, -0.517041f,
+ 0.005258f, 0.053986f, 0.322755f, 0.429495f, -1.992364f, -0.717192f,
+ -1.774802f, 2.047362f, -0.016194f, 0.312606f, 0.019331f, 0.060950f,
+ 0.116428f, 0.168458f, -0.307001f, -0.420734f, 0.475843f, 0.425346f,
+ -0.107119f, 0.049892f, -1.168619f, 0.010878f, 0.354872f, 0.902717f,
+ -0.391407f, 0.332772f, -1.335037f, -0.447100f, 0.481719f, -0.101069f,
+ -1.806565f, 0.925280f, 0.346999f, 0.093809f, 0.006275f, 0.270814f,
+ -0.691123f, 0.230748f, 0.137033f, 0.068228f, 1.555975f, -0.271637f,
+ -0.370403f, 0.236131f, 0.367464f, -0.136562f, 0.428838f, 0.181750f,
+ 0.338762f, 0.292449f, -0.748204f, -0.922731f, -0.959445f, -0.806418f,
+ -0.140501f, 0.070525f, 1.248748f, 0.637990f, -1.307246f, -0.514055f,
+ 0.393858f, -1.858727f, 0.713591f, -0.141044f, 0.080723f, 0.120220f,
+ -0.031175f, 0.224488f, 0.753818f, -0.833351f, -1.099132f, 0.651100f,
+ -0.135061f, -0.043820f, 0.026983f, -0.059259f, 0.001345f, -0.281775f,
+ 0.006958f, 0.046103f, -0.246539f, 0.057630f, -0.360778f, -0.160681f,
+ -0.414870f, -0.301979f, 0.000683f, 0.132957f, -0.477609f, 0.106110f,
+ -0.637769f, -0.078374f, -0.229494f, 0.583108f, -0.822973f, -0.107540f,
+ 1.063426f, -0.268346f, 1.105787f, 2.587550f, -0.020314f, -0.002161f,
+ -0.063836f, -0.099990f, -0.103975f, -0.114078f, -0.094199f, -0.065181f,
+ -0.019870f, -0.018920f, -0.219732f, 0.035608f, -1.789450f, 0.483032f,
+ -0.464729f, 1.563277f, -1.054195f, 0.359991f, 0.065204f, 0.135623f,
+ 0.158380f, -0.103815f, -1.398726f, -1.436666f, -0.356311f, 0.507752f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer0[32] = {
+ 0.421645f, -0.620548f, -0.187819f, -0.189414f, -0.204975f, -0.189600f,
+ -0.174917f, -0.651928f, -0.799655f, -0.086105f, -0.163449f, -0.089212f,
+ -0.214495f, -0.108500f, -0.065777f, -0.127704f, 1.544948f, -0.032831f,
+ -0.165621f, 0.145844f, -0.032104f, -0.453246f, -0.113444f, 0.321589f,
+ -0.862375f, -0.108826f, -0.486259f, 0.685325f, 0.072569f, -0.187961f,
+ 0.109579f, -0.082685f,
+};
+
+static const float av1_4_partition_nn_weights_32_layer1[32 * LABEL_SIZE] = {
+ 0.255012f, 0.658860f, 0.216907f, 0.165947f, 0.241182f, 0.340854f,
+ 0.409445f, 0.165220f, 0.553373f, -0.242385f, -0.209571f, 0.255515f,
+ 0.222500f, 0.037032f, 0.238590f, 0.061624f, -2.038693f, 0.264167f,
+ -0.230144f, 0.129952f, -0.027979f, 0.847761f, 0.438922f, 0.462323f,
+ 0.555345f, 0.030689f, 0.336357f, -0.357326f, -0.113137f, 0.272631f,
+ 0.421022f, 0.367776f, -0.197094f, 0.157117f, -0.015008f, -0.056123f,
+ -0.283913f, 0.186417f, 0.178561f, -0.763041f, 0.602038f, 0.341092f,
+ 0.320453f, -0.312776f, -0.371240f, -0.356279f, 0.220117f, -0.131871f,
+ 1.517429f, 0.162223f, -0.255069f, 0.451861f, 0.045071f, -0.223257f,
+ 0.003257f, 0.015734f, -0.630447f, -0.672588f, 0.670164f, 0.571031f,
+ -0.657948f, 0.034506f, -0.249076f, 0.790293f, 0.066491f, -0.131245f,
+ 0.355173f, 0.564622f, 0.374048f, 0.033974f, 0.253970f, 0.495498f,
+ -0.556321f, -0.104651f, 0.276947f, 0.057148f, -0.039126f, -0.170050f,
+ -0.141542f, 0.158541f, 0.582763f, -0.100992f, 0.096705f, -0.209029f,
+ 0.008449f, 0.255865f, 0.103565f, 0.317719f, 0.479499f, 0.599126f,
+ -0.065613f, -0.268614f, 0.508736f, 0.180813f, -0.815868f, 0.051238f,
+ 0.001223f, -0.305423f, -0.270079f, 0.036180f, 0.304342f, 0.202634f,
+ 0.218348f, -0.304304f, -0.438297f, 0.241123f, 0.200230f, 0.151804f,
+ 0.051944f, 0.160422f, -0.262981f, -0.417412f, 1.845729f, -0.086183f,
+ 0.403517f, 0.059667f, 0.564543f, -0.081752f, 0.114907f, -0.284489f,
+ -0.673943f, 0.056965f, 0.362221f, 0.403224f, -0.000233f, -0.209552f,
+ -0.800926f, -0.134132f,
+};
+
+static const float av1_4_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+ -0.019518f,
+ 0.198546f,
+ 0.339015f,
+ -0.261961f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_32_layer0,
+ av1_4_partition_nn_weights_32_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_32_layer0,
+ av1_4_partition_nn_bias_32_layer1,
+ },
+};
+
+static const float av1_4_partition_nn_weights_64_layer0[FEATURE_SIZE * 24] = {
+ -0.152649f, 0.074509f, 1.000136f, 0.601661f, -1.416694f, -1.932396f,
+ -1.163850f, 0.640931f, -0.888625f, -0.345711f, 0.161799f, 0.103165f,
+ 0.147513f, 0.089956f, 0.204329f, 0.196922f, 0.014927f, 0.283714f,
+ -0.110422f, 0.062005f, -0.531870f, -0.075287f, -0.448349f, -0.218881f,
+ -0.005592f, -0.130490f, -0.015779f, 0.093521f, -0.158487f, 0.072241f,
+ 0.066879f, -0.418566f, -0.206281f, 0.025634f, 0.048334f, -0.534750f,
+ 0.302081f, 0.028707f, -1.543248f, 0.103799f, -1.214052f, 0.395870f,
+ 0.394754f, -0.272170f, -0.702953f, -4.057464f, -0.033497f, -0.042142f,
+ 0.014742f, 0.065263f, 0.000879f, -0.019768f, 0.101275f, 0.163059f,
+ -0.371392f, -0.283484f, 0.241915f, 0.012684f, -0.210101f, -0.166534f,
+ -0.024894f, 0.274696f, 0.098993f, 0.104086f, 0.055044f, -0.289378f,
+ 0.146571f, -0.147441f, 0.004056f, 0.112244f, -0.416162f, -0.033176f,
+ -0.214836f, -0.213787f, 0.023197f, -0.339043f, 0.301109f, -0.408551f,
+ 0.284922f, -0.344418f, -0.039255f, 0.158748f, -0.344169f, 0.078286f,
+ -0.043957f, -0.302162f, -0.310826f, 0.063425f, 0.198166f, -0.285324f,
+ -0.108252f, 0.038992f, -1.053110f, -1.663290f, -0.417185f, 1.504443f,
+ 0.643206f, -0.850240f, 0.889641f, -0.733214f, 0.147302f, 0.060291f,
+ -0.052954f, 0.167453f, 0.111870f, 0.085471f, 0.035107f, 0.064361f,
+ 0.176053f, 0.184373f, 0.676576f, 0.066164f, 1.455569f, 0.925111f,
+ -0.640845f, 0.803795f, -0.653782f, -0.201038f, 0.060033f, 0.016964f,
+ -0.047590f, 0.045908f, 0.354162f, 0.014812f, 0.156978f, 0.058792f,
+ -0.238119f, 0.002450f, -0.094388f, -0.155229f, 0.194858f, -0.355429f,
+ -0.187098f, -0.119264f, -0.088694f, -0.102845f, 0.184905f, -0.425339f,
+ -0.157808f, -0.104599f, -0.393248f, -0.379842f, 0.027741f, -0.185816f,
+ -0.317294f, 0.002453f, -0.498241f, -0.204302f, -0.079093f, 0.020646f,
+ -0.412850f, -0.426039f, -0.177050f, -0.419304f, -0.064478f, -0.191802f,
+ -0.146812f, 0.171111f, 0.090261f, -0.367033f, -0.299051f, -0.322132f,
+ 0.428192f, -0.252613f, 0.488498f, -0.559682f, 0.486720f, -0.511084f,
+ 0.992506f, 0.346765f, -0.118697f, -0.065127f, -0.376612f, -0.345137f,
+ -0.426517f, -0.516836f, 0.307083f, 0.609362f, 0.369555f, 0.093775f,
+ -0.375664f, -0.221595f, -0.025465f, 0.134374f, -0.387031f, 0.096236f,
+ 0.337465f, -0.124029f, -0.157340f, -0.368790f, -0.104490f, -0.279507f,
+ -0.247705f, 0.146559f, -0.236206f, -0.036073f, 0.064206f, -0.330919f,
+ 0.516591f, -0.013492f, 1.269568f, 1.182530f, -0.455390f, -1.328091f,
+ -0.200950f, -0.380513f, -0.195532f, -0.341479f, 0.016064f, 0.021176f,
+ 0.169119f, 0.103707f, -0.174504f, -0.462719f, -0.079445f, -0.247128f,
+ 0.459111f, 0.036129f, 0.769570f, -0.080405f, 1.667107f, 0.355567f,
+ -2.433896f, 0.627572f, -0.600090f, -0.651872f, -0.059769f, -0.041945f,
+ -0.009933f, 0.014864f, -0.049378f, -0.041561f, 0.075180f, 0.138307f,
+ 0.122366f, -0.160756f, 0.215327f, 0.013572f, 0.198194f, -0.762650f,
+ 0.054466f, 1.110332f, 1.692853f, 0.658654f, -0.409549f, 0.506085f,
+ 0.330962f, -0.223008f, 0.007448f, -0.289062f, -0.476231f, -0.228359f,
+ 0.013977f, -0.000609f, -0.673604f, 0.275996f, 0.405291f, 1.693561f,
+ -1.079768f, 1.122516f, -0.203227f, 0.099265f, -0.165207f, -0.323899f,
+ -0.269973f, -0.080122f, 0.127700f, 0.190201f, 0.219527f, 0.306194f,
+ 0.026049f, -0.003779f, 1.107357f, 1.720315f, 1.017908f, 0.078664f,
+ -1.599813f, -0.482636f, -0.117450f, 0.122249f, 0.030220f, 0.039794f,
+ 0.176350f, 0.129715f, -0.305755f, -0.274044f, -0.299640f, -0.187335f,
+ -0.073616f, -0.564507f, -0.127758f, 0.044855f, -0.191090f, 0.039095f,
+ 0.115378f, 0.969352f, -0.088360f, 0.301443f, 0.065726f, -0.019740f,
+ -0.102350f, -0.084913f, -0.194615f, 0.118582f, 0.920789f, -0.171615f,
+ -1.436553f, -0.026419f, -0.730864f, 0.615697f, -0.795079f, 0.119701f,
+ 0.601782f, 0.792902f, 0.184920f, 1.635090f, -0.085860f, -0.033187f,
+ -0.166883f, 0.008487f, -0.128300f, -0.089923f, -0.108781f, -0.133719f,
+ -0.011988f, -0.239816f, -0.092563f, -0.238471f, -0.339722f, 0.177432f,
+ -0.063101f, -0.121002f, 0.058072f, -0.031166f, 0.086413f, -0.016203f,
+ -0.305075f, -0.005420f, -0.168796f, 0.148745f, -0.116737f, -0.050222f,
+ -0.287952f, -0.290982f, -0.090449f, 0.076098f, -0.345632f, -0.061309f,
+ 0.142218f, 0.035692f, 0.304517f, -0.228031f, 0.119608f, -0.120350f,
+ 0.163404f, -0.105605f, -0.305462f, -0.176657f, 0.210070f, -0.227600f,
+ -0.081965f, -0.464027f, -0.053782f, -0.018367f, 0.119159f, 0.017162f,
+ -0.069792f, 0.305768f, -0.421095f, 0.187740f, -0.032059f, 0.575115f,
+ -0.064283f, -0.091828f, 0.772648f, -0.393189f, -0.297098f, 0.141420f,
+ 0.826389f, -0.071586f, -0.893968f, -0.346793f, -1.151655f, 0.039393f,
+ 1.546000f, -0.094029f, -0.005786f, -0.195764f, -0.169724f, -0.133167f,
+ -0.129312f, -0.418860f, -0.026553f, -0.053667f, -0.091976f, -0.106275f,
+ -0.492625f, 0.025350f, -0.332075f, -0.475638f, -0.076667f, -0.065779f,
+ 0.108957f, 0.246298f, -0.289007f, -0.442552f, -0.206692f, -0.257453f,
+ 0.073806f, -0.458606f, -0.410390f, -0.312674f, -0.144813f, 0.170128f,
+ 0.018810f, -0.098241f, 1.027369f, 0.479328f, 1.129707f, 0.484813f,
+ -0.085207f, 0.621873f, -0.520981f, 0.236175f, 0.273487f, 0.061426f,
+ 0.306085f, 0.161487f, 0.220991f, 0.223783f, -0.091826f, 0.391031f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer0[24] = {
+ 0.580225f, -0.191304f, 1.091767f, -0.134522f, -0.089361f, 0.398750f,
+ -0.882708f, -0.213102f, -0.119981f, 0.378296f, -0.075719f, 0.426598f,
+ -2.015505f, 0.202534f, -1.044792f, -0.841519f, 0.266421f, -0.047115f,
+ -0.131147f, -0.075066f, -0.009441f, 0.853007f, -0.175606f, -0.868306f,
+};
+
+static const float av1_4_partition_nn_weights_64_layer1[24 * LABEL_SIZE] = {
+ -0.851937f, -0.211148f, -2.289513f, -0.275071f, 0.251340f, -0.340847f,
+ 0.498032f, 0.308652f, -0.051574f, 0.323146f, -0.097547f, -0.040269f,
+ 1.909655f, 0.098348f, 0.588136f, 0.568112f, 0.313297f, 0.920848f,
+ -0.014486f, 0.386014f, 0.029199f, -0.537330f, -0.021502f, 0.349073f,
+ -0.524715f, -0.351848f, 1.565454f, -0.297148f, 0.020177f, 0.648369f,
+ 0.027321f, -0.096052f, -0.363163f, -0.132642f, 0.024292f, -0.734176f,
+ -0.782700f, 0.408299f, 0.476945f, -0.489512f, -0.728318f, -0.632042f,
+ 0.405417f, 0.184086f, -0.400730f, 0.359032f, 0.019710f, -0.217409f,
+ 0.519159f, -0.136316f, 0.993592f, -0.147128f, 0.097495f, 0.426189f,
+ -0.295233f, 0.278799f, 0.080667f, -0.025052f, -0.307757f, 0.418716f,
+ -0.853388f, -0.374878f, -0.322725f, 0.696335f, -0.380649f, -0.160356f,
+ -0.140060f, 0.502455f, 0.656728f, -0.095023f, -0.184198f, -0.347069f,
+ 0.456372f, -0.029754f, 0.907923f, 0.265710f, -0.065505f, 0.226763f,
+ -0.277798f, 0.413292f, -0.593899f, -0.060740f, -0.313358f, -0.249944f,
+ -0.627329f, -0.327151f, -0.853788f, -1.163807f, -0.388944f, -0.228788f,
+ -0.057382f, 0.334741f, -0.283083f, 0.368280f, -0.407197f, -0.441849f,
+};
+
+static const float av1_4_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+ -0.478735f,
+ 0.292948f,
+ 0.293172f,
+ 0.040013f,
+};
+
+static const NN_CONFIG av1_4_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24, // num_hidden_nodes
+ },
+ {
+ av1_4_partition_nn_weights_64_layer0,
+ av1_4_partition_nn_weights_64_layer1,
+ },
+ {
+ av1_4_partition_nn_bias_64_layer0,
+ av1_4_partition_nn_bias_64_layer1,
+ },
+};
+
+#undef FEATURE_SIZE
+#undef LABEL_SIZE
+
+#define FEATURE_SIZE 4
+static const float
+ av1_partition_breakout_nn_weights_128_layer0[FEATURE_SIZE * 32] = {
+ -0.331785f, 0.068675f, -0.323814f, 0.033714f, -0.237835f, 0.166316f,
+ -0.498766f, -0.545634f, -0.266173f, -0.476957f, -0.120409f, -0.021042f,
+ 0.124056f, -0.278750f, -0.110120f, -0.372812f, 4.547939f, 0.097618f,
+ -0.002710f, -0.064169f, -1.841173f, -0.403833f, 0.005536f, 0.067188f,
+ -0.434935f, -0.227421f, -0.000011f, -0.139961f, -0.174056f, -0.652384f,
+ -0.000015f, -0.262847f, -3.319706f, -0.947693f, 0.002981f, 0.016717f,
+ -10.408850f, -0.014568f, -0.000018f, 0.019084f, 1.523383f, 0.074525f,
+ -0.002076f, -0.020734f, 4.881495f, 0.002799f, 0.000342f, -0.019623f,
+ 1.786154f, 0.037462f, -0.019037f, 0.052833f, 11.408153f, -0.044602f,
+ 0.026155f, -0.518627f, -0.474499f, -0.427430f, -0.442733f, -0.011116f,
+ -22.379410f, -0.000549f, -0.001418f, 0.008090f, -0.295090f, -0.230268f,
+ -0.337278f, -0.001127f, -0.644282f, -0.598783f, -0.539417f, -0.003303f,
+ 9.189824f, 0.038066f, -0.004097f, -0.460045f, -0.308858f, -0.242691f,
+ -0.230835f, -0.273057f, 0.152226f, 0.179239f, -0.146382f, -0.004655f,
+ -0.242940f, -0.718862f, -0.001685f, -0.214736f, 3.263186f, 0.079463f,
+ -0.003854f, -0.187461f, -0.599144f, -0.419808f, -0.000597f, -0.136980f,
+ 0.184813f, -0.319525f, -0.007246f, 0.079709f, -0.883229f, -0.343748f,
+ -0.000077f, -0.172214f, -0.548759f, -0.194674f, -0.144786f, 0.043896f,
+ -0.176364f, -0.248394f, -0.090215f, -0.294743f, -0.280980f, -0.181436f,
+ -0.115681f, -0.071915f, -13.035494f, -0.075623f, 0.017052f, -0.171152f,
+ 5.910803f, 0.128344f, 0.010256f, -1.073301f, 2.387826f, 0.166183f,
+ -0.007193f, -0.257836f,
+ };
+
+static const float av1_partition_breakout_nn_bias_128_layer0[32] = {
+ 0.115591f, -0.100178f, -0.165523f, -0.122997f, 11.045759f, 1.034761f,
+ -0.323672f, -0.189087f, 2.850950f, 7.010029f, -21.447067f, 1.877031f,
+ 0.437442f, 5.929414f, -0.117274f, 4.462253f, -0.135198f, -0.145927f,
+ 8.727211f, 0.000000f, -3.532987f, -0.405898f, 11.364439f, -0.141728f,
+ -5.994947f, -0.362574f, 1.857687f, -0.100400f, -0.130312f, 0.006080f,
+ 0.429660f, -8.439470f,
+};
+
+static const float av1_partition_breakout_nn_weights_128_layer1[32] = {
+ -0.013738f, 0.022052f, -0.074437f, -0.211377f, -0.080433f, 0.015543f,
+ 0.002091f, 0.014252f, 0.134834f, 0.190263f, 0.244175f, -0.031747f,
+ 0.020068f, -0.068326f, 0.185471f, 0.660268f, -0.134898f, -0.010376f,
+ -0.276023f, -0.282921f, -0.022769f, 0.007070f, -0.186235f, 0.024407f,
+ -0.024837f, 0.005764f, 0.016599f, -0.040077f, 0.020990f, 0.095054f,
+ -0.039662f, 0.131499f,
+};
+
+static const float av1_partition_breakout_nn_bias_128_layer1[1] = {
+ 0.86678213f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_128_layer0,
+ av1_partition_breakout_nn_weights_128_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_128_layer0,
+ av1_partition_breakout_nn_bias_128_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_64_layer0[FEATURE_SIZE * 16] = {
+ 0.872892f, -0.235539f, -0.412159f, -0.142533f, -2.251479f, -0.057073f,
+ -0.001373f, 0.112147f, 5.281734f, 0.060704f, 0.000838f, -0.961554f,
+ 0.244995f, 0.154515f, -0.292654f, -0.167177f, -3.759112f, -0.486347f,
+ 0.003208f, -0.418226f, 2.618152f, 0.026832f, 0.003988f, -0.404406f,
+ -0.405434f, 0.102791f, -0.033406f, -0.029820f, -4.492342f, -0.154291f,
+ 0.012947f, -0.195075f, 0.009311f, -0.411410f, -0.010986f, -0.554822f,
+ 0.160576f, 0.020796f, -0.457230f, -0.191111f, -7.759542f, -0.065039f,
+ -0.001322f, 0.055691f, 0.291924f, -0.053076f, -0.148379f, -0.298383f,
+ 1.022023f, -0.033668f, -0.000804f, -0.825778f, -3.902254f, -0.085812f,
+ -0.052520f, -0.035012f, -0.465468f, -0.319231f, -0.497529f, -0.183068f,
+ -2.407131f, -0.062304f, 0.000874f, 0.108786f,
+ };
+
+static const float av1_partition_breakout_nn_bias_64_layer0[16] = {
+ 0.081425f, -14.404084f, 11.511393f, -0.930053f, 1.841889f, 15.020920f,
+ -1.872288f, 5.392535f, -0.329335f, -0.005358f, 12.600776f, 0.000000f,
+ -0.337413f, 4.492778f, 0.000000f, 17.043072f,
+};
+
+static const float av1_partition_breakout_nn_weights_64_layer1[16] = {
+ -0.465338f, -0.103023f, -0.174808f, -0.005156f, -0.016366f, -0.172494f,
+ 0.014185f, 0.067030f, -0.001939f, -0.175049f, 0.245992f, -0.181660f,
+ -0.038572f, 0.307899f, -0.294283f, 0.118323f,
+};
+
+static const float av1_partition_breakout_nn_bias_64_layer1[1] = {
+ -1.33438122f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_64_layer0,
+ av1_partition_breakout_nn_weights_64_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_64_layer0,
+ av1_partition_breakout_nn_bias_64_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_32_layer0[FEATURE_SIZE * 16] = {
+ -4.825528f, -0.145737f, 0.001907f, 0.145415f, -1.858153f, -0.080744f,
+ 0.000601f, 0.211991f, 0.384265f, -0.043945f, -0.521332f, -0.170622f,
+ -0.046866f, -0.600506f, -0.001216f, -0.332760f, -0.447677f, -0.605844f,
+ -0.121008f, -0.119936f, -0.215739f, -0.269665f, -0.668587f, 0.071318f,
+ -1.202551f, -0.729727f, -0.370084f, 0.088215f, -1.926800f, -0.086519f,
+ 0.000359f, 0.215120f, 0.718749f, 0.022942f, 0.003840f, -0.176518f,
+ 1.213451f, 0.080786f, 0.001557f, -1.053430f, 0.202698f, -0.583919f,
+ -0.535512f, -0.239927f, -0.110151f, -0.128832f, -0.441087f, -0.145575f,
+ -0.178518f, -0.585784f, 0.000029f, -0.833014f, -0.331358f, -0.520297f,
+ -0.088676f, -0.178487f, -1.430755f, 0.022981f, -0.106931f, 0.015573f,
+ -0.520814f, -0.045386f, -0.443123f, -0.484209f,
+ };
+
+static const float av1_partition_breakout_nn_bias_32_layer0[16] = {
+ 11.747026f, -9.337718f, 0.341648f, -0.155847f, -0.104005f, 4.666283f,
+ 6.669584f, 16.625504f, 9.885626f, 15.439183f, -0.346080f, 0.000000f,
+ -0.423808f, 0.000000f, 6.352258f, -0.155787f,
+};
+
+static const float av1_partition_breakout_nn_weights_32_layer1[16] = {
+ 0.168561f, -0.122519f, 0.524667f, 0.032474f, 0.059097f, 0.011900f,
+ 0.166445f, 0.127256f, -0.034838f, -0.212586f, -0.317973f, 0.348419f,
+ -0.004171f, 0.157694f, 0.117845f, 0.272115f,
+};
+
+static const float av1_partition_breakout_nn_bias_32_layer1[1] = {
+ 0.09049262f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_32_layer0,
+ av1_partition_breakout_nn_weights_32_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_32_layer0,
+ av1_partition_breakout_nn_bias_32_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_16_layer0[FEATURE_SIZE * 16] = {
+ 0.209371f, 0.028758f, 0.005764f, -0.384401f, -0.625777f, -0.005647f,
+ -0.316867f, 0.042985f, 0.127344f, 0.025461f, 0.011465f, -0.071043f,
+ -0.295977f, -0.076093f, -0.209681f, -0.311653f, -0.147538f, 0.009910f,
+ -0.130997f, -0.012326f, 0.024124f, -0.323578f, -0.005790f, -0.085664f,
+ -1.575066f, -0.119221f, 0.015018f, 0.187204f, 0.238117f, 0.084924f,
+ -0.004444f, -1.271538f, -0.709860f, -0.006226f, -0.903111f, 0.090573f,
+ -0.278642f, -0.011114f, 0.021162f, 0.081290f, -0.467486f, -0.040771f,
+ -0.224069f, -0.714390f, -0.281905f, -0.001336f, -0.761212f, -0.060385f,
+ -0.814479f, -0.050450f, -0.003666f, 0.085668f, -0.272589f, 0.057330f,
+ -0.206540f, -0.303418f, 0.075335f, -0.180468f, -0.064872f, -0.755948f,
+ -0.509287f, -0.048877f, -0.001512f, 0.077086f,
+ };
+
+static const float av1_partition_breakout_nn_bias_16_layer0[16] = {
+ 16.421495f, 4.012273f, -1.828571f, 0.000000f, -0.263564f, -0.201972f,
+ 6.564987f, 14.651000f, -3.227779f, 2.241833f, -0.137116f, 0.762876f,
+ 5.625762f, 0.615822f, 0.040057f, 16.668884f,
+};
+
+static const float av1_partition_breakout_nn_weights_16_layer1[16] = {
+ -0.096440f, 0.184316f, -0.021148f, 0.424974f, 0.003743f, 0.006310f,
+ 0.046266f, -0.219224f, -0.087004f, 0.024623f, -0.275798f, 0.120164f,
+ 0.269773f, -0.021105f, -0.146698f, 0.188764f,
+};
+
+static const float av1_partition_breakout_nn_bias_16_layer1[1] = {
+ 1.60751927f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_16_layer0,
+ av1_partition_breakout_nn_weights_16_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_16_layer0,
+ av1_partition_breakout_nn_bias_16_layer1,
+ },
+};
+
+static const float
+ av1_partition_breakout_nn_weights_8_layer0[FEATURE_SIZE * 16] = {
+ -0.255885f, 0.109548f, -0.111054f, -0.476119f, -1.083031f, -0.342003f,
+ 0.048241f, -0.356013f, -0.085054f, 0.124908f, 0.000084f, -0.149906f,
+ -0.729829f, 0.133535f, -0.002125f, 0.207516f, -0.210163f, -0.567365f,
+ -0.590103f, 0.045308f, -0.539406f, 0.130550f, -0.663879f, -0.170549f,
+ 0.017587f, -0.054187f, 0.000550f, 0.038297f, -0.112891f, -0.012751f,
+ -0.048067f, 0.095564f, 0.079892f, 0.077285f, -0.749708f, -0.286312f,
+ -0.054334f, 0.132242f, -0.004152f, -0.209758f, -0.073407f, 0.082306f,
+ -0.001034f, -0.090990f, 0.122823f, -0.109794f, -0.230066f, -0.391155f,
+ -0.262245f, -0.004744f, -0.232246f, 0.099290f, -0.637484f, 0.111937f,
+ -0.548556f, -0.598344f, 0.123265f, -0.281395f, -0.399711f, -0.525671f,
+ -0.596269f, 0.098494f, -0.005765f, 0.173652f,
+ };
+
+static const float av1_partition_breakout_nn_bias_8_layer0[16] = {
+ 0.194141f, -0.111223f, 2.503733f, -7.155602f, -0.695068f, 0.114874f,
+ 2.056990f, 5.284306f, 0.639643f, -2.792049f, -2.232339f, -0.232209f,
+ 2.336705f, -0.278834f, 0.231905f, 7.954366f,
+};
+
+static const float av1_partition_breakout_nn_weights_8_layer1[16] = {
+ -0.014439f, 0.010171f, 0.048116f, -0.090659f, -0.081235f, -0.021840f,
+ -0.017360f, 0.031063f, -0.031737f, -0.023439f, -0.037725f, 0.021954f,
+ 0.055858f, 0.230970f, -0.056466f, 0.119780f,
+};
+
+static const float av1_partition_breakout_nn_bias_8_layer1[1] = {
+ 1.27784479f,
+};
+
+static const NN_CONFIG av1_partition_breakout_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16, // num_hidden_nodes
+ },
+ {
+ av1_partition_breakout_nn_weights_8_layer0,
+ av1_partition_breakout_nn_weights_8_layer1,
+ },
+ {
+ av1_partition_breakout_nn_bias_8_layer0,
+ av1_partition_breakout_nn_bias_8_layer1,
+ },
+};
+#undef FEATURE_SIZE
+
+#define FEATURE_SIZE 9 // Input layer size
+#define NUM_NODES 32 // Hidden layer size
+#define LABEL_SIZE 3 // Output layer size
+
+static const float av1_rect_partition_nn_weights_8_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.22151f, 0.99424f, 0.23415f, -1.13841f, -0.11277f, 0.09530f, 0.14769f,
+ -1.18895f, -0.96640f, -0.21421f, -0.13974f, 0.03236f, 0.15777f, -0.03176f,
+ 0.02729f, -0.37344f, -0.01727f, -0.05469f, 0.19402f, -3.45508f, 0.90106f,
+ -2.91557f, 0.19379f, 0.14356f, -0.13291f, 0.05734f, -0.03032f, -0.13060f,
+ 0.35744f, 1.31630f, -1.54493f, -0.20749f, -0.24413f, -0.04524f, -0.12400f,
+ 1.08305f, -0.21596f, 0.76244f, 1.10616f, -1.71706f, 0.05768f, 0.10966f,
+ 0.00949f, -0.12680f, 0.00699f, -0.11522f, -0.38566f, 0.34283f, -0.35266f,
+ -0.40643f, -0.22462f, 0.32300f, -0.39737f, -0.20587f, -0.16096f, 1.07543f,
+ 0.30314f, -1.35659f, -0.38212f, 0.45857f, 0.76615f, 0.16819f, -1.24459f,
+ 0.39677f, 0.87436f, -2.33757f, 1.27471f, 0.27488f, 0.01019f, -0.01221f,
+ -0.07461f, -0.14577f, -0.01231f, -0.64426f, -1.02733f, -1.96242f, 0.95143f,
+ -0.06777f, -1.13868f, 0.01354f, -0.75590f, -0.78222f, -0.07453f, 0.61788f,
+ 0.56899f, 1.17144f, 0.70899f, 0.48568f, 0.11266f, 0.81579f, -0.03929f,
+ 0.01088f, 0.33599f, -0.22401f, -0.49654f, -0.02598f, 0.04509f, -0.08217f,
+ -0.30687f, 0.19851f, -2.96860f, -2.30698f, 0.01848f, 0.11801f, 0.06614f,
+ 0.01673f, -0.11002f, -0.08168f, 0.09204f, -0.06379f, 0.27972f, -0.31716f,
+ -0.00566f, -0.13651f, -0.37276f, 0.01511f, -0.23697f, 0.21696f, -0.19480f,
+ 0.60758f, -0.43506f, -0.02247f, -1.45073f, 0.84442f, -0.94018f, 0.32550f,
+ 0.03985f, -0.06581f, 0.21665f, 0.79472f, -2.41080f, 0.04788f, -0.09492f,
+ -0.10677f, 0.07250f, 0.14329f, -0.37319f, 0.53043f, -0.49108f, 0.25792f,
+ -0.36569f, -0.28669f, -0.18416f, -0.52385f, -1.17081f, -1.32153f, -1.13403f,
+ -0.26196f, 0.93379f, 0.72115f, 0.54464f, 0.27642f, 0.04757f, 2.01629f,
+ 1.55787f, -0.11665f, 1.00722f, -0.24352f, 0.53308f, 0.57719f, 0.39344f,
+ 0.19174f, 0.06339f, -0.02530f, 0.07724f, -0.32416f, -0.26992f, -0.35887f,
+ -0.35285f, -0.33379f, -0.37475f, -0.77335f, 1.70027f, -1.52153f, -0.26503f,
+ 0.97552f, -2.96705f, -0.91220f, -0.11827f, 0.00406f, -0.14514f, 0.18417f,
+ -0.20874f, 0.27293f, -0.34072f, -0.34838f, -0.19054f, -0.29806f, -0.27960f,
+ -0.19293f, -0.18275f, -0.05902f, 0.58625f, -0.05470f, -0.48814f, -0.45382f,
+ -0.05959f, 2.01250f, -0.30014f, 0.69546f, -1.24180f, 1.34923f, 0.20337f,
+ 0.16850f, 0.07187f, 0.72630f, -0.15380f, -2.40973f, -2.73561f, -1.71375f,
+ -1.61695f, 0.50052f, 0.09730f, 0.00579f, 0.06133f, -0.06512f, -0.61439f,
+ -1.16173f, -0.58716f, 1.60438f, 0.23242f, 0.91847f, 0.49041f, -0.16277f,
+ -0.02574f, -0.64593f, 1.17028f, 0.46852f, 0.14926f, 0.73853f, -0.78521f,
+ 0.05959f, -0.35590f, 0.02039f, 0.10812f, -0.28650f, 1.34038f, -0.72188f,
+ 0.62385f, -0.35271f, -0.39599f, 0.41543f, 0.53124f, -0.23510f, -0.15480f,
+ -0.05066f, -0.33529f, 0.05238f, -0.35311f, -0.26983f, -0.39764f, 0.01085f,
+ 0.26593f, -0.18411f, -0.29945f, 0.50090f, -0.03397f, 0.78562f, -0.33068f,
+ 1.21308f, -2.23273f, -0.33366f, -0.15164f, -1.13270f, 0.17394f, 0.65567f,
+ 0.76496f, 0.44325f, 0.01368f, -0.33619f, -0.64256f, 0.64478f, 0.84553f,
+ 1.74183f, 0.22563f, -0.14550f, -0.16258f, 0.03010f, 0.49922f, 0.64575f,
+ -0.29187f, -0.10348f, -1.43619f, -0.56540f, -0.14779f, 0.04616f, 0.87411f,
+ -1.08228f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer0[NUM_NODES] = {
+ 0.33919f, -0.03003f, 0.79073f, -0.18508f, 0.00668f, -0.12017f, 0.35362f,
+ -0.51642f, 0.06536f, 0.41668f, -0.06509f, 0.94606f, -0.15385f, 0.14936f,
+ 1.46274f, -0.06961f, 2.82537f, -1.95576f, -0.09457f, 0.02042f, -0.07480f,
+ -0.55083f, 0.26170f, 4.39883f, 0.33999f, -0.10502f, 0.70884f, -0.06992f,
+ -0.22638f, 1.40940f, -0.09309f, 0.05828f,
+};
+
+static const float av1_rect_partition_nn_weights_8_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.09209f, 0.26236f, 0.62136f, 0.76324f, -1.14678f, 0.42289f, -0.08895f,
+ -0.97267f, 2.05958f, 0.00843f, 0.35335f, 1.12096f, -0.11679f, 0.07350f,
+ -1.23231f, -0.61990f, 1.51379f, -1.99450f, 0.22441f, 2.41974f, -0.30488f,
+ -0.37869f, 0.47168f, -3.70132f, 0.00061f, 0.19432f, 0.11512f, 0.26200f,
+ -0.35285f, 0.37985f, 0.90571f, 0.27344f, 0.74840f, -0.17965f, -2.51433f,
+ 0.59235f, 1.16670f, -0.53446f, 0.67897f, 0.04505f, -0.86874f, 0.45361f,
+ -0.35033f, 1.21283f, 0.31426f, -0.20841f, 0.56757f, 0.45909f, -1.23683f,
+ 0.09835f, -0.17214f, -0.96323f, 0.01138f, -0.50233f, 0.30104f, 2.01814f,
+ 1.15821f, -0.11947f, 0.74574f, -0.30714f, -0.39646f, -1.30086f, -0.88541f,
+ -0.12259f, -0.54977f, 0.30069f, 1.84299f, -0.95141f, -0.65887f, -0.25888f,
+ -0.63265f, 1.29531f, -0.56672f, 0.10837f, -0.21297f, -2.19131f, 0.01156f,
+ 0.51912f, 0.46704f, 0.42810f, -0.59271f, 0.98469f, -0.17914f, -1.91163f,
+ -0.32807f, 0.48199f, -0.99525f, 1.67108f, -0.87631f, -0.60258f, -0.78731f,
+ -0.32877f, 0.44237f, 0.01087f, 0.07489f, -0.28224f,
+};
+
+static const float av1_rect_partition_nn_bias_8_layer1[LABEL_SIZE] = {
+ 1.70665f,
+ -0.77954f,
+ -0.92709f,
+};
+
+// Network description for the "_8" rectangular-partition model: FEATURE_SIZE
+// inputs, a single hidden layer of NUM_NODES nodes, and LABEL_SIZE outputs.
+// The two pointer lists give the weight tables and the bias tables in layer
+// order (layer0 first, then layer1).
+// NOTE(review): the "_8" suffix presumably names the block size this model
+// serves -- confirm against the code that selects the config.
+static const NN_CONFIG av1_rect_partition_nnconfig_8 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_8_layer0,
+ av1_rect_partition_nn_weights_8_layer1 },
+ { av1_rect_partition_nn_bias_8_layer0, av1_rect_partition_nn_bias_8_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_16_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.18480f, -0.05410f, -0.18957f, 0.15451f, -0.38649f, -0.26162f, -0.22727f,
+ -0.38555f, -0.36738f, 0.74384f, -1.85999f, 0.98491f, -0.72119f, 1.77321f,
+ 0.39983f, 0.96314f, 0.23695f, 0.30200f, 0.30629f, -0.47617f, -1.43320f,
+ -1.81730f, 0.36554f, -0.07142f, -1.27242f, -1.27697f, 0.00110f, -0.32179f,
+ 0.27460f, 0.45428f, 0.15308f, -0.73906f, -0.28577f, -0.01238f, -0.16958f,
+ -0.85390f, 1.05484f, -1.62812f, 0.77632f, -0.27327f, -0.32527f, 0.32726f,
+ 1.73255f, 0.53763f, 0.59121f, -0.39068f, -0.32451f, -0.31869f, 0.17777f,
+ 0.07519f, -0.18066f, -0.11250f, -0.14616f, -0.16882f, -0.04099f, -0.67959f,
+ 0.39674f, -0.08596f, 0.18587f, -2.04097f, -1.73993f, 1.57212f, 1.42410f,
+ -1.36762f, -0.41485f, -1.12103f, 0.56959f, 0.11500f, 0.48945f, -0.13585f,
+ 1.22125f, 0.67071f, -1.11812f, -0.20660f, -0.52856f, 0.70663f, 0.74382f,
+ 0.61114f, -0.11454f, 1.14687f, 0.80322f, -0.45965f, -0.44466f, -0.05830f,
+ 0.13206f, -0.53750f, -0.11324f, -0.37971f, -0.13491f, -0.21268f, 1.93407f,
+ 1.34433f, 2.49427f, 2.91955f, 1.71730f, 0.03295f, 0.03587f, -0.14550f,
+ 0.08189f, -0.38655f, -0.35432f, -0.62706f, -0.01849f, -0.57882f, -0.60438f,
+ -1.01334f, -0.57302f, 0.22592f, 0.05916f, -0.05305f, -0.89824f, -0.52969f,
+ -0.24542f, 0.27029f, -0.40924f, -0.82452f, -0.60665f, -5.03025f, 0.83302f,
+ 1.83695f, 2.19716f, 2.31001f, 0.03657f, 0.00063f, -0.04379f, 0.05835f,
+ -0.08623f, 0.20557f, -0.17791f, 0.07874f, -0.25456f, -0.19513f, -0.27753f,
+ -0.31982f, 0.00245f, -0.33183f, 0.26059f, -0.22165f, 0.37582f, -0.30411f,
+ -0.22639f, -0.14739f, -0.20201f, -0.37507f, -1.30653f, 0.49570f, 1.03673f,
+ 0.66139f, 0.44941f, -0.44461f, -0.50376f, -0.49664f, 0.18608f, -0.26175f,
+ 0.14844f, 0.78715f, -0.70344f, -0.87624f, -0.98535f, -0.35346f, 0.37094f,
+ -0.43135f, -0.22571f, 3.46263f, 3.13580f, -1.33203f, -0.15247f, -0.15866f,
+ -0.11214f, 0.12211f, 0.03964f, -1.87597f, -4.81597f, -4.80195f, -4.98096f,
+ -5.62336f, -0.05337f, -0.00943f, 0.00792f, 0.02742f, 1.05679f, 2.41455f,
+ 0.85382f, 1.42504f, 0.58096f, 0.21443f, 1.02694f, 1.06746f, 1.20242f,
+ 0.60767f, 1.98667f, -0.80879f, -0.63495f, 1.95508f, 0.23952f, -0.15019f,
+ -0.16097f, 0.30155f, -3.42407f, -1.34998f, 9.07689f, -2.22559f, 2.22562f,
+ -0.03348f, -0.05229f, 0.05931f, 0.03042f, -0.18068f, -0.05732f, -0.33010f,
+ -0.32279f, -0.26607f, -0.02723f, -0.04067f, 0.08700f, -0.16366f, -0.24935f,
+ -0.69124f, 0.58508f, 0.50654f, 0.04492f, 1.38340f, -1.51487f, 1.72889f,
+ -1.95618f, -3.65013f, -1.38525f, -3.05516f, -2.40448f, 2.47467f, 0.03784f,
+ 0.08052f, -0.01971f, -0.08918f, -0.84997f, -0.55302f, -1.07861f, -0.62626f,
+ 0.61751f, -0.11012f, -0.24185f, -0.39201f, -1.85390f, -0.31261f, -0.11927f,
+ 0.15671f, -0.23450f, -0.14916f, -0.31715f, -0.19350f, 0.01795f, -0.11533f,
+ -0.05799f, -0.03142f, 0.20218f, -0.39499f, -0.33859f, -0.13201f, -0.19527f,
+ -0.28459f, -0.20346f, 0.89457f, -2.22103f, -2.37455f, -2.00221f, 2.44553f,
+ 0.33915f, 0.50047f, -0.34625f, -0.19667f, -0.56333f, -0.84328f, 1.25767f,
+ -1.70297f, 1.00482f, -0.00103f, -1.40813f, 0.21311f, 0.39230f, -0.07302f,
+ -3.49100f, 1.60675f, -2.90692f, 0.11022f, 0.13507f, -0.13308f, 0.15201f,
+ -0.05573f,
+};
+
+static const float av1_rect_partition_nn_bias_16_layer0[NUM_NODES] = {
+ -0.16783f, -0.16023f, 0.52215f, -0.04109f, 2.00122f, -0.11633f, 0.25535f,
+ 1.80638f, 1.69273f, -0.25998f, -6.83550f, -0.79682f, -1.03466f, 1.42721f,
+ 0.00000f, -0.00000f, -0.11665f, -0.12047f, -1.01497f, 7.27181f, -0.78548f,
+ -1.39335f, -5.42248f, -0.10388f, 0.07634f, 2.81012f, -0.57429f, -0.15629f,
+ -0.12044f, 1.65478f, -0.75153f, 1.18441f,
+};
+
+static const float av1_rect_partition_nn_weights_16_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ -0.26407f, 0.06322f, 0.87932f, 0.17772f, 0.71686f, -0.12283f, 0.08454f,
+ 0.20098f, -0.31763f, -0.33178f, -4.59535f, -0.04367f, 0.17099f, 3.80486f,
+ 0.16750f, 0.29218f, 0.57234f, -0.96550f, -0.10599f, -4.91130f, -0.14658f,
+ 0.95803f, -4.13925f, 0.24567f, 0.25708f, 1.60547f, -1.03251f, -0.31053f,
+ -0.05659f, -0.94121f, -0.68926f, -0.24738f, -0.38019f, 0.98950f, 0.13689f,
+ 0.24504f, 0.49623f, 0.19980f, 0.38349f, 0.37481f, 0.54540f, -0.02198f,
+ 3.43385f, 1.02543f, -0.40921f, -3.07235f, 0.02996f, 0.00323f, -0.35414f,
+ 0.71099f, 1.39334f, 2.43741f, -1.11007f, -0.22739f, -4.21757f, 0.11905f,
+ 0.00353f, -1.69637f, 0.45944f, -0.19884f, 0.03624f, 0.25729f, 0.23659f,
+ -2.08405f, 0.08573f, -0.53393f, -1.28103f, -0.53970f, -0.65465f, 0.31821f,
+ -0.09884f, -0.69026f, -0.37284f, 0.04622f, 1.32973f, -0.15414f, 0.19138f,
+ -0.67927f, -0.17658f, 0.36008f, -0.51832f, 0.09887f, -1.94414f, 2.95227f,
+ 1.76937f, -0.26687f, 8.50976f, 0.26247f, 0.60262f, -0.27910f, 0.30061f,
+ -0.05117f, 0.16018f, 0.71195f, 0.57871f, 1.57794f,
+};
+
+// Output-layer biases of the "_16" model, one per output label.  Sized with
+// LABEL_SIZE (rather than a bare 3) so the table stays in step with the
+// num_outputs field of the NN_CONFIG that points at it.
+static const float av1_rect_partition_nn_bias_16_layer1[LABEL_SIZE] = {
+  2.68750f,
+  -1.31894f,
+  -1.36768f,
+};
+
+// Network description for the "_16" rectangular-partition model:
+// FEATURE_SIZE inputs, a single hidden layer of NUM_NODES nodes, and
+// LABEL_SIZE outputs.  The two pointer lists give the weight tables and the
+// bias tables in layer order (layer0 first, then layer1).
+// NOTE(review): the "_16" suffix presumably names the block size this model
+// serves -- confirm against the code that selects the config.
+static const NN_CONFIG av1_rect_partition_nnconfig_16 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_16_layer0,
+ av1_rect_partition_nn_weights_16_layer1 },
+ { av1_rect_partition_nn_bias_16_layer0, av1_rect_partition_nn_bias_16_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_32_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.54654f, -0.43537f, -0.10620f, -0.48051f, -0.43543f, -0.22737f, -0.15429f,
+ -0.09858f, -0.09438f, 0.37306f, 0.23934f, -1.86375f, -1.18307f, -0.32995f,
+ -0.09745f, 0.05431f, -0.13799f, 0.14734f, -0.33219f, 0.18057f, -0.23792f,
+ -0.28126f, 0.02977f, -0.07431f, 0.07860f, 0.00067f, -0.01927f, 1.01841f,
+ -0.57739f, 0.08412f, -1.33843f, -1.05563f, -0.28693f, -0.39425f, -0.69572f,
+ -0.16703f, 0.02808f, 0.11994f, -0.26267f, 0.19706f, -0.29707f, -0.25305f,
+ -0.07050f, -0.02704f, -0.31528f, -0.42301f, 0.22496f, -0.37001f, -0.23319f,
+ -0.11139f, -0.30513f, 0.04213f, -0.12550f, 0.02504f, 0.33245f, 0.01102f,
+ -0.35950f, -0.05949f, -0.19590f, -0.27457f, -0.28339f, -0.15676f, -0.21538f,
+ 0.65066f, 0.28443f, -1.24943f, -3.00246f, -1.01897f, 0.09304f, 0.70052f,
+ -0.12877f, 0.21120f, -0.37476f, 0.23261f, -0.28401f, 0.09837f, 0.00020f,
+ -0.12106f, -0.32354f, -0.02472f, -0.19772f, 1.01886f, 0.16596f, -0.06532f,
+ 1.72938f, 1.57754f, 0.55963f, 0.33246f, -0.20023f, 0.30715f, 0.08629f,
+ 0.18945f, -0.45988f, -1.22610f, -0.05152f, -0.48859f, -1.02104f, -0.27315f,
+ -0.57698f, 0.04157f, -0.92428f, -1.31268f, 1.78210f, 0.10291f, 1.55042f,
+ -1.26793f, 1.39042f, -1.43729f, 0.25600f, 5.21263f, 5.31955f, 5.19316f,
+ 5.43430f, 0.00294f, -0.00970f, -0.02333f, 0.00250f, 1.17672f, 6.27544f,
+ 4.95973f, 3.54009f, 4.51269f, 0.30750f, 0.78780f, -0.44741f, -0.76442f,
+ 0.75050f, 0.58799f, 0.03400f, -2.09859f, 1.67313f, 0.12503f, 0.28609f,
+ 1.15809f, 2.46530f, -0.04898f, 0.23072f, -0.12635f, -0.82097f, -0.63827f,
+ 2.16779f, 1.77132f, 0.15434f, -1.06427f, 0.06206f, -0.87732f, -0.61897f,
+ -0.44593f, -0.77131f, -0.15979f, -0.02282f, -0.74381f, 0.66052f, -0.22992f,
+ 1.74638f, 1.29199f, -0.55464f, 0.98316f, 0.06665f, 0.50254f, -0.66292f,
+ 0.17113f, -0.32633f, -1.85803f, -0.92759f, 4.44965f, 1.33057f, 0.02135f,
+ -0.27446f, -0.26018f, -0.12613f, -0.14470f, -0.23355f, -0.09717f, -0.24123f,
+ -0.05535f, -0.19146f, -0.36222f, -0.30458f, -0.40323f, 0.21779f, 0.14248f,
+ -0.48630f, 0.18840f, 0.11040f, 0.17287f, -0.51880f, 1.12466f, -0.38888f,
+ -0.16421f, -0.31784f, -0.36112f, -0.25386f, -0.01636f, 0.10029f, -0.26881f,
+ -0.17051f, -0.30903f, -0.08573f, -0.28774f, -0.01173f, -0.09706f, -0.23089f,
+ -0.12922f, -0.17463f, -0.12433f, -0.23074f, 0.15220f, 1.29826f, 0.23788f,
+ 0.04189f, 2.66416f, 0.48815f, -0.06803f, 0.96742f, 1.27165f, -0.70348f,
+ -0.09941f, -0.42948f, -0.20243f, -0.02364f, -0.26689f, -0.40629f, -0.68217f,
+ -0.48073f, 2.43657f, -2.60191f, -1.82837f, 0.50440f, 0.71829f, 0.76491f,
+ 0.28293f, 0.20568f, 0.92642f, -0.02496f, 1.43637f, -0.24474f, -1.21030f,
+ 0.54084f, 1.05130f, 1.29572f, 0.03750f, -0.36894f, 0.74548f, -1.33857f,
+ -0.84858f, 1.35230f, 0.80175f, 0.66136f, 1.06473f, 0.18701f, 1.42413f,
+ 0.04661f, -0.07820f, 0.64990f, -0.43595f, 1.18304f, -0.11437f, -0.06365f,
+ 0.03558f, 0.78260f, -1.74890f, 1.56217f, -1.23424f, 4.59193f, -3.35072f,
+ 0.01180f, -0.18296f, -0.20870f, 0.04510f, 1.52595f, -1.37402f, -0.33123f,
+ -0.85957f, 0.80598f, 0.03743f, 0.02354f, 0.37707f, 1.62095f, -0.29627f,
+ -0.31778f, -0.45789f, -0.14906f, 0.25315f, -0.10817f, -0.32610f, -0.40890f,
+ 0.33984f,
+};
+
+static const float av1_rect_partition_nn_bias_32_layer0[NUM_NODES] = {
+ -0.17482f, 0.39042f, 0.00000f, 1.69677f, 0.08792f, -0.09301f, 0.13809f,
+ 4.84061f, 0.00000f, 0.40515f, 0.46246f, 0.20644f, -5.77478f, -1.54510f,
+ 0.05660f, -0.32013f, 0.23649f, 0.03778f, -2.53710f, -0.27869f, 0.45623f,
+ -0.04155f, -0.18445f, -0.73405f, -0.50243f, 2.23191f, 1.93272f, -1.07032f,
+ -0.27602f, -1.98063f, 0.20816f, -0.01315f,
+};
+
+static const float av1_rect_partition_nn_weights_32_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.02827f, 1.02560f, -0.07137f, -0.31911f, 0.11365f, 0.13684f, -0.07816f,
+ -5.23036f, -0.34340f, 0.84526f, -1.51845f, 0.07017f, -8.12570f, 6.24061f,
+ 0.35739f, -0.09937f, -0.30978f, 0.22032f, 0.74968f, -0.34557f, 0.45547f,
+ -0.16512f, 0.07118f, 1.66415f, 0.41320f, -1.81533f, -1.96004f, 1.04666f,
+ 0.84049f, 4.31009f, 0.68850f, 0.26322f, -0.24634f, -1.25889f, 0.31952f,
+ 0.63632f, 0.05801f, -0.10664f, -0.21992f, 2.44386f, 0.19526f, -0.09838f,
+ 1.53049f, -0.26630f, 3.54126f, -3.40574f, 0.72730f, 0.04557f, 0.92652f,
+ 0.15522f, 2.35895f, -0.13347f, 0.56907f, 0.15352f, 0.01823f, -0.73939f,
+ 0.43104f, 1.90321f, 0.31267f, -0.51972f, 0.50094f, -3.98372f, -3.41518f,
+ -0.48183f, 0.26661f, 0.64146f, 0.14500f, -0.01695f, 0.16653f, -0.37846f,
+ 0.08412f, 2.69714f, -0.20258f, -0.75786f, 0.11201f, 0.61878f, 4.22231f,
+ -3.55330f, -1.14137f, -0.37722f, -0.28000f, -0.72581f, -2.62827f, -0.19448f,
+ -0.59398f, -0.30136f, -0.17725f, -0.69630f, -0.41132f, 0.12208f, 2.11441f,
+ -1.08794f, -1.41694f, 0.02620f, 2.18792f, 0.04271f,
+};
+
+// Output-layer biases of the "_32" model, one per output label.  Sized with
+// LABEL_SIZE (rather than a bare 3) so the table stays in step with the
+// num_outputs field of the NN_CONFIG that points at it.
+static const float av1_rect_partition_nn_bias_32_layer1[LABEL_SIZE] = {
+  2.47332f,
+  -1.65756f,
+  -0.81573f,
+};
+
+// Network description for the "_32" rectangular-partition model:
+// FEATURE_SIZE inputs, a single hidden layer of NUM_NODES nodes, and
+// LABEL_SIZE outputs.  The two pointer lists give the weight tables and the
+// bias tables in layer order (layer0 first, then layer1).
+// NOTE(review): the "_32" suffix presumably names the block size this model
+// serves -- confirm against the code that selects the config.
+static const NN_CONFIG av1_rect_partition_nnconfig_32 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_32_layer0,
+ av1_rect_partition_nn_weights_32_layer1 },
+ { av1_rect_partition_nn_bias_32_layer0, av1_rect_partition_nn_bias_32_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_64_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ 0.08972f, 4.09095f, -0.31398f, -2.43631f, -0.74767f, 1.42471f, 1.60926f,
+ 1.44721f, 1.88259f, 2.35375f, 1.88299f, 2.01109f, 0.98679f, 2.24131f,
+ 0.06279f, -0.08315f, 0.32107f, 0.91334f, -0.36569f, 5.55049f, 5.44943f,
+ 5.20471f, 5.39099f, -0.01943f, -0.00284f, 0.02203f, -0.01309f, 1.41917f,
+ 6.68460f, -6.15986f, 6.41341f, -3.20630f, -0.00567f, -0.00038f, 0.05960f,
+ 0.04308f, 0.95366f, 3.48535f, 2.98266f, 4.11784f, 3.44255f, 0.61630f,
+ 0.71405f, 0.63945f, -0.00713f, 0.39193f, 1.91621f, 3.32755f, 0.71674f,
+ -0.11647f, 2.07090f, 2.64191f, 0.07949f, -0.05023f, 0.99935f, 0.83145f,
+ 0.75898f, -0.98764f, -0.58731f, 1.21734f, -0.08076f, -3.26780f, 1.66278f,
+ 0.04189f, -0.33177f, -1.58648f, 1.00883f, -0.56132f, -2.34877f, 0.67056f,
+ -2.32297f, -0.91641f, -1.02909f, 4.19781f, 3.87484f, 4.32778f, -1.97171f,
+ -0.24734f, 0.00822f, 0.05892f, 0.12697f, -3.62915f, -2.93127f, 7.94856f,
+ -3.29311f, 3.26001f, -0.02231f, 0.02741f, 0.05919f, 0.08190f, -1.49344f,
+ -0.64475f, -0.24627f, 4.03324f, -1.14799f, -0.18465f, -0.17829f, 0.10394f,
+ 0.08580f, -5.74721f, 4.42467f, 3.63964f, 3.00258f, -1.22744f, -0.29408f,
+ 0.00767f, 0.12305f, 0.05249f, -0.17166f, -0.20120f, -0.32941f, -0.31901f,
+ 0.04628f, -0.35249f, -0.18272f, 0.03956f, -0.19329f, -0.33564f, 0.09856f,
+ -0.00173f, -0.31751f, -0.05702f, -0.20558f, -0.31464f, -0.02488f, -0.00729f,
+ -0.35854f, -0.14762f, -0.34897f, -0.12746f, 0.04011f, -0.24918f, -0.53516f,
+ -0.28440f, -0.36789f, -1.34889f, -9.10044f, -9.19238f, 4.48042f, 6.54429f,
+ -0.00226f, 0.00430f, 0.00321f, 0.00442f, 0.87551f, -0.16224f, -0.22832f,
+ -0.60640f, -0.28738f, 0.18062f, 0.22008f, -0.47406f, 0.80302f, 0.12149f,
+ 1.49530f, 1.05069f, -2.02985f, -0.92833f, 0.25616f, 0.12852f, 3.51840f,
+ 0.25226f, -2.63283f, -4.04386f, 8.46300f, -2.93408f, 0.44069f, 0.08276f,
+ 0.34482f, -0.22615f, 0.28666f, 3.02962f, -1.20055f, -1.04832f, -0.97632f,
+ -0.99530f, 1.44196f, 1.68550f, 0.49360f, 1.08155f, -0.26059f, -0.02876f,
+ -0.27492f, -0.06205f, -0.09496f, -0.12314f, -0.30228f, -0.07453f, -0.38857f,
+ 1.17443f, 2.41497f, 1.90537f, 2.37716f, 2.91495f, -0.44455f, -0.51176f,
+ 0.48195f, 0.53032f, 0.23696f, -1.06211f, 1.47459f, -0.89029f, 0.29521f,
+ 0.66291f, -0.42653f, 1.82308f, -1.30372f, -0.36192f, -3.40388f, -1.61476f,
+ -2.29745f, -0.66886f, -2.08252f, -0.54552f, -4.06849f, 0.02948f, 0.27297f,
+ -4.81472f, 4.60404f, -0.11053f, 0.14765f, 0.02826f, -0.14688f, -0.07066f,
+ -0.01224f, 1.20377f, 7.02725f, -6.02627f, 6.87255f, -3.14257f, 0.01074f,
+ 0.02397f, -0.02359f, 0.01901f, 0.14956f, -1.67671f, 2.26714f, 2.57043f,
+ -0.45888f, -1.60265f, -2.11475f, -2.74029f, -2.74658f, -0.35630f, -2.63013f,
+ -2.14814f, -0.67266f, -1.56850f, 0.57137f, -1.14428f, -0.34265f, -0.12521f,
+ 0.01220f, -0.74906f, -0.19270f, 0.68110f, -0.24737f, -0.70568f, -1.64826f,
+ -0.35847f, -0.15984f, -1.17932f, -8.72306f, -8.72834f, 3.93701f, 6.17812f,
+ -0.03191f, -0.00104f, 0.01402f, -0.00046f, -0.94517f, 1.51266f, -0.56318f,
+ 0.72260f, -0.09253f, -0.09069f, -2.16695f, -0.23653f, 0.24418f, 2.21148f,
+ -1.47954f, -1.01439f, 0.31536f, 0.77238f, -0.85083f, -0.15758f, -0.50886f,
+ 0.09101f,
+};
+
+static const float av1_rect_partition_nn_bias_64_layer0[NUM_NODES] = {
+ 0.91706f, -1.31328f, -5.16196f, 1.13191f, -0.98044f, -1.61122f, 1.03039f,
+ -0.98537f, -4.45568f, -4.34802f, -0.92116f, 0.66836f, -0.10752f, -0.13065f,
+ -0.35567f, -0.35693f, 1.74941f, 1.17379f, -3.45555f, 5.66321f, -0.24917f,
+ -1.11940f, -0.73656f, -0.19299f, -0.04181f, 1.11010f, -2.97859f, -0.16774f,
+ 0.59835f, -0.31269f, -0.30585f, -1.66212f,
+};
+
+static const float av1_rect_partition_nn_weights_64_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 0.58963f, 4.20320f, -8.62465f, -6.54014f, 5.41108f, 2.33581f, -0.10354f,
+ -1.17753f, -3.45909f, -2.24722f, 2.20881f, 3.21971f, -0.09087f, -0.21624f,
+ 0.16529f, -8.40985f, -1.60205f, -1.41538f, 4.41826f, -4.63069f, -0.27742f,
+ 4.08710f, 0.26439f, -1.46028f, 0.51234f, 6.25212f, -3.35650f, -1.21348f,
+ 1.37201f, 8.89151f, 0.28859f, -0.97328f, -0.36196f, -2.71701f, 4.54196f,
+ -0.62476f, -2.43814f, -1.34209f, 0.12850f, 1.73859f, 3.09809f, -4.42434f,
+ -1.82552f, -3.66420f, -0.31535f, 0.00968f, -0.02019f, 9.66824f, 0.58835f,
+ 1.50425f, 2.84487f, 2.55522f, 0.01409f, -2.27594f, -0.31800f, 0.91076f,
+ -0.66808f, 0.33120f, -0.12460f, 0.64457f, -0.36416f, -10.30843f, 1.51013f,
+ 2.06861f, -0.20989f, -0.87119f, 3.68642f, 7.33662f, -2.88037f, -0.52414f,
+ -0.35036f, -0.45947f, -0.07406f, 6.46346f, -0.16031f, 0.27071f, 0.38845f,
+ -0.21940f, 0.08583f, -1.39526f, 0.50554f, 0.45279f, -6.61856f, 1.84069f,
+ -0.19149f, -1.77235f, 0.75136f, 1.11797f, 0.32677f, -7.10427f, 3.82908f,
+ 1.04238f, -0.91435f, 1.93317f, -1.84946f, -0.48909f,
+};
+
+// Output-layer biases of the "_64" model, one per output label.  Sized with
+// LABEL_SIZE (rather than a bare 3) so the table stays in step with the
+// num_outputs field of the NN_CONFIG that points at it.
+static const float av1_rect_partition_nn_bias_64_layer1[LABEL_SIZE] = {
+  0.32215f,
+  -0.57522f,
+  0.25314f,
+};
+
+// Network description for the "_64" rectangular-partition model:
+// FEATURE_SIZE inputs, a single hidden layer of NUM_NODES nodes, and
+// LABEL_SIZE outputs.  The two pointer lists give the weight tables and the
+// bias tables in layer order (layer0 first, then layer1).
+// NOTE(review): the "_64" suffix presumably names the block size this model
+// serves -- confirm against the code that selects the config.
+static const NN_CONFIG av1_rect_partition_nnconfig_64 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_64_layer0,
+ av1_rect_partition_nn_weights_64_layer1 },
+ { av1_rect_partition_nn_bias_64_layer0, av1_rect_partition_nn_bias_64_layer1 }
+};
+
+static const float av1_rect_partition_nn_weights_128_layer0[FEATURE_SIZE *
+ NUM_NODES] = {
+ -0.70901f, -3.03481f, 3.30604f, -1.28803f, -0.08610f, -0.33320f, -0.30716f,
+ 0.25100f, 0.14323f, -0.98422f, -0.89084f, -0.24508f, -1.10785f, -0.82524f,
+ 0.11766f, -0.42777f, 1.08965f, 4.35125f, -1.19388f, 4.22042f, 4.96306f,
+ 6.32406f, 3.29899f, -0.90768f, 0.05203f, 0.38467f, 1.74257f, -0.19918f,
+ -0.11335f, 0.00140f, -0.42303f, -0.04419f, 0.03583f, -0.05441f, -0.19586f,
+ 0.01484f, -1.19964f, 0.25497f, 3.04502f, 0.05446f, -0.23253f, 0.00266f,
+ 0.07117f, -2.78986f, -4.62953f, 1.45331f, 0.43923f, 0.92298f, -0.47736f,
+ 1.49165f, 0.45942f, -1.99787f, 3.33510f, 0.17234f, 0.04024f, -1.42780f,
+ 0.23566f, -0.90970f, 1.18041f, -1.45865f, 2.30878f, -1.28507f, 1.87290f,
+ 1.91186f, 4.74826f, -3.70735f, 4.49808f, -4.72275f, -0.02696f, -0.02642f,
+ -0.06093f, -0.01121f, -0.70683f, 2.69737f, -1.88563f, 2.48637f, 1.10922f,
+ 0.74624f, 0.40308f, 2.06396f, 1.39289f, 0.00909f, -2.05271f, -1.53539f,
+ -1.38323f, 0.83303f, -0.32250f, 0.51172f, 3.91249f, 1.66373f, 1.13184f,
+ -2.22874f, -1.13448f, -0.11185f, 0.19387f, 0.36770f, -0.58933f, 0.22789f,
+ 1.17307f, 0.77461f, 0.20817f, 0.33417f, 0.54037f, 0.32961f, -0.18456f,
+ -9.78171f, -0.17216f, -3.44703f, -2.42158f, 0.51946f, 4.35949f, -0.73335f,
+ -1.61515f, -0.29622f, -0.37617f, -0.42316f, 0.74922f, 1.44386f, 3.92704f,
+ -3.76274f, 4.19775f, -3.86958f, 0.00074f, -0.02418f, -0.12944f, 0.05857f,
+ -0.85507f, 5.42546f, 5.40338f, 5.54347f, 5.59791f, -0.01611f, 0.01618f,
+ -0.01654f, -0.00270f, -0.39608f, -0.40410f, -0.24551f, 0.09124f, -0.34413f,
+ -0.11504f, 0.12793f, -0.31523f, 0.09148f, -0.08567f, -0.05140f, -0.13310f,
+ -0.81200f, 0.06882f, -0.52537f, -12.74048f, -0.45395f, -4.04775f, -1.84887f,
+ -1.02573f, 0.32788f, 1.06828f, -1.25503f, -0.42693f, 2.01413f, -2.29103f,
+ 0.62271f, 1.11764f, -1.83113f, -1.32325f, -1.65651f, -2.87826f, 1.46910f,
+ 0.60885f, 0.16079f, 0.00171f, -0.25658f, -0.25465f, -0.14149f, 0.19497f,
+ -0.07866f, -0.37080f, -0.05778f, -0.08870f, -0.20491f, 0.84521f, -0.18214f,
+ -1.38441f, -1.08932f, -1.76627f, 0.73172f, 0.05967f, 1.28057f, 3.42722f,
+ 1.69287f, 0.77169f, 0.44528f, 1.85513f, 0.07840f, 1.31252f, 2.89948f,
+ 1.49489f, 0.15281f, 0.54708f, -1.14185f, -2.51063f, 0.36618f, -0.55322f,
+ 0.96671f, 1.59470f, 1.38252f, 1.99697f, 0.03266f, -0.23200f, -0.01127f,
+ -0.18918f, -0.37598f, -0.03119f, -0.36039f, -0.21192f, -0.11565f, -4.22635f,
+ 1.41252f, 0.56608f, -0.08867f, 3.11924f, -0.54597f, -0.12504f, -0.05289f,
+ -0.28665f, -0.58297f, -1.18362f, -0.76201f, -1.22011f, -0.58756f, 0.14740f,
+ 1.43971f, 0.98381f, -0.02998f, -0.40678f, -0.23047f, -0.12979f, 0.04003f,
+ -0.22081f, -0.09294f, -0.15955f, -0.10379f, -0.10192f, -1.51316f, 2.39482f,
+ -1.69975f, 3.58976f, -0.91032f, -0.03498f, 0.48982f, -0.13418f, 0.76256f,
+ 1.61003f, -2.01676f, -1.24430f, -3.25763f, 1.12314f, 2.00740f, 0.04613f,
+ -0.14746f, -0.57374f, 3.44511f, -0.56767f, -4.08432f, -2.04894f, 2.35951f,
+ -0.00458f, 0.18512f, 0.09916f, -0.04084f, -1.56207f, 1.38034f, 4.17302f,
+ -1.47326f, -2.03530f, -0.00210f, 0.27469f, -0.17423f, 0.86860f, 2.76195f,
+ 2.43269f, -3.57331f, 2.08715f, -1.44171f, -0.17389f, 2.26157f, -0.07852f,
+ 2.02519f,
+};
+
+static const float av1_rect_partition_nn_bias_128_layer0[NUM_NODES] = {
+ 2.53427f, 1.66678f, -0.84914f, -0.15070f, -1.74769f, 0.45218f, -0.26067f,
+ 2.05916f, 0.08978f, 5.30984f, 2.66243f, -1.62740f, 0.70018f, 1.96403f,
+ -4.97152f, -0.05425f, -3.84474f, -1.28006f, 3.47490f, -0.08373f, 0.00225f,
+ -1.40692f, -0.27569f, -0.30253f, 0.77377f, -0.67636f, -0.26379f, 1.82348f,
+ 0.66120f, 0.61119f, -1.42293f, 0.32676f,
+};
+
+static const float av1_rect_partition_nn_weights_128_layer1[NUM_NODES *
+ LABEL_SIZE] = {
+ 1.53453f, -0.23707f, 7.88368f, 0.33340f, 0.97523f, 1.38538f, -0.16746f,
+ 4.42070f, 3.18678f, -5.03545f, -2.27029f, -3.75719f, -0.26850f, -4.93432f,
+ -8.75673f, 0.27398f, -5.77882f, -0.91616f, -2.62725f, -0.23961f, 0.31249f,
+ 3.32134f, 0.25375f, -0.00394f, 2.30213f, -0.14183f, 0.14544f, -1.42830f,
+ 1.31101f, 3.99389f, -0.00017f, -2.90184f, -2.11444f, 2.16734f, -3.05133f,
+ 0.39206f, 4.61489f, -2.88181f, -0.47745f, 2.86649f, -1.20621f, 3.70550f,
+ 1.58029f, -4.58731f, -2.29350f, -0.76930f, 5.19135f, -0.22521f, -5.08782f,
+ 2.17316f, 1.30563f, 0.16777f, -2.17767f, -2.09904f, 1.37001f, 0.25091f,
+ -1.76743f, 1.57940f, 0.30544f, -2.39895f, -0.08532f, -1.77122f, 1.84010f,
+ -0.88449f, 0.79299f, -1.35368f, -4.54110f, 0.02244f, -5.11580f, 1.60883f,
+ 0.29352f, -6.47042f, -1.81426f, 1.24013f, 0.90980f, 7.93977f, 2.12555f,
+ 5.24720f, 4.19508f, 0.21499f, 11.06045f, -0.74752f, 0.89396f, 0.26422f,
+ 1.72332f, -1.25113f, -1.71136f, 0.13676f, -0.07867f, -0.96929f, 0.19911f,
+ 3.58233f, -0.76470f, -2.24162f, -2.87465f, 3.18736f,
+};
+
+// Output-layer biases of the "_128" model, one per output label.  Sized with
+// LABEL_SIZE (rather than a bare 3) so the table stays in step with the
+// num_outputs field of the NN_CONFIG that points at it.
+static const float av1_rect_partition_nn_bias_128_layer1[LABEL_SIZE] = {
+  1.09014f,
+  -0.53317f,
+  -0.55668f,
+};
+
+// Network description for the "_128" rectangular-partition model:
+// FEATURE_SIZE inputs, a single hidden layer of NUM_NODES nodes, and
+// LABEL_SIZE outputs.  The two pointer lists give the weight tables and the
+// bias tables in layer order (layer0 first, then layer1).
+// NOTE(review): the "_128" suffix presumably names the block size this model
+// serves -- confirm against the code that selects the config.
+static const NN_CONFIG av1_rect_partition_nnconfig_128 = {
+ FEATURE_SIZE, // num_inputs
+ LABEL_SIZE, // num_outputs
+ 1, // num_hidden_layers
+ {
+ NUM_NODES,
+ }, // num_hidden_nodes
+ { av1_rect_partition_nn_weights_128_layer0,
+ av1_rect_partition_nn_weights_128_layer1 },
+ { av1_rect_partition_nn_bias_128_layer0,
+ av1_rect_partition_nn_bias_128_layer1 }
+};
+#undef FEATURE_SIZE
+#undef NUM_NODES
+#undef LABEL_SIZE
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PARTITION_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/pickcdef.c b/third_party/aom/av1/encoder/pickcdef.c
new file mode 100644
index 0000000000..6d154a7d22
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickcdef.c
@@ -0,0 +1,526 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <string.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "av1/common/cdef.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/reconinter.h"
+#include "av1/encoder/encoder.h"
+
+/* Size of the reduced primary-strength set; also the length of priconv[]. */
+#define REDUCED_PRI_STRENGTHS 8
+/* Number of strength candidates the search helpers in this file scan when
+   their fast flag is set. */
+#define REDUCED_TOTAL_STRENGTHS (REDUCED_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+/* Number of strength candidates in the full search; also the row length of
+   the per-superblock mse tables. */
+#define TOTAL_STRENGTHS (CDEF_PRI_STRENGTHS * CDEF_SEC_STRENGTHS)
+
+/* NOTE(review): presumably maps an index in the reduced primary-strength set
+   to the full-range primary strength it stands for -- confirm against the
+   code that indexes this table (outside this chunk). */
+static int priconv[REDUCED_PRI_STRENGTHS] = { 0, 1, 2, 3, 5, 7, 10, 13 };
+
+/* One greedy step of the strength search.
+   lev[0 .. nb_strengths-1] holds the options picked so far.  For every
+   candidate strength this totals, over all sb_count superblocks, the smaller
+   of "lowest error among the picked options" and "error of the candidate".
+   The candidate with the lowest total is written to lev[nb_strengths] and
+   that total is returned.  With fast set, only the first
+   REDUCED_TOTAL_STRENGTHS candidates are examined. */
+static uint64_t search_one(int *lev, int nb_strengths,
+                           uint64_t mse[][TOTAL_STRENGTHS], int sb_count,
+                           int fast) {
+  /* Marker meaning "no error value seen yet". */
+  const uint64_t no_mse = (uint64_t)1 << 63;
+  const int num_candidates = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+  uint64_t total[TOTAL_STRENGTHS];
+  uint64_t lowest_total = no_mse;
+  int winner = 0;
+  int sb, cand;
+  memset(total, 0, sizeof(total));
+  for (sb = 0; sb < sb_count; sb++) {
+    const uint64_t *sb_mse = mse[sb];
+    uint64_t picked_min = no_mse;
+    int sel;
+    /* Lowest error this superblock reaches with the options already picked. */
+    for (sel = 0; sel < nb_strengths; sel++) {
+      const uint64_t v = sb_mse[lev[sel]];
+      if (v < picked_min) picked_min = v;
+    }
+    /* Error this superblock would reach if each candidate were added. */
+    for (cand = 0; cand < num_candidates; cand++) {
+      total[cand] += sb_mse[cand] < picked_min ? sb_mse[cand] : picked_min;
+    }
+  }
+  /* First candidate with the strictly lowest total wins. */
+  for (cand = 0; cand < num_candidates; cand++) {
+    if (total[cand] < lowest_total) {
+      lowest_total = total[cand];
+      winner = cand;
+    }
+  }
+  lev[nb_strengths] = winner;
+  return lowest_total;
+}
+
+/* One greedy step of the joint luma+chroma strength search.
+   (lev0[g], lev1[g]) for g < nb_strengths are the pairs picked so far;
+   mse[0] and mse[1] are the two per-superblock error tables those indices
+   refer to.  Every candidate pair (j, k) is scored by totalling, over all
+   sb_count superblocks, the smaller of "lowest summed error among the picked
+   pairs" and "mse[0][sb][j] + mse[1][sb][k]".  The best pair is written to
+   lev0[nb_strengths] / lev1[nb_strengths] and its total is returned.  With
+   fast set, j and k only range over the first REDUCED_TOTAL_STRENGTHS
+   entries. */
+static uint64_t search_one_dual(int *lev0, int *lev1, int nb_strengths,
+                                uint64_t (**mse)[TOTAL_STRENGTHS], int sb_count,
+                                int fast) {
+  /* Marker meaning "no error value seen yet". */
+  const uint64_t no_mse = (uint64_t)1 << 63;
+  const int num_candidates = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+  uint64_t total[TOTAL_STRENGTHS][TOTAL_STRENGTHS];
+  uint64_t lowest_total = no_mse;
+  int winner0 = 0;
+  int winner1 = 0;
+  int sb, c0, c1;
+  memset(total, 0, sizeof(total));
+  for (sb = 0; sb < sb_count; sb++) {
+    const uint64_t *mse0 = mse[0][sb];
+    const uint64_t *mse1 = mse[1][sb];
+    uint64_t picked_min = no_mse;
+    int sel;
+    /* Lowest summed error this superblock reaches with the picked pairs. */
+    for (sel = 0; sel < nb_strengths; sel++) {
+      const uint64_t pair = mse0[lev0[sel]] + mse1[lev1[sel]];
+      if (pair < picked_min) picked_min = pair;
+    }
+    /* Error this superblock would reach if each candidate pair were added. */
+    for (c0 = 0; c0 < num_candidates; c0++) {
+      for (c1 = 0; c1 < num_candidates; c1++) {
+        const uint64_t pair = mse0[c0] + mse1[c1];
+        total[c0][c1] += pair < picked_min ? pair : picked_min;
+      }
+    }
+  }
+  /* First pair (in j-major order) with the strictly lowest total wins. */
+  for (c0 = 0; c0 < num_candidates; c0++) {
+    for (c1 = 0; c1 < num_candidates; c1++) {
+      if (total[c0][c1] < lowest_total) {
+        lowest_total = total[c0][c1];
+        winner0 = c0;
+        winner1 = c1;
+      }
+    }
+  }
+  lev0[nb_strengths] = winner0;
+  lev1[nb_strengths] = winner1;
+  return lowest_total;
+}
+
+/* Choose nb_strengths strengths that together minimise the total error.
+   best_lev receives the chosen strengths; the return value is the total
+   reported by the last search_one() call.  With fast set, only the greedy
+   pass runs and the refinement pass is skipped. */
+static uint64_t joint_strength_search(int *best_lev, int nb_strengths,
+                                      uint64_t mse[][TOTAL_STRENGTHS],
+                                      int sb_count, int fast) {
+  uint64_t total = (uint64_t)1 << 63;
+  int step;
+  /* Greedy pass: fill best_lev one slot at a time. */
+  for (step = 0; step < nb_strengths; step++) {
+    total = search_one(best_lev, step, mse, sb_count, fast);
+  }
+  if (fast) return total;
+  /* Refinement: repeatedly drop the oldest choice (shift everything down by
+     one) and re-search the freed last slot given the remaining choices. */
+  for (step = 0; step < 4 * nb_strengths; step++) {
+    int k;
+    for (k = 1; k < nb_strengths; k++) best_lev[k - 1] = best_lev[k];
+    total = search_one(best_lev, nb_strengths - 1, mse, sb_count, fast);
+  }
+  return total;
+}
+
+/* Choose nb_strengths luma+chroma strength pairs that together minimise the
+   total error.  best_lev0 / best_lev1 receive the chosen pairs; the return
+   value is the total reported by the last search_one_dual() call.  Unlike
+   joint_strength_search(), the refinement pass here runs whether or not fast
+   is set; fast is only forwarded to search_one_dual(). */
+static uint64_t joint_strength_search_dual(int *best_lev0, int *best_lev1,
+                                           int nb_strengths,
+                                           uint64_t (**mse)[TOTAL_STRENGTHS],
+                                           int sb_count, int fast) {
+  const int last = nb_strengths - 1;
+  uint64_t total = (uint64_t)1 << 63;
+  int pass;
+  /* Greedy pass: fill the pair lists one slot at a time. */
+  for (pass = 0; pass < nb_strengths; pass++) {
+    total = search_one_dual(best_lev0, best_lev1, pass, mse, sb_count, fast);
+  }
+  /* Refinement, 4 * nb_strengths rounds: drop the oldest pair (shift both
+     lists down by one) and re-search the freed last slot. */
+  for (pass = 4 * nb_strengths; pass > 0; pass--) {
+    int k;
+    for (k = 0; k < last; k++) {
+      best_lev0[k] = best_lev0[k + 1];
+      best_lev1[k] = best_lev1[k + 1];
+    }
+    total = search_one_dual(best_lev0, best_lev1, last, mse, sb_count, fast);
+  }
+  return total;
+}
+
+/* FIXME: SSE-optimize this. */
+/* Copy a vsize-row by hsize-column rectangle of 16-bit samples.  The
+   rectangle is read from src starting at row src_voffset, column src_hoffset
+   (row pitch sstride) and written to the top-left of dst (row pitch
+   dstride). */
+static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
+                         int src_voffset, int src_hoffset, int sstride,
+                         int vsize, int hsize) {
+  const uint16_t *origin = src + (src_voffset * sstride + src_hoffset);
+  int row;
+  for (row = 0; row < vsize; row++) {
+    const uint16_t *in = origin + row * sstride;
+    uint16_t *out = dst + row * dstride;
+    int col;
+    for (col = 0; col < hsize; col++) out[col] = in[col];
+  }
+}
+
+/* Variance-weighted distortion between two 8x8 blocks of 16-bit samples.
+   The sum of squared differences (d_sq + s_sq - 2 * cross) is scaled by
+     0.5 * (s_var + d_var + C1) / sqrt(C2 + s_var * d_var)
+   where s_var / d_var are each block's sum of squares minus its rounded
+   (sum * sum) / 64, C1 = 400 << (2 * coeff_shift) and
+   C2 = 20000 << (4 * coeff_shift).  The result is rounded to nearest. */
+static INLINE uint64_t dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                      int sstride, int coeff_shift) {
+  uint64_t s_sum = 0;
+  uint64_t d_sum = 0;
+  uint64_t s_sq = 0;
+  uint64_t d_sq = 0;
+  uint64_t cross = 0;
+  uint64_t s_var;
+  uint64_t d_var;
+  int row, col;
+  for (row = 0; row < 8; row++) {
+    const uint16_t *s_row = src + row * sstride;
+    const uint16_t *d_row = dst + row * dstride;
+    for (col = 0; col < 8; col++) {
+      const uint16_t s = s_row[col];
+      const uint16_t d = d_row[col];
+      s_sum += s;
+      d_sum += d;
+      s_sq += s * s;
+      d_sq += d * d;
+      cross += s * d;
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  s_var = s_sq - ((s_sum * s_sum + 32) >> 6);
+  d_var = d_sq - ((d_sum * d_sum + 32) >> 6);
+  return (uint64_t)floor(
+      .5 + (d_sq + s_sq - 2 * cross) * .5 *
+               (s_var + d_var + (400 << 2 * coeff_shift)) /
+               (sqrt((20000 << 4 * coeff_shift) + s_var * (double)d_var)));
+}
+
+/* Sum of squared sample differences between an 8x8 block of dst (row pitch
+   dstride) and an 8x8 block of src (row pitch sstride).  Despite the name,
+   no division by the sample count is performed. */
+static INLINE uint64_t mse_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                     int sstride) {
+  uint64_t sse = 0;
+  int row;
+  for (row = 0; row < 8; row++) {
+    const uint16_t *d_row = dst + row * dstride;
+    const uint16_t *s_row = src + row * sstride;
+    int col;
+    for (col = 0; col < 8; col++) {
+      const int diff = d_row[col] - s_row[col];
+      sse += diff * diff;
+    }
+  }
+  return sse;
+}
+
+/* Sum of squared sample differences between a 4x4 block of dst (row pitch
+   dstride) and a 4x4 block of src (row pitch sstride).  Despite the name,
+   no division by the sample count is performed. */
+static INLINE uint64_t mse_4x4_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                     int sstride) {
+  uint64_t sse = 0;
+  int row;
+  for (row = 0; row < 4; row++) {
+    const uint16_t *d_row = dst + row * dstride;
+    const uint16_t *s_row = src + row * sstride;
+    int col;
+    for (col = 0; col < 4; col++) {
+      const int diff = d_row[col] - s_row[col];
+      sse += diff * diff;
+    }
+  }
+  return sse;
+}
+
+/* Compute MSE only on the blocks we filtered.
+   dlist[0 .. cdef_count-1] gives each block's (by, bx) position inside dst
+   (row pitch dstride), in units of the block size selected by bsize.  src
+   holds the same blocks packed back to back, so block n starts at
+   n * (block width * block height).  For BLOCK_8X8 on plane 0 (pli == 0) the
+   variance-weighted dist_8x8_16bit() is used; every other case sums plain
+   squared differences.  The total is scaled down by 2 * coeff_shift bits
+   before being returned. */
+uint64_t compute_cdef_dist(uint16_t *dst, int dstride, uint16_t *src,
+                           cdef_list *dlist, int cdef_count, BLOCK_SIZE bsize,
+                           int coeff_shift, int pli) {
+  uint64_t total = 0;
+  int n;
+  switch (bsize) {
+    case BLOCK_8X8:
+      for (n = 0; n < cdef_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        uint16_t *d = &dst[(by << 3) * dstride + (bx << 3)];
+        uint16_t *s = &src[n << (3 + 3)];
+        if (pli == 0) {
+          total += dist_8x8_16bit(d, dstride, s, 8, coeff_shift);
+        } else {
+          total += mse_8x8_16bit(d, dstride, s, 8);
+        }
+      }
+      break;
+    case BLOCK_4X8:
+      /* 4 wide, 8 tall: two stacked 4x4 halves, packed with pitch 4. */
+      for (n = 0; n < cdef_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
+                               &src[n << (3 + 2)], 4);
+        total += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
+                               dstride, &src[(n << (3 + 2)) + 4 * 4], 4);
+      }
+      break;
+    case BLOCK_8X4:
+      /* 8 wide, 4 tall: two side-by-side 4x4 halves, packed with pitch 8. */
+      for (n = 0; n < cdef_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
+                               &src[n << (2 + 3)], 8);
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
+                               dstride, &src[(n << (2 + 3)) + 4], 8);
+      }
+      break;
+    default:
+      assert(bsize == BLOCK_4X4);
+      for (n = 0; n < cdef_count; n++) {
+        const int by = dlist[n].by;
+        const int bx = dlist[n].bx;
+        total += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
+                               &src[n << (2 + 2)], 4);
+      }
+      break;
+  }
+  return total >> 2 * coeff_shift;
+}
+
+/* Searches the CDEF strengths for one frame.
+ *
+ * Both the frame under construction (read through xd->plane after
+ * av1_setup_dst_planes()) and ref are first copied into 16-bit working
+ * buffers. For each filter block that is not entirely skipped, every
+ * candidate strength is applied with cdef_filter_fb() and its distortion
+ * against ref is recorded: plane 0 in mse[0], planes 1 and 2 summed in
+ * mse[1]. The number of signalled strengths (1, 2, 4 or 8) is then chosen by
+ * minimising distortion plus a lambda-weighted signalling cost.
+ *
+ * Results are written to cm (cdef_bits, nb_cdef_strengths, cdef_strengths,
+ * cdef_uv_strengths, cdef_pri_damping, cdef_sec_damping) and to the
+ * cdef_strength field of the mode info at the origin of each searched filter
+ * block.
+ *
+ * fast: when non-zero only REDUCED_TOTAL_STRENGTHS candidates are tried and
+ * primary strengths are mapped through priconv[].
+ *
+ * NOTE(review): the aom_malloc()/aom_memalign() results are used without a
+ * NULL check, and dlist is a function-level static, so concurrent calls would
+ * share it. */
+void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
+                     AV1_COMMON *cm, MACROBLOCKD *xd, int fast) {
+  int r, c;
+  int fbr, fbc;
+  uint16_t *src[3];
+  uint16_t *ref_coeff[3];
+  static cdef_list dlist[MI_SIZE_128X128 * MI_SIZE_128X128];
+  int dir[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int var[CDEF_NBLOCKS][CDEF_NBLOCKS] = { { 0 } };
+  int stride[3];
+  int bsize[3];
+  int mi_wide_l2[3];
+  int mi_high_l2[3];
+  int xdec[3];
+  int ydec[3];
+  int pli;
+  int cdef_count;
+  int coeff_shift = AOMMAX(cm->seq_params.bit_depth - 8, 0);
+  uint64_t best_tot_mse = (uint64_t)1 << 63;
+  uint64_t tot_mse;
+  int sb_count;
+  int nvfb = (cm->mi_rows + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  int nhfb = (cm->mi_cols + MI_SIZE_64X64 - 1) / MI_SIZE_64X64;
+  /* mi_grid_visible index of each searched filter block, in search order. */
+  int *sb_index = aom_malloc(nvfb * nhfb * sizeof(*sb_index));
+  /* mse[0]: plane 0; mse[1]: planes 1 and 2 summed. One row of
+   * TOTAL_STRENGTHS entries per searched filter block. */
+  uint64_t(*mse[2])[TOTAL_STRENGTHS];
+  int pri_damping = 3 + (cm->base_qindex >> 6);
+  int sec_damping = 3 + (cm->base_qindex >> 6);
+  int i;
+  int nb_strengths;
+  int nb_strength_bits;
+  int quantizer;
+  double lambda;
+  const int num_planes = av1_num_planes(cm);
+  const int total_strengths = fast ? REDUCED_TOTAL_STRENGTHS : TOTAL_STRENGTHS;
+  DECLARE_ALIGNED(32, uint16_t, inbuf[CDEF_INBUF_SIZE]);
+  uint16_t *in;
+  DECLARE_ALIGNED(32, uint16_t, tmp_dst[1 << (MAX_SB_SIZE_LOG2 * 2)]);
+  quantizer = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth) >>
+              (cm->seq_params.bit_depth - 8);
+  lambda = .12 * quantizer * quantizer / 256.;
+
+  av1_setup_dst_planes(xd->plane, cm->seq_params.sb_size, frame, 0, 0, 0,
+                       num_planes);
+  mse[0] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+  mse[1] = aom_malloc(sizeof(**mse) * nvfb * nhfb);
+  for (pli = 0; pli < num_planes; pli++) {
+    uint8_t *ref_buffer;
+    int ref_stride;
+    switch (pli) {
+      case 0:
+        ref_buffer = ref->y_buffer;
+        ref_stride = ref->y_stride;
+        break;
+      case 1:
+        ref_buffer = ref->u_buffer;
+        ref_stride = ref->uv_stride;
+        break;
+      case 2:
+        ref_buffer = ref->v_buffer;
+        ref_stride = ref->uv_stride;
+        break;
+    }
+    /* One uint16_t per luma-resolution sample. The element size is that of
+     * a sample (sizeof(**src)), not of the pointer stored in src[]. */
+    src[pli] = aom_memalign(
+        32, sizeof(**src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+    ref_coeff[pli] = aom_memalign(
+        32,
+        sizeof(**ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
+    xdec[pli] = xd->plane[pli].subsampling_x;
+    ydec[pli] = xd->plane[pli].subsampling_y;
+    bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
+                           : (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
+    stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
+    mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
+    mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
+
+    const int frame_height =
+        (cm->mi_rows * MI_SIZE) >> xd->plane[pli].subsampling_y;
+    const int frame_width =
+        (cm->mi_cols * MI_SIZE) >> xd->plane[pli].subsampling_x;
+
+    /* Widen both frames to 16 bits so one code path serves every depth. */
+    for (r = 0; r < frame_height; ++r) {
+      for (c = 0; c < frame_width; ++c) {
+        if (cm->seq_params.use_highbitdepth) {
+          src[pli][r * stride[pli] + c] = CONVERT_TO_SHORTPTR(
+              xd->plane[pli].dst.buf)[r * xd->plane[pli].dst.stride + c];
+          ref_coeff[pli][r * stride[pli] + c] =
+              CONVERT_TO_SHORTPTR(ref_buffer)[r * ref_stride + c];
+        } else {
+          src[pli][r * stride[pli] + c] =
+              xd->plane[pli].dst.buf[r * xd->plane[pli].dst.stride + c];
+          ref_coeff[pli][r * stride[pli] + c] = ref_buffer[r * ref_stride + c];
+        }
+      }
+    }
+  }
+  in = inbuf + CDEF_VBORDER * CDEF_BSTRIDE + CDEF_HBORDER;
+  sb_count = 0;
+  for (fbr = 0; fbr < nvfb; ++fbr) {
+    for (fbc = 0; fbc < nhfb; ++fbc) {
+      int nvb, nhb;
+      int gi;
+      int dirinit = 0;
+      nhb = AOMMIN(MI_SIZE_64X64, cm->mi_cols - MI_SIZE_64X64 * fbc);
+      nvb = AOMMIN(MI_SIZE_64X64, cm->mi_rows - MI_SIZE_64X64 * fbr);
+      int hb_step = 1;
+      int vb_step = 1;
+      BLOCK_SIZE bs = BLOCK_64X64;
+      MB_MODE_INFO *const mbmi =
+          cm->mi_grid_visible[MI_SIZE_64X64 * fbr * cm->mi_stride +
+                              MI_SIZE_64X64 * fbc];
+      /* Odd filter-block positions whose mode info reports a 128-wide or
+       * 128-tall block are skipped; the enlarged search below is driven
+       * from the position that does not hit this test. */
+      if (((fbc & 1) &&
+           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64)) ||
+          ((fbr & 1) &&
+           (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_64X128)))
+        continue;
+      if (mbmi->sb_type == BLOCK_128X128 || mbmi->sb_type == BLOCK_128X64 ||
+          mbmi->sb_type == BLOCK_64X128)
+        bs = mbmi->sb_type;
+      if (bs == BLOCK_128X128 || bs == BLOCK_128X64) {
+        nhb = AOMMIN(MI_SIZE_128X128, cm->mi_cols - MI_SIZE_64X64 * fbc);
+        hb_step = 2;
+      }
+      if (bs == BLOCK_128X128 || bs == BLOCK_64X128) {
+        nvb = AOMMIN(MI_SIZE_128X128, cm->mi_rows - MI_SIZE_64X64 * fbr);
+        vb_step = 2;
+      }
+      // No filtering if the entire filter block is skipped
+      if (sb_all_skip(cm, fbr * MI_SIZE_64X64, fbc * MI_SIZE_64X64)) continue;
+      cdef_count = sb_compute_cdef_list(cm, fbr * MI_SIZE_64X64,
+                                        fbc * MI_SIZE_64X64, dlist, bs);
+      for (pli = 0; pli < num_planes; pli++) {
+        for (i = 0; i < CDEF_INBUF_SIZE; i++) inbuf[i] = CDEF_VERY_LARGE;
+        for (gi = 0; gi < total_strengths; gi++) {
+          int threshold;
+          uint64_t curr_mse;
+          int sec_strength;
+          threshold = gi / CDEF_SEC_STRENGTHS;
+          if (fast) threshold = priconv[threshold];
+          /* We avoid filtering the pixels for which some of the pixels to
+             average are outside the frame. We could change the filter
+             instead, but it would add special cases for any future
+             vectorization. */
+          int yoff = CDEF_VBORDER * (fbr != 0);
+          int xoff = CDEF_HBORDER * (fbc != 0);
+          int ysize = (nvb << mi_high_l2[pli]) +
+                      CDEF_VBORDER * (fbr + vb_step < nvfb) + yoff;
+          int xsize = (nhb << mi_wide_l2[pli]) +
+                      CDEF_HBORDER * (fbc + hb_step < nhfb) + xoff;
+          sec_strength = gi % CDEF_SEC_STRENGTHS;
+          copy_sb16_16(&in[(-yoff * CDEF_BSTRIDE - xoff)], CDEF_BSTRIDE,
+                       src[pli],
+                       (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) - yoff,
+                       (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]) - xoff,
+                       stride[pli], ysize, xsize);
+          cdef_filter_fb(NULL, tmp_dst, CDEF_BSTRIDE, in, xdec[pli], ydec[pli],
+                         dir, &dirinit, var, pli, dlist, cdef_count, threshold,
+                         sec_strength + (sec_strength == 3), pri_damping,
+                         sec_damping, coeff_shift);
+          curr_mse = compute_cdef_dist(
+              ref_coeff[pli] +
+                  (fbr * MI_SIZE_64X64 << mi_high_l2[pli]) * stride[pli] +
+                  (fbc * MI_SIZE_64X64 << mi_wide_l2[pli]),
+              stride[pli], tmp_dst, dlist, cdef_count, bsize[pli], coeff_shift,
+              pli);
+          /* Plane 1 initialises mse[1]; plane 2 is added on top of it. */
+          if (pli < 2)
+            mse[pli][sb_count][gi] = curr_mse;
+          else
+            mse[1][sb_count][gi] += curr_mse;
+        }
+      }
+      /* Remember where this filter block lives; it does not depend on the
+       * plane or the strength, so it is stored once per block. */
+      sb_index[sb_count] =
+          MI_SIZE_64X64 * fbr * cm->mi_stride + MI_SIZE_64X64 * fbc;
+      sb_count++;
+    }
+  }
+  nb_strength_bits = 0;
+  /* Search for different number of signalling bits. */
+  for (i = 0; i <= 3; i++) {
+    int j;
+    int best_lev0[CDEF_MAX_STRENGTHS];
+    int best_lev1[CDEF_MAX_STRENGTHS] = { 0 };
+    nb_strengths = 1 << i;
+    if (num_planes >= 3)
+      tot_mse = joint_strength_search_dual(best_lev0, best_lev1, nb_strengths,
+                                           mse, sb_count, fast);
+    else
+      tot_mse = joint_strength_search(best_lev0, nb_strengths, mse[0], sb_count,
+                                      fast);
+    /* Count superblock signalling cost. */
+    tot_mse += (uint64_t)(sb_count * lambda * i);
+    /* Count header signalling cost. */
+    tot_mse += (uint64_t)(nb_strengths * lambda * CDEF_STRENGTH_BITS);
+    if (tot_mse < best_tot_mse) {
+      best_tot_mse = tot_mse;
+      nb_strength_bits = i;
+      for (j = 0; j < 1 << nb_strength_bits; j++) {
+        cm->cdef_strengths[j] = best_lev0[j];
+        cm->cdef_uv_strengths[j] = best_lev1[j];
+      }
+    }
+  }
+  nb_strengths = 1 << nb_strength_bits;
+
+  cm->cdef_bits = nb_strength_bits;
+  cm->nb_cdef_strengths = nb_strengths;
+  /* Pick, for every searched filter block, the signalled strength pair with
+   * the lowest recorded distortion. */
+  for (i = 0; i < sb_count; i++) {
+    int gi;
+    int best_gi;
+    uint64_t best_mse = (uint64_t)1 << 63;
+    best_gi = 0;
+    for (gi = 0; gi < cm->nb_cdef_strengths; gi++) {
+      uint64_t curr = mse[0][i][cm->cdef_strengths[gi]];
+      if (num_planes >= 3) curr += mse[1][i][cm->cdef_uv_strengths[gi]];
+      if (curr < best_mse) {
+        best_gi = gi;
+        best_mse = curr;
+      }
+    }
+    cm->mi_grid_visible[sb_index[i]]->cdef_strength = best_gi;
+  }
+
+  if (fast) {
+    /* Map the reduced-set primary strengths through priconv[] before they
+     * are stored; the secondary part is kept as is. */
+    for (int j = 0; j < nb_strengths; j++) {
+      cm->cdef_strengths[j] =
+          priconv[cm->cdef_strengths[j] / CDEF_SEC_STRENGTHS] *
+              CDEF_SEC_STRENGTHS +
+          (cm->cdef_strengths[j] % CDEF_SEC_STRENGTHS);
+      cm->cdef_uv_strengths[j] =
+          priconv[cm->cdef_uv_strengths[j] / CDEF_SEC_STRENGTHS] *
+              CDEF_SEC_STRENGTHS +
+          (cm->cdef_uv_strengths[j] % CDEF_SEC_STRENGTHS);
+    }
+  }
+  cm->cdef_pri_damping = pri_damping;
+  cm->cdef_sec_damping = sec_damping;
+  aom_free(mse[0]);
+  aom_free(mse[1]);
+  for (pli = 0; pli < num_planes; pli++) {
+    aom_free(src[pli]);
+    aom_free(ref_coeff[pli]);
+  }
+  aom_free(sb_index);
+}
diff --git a/third_party/aom/av1/encoder/picklpf.c b/third_party/aom/av1/encoder/picklpf.c
new file mode 100644
index 0000000000..c5508e25c4
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_scale_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/av1_loopfilter.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/picklpf.h"
+
+// Copies a single plane of src_bc into dst_bc using the matching
+// aom_yv12_copy_{y,u,v} helper for plane 0, 1 or 2. Any other plane index
+// trips the assertion and copies nothing.
+static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc,
+                            YV12_BUFFER_CONFIG *dst_bc, int plane) {
+  if (plane == 0) {
+    aom_yv12_copy_y(src_bc, dst_bc);
+  } else if (plane == 1) {
+    aom_yv12_copy_u(src_bc, dst_bc);
+  } else if (plane == 2) {
+    aom_yv12_copy_v(src_bc, dst_bc);
+  } else {
+    assert(plane >= 0 && plane <= 2);
+  }
+}
+
+// Returns the largest loop-filter level the search may use. The limit is
+// MAX_LOOP_FILTER, lowered to three quarters of it only when
+// cpi->oxcf.pass is 2 and section_intra_rating exceeds 8.
+int av1_get_max_filter_level(const AV1_COMP *cpi) {
+  if (cpi->oxcf.pass != 2) return MAX_LOOP_FILTER;
+  if (cpi->twopass.section_intra_rating > 8) return MAX_LOOP_FILTER * 3 / 4;
+  return MAX_LOOP_FILTER;
+}
+
+// Filters one plane of cm->frame_to_show at filt_level and returns the
+// resulting sum of squared errors against sd. The plane is restored from
+// cpi->last_frame_uf before returning, so the caller must have saved the
+// unfiltered plane there.
+//
+// For plane 0, dir selects which of the two luma levels is being tried:
+// dir == 0 keeps the current filter_level[1], dir == 1 keeps the current
+// filter_level[0], and any other value applies filt_level to both.
+// The tried level stays in cm->lf after the call.
+static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
+                                AV1_COMP *const cpi, int filt_level,
+                                int partial_frame, int plane, int dir) {
+  AV1_COMMON *const cm = &cpi->common;
+
+  assert(plane >= 0 && plane <= 2);
+
+  // set base filters for use of get_filter_level when in DELTA_Q_LF mode
+  if (plane == 0) {
+    // Read both current levels before either one is overwritten.
+    const int level0 = (dir == 1) ? cm->lf.filter_level[0] : filt_level;
+    const int level1 = (dir == 0) ? cm->lf.filter_level[1] : filt_level;
+    cm->lf.filter_level[0] = level0;
+    cm->lf.filter_level[1] = level1;
+  } else if (plane == 1) {
+    cm->lf.filter_level_u = filt_level;
+  } else if (plane == 2) {
+    cm->lf.filter_level_v = filt_level;
+  }
+
+  // TODO(any): please enable multi-thread and remove the flag when loop
+  // filter mask is compatible with multi-thread.
+#if LOOP_FILTER_BITMASK
+  av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, 0, plane,
+                        plane + 1, partial_frame);
+#else
+  if (cpi->num_workers > 1) {
+    av1_loop_filter_frame_mt(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+                             plane + 1, partial_frame, cpi->workers,
+                             cpi->num_workers, &cpi->lf_row_sync);
+  } else {
+    av1_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, plane,
+                          plane + 1, partial_frame);
+  }
+#endif
+
+  const int64_t plane_sse = aom_get_sse_plane(
+      sd, cm->frame_to_show, plane, cm->seq_params.use_highbitdepth);
+
+  // Re-instate the unfiltered frame
+  yv12_copy_plane(&cpi->last_frame_uf, cm->frame_to_show, plane);
+
+  return plane_sse;
+}
+
+// Step search for the loop-filter level of one plane that gives the lowest
+// SSE against sd. The search starts at the level taken from
+// last_frame_filter_level (index dir for plane 0, 2 for plane 1, 3 for
+// plane 2), probes one step below and above the current centre, moves to
+// whichever probe wins, and halves the step whenever the centre stays put.
+// A bias term makes a lower level win near-ties and a higher level win only
+// clear improvements. Each level is filtered at most once; results are
+// cached in ss_err[], where a negative entry means "not tried yet".
+//
+// Returns the chosen level, or 0 for an invalid plane. If best_cost_ret is
+// non-NULL it receives RDCOST_DBL(rdmult, 0, error at the chosen level).
+static int search_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                               int partial_frame,
+                               const int *last_frame_filter_level,
+                               double *best_cost_ret, int plane, int dir) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int lvl_min = 0;
+  const int lvl_max = av1_get_max_filter_level(cpi);
+  MACROBLOCK *x = &cpi->td.mb;
+
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int start_lvl;
+  if (plane == 0) {
+    start_lvl = last_frame_filter_level[dir];
+  } else if (plane == 1) {
+    start_lvl = last_frame_filter_level[2];
+  } else if (plane == 2) {
+    start_lvl = last_frame_filter_level[3];
+  } else {
+    assert(plane >= 0 && plane <= 2);
+    return 0;
+  }
+  int centre = clamp(start_lvl, lvl_min, lvl_max);
+  int step = centre < 16 ? 4 : centre / 4;
+
+  // Sum squared error at each filter level; every byte set to 0xFF makes
+  // each entry -1.
+  int64_t ss_err[MAX_LOOP_FILTER + 1];
+  memset(ss_err, 0xFF, sizeof(ss_err));
+
+  // Save the unfiltered plane so each trial can be undone.
+  yv12_copy_plane(cm->frame_to_show, &cpi->last_frame_uf, plane);
+  int64_t best_err =
+      try_filter_frame(sd, cpi, centre, partial_frame, plane, dir);
+  int best_lvl = centre;
+  ss_err[centre] = best_err;
+
+  // -1: last move went down, 1: last move went up, 0: probe both sides.
+  int direction = 0;
+  while (step > 0) {
+    const int probe_hi = AOMMIN(centre + step, lvl_max);
+    const int probe_lo = AOMMAX(centre - step, lvl_min);
+
+    // Bias against raising loop filter in favor of lowering it.
+    int64_t bias = (best_err >> (15 - (centre / 8))) * step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // yx, bias less for large block size
+    if (cm->tx_mode != ONLY_4X4) bias >>= 1;
+
+    if (direction <= 0 && probe_lo != centre) {
+      // Get Low filter error score
+      if (ss_err[probe_lo] < 0) {
+        ss_err[probe_lo] =
+            try_filter_frame(sd, cpi, probe_lo, partial_frame, plane, dir);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if (ss_err[probe_lo] < (best_err + bias)) {
+        // Was it actually better than the previous best?
+        if (ss_err[probe_lo] < best_err) best_err = ss_err[probe_lo];
+        best_lvl = probe_lo;
+      }
+    }
+
+    // Now look at the higher level
+    if (direction >= 0 && probe_hi != centre) {
+      if (ss_err[probe_hi] < 0) {
+        ss_err[probe_hi] =
+            try_filter_frame(sd, cpi, probe_hi, partial_frame, plane, dir);
+      }
+      // If value is significantly better than previous best, bias added against
+      // raising filter value
+      if (ss_err[probe_hi] < (best_err - bias)) {
+        best_err = ss_err[probe_hi];
+        best_lvl = probe_hi;
+      }
+    }
+
+    // Half the step distance if the best filter value was the same as last time
+    if (best_lvl == centre) {
+      step /= 2;
+      direction = 0;
+    } else {
+      direction = (best_lvl < centre) ? -1 : 1;
+      centre = best_lvl;
+    }
+  }
+
+  // Update best error
+  best_err = ss_err[best_lvl];
+
+  if (best_cost_ret) *best_cost_ret = RDCOST_DBL(x->rdmult, 0, best_err);
+  return best_lvl;
+}
+
+// Chooses the loop-filter levels stored in cpi->common.lf.
+//
+//  - LPF_PICK_MINIMAL_LPF: both luma levels are set to 0.
+//  - method >= LPF_PICK_FROM_Q: one level is estimated from the AC quantizer
+//    and written to both luma levels and to the U and V levels.
+//  - otherwise: the levels are searched with search_filter_level(), on a
+//    partial frame when method is LPF_PICK_FROM_SUBIMAGE. Luma is searched
+//    jointly first and then per direction; U and V are searched only when
+//    the stream has more than one plane.
+//
+// sharpness_level is reset to 0 and the macroblock rdmult is refreshed from
+// cpi->rd.RDMULT in every case.
+void av1_pick_filter_level(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi,
+                           LPF_PICK_METHOD method) {
+  AV1_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+  const int num_planes = av1_num_planes(cm);
+
+  lf->sharpness_level = 0;
+  cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+  if (method == LPF_PICK_MINIMAL_LPF) {
+    lf->filter_level[0] = 0;
+    lf->filter_level[1] = 0;
+    return;
+  }
+
+  if (method >= LPF_PICK_FROM_Q) {
+    const int lvl_min = 0;
+    const int lvl_max = av1_get_max_filter_level(cpi);
+    const int q = av1_ac_quant_Q3(cm->base_qindex, 0, cm->seq_params.bit_depth);
+    // These values were determined by linear fitting the result of the
+    // searched level for 8 bit depth:
+    // Keyframes: filt_guess = q * 0.06699 - 1.60817
+    // Other frames: filt_guess = q * 0.02295 + 2.48225
+    //
+    // And high bit depth separately:
+    // filt_guess = q * 0.316206 + 3.87252
+    int filt_guess;
+    switch (cm->seq_params.bit_depth) {
+      case AOM_BITS_8:
+        if (cm->frame_type == KEY_FRAME) {
+          filt_guess = ROUND_POWER_OF_TWO(q * 17563 - 421574, 18);
+        } else {
+          filt_guess = ROUND_POWER_OF_TWO(q * 6017 + 650707, 18);
+        }
+        break;
+      case AOM_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case AOM_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 &&
+               "bit_depth should be AOM_BITS_8, AOM_BITS_10 "
+               "or AOM_BITS_12");
+        return;
+    }
+    if (cm->seq_params.bit_depth != AOM_BITS_8 && cm->frame_type == KEY_FRAME)
+      filt_guess -= 4;
+    // TODO(chengchen): retrain the model for Y, U, V filter levels
+    const int level = clamp(filt_guess, lvl_min, lvl_max);
+    lf->filter_level[0] = level;
+    lf->filter_level[1] = level;
+    lf->filter_level_u = level;
+    lf->filter_level_v = level;
+    return;
+  }
+
+  // Search-based methods. Snapshot the current levels as starting points.
+  const int partial_frame = (method == LPF_PICK_FROM_SUBIMAGE);
+  const int last_frame_filter_level[4] = { lf->filter_level[0],
+                                           lf->filter_level[1],
+                                           lf->filter_level_u,
+                                           lf->filter_level_v };
+
+  // Joint luma search first (dir == 2), then one direction at a time.
+  lf->filter_level[0] = lf->filter_level[1] = search_filter_level(
+      sd, cpi, partial_frame, last_frame_filter_level, NULL, 0, 2);
+  lf->filter_level[0] = search_filter_level(
+      sd, cpi, partial_frame, last_frame_filter_level, NULL, 0, 0);
+  lf->filter_level[1] = search_filter_level(
+      sd, cpi, partial_frame, last_frame_filter_level, NULL, 0, 1);
+
+  if (num_planes > 1) {
+    lf->filter_level_u = search_filter_level(
+        sd, cpi, partial_frame, last_frame_filter_level, NULL, 1, 0);
+    lf->filter_level_v = search_filter_level(
+        sd, cpi, partial_frame, last_frame_filter_level, NULL, 2, 0);
+  }
+}
diff --git a/third_party/aom/av1/encoder/picklpf.h b/third_party/aom/av1/encoder/picklpf.h
new file mode 100644
index 0000000000..357097ae1b
--- /dev/null
+++ b/third_party/aom/av1/encoder/picklpf.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PICKLPF_H_
+#define AOM_AV1_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+int av1_get_max_filter_level(const AV1_COMP *cpi);
+void av1_pick_filter_level(const struct yv12_buffer_config *sd,
+ struct AV1_COMP *cpi, LPF_PICK_METHOD method);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKLPF_H_
diff --git a/third_party/aom/av1/encoder/pickrst.c b/third_party/aom/av1/encoder/pickrst.c
new file mode 100644
index 0000000000..e7804f6b44
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.c
@@ -0,0 +1,1362 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "config/aom_scale_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/psnr.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/restoration.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/mathutils.h"
+#include "av1/encoder/picklpf.h"
+#include "av1/encoder/pickrst.h"
+
+// When set to RESTORE_WIENER or RESTORE_SGRPROJ only those are allowed.
+// When set to RESTORE_TYPES we allow switchable.
+static const RestorationType force_restore_type = RESTORE_TYPES;
+
+// Number of Wiener iterations
+#define NUM_WIENER_ITERS 5
+
+// Penalty factor for use of dual sgr
+#define DUAL_SGR_PENALTY_MULT 0.01
+
+const int frame_level_restore_bits[RESTORE_TYPES] = { 2, 2, 2, 2 };
+
+typedef int64_t (*sse_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+typedef int64_t (*sse_part_extractor_type)(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b,
+ int hstart, int width, int vstart,
+ int height);
+
+#define NUM_EXTRACTORS (3 * (1 + 1))  // 3 planes x (low + high bit depth)
+
+static const sse_part_extractor_type sse_part_extractors[NUM_EXTRACTORS] = {  // indexed by 3 * highbd + plane
+ aom_get_y_sse_part, aom_get_u_sse_part,  // entries 0-2: low bit depth Y, U, V
+ aom_get_v_sse_part, aom_highbd_get_y_sse_part,
+ aom_highbd_get_u_sse_part, aom_highbd_get_v_sse_part,  // entries 3-5: high bit depth Y, U, V
+};
+
+// Returns the sum of squared errors between src and dst for one plane,
+// restricted to the rectangle given by limits. highbd (0 or 1) selects the
+// high-bit-depth extractor for that plane.
+static int64_t sse_restoration_unit(const RestorationTileLimits *limits,
+                                    const YV12_BUFFER_CONFIG *src,
+                                    const YV12_BUFFER_CONFIG *dst, int plane,
+                                    int highbd) {
+  const int unit_width = limits->h_end - limits->h_start;
+  const int unit_height = limits->v_end - limits->v_start;
+  const sse_part_extractor_type extract_sse =
+      sse_part_extractors[3 * highbd + plane];
+  return extract_sse(src, dst, limits->h_start, unit_width, limits->v_start,
+                     unit_height);
+}
+
+typedef struct {  // Search results kept for a single restoration unit.
+ // The best coefficients found for this unit with Wiener and with Sgrproj restoration
+ WienerInfo wiener;
+ SgrprojInfo sgrproj;
+
+ // Sums of squared errors for this unit; presumably one entry per rtype -- confirm against the code that fills it.
+ int64_t sse[RESTORE_SWITCHABLE_TYPES];
+
+ // The rtype to use for this unit given a frame rtype as
+ // index. Indices: WIENER, SGRPROJ, SWITCHABLE.
+ RestorationType best_rtype[RESTORE_TYPES - 1];
+} RestUnitSearchInfo;
+
+typedef struct {  // State for the restoration search of one plane; filled by init_rsc.
+ const YV12_BUFFER_CONFIG *src;  // frame the SSE is measured against
+ YV12_BUFFER_CONFIG *dst;  // buffer that receives trial restoration output
+
+ const AV1_COMMON *cm;
+ const MACROBLOCK *x;
+ int plane;  // index of the plane being searched
+ int plane_width;  // src crop width of this plane
+ int plane_height;  // src crop height of this plane
+ RestUnitSearchInfo *rusi;
+
+ // Speed features
+ const SPEED_FEATURES *sf;
+
+ uint8_t *dgd_buffer;  // this plane of cm->frame_to_show
+ int dgd_stride;
+ const uint8_t *src_buffer;  // this plane of src
+ int src_stride;
+
+ // sse and bits are initialised by reset_rsc in search_rest_type
+ int64_t sse;
+ int64_t bits;
+ int tile_y0, tile_stripe0;  // tile_stripe0 is set by rsc_on_tile from the tile row
+
+ // sgrproj and wiener are initialised by rsc_on_tile when starting the first
+ // tile in the frame.
+ SgrprojInfo sgrproj;
+ WienerInfo wiener;
+ AV1PixelRect tile_rect;  // set to the whole-frame rectangle by init_rsc
+} RestSearchCtxt;
+
+// Per-tile callback: resets the reference Sgrproj and Wiener parameters in the
+// search context (passed as priv) to their defaults and records the first
+// stripe of the tile row. tile_col is not used.
+static void rsc_on_tile(int tile_row, int tile_col, void *priv) {
+  RestSearchCtxt *const ctx = (RestSearchCtxt *)priv;
+  (void)tile_col;
+
+  set_default_sgrproj(&ctx->sgrproj);
+  set_default_wiener(&ctx->wiener);
+
+  if (tile_row == 0) {
+    ctx->tile_stripe0 = 0;
+  } else {
+    // Start where the previous tile row's stripes ended.
+    ctx->tile_stripe0 = ctx->cm->rst_end_stripe[tile_row - 1];
+  }
+}
+
+// Clears the running SSE and bit totals of the search context.
+static void reset_rsc(RestSearchCtxt *rsc) {
+  rsc->bits = 0;
+  rsc->sse = 0;
+}
+
+// Fills rsc for a restoration search on one plane: stores the caller's
+// pointers, then caches the plane's dimensions, the source buffer and the
+// buffer of cm->frame_to_show together with their strides, and sets the tile
+// rectangle to the whole frame. sse, bits, tile_stripe0, sgrproj and wiener
+// are left for reset_rsc / rsc_on_tile.
+static void init_rsc(const YV12_BUFFER_CONFIG *src, const AV1_COMMON *cm,
+                     const MACROBLOCK *x, const SPEED_FEATURES *sf, int plane,
+                     RestUnitSearchInfo *rusi, YV12_BUFFER_CONFIG *dst,
+                     RestSearchCtxt *rsc) {
+  const YV12_BUFFER_CONFIG *const deblocked = cm->frame_to_show;
+  // Both chroma planes share index 1 of the per-plane-type arrays.
+  const int uv = plane != AOM_PLANE_Y;
+
+  rsc->cm = cm;
+  rsc->x = x;
+  rsc->sf = sf;
+  rsc->src = src;
+  rsc->dst = dst;
+  rsc->plane = plane;
+  rsc->rusi = rusi;
+
+  rsc->plane_width = src->crop_widths[uv];
+  rsc->plane_height = src->crop_heights[uv];
+
+  rsc->src_buffer = src->buffers[plane];
+  rsc->src_stride = src->strides[uv];
+
+  rsc->dgd_buffer = deblocked->buffers[plane];
+  rsc->dgd_stride = deblocked->strides[uv];
+
+  rsc->tile_rect = av1_whole_frame_rect(cm, uv);
+
+  assert(src->crop_widths[uv] == deblocked->crop_widths[uv]);
+  assert(src->crop_heights[uv] == deblocked->crop_heights[uv]);
+}
+
+// Applies the restoration described by rui to one unit (limits) of the
+// search plane, reading from cm->frame_to_show and writing to rsc->dst, and
+// returns the SSE between rsc->src and that output over the same unit.
+static int64_t try_restoration_unit(const RestSearchCtxt *rsc,
+                                    const RestorationTileLimits *limits,
+                                    const AV1PixelRect *tile_rect,
+                                    const RestorationUnitInfo *rui) {
+  const AV1_COMMON *const cm = rsc->cm;
+  const YV12_BUFFER_CONFIG *const fts = cm->frame_to_show;
+  const int plane = rsc->plane;
+  const int is_uv = plane > 0;
+  const RestorationInfo *const rsi = &cm->rst_info[plane];
+  const int highbd = cm->seq_params.use_highbitdepth;
+  const int bit_depth = cm->seq_params.bit_depth;
+  // Subsampling applies to chroma planes only.
+  const int ss_x = is_uv && cm->seq_params.subsampling_x;
+  const int ss_y = is_uv && cm->seq_params.subsampling_y;
+  // TODO(yunqing): For now, only use optimized LR filter in decoder. Can be
+  // also used in encoder.
+  const int optimized_lr = 0;
+  RestorationLineBuffers rlbs;
+
+  av1_loop_restoration_filter_unit(
+      limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0, ss_x,
+      ss_y, highbd, bit_depth, fts->buffers[plane], fts->strides[is_uv],
+      rsc->dst->buffers[plane], rsc->dst->strides[is_uv], cm->rst_tmpbuf,
+      optimized_lr);
+
+  return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
+}
+
+// Sum of squared errors between the 8-bit source (src8) and the projection
+// of the degraded image (dat8) onto the self-guided filter outputs flt0 and
+// flt1 with weights xq[0] and xq[1]. A filter output takes part only when its
+// radius in params->r[] is positive; with both radii zero the result is the
+// plain SSE between dat8 and src8.
+int64_t av1_lowbd_pixel_proj_error_c(const uint8_t *src8, int width, int height,
+                                     int src_stride, const uint8_t *dat8,
+                                     int dat_stride, int32_t *flt0,
+                                     int flt0_stride, int32_t *flt1,
+                                     int flt1_stride, int xq[2],
+                                     const sgr_params_type *params) {
+  const int shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const int has_flt0 = params->r[0] > 0;
+  const int has_flt1 = params->r[1] > 0;
+  const uint8_t *src_row = src8;
+  const uint8_t *dat_row = dat8;
+  int64_t err = 0;
+
+  if (has_flt0 && has_flt1) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt1[x] < (1 << 15) && flt1[x] > -(1 << 15));
+        assert(flt0[x] < (1 << 15) && flt0[x] > -(1 << 15));
+        const int32_t u = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t v = u << SGRPROJ_PRJ_BITS;
+        v += xq[0] * (flt0[x] - u) + xq[1] * (flt1[x] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) - src_row[x];
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+    }
+    return err;
+  }
+
+  if (has_flt0) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt0[x] < (1 << 15) && flt0[x] > -(1 << 15));
+        const int32_t u = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t v = u << SGRPROJ_PRJ_BITS;
+        v += xq[0] * (flt0[x] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) - src_row[x];
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt0 += flt0_stride;
+    }
+    return err;
+  }
+
+  if (has_flt1) {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        assert(flt1[x] < (1 << 15) && flt1[x] > -(1 << 15));
+        const int32_t u = (int32_t)(dat_row[x] << SGRPROJ_RST_BITS);
+        int32_t v = u << SGRPROJ_PRJ_BITS;
+        v += xq[1] * (flt1[x] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) - src_row[x];
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+      flt1 += flt1_stride;
+    }
+    return err;
+  }
+
+  // Neither filter is enabled: compare the degraded image directly.
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int32_t e = (int32_t)(dat_row[x]) - src_row[x];
+      err += e * e;
+    }
+    dat_row += dat_stride;
+    src_row += src_stride;
+  }
+  return err;
+}
+
+// Returns the projection SSE for the quantised weights xqd, which are first
+// decoded into xq with decode_xq(). Low bit depth is delegated to
+// av1_lowbd_pixel_proj_error(); for high bit depth src8 and dat8 are
+// converted with CONVERT_TO_SHORTPTR and the error is computed here, using
+// only the filter outputs whose radius in params->r[] is positive.
+static int64_t get_pixel_proj_error(const uint8_t *src8, int width, int height,
+                                    int src_stride, const uint8_t *dat8,
+                                    int dat_stride, int use_highbitdepth,
+                                    int32_t *flt0, int flt0_stride,
+                                    int32_t *flt1, int flt1_stride, int *xqd,
+                                    const sgr_params_type *params) {
+  int xq[2];
+  decode_xq(xqd, xq, params);
+
+  if (!use_highbitdepth) {
+    return av1_lowbd_pixel_proj_error(src8, width, height, src_stride, dat8,
+                                      dat_stride, flt0, flt0_stride, flt1,
+                                      flt1_stride, xq, params);
+  }
+
+  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
+  const uint16_t *dat_row = CONVERT_TO_SHORTPTR(dat8);
+  // Rounding offset for the final right shift.
+  const int32_t half = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1);
+  const int has_flt0 = params->r[0] > 0;
+  const int has_flt1 = params->r[1] > 0;
+  int64_t err = 0;
+
+  if (has_flt0 && has_flt1) {
+    const int w0 = xq[0];
+    const int w1 = xq[1];
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int32_t d = dat_row[x];
+        const int32_t s = src_row[x];
+        const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+        const int32_t v0 = flt0[x] - u;
+        const int32_t v1 = flt1[x] - u;
+        int32_t v = half;
+        v += w0 * v0;
+        v += w1 * v1;
+        const int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      src_row += src_stride;
+    }
+  } else if (has_flt0 || has_flt1) {
+    // Exactly one filter is active: pick its weight, buffer and stride.
+    const int weight = has_flt0 ? xq[0] : xq[1];
+    const int32_t *flt_row = has_flt0 ? flt0 : flt1;
+    const int flt_stride = has_flt0 ? flt0_stride : flt1_stride;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int32_t d = dat_row[x];
+        const int32_t s = src_row[x];
+        const int32_t u = (int32_t)(d << SGRPROJ_RST_BITS);
+        int32_t v = half;
+        v += weight * (flt_row[x] - u);
+        const int32_t e =
+            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + d - s;
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      flt_row += flt_stride;
+      src_row += src_stride;
+    }
+  } else {
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        const int32_t d = dat_row[x];
+        const int32_t s = src_row[x];
+        const int32_t e = d - s;
+        err += e * e;
+      }
+      dat_row += dat_stride;
+      src_row += src_stride;
+    }
+  }
+  return err;
+}
+
+#define USE_SGRPROJ_REFINEMENT_SEARCH 1
+// Refines the quantised projection weights xqd in place by coordinate search
+// and returns the lowest error found. Starting from the error at the incoming
+// xqd, each active weight (radius in params->r[] non-zero) is moved by the
+// current step, first downwards and then upwards, within
+// [SGRPROJ_PRJ_MIN*, SGRPROJ_PRJ_MAX*]; a move is kept when it does not
+// increase the error. At the initial step size a successful move is repeated
+// in the same direction. The step is halved after each pass until it
+// reaches zero. A successful downward move ends the pass over the weights for
+// that step size.
+static int64_t finer_search_pixel_proj_error(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int use_highbitdepth, int32_t *flt0,
+    int flt0_stride, int32_t *flt1, int flt1_stride, int start_step, int *xqd,
+    const sgr_params_type *params) {
+  int64_t err = get_pixel_proj_error(
+      src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth, flt0,
+      flt0_stride, flt1, flt1_stride, xqd, params);
+  (void)start_step;
+#if USE_SGRPROJ_REFINEMENT_SEARCH
+  int tap_min[] = { SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MIN1 };
+  int tap_max[] = { SGRPROJ_PRJ_MAX0, SGRPROJ_PRJ_MAX1 };
+  for (int step = start_step; step >= 1; step >>= 1) {
+    for (int p = 0; p < 2; ++p) {
+      // A weight whose filter is disabled is not searched.
+      if (params->r[p] == 0) continue;
+
+      // Try decreasing the weight.
+      int moved_down = 0;
+      while (xqd[p] - step >= tap_min[p]) {
+        xqd[p] -= step;
+        const int64_t trial = get_pixel_proj_error(
+            src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
+            flt0, flt0_stride, flt1, flt1_stride, xqd, params);
+        if (trial > err) {
+          xqd[p] += step;  // worse: undo and stop
+          break;
+        }
+        err = trial;
+        moved_down = 1;
+        // At the highest step size continue moving in the same direction
+        if (step != start_step) break;
+      }
+      if (moved_down) break;
+
+      // Try increasing the weight.
+      while (xqd[p] + step <= tap_max[p]) {
+        xqd[p] += step;
+        const int64_t trial = get_pixel_proj_error(
+            src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
+            flt0, flt0_stride, flt1, flt1_stride, xqd, params);
+        if (trial > err) {
+          xqd[p] -= step;  // worse: undo and stop
+          break;
+        }
+        err = trial;
+        // At the highest step size continue moving in the same direction
+        if (step != start_step) break;
+      }
+    }
+  }
+#endif  // USE_SGRPROJ_REFINEMENT_SEARCH
+  return err;
+}
+/*
+ * Estimates the two self-guided projection weights by least squares and
+ * writes them to xq[0..1] as integers scaled by (1 << SGRPROJ_PRJ_BITS).
+ *
+ * Per pixel, with u = dat << SGRPROJ_RST_BITS:
+ *   s  = (src << SGRPROJ_RST_BITS) - u
+ *   f1 = flt0 - u   (0 when params->r[0] == 0)
+ *   f2 = flt1 - u   (0 when params->r[1] == 0)
+ * H accumulates the products of (f1, f2) with themselves and C the products
+ * of (f1, f2) with s. Both are divided by width * height, and H x = C is then
+ * solved for x. When one radius is 0 the system reduces to a single scalar
+ * equation for the other weight.
+ *
+ * xq is left at {0, 0} when the determinant (or the scalar H term) is below
+ * 1e-8. src8/dat8 are read as uint8_t when use_highbitdepth is 0 and as
+ * uint16_t through CONVERT_TO_SHORTPTR() otherwise.
+ */
+static void get_proj_subspace(const uint8_t *src8, int width, int height,
+ int src_stride, const uint8_t *dat8,
+ int dat_stride, int use_highbitdepth,
+ int32_t *flt0, int flt0_stride, int32_t *flt1,
+ int flt1_stride, int *xq,
+ const sgr_params_type *params) {
+ int i, j;
+ double H[2][2] = { { 0, 0 }, { 0, 0 } };
+ double C[2] = { 0, 0 };
+ double Det;
+ double x[2];
+ const int size = width * height;
+
+ aom_clear_system_state();
+
+ // Default
+ xq[0] = 0;
+ xq[1] = 0;
+ if (!use_highbitdepth) {
+ const uint8_t *src = src8;
+ const uint8_t *dat = dat8;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 =
+ (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+ const double f2 =
+ (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ } else {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const double u = (double)(dat[i * dat_stride + j] << SGRPROJ_RST_BITS);
+ const double s =
+ (double)(src[i * src_stride + j] << SGRPROJ_RST_BITS) - u;
+ const double f1 =
+ (params->r[0] > 0) ? (double)flt0[i * flt0_stride + j] - u : 0;
+ const double f2 =
+ (params->r[1] > 0) ? (double)flt1[i * flt1_stride + j] - u : 0;
+ H[0][0] += f1 * f1;
+ H[1][1] += f2 * f2;
+ H[0][1] += f1 * f2;
+ C[0] += f1 * s;
+ C[1] += f2 * s;
+ }
+ }
+ }
+ H[0][0] /= size;
+ H[0][1] /= size;
+ H[1][1] /= size;
+ H[1][0] = H[0][1];  // H is symmetric; only the upper entry was accumulated
+ C[0] /= size;
+ C[1] /= size;
+ if (params->r[0] == 0) {
+ // H matrix is now only the scalar H[1][1]
+ // C vector is now only the scalar C[1]
+ Det = H[1][1];
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = 0;
+ x[1] = C[1] / Det;
+
+ xq[0] = 0;
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+ } else if (params->r[1] == 0) {
+ // H matrix is now only the scalar H[0][0]
+ // C vector is now only the scalar C[0]
+ Det = H[0][0];
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = C[0] / Det;
+ x[1] = 0;
+
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = 0;
+ } else {
+ Det = (H[0][0] * H[1][1] - H[0][1] * H[1][0]);
+ if (Det < 1e-8) return; // ill-posed, return default values
+ x[0] = (H[1][1] * C[0] - H[0][1] * C[1]) / Det;  // 2x2 closed-form solve
+ x[1] = (H[0][0] * C[1] - H[1][0] * C[0]) / Det;
+
+ xq[0] = (int)rint(x[0] * (1 << SGRPROJ_PRJ_BITS));
+ xq[1] = (int)rint(x[1] * (1 << SGRPROJ_PRJ_BITS));
+ }
+}
+
+// Converts the projection weights xq[0..1] into the clamped pair xqd[0..1].
+// xqd[0] is xq[0] clamped to [SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0], or 0 when
+// params->r[0] is 0. xqd[1] is (1 << SGRPROJ_PRJ_BITS) minus xqd[0], and
+// minus xq[1] unless only params->r[1] is 0, clamped to
+// [SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1].
+void encode_xq(int *xq, int *xqd, const sgr_params_type *params) {
+  const int first_off = (params->r[0] == 0);
+  const int second_off = (params->r[1] == 0);
+  int remainder;
+
+  if (first_off) {
+    xqd[0] = 0;
+    remainder = (1 << SGRPROJ_PRJ_BITS) - xq[1];
+  } else {
+    xqd[0] = clamp(xq[0], SGRPROJ_PRJ_MIN0, SGRPROJ_PRJ_MAX0);
+    remainder = (1 << SGRPROJ_PRJ_BITS) - xqd[0];
+    // The second weight only contributes when its filter is active.
+    if (!second_off) remainder -= xq[1];
+  }
+  xqd[1] = clamp(remainder, SGRPROJ_PRJ_MIN1, SGRPROJ_PRJ_MAX1);
+}
+
+// Runs av1_selfguided_restoration() over a whole restoration unit, one
+// processing unit of at most pu_width x pu_height pixels at a time. The two
+// filter outputs land in flt0 and flt1, which share the row pitch flt_stride.
+static void apply_sgr(int sgr_params_idx, const uint8_t *dat8, int width,
+                      int height, int dat_stride, int use_highbd, int bit_depth,
+                      int pu_width, int pu_height, int32_t *flt0, int32_t *flt1,
+                      int flt_stride) {
+  for (int row = 0; row < height; row += pu_height) {
+    // The last stripe may be shorter than pu_height.
+    const int unit_h = AOMMIN(pu_height, height - row);
+
+    for (int col = 0; col < width; col += pu_width) {
+      // Likewise the last block in a stripe may be narrower than pu_width.
+      const int unit_w = AOMMIN(pu_width, width - col);
+      const uint8_t *const in = dat8 + row * dat_stride + col;
+      int32_t *const out0 = flt0 + row * flt_stride + col;
+      int32_t *const out1 = flt1 + row * flt_stride + col;
+
+      const int status = av1_selfguided_restoration(
+          in, unit_w, unit_h, dat_stride, out0, out1, flt_stride,
+          sgr_params_idx, bit_depth, use_highbd);
+      (void)status;
+      assert(!status);
+    }
+  }
+}
+
+// Tries every self-guided parameter set ep in [0, SGRPROJ_PARAMS) on one
+// restoration unit and returns the set and projection coefficients with the
+// lowest refined projection error. rstbuf provides the two filter output
+// planes: the first at rstbuf, the second RESTORATION_UNITPELS_MAX entries in.
+static SgrprojInfo search_selfguided_restoration(
+    const uint8_t *dat8, int width, int height, int dat_stride,
+    const uint8_t *src8, int src_stride, int use_highbitdepth, int bit_depth,
+    int pu_width, int pu_height, int32_t *rstbuf) {
+  assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_width == RESTORATION_PROC_UNIT_SIZE);
+  assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
+         pu_height == RESTORATION_PROC_UNIT_SIZE);
+
+  int32_t *const flt0 = rstbuf;
+  int32_t *const flt1 = flt0 + RESTORATION_UNITPELS_MAX;
+  // Row pitch of flt0/flt1: width rounded up to a multiple of 8, plus 8.
+  const int flt_stride = ((width + 7) & ~7) + 8;
+
+  SgrprojInfo best;
+  best.ep = 0;
+  best.xqd[0] = 0;
+  best.xqd[1] = 0;
+  int64_t best_err = -1;  // -1 means "no candidate recorded yet"
+
+  for (int ep = 0; ep < SGRPROJ_PARAMS; ep++) {
+    const sgr_params_type *const params = &sgr_params[ep];
+    int xq[2];
+    int xqd[2];
+
+    apply_sgr(ep, dat8, width, height, dat_stride, use_highbitdepth, bit_depth,
+              pu_width, pu_height, flt0, flt1, flt_stride);
+    aom_clear_system_state();
+    get_proj_subspace(src8, width, height, src_stride, dat8, dat_stride,
+                      use_highbitdepth, flt0, flt_stride, flt1, flt_stride, xq,
+                      params);
+    aom_clear_system_state();
+    encode_xq(xq, xqd, params);
+
+    const int64_t err = finer_search_pixel_proj_error(
+        src8, width, height, src_stride, dat8, dat_stride, use_highbitdepth,
+        flt0, flt_stride, flt1, flt_stride, 2, xqd, params);
+
+    // Keep the first candidate, then only strict improvements.
+    if (best_err != -1 && err >= best_err) continue;
+    best_err = err;
+    best.ep = ep;
+    best.xqd[0] = xqd[0];
+    best.xqd[1] = xqd[1];
+  }
+
+  return best;
+}
+
+// Returns the number of bits needed to signal sgrproj_info when
+// ref_sgrproj_info is the coding reference: SGRPROJ_PARAMS_BITS for the
+// parameter-set index plus one sub-exponential code per coefficient whose
+// radius in the selected parameter set is nonzero.
+static int count_sgrproj_bits(SgrprojInfo *sgrproj_info,
+                              SgrprojInfo *ref_sgrproj_info) {
+  const sgr_params_type *const params = &sgr_params[sgrproj_info->ep];
+  int total = SGRPROJ_PARAMS_BITS;
+
+  if (params->r[0] > 0) {
+    total += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX0 - SGRPROJ_PRJ_MIN0 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0,
+        sgrproj_info->xqd[0] - SGRPROJ_PRJ_MIN0);
+  }
+  if (params->r[1] > 0) {
+    total += aom_count_primitive_refsubexpfin(
+        SGRPROJ_PRJ_MAX1 - SGRPROJ_PRJ_MIN1 + 1, SGRPROJ_PRJ_SUBEXP_K,
+        ref_sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1,
+        sgrproj_info->xqd[1] - SGRPROJ_PRJ_MIN1);
+  }
+  return total;
+}
+
+// Restoration-unit visitor for RESTORE_SGRPROJ. Searches the self-guided
+// parameters for the unit, measures the resulting error, and compares the
+// rate-distortion cost with that of no restoration. The decision is stored in
+// rusi->best_rtype[RESTORE_SGRPROJ - 1]; rsc->sse and rsc->bits are advanced
+// by the chosen option, and rsc->sgrproj is updated only when the filter wins.
+static void search_sgrproj(const RestorationTileLimits *limits,
+                           const AV1PixelRect *tile, int rest_unit_idx,
+                           void *priv, int32_t *tmpbuf,
+                           RestorationLineBuffers *rlbs) {
+  (void)rlbs;
+  RestSearchCtxt *const rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *const rusi = &rsc->rusi[rest_unit_idx];
+  const AV1_COMMON *const cm = rsc->cm;
+  const MACROBLOCK *const x = rsc->x;
+
+  // Processing units shrink along each axis that is subsampled in chroma.
+  const int is_chroma = rsc->plane > 0;
+  const int shift_x = is_chroma && cm->seq_params.subsampling_x;
+  const int shift_y = is_chroma && cm->seq_params.subsampling_y;
+  const int pu_w = RESTORATION_PROC_UNIT_SIZE >> shift_x;
+  const int pu_h = RESTORATION_PROC_UNIT_SIZE >> shift_y;
+
+  // Top-left pixel of this unit in the degraded and source planes.
+  uint8_t *const dgd_unit =
+      rsc->dgd_buffer + limits->v_start * rsc->dgd_stride + limits->h_start;
+  const uint8_t *const src_unit =
+      rsc->src_buffer + limits->v_start * rsc->src_stride + limits->h_start;
+  const int unit_w = limits->h_end - limits->h_start;
+  const int unit_h = limits->v_end - limits->v_start;
+
+  rusi->sgrproj = search_selfguided_restoration(
+      dgd_unit, unit_w, unit_h, rsc->dgd_stride, src_unit, rsc->src_stride,
+      cm->seq_params.use_highbitdepth, cm->seq_params.bit_depth, pu_w, pu_h,
+      tmpbuf);
+
+  RestorationUnitInfo rui;
+  rui.restoration_type = RESTORE_SGRPROJ;
+  rui.sgrproj_info = rusi->sgrproj;
+  rusi->sse[RESTORE_SGRPROJ] = try_restoration_unit(rsc, limits, tile, &rui);
+
+  const int64_t bits_none = x->sgrproj_restore_cost[0];
+  const int64_t bits_sgr = x->sgrproj_restore_cost[1] +
+                           (count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj)
+                            << AV1_PROB_COST_SHIFT);
+
+  const double cost_none =
+      RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+  double cost_sgr =
+      RDCOST_DBL(x->rdmult, bits_sgr >> 4, rusi->sse[RESTORE_SGRPROJ]);
+  // Parameter sets below 10 are penalised according to the speed feature.
+  // NOTE(review): presumably these are the dual-radius sets -- confirm
+  // against the sgr_params table.
+  if (rusi->sgrproj.ep < 10)
+    cost_sgr *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+
+  if (cost_sgr < cost_none) {
+    rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_SGRPROJ;
+    rsc->sse += rusi->sse[RESTORE_SGRPROJ];
+    rsc->bits += bits_sgr;
+    rsc->sgrproj = rusi->sgrproj;
+  } else {
+    rusi->best_rtype[RESTORE_SGRPROJ - 1] = RESTORE_NONE;
+    rsc->sse += rusi->sse[RESTORE_NONE];
+    rsc->bits += bits_none;
+  }
+}
+/*
+ * Accumulates the Wiener filter statistics for the 8-bit pixels in rows
+ * [v_start, v_end) and columns [h_start, h_end).
+ *
+ * For every pixel, X is the src sample and Y[] the wiener_win x wiener_win
+ * window of dgd samples centred on it, all taken relative to avg, the
+ * find_average() of dgd over the same region. On return:
+ *   M[k]                   = sum of Y[k] * X     (wiener_win2 entries)
+ *   H[k * wiener_win2 + l] = sum of Y[k] * Y[l]  (wiener_win2^2 entries)
+ * Both outputs are zeroed first. The window reads dgd up to wiener_win >> 1
+ * samples outside the region on every side.
+ */
+void av1_compute_stats_c(int wiener_win, const uint8_t *dgd, const uint8_t *src,
+ int h_start, int h_end, int v_start, int v_end,
+ int dgd_stride, int src_stride, double *M, double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const double avg =
+ find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {  // column offset
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {  // row offset
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ M[k] += Y[k] * X;
+ for (l = k; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H[k * wiener_win2 + l] += Y[k] * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {  // mirror the upper triangle
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+// Returns the mean of the uint16_t samples of src in rows [v_start, v_end)
+// and columns [h_start, h_end), where stride is the distance between rows.
+static double find_average_highbd(const uint16_t *src, int h_start, int h_end,
+                                  int v_start, int v_end, int stride) {
+  uint64_t total = 0;
+  aom_clear_system_state();
+  for (int row = v_start; row < v_end; ++row) {
+    const uint16_t *const line = src + row * stride;
+    for (int col = h_start; col < h_end; ++col) total += line[col];
+  }
+  const int count = (v_end - v_start) * (h_end - h_start);
+  return (double)total / count;
+}
+/*
+ * High bit-depth version of the Wiener statistics accumulation: dgd8 and src8
+ * are converted with CONVERT_TO_SHORTPTR() and read as uint16_t samples.
+ *
+ * For every pixel in rows [v_start, v_end) and columns [h_start, h_end), X is
+ * the src sample and Y[] the wiener_win x wiener_win window of dgd samples
+ * centred on it, all taken relative to avg, the find_average_highbd() of dgd
+ * over the same region. On return:
+ *   M[k]                   = sum of Y[k] * X     (wiener_win2 entries)
+ *   H[k * wiener_win2 + l] = sum of Y[k] * Y[l]  (wiener_win2^2 entries)
+ * Both outputs are zeroed first.
+ */
+static AOM_FORCE_INLINE void compute_stats_highbd(
+ int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start,
+ int h_end, int v_start, int v_end, int dgd_stride, int src_stride,
+ double *M, double *H) {
+ int i, j, k, l;
+ double Y[WIENER_WIN2];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin = (wiener_win >> 1);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+ const double avg =
+ find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+ memset(M, 0, sizeof(*M) * wiener_win2);
+ memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
+ for (i = v_start; i < v_end; i++) {
+ for (j = h_start; j < h_end; j++) {
+ const double X = (double)src[i * src_stride + j] - avg;
+ int idx = 0;
+ for (k = -wiener_halfwin; k <= wiener_halfwin; k++) {  // column offset
+ for (l = -wiener_halfwin; l <= wiener_halfwin; l++) {  // row offset
+ Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+ idx++;
+ }
+ }
+ assert(idx == wiener_win2);
+ for (k = 0; k < wiener_win2; ++k) {
+ double Yk = Y[k];
+ M[k] += Yk * X;
+ double *H2 = &H[k * wiener_win2];  // row k of H
+ H2[k] += Yk * Yk;  // diagonal term
+ for (l = k + 1; l < wiener_win2; ++l) {
+ // H is a symmetric matrix, so we only need to fill out the upper
+ // triangle here. We can copy it down to the lower triangle outside
+ // the (i, j) loops.
+ H2[l] += Yk * Y[l];
+ }
+ }
+ }
+ }
+ for (k = 0; k < wiener_win2; ++k) {  // mirror the upper triangle
+ for (l = k + 1; l < wiener_win2; ++l) {
+ H[l * wiener_win2 + k] = H[k * wiener_win2 + l];
+ }
+ }
+}
+
+// Folds tap index i of a symmetric wiener_win-tap filter onto the first
+// (wiener_win >> 1) + 1 positions: indices in that range are returned as is,
+// later ones are mirrored to wiener_win - 1 - i.
+static INLINE int wrap_index(int i, int wiener_win) {
+  const int first_mirrored = (wiener_win >> 1) + 1;
+  if (i < first_mirrored) return i;
+  return wiener_win - 1 - i;
+}
+/*
+ * One half-step of the alternating separable Wiener solve: with b held fixed,
+ * recomputes a from the statistics reachable through Mc and Hc.
+ *
+ * Taps are folded onto the first wiener_halfwin1 positions with wrap_index(),
+ * so A and B describe a system in the symmetric taps only. A and B are then
+ * adjusted using the last folded (centre) row and column, and linsolve() is
+ * called for the remaining wiener_halfwin1 - 1 taps. On a nonzero return the
+ * full symmetric filter is rebuilt, with the centre tap set to 1 minus twice
+ * the sum of the other solved taps, and copied into a. a is left untouched
+ * when linsolve() returns 0.
+ */
+// Fix vector b, update vector a
+static void update_a_sep_sym(int wiener_win, double **Mc, double **Hc,
+ double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; ++j) {
+ const int jj = wrap_index(j, wiener_win);
+ A[jj] += Mc[i][j] * b[i];
+ }
+ }
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ int k, l;
+ for (k = 0; k < wiener_win; ++k)
+ for (l = 0; l < wiener_win; ++l) {
+ const int kk = wrap_index(k, wiener_win);
+ const int ll = wrap_index(l, wiener_win);
+ B[ll * wiener_halfwin1 + kk] +=
+ Hc[j * wiener_win + i][k * wiener_win2 + l] * b[i] * b[j];
+ }
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ for (j = 0; j < wiener_halfwin1 - 1; ++j)
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = 1.0;  // centre tap starts at 1
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];  // mirror the solved tap
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ memcpy(a, S, wiener_win * sizeof(*a));
+ }
+}
+/*
+ * The other half-step of the alternating separable Wiener solve: with a held
+ * fixed, recomputes b from the statistics reachable through Mc and Hc.
+ *
+ * Taps are folded onto the first wiener_halfwin1 positions with wrap_index(),
+ * so A and B describe a system in the symmetric taps only. A and B are then
+ * adjusted using the last folded (centre) row and column, and linsolve() is
+ * called for the remaining wiener_halfwin1 - 1 taps. On a nonzero return the
+ * full symmetric filter is rebuilt, with the centre tap set to 1 minus twice
+ * the sum of the other solved taps, and copied into b. b is left untouched
+ * when linsolve() returns 0.
+ */
+// Fix vector a, update vector b
+static void update_b_sep_sym(int wiener_win, double **Mc, double **Hc,
+ double *a, double *b) {
+ int i, j;
+ double S[WIENER_WIN];
+ double A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1];
+ const int wiener_win2 = wiener_win * wiener_win;
+ const int wiener_halfwin1 = (wiener_win >> 1) + 1;
+ memset(A, 0, sizeof(A));
+ memset(B, 0, sizeof(B));
+ for (i = 0; i < wiener_win; i++) {
+ const int ii = wrap_index(i, wiener_win);
+ for (j = 0; j < wiener_win; j++) A[ii] += Mc[i][j] * a[j];
+ }
+
+ for (i = 0; i < wiener_win; i++) {
+ for (j = 0; j < wiener_win; j++) {
+ const int ii = wrap_index(i, wiener_win);
+ const int jj = wrap_index(j, wiener_win);
+ int k, l;
+ for (k = 0; k < wiener_win; ++k)
+ for (l = 0; l < wiener_win; ++l)
+ B[jj * wiener_halfwin1 + ii] +=
+ Hc[i * wiener_win + j][k * wiener_win2 + l] * a[k] * a[l];
+ }
+ }
+ // Normalization enforcement in the system of equations itself
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ A[i] -=
+ A[wiener_halfwin1 - 1] * 2 +
+ B[i * wiener_halfwin1 + wiener_halfwin1 - 1] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 + (wiener_halfwin1 - 1)];
+ for (i = 0; i < wiener_halfwin1 - 1; ++i)
+ for (j = 0; j < wiener_halfwin1 - 1; ++j)
+ B[i * wiener_halfwin1 + j] -=
+ 2 * (B[i * wiener_halfwin1 + (wiener_halfwin1 - 1)] +
+ B[(wiener_halfwin1 - 1) * wiener_halfwin1 + j] -
+ 2 * B[(wiener_halfwin1 - 1) * wiener_halfwin1 +
+ (wiener_halfwin1 - 1)]);
+ if (linsolve(wiener_halfwin1 - 1, B, wiener_halfwin1, A, S)) {
+ S[wiener_halfwin1 - 1] = 1.0;  // centre tap starts at 1
+ for (i = wiener_halfwin1; i < wiener_win; ++i) {
+ S[i] = S[wiener_win - 1 - i];  // mirror the solved tap
+ S[wiener_halfwin1 - 1] -= 2 * S[i];
+ }
+ memcpy(b, S, wiener_win * sizeof(*b));
+ }
+}
+
+// Derives separable symmetric filters a and b from the statistics M and H.
+// Both filters start from the WIENER_FILT_TAP*_MIDV defaults (scaled by
+// 1 / WIENER_FILT_STEP) and are then refined by alternating
+// update_a_sep_sym() and update_b_sep_sym() for NUM_WIENER_ITERS - 1 rounds.
+// Always returns 1.
+static int wiener_decompose_sep_sym(int wiener_win, double *M, double *H,
+                                    double *a, double *b) {
+  static const int init_filt[WIENER_WIN] = {
+    WIENER_FILT_TAP0_MIDV, WIENER_FILT_TAP1_MIDV, WIENER_FILT_TAP2_MIDV,
+    WIENER_FILT_TAP3_MIDV, WIENER_FILT_TAP2_MIDV, WIENER_FILT_TAP1_MIDV,
+    WIENER_FILT_TAP0_MIDV,
+  };
+  const int wiener_win2 = wiener_win * wiener_win;
+  // Shorter windows use the central part of the default filter.
+  const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+  double *Mc[WIENER_WIN];
+  double *Hc[WIENER_WIN2];
+
+  for (int tap = 0; tap < wiener_win; ++tap) {
+    const double start = (double)init_filt[tap + plane_off] / WIENER_FILT_STEP;
+    b[tap] = start;
+    a[tap] = start;
+  }
+
+  // Row/block views into M and H used by the two update steps.
+  for (int r = 0; r < wiener_win; ++r) {
+    Mc[r] = M + r * wiener_win;
+    for (int c = 0; c < wiener_win; ++c) {
+      Hc[r * wiener_win + c] =
+          H + r * wiener_win * wiener_win2 + c * wiener_win;
+    }
+  }
+
+  for (int iter = 1; iter < NUM_WIENER_ITERS; ++iter) {
+    update_a_sep_sym(wiener_win, Mc, Hc, a, b);
+    update_b_sep_sym(wiener_win, Mc, Hc, a, b);
+  }
+  return 1;
+}
+
+// Computes x'*H*x - 2*x'*M for the separable 2D filter x built from the
+// quantized taps vfilt/hfilt, and the same expression for the identity filter.
+// Returns the learned-filter value minus the identity-filter value.
+static double compute_score(int wiener_win, double *M, double *H,
+ InterpKernel vfilt, InterpKernel hfilt) {
+ double ab[WIENER_WIN * WIENER_WIN];
+ int i, k, l;
+ double P = 0, Q = 0;
+ double iP = 0, iQ = 0;
+ double Score, iScore;
+ double a[WIENER_WIN], b[WIENER_WIN];
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ const int wiener_win2 = wiener_win * wiener_win;
+
+ aom_clear_system_state();
+
+ a[WIENER_HALFWIN] = b[WIENER_HALFWIN] = 1.0;  // centre taps start at 1
+ for (i = 0; i < WIENER_HALFWIN; ++i) {
+ a[i] = a[WIENER_WIN - i - 1] = (double)vfilt[i] / WIENER_FILT_STEP;
+ b[i] = b[WIENER_WIN - i - 1] = (double)hfilt[i] / WIENER_FILT_STEP;
+ a[WIENER_HALFWIN] -= 2 * a[i];  // centre = 1 - 2 * sum of outer taps
+ b[WIENER_HALFWIN] -= 2 * b[i];
+ }
+ memset(ab, 0, sizeof(ab));
+ for (k = 0; k < wiener_win; ++k) {
+ for (l = 0; l < wiener_win; ++l)
+ ab[k * wiener_win + l] = a[l + plane_off] * b[k + plane_off];  // 2D taps
+ }
+ for (k = 0; k < wiener_win2; ++k) {
+ P += ab[k] * M[k];
+ for (l = 0; l < wiener_win2; ++l)
+ Q += ab[k] * H[k * wiener_win2 + l] * ab[l];
+ }
+ Score = Q - 2 * P;
+
+ iP = M[wiener_win2 >> 1];  // identity filter: only the centre entry is 1
+ iQ = H[(wiener_win2 >> 1) * wiener_win2 + (wiener_win2 >> 1)];
+ iScore = iQ - 2 * iP;
+
+ return Score - iScore;
+}
+
+// Quantizes the first wiener_win >> 1 taps of the symmetric filter f to
+// multiples of 1 / WIENER_FILT_STEP and writes the full 7-tap integer kernel
+// to fi. Each coded tap is clipped to its WIENER_FILT_TAP*_MINV/MAXV range.
+// For the shorter window the taps are shifted inwards by one and fi[0] is 0.
+static void quantize_sym_filter(int wiener_win, double *f, InterpKernel fi) {
+  const int num_coded = wiener_win >> 1;
+  for (int tap = 0; tap < num_coded; ++tap) {
+    fi[tap] = RINT(f[tap] * WIENER_FILT_STEP);
+  }
+
+  if (wiener_win != WIENER_WIN) {
+    // Shorter window: move taps 0/1 to positions 1/2. The higher position is
+    // written first so that fi[1] is read before it is overwritten.
+    fi[2] = CLIP(fi[1], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+    fi[1] = CLIP(fi[0], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[0] = 0;
+  } else {
+    // Specialize for 7-tap filter
+    fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+    fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+    fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+  }
+
+  // Mirror the outer taps to satisfy the symmetry constraint.
+  fi[WIENER_WIN - 1] = fi[0];
+  fi[WIENER_WIN - 2] = fi[1];
+  fi[WIENER_WIN - 3] = fi[2];
+  // The central element has an implicit +WIENER_FILT_STEP
+  fi[3] = -2 * (fi[0] + fi[1] + fi[2]);
+}
+
+// Returns the number of bits needed to signal the taps of wiener_info when
+// ref_wiener_info is the coding reference. Taps 1 and 2 of the vertical and
+// horizontal filters are always counted; tap 0 is counted only for the full
+// WIENER_WIN-tap window.
+static int count_wiener_bits(int wiener_win, WienerInfo *wiener_info,
+                             WienerInfo *ref_wiener_info) {
+  const int has_tap0 = (wiener_win == WIENER_WIN);
+  int total = 0;
+
+  // Vertical filter.
+  if (has_tap0) {
+    total += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->vfilter[0] - WIENER_FILT_TAP0_MINV);
+  }
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+      WIENER_FILT_TAP1_SUBEXP_K,
+      ref_wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV,
+      wiener_info->vfilter[1] - WIENER_FILT_TAP1_MINV);
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+      WIENER_FILT_TAP2_SUBEXP_K,
+      ref_wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV,
+      wiener_info->vfilter[2] - WIENER_FILT_TAP2_MINV);
+
+  // Horizontal filter.
+  if (has_tap0) {
+    total += aom_count_primitive_refsubexpfin(
+        WIENER_FILT_TAP0_MAXV - WIENER_FILT_TAP0_MINV + 1,
+        WIENER_FILT_TAP0_SUBEXP_K,
+        ref_wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV,
+        wiener_info->hfilter[0] - WIENER_FILT_TAP0_MINV);
+  }
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP1_MAXV - WIENER_FILT_TAP1_MINV + 1,
+      WIENER_FILT_TAP1_SUBEXP_K,
+      ref_wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV,
+      wiener_info->hfilter[1] - WIENER_FILT_TAP1_MINV);
+  total += aom_count_primitive_refsubexpfin(
+      WIENER_FILT_TAP2_MAXV - WIENER_FILT_TAP2_MINV + 1,
+      WIENER_FILT_TAP2_SUBEXP_K,
+      ref_wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV,
+      wiener_info->hfilter[2] - WIENER_FILT_TAP2_MINV);
+
+  return total;
+}
+/*
+ * Refines the quantized Wiener taps in rui->wiener_info in place and returns
+ * the lowest try_restoration_unit() error seen.
+ *
+ * With USE_WIENER_REFINEMENT_SEARCH nonzero, step sizes 4, 2 and 1 are tried
+ * in turn. For each step the horizontal taps p in [plane_off, WIENER_HALFWIN)
+ * are visited first, then the vertical taps. A tap is first moved down by s
+ * and, failing that, up by s, within [tap_min[p], tap_max[p]]; its mirror tap
+ * moves with it and the centre tap absorbs the change. A move is kept when
+ * the error does not increase and is undone otherwise.
+ *
+ * NOTE(review): an accepted downward move sets skip, and `if (skip) break;`
+ * then leaves the loop over p, so the remaining taps of that filter are not
+ * visited at this step size -- confirm this is intended.
+ */
+#define USE_WIENER_REFINEMENT_SEARCH 1
+static int64_t finer_tile_search_wiener(const RestSearchCtxt *rsc,
+ const RestorationTileLimits *limits,
+ const AV1PixelRect *tile,
+ RestorationUnitInfo *rui,
+ int wiener_win) {
+ const int plane_off = (WIENER_WIN - wiener_win) >> 1;
+ int64_t err = try_restoration_unit(rsc, limits, tile, rui);
+#if USE_WIENER_REFINEMENT_SEARCH
+ int64_t err2;
+ int tap_min[] = { WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP1_MINV,
+ WIENER_FILT_TAP2_MINV };
+ int tap_max[] = { WIENER_FILT_TAP0_MAXV, WIENER_FILT_TAP1_MAXV,
+ WIENER_FILT_TAP2_MAXV };
+
+ WienerInfo *plane_wiener = &rui->wiener_info;
+
+ // printf("err pre = %"PRId64"\n", err);
+ const int start_step = 4;
+ for (int s = start_step; s >= 1; s >>= 1) {
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;  // set once a downward move has been accepted
+ do {
+ if (plane_wiener->hfilter[p] - s >= tap_min[p]) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;  // tap sum unchanged
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;  // leave the do-loop unless the 'continue' above restarted it
+ } while (1);
+ if (skip) break;  // exits the loop over p for this step size
+ do {
+ if (plane_wiener->hfilter[p] + s <= tap_max[p]) {
+ plane_wiener->hfilter[p] += s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->hfilter[WIENER_HALFWIN] -= 2 * s;  // tap sum unchanged
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->hfilter[p] -= s;
+ plane_wiener->hfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->hfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;  // leave the do-loop unless the 'continue' above restarted it
+ } while (1);
+ }
+ for (int p = plane_off; p < WIENER_HALFWIN; ++p) {
+ int skip = 0;  // set once a downward move has been accepted
+ do {
+ if (plane_wiener->vfilter[p] - s >= tap_min[p]) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;  // tap sum unchanged
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;
+ } else {
+ err = err2;
+ skip = 1;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;  // leave the do-loop unless the 'continue' above restarted it
+ } while (1);
+ if (skip) break;  // exits the loop over p for this step size
+ do {
+ if (plane_wiener->vfilter[p] + s <= tap_max[p]) {
+ plane_wiener->vfilter[p] += s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] += s;
+ plane_wiener->vfilter[WIENER_HALFWIN] -= 2 * s;  // tap sum unchanged
+ err2 = try_restoration_unit(rsc, limits, tile, rui);
+ if (err2 > err) {
+ plane_wiener->vfilter[p] -= s;
+ plane_wiener->vfilter[WIENER_WIN - p - 1] -= s;
+ plane_wiener->vfilter[WIENER_HALFWIN] += 2 * s;
+ } else {
+ err = err2;
+ // At the highest step size continue moving in the same direction
+ if (s == start_step) continue;
+ }
+ }
+ break;  // leave the do-loop unless the 'continue' above restarted it
+ } while (1);
+ }
+ }
+// printf("err post = %"PRId64"\n", err);
+#endif // USE_WIENER_REFINEMENT_SEARCH
+ return err;
+}
+/*
+ * Restoration-unit visitor for RESTORE_WIENER.
+ *
+ * Gathers the M/H statistics for the unit (high bit-depth or 8-bit path),
+ * derives separable filters with wiener_decompose_sep_sym(), quantizes them,
+ * and falls back to RESTORE_NONE when compute_score() is positive. Otherwise
+ * the taps are refined with finer_tile_search_wiener() and the
+ * rate-distortion cost of signalling the filter is compared with the cost of
+ * no restoration. The per-unit decision is stored in
+ * rusi->best_rtype[RESTORE_WIENER - 1]; rsc->sse and rsc->bits are advanced
+ * by the chosen option, and rsc->wiener is updated only when the filter wins.
+ */
+static void search_wiener(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)tmpbuf;
+ (void)rlbs;
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const int wiener_win =
+ (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+ double M[WIENER_WIN2];
+ double H[WIENER_WIN2 * WIENER_WIN2];  // stack array of WIENER_WIN2^2 doubles
+ double vfilterd[WIENER_WIN], hfilterd[WIENER_WIN];
+
+ const AV1_COMMON *const cm = rsc->cm;
+ if (cm->seq_params.use_highbitdepth) {
+ compute_stats_highbd(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+ } else {
+ av1_compute_stats(wiener_win, rsc->dgd_buffer, rsc->src_buffer,
+ limits->h_start, limits->h_end, limits->v_start,
+ limits->v_end, rsc->dgd_stride, rsc->src_stride, M, H);
+ }
+
+ const MACROBLOCK *const x = rsc->x;
+ const int64_t bits_none = x->wiener_restore_cost[0];
+
+ if (!wiener_decompose_sep_sym(wiener_win, M, H, vfilterd, hfilterd)) {
+ rsc->bits += bits_none;
+ rsc->sse += rusi->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rusi->sse[RESTORE_WIENER] = INT64_MAX;
+ return;
+ }
+
+ RestorationUnitInfo rui;
+ memset(&rui, 0, sizeof(rui));
+ rui.restoration_type = RESTORE_WIENER;
+ quantize_sym_filter(wiener_win, vfilterd, rui.wiener_info.vfilter);
+ quantize_sym_filter(wiener_win, hfilterd, rui.wiener_info.hfilter);
+
+ // Filter score computes the value of the function x'*A*x - 2*x'*b for the
+ // learned filter and compares it against the identity filter. If there is no
+ // reduction in the function, the filter is reverted back to identity
+ if (compute_score(wiener_win, M, H, rui.wiener_info.vfilter,
+ rui.wiener_info.hfilter) > 0) {
+ rsc->bits += bits_none;
+ rsc->sse += rusi->sse[RESTORE_NONE];
+ rusi->best_rtype[RESTORE_WIENER - 1] = RESTORE_NONE;
+ rusi->sse[RESTORE_WIENER] = INT64_MAX;
+ return;
+ }
+
+ aom_clear_system_state();
+
+ rusi->sse[RESTORE_WIENER] =
+ finer_tile_search_wiener(rsc, limits, tile_rect, &rui, wiener_win);
+ rusi->wiener = rui.wiener_info;
+
+ if (wiener_win != WIENER_WIN) {
+ assert(rui.wiener_info.vfilter[0] == 0 &&
+ rui.wiener_info.vfilter[WIENER_WIN - 1] == 0);
+ assert(rui.wiener_info.hfilter[0] == 0 &&
+ rui.wiener_info.hfilter[WIENER_WIN - 1] == 0);
+ }
+
+ const int64_t bits_wiener =
+ x->wiener_restore_cost[1] +
+ (count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener)
+ << AV1_PROB_COST_SHIFT);
+
+ double cost_none =
+ RDCOST_DBL(x->rdmult, bits_none >> 4, rusi->sse[RESTORE_NONE]);
+ double cost_wiener =
+ RDCOST_DBL(x->rdmult, bits_wiener >> 4, rusi->sse[RESTORE_WIENER]);
+
+ RestorationType rtype =
+ (cost_wiener < cost_none) ? RESTORE_WIENER : RESTORE_NONE;
+ rusi->best_rtype[RESTORE_WIENER - 1] = rtype;
+
+ rsc->sse += rusi->sse[rtype];
+ rsc->bits += (cost_wiener < cost_none) ? bits_wiener : bits_none;
+ if (cost_wiener < cost_none) rsc->wiener = rusi->wiener;  // new coding reference
+}
+
+static void search_norestore(const RestorationTileLimits *limits,
+ const AV1PixelRect *tile_rect, int rest_unit_idx,
+ void *priv, int32_t *tmpbuf,
+ RestorationLineBuffers *rlbs) {
+ (void)tile_rect;
+ (void)tmpbuf;
+ (void)rlbs;
+
+ RestSearchCtxt *rsc = (RestSearchCtxt *)priv;
+ RestUnitSearchInfo *rusi = &rsc->rusi[rest_unit_idx];
+
+ const int highbd = rsc->cm->seq_params.use_highbitdepth;
+ rusi->sse[RESTORE_NONE] = sse_restoration_unit(
+ limits, rsc->src, rsc->cm->frame_to_show, rsc->plane, highbd);
+
+ rsc->sse += rusi->sse[RESTORE_NONE];
+}
+
+// Restoration-unit visitor for RESTORE_SWITCHABLE: picks, per unit, the
+// cheapest of RESTORE_NONE and whichever of Wiener / self-guided beat
+// RESTORE_NONE in their own searches. The decision is stored in
+// rusi->best_rtype[RESTORE_SWITCHABLE - 1]; rsc->sse and rsc->bits are
+// advanced, and the matching coding reference in rsc is updated.
+static void search_switchable(const RestorationTileLimits *limits,
+                              const AV1PixelRect *tile_rect, int rest_unit_idx,
+                              void *priv, int32_t *tmpbuf,
+                              RestorationLineBuffers *rlbs) {
+  (void)limits;
+  (void)tile_rect;
+  (void)tmpbuf;
+  (void)rlbs;
+  RestSearchCtxt *const rsc = (RestSearchCtxt *)priv;
+  RestUnitSearchInfo *const rusi = &rsc->rusi[rest_unit_idx];
+  const MACROBLOCK *const x = rsc->x;
+  const int wiener_win =
+      (rsc->plane == AOM_PLANE_Y) ? WIENER_WIN : WIENER_WIN_CHROMA;
+
+  RestorationType winner = RESTORE_NONE;
+  double winner_cost = 0;
+  int64_t winner_bits = 0;
+
+  for (RestorationType r = 0; r < RESTORE_SWITCHABLE_TYPES; ++r) {
+    // A filter whose own search ended in RESTORE_NONE either found no
+    // solution or lost to no restoration, so it is not a candidate here.
+    if (r > RESTORE_NONE && rusi->best_rtype[r - 1] == RESTORE_NONE) continue;
+
+    int64_t coeff_pcost = 0;
+    if (r == RESTORE_WIENER) {
+      coeff_pcost = count_wiener_bits(wiener_win, &rusi->wiener, &rsc->wiener);
+    } else if (r == RESTORE_SGRPROJ) {
+      coeff_pcost = count_sgrproj_bits(&rusi->sgrproj, &rsc->sgrproj);
+    } else if (r != RESTORE_NONE) {
+      assert(0);
+    }
+
+    const int64_t bits =
+        x->switchable_restore_cost[r] + (coeff_pcost << AV1_PROB_COST_SHIFT);
+    double cost = RDCOST_DBL(x->rdmult, bits >> 4, rusi->sse[r]);
+    if (r == RESTORE_SGRPROJ && rusi->sgrproj.ep < 10)
+      cost *= (1 + DUAL_SGR_PENALTY_MULT * rsc->sf->dual_sgr_penalty_level);
+
+    // The first type always seeds the running best; later ones must be
+    // strictly cheaper.
+    if (r != 0 && !(cost < winner_cost)) continue;
+    winner = r;
+    winner_cost = cost;
+    winner_bits = bits;
+  }
+
+  rusi->best_rtype[RESTORE_SWITCHABLE - 1] = winner;
+
+  rsc->sse += rusi->sse[winner];
+  rsc->bits += winner_bits;
+  if (winner == RESTORE_WIENER) {
+    rsc->wiener = rusi->wiener;
+  } else if (winner == RESTORE_SGRPROJ) {
+    rsc->sgrproj = rusi->sgrproj;
+  }
+}
+
+// Copies the per-unit decision made under frame-level type frame_rtype from
+// rusi into rui: the chosen unit type, plus the Wiener parameters when that
+// type is RESTORE_WIENER and the self-guided parameters in every other case.
+static void copy_unit_info(RestorationType frame_rtype,
+                           const RestUnitSearchInfo *rusi,
+                           RestorationUnitInfo *rui) {
+  assert(frame_rtype > 0);
+  rui->restoration_type = rusi->best_rtype[frame_rtype - 1];
+  if (rui->restoration_type != RESTORE_WIENER) {
+    rui->sgrproj_info = rusi->sgrproj;
+    return;
+  }
+  rui->wiener_info = rusi->wiener;
+}
+
+// Evaluates frame-level restoration type rtype on the plane held in rsc by
+// running the matching visitor over every restoration unit, and returns the
+// rate-distortion cost of the accumulated rsc->bits and rsc->sse.
+static double search_rest_type(RestSearchCtxt *rsc, RestorationType rtype) {
+  // Visitors indexed by RestorationType.
+  static const rest_unit_visitor_t visitors[RESTORE_TYPES] = {
+    search_norestore, search_wiener, search_sgrproj, search_switchable
+  };
+  const rest_unit_visitor_t visit = visitors[rtype];
+
+  reset_rsc(rsc);
+  rsc_on_tile(LR_TILE_ROW, LR_TILE_COL, rsc);
+  av1_foreach_rest_unit_in_plane(rsc->cm, rsc->plane, visit, rsc,
+                                 &rsc->tile_rect, rsc->cm->rst_tmpbuf, NULL);
+
+  return RDCOST_DBL(rsc->x->rdmult, rsc->bits >> 4, rsc->sse);
+}
+
+// Returns cm->rst_info[plane].units_per_tile, the restoration-unit count used
+// to size the per-unit search array.
+static int rest_tiles_in_plane(const AV1_COMMON *cm, int plane) {
+  return cm->rst_info[plane].units_per_tile;
+}
+/*
+ * Chooses the loop-restoration type for every plane and stores the result in
+ * cm->rst_info[plane].
+ *
+ * For each plane, rsc.dgd_buffer is border-extended by RESTORATION_BORDER,
+ * every candidate frame-level type r < num_rtypes is evaluated with
+ * search_rest_type(), and the cheapest becomes frame_restoration_type. Types
+ * at or above RESTORE_SWITCHABLE_TYPES are only candidates when the plane has
+ * more than one restoration unit. When force_restore_type is not
+ * RESTORE_TYPES, only RESTORE_NONE and the forced type are evaluated. If the
+ * winner is not RESTORE_NONE, the per-unit choices are copied into
+ * cm->rst_info[plane].unit_info[].
+ */
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *src, AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ assert(!cm->all_lossless);
+
+ int ntiles[2];  // [0] luma, [1] chroma
+ for (int is_uv = 0; is_uv < 2; ++is_uv)
+ ntiles[is_uv] = rest_tiles_in_plane(cm, is_uv);
+
+ assert(ntiles[1] <= ntiles[0]);  // one array sized for luma serves all planes
+ RestUnitSearchInfo *rusi =
+ (RestUnitSearchInfo *)aom_memalign(16, sizeof(*rusi) * ntiles[0]);  // NOTE(review): result is not NULL-checked before the memset below
+
+ // If the restoration unit dimensions are not multiples of
+ // rsi->restoration_unit_size then some elements of the rusi array may be
+ // left uninitialised when we reach copy_unit_info(...). This is not a
+ // problem, as these elements are ignored later, but in order to quiet
+ // Valgrind's warnings we initialise the array below.
+ memset(rusi, 0, sizeof(*rusi) * ntiles[0]);
+ cpi->td.mb.rdmult = cpi->rd.RDMULT;
+
+ RestSearchCtxt rsc;
+ const int plane_start = AOM_PLANE_Y;
+ const int plane_end = num_planes > 1 ? AOM_PLANE_V : AOM_PLANE_Y;
+ for (int plane = plane_start; plane <= plane_end; ++plane) {
+ init_rsc(src, &cpi->common, &cpi->td.mb, &cpi->sf, plane, rusi,
+ &cpi->trial_frame_rst, &rsc);
+
+ const int plane_ntiles = ntiles[plane > 0];
+ const RestorationType num_rtypes =
+ (plane_ntiles > 1) ? RESTORE_TYPES : RESTORE_SWITCHABLE_TYPES;
+
+ double best_cost = 0;
+ RestorationType best_rtype = RESTORE_NONE;
+
+ const int highbd = rsc.cm->seq_params.use_highbitdepth;
+ extend_frame(rsc.dgd_buffer, rsc.plane_width, rsc.plane_height,
+ rsc.dgd_stride, RESTORATION_BORDER, RESTORATION_BORDER,
+ highbd);
+
+ for (RestorationType r = 0; r < num_rtypes; ++r) {
+ if ((force_restore_type != RESTORE_TYPES) && (r != RESTORE_NONE) &&
+ (r != force_restore_type))
+ continue;
+
+ double cost = search_rest_type(&rsc, r);
+
+ if (r == 0 || cost < best_cost) {  // first type seeds the running best
+ best_cost = cost;
+ best_rtype = r;
+ }
+ }
+
+ cm->rst_info[plane].frame_restoration_type = best_rtype;
+ if (force_restore_type != RESTORE_TYPES)
+ assert(best_rtype == force_restore_type || best_rtype == RESTORE_NONE);
+
+ if (best_rtype != RESTORE_NONE) {
+ for (int u = 0; u < plane_ntiles; ++u) {
+ copy_unit_info(best_rtype, &rusi[u], &cm->rst_info[plane].unit_info[u]);
+ }
+ }
+ }
+
+ aom_free(rusi);
+}
diff --git a/third_party/aom/av1/encoder/pickrst.h b/third_party/aom/av1/encoder/pickrst.h
new file mode 100644
index 0000000000..3fec0c34b3
--- /dev/null
+++ b/third_party/aom/av1/encoder/pickrst.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_PICKRST_H_
+#define AOM_AV1_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/encoder.h"
+#include "aom_ports/system_state.h"
+
+struct yv12_buffer_config;
+struct AV1_COMP;
+
+static const uint8_t g_shuffle_stats_data[16] = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+};
+
+// Returns the arithmetic mean of the 8-bit samples of 'src' that lie in rows
+// [v_start, v_end) and columns [h_start, h_end), where consecutive rows are
+// 'stride' samples apart. The number of samples in the window is used as the
+// divisor, so the window is expected to be non-empty.
+static INLINE double find_average(const uint8_t *src, int h_start, int h_end,
+                                  int v_start, int v_end, int stride) {
+  uint64_t total = 0;
+  aom_clear_system_state();
+  for (int row = v_start; row < v_end; ++row) {
+    for (int col = h_start; col < h_end; ++col) {
+      total += src[row * stride + col];
+    }
+  }
+  const int num_samples = (v_end - v_start) * (h_end - h_start);
+  return (double)total / num_samples;
+}
+
+void av1_pick_filter_restoration(const YV12_BUFFER_CONFIG *sd, AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PICKRST_H_
diff --git a/third_party/aom/av1/encoder/pustats.h b/third_party/aom/av1/encoder/pustats.h
new file mode 100644
index 0000000000..40dd467689
--- /dev/null
+++ b/third_party/aom/av1/encoder/pustats.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_PUSTATS_H_
+#define AOM_AV1_ENCODER_PUSTATS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+#define NUM_FEATURES_PUSTATS 8
+#define NUM_HIDDEN_LAYERS 2
+#define HIDDEN_LAYERS_0_NODES 12
+#define HIDDEN_LAYERS_1_NODES 10
+#define LOGITS_NODES 1
+
+static const float
+ av1_pustats_rate_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.1758f, -0.0499f, -10.0069f, -2.2838f, -0.3359f, 0.3459f, -0.3285f,
+ -0.0515f, -0.5417f, 0.2357f, -0.0575f, -69.0782f, 0.5348f, 1.4068f,
+ 0.2213f, -1.0490f, -0.0636f, 0.1654f, 1.1002f, 33.4924f, 0.4358f,
+ 1.2499f, 0.1143f, 0.0592f, -1.6335f, -0.0092f, 1.2207f, -28.4543f,
+ -0.4973f, 0.4368f, 0.2341f, -0.1623f, -3.8986f, 0.1311f, -1.8789f,
+ -3.9079f, -0.8158f, -0.8420f, 1.4295f, -2.3629f, -1.4825f, 0.6498f,
+ -5.3669f, 6.4434f, 1.8393f, -35.0678f, 3.7459f, -2.8504f, 2.0502f,
+ -0.1812f, -3.9011f, -1.0155f, 1.8375f, -1.4517f, 1.3917f, 3.8664f,
+ 0.8345f, -0.3472f, 5.7740f, -1.1196f, -0.3264f, -1.2481f, -0.9284f,
+ -4.9657f, 2.2831f, 0.7337f, 2.3176f, 0.6416f, 0.8804f, 1.9988f,
+ -1.3426f, 1.2728f, 1.2249f, -0.1551f, 5.6045f, 0.2046f, -2.1464f,
+ -2.4922f, -0.5334f, 12.1055f, 7.2467f, -0.0070f, 0.0234f, 0.0021f,
+ 0.0215f, -0.0098f, -0.0682f, -6.1494f, -0.3176f, -1.6069f, -0.2119f,
+ -1.0533f, -0.3566f, 0.5294f, -0.4335f, 0.1626f,
+ };
+
+static const float av1_pustats_rate_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+ {
+ 10.5266f, 5.3268f, -1.0678f, 7.7411f, 8.7164f, -0.3235f,
+ 7.3028f, 9.0874f, -6.4594f, -1.0102f, -1.1146f, 10.8419f,
+ };
+
+static const float
+ av1_pustats_rate_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ 10.5932f, 2.5192f, -0.0015f, 5.9479f, 5.2426f, -0.4091f, 5.3220f,
+ 6.0469f, 0.7200f, 3.3241f, 5.5006f, 12.8290f, -1.6396f, 0.5743f,
+ -0.8370f, 1.9956f, -4.9270f, -1.5295f, 2.1350f, -9.4415f, -0.7094f,
+ 5.1822f, 19.7287f, -3.0444f, -0.3320f, 0.0031f, -0.2709f, -0.5249f,
+ 0.3281f, -0.2240f, 0.2225f, -0.2386f, -0.4370f, -0.2438f, -0.4928f,
+ -0.2842f, -2.1772f, 9.2570f, -17.6655f, 3.5448f, -2.8394f, -1.0167f,
+ -0.5115f, -1.9260f, -0.2111f, -0.7528f, -1.2387f, -0.0401f, 5.0716f,
+ -3.3763f, -0.2898f, -0.4956f, -7.9993f, 0.1526f, -0.0242f, 0.7354f,
+ 6.0432f, 4.8043f, 7.4790f, -0.6295f, 1.7565f, 3.7197f, -2.3963f,
+ 6.8945f, 2.9717f, -3.1623f, 3.4241f, 4.4676f, -1.8154f, -2.9401f,
+ -8.5657f, -3.0240f, -1.4661f, 8.1145f, -12.7858f, 3.3624f, -1.0819f,
+ -4.2856f, 1.1801f, -0.5587f, -1.6062f, -1.1813f, -3.5882f, -0.2490f,
+ -24.9566f, -0.4140f, -0.1113f, 3.5537f, 4.4112f, 0.1367f, -1.5876f,
+ 1.6605f, 1.3903f, -0.0253f, -2.1419f, -2.2197f, -0.7659f, -0.4249f,
+ -0.0424f, 0.1486f, 0.4643f, -0.9068f, -0.3619f, -0.7624f, -0.9132f,
+ -0.4947f, -0.3527f, -0.5445f, -0.4768f, -1.7761f, -1.0686f, 0.5462f,
+ 1.3371f, 4.3116f, 0.0777f, -2.7216f, -1.8908f, 3.4989f, 7.7269f,
+ -2.7566f,
+ };
+
+static const float av1_pustats_rate_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+ {
+ 13.2435f, -8.5477f, -0.0998f, -1.5131f, -12.0187f,
+ 6.1715f, 0.5094f, 7.6433f, -0.3992f, -1.3555f,
+ };
+
+static const float
+ av1_pustats_rate_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ 4.3078f, -17.3497f, 0.0195f, 34.6032f, -5.0127f,
+ 5.3079f, 10.0077f, -13.129f, 0.0087f, -8.4009f,
+ };
+
+static const float av1_pustats_rate_logits_bias[LOGITS_NODES] = {
+ 4.5103f,
+};
+
+// Network description assembled from the av1_pustats_rate_* tables above:
+// NUM_FEATURES_PUSTATS (8) inputs, two hidden layers of 12 and 10 nodes, and
+// a single output. Presumably this is the rate model -- confirm against the
+// code that evaluates it.
+static const NN_CONFIG av1_pustats_rate_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ // Per-layer kernel tables, in order: hidden layer 0, hidden layer 1, output.
+ {
+ av1_pustats_rate_hiddenlayer_0_kernel,
+ av1_pustats_rate_hiddenlayer_1_kernel,
+ av1_pustats_rate_logits_kernel,
+ },
+ // Per-layer bias tables, in the same order as the kernels.
+ {
+ av1_pustats_rate_hiddenlayer_0_bias,
+ av1_pustats_rate_hiddenlayer_1_bias,
+ av1_pustats_rate_logits_bias,
+ },
+};
+
+static const float
+ av1_pustats_dist_hiddenlayer_0_kernel[NUM_FEATURES_PUSTATS *
+ HIDDEN_LAYERS_0_NODES] = {
+ -0.2560f, 0.1105f, -0.8434f, -0.0132f, -8.9371f, -1.1176f, -0.3655f,
+ 0.4885f, 1.7518f, 0.4985f, 0.5582f, -0.3739f, 0.9403f, 0.3874f,
+ 0.3265f, 1.7383f, 3.1747f, 0.0285f, 3.3942f, -0.0123f, 0.5057f,
+ 0.1584f, 0.2697f, 4.6151f, 3.6251f, -0.0121f, -1.0047f, -0.0037f,
+ 0.0127f, 0.1935f, -0.5277f, -2.7144f, 0.0729f, -0.1457f, -0.0816f,
+ -0.5462f, 0.4738f, 0.3599f, -0.0564f, 0.0910f, 0.0126f, -0.0310f,
+ -2.1311f, -0.4666f, -0.0074f, -0.0765f, 0.0287f, -0.2662f, -0.0999f,
+ -0.2983f, -0.4899f, -0.2314f, 0.2873f, -0.3614f, 0.1783f, -0.1210f,
+ 0.3569f, 0.5436f, -8.0536f, -0.0044f, -1.5255f, -0.8247f, -0.4556f,
+ 1.9045f, 0.5463f, 0.1102f, -0.9293f, -0.0185f, -0.8302f, -0.4378f,
+ -0.3531f, -1.3095f, 0.6099f, 0.7977f, 4.1950f, -0.0067f, -0.2762f,
+ -0.1574f, -0.2149f, 0.6104f, -1.7053f, 0.1904f, 4.2402f, -0.2671f,
+ 0.8940f, 0.6820f, 0.2241f, -0.9459f, 1.4571f, 0.5255f, 2.3352f,
+ -0.0806f, 0.5231f, 0.3928f, 0.4146f, 2.0956f,
+ };
+
+static const float av1_pustats_dist_hiddenlayer_0_bias[HIDDEN_LAYERS_0_NODES] =
+ {
+ 1.1597f, 0.0836f, -0.7471f, -0.2439f, -0.0438f, 2.4626f,
+ 0.f, 1.1485f, 2.7085f, -4.7897f, 1.4093f, -1.657f,
+ };
+
+static const float
+ av1_pustats_dist_hiddenlayer_1_kernel[HIDDEN_LAYERS_0_NODES *
+ HIDDEN_LAYERS_1_NODES] = {
+ -0.5203f, -1.3468f, 0.3865f, -0.6859f, 0.0058f, 4.0682f, 0.4807f,
+ -0.1380f, 0.6050f, 0.8958f, 0.7748f, -0.1311f, 1.7317f, 1.1265f,
+ 0.0827f, 0.1407f, -0.3605f, 0.5429f, 0.1880f, -0.1439f, 0.2837f,
+ 1.6477f, 0.0832f, 0.0593f, -1.8464f, -0.7241f, -1.0672f, -0.3546f,
+ -0.3842f, -2.3637f, 0.2514f, 0.8263f, -0.1872f, 0.5774f, -0.3610f,
+ -0.0205f, 1.3977f, -0.1083f, 0.6923f, 1.3039f, -0.2870f, 1.0622f,
+ -0.0566f, 0.2697f, -0.5429f, -0.6193f, 1.7559f, 0.3246f, 1.9159f,
+ 0.3744f, 0.0686f, 1.0191f, -0.4212f, 1.9591f, -0.0691f, -0.1085f,
+ -1.2034f, 0.0606f, 1.0116f, 0.5565f, -0.1874f, -0.7898f, 0.4796f,
+ 0.2290f, 0.4334f, -0.5817f, -0.2949f, 0.1367f, -0.2932f, -1.1265f,
+ 0.0133f, -0.5309f, -3.3191f, 0.0939f, 0.3895f, -2.5812f, -0.0066f,
+ -3.0063f, -0.2982f, 0.7309f, -0.2422f, -0.2770f, -0.7152f, 0.1700f,
+ 1.9630f, 0.1988f, 0.4194f, 0.8762f, 0.3402f, 0.1051f, -0.1598f,
+ 0.2405f, 0.0392f, 1.1256f, 1.5245f, 0.0950f, 0.2160f, -0.5023f,
+ 0.2584f, 0.2074f, 0.2218f, 0.3966f, -0.0921f, -0.2435f, -0.4560f,
+ -1.1923f, -0.3716f, -0.3286f, -1.3225f, 0.1896f, -0.3342f, -0.7888f,
+ -0.4488f, -1.7168f, 0.3341f, 0.1146f, 0.5226f, 0.2610f, -0.4574f,
+ -0.4164f,
+ };
+
+static const float av1_pustats_dist_hiddenlayer_1_bias[HIDDEN_LAYERS_1_NODES] =
+ {
+ -2.3014f, -2.4292f, 1.3317f, -3.2361f, -1.918f,
+ 2.7149f, -2.5649f, 2.7765f, 2.9617f, 2.7684f,
+ };
+
+static const float
+ av1_pustats_dist_logits_kernel[HIDDEN_LAYERS_1_NODES * LOGITS_NODES] = {
+ -0.6868f, -0.6715f, 0.449f, -1.293f, 0.6214f,
+ 0.9894f, -0.4342f, 0.7002f, 1.4363f, 0.6951f,
+ };
+
+static const float av1_pustats_dist_logits_bias[LOGITS_NODES] = {
+ 2.3371f,
+};
+
+// Network description assembled from the av1_pustats_dist_* tables above:
+// NUM_FEATURES_PUSTATS (8) inputs, two hidden layers of 12 and 10 nodes, and
+// a single output. Presumably this is the distortion model -- confirm against
+// the code that evaluates it.
+static const NN_CONFIG av1_pustats_dist_nnconfig = {
+ NUM_FEATURES_PUSTATS, // num_inputs
+ LOGITS_NODES, // num_outputs
+ NUM_HIDDEN_LAYERS, // num_hidden_layers
+ { HIDDEN_LAYERS_0_NODES, HIDDEN_LAYERS_1_NODES }, // num_hidden_nodes
+ // Per-layer kernel tables, in order: hidden layer 0, hidden layer 1, output.
+ {
+ av1_pustats_dist_hiddenlayer_0_kernel,
+ av1_pustats_dist_hiddenlayer_1_kernel,
+ av1_pustats_dist_logits_kernel,
+ },
+ // Per-layer bias tables, in the same order as the kernels.
+ {
+ av1_pustats_dist_hiddenlayer_0_bias,
+ av1_pustats_dist_hiddenlayer_1_bias,
+ av1_pustats_dist_logits_bias,
+ },
+};
+
+#undef NUM_HIDDEN_LAYERS
+#undef HIDDEN_LAYERS_0_NODES
+#undef HIDDEN_LAYERS_1_NODES
+#undef LOGITS_NODES
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_PUSTATS_H_
diff --git a/third_party/aom/av1/encoder/random.h b/third_party/aom/av1/encoder/random.h
new file mode 100644
index 0000000000..0bca391029
--- /dev/null
+++ b/third_party/aom/av1/encoder/random.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANDOM_H_
+#define AOM_AV1_ENCODER_RANDOM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Steps the linear congruential generator stored in '*state' and returns a
+// pseudo-random number in the range [0, 32768), taken from bits 16..30 of the
+// updated state.
+static INLINE unsigned int lcg_rand16(unsigned int *state) {
+  const unsigned long long next = *state * 1103515245ULL + 12345;
+  *state = (unsigned int)next;
+  return (*state >> 16) & 0x7fff;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RANDOM_H_
diff --git a/third_party/aom/av1/encoder/ransac.c b/third_party/aom/av1/encoder/ransac.c
new file mode 100644
index 0000000000..781f528ebf
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.c
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "av1/encoder/ransac.h"
+#include "av1/encoder/mathutils.h"
+#include "av1/encoder/random.h"
+
+#define MAX_MINPTS 4
+#define MAX_DEGENERATE_ITER 10
+#define MINPTS_MULTIPLIER 5
+
+#define INLIER_THRESHOLD 1.0
+#define MIN_TRIALS 20
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+// Callback types used to specialise ransac() for a particular motion model.
+
+// Returns non-zero when the sampled points 'p' (x, y pairs) cannot be used to
+// fit a model; ransac() then draws a new sample.
+typedef int (*IsDegenerateFunc)(double *p);
+// Same signature as normalize_homography(): normalises 'np' points in place
+// and writes the 3x3 normalising matrix to 'T'.
+typedef void (*NormalizeFunc)(double *p, int np, double *T);
+// Same signature as the denormalize_*() helpers: rewrites 'params' in place
+// given the normalising matrices 'T1' and 'T2' of the two point sets.
+typedef void (*DenormalizeFunc)(double *params, double *T1, double *T2);
+// Fits a model mapping 'points1' onto 'points2' ('points' x, y pairs each) and
+// stores it in 'params'. Returns 0 on success, non-zero on failure.
+typedef int (*FindTransformationFunc)(int points, double *points1,
+ double *points2, double *params);
+// Applies the model 'mat' to 'n' points. Input points are 'stride_points'
+// doubles apart and outputs are written 'stride_proj' doubles apart.
+typedef void (*ProjectPointsDoubleFunc)(double *mat, double *points,
+ double *proj, const int n,
+ const int stride_points,
+ const int stride_proj);
+
+// Applies the translation (mat[0], mat[1]) to 'n' points. Each input point is
+// an (x, y) pair and consecutive input points are 'stride_points' doubles
+// apart; each result is written as an (x, y) pair, 'stride_proj' doubles
+// apart.
+static void project_points_double_translation(double *mat, double *points,
+                                              double *proj, const int n,
+                                              const int stride_points,
+                                              const int stride_proj) {
+  for (int k = 0; k < n; ++k) {
+    const double x = points[0];
+    const double y = points[1];
+    proj[0] = x + mat[0];
+    proj[1] = y + mat[1];
+    points += stride_points;
+    proj += stride_proj;
+  }
+}
+
+// Applies the rotation-zoom model 'mat' to 'n' points:
+//   x' =  mat[2] * x + mat[3] * y + mat[0]
+//   y' = -mat[3] * x + mat[2] * y + mat[1]
+// Input points are 'stride_points' doubles apart and results are written
+// 'stride_proj' doubles apart.
+static void project_points_double_rotzoom(double *mat, double *points,
+                                          double *proj, const int n,
+                                          const int stride_points,
+                                          const int stride_proj) {
+  for (int k = 0; k < n; ++k) {
+    const double x = points[0];
+    const double y = points[1];
+    proj[0] = mat[2] * x + mat[3] * y + mat[0];
+    proj[1] = -mat[3] * x + mat[2] * y + mat[1];
+    points += stride_points;
+    proj += stride_proj;
+  }
+}
+
+// Applies the affine model 'mat' to 'n' points:
+//   x' = mat[2] * x + mat[3] * y + mat[0]
+//   y' = mat[4] * x + mat[5] * y + mat[1]
+// Input points are 'stride_points' doubles apart and results are written
+// 'stride_proj' doubles apart.
+static void project_points_double_affine(double *mat, double *points,
+                                         double *proj, const int n,
+                                         const int stride_points,
+                                         const int stride_proj) {
+  for (int k = 0; k < n; ++k) {
+    const double x = points[0];
+    const double y = points[1];
+    proj[0] = mat[2] * x + mat[3] * y + mat[0];
+    proj[1] = mat[4] * x + mat[5] * y + mat[1];
+    points += stride_points;
+    proj += stride_proj;
+  }
+}
+
+// Normalises the 'n' (x, y) points in 'pts' in place: the points are shifted
+// so that their mean is the origin and then scaled so that their mean distance
+// from the origin is sqrt(2) (the scale is 1 when all points coincide). The
+// 3x3 row-major matrix describing this transform is written to 'T'.
+static void normalize_homography(double *pts, int n, double *T) {
+  double mean_x = 0;
+  double mean_y = 0;
+  double mean_dist = 0;
+
+  assert(n > 0);
+  for (int k = 0; k < n; ++k) {
+    mean_x += pts[2 * k];
+    mean_y += pts[2 * k + 1];
+  }
+  mean_x /= n;
+  mean_y /= n;
+
+  // Centre the points and accumulate their distances from the origin.
+  for (int k = 0; k < n; ++k) {
+    double *const p = pts + 2 * k;
+    p[0] -= mean_x;
+    p[1] -= mean_y;
+    mean_dist += sqrt(p[0] * p[0] + p[1] * p[1]);
+  }
+  mean_dist /= n;
+
+  const double scale = (mean_dist == 0 ? 1.0 : sqrt(2) / mean_dist);
+  T[0] = scale;
+  T[1] = 0;
+  T[2] = -scale * mean_x;
+  T[3] = 0;
+  T[4] = scale;
+  T[5] = -scale * mean_y;
+  T[6] = 0;
+  T[7] = 0;
+  T[8] = 1;
+
+  for (int k = 0; k < n; ++k) {
+    pts[2 * k] *= scale;
+    pts[2 * k + 1] *= scale;
+  }
+}
+
+// Writes to 'iT' the inverse of a normalising matrix 'T' of the form produced
+// by normalize_homography(), i.e. [s 0 tx; 0 s ty; 0 0 1]. Only T[0], T[2]
+// and T[5] are read.
+static void invnormalize_mat(double *T, double *iT) {
+  const double inv_scale = 1.0 / T[0];
+  const double shift_x = -T[2] * inv_scale;
+  const double shift_y = -T[5] * inv_scale;
+
+  // First row.
+  iT[0] = inv_scale;
+  iT[1] = 0;
+  iT[2] = shift_x;
+  // Second row.
+  iT[3] = 0;
+  iT[4] = inv_scale;
+  iT[5] = shift_y;
+  // Third row.
+  iT[6] = 0;
+  iT[7] = 0;
+  iT[8] = 1;
+}
+
+// Converts the 3x3 model 'params', fitted on normalised points, back to the
+// original coordinates in place. 'T1' and 'T2' are the normalising matrices of
+// the source and destination points. Presumably multiply_mat(A, B, out, ...)
+// stores A * B, making the result inv(T2) * params * T1 -- confirm against
+// mathutils.h.
+static void denormalize_homography(double *params, double *T1, double *T2) {
+  double inv_T2[9];
+  double params_T1[9];
+
+  invnormalize_mat(T2, inv_T2);
+  multiply_mat(params, T1, params_T1, 3, 3, 3);
+  multiply_mat(inv_T2, params_T1, params, 3, 3, 3);
+}
+
+// Denormalises an affine least-squares solution in place. On entry 'params'
+// holds the six fitted values in solver order; they are arranged into a 3x3
+// matrix, denormalised with 'T1'/'T2', and written back with the translation
+// terms in params[0..1] and the 2x2 linear part in params[2..5]. params[6] and
+// params[7] are set to 0.
+static void denormalize_affine_reorder(double *params, double *T1, double *T2) {
+  double params_denorm[MAX_PARAMDIM] = {
+    params[0], params[1], params[4],  // first row
+    params[2], params[3], params[5],  // second row
+    0,         0,         1,          // third row
+  };
+  const double *const m = params_denorm;
+
+  denormalize_homography(params_denorm, T1, T2);
+
+  params[0] = m[2];
+  params[1] = m[5];
+  params[2] = m[0];
+  params[3] = m[1];
+  params[4] = m[3];
+  params[5] = m[4];
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Denormalises a rotation-zoom least-squares solution in place. On entry
+// 'params' holds the four fitted values in solver order; they are expanded
+// into a 3x3 matrix, denormalised with 'T1'/'T2', and written back with the
+// translation terms in params[0..1] and the 2x2 linear part in params[2..5],
+// where params[4] = -params[3] and params[5] = params[2]. params[6] and
+// params[7] are set to 0.
+static void denormalize_rotzoom_reorder(double *params, double *T1,
+                                        double *T2) {
+  double params_denorm[MAX_PARAMDIM] = {
+    params[0],  params[1], params[2],  // first row
+    -params[1], params[0], params[3],  // second row
+    0,          0,         1,          // third row
+  };
+  const double *const m = params_denorm;
+
+  denormalize_homography(params_denorm, T1, T2);
+
+  params[0] = m[2];
+  params[1] = m[5];
+  params[2] = m[0];
+  params[3] = m[1];
+  params[4] = -m[1];
+  params[5] = m[0];
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Denormalises a translation estimate in place. On entry params[0..1] hold the
+// translation fitted on normalised points; it is embedded in a 3x3 matrix with
+// an identity linear part, denormalised with 'T1'/'T2', and written back to
+// params[0..1]. params[2..5] are set to the identity and params[6..7] to 0.
+static void denormalize_translation_reorder(double *params, double *T1,
+                                            double *T2) {
+  double params_denorm[MAX_PARAMDIM] = {
+    1, 0, params[0],  // first row
+    0, 1, params[1],  // second row
+    0, 0, 1,          // third row
+  };
+
+  denormalize_homography(params_denorm, T1, T2);
+
+  params[0] = params_denorm[2];
+  params[1] = params_denorm[5];
+  params[2] = 1;
+  params[3] = 0;
+  params[4] = 0;
+  params[5] = 1;
+  params[6] = 0;
+  params[7] = 0;
+}
+
+// Estimates the translation mapping the 'np' points in 'pts1' onto 'pts2' as
+// the mean per-point displacement, and stores the model in 'mat'. Both point
+// arrays are normalised in place as a side effect. Always returns 0.
+static int find_translation(int np, double *pts1, double *pts2, double *mat) {
+  double T1[9], T2[9];
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  double sum_dx = 0;
+  double sum_dy = 0;
+  for (int k = 0; k < np; ++k) {
+    sum_dx += pts2[2 * k] - pts1[2 * k];
+    sum_dy += pts2[2 * k + 1] - pts1[2 * k + 1];
+  }
+
+  mat[0] = sum_dx / np;
+  mat[1] = sum_dy / np;
+  denormalize_translation_reorder(mat, T1, T2);
+  return 0;
+}
+
+// Fits a rotation-zoom model mapping the 'np' points in 'pts1' onto 'pts2' by
+// least squares and stores it in 'mat'. Both point arrays are normalised in
+// place as a side effect. Returns 0 on success and 1 on failure (the scratch
+// buffer could not be allocated or the least-squares solve failed).
+static int find_rotzoom(int np, double *pts1, double *pts2, double *mat) {
+  const int np2 = np * 2;
+  // One allocation holds the (np2 x 4) system matrix, the np2-entry right-hand
+  // side and 20 doubles of solver scratch space.
+  double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 5 + 20));
+  if (!a) return 1;
+  double *b = a + np2 * 4;
+  double *temp = b + np2;
+  int i;
+  double sx, sy, dx, dy;
+
+  double T1[9], T2[9];
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  // Each correspondence contributes two rows to the system.
+  for (i = 0; i < np; ++i) {
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    a[i * 2 * 4 + 0] = sx;
+    a[i * 2 * 4 + 1] = sy;
+    a[i * 2 * 4 + 2] = 1;
+    a[i * 2 * 4 + 3] = 0;
+    a[(i * 2 + 1) * 4 + 0] = sy;
+    a[(i * 2 + 1) * 4 + 1] = -sx;
+    a[(i * 2 + 1) * 4 + 2] = 0;
+    a[(i * 2 + 1) * 4 + 3] = 1;
+
+    b[2 * i] = dx;
+    b[2 * i + 1] = dy;
+  }
+  if (!least_squares(4, a, np2, 4, b, temp, mat)) {
+    aom_free(a);
+    return 1;
+  }
+  denormalize_rotzoom_reorder(mat, T1, T2);
+  aom_free(a);
+  return 0;
+}
+
+// Fits an affine model mapping the 'np' points in 'pts1' onto 'pts2' by least
+// squares and stores it in 'mat'. Both point arrays are normalised in place as
+// a side effect. Returns 0 on success and 1 on failure (the scratch buffer
+// could not be allocated or the least-squares solve failed).
+static int find_affine(int np, double *pts1, double *pts2, double *mat) {
+  const int np2 = np * 2;
+  // One allocation holds the (np2 x 6) system matrix, the np2-entry right-hand
+  // side and 42 doubles of solver scratch space.
+  double *a = (double *)aom_malloc(sizeof(*a) * (np2 * 7 + 42));
+  if (!a) return 1;
+  double *b = a + np2 * 6;
+  double *temp = b + np2;
+  int i;
+  double sx, sy, dx, dy;
+
+  double T1[9], T2[9];
+  normalize_homography(pts1, np, T1);
+  normalize_homography(pts2, np, T2);
+
+  // Each correspondence contributes two rows to the system.
+  for (i = 0; i < np; ++i) {
+    dx = *(pts2++);
+    dy = *(pts2++);
+    sx = *(pts1++);
+    sy = *(pts1++);
+
+    a[i * 2 * 6 + 0] = sx;
+    a[i * 2 * 6 + 1] = sy;
+    a[i * 2 * 6 + 2] = 0;
+    a[i * 2 * 6 + 3] = 0;
+    a[i * 2 * 6 + 4] = 1;
+    a[i * 2 * 6 + 5] = 0;
+    a[(i * 2 + 1) * 6 + 0] = 0;
+    a[(i * 2 + 1) * 6 + 1] = 0;
+    a[(i * 2 + 1) * 6 + 2] = sx;
+    a[(i * 2 + 1) * 6 + 3] = sy;
+    a[(i * 2 + 1) * 6 + 4] = 0;
+    a[(i * 2 + 1) * 6 + 5] = 1;
+
+    b[2 * i] = dx;
+    b[2 * i + 1] = dy;
+  }
+  if (!least_squares(6, a, np2, 6, b, temp, mat)) {
+    aom_free(a);
+    return 1;
+  }
+  denormalize_affine_reorder(mat, T1, T2);
+  aom_free(a);
+  return 0;
+}
+
+// Fills indices[0 .. minpts - 1] with distinct pseudo-random values in
+// [0, npoints), drawing from the generator state '*seed'. Returns 1 on
+// success, or 0 (without touching 'indices' or '*seed') when npoints is not
+// positive or fewer than 'minpts' points are available.
+static int get_rand_indices(int npoints, int minpts, int *indices,
+                            unsigned int *seed) {
+  int i, j;
+  int ptr;
+  // Validate before drawing: the modulo below needs npoints > 0.
+  if (npoints <= 0 || minpts > npoints) return 0;
+  ptr = lcg_rand16(seed) % npoints;
+  indices[0] = ptr;
+  ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+  i = 1;
+  while (i < minpts) {
+    // Walk forward (with wrap-around) past 'index' not-yet-chosen points.
+    int index = lcg_rand16(seed) % npoints;
+    while (index) {
+      ptr = (ptr == npoints - 1 ? 0 : ptr + 1);
+      for (j = 0; j < i; ++j) {
+        if (indices[j] == ptr) break;
+      }
+      if (j == i) index--;
+    }
+    indices[i++] = ptr;
+  }
+  return 1;
+}
+
+// One candidate motion found by ransac().
+typedef struct {
+ // Number of valid entries at the start of inlier_indices.
+ int num_inliers;
+ // Variance of the inlier projection distances; kInfiniteVariance until a
+ // motion has been stored.
+ double variance;
+ // Indices of the inlier points; ransac() allocates one entry per input
+ // point.
+ int *inlier_indices;
+} RANSAC_MOTION;
+
+// qsort()-style comparator over RANSAC_MOTION. A motion with more inliers is
+// better; ties are broken in favour of the smaller variance. Returns -1 if 'a'
+// is the better motion, 1 if 'b' is better, and 0 otherwise.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+  const RANSAC_MOTION *const lhs = (const RANSAC_MOTION *)arg_a;
+  const RANSAC_MOTION *const rhs = (const RANSAC_MOTION *)arg_b;
+
+  if (lhs->num_inliers != rhs->num_inliers) {
+    return lhs->num_inliers > rhs->num_inliers ? -1 : 1;
+  }
+  if (lhs->variance < rhs->variance) return -1;
+  if (lhs->variance > rhs->variance) return 1;
+  return 0;
+}
+
+// Returns non-zero when 'motion_a' ranks strictly ahead of 'motion_b' under
+// compare_motions().
+static int is_better_motion(const RANSAC_MOTION *motion_a,
+                            const RANSAC_MOTION *motion_b) {
+  const int order = compare_motions(motion_a, motion_b);
+  return order < 0;
+}
+
+// Gathers 'num_points' (x, y) pairs from 'src' into consecutive pairs of
+// 'dest'; the k-th output pair is the input pair numbered indices[k].
+static void copy_points_at_indices(double *dest, const double *src,
+                                   const int *indices, int num_points) {
+  for (int k = 0; k < num_points; ++k) {
+    const double *const from = src + indices[k] * 2;
+    double *const to = dest + k * 2;
+    to[0] = from[0];
+    to[1] = from[1];
+  }
+}
+
+static const double kInfiniteVariance = 1e12;
+
+// Resets 'motion' to the "nothing found yet" state: no inliers, infinite
+// variance, and all 'num_points' entries of inlier_indices zeroed.
+// motion->inlier_indices must point to at least 'num_points' ints.
+static void clear_motion(RANSAC_MOTION *motion, int num_points) {
+  motion->num_inliers = 0;
+  motion->variance = kInfiniteVariance;
+  // The multiplication must be outside sizeof(): sizeof(*p * n) is the size of
+  // a single int, which would clear only the first entry.
+  memset(motion->inlier_indices, 0,
+         sizeof(*motion->inlier_indices) * num_points);
+}
+
+// Fits up to 'num_desired_motions' motion models to a set of point
+// correspondences using RANSAC.
+//
+// matched_points        4 ints per correspondence: (x, y) in the first frame
+//                       followed by (x, y) in the second frame.
+// npoints               number of correspondences.
+// num_inliers_by_motion output; inlier count of each kept motion, best first.
+// params_by_motion      output; the parameters of motion i are written at
+//                       offset (MAX_PARAMDIM - 1) * i.
+// minpts                number of points sampled per trial (<= MAX_MINPTS).
+// is_degenerate / find_transformation / projectpoints
+//                       model-specific callbacks.
+//
+// Returns 0 on success and 1 on failure (no motions requested, too few
+// points, allocation failure, or no usable sample could be drawn).
+static int ransac(const int *matched_points, int npoints,
+                  int *num_inliers_by_motion, double *params_by_motion,
+                  int num_desired_motions, const int minpts,
+                  IsDegenerateFunc is_degenerate,
+                  FindTransformationFunc find_transformation,
+                  ProjectPointsDoubleFunc projectpoints) {
+  static const double PROBABILITY_REQUIRED = 0.9;
+  static const double EPS = 1e-12;
+
+  int N = 10000, trial_count = 0;
+  int i = 0;
+  int ret_val = 0;
+  int alloc_ok;
+
+  unsigned int seed = (unsigned int)npoints;
+
+  int indices[MAX_MINPTS] = { 0 };
+
+  double *points1, *points2;
+  double *corners1, *corners2;
+  double *image1_coord;
+
+  // Store information for the num_desired_motions best transformations found
+  // and the worst motion among them, as well as the motion currently under
+  // consideration.
+  RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+  RANSAC_MOTION current_motion;
+
+  // Store the parameters and the indices of the inlier points for the motion
+  // currently under consideration.
+  double params_this_motion[MAX_PARAMDIM];
+
+  double *cnp1, *cnp2;
+
+  for (i = 0; i < num_desired_motions; ++i) {
+    num_inliers_by_motion[i] = 0;
+  }
+  // With no motions requested there is no "worst kept motion" to compare
+  // against, so bail out before touching the (empty) motions array.
+  if (num_desired_motions <= 0 || npoints < minpts * MINPTS_MULTIPLIER ||
+      npoints == 0) {
+    return 1;
+  }
+
+  points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+  points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+  corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+  corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+  image1_coord = (double *)aom_malloc(sizeof(*image1_coord) * npoints * 2);
+
+  motions =
+      (RANSAC_MOTION *)aom_malloc(sizeof(RANSAC_MOTION) * num_desired_motions);
+  current_motion.inlier_indices =
+      (int *)aom_malloc(sizeof(*current_motion.inlier_indices) * npoints);
+
+  alloc_ok = points1 && points2 && corners1 && corners2 && image1_coord &&
+             motions && current_motion.inlier_indices;
+
+  // Only touch 'motions' once we know it was allocated. Every entry gets its
+  // inlier_indices pointer assigned (possibly NULL) so that the cleanup code
+  // can free all of them unconditionally.
+  if (motions) {
+    for (i = 0; i < num_desired_motions; ++i) {
+      motions[i].inlier_indices =
+          (int *)aom_malloc(sizeof(*motions->inlier_indices) * npoints);
+      if (motions[i].inlier_indices) {
+        clear_motion(motions + i, npoints);
+      } else {
+        alloc_ok = 0;
+      }
+    }
+  }
+
+  if (!alloc_ok) {
+    ret_val = 1;
+    goto finish_ransac;
+  }
+
+  clear_motion(&current_motion, npoints);
+  worst_kept_motion = motions;
+
+  // Split the interleaved correspondences into per-frame point lists.
+  cnp1 = corners1;
+  cnp2 = corners2;
+  for (i = 0; i < npoints; ++i) {
+    *(cnp1++) = *(matched_points++);
+    *(cnp1++) = *(matched_points++);
+    *(cnp2++) = *(matched_points++);
+    *(cnp2++) = *(matched_points++);
+  }
+
+  while (N > trial_count) {
+    double sum_distance = 0.0;
+    double sum_distance_squared = 0.0;
+
+    clear_motion(&current_motion, npoints);
+
+    int degenerate = 1;
+    int num_degenerate_iter = 0;
+
+    // Draw minimal samples until one is usable, giving up after
+    // MAX_DEGENERATE_ITER attempts.
+    while (degenerate) {
+      num_degenerate_iter++;
+      if (!get_rand_indices(npoints, minpts, indices, &seed)) {
+        ret_val = 1;
+        goto finish_ransac;
+      }
+
+      copy_points_at_indices(points1, corners1, indices, minpts);
+      copy_points_at_indices(points2, corners2, indices, minpts);
+
+      degenerate = is_degenerate(points1);
+      if (num_degenerate_iter > MAX_DEGENERATE_ITER) {
+        ret_val = 1;
+        goto finish_ransac;
+      }
+    }
+
+    if (find_transformation(minpts, points1, points2, params_this_motion)) {
+      trial_count++;
+      continue;
+    }
+
+    projectpoints(params_this_motion, corners1, image1_coord, npoints, 2, 2);
+
+    // Collect the points whose projection lands within INLIER_THRESHOLD of
+    // the matched point in the second frame.
+    for (i = 0; i < npoints; ++i) {
+      double dx = image1_coord[i * 2] - corners2[i * 2];
+      double dy = image1_coord[i * 2 + 1] - corners2[i * 2 + 1];
+      double distance = sqrt(dx * dx + dy * dy);
+
+      if (distance < INLIER_THRESHOLD) {
+        current_motion.inlier_indices[current_motion.num_inliers++] = i;
+        sum_distance += distance;
+        sum_distance_squared += distance * distance;
+      }
+    }
+
+    if (current_motion.num_inliers >= worst_kept_motion->num_inliers &&
+        current_motion.num_inliers > 1) {
+      int temp;
+      double fracinliers, pNoOutliers, mean_distance, dtemp;
+      mean_distance = sum_distance / ((double)current_motion.num_inliers);
+      current_motion.variance =
+          sum_distance_squared / ((double)current_motion.num_inliers - 1.0) -
+          mean_distance * mean_distance * ((double)current_motion.num_inliers) /
+              ((double)current_motion.num_inliers - 1.0);
+      if (is_better_motion(&current_motion, worst_kept_motion)) {
+        // This motion is better than the worst currently kept motion. Remember
+        // the inlier points and variance. The parameters for each kept motion
+        // will be recomputed later using only the inliers.
+        worst_kept_motion->num_inliers = current_motion.num_inliers;
+        worst_kept_motion->variance = current_motion.variance;
+        memcpy(worst_kept_motion->inlier_indices, current_motion.inlier_indices,
+               sizeof(*current_motion.inlier_indices) * npoints);
+
+        // Re-estimate how many trials are needed given the inlier fraction
+        // just observed; N only ever shrinks and never drops below MIN_TRIALS.
+        assert(npoints > 0);
+        fracinliers = (double)current_motion.num_inliers / (double)npoints;
+        pNoOutliers = 1 - pow(fracinliers, minpts);
+        pNoOutliers = fmax(EPS, pNoOutliers);
+        pNoOutliers = fmin(1 - EPS, pNoOutliers);
+        dtemp = log(1.0 - PROBABILITY_REQUIRED) / log(pNoOutliers);
+        temp = (dtemp > (double)INT32_MAX)
+                   ? INT32_MAX
+                   : dtemp < (double)INT32_MIN ? INT32_MIN : (int)dtemp;
+
+        if (temp > 0 && temp < N) {
+          N = AOMMAX(temp, MIN_TRIALS);
+        }
+
+        // Determine the new worst kept motion and its num_inliers and variance.
+        for (i = 0; i < num_desired_motions; ++i) {
+          if (is_better_motion(worst_kept_motion, &motions[i])) {
+            worst_kept_motion = &motions[i];
+          }
+        }
+      }
+    }
+    trial_count++;
+  }
+
+  // Sort the motions, best first.
+  qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+  // Recompute the motions using only the inliers.
+  for (i = 0; i < num_desired_motions; ++i) {
+    if (motions[i].num_inliers >= minpts) {
+      copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+                             motions[i].num_inliers);
+      copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+                             motions[i].num_inliers);
+
+      find_transformation(motions[i].num_inliers, points1, points2,
+                          params_by_motion + (MAX_PARAMDIM - 1) * i);
+    }
+    num_inliers_by_motion[i] = motions[i].num_inliers;
+  }
+
+finish_ransac:
+  aom_free(points1);
+  aom_free(points2);
+  aom_free(corners1);
+  aom_free(corners2);
+  aom_free(image1_coord);
+  aom_free(current_motion.inlier_indices);
+  if (motions) {
+    for (i = 0; i < num_desired_motions; ++i) {
+      aom_free(motions[i].inlier_indices);
+    }
+  }
+  aom_free(motions);
+
+  return ret_val;
+}
+
+// Returns non-zero when the 2-D points p1, p2 and p3 are (nearly) collinear,
+// i.e. when the magnitude of the cross product of (p2 - p1) and (p3 - p1) is
+// below 1e-3.
+static int is_collinear3(double *p1, double *p2, double *p3) {
+  static const double collinear_eps = 1e-3;
+  const double ux = p2[0] - p1[0];
+  const double uy = p2[1] - p1[1];
+  const double vx = p3[0] - p1[0];
+  const double vy = p3[1] - p1[1];
+  const double cross = ux * vy - uy * vx;
+  return fabs(cross) < collinear_eps;
+}
+
+// Returns non-zero when the first two points of 'p' (x0, y0, x1, y1) are too
+// close together to be a useful sample: squared distance of at most 2.
+static int is_degenerate_translation(double *p) {
+  const double dx = p[0] - p[2];
+  const double dy = p[1] - p[3];
+  return dx * dx + dy * dy <= 2;
+}
+
+// Returns non-zero when the first three (x, y) points of 'p' are collinear
+// according to is_collinear3().
+static int is_degenerate_affine(double *p) {
+  double *const first = &p[0];
+  double *const second = &p[2];
+  double *const third = &p[4];
+  return is_collinear3(first, second, third);
+}
+
+// Fits translation-only motions with RANSAC. 'matched_points' holds 4 ints per
+// correspondence (x, y in the first frame, then x, y in the second). Three
+// points are sampled per trial. Returns 0 on success and 1 on failure; see
+// ransac() for the output layout.
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3,
+ is_degenerate_translation, find_translation,
+ project_points_double_translation);
+}
+
+// Fits rotation-zoom motions with RANSAC. 'matched_points' holds 4 ints per
+// correspondence (x, y in the first frame, then x, y in the second). Three
+// points are sampled per trial and samples whose points are collinear are
+// rejected. Returns 0 on success and 1 on failure; see ransac() for the
+// output layout.
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_rotzoom, project_points_double_rotzoom);
+}
+
+// Fits affine motions with RANSAC. 'matched_points' holds 4 ints per
+// correspondence (x, y in the first frame, then x, y in the second). Three
+// points are sampled per trial and samples whose points are collinear are
+// rejected. Returns 0 on success and 1 on failure; see ransac() for the
+// output layout.
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_desired_motions) {
+ return ransac(matched_points, npoints, num_inliers_by_motion,
+ params_by_motion, num_desired_motions, 3, is_degenerate_affine,
+ find_affine, project_points_double_affine);
+}
diff --git a/third_party/aom/av1/encoder/ransac.h b/third_party/aom/av1/encoder/ransac.h
new file mode 100644
index 0000000000..c429f2ce5e
--- /dev/null
+++ b/third_party/aom/av1/encoder/ransac.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RANSAC_H_
+#define AOM_AV1_ENCODER_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+
+#include "av1/common/warped_motion.h"
+
+// Common signature of the ransac_*() entry points below, so that a caller can
+// select the motion model through a function pointer.
+typedef int (*RansacFunc)(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+
+/* Each of these functions fits a motion model from a set of
+ corresponding points in 2 frames using RANSAC.
+
+ matched_points holds 4 ints per correspondence: x, y in the first frame
+ followed by x, y in the second frame; npoints is the number of
+ correspondences. Up to num_motions motions are returned, best first:
+ num_inliers_by_motion[i] receives the inlier count of motion i and its
+ parameters are written at params_by_motion + (MAX_PARAMDIM - 1) * i.
+ The return value is 0 on success and 1 on failure. */
+int ransac_affine(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_rotzoom(int *matched_points, int npoints, int *num_inliers_by_motion,
+ double *params_by_motion, int num_motions);
+int ransac_translation(int *matched_points, int npoints,
+ int *num_inliers_by_motion, double *params_by_motion,
+ int num_motions);
+#endif // AOM_AV1_ENCODER_RANSAC_H_
diff --git a/third_party/aom/av1/encoder/rate_distortion_model_params.h b/third_party/aom/av1/encoder/rate_distortion_model_params.h
new file mode 100644
index 0000000000..7cd0962c57
--- /dev/null
+++ b/third_party/aom/av1/encoder/rate_distortion_model_params.h
@@ -0,0 +1,591 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+#define AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// 22 float features +
+// 2 categorical features with 4 possible values, converted to one-hot vectors.
+// So, total 22 + 2 * 4 = 30 features.
+#define NUM_FEATURES 30
+#define NUM_HIDDEN_LAYERS 1
+#define NUM_HIDDEN_NODES 96  // nodes in the single hidden layer
+#define NUM_OUTPUTS 1  // all four are #undef'd at the end of this header
+
+//------------------------------------------------------------------------------
+// RDCost model
+
+static const float
+ av1_rdcost_model_nn_weights_layer0[NUM_FEATURES * NUM_HIDDEN_NODES] = {
+ -0.0699f, 0.2790f, 0.1915f, 0.2669f, 0.4637f, 0.4095f,
+ 0.2129f, 0.0634f, 0.2306f, -0.2232f, -0.5711f, -0.6493f,
+ -0.7406f, -0.8440f, 0.4105f, 0.1392f, 0.5218f, -0.1618f,
+ -0.1719f, 0.3409f, 0.1111f, -0.3609f, -0.2929f, 0.3869f,
+ -0.5373f, 0.0700f, 0.2572f, 0.2483f, -0.0314f, 0.5228f,
+ 0.0169f, -0.1357f, 0.0419f, -0.1722f, 0.1303f, 0.1198f,
+ -0.0013f, 0.1309f, 0.0293f, -0.1941f, 0.0668f, -0.0643f,
+ -0.0381f, 0.1249f, -0.0731f, -0.1649f, 0.0964f, 0.0270f,
+ 0.1354f, 0.0538f, -0.2064f, -0.2067f, -0.0569f, 0.0449f,
+ 0.1680f, -0.0732f, -0.0785f, 0.1884f, -0.2137f, -0.0189f,
+ 0.2976f, 0.2818f, -0.0222f, 0.2658f, 0.0488f, 0.2778f,
+ -0.1110f, 0.2069f, -0.0072f, -0.0095f, -0.1105f, -0.1365f,
+ -0.4245f, -0.4751f, -0.0736f, 0.2333f, 0.0653f, -0.0249f,
+ 0.0055f, -0.0838f, -0.0489f, -0.2597f, 0.2621f, -0.0251f,
+ -0.0545f, 0.0816f, -0.0816f, 0.3396f, -0.1047f, 0.3678f,
+ 0.1487f, -0.0270f, 0.2574f, 0.1018f, 0.2560f, -0.0598f,
+ -0.0446f, -0.1792f, 0.5336f, -0.1590f, -0.9820f, -0.6514f,
+ -0.6304f, -0.8359f, -0.0699f, 0.0295f, -0.0057f, -0.3088f,
+ -0.1466f, 0.2220f, -0.1980f, -0.3400f, -0.1228f, 0.2667f,
+ -0.4816f, 0.0155f, -0.0194f, 0.2051f, 0.0513f, 0.1575f,
+ -121.4240f, -126.6840f, -124.1106f, -127.6184f, -85.0333f, -26.6396f,
+ 2.7020f, 102.0452f, -85.5128f, 0.0076f, 122.2206f, 107.5265f,
+ 108.3773f, 93.4847f, 20.3705f, -89.6993f, -176.9070f, -41.7543f,
+ -123.0293f, -91.6437f, -205.7099f, -62.5346f, -83.2987f, 21.3830f,
+ 56.6341f, -120.8647f, -127.7562f, -121.6688f, -127.4225f, -74.8045f,
+ -15.9247f, -14.6468f, -14.7788f, -15.4498f, -18.5514f, -11.1579f,
+ -5.8164f, -3.4318f, 0.8100f, 0.0642f, 203.5111f, 189.6872f,
+ 190.4776f, 176.4784f, -4.9427f, -12.5324f, -7.6861f, 21.9182f,
+ -6.7864f, -7.1906f, -8.1292f, 21.4780f, -7.8016f, -5.2653f,
+ 61.8526f, -15.5105f, -14.6900f, -14.1459f, -15.4350f, -19.1379f,
+ -0.7876f, -1.8558f, -4.6035f, -6.8405f, -0.2904f, 2.3202f,
+ 1.8127f, -2.9397f, -0.8187f, -0.6098f, 22.6173f, 10.3668f,
+ 12.9363f, 2.4541f, 6.6700f, 0.3804f, -3.3117f, 8.5464f,
+ -25.8348f, 1.8698f, -9.5753f, 8.5558f, -16.3576f, 7.2217f,
+ 35.3115f, -1.1447f, -2.6530f, -4.7027f, -5.7024f, -0.9513f,
+ 0.8393f, 0.7085f, 0.7879f, 0.3728f, 3.0574f, 1.1360f,
+ 26.0531f, 4.1084f, -1.7340f, 0.1683f, -450.7927f, -444.5818f,
+ -442.5239f, -438.1168f, 2.4924f, -0.0147f, -0.0797f, -47.5322f,
+ -1.7638f, -0.8608f, -0.6500f, -44.4326f, -0.9027f, 2.5560f,
+ -267.6517f, 0.2642f, 0.9457f, 0.7944f, 0.3609f, 3.2742f,
+ -74.3400f, -81.6894f, -76.2162f, -69.2979f, -90.2476f, -39.7389f,
+ 2.2545f, 36.5095f, -60.1129f, -1.0383f, 87.0348f, 83.9940f,
+ 83.7199f, 80.8609f, 14.9075f, -78.7405f, -74.3549f, -4.2382f,
+ -23.9739f, -91.8469f, -67.2654f, -21.5293f, -9.9857f, 11.8391f,
+ 35.8223f, -74.2551f, -81.0729f, -73.8347f, -70.3798f, -86.8052f,
+ 0.1701f, -0.1136f, 0.0060f, -0.0496f, -0.1727f, 0.0195f,
+ -0.1040f, 0.1027f, 0.0467f, -0.2538f, -0.1322f, 0.0860f,
+ 0.0093f, -0.2801f, -0.0958f, 0.0497f, -0.0582f, -0.0311f,
+ 0.1840f, 0.0752f, 0.0282f, 0.0297f, 0.0607f, 0.0650f,
+ 0.0893f, 0.1297f, 0.0373f, 0.0040f, -0.0973f, 0.0248f,
+ -0.1419f, 0.0322f, -0.0712f, 0.0860f, -0.0426f, -0.1989f,
+ 0.1393f, -0.1183f, 0.0735f, -0.1895f, 0.1447f, -0.0056f,
+ -0.1833f, 0.0884f, 0.0949f, 0.0476f, 0.0551f, 0.2125f,
+ -0.1537f, -0.0141f, -0.2182f, 0.1567f, 0.0457f, -0.1485f,
+ -0.1177f, 0.0391f, 0.1982f, -0.1288f, 0.1165f, -0.2019f,
+ 0.4550f, 0.5179f, 0.4311f, 0.1861f, 0.6199f, 0.4542f,
+ 0.2034f, 0.1128f, 1.3489f, -0.2525f, -2.1139f, -2.2444f,
+ -2.3679f, -2.3378f, 0.5682f, 0.1348f, 0.3032f, -1.5835f,
+ 0.2883f, 0.1693f, 0.0439f, -1.4556f, 0.3818f, 0.4875f,
+ -1.8899f, 0.2510f, 0.6450f, 0.6082f, 0.5962f, 0.8131f,
+ 12.0281f, 13.3899f, 13.6249f, 15.8068f, -1.5453f, 6.7456f,
+ -6.0877f, 26.2596f, 6.2223f, -0.5922f, 134.1428f, 128.8985f,
+ 128.7538f, 123.0920f, 1.3207f, 18.3069f, 15.7436f, 46.5230f,
+ 24.7455f, 15.0688f, 19.9965f, 34.7236f, 19.7171f, 1.2018f,
+ 49.7274f, 11.8957f, 13.1578f, 14.0451f, 15.3544f, -3.5601f,
+ 1.0048f, 0.9479f, 1.1832f, 2.0635f, -2.9808f, 2.0803f,
+ -7.5815f, 8.4733f, -4.2008f, 0.1217f, 226.5257f, 210.7018f,
+ 211.6235f, 195.2605f, 0.8283f, 1.0977f, 1.4858f, 41.1242f,
+ 1.5822f, 0.8742f, 2.0440f, 33.6213f, 1.6177f, 0.9661f,
+ 65.0014f, 1.4197f, 1.0109f, 1.3153f, 1.5470f, -3.2833f,
+ 2.0858f, 2.0012f, 2.1088f, 2.5593f, -0.9422f, 1.8554f,
+ -6.5378f, 0.6780f, 2.3186f, 0.0506f, 218.3285f, 203.4055f,
+ 204.0362f, 188.7854f, 0.3701f, 2.5257f, 3.5172f, 28.8144f,
+ 2.1511f, 3.4676f, 2.6337f, 28.5113f, 2.4254f, -0.0548f,
+ 59.4511f, 2.0757f, 2.1551f, 2.2271f, 2.5300f, -1.4173f,
+ 91.9240f, 88.2142f, 83.6155f, 82.2482f, -9.2566f, 10.9654f,
+ -2.6974f, 62.6750f, -3.6298f, -0.1245f, 69.6721f, 67.1340f,
+ 66.9162f, 64.1994f, -83.6778f, 76.8107f, 69.7832f, 64.9261f,
+ 68.4901f, 76.3615f, 70.8108f, 63.5435f, 69.1973f, -83.6034f,
+ 24.8275f, 90.1923f, 87.6831f, 82.9783f, 81.8558f, -7.1010f,
+ 95.1656f, 88.3853f, 80.5835f, 79.5990f, -3.0720f, 8.1290f,
+ -0.6151f, 63.6425f, -4.5833f, -0.0063f, 70.1861f, 66.6250f,
+ 66.6148f, 63.0886f, -89.2863f, 74.7684f, 64.8897f, 60.4134f,
+ 62.5241f, 78.7076f, 61.7234f, 60.1688f, 61.9509f, -89.4098f,
+ 30.3361f, 92.9144f, 88.5954f, 79.6336f, 79.2453f, -0.4101f,
+ 0.6287f, 0.8050f, 0.4417f, 0.5419f, 0.5972f, 1.3037f,
+ 0.4316f, -0.0013f, -0.3673f, -0.4952f, 6.1773f, 5.7825f,
+ 6.1705f, 5.3848f, 1.7607f, -0.0152f, -0.2924f, 0.8199f,
+ 1.3326f, 0.7197f, -0.6332f, 1.1127f, 1.0472f, 1.8468f,
+ 3.4419f, 0.8233f, 0.7175f, 0.8514f, 0.6372f, 0.9472f,
+ -0.0813f, -0.0197f, -0.0096f, -0.2015f, 0.1133f, -0.0305f,
+ 0.0578f, 0.1375f, -0.0750f, -0.1702f, 0.1246f, -0.1782f,
+ 0.2017f, 0.0425f, -0.0602f, 0.1837f, 0.1044f, -0.1273f,
+ -0.1431f, 0.0672f, -0.1807f, -0.1045f, -0.1355f, -0.0497f,
+ -0.0561f, -0.0633f, 0.1907f, -0.0777f, 0.1203f, 0.0754f,
+ 0.4079f, 0.2001f, 0.0558f, 0.0622f, 0.2951f, 0.6541f,
+ -0.0068f, 0.1070f, 0.4469f, -0.1266f, -1.3035f, -1.3324f,
+ -1.3612f, -0.9966f, 0.7986f, 0.3192f, -0.5028f, -0.3844f,
+ -0.4079f, 0.6690f, -0.5109f, -0.2719f, -0.4958f, 1.0310f,
+ -0.8044f, 0.1447f, 0.4221f, 0.3194f, 0.3063f, 0.5520f,
+ 0.4667f, -5.7238f, -0.5602f, 12.6339f, -15.1865f, -14.9035f,
+ -3.0726f, 9.5347f, -24.6225f, -2.7086f, 89.8557f, 95.0657f,
+ 93.8693f, 99.1085f, -35.9483f, -18.0363f, -1.6298f, 25.3484f,
+ 39.3975f, -15.3199f, 5.7664f, 17.2367f, 25.2788f, -36.5648f,
+ 29.1426f, 0.3857f, -5.2117f, 0.0533f, 12.1707f, -11.1735f,
+ 0.2673f, 0.0090f, 0.1574f, 0.0904f, 0.0281f, 0.1144f,
+ 0.1123f, -0.0061f, 0.0954f, -0.0094f, -0.4387f, -0.5006f,
+ -0.2560f, -0.2326f, -0.1769f, 0.0465f, 0.1273f, -0.1627f,
+ 0.2987f, -0.3041f, 0.1131f, -0.3620f, 0.0932f, -0.0649f,
+ -0.4597f, 0.2535f, -0.0994f, 0.1390f, 0.1279f, 0.4207f,
+ -39.1159f, -42.6382f, -38.4225f, -31.2301f, -28.2382f, -28.1176f,
+ -9.5822f, 1.1886f, -1.2964f, -0.7908f, 154.9819f, 147.1914f,
+ 147.0482f, 138.7535f, -21.7014f, -35.7117f, -28.8802f, -3.8968f,
+ -21.5007f, -28.2213f, -28.4878f, -3.7558f, -26.8317f, -22.8491f,
+ 50.9464f, -37.0918f, -42.8811f, -39.3079f, -32.1904f, -26.6354f,
+ -72.5346f, -75.5751f, -72.6896f, -71.3671f, -35.3279f, -21.6077f,
+ -5.8259f, 38.7516f, -6.8012f, 0.0172f, 170.0685f, 157.4452f,
+ 158.2334f, 145.0102f, 10.0653f, -45.1775f, -56.4571f, -5.1165f,
+ -75.8980f, -46.8672f, -55.3642f, -6.5631f, -81.0258f, 10.1348f,
+ 55.9786f, -70.8124f, -75.7040f, -73.9831f, -70.8786f, -34.9723f,
+ 88.6239f, 86.5330f, 80.9333f, 79.6833f, -10.0096f, 10.6312f,
+ -4.2350f, 62.6230f, -3.2991f, -0.0843f, 75.8659f, 72.7886f,
+ 72.5301f, 68.8265f, -81.8276f, 70.3025f, 62.9511f, 62.5706f,
+ 69.1842f, 69.3637f, 65.4820f, 65.4357f, 71.5347f, -82.1064f,
+ 24.1925f, 86.2418f, 85.4985f, 80.4091f, 79.5378f, -9.3877f,
+ -7.6594f, -4.9581f, -10.6385f, -20.2307f, -44.2261f, -13.7557f,
+ -4.5344f, 18.1793f, -10.5522f, -1.5878f, 110.3187f, 102.4945f,
+ 102.3305f, 94.1324f, -25.2665f, 9.8172f, -4.4791f, 69.4972f,
+ -6.7571f, 5.8378f, -11.6101f, 70.7066f, -4.9327f, -24.0513f,
+ 41.4598f, -7.0600f, -7.0940f, -10.2478f, -18.9616f, -46.7505f,
+ 90.9365f, 86.0260f, 73.2934f, 69.3406f, 3.3863f, 3.8524f,
+ 0.6536f, 63.2150f, -10.6304f, 0.0291f, 73.0071f, 69.7660f,
+ 69.0457f, 65.5611f, -92.3379f, 74.2756f, 54.5025f, 84.3183f,
+ 53.7481f, 73.5624f, 55.3827f, 82.3242f, 53.5432f, -92.5355f,
+ 25.3457f, 89.1858f, 84.4763f, 72.9840f, 69.1889f, 4.6719f,
+ -0.0129f, 0.1995f, 0.2069f, 0.0358f, 0.1209f, -0.1185f,
+ -0.1217f, -0.1456f, 0.0125f, -0.1354f, 0.0510f, -0.0572f,
+ 0.1397f, 0.1453f, -0.0086f, 0.0107f, 0.0232f, 0.1508f,
+ 0.0884f, -0.0967f, -0.1786f, 0.1361f, -0.1399f, -0.2021f,
+ -0.0242f, -0.2169f, 0.0133f, 0.0116f, -0.1489f, -0.0093f,
+ -0.0796f, 0.1507f, 0.0906f, 0.0228f, -0.0166f, -0.1875f,
+ 0.0471f, 0.1184f, -0.0007f, -0.2732f, -0.1386f, -0.2057f,
+ -0.0213f, -0.1699f, 0.0996f, 0.1562f, 0.1850f, -0.0362f,
+ -0.2059f, 0.0258f, -0.0135f, -0.1276f, 0.0034f, 0.2023f,
+ 0.0857f, -0.0085f, -0.1955f, -0.1666f, -0.0920f, 0.0971f,
+ -0.0292f, -0.0512f, -0.0753f, -0.0739f, -0.0873f, -0.1200f,
+ 0.0220f, -0.1359f, 0.2013f, -0.0445f, 0.1143f, -0.1484f,
+ -0.1556f, -0.0003f, 0.1711f, -0.0724f, -0.0531f, 0.1126f,
+ 0.0476f, -0.0057f, 0.0088f, 0.0792f, -0.0438f, -0.1118f,
+ -0.0244f, 0.0712f, 0.0930f, -0.0203f, 0.1662f, -0.0695f,
+ -12.3872f, -18.7022f, -13.4237f, -1.4731f, -18.6843f, -14.1515f,
+ -7.5057f, 40.2090f, -2.7774f, -1.8433f, 123.6006f, 119.0557f,
+ 118.2758f, 113.6423f, -32.6216f, -19.5865f, -16.2897f, 17.2068f,
+ 6.3559f, -17.8742f, 0.7098f, 11.5970f, -10.1104f, -33.1830f,
+ 39.5617f, -10.5499f, -17.8137f, -14.7185f, -2.6172f, -14.6004f,
+ 0.3893f, 0.4443f, 0.5305f, 0.3049f, 0.8316f, 0.8679f,
+ 0.2265f, 0.2393f, 1.1970f, -0.2891f, -1.8666f, -1.8266f,
+ -1.6984f, -1.8787f, 0.8706f, 0.4208f, 0.5076f, -0.8436f,
+ -0.1623f, 0.8008f, 0.1512f, -1.0839f, -0.3002f, 0.9263f,
+ -1.3031f, 0.5964f, 0.3413f, 0.5551f, 0.2618f, 0.7018f,
+ -0.1320f, -0.1944f, -0.0209f, -0.0877f, 0.0721f, -0.0840f,
+ 0.0589f, 0.1019f, 0.1927f, -0.2011f, -0.1117f, 0.1575f,
+ 0.1080f, -0.0516f, 0.2154f, -0.1231f, 0.0426f, -0.0522f,
+ -0.1824f, -0.1923f, -0.1206f, -0.1724f, -0.0798f, 0.0401f,
+ -0.2170f, 0.0293f, -0.0853f, 0.1517f, 0.2128f, -0.1934f,
+ 0.0406f, 0.0517f, 0.0822f, -0.0150f, 0.0943f, -0.0989f,
+ -0.1802f, -0.1453f, -0.1967f, -0.1797f, 0.1545f, -0.1217f,
+ 0.1755f, -0.1604f, -0.0515f, 0.0509f, 0.0310f, -0.1220f,
+ -0.1770f, -0.0157f, 0.1989f, -0.0069f, 0.1766f, 0.1267f,
+ -0.0517f, -0.0396f, 0.0346f, 0.1946f, 0.1162f, -0.1345f,
+ -106.6179f, -110.5917f, -107.5476f, -108.0601f, -61.1687f, -22.4247f,
+ 2.6632f, 109.5208f, -66.1177f, 0.0062f, 159.9339f, 144.7755f,
+ 145.5032f, 128.9872f, 18.9180f, -75.3569f, -105.0866f, -52.0704f,
+ -119.1299f, -74.7543f, -109.9468f, -59.0682f, -104.5754f, 19.2878f,
+ 67.2573f, -104.8061f, -111.8610f, -106.6751f, -107.3537f, -56.4758f,
+ -0.6967f, -0.8495f, -0.9586f, -1.0461f, 1.4522f, -0.2762f,
+ 28.2828f, 2.9157f, -2.1062f, 0.1566f, -467.2388f, -461.0685f,
+ -459.0092f, -453.8370f, 1.5422f, -0.8186f, -0.4884f, -53.0399f,
+ -2.0255f, -1.1348f, -1.1039f, -50.2489f, -1.4821f, 1.8021f,
+ -258.0319f, -1.0865f, -0.5542f, -1.0443f, -1.2732f, 1.8413f,
+ 0.2377f, 0.1937f, -0.0116f, 0.0935f, -0.0599f, 0.0118f,
+ -0.0875f, 0.0455f, -0.1301f, -0.1081f, -0.2622f, -0.1960f,
+ 0.0393f, -0.1490f, 0.1852f, -0.0964f, -0.0741f, 0.0419f,
+ 0.1162f, -0.0274f, 0.1200f, -0.0333f, -0.1337f, 0.2141f,
+ 0.0664f, 0.1044f, -0.1744f, 0.1060f, -0.1468f, 0.0679f,
+ 0.0218f, 0.0494f, 0.1064f, 0.1363f, 0.0013f, 0.1331f,
+ -0.2095f, 0.2088f, -0.0399f, -0.1811f, 0.0678f, -0.1974f,
+ 0.1855f, -0.0968f, -0.2008f, 0.0162f, -0.0096f, -0.1493f,
+ 0.2170f, -0.1248f, -0.2055f, 0.1276f, -0.0269f, -0.1697f,
+ -0.0662f, 0.1073f, -0.0029f, -0.1051f, -0.1573f, 0.2106f,
+ -0.2020f, -0.1565f, 0.0335f, -0.1818f, -0.1665f, 0.2169f,
+ 0.1974f, -0.1470f, -0.1738f, -0.2038f, 0.0558f, -0.0441f,
+ 0.0065f, -0.1485f, -0.1366f, -0.2131f, 0.1042f, 0.0349f,
+ -0.1804f, -0.1361f, -0.0116f, -0.1012f, -0.0860f, 0.0606f,
+ -0.2077f, 0.1826f, -0.1014f, -0.0721f, -0.1517f, 0.1022f,
+ -0.1110f, -0.0186f, 0.1505f, 0.1797f, 0.0911f, 0.0340f,
+ 0.1702f, -0.1404f, -0.0566f, -0.2744f, -0.1943f, -0.1871f,
+ 0.0046f, 0.0306f, -0.0436f, 0.1625f, -0.1302f, 0.0175f,
+ 0.1570f, -0.1425f, 0.0779f, 0.1398f, 0.0929f, 0.0897f,
+ 0.0458f, -0.0936f, 0.1321f, -0.1355f, 0.0974f, 0.0457f,
+ -73.3516f, -75.0655f, -72.1062f, -72.4624f, -34.8640f, -14.3727f,
+ -4.4720f, 66.4982f, -18.8358f, 0.0397f, 174.2172f, 160.4959f,
+ 161.1034f, 147.3250f, 9.5507f, -45.0180f, -73.1609f, -1.5230f,
+ -74.8677f, -43.8559f, -68.7622f, -4.8971f, -82.1922f, 9.6490f,
+ 64.7115f, -71.8566f, -75.3879f, -72.5479f, -71.7161f, -34.8056f,
+ 0.1442f, 0.1558f, 0.1267f, -0.1261f, -0.0506f, -0.0823f,
+ -0.1807f, -0.0889f, -0.2098f, -0.1295f, -0.2046f, -0.1749f,
+ -0.1197f, -0.1380f, 0.0799f, -0.0889f, -0.1209f, 0.1919f,
+ 0.1947f, -0.2086f, -0.1042f, -0.0468f, 0.0232f, 0.1052f,
+ -0.0535f, 0.1398f, 0.1713f, -0.1522f, 0.1453f, 0.0286f,
+ -64.8503f, -67.6746f, -63.6497f, -60.4614f, -35.6091f, -20.1605f,
+ -3.6082f, 84.2801f, -37.8552f, -2.2371f, 132.4947f, 123.5057f,
+ 123.5776f, 113.9060f, -14.8772f, -40.7130f, -79.1391f, -10.7024f,
+ -65.7831f, -43.6078f, -79.6847f, -13.0743f, -69.2533f, -16.0171f,
+ 50.4868f, -64.3678f, -68.7061f, -64.0823f, -59.3413f, -28.9405f,
+ 77.1601f, 75.4899f, 69.8696f, 67.8764f, -22.7548f, 5.9814f,
+ -3.2826f, 57.9754f, -5.9500f, -0.0014f, 77.2251f, 74.0737f,
+ 73.7004f, 70.5072f, -80.9661f, 69.3065f, 55.8337f, 76.8831f,
+ 57.9902f, 63.4765f, 56.4748f, 70.0282f, 61.0874f, -81.3960f,
+ 26.2594f, 76.0367f, 74.9115f, 69.2361f, 66.9262f, -20.1637f,
+ 0.1886f, -0.1108f, 0.1262f, 0.0189f, 0.1382f, 0.0859f,
+ -0.1874f, -0.1986f, -0.0171f, -0.1400f, -0.2944f, -0.0750f,
+ -0.0395f, -0.2092f, -0.0878f, 0.1216f, -0.0870f, -0.1613f,
+ 0.2495f, 0.0754f, 0.0244f, -0.1205f, -0.0196f, -0.1729f,
+ 0.1170f, 0.1585f, 0.1482f, -0.1705f, -0.1337f, 0.0199f,
+ 13.0897f, 9.1111f, 6.7413f, 6.3907f, -28.1187f, 0.4556f,
+ -5.3116f, 30.7293f, -16.3644f, -0.0365f, 118.9118f, 111.6125f,
+ 111.3227f, 103.4680f, -30.1883f, 8.9328f, -4.1876f, 79.3936f,
+ -9.0522f, 12.7861f, -1.2736f, 78.0446f, -5.9485f, -30.5716f,
+ 27.8951f, 13.9613f, 6.7173f, 5.2345f, 8.3271f, -27.3705f,
+ 1.0488f, 1.0864f, 1.0710f, 1.7332f, -3.0561f, 1.1622f,
+ -7.6688f, 3.0491f, -1.3865f, 0.0769f, 222.5451f, 207.8170f,
+ 208.1767f, 193.1396f, 0.4447f, 2.1654f, 1.8929f, 35.1469f,
+ 1.1783f, 2.6199f, 1.1611f, 26.2989f, 3.4446f, 0.1551f,
+ 65.6529f, 1.2229f, 0.9851f, 1.0241f, 1.4373f, -3.3421f,
+ 0.1388f, 0.0756f, 0.2047f, 0.1140f, 0.0945f, 0.2038f,
+ 0.1038f, -0.2068f, -0.0626f, -0.1937f, 0.1347f, -0.0464f,
+ -0.0866f, 0.0250f, 0.0264f, -0.1556f, -0.1625f, 0.1028f,
+ -0.1255f, -0.0854f, 0.1033f, 0.0008f, -0.2133f, -0.0317f,
+ 0.1725f, -0.1054f, -0.1900f, 0.0383f, 0.0440f, -0.1900f,
+ -30.0811f, -30.9929f, -29.3194f, -26.8347f, -20.5957f, -4.1595f,
+ -1.9066f, 42.4707f, -9.0435f, 0.0064f, 175.7328f, 163.1350f,
+ 163.5085f, 151.1648f, 4.4620f, -20.6011f, -19.3402f, 1.5468f,
+ -32.0920f, -25.4581f, -12.3706f, -2.1636f, -32.4569f, 3.9365f,
+ 61.0117f, -28.4195f, -31.0837f, -30.2749f, -27.5522f, -22.8688f,
+ -0.3000f, 0.0092f, -0.3675f, -0.4113f, 0.0033f, 0.1138f,
+ 0.2182f, -0.5803f, 0.7507f, -0.2529f, -1.7724f, -1.4702f,
+ -1.5805f, -1.4294f, 0.1435f, -0.0168f, 0.2356f, -0.4373f,
+ -0.4500f, -0.4803f, -0.0041f, -0.3878f, 0.1321f, 0.2761f,
+ -1.1975f, -0.3509f, -0.0465f, -0.4050f, -0.1110f, 0.2233f,
+ 0.0950f, 0.0974f, -0.1600f, -0.1753f, -0.0328f, 0.0741f,
+ -0.0706f, 0.1839f, -0.0833f, -0.1367f, -0.1094f, -0.1739f,
+ -0.1069f, 0.0370f, -0.1404f, 0.1631f, -0.1570f, 0.2117f,
+ -0.1891f, 0.0395f, 0.1081f, 0.1760f, 0.0997f, 0.0853f,
+ -0.1018f, 0.1306f, -0.0924f, -0.2078f, 0.0801f, -0.0949f,
+ 0.5803f, 0.5578f, 0.4089f, 0.1912f, 0.6774f, 0.3145f,
+ 0.3992f, -0.1316f, 1.3142f, -0.2457f, -2.3536f, -2.4939f,
+ -2.3165f, -2.4879f, 0.2321f, 0.1901f, 0.1789f, -1.5215f,
+ 0.2645f, 0.2231f, 0.2411f, -1.2361f, 0.2971f, 0.1421f,
+ -1.6715f, 0.3158f, 0.2476f, 0.3596f, 0.3029f, 0.9297f,
+ -88.8401f, -89.5209f, -86.1926f, -87.4196f, -39.6504f, -17.9684f,
+ -4.2702f, 80.2017f, -29.1676f, -0.4190f, 150.2820f, 138.4751f,
+ 139.1087f, 126.6569f, 13.7188f, -57.0739f, -80.3383f, -18.8351f,
+ -87.4103f, -56.0072f, -82.7707f, -23.1871f, -93.6787f, 13.9287f,
+ 59.6213f, -87.4843f, -90.4227f, -86.2635f, -86.6841f, -37.9086f,
+ 0.1184f, -0.2169f, -0.1915f, 0.0543f, 0.1253f, -0.1370f,
+ 0.0836f, -0.1198f, 0.1544f, -0.2004f, -0.1118f, -0.0786f,
+ 0.1517f, -0.1000f, -0.1055f, 0.0936f, -0.1579f, 0.1098f,
+ -0.0234f, -0.0499f, 0.0951f, -0.1711f, 0.0186f, -0.2008f,
+ 0.1777f, 0.1386f, -0.1495f, -0.0684f, -0.2149f, -0.1198f,
+ -0.6205f, -0.7209f, -0.5487f, -0.9080f, 1.3400f, 0.0085f,
+ 28.2837f, 3.2217f, -1.8463f, 0.1620f, -464.3599f, -458.4327f,
+ -455.9967f, -451.0393f, 1.6619f, -0.6944f, -0.3167f, -52.3630f,
+ -1.6971f, -0.7340f, -0.8923f, -49.2771f, -1.1177f, 1.8810f,
+ -258.9386f, -1.0765f, -0.7279f, -0.5208f, -0.8839f, 1.8175f,
+ -78.8510f, -80.5740f, -77.8843f, -77.9798f, -36.5560f, -16.0818f,
+ -5.5362f, 66.4228f, -16.8150f, 0.0036f, 181.8365f, 167.7181f,
+ 168.2344f, 153.9725f, 11.2659f, -47.5786f, -92.6978f, 6.7573f,
+ -68.7704f, -48.3850f, -95.3637f, 8.8888f, -76.9497f, 11.2243f,
+ 60.9020f, -77.6515f, -80.7610f, -78.4537f, -77.4659f, -36.2872f,
+ -0.0936f, 0.1966f, -0.2121f, 0.0193f, 0.0489f, -0.1445f,
+ 0.0060f, 0.0358f, -0.0783f, -0.0985f, -0.2072f, -0.0802f,
+ -0.0185f, 0.1868f, -0.0631f, 0.1260f, -0.0675f, 0.2167f,
+ -0.2174f, -0.1085f, 0.1483f, -0.1655f, -0.1040f, 0.1605f,
+ -0.1673f, -0.0148f, -0.1856f, -0.1454f, 0.1603f, -0.1620f,
+ -0.9205f, -1.2716f, -3.6561f, -5.0834f, -0.7934f, 1.8710f,
+ 2.2999f, -2.9516f, -1.7631f, -0.3804f, 41.2998f, 26.2358f,
+ 28.9763f, 15.7315f, 5.2164f, 3.2963f, -5.4457f, 18.6310f,
+ -25.0076f, 5.4368f, -12.0085f, 17.1462f, -14.6992f, 5.6365f,
+ 48.6207f, -1.0921f, -1.8723f, -3.5354f, -5.1774f, -1.0200f,
+ -0.1065f, -0.2021f, 0.0332f, 0.1692f, -0.1239f, 0.1325f,
+ -0.0660f, -0.0567f, 0.2107f, -0.2084f, -0.0263f, 0.1411f,
+ 0.0178f, 0.0451f, 0.2024f, -0.1756f, -0.0771f, -0.1690f,
+ -0.2097f, -0.2130f, 0.0714f, 0.0172f, -0.0310f, 0.0649f,
+ -0.1550f, 0.0701f, 0.0306f, -0.1750f, -0.1988f, -0.2060f,
+ 0.0005f, -0.1325f, -0.1823f, -0.0900f, -0.1291f, -0.1817f,
+ 0.0144f, 0.0951f, -0.1954f, -0.0171f, -0.1985f, 0.0875f,
+ 0.0901f, -0.0857f, 0.1681f, 0.0465f, 0.1023f, 0.0985f,
+ -0.2152f, -0.1723f, -0.0825f, 0.0203f, -0.1206f, -0.1431f,
+ -0.1552f, 0.1344f, 0.0398f, 0.0169f, 0.2180f, -0.1530f,
+ 2.7964f, 2.7312f, 2.8831f, 3.4729f, -3.1366f, 2.4043f,
+ -7.2004f, 1.4128f, 2.8648f, 0.0578f, 225.5640f, 210.3712f,
+ 210.6907f, 195.0339f, 0.3140f, 1.8060f, 2.7355f, 33.6917f,
+ 3.3542f, 3.3682f, 1.7371f, 31.2424f, 3.4094f, -0.1192f,
+ 63.0864f, 3.0562f, 2.8633f, 2.6777f, 3.5495f, -4.2616f,
+ -1.4034f, 0.3930f, -4.6756f, -9.9870f, -27.8511f, 5.6071f,
+ -1.0862f, 34.4907f, -10.4831f, -0.0281f, 117.2617f, 104.9590f,
+ 106.1515f, 93.9707f, -16.8801f, 5.3036f, -21.7458f, 98.5306f,
+ -20.7596f, 6.4733f, -17.6440f, 98.3097f, -31.9540f, -17.0600f,
+ 27.4543f, -0.6140f, -1.6182f, -4.9167f, -8.9017f, -26.2485f,
+ -0.1952f, -0.0462f, -0.1958f, 0.1679f, -0.1592f, -0.1634f,
+ -0.0507f, -0.0542f, 0.0038f, -0.0343f, 0.0567f, -0.1983f,
+ 0.0250f, -0.0762f, 0.0902f, -0.0343f, 0.1240f, 0.1161f,
+ 0.1237f, 0.1870f, 0.0346f, 0.0340f, 0.0625f, -0.0355f,
+ 0.0278f, -0.1043f, 0.1755f, 0.0253f, 0.1750f, -0.2070f,
+ -5.5531f, -5.3122f, -4.9348f, -4.4782f, -7.5686f, -1.5478f,
+ -5.4341f, 0.5087f, -2.1382f, 0.0798f, 208.3677f, 194.0083f,
+ 194.4168f, 179.3082f, 1.4443f, -1.5038f, -1.4021f, 25.9363f,
+ -4.0635f, -2.6785f, -1.6640f, 22.2589f, -1.4910f, 1.4715f,
+ 59.1972f, -4.9638f, -5.1920f, -4.9193f, -5.2649f, -8.0556f,
+ 20.1226f, 12.0195f, 9.7385f, 10.7058f, -27.4201f, 8.4869f,
+ -5.0826f, 32.9212f, -2.0674f, -0.0290f, 120.5002f, 112.3222f,
+ 112.3287f, 104.1107f, -20.6293f, 14.8534f, -0.8748f, 103.1141f,
+ -1.1368f, 15.3716f, 2.7653f, 91.7285f, -0.5991f, -20.7338f,
+ 35.9363f, 20.5104f, 11.1988f, 9.0368f, 10.6355f, -26.5309f,
+ -0.2058f, -0.2176f, 0.1331f, -0.1415f, -0.0825f, -0.0470f,
+ -0.0615f, 0.1274f, 0.0076f, -0.0575f, -0.2065f, 0.0866f,
+ 0.2166f, -0.1942f, -0.1952f, 0.1323f, -0.1016f, 0.1803f,
+ -0.0424f, 0.1555f, 0.1118f, 0.1559f, 0.0337f, -0.0341f,
+ -0.0430f, 0.1988f, -0.0553f, -0.0255f, 0.1817f, 0.0608f,
+ 0.1431f, 0.0686f, -0.0245f, -0.2107f, 0.2001f, -0.0964f,
+ -0.0090f, 0.1151f, -0.0365f, -0.1986f, 0.1740f, -0.2098f,
+ 0.0013f, 0.1369f, 0.1910f, 0.1801f, -0.2019f, 0.0348f,
+ -0.1175f, 0.0627f, -0.1929f, -0.0099f, 0.1349f, 0.1804f,
+ -0.1071f, -0.1651f, -0.1146f, -0.0259f, 0.1626f, -0.0271f,
+ 0.1393f, 0.1304f, -0.0200f, 0.0924f, -0.0839f, -0.0031f,
+ -0.1311f, 0.0350f, -0.1330f, -0.0911f, 0.1949f, -0.0209f,
+ -0.1883f, 0.0269f, 0.2040f, 0.1552f, 0.1532f, 0.1157f,
+ -0.1102f, -0.1220f, -0.0808f, -0.1050f, 0.1716f, 0.0846f,
+ -0.0180f, -0.1037f, 0.2063f, 0.1237f, 0.1253f, -0.0496f,
+ -0.0183f, 0.0491f, 0.1703f, -0.0824f, -0.0702f, -0.1100f,
+ -0.0965f, 0.0130f, -0.1222f, -0.1081f, 0.0329f, 0.2115f,
+ -0.1438f, 0.0799f, -0.1602f, -0.0330f, 0.0501f, 0.1072f,
+ -0.0744f, -0.1783f, -0.0240f, 0.0777f, -0.1944f, 0.0438f,
+ -0.0033f, -0.1873f, 0.0984f, -0.0318f, 0.0773f, 0.1489f,
+ 0.3966f, 0.4711f, 0.3972f, 0.0623f, 0.5970f, 0.1018f,
+ 0.1375f, -0.1881f, 0.8921f, -0.1854f, -2.1138f, -2.1178f,
+ -1.8295f, -2.1703f, 0.5784f, -0.1937f, -0.0728f, -0.9953f,
+ 0.2442f, -0.4074f, -0.1591f, -1.1660f, 0.4832f, 0.2203f,
+ -1.4957f, 0.1544f, 0.1810f, 0.2275f, 0.4075f, 0.8153f,
+ 0.0715f, 0.0222f, 0.0463f, -0.0201f, 0.0396f, 0.5951f,
+ -0.2779f, -0.0306f, 0.7532f, -0.1596f, -4.1080f, -3.7925f,
+ -3.8522f, -3.2468f, 0.7728f, 0.0188f, -0.1448f, 0.4084f,
+ -0.4666f, -0.1036f, -1.1469f, 0.4243f, 0.2778f, 0.9023f,
+ -3.0216f, 0.0384f, -0.3348f, -0.0314f, -0.2788f, 0.0479f,
+ 139.0773f, 131.6164f, 115.0392f, 111.1817f, 41.7596f, 9.5379f,
+ 1.8542f, 46.9890f, -12.8221f, 0.0241f, 52.9779f, 51.5268f,
+ 50.8060f, 48.7028f, -132.9665f, 118.3478f, 101.1239f, 81.4608f,
+ 75.4251f, 121.0643f, 97.8947f, 86.8911f, 74.5576f, -133.7606f,
+ 29.2657f, 135.8916f, 131.3661f, 114.1687f, 111.0784f, 31.3790f,
+ -0.0807f, -0.0657f, -0.0027f, 0.0410f, 0.0765f, 0.1194f,
+ 0.0953f, -0.0060f, 0.1531f, -0.2339f, 0.1488f, -0.0615f,
+ -0.0579f, 0.0761f, 0.1250f, -0.0469f, 0.1480f, 0.0683f,
+ -0.0049f, 0.1558f, 0.2168f, -0.0736f, 0.1135f, -0.1244f,
+ 0.0725f, -0.1297f, -0.0215f, -0.0412f, -0.1632f, -0.0200f,
+ -0.1346f, -0.1954f, 0.0053f, 0.0151f, 0.1379f, -0.1497f,
+ -0.0102f, -0.0336f, 0.0900f, -0.1706f, -0.0932f, -0.2084f,
+ 0.1242f, -0.2027f, 0.0849f, -0.2139f, -0.2015f, 0.0944f,
+ -0.0984f, 0.2082f, 0.1625f, -0.0227f, -0.1676f, 0.1021f,
+ 0.1516f, 0.0245f, 0.0955f, -0.1488f, -0.0057f, 0.1783f,
+ -0.8568f, -0.8175f, -0.6282f, -1.3107f, 1.5712f, 0.1044f,
+ 28.2289f, 3.0885f, -1.9829f, 0.1600f, -465.9583f, -459.5893f,
+ -457.5055f, -452.7600f, 1.7229f, -0.6620f, -0.1065f, -52.8017f,
+ -2.0293f, -0.8224f, -1.0389f, -49.9049f, -1.2250f, 1.7647f,
+ -259.2465f, -1.0978f, -0.5169f, -0.8721f, -0.8197f, 1.9158f,
+ 16.2234f, 15.8523f, 13.8343f, 9.8509f, -21.4326f, 15.7650f,
+ -6.4451f, 34.8575f, 1.1387f, -0.0223f, 117.7213f, 109.8494f,
+ 109.7624f, 101.8532f, -20.3275f, 16.0812f, 4.9165f, 92.4919f,
+ 4.1615f, 13.8451f, 9.2112f, 97.1580f, -8.7037f, -20.4420f,
+ 27.1105f, 17.4922f, 13.9998f, 12.3888f, 11.4705f, -20.9568f,
+ 0.5457f, 0.5322f, 0.2823f, 0.3581f, 0.5359f, 0.1576f,
+ 0.1969f, -0.0136f, -0.2748f, -0.3168f, -0.3918f, -0.2167f,
+ -0.1797f, -0.1869f, 0.2986f, -0.2116f, -0.4226f, -0.2022f,
+ 0.9452f, 0.5474f, -0.1218f, 0.2067f, -0.1600f, 0.1937f,
+ 0.0808f, 0.4877f, 0.5106f, 0.2626f, 0.5076f, 0.6228f,
+ 0.5124f, 0.4044f, 0.4023f, 0.1222f, 2.5446f, 0.9623f,
+ 24.9875f, 4.7442f, -2.0551f, 0.1642f, -449.9478f, -444.1841f,
+ -442.0153f, -437.1498f, 2.3209f, -0.6986f, -0.3456f, -47.4074f,
+ -1.2374f, -1.0939f, -0.9112f, -41.1851f, -0.5064f, 2.4209f,
+ -263.4446f, -0.0433f, 0.3460f, 0.1475f, 0.3770f, 2.9154f,
+ 0.2032f, 0.1527f, 0.2161f, -0.1981f, 0.1893f, -0.2003f,
+ 0.1734f, 0.1713f, 0.1207f, -0.2073f, -0.1018f, 0.0770f,
+ 0.0728f, 0.1665f, 0.0689f, 0.1884f, -0.1399f, -0.1326f,
+ -0.0518f, -0.1948f, 0.1576f, -0.1835f, 0.1436f, 0.0497f,
+ 0.0883f, -0.1253f, -0.0417f, -0.0507f, -0.1555f, 0.2076f,
+ -2.4080f, 6.1616f, -0.8564f, -13.6773f, -32.7238f, -16.3144f,
+ -1.9828f, 20.5110f, -17.0191f, -1.7154f, 103.6642f, 95.3675f,
+ 95.5662f, 86.9504f, -35.5340f, 19.6681f, -2.4900f, 65.0847f,
+ -15.8119f, 13.7256f, -4.6753f, 63.4713f, -6.5992f, -34.2369f,
+ 41.3959f, -1.5528f, 3.8106f, -0.7762f, -12.3204f, -35.1734f,
+ -83.9509f, -87.4861f, -83.5925f, -81.5047f, -54.1256f, -45.7506f,
+ -13.5325f, -6.0331f, -8.5062f, 0.0261f, 189.9450f, 177.7870f,
+ 178.6945f, 164.9762f, 9.8521f, -68.0619f, -68.6145f, 6.5056f,
+ -55.9651f, -66.9540f, -65.3349f, -2.1954f, -57.2408f, 8.6577f,
+ 60.6966f, -82.1056f, -88.5245f, -83.3057f, -80.7283f, -50.5285f,
+ -0.1397f, 0.1862f, -0.0691f, -0.0906f, 0.1560f, 0.1377f,
+ -0.0066f, -0.0213f, 0.0708f, -0.0386f, -0.0015f, -0.0020f,
+ -0.2122f, 0.0747f, 0.0795f, 0.0229f, 0.1923f, -0.1661f,
+ 0.0895f, 0.1176f, 0.1398f, -0.0443f, 0.0934f, 0.0638f,
+ -0.1924f, 0.0602f, 0.0404f, 0.1597f, 0.1387f, -0.0601f,
+ -28.3967f, -21.8483f, -25.5175f, -29.9252f, 2.0161f, -3.0092f,
+ 7.7435f, 28.2367f, -35.0188f, -0.1578f, 105.0164f, 93.4495f,
+ 94.9134f, 81.0315f, 4.3602f, 8.1303f, -37.7665f, -16.6986f,
+ -40.8902f, 8.2542f, -33.3215f, -2.0457f, -69.0245f, 4.1016f,
+ 47.2770f, -25.8268f, -23.6034f, -26.4339f, -27.8305f, 8.4468f,
+ 13.8742f, 8.3874f, 4.2044f, 1.4619f, -40.2909f, -0.6358f,
+ -0.7982f, 36.1931f, -17.3147f, -0.3348f, 106.8135f, 96.5298f,
+ 97.8829f, 86.9994f, -25.8170f, 15.0652f, -0.9181f, 85.8544f,
+ 2.5475f, 9.8009f, -3.5931f, 89.2017f, -3.7252f, -25.2986f,
+ 22.5505f, 14.0434f, 7.0708f, 4.6646f, 1.5807f, -39.4024f,
+ -0.1436f, 0.0256f, 0.0274f, -0.2126f, 0.0401f, 0.0745f,
+ -0.0379f, -0.0357f, 0.0777f, -0.0709f, -0.1093f, -0.2047f,
+ -0.0713f, -0.0478f, -0.0908f, 0.1963f, 0.1282f, 0.0977f,
+ 0.1304f, 0.2058f, 0.0700f, 0.0518f, 0.0239f, 0.0686f,
+ -0.1909f, 0.0828f, -0.1243f, -0.1920f, 0.1908f, -0.0808f,
+ 90.8028f, 89.2894f, 84.5339f, 83.3491f, -13.3838f, 12.0240f,
+ -3.9443f, 63.0867f, -2.5321f, -0.0099f, 68.9140f, 66.3206f,
+ 66.0278f, 63.1498f, -83.7261f, 74.3448f, 73.4998f, 64.8477f,
+ 69.7701f, 74.5878f, 71.0331f, 63.2116f, 74.3162f, -83.9282f,
+ 20.8163f, 89.6818f, 88.6452f, 83.7338f, 82.9360f, -13.2357f,
+ 0.1299f, -0.1765f, -0.0168f, -0.1372f, -0.1183f, 0.0472f,
+ 0.1312f, 0.0267f, 0.0194f, -0.1593f, 0.0059f, 0.1775f,
+ 0.0668f, -0.1239f, -0.1982f, -0.1415f, -0.1659f, -0.1148f,
+ 0.0136f, 0.0913f, -0.1254f, -0.0357f, 0.0892f, 0.0835f,
+ -0.0554f, 0.1969f, -0.0888f, -0.0623f, -0.0236f, -0.1492f,
+ 0.4196f, 0.3218f, 0.2287f, 0.5095f, 0.7210f, 0.2279f,
+ 0.4523f, -0.1832f, 1.3095f, -0.2041f, -2.1443f, -2.1947f,
+ -1.9292f, -2.1142f, 0.5840f, 0.1018f, 0.1011f, -1.6565f,
+ 0.4325f, 0.0424f, 0.2836f, -1.7183f, 0.2595f, 0.2686f,
+ -1.8784f, 0.3891f, 0.3050f, 0.6195f, 0.2896f, 0.5905f,
+ -5.3024f, -3.2518f, -12.5192f, -29.1732f, 1.6538f, -1.8315f,
+ 9.9788f, 10.5155f, 6.3234f, -0.3460f, 76.9925f, 51.3785f,
+ 55.7120f, 29.0432f, 5.5901f, 25.6578f, -3.9565f, 13.0509f,
+ -106.0371f, 23.2124f, -18.2004f, 8.4618f, -69.3585f, 5.5651f,
+ 80.0565f, -6.4941f, -5.3742f, -14.4209f, -24.1565f, 6.6801f,
+ -22.0585f, -20.9909f, -26.7939f, -29.6890f, -14.5085f, 2.1866f,
+ -4.2608f, 17.3977f, -30.8824f, -0.4017f, 135.6957f, 126.9320f,
+ 127.0044f, 118.1835f, -1.8768f, -0.8629f, -32.0882f, 44.7862f,
+ -23.9174f, 1.6485f, -27.9940f, 51.9078f, -48.5279f, -1.7550f,
+ 49.9230f, -19.9785f, -22.4647f, -27.6911f, -27.3197f, -10.6545f,
+ -0.1922f, -0.1999f, -0.1396f, 0.1065f, 0.0085f, -0.1940f,
+ 0.0351f, 0.1285f, -0.0292f, -0.1296f, 0.1543f, -0.2082f,
+ -0.1758f, 0.0719f, 0.0764f, 0.1394f, -0.0255f, -0.0370f,
+ 0.1615f, -0.0568f, 0.1920f, -0.1631f, 0.0199f, 0.1884f,
+ 0.0693f, 0.1074f, -0.0273f, 0.1540f, 0.0098f, 0.2111f,
+ 0.1805f, -0.0555f, 0.1159f, 0.0469f, 0.1789f, -0.1711f,
+ -0.1304f, 0.1912f, -0.0737f, -0.1408f, 0.1804f, -0.2023f,
+ -0.0467f, -0.1019f, -0.0136f, 0.0691f, 0.1454f, -0.0213f,
+ 0.0929f, -0.0958f, 0.1299f, 0.1137f, 0.1175f, 0.1042f,
+ -0.2081f, -0.0737f, 0.0582f, 0.1640f, 0.2120f, -0.0646f,
+ -0.0326f, 0.1976f, 0.1182f, -0.1365f, -0.1784f, 0.2113f,
+ 0.0469f, 0.0763f, -0.0197f, -0.1902f, 0.1259f, 0.1598f,
+ -0.0180f, -0.1339f, -0.1675f, -0.1884f, -0.1973f, 0.1529f,
+ 0.1160f, 0.2154f, -0.1446f, -0.1395f, 0.0355f, 0.1513f,
+ -0.2086f, -0.1135f, -0.1502f, -0.0018f, 0.0486f, -0.0110f,
+ -0.0843f, -0.0716f, -0.1367f, 0.0753f, 0.0114f, 0.0475f,
+ -0.0632f, 0.2045f, -0.0512f, -0.0906f, -0.1071f, -0.1957f,
+ 0.1361f, 0.1821f, -0.1684f, -0.1383f, 0.1059f, 0.1579f,
+ -0.0064f, -0.1205f, -0.0718f, -0.1323f, -0.0174f, -0.1092f,
+ -0.1915f, 0.1978f, -0.1245f, 0.1297f, -0.1542f, 0.1556f,
+ -0.1752f, 0.0718f, -0.1020f, -0.1970f, 0.0518f, -0.0888f,
+ 0.0541f, -0.1922f, -0.1467f, -0.0653f, -0.1940f, -0.0800f,
+ -0.1096f, -0.0796f, -0.1310f, 0.0191f, -0.1077f, -0.0973f,
+ 0.1566f, 0.0074f, 0.0500f, -0.0415f, -0.2116f, 0.0227f,
+ 0.0895f, 0.1528f, 0.1404f, 0.0467f, 0.0462f, -0.0973f,
+ -0.1669f, 0.0551f, 0.1167f, -0.1470f, -0.0542f, -0.1006f,
+ 0.2104f, 0.1039f, -0.0211f, -0.1726f, -0.0694f, -0.0270f,
+ 0.0277f, -0.0715f, -0.2055f, -0.1502f, -0.1718f, -0.0043f,
+ 0.0174f, 0.1019f, -0.0233f, -0.1518f, -0.1331f, -0.0001f,
+ -0.1483f, -0.2115f, 0.0666f, 0.0014f, 0.1601f, -0.0690f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer0[NUM_HIDDEN_NODES] = {
+ 0.156824f, 0.f, 0.130013f, 0.084482f, -129.058197f, -15.090252f,
+ -3.859116f, 0.736356f, -81.361557f, -0.001922f, -0.000713f, 0.440181f,
+ 14.982646f, 1.282223f, 2.23122f, 94.26635f, 93.920929f, 0.614672f,
+ 0.f, 0.315858f, 4.746014f, 0.116901f, -35.661354f, -75.148285f,
+ 92.006989f, -14.112332f, 86.673157f, -0.000307f, -0.000544f, 0.f,
+ -7.851313f, 0.505186f, 0.f, 0.f, -111.681091f, -0.937782f,
+ 0.035789f, 0.f, 0.f, -0.00102f, -75.180527f, 0.f,
+ -63.821148f, 79.592392f, 0.085068f, 11.184906f, 1.25406f, 0.f,
+ -29.779242f, -0.181732f, 0.f, 0.425554f, -90.78405f, 0.f,
+ -0.828326f, -81.132179f, 0.f, -2.757063f, 0.f, 0.f,
+ 2.967951f, -4.440599f, 0.f, -5.105355f, 14.734543f, 0.f,
+ 0.f, 0.f, 0.f, 0.295342f, -0.026907f, 133.375412f,
+ -0.000855f, 0.f, -0.875029f, 15.665165f, 0.437296f, 0.321257f,
+ -0.001932f, -4.235782f, -87.187782f, 0.f, -28.84696f, 7.055514f,
+ 0.f, 95.548302f, -0.000425f, 0.38969f, -13.88008f, -27.347931f,
+ 0.f, 0.f, 0.f, -0.000026f, 0.f, 0.f,
+};
+
+static const float
+ av1_rdcost_model_nn_weights_layer1[NUM_HIDDEN_NODES * NUM_OUTPUTS] = {
+ -0.101706f, -0.14411f, -0.139118f, -0.132945f, 118.811302f,
+ 3.137232f, -32.969776f, -4.150725f, 26.263071f, 0.092841f,
+ 0.174125f, -0.028195f, 15.712872f, 17.722702f, 5.666006f,
+ -121.143929f, -131.933731f, -3.000318f, -0.032063f, -0.380065f,
+ -1.660653f, -0.164802f, 7.177527f, 87.759155f, -119.564224f,
+ -98.051651f, -110.581116f, -0.069982f, 0.023906f, 0.183792f,
+ 40.606274f, -0.080804f, -0.053744f, -0.187848f, 157.44313f,
+ -4.820149f, 0.089499f, 0.070232f, -0.043038f, 0.072996f,
+ 93.347313f, 0.225259f, 103.223228f, -110.682541f, 0.14314f,
+ -89.827538f, 6.505952f, -0.076949f, 73.816132f, -0.063416f,
+ -0.23736f, -0.066059f, 116.049599f, 0.120871f, -4.708246f,
+ 107.501671f, -0.206708f, -32.688675f, 0.047608f, -0.105907f,
+ 6.505825f, -75.461891f, -0.160341f, 6.532121f, -84.868111f,
+ -0.065622f, 0.044756f, 0.008672f, 0.017155f, 0.046108f,
+ -0.218818f, -126.507957f, 0.028271f, 0.180625f, -4.707376f,
+ -121.524307f, -0.03853f, -4.103166f, -0.018947f, -95.768463f,
+ 15.941695f, 0.147154f, -102.863029f, -72.521698f, -0.037133f,
+ -138.1492f, 0.210016f, -0.084692f, -68.693665f, -52.523472f,
+ -0.133385f, -0.17438f, 0.008654f, -0.035642f, -0.145202f,
+ 0.211135f,
+ };
+
+static const float av1_rdcost_model_nn_biases_layer1[NUM_OUTPUTS] = {
+ 0.251909f
+};
+
+static const NN_CONFIG av1_rdcost_model_nnconfig = {  // positional init
+ NUM_FEATURES,  // 30 inputs
+ NUM_OUTPUTS,  // 1 output
+ NUM_HIDDEN_LAYERS,  // 1 hidden layer
+ {
+ NUM_HIDDEN_NODES,  // 96 nodes in that hidden layer
+ },
+ {
+ av1_rdcost_model_nn_weights_layer0,  // NUM_FEATURES * NUM_HIDDEN_NODES
+ av1_rdcost_model_nn_weights_layer1,  // NUM_HIDDEN_NODES * NUM_OUTPUTS
+ },
+ {
+ av1_rdcost_model_nn_biases_layer0,  // NUM_HIDDEN_NODES entries
+ av1_rdcost_model_nn_biases_layer1,  // NUM_OUTPUTS entries
+ },
+};
+
+//------------------------------------------------------------------------------
+
+#undef NUM_FEATURES
+#undef NUM_HIDDEN_LAYERS
+#undef NUM_HIDDEN_NODES
+#undef NUM_OUTPUTS
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATE_DISTORTION_MODEL_PARAMS_H_
diff --git a/third_party/aom/av1/encoder/ratectrl.c b/third_party/aom/av1/encoder/ratectrl.c
new file mode 100644
index 0000000000..2597fb9908
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.c
@@ -0,0 +1,1776 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/encoder/aq_cyclicrefresh.h"
+#include "av1/common/common.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+
+// Max rate target for 1080P and below encodes under normal circumstances:
+// (1920 * 1080 / (16 * 16)) macroblocks * MAX_MB_RATE bits per MB
+// = 8100 * 250 = 2025000 bits.
+#define MAX_MB_RATE 250
+#define MAXRATE_1080P 2025000
+
+#define DEFAULT_KF_BOOST 2000
+#define DEFAULT_GF_BOOST 2000
+
+// Lower and upper bounds applied to the rate correction factors.
+#define MIN_BPB_FACTOR 0.005
+#define MAX_BPB_FACTOR 50
+
+// Floor, in bits, on the frame size returned by av1_estimate_bits_at_q().
+#define FRAME_OVERHEAD_BITS 200
+// Points `name` at the minq lookup table that matches `bit_depth` by pasting
+// the suffix _8, _10 or _12 onto `name`. The pointer being assigned must
+// therefore share its base name with the file-scope tables. Any other bit
+// depth trips the assert; when asserts are compiled out `name` is left NULL.
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case AOM_BITS_8: name = name##_8; break; \
+ case AOM_BITS_10: name = name##_10; break; \
+ case AOM_BITS_12: name = name##_12; break; \
+ default: \
+ assert(0 && \
+ "bit_depth should be AOM_BITS_8, AOM_BITS_10" \
+ " or AOM_BITS_12"); \
+ name = NULL; \
+ } \
+ } while (0)
+
+// Tables relating active max Q to active min Q.
+// Each table is indexed by qindex and is filled in by av1_rc_init_minq_luts();
+// there is one set of six tables per supported bit depth.
+
+// 8-bit tables.
+static int kf_low_motion_minq_8[QINDEX_RANGE];
+static int kf_high_motion_minq_8[QINDEX_RANGE];
+static int arfgf_low_motion_minq_8[QINDEX_RANGE];
+static int arfgf_high_motion_minq_8[QINDEX_RANGE];
+static int inter_minq_8[QINDEX_RANGE];
+static int rtc_minq_8[QINDEX_RANGE];
+
+// 10-bit tables.
+static int kf_low_motion_minq_10[QINDEX_RANGE];
+static int kf_high_motion_minq_10[QINDEX_RANGE];
+static int arfgf_low_motion_minq_10[QINDEX_RANGE];
+static int arfgf_high_motion_minq_10[QINDEX_RANGE];
+static int inter_minq_10[QINDEX_RANGE];
+static int rtc_minq_10[QINDEX_RANGE];
+// 12-bit tables.
+static int kf_low_motion_minq_12[QINDEX_RANGE];
+static int kf_high_motion_minq_12[QINDEX_RANGE];
+static int arfgf_low_motion_minq_12[QINDEX_RANGE];
+static int arfgf_high_motion_minq_12[QINDEX_RANGE];
+static int inter_minq_12[QINDEX_RANGE];
+static int rtc_minq_12[QINDEX_RANGE];
+
+// Boost thresholds handed to get_active_quality() as its `high` / `low`
+// arguments: gf_* for golden/ARF frames, kf_* for key frames.
+static int gf_high = 2000;
+static int gf_low = 400;
+static int kf_high = 5000;
+static int kf_low = 400;
+
+// How many times fewer pixels there are to encode given the current scaling:
+// the configured frame area (oxcf.width * oxcf.height) divided by the area of
+// the frame being coded (width * height).
+// Temporary replacement for rcf_mult and rate_thresh_mult.
+static double resize_rate_factor(const AV1_COMP *cpi, int width, int height) {
+  // Widen to double before multiplying: the int products can overflow for
+  // very large frame dimensions.
+  const double configured_area = (double)cpi->oxcf.width * cpi->oxcf.height;
+  const double coded_area = (double)width * height;
+  return configured_area / coded_area;
+}
+
+// Functions to compute the active minq lookup table entries based on a
+// formulaic approach to facilitate easier adjustment of the Q tables.
+// The target min Q is the cubic x3*maxq^3 + x2*maxq^2 + x1*maxq, capped at
+// maxq itself; the coefficients were derived from a 3rd order polynomial best
+// fit to the original data (real maxq vs minq, not q index).
+// Returns the first qindex whose real Q reaches that target.
+static int get_minq_index(double maxq, double x3, double x2, double x1,
+                          aom_bit_depth_t bit_depth) {
+  const double minqtarget = AOMMIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq);
+  int qindex = 0;
+
+  // Special case handling to deal with the step from q2.0
+  // down to lossless mode represented by q 1.0.
+  if (minqtarget <= 2.0) return 0;
+
+  // Walk upwards until the real Q at qindex is at least the target.
+  while (qindex < QINDEX_RANGE &&
+         !(minqtarget <= av1_convert_qindex_to_q(qindex, bit_depth))) {
+    ++qindex;
+  }
+
+  // Saturate at the last index when no entry reaches the target.
+  return (qindex < QINDEX_RANGE) ? qindex : QINDEX_RANGE - 1;
+}
+
+// Populates the six minq lookup tables for one bit depth. For every qindex the
+// real max Q is computed once and then mapped through get_minq_index() with a
+// table-specific set of cubic coefficients.
+static void init_minq_luts(int *kf_low_m, int *kf_high_m, int *arfgf_low,
+                           int *arfgf_high, int *inter, int *rtc,
+                           aom_bit_depth_t bit_depth) {
+  int qindex;
+
+  for (qindex = 0; qindex < QINDEX_RANGE; ++qindex) {
+    const double maxq = av1_convert_qindex_to_q(qindex, bit_depth);
+
+    // Key frames.
+    kf_low_m[qindex] =
+        get_minq_index(maxq, 0.000001, -0.0004, 0.150, bit_depth);
+    kf_high_m[qindex] =
+        get_minq_index(maxq, 0.0000021, -0.00125, 0.45, bit_depth);
+
+    // Golden / alt-ref frames.
+    arfgf_low[qindex] =
+        get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth);
+    arfgf_high[qindex] =
+        get_minq_index(maxq, 0.0000021, -0.00125, 0.55, bit_depth);
+
+    // Inter frames and the rtc table.
+    inter[qindex] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth);
+    rtc[qindex] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth);
+  }
+}
+
+// Fills every file-scope minq lookup table by running init_minq_luts() once
+// for each supported bit depth (8, 10 and 12 bits).
+void av1_rc_init_minq_luts(void) {
+ init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8,
+ arfgf_low_motion_minq_8, arfgf_high_motion_minq_8,
+ inter_minq_8, rtc_minq_8, AOM_BITS_8);
+ init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10,
+ arfgf_low_motion_minq_10, arfgf_high_motion_minq_10,
+ inter_minq_10, rtc_minq_10, AOM_BITS_10);
+ init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12,
+ arfgf_low_motion_minq_12, arfgf_high_motion_minq_12,
+ inter_minq_12, rtc_minq_12, AOM_BITS_12);
+}
+
+// These functions use formulaic calculations to make playing with the
+// quantizer tables easier. If necessary they can be replaced by lookup
+// tables if and when things settle down in the experimental bitstream.
+//
+// Converts a qindex to a real Q value: the AC quantizer for that index divided
+// by 4, 16 or 64 for 8-, 10- and 12-bit input respectively (scaled down to
+// match old Q values). Returns -1.0 for an unsupported bit depth.
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth) {
+  double scale;
+
+  switch (bit_depth) {
+    case AOM_BITS_8: scale = 4.0; break;
+    case AOM_BITS_10: scale = 16.0; break;
+    case AOM_BITS_12: scale = 64.0; break;
+    default:
+      assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+      return -1.0;
+  }
+
+  return av1_ac_quant_Q3(qindex, 0, bit_depth) / scale;
+}
+
+// Estimates the bits per macroblock at `qindex`: a frame-type dependent
+// enumerator, nudged upwards in proportion to q, scaled by
+// `correction_factor` and divided by the real Q value.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+                       double correction_factor, aom_bit_depth_t bit_depth) {
+  const double q = av1_convert_qindex_to_q(qindex, bit_depth);
+  int enumerator;
+
+  assert(correction_factor <= MAX_BPB_FACTOR &&
+         correction_factor >= MIN_BPB_FACTOR);
+
+  // Key frames start from a larger baseline than all other frame types.
+  if (frame_type == KEY_FRAME) {
+    enumerator = 2700000;
+  } else {
+    enumerator = 1800000;
+  }
+
+  // q based adjustment to baseline enumerator
+  enumerator += ((int)(enumerator * q)) >> 12;
+
+  return (int)(enumerator * correction_factor / q);
+}
+
+// Estimates the size in bits of a frame of `mbs` macroblocks coded at qindex
+// `q`: the per-MB estimate from av1_rc_bits_per_mb() times `mbs`, scaled down
+// by BPER_MB_NORMBITS, and never less than FRAME_OVERHEAD_BITS.
+int av1_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs,
+                           double correction_factor,
+                           aom_bit_depth_t bit_depth) {
+  const int bpm =
+      (int)(av1_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
+  // A cast binds tighter than >>, so the shift has to be parenthesized: the
+  // 64-bit product is shifted first and only the result is narrowed to int.
+  const int frame_bits = (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+
+  return AOMMAX(FRAME_OVERHEAD_BITS, frame_bits);
+}
+
+// Clamps an inter-frame bit target to the configured limits and returns the
+// clamped value. The floor is the larger of min_frame_bandwidth and 1/32 of
+// avg_frame_bandwidth; the ceiling is max_frame_bandwidth and, when
+// rc_max_inter_bitrate_pct is non-zero, that percentage of
+// avg_frame_bandwidth.
+int av1_rc_clamp_pframe_target_size(const AV1_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const int floor_bits =
+      AOMMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5);
+
+  // If there is an active ARF at this location use the minimum bits on this
+  // frame even if it is a constructed arf: the active maximum quantizer
+  // ensures that an appropriate number of bits will be spent if needed for
+  // constructed ARFs. Otherwise just clip up to the minimum.
+  if (rc->is_src_frame_alt_ref || target < floor_bits) {
+    target = floor_bits;
+  }
+
+  // Clip the frame target to the maximum allowed value.
+  if (target > rc->max_frame_bandwidth) {
+    target = rc->max_frame_bandwidth;
+  }
+
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    if (target > max_rate) target = max_rate;
+  }
+
+  return target;
+}
+
+// Clamps an intra-frame bit target and returns the clamped value. The target
+// is limited first to rc_max_intra_bitrate_pct percent of avg_frame_bandwidth
+// (when that percentage is non-zero) and then to max_frame_bandwidth.
+int av1_rc_clamp_iframe_target_size(const AV1_COMP *const cpi, int target) {
+  const RATE_CONTROL *rc = &cpi->rc;
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+
+  if (oxcf->rc_max_intra_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+    if (target > max_rate) target = max_rate;
+  }
+
+  return (target > rc->max_frame_bandwidth) ? rc->max_frame_bandwidth : target;
+}
+
+// Update the buffer level: leaky bucket model.
+// Shown frames and backward-reference frames add one frame's worth of
+// bandwidth and remove the encoded size; every other frame only removes its
+// encoded size. The level is capped at maximum_buffer_size.
+static void update_buffer_level(AV1_COMP *cpi, int encoded_frame_size) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // TODO(zoeliu): To further explore whether we should treat BWDREF_FRAME
+  //               differently, since it is a no-show frame.
+  if (cm->show_frame || rc->is_bwd_ref_frame) {
+    rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
+  } else {
+    // Non-viewable frames are a special case and are treated as pure overhead.
+    rc->bits_off_target -= encoded_frame_size;
+  }
+
+  // Clip the buffer level to the maximum specified buffer size.
+  if (!(rc->bits_off_target < rc->maximum_buffer_size)) {
+    rc->bits_off_target = rc->maximum_buffer_size;
+  }
+  rc->buffer_level = rc->bits_off_target;
+}
+
+// Returns the default minimum golden-frame interval for a stream of the given
+// size and frame rate. Up to the 4K 20 fps pixel rate the result is
+// framerate / 8 clamped to [MIN_GF_INTERVAL, MAX_GF_INTERVAL]; above it the
+// interval is raised in proportion to the pixel rate.
+// Per the original note this logic makes: 4K24: 5, 4K30: 6, 4K60: 12.
+int av1_rc_get_default_min_gf_interval(int width, int height,
+                                       double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  // Widen before multiplying so that width * height cannot overflow int.
+  const double factor = (double)width * height * framerate;
+  const int default_interval =
+      clamp((int)(framerate * 0.125), MIN_GF_INTERVAL, MAX_GF_INTERVAL);
+
+  if (factor <= factor_safe) return default_interval;
+
+  return AOMMAX(default_interval,
+                (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5));
+}
+
+// Returns the default maximum golden-frame interval: 3/4 of the frame rate,
+// capped at MAX_GF_INTERVAL, rounded up to an even value, and never below
+// `min_gf_interval` (nor below FIXED_GF_LENGTH when CONFIG_FIX_GF_LENGTH is
+// enabled).
+int av1_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  int interval = (int)(framerate * 0.75);
+
+  if (!(MAX_GF_INTERVAL < interval)) {
+    // Already within the cap.
+  } else {
+    interval = MAX_GF_INTERVAL;
+  }
+
+  // Round to even value
+  if (interval & 0x01) ++interval;
+
+#if CONFIG_FIX_GF_LENGTH
+  interval = AOMMAX(FIXED_GF_LENGTH, interval);
+#endif
+
+  return AOMMAX(interval, min_gf_interval);
+}
+
+// Resets the rate-control state in `rc` from the encoder configuration `oxcf`.
+// `pass` selects the initial average qindex: pass 0 in CBR mode starts at
+// worst_allowed_q, everything else starts midway between best and worst.
+// NOTE(review): rc->starting_buffer_level and rc->avg_frame_bandwidth are read
+// but not written here, so they are presumably set by the caller beforehand.
+void av1_rc_init(const AV1EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
+ int i;
+
+ if (pass == 0 && oxcf->rc_mode == AOM_CBR) {
+ rc->avg_frame_qindex[KEY_FRAME] = oxcf->worst_allowed_q;
+ rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
+ } else {
+ rc->avg_frame_qindex[KEY_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ (oxcf->worst_allowed_q + oxcf->best_allowed_q) / 2;
+ }
+
+ rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
+ rc->last_q[INTER_FRAME] = oxcf->worst_allowed_q;
+
+ // Both buffer trackers start at the starting buffer level.
+ rc->buffer_level = rc->starting_buffer_level;
+ rc->bits_off_target = rc->starting_buffer_level;
+
+ // Seed the rolling averages with the per-frame average bandwidth.
+ rc->rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->rolling_actual_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_target_bits = rc->avg_frame_bandwidth;
+ rc->long_rolling_actual_bits = rc->avg_frame_bandwidth;
+
+ rc->total_actual_bits = 0;
+ rc->total_target_bits = 0;
+ rc->total_target_vs_actual = 0;
+
+ rc->frames_since_key = 8; // Sensible default for first frame.
+ rc->this_key_frame_forced = 0;
+ rc->next_key_frame_forced = 0;
+ rc->source_alt_ref_pending = 0;
+ rc->source_alt_ref_active = 0;
+
+ rc->frames_till_gf_update_due = 0;
+ rc->ni_av_qi = oxcf->worst_allowed_q;
+ rc->ni_tot_qi = 0;
+ rc->ni_frames = 0;
+
+ rc->tot_q = 0.0;
+ rc->avg_q = av1_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth);
+
+ // Every rate factor level starts at 0.7, except KF_STD which starts at 1.0.
+ for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
+ rc->rate_correction_factors[i] = 0.7;
+ }
+ rc->rate_correction_factors[KF_STD] = 1.0;
+ // A configured interval of 0 means "use the computed default".
+ rc->min_gf_interval = oxcf->min_gf_interval;
+ rc->max_gf_interval = oxcf->max_gf_interval;
+ if (rc->min_gf_interval == 0)
+ rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+ oxcf->width, oxcf->height, oxcf->init_framerate);
+ if (rc->max_gf_interval == 0)
+ rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+ oxcf->init_framerate, rc->min_gf_interval);
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+}
+
+// Decides whether the current frame should be dropped, based on the buffer
+// level relative to the drop-frames water mark. Returns 1 to drop the frame
+// and 0 to keep it; updates the decimation factor/count in cpi->rc.
+int av1_rc_drop_frame(AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  int drop_mark;
+
+  // Frame dropping is off when no water mark is configured.
+  if (!oxcf->drop_frames_water_mark) return 0;
+
+  // Always drop if buffer is below 0.
+  if (rc->buffer_level < 0) return 1;
+
+  // If buffer is below drop_mark, for now just drop every other frame
+  // (starting with the next frame) until it increases back over drop_mark.
+  drop_mark =
+      (int)(oxcf->drop_frames_water_mark * rc->optimal_buffer_level / 100);
+  if (rc->buffer_level > drop_mark) {
+    if (rc->decimation_factor > 0) --rc->decimation_factor;
+  } else if (rc->decimation_factor == 0) {
+    rc->decimation_factor = 1;
+  }
+
+  // No decimation in force: reset the counter and keep the frame.
+  if (!(rc->decimation_factor > 0)) {
+    rc->decimation_count = 0;
+    return 0;
+  }
+
+  // Decimating: drop while the counter runs down, then re-arm it.
+  if (rc->decimation_count > 0) {
+    --rc->decimation_count;
+    return 1;
+  }
+  rc->decimation_count = rc->decimation_factor;
+  return 0;
+}
+
+// Returns the rate correction factor that applies to the current frame,
+// multiplied by the resize factor for width x height and clamped to
+// [MIN_BPB_FACTOR, MAX_BPB_FACTOR]. The stored factor is chosen by frame type:
+// KF_STD for key frames, the gf-group level in two-pass mode, and otherwise
+// GF_ARF_STD for boosted golden/alt-ref refreshes or INTER_NORMAL.
+static double get_rate_correction_factor(const AV1_COMP *cpi, int width,
+                                         int height) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  RATE_FACTOR_LEVEL level;
+  double rcf;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    level = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    level = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != AOM_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    level = GF_ARF_STD;
+  } else {
+    level = INTER_NORMAL;
+  }
+
+  rcf = rc->rate_correction_factors[level];
+  rcf *= resize_rate_factor(cpi, width, height);
+  return fclamp(rcf, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+}
+
+// Stores `factor` as the rate correction factor for the current frame's
+// level. The value is first divided by the resize factor for width x height
+// and clamped to [MIN_BPB_FACTOR, MAX_BPB_FACTOR]. The destination slot is
+// chosen the same way get_rate_correction_factor() chooses its source.
+static void set_rate_correction_factor(AV1_COMP *cpi, double factor, int width,
+                                       int height) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  RATE_FACTOR_LEVEL level;
+
+  // Normalize RCF to account for the size-dependent scaling factor.
+  factor /= resize_rate_factor(cpi, width, height);
+  factor = fclamp(factor, MIN_BPB_FACTOR, MAX_BPB_FACTOR);
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    level = KF_STD;
+  } else if (cpi->oxcf.pass == 2) {
+    level = cpi->twopass.gf_group.rf_level[cpi->twopass.gf_group.index];
+  } else if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) &&
+             !rc->is_src_frame_alt_ref &&
+             (cpi->oxcf.rc_mode != AOM_CBR ||
+              cpi->oxcf.gf_cbr_boost_pct > 20)) {
+    level = GF_ARF_STD;
+  } else {
+    level = INTER_NORMAL;
+  }
+
+  rc->rate_correction_factors[level] = factor;
+}
+
+// Adapts the rate correction factor for the current frame type after a frame
+// has been coded: the size projected at the frame's base_qindex is compared
+// with rc.projected_frame_size and the stored factor is moved part of the way
+// towards the observed ratio. Also records the last two qindex values and
+// over/undershoot flags, which av1_rc_regulate_q() reads in CBR mode.
+void av1_rc_update_rate_correction_factors(AV1_COMP *cpi, int width,
+ int height) {
+ const AV1_COMMON *const cm = &cpi->common;
+ // Actual size as a percentage of the projected size; 100 means "on target".
+ int correction_factor = 100;
+ double rate_correction_factor =
+ get_rate_correction_factor(cpi, width, height);
+ double adjustment_limit;
+ const int MBs = av1_get_MBs(width, height);
+
+ int projected_size_based_on_q = 0;
+
+ // Do not update the rate factors for arf overlay frames.
+ if (cpi->rc.is_src_frame_alt_ref) return;
+
+ // Clear down mmx registers to allow floating point in what follows
+ aom_clear_system_state();
+
+ // Work out how big we would have expected the frame to be at this Q given
+ // the current correction factor.
+ // Stay in double to avoid int overflow when values are large
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->common.seg.enabled) {
+ projected_size_based_on_q =
+ av1_cyclic_refresh_estimate_bits_at_q(cpi, rate_correction_factor);
+ } else {
+ projected_size_based_on_q = av1_estimate_bits_at_q(
+ cpi->common.frame_type, cm->base_qindex, MBs, rate_correction_factor,
+ cm->seq_params.bit_depth);
+ }
+ // Work out a size correction factor.
+ // Projections at or below FRAME_OVERHEAD_BITS leave the factor at 100.
+ if (projected_size_based_on_q > FRAME_OVERHEAD_BITS)
+ correction_factor = (int)((100 * (int64_t)cpi->rc.projected_frame_size) /
+ projected_size_based_on_q);
+
+ // More heavily damped adjustment used if we have been oscillating either side
+ // of target.
+ // The limit ranges from 0.25 (ratio near 1) up to 0.75 (ratio off by 10x or
+ // more, or a non-positive ratio).
+ if (correction_factor > 0) {
+ adjustment_limit =
+ 0.25 + 0.5 * AOMMIN(1, fabs(log10(0.01 * correction_factor)));
+ } else {
+ adjustment_limit = 0.75;
+ }
+
+ // Shift the one-frame history into the two-frame slots, then record this
+ // frame: rc_1_frame is -1 on >10% overshoot, 1 on >10% undershoot, else 0.
+ cpi->rc.q_2_frame = cpi->rc.q_1_frame;
+ cpi->rc.q_1_frame = cm->base_qindex;
+ cpi->rc.rc_2_frame = cpi->rc.rc_1_frame;
+ if (correction_factor > 110)
+ cpi->rc.rc_1_frame = -1;
+ else if (correction_factor < 90)
+ cpi->rc.rc_1_frame = 1;
+ else
+ cpi->rc.rc_1_frame = 0;
+
+ // Ratios between 99 and 102 inclusive leave the factor unchanged.
+ if (correction_factor > 102) {
+ // We are not already at the worst allowable quality
+ correction_factor =
+ (int)(100 + ((correction_factor - 100) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor > MAX_BPB_FACTOR)
+ rate_correction_factor = MAX_BPB_FACTOR;
+ } else if (correction_factor < 99) {
+ // We are not already at the best allowable quality
+ correction_factor =
+ (int)(100 - ((100 - correction_factor) * adjustment_limit));
+ rate_correction_factor = (rate_correction_factor * correction_factor) / 100;
+
+ // Keep rate_correction_factor within limits
+ if (rate_correction_factor < MIN_BPB_FACTOR)
+ rate_correction_factor = MIN_BPB_FACTOR;
+ }
+
+ set_rate_correction_factor(cpi, rate_correction_factor, width, height);
+}
+
+// Picks the qindex in [active_best_quality, active_worst_quality] whose
+// estimated bits per MB best matches the per-MB share of
+// `target_bits_per_frame` for a width x height frame. The search walks up
+// from active_best_quality and stops at the first qindex whose estimate does
+// not exceed the target, then keeps whichever of that qindex and the previous
+// one is closer. If no qindex meets the target, active_worst_quality is used.
+int av1_rc_regulate_q(const AV1_COMP *cpi, int target_bits_per_frame,
+                      int active_best_quality, int active_worst_quality,
+                      int width, int height) {
+  const AV1_COMMON *const cm = &cpi->common;
+  int q = active_worst_quality;
+  int last_error = INT_MAX;
+  int i, target_bits_per_mb, bits_per_mb_at_this_q;
+  const int MBs = av1_get_MBs(width, height);
+  const double correction_factor =
+      get_rate_correction_factor(cpi, width, height);
+
+  // Scale the frame target into the normalized bits-per-MB domain. A cast
+  // binds tighter than /, so the whole 64-bit expression is parenthesized:
+  // shift and divide in 64 bits, then narrow only the quotient.
+  target_bits_per_mb =
+      (int)(((uint64_t)target_bits_per_frame << BPER_MB_NORMBITS) / MBs);
+
+  i = active_best_quality;
+
+  do {
+    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+      bits_per_mb_at_this_q =
+          (int)av1_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
+    } else {
+      bits_per_mb_at_this_q = (int)av1_rc_bits_per_mb(
+          cm->frame_type, i, correction_factor, cm->seq_params.bit_depth);
+    }
+
+    if (bits_per_mb_at_this_q <= target_bits_per_mb) {
+      // First qindex at or under the target: keep it unless the previous
+      // qindex overshot by less than this one undershoots.
+      if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+        q = i;
+      else
+        q = i - 1;
+
+      break;
+    } else {
+      last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+    }
+  } while (++i <= active_worst_quality);
+
+  // In CBR mode, this makes sure q is between oscillating Qs to prevent
+  // resonance.
+  if (cpi->oxcf.rc_mode == AOM_CBR &&
+      (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
+      cpi->rc.q_1_frame != cpi->rc.q_2_frame) {
+    q = clamp(q, AOMMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame),
+              AOMMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame));
+  }
+  return q;
+}
+
+// Returns the active min-Q for qindex `q` given a boost value. Above `high`
+// the low-motion table is used, below `low` the high-motion table, and in
+// between the two table entries are linearly interpolated (rounded to
+// nearest) according to where `gfu_boost` lies in [low, high].
+static int get_active_quality(int q, int gfu_boost, int low, int high,
+                              int *low_motion_minq, int *high_motion_minq) {
+  if (gfu_boost > high) return low_motion_minq[q];
+  if (gfu_boost < low) return high_motion_minq[q];
+
+  {
+    const int base = low_motion_minq[q];
+    const int span = high - low;
+    const int distance_below_high = high - gfu_boost;
+    const int spread = high_motion_minq[q] - base;
+
+    // The span / 2 bias turns the integer division into round-to-nearest.
+    return base + (distance_below_high * spread + (span >> 1)) / span;
+  }
+}
+
+// Key-frame wrapper around get_active_quality(): selects the key-frame minq
+// tables for `bit_depth` and interpolates on rc->kf_boost between kf_low and
+// kf_high.
+static int get_kf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ // The local names must match the table base names: ASSIGN_MINQ_TABLE pastes
+ // the bit-depth suffix onto them.
+ int *kf_low_motion_minq;
+ int *kf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq);
+ return get_active_quality(q, rc->kf_boost, kf_low, kf_high,
+ kf_low_motion_minq, kf_high_motion_minq);
+}
+
+// Golden/alt-ref wrapper around get_active_quality(): selects the arfgf minq
+// tables for `bit_depth` and interpolates on rc->gfu_boost between gf_low and
+// gf_high.
+static int get_gf_active_quality(const RATE_CONTROL *const rc, int q,
+ aom_bit_depth_t bit_depth) {
+ // The local names must match the table base names: ASSIGN_MINQ_TABLE pastes
+ // the bit-depth suffix onto them.
+ int *arfgf_low_motion_minq;
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq);
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return get_active_quality(q, rc->gfu_boost, gf_low, gf_high,
+ arfgf_low_motion_minq, arfgf_high_motion_minq);
+}
+
+#if REDUCE_LAST_ALT_BOOST
+// Returns the arfgf high-motion minq table entry for qindex `q` at the given
+// bit depth. Only compiled when REDUCE_LAST_ALT_BOOST is enabled.
+static int get_gf_high_motion_quality(int q, aom_bit_depth_t bit_depth) {
+ int *arfgf_high_motion_minq;
+ ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq);
+ return arfgf_high_motion_minq[q];
+}
+#endif
+
+// Computes the active worst quality for one-pass VBR from the most recent
+// key-frame / inter-frame qindex values, capped at rc->worst_quality.
+// Key frames use worst_quality on the very first frame and twice the last
+// key-frame q afterwards. Golden / alt2 / alt-ref refreshes (when the source
+// is not an alt-ref) use the last q at 5/4 (frame 1) or unscaled; all other
+// frames use twice the last q.
+static int calc_active_worst_quality_one_pass_vbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const unsigned int curr_frame = cpi->common.current_video_frame;
+  const int last_kf_q = rc->last_q[KEY_FRAME];
+  const int last_inter_q = rc->last_q[INTER_FRAME];
+  int active_worst_quality;
+
+  if (cpi->common.frame_type == KEY_FRAME) {
+    active_worst_quality = (curr_frame == 0) ? rc->worst_quality : last_kf_q * 2;
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || cpi->refresh_alt2_ref_frame ||
+              cpi->refresh_alt_ref_frame)) {
+    active_worst_quality = (curr_frame == 1) ? last_kf_q * 5 / 4 : last_inter_q;
+  } else {
+    active_worst_quality =
+        (curr_frame == 1) ? last_kf_q * 2 : last_inter_q * 2;
+  }
+
+  return AOMMIN(active_worst_quality, rc->worst_quality);
+}
+
+// Adjust active_worst_quality level based on buffer level.
+// Returns rc->worst_quality for key frames; for other frames returns a value
+// derived from the recent average qindex and the buffer fullness.
+static int calc_active_worst_quality_one_pass_cbr(const AV1_COMP *cpi) {
+ // Adjust active_worst_quality: If buffer is above the optimal/target level,
+ // bring active_worst_quality down depending on fullness of buffer.
+ // If buffer is below the optimal level, let the active_worst_quality go from
+ // ambient Q (at buffer = optimal level) to worst_quality level
+ // (at buffer = critical level).
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *rc = &cpi->rc;
+ // Buffer level below which we push active_worst to worst_quality.
+ // This is 1/8 of the optimal buffer level.
+ int64_t critical_level = rc->optimal_buffer_level >> 3;
+ int64_t buff_lvl_step = 0;
+ int adjustment = 0;
+ int active_worst_quality;
+ int ambient_qp;
+ if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+ // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
+ // for the first few frames following key frame. These are both initialized
+ // to worst_quality and updated with (3/4, 1/4) average in postencode_update.
+ // So for first few frames following key, the qp of that key frame is weighted
+ // into the active_worst_quality setting.
+ ambient_qp = (cm->current_video_frame < 5)
+ ? AOMMIN(rc->avg_frame_qindex[INTER_FRAME],
+ rc->avg_frame_qindex[KEY_FRAME])
+ : rc->avg_frame_qindex[INTER_FRAME];
+ // Starting point: 5/4 of the ambient qindex, capped at worst_quality.
+ active_worst_quality = AOMMIN(rc->worst_quality, ambient_qp * 5 / 4);
+ if (rc->buffer_level > rc->optimal_buffer_level) {
+ // Adjust down.
+ // Maximum limit for down adjustment, ~30%.
+ int max_adjustment_down = active_worst_quality / 3;
+ if (max_adjustment_down) {
+ // Buffer units per single qindex step of downward adjustment.
+ buff_lvl_step = ((rc->maximum_buffer_size - rc->optimal_buffer_level) /
+ max_adjustment_down);
+ if (buff_lvl_step)
+ adjustment = (int)((rc->buffer_level - rc->optimal_buffer_level) /
+ buff_lvl_step);
+ active_worst_quality -= adjustment;
+ }
+ } else if (rc->buffer_level > critical_level) {
+ // Adjust up from ambient Q.
+ // Linear ramp from ambient_qp (buffer at optimal) towards worst_quality
+ // (buffer at critical_level).
+ if (critical_level) {
+ buff_lvl_step = (rc->optimal_buffer_level - critical_level);
+ if (buff_lvl_step) {
+ adjustment = (int)((rc->worst_quality - ambient_qp) *
+ (rc->optimal_buffer_level - rc->buffer_level) /
+ buff_lvl_step);
+ }
+ active_worst_quality = ambient_qp + adjustment;
+ }
+ } else {
+ // Set to worst_quality if buffer is below critical level.
+ active_worst_quality = rc->worst_quality;
+ }
+ return active_worst_quality;
+}
+
+// Chooses the qindex for the current frame in one-pass CBR mode and reports
+// the allowed range through *bottom_index (active best quality) and
+// *top_index (active worst quality, possibly tightened for key frames).
+// Returns the chosen qindex.
+static int rc_pick_q_and_bounds_one_pass_cbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi);
+ int q;
+ // Named after the table family so ASSIGN_MINQ_TABLE can paste the suffix.
+ int *rtc_minq;
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, rtc_minq);
+
+ if (frame_is_intra_only(cm)) {
+ active_best_quality = rc->best_quality;
+ // Handle the special case for key frames forced when we have reached
+ // the maximum key frame interval. Here force the Q to a range
+ // based on the ambient Q to reduce the risk of popping.
+ if (rc->this_key_frame_forced) {
+ int qindex = rc->last_boosted_qindex;
+ double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ int delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+ (last_boosted_q * 0.75), bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (cm->current_video_frame > 0) {
+ // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+ double q_val;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta
+ // on active_best_quality.
+ q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ if (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+ q = rc->avg_frame_qindex[INTER_FRAME];
+ } else {
+ q = active_worst_quality;
+ }
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ // Up to frame 1 the key-frame average is used in place of the inter one.
+ if (cm->current_video_frame > 1) {
+ if (rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[INTER_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ } else {
+ if (rc->avg_frame_qindex[KEY_FRAME] < active_worst_quality)
+ active_best_quality = rtc_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ else
+ active_best_quality = rtc_minq[active_worst_quality];
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ // Only non-forced key frames after the first frame get a tightened top.
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ int qdelta = 0;
+ aom_clear_system_state();
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ // Special case code to try and match quality with forced key frames
+ if (cm->frame_type == KEY_FRAME && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ // (raise the reported top instead of lowering q).
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+// Returns the cq level to use for this frame. Outside AOM_CQ mode, or before
+// any target bits have accumulated, this is oxcf->cq_level unchanged. In CQ
+// mode, when the bits actually spent are below 10% of the target, the level is
+// scaled down in proportion to that ratio.
+static int get_active_cq_level(const RATE_CONTROL *rc,
+                               const AV1EncoderConfig *const oxcf) {
+  static const double cq_adjust_threshold = 0.1;
+  int active_cq_level = oxcf->cq_level;
+
+  if (oxcf->rc_mode != AOM_CQ || !(rc->total_target_bits > 0)) {
+    return active_cq_level;
+  }
+
+  {
+    const double spent_ratio =
+        (double)rc->total_actual_bits / rc->total_target_bits;
+    if (spent_ratio < cq_adjust_threshold) {
+      active_cq_level =
+          (int)(active_cq_level * spent_ratio / cq_adjust_threshold);
+    }
+  }
+
+  return active_cq_level;
+}
+
+// Chooses the qindex for the current frame in one-pass VBR / CQ / Q modes and
+// reports the allowed range through *bottom_index (active best quality) and
+// *top_index (active worst quality plus a frame-type dependent delta).
+// Returns the chosen qindex; in AOM_Q mode that is the active best quality.
+static int rc_pick_q_and_bounds_one_pass_vbr(const AV1_COMP *cpi, int width,
+ int height, int *bottom_index,
+ int *top_index) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const RATE_CONTROL *const rc = &cpi->rc;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ const int cq_level = get_active_cq_level(rc, oxcf);
+ int active_best_quality;
+ int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi);
+ int q;
+ // Named after the table family so ASSIGN_MINQ_TABLE can paste the suffix.
+ int *inter_minq;
+ const int bit_depth = cm->seq_params.bit_depth;
+ ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+ if (frame_is_intra_only(cm)) {
+ if (oxcf->rc_mode == AOM_Q) {
+ // Fixed-Q mode: target a real Q of 0.25x the cq level's Q.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ av1_compute_qdelta(rc, q_val, q_val * 0.25, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else if (rc->this_key_frame_forced) {
+ // Forced key frame: target 0.75x the Q of the last boosted frame.
+ const int qindex = rc->last_boosted_qindex;
+ const double last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex = av1_compute_qdelta(
+ rc, last_boosted_q, last_boosted_q * 0.75, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else { // not first frame of one pass and kf_boost is set
+ double q_adj_factor = 1.0;
+
+ active_best_quality =
+ get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], bit_depth);
+
+ // Allow somewhat lower kf minq with small image formats.
+ if ((width * height) <= (352 * 288)) {
+ q_adj_factor -= 0.25;
+ }
+
+ // Convert the adjustment factor to a qindex delta on active_best_quality.
+ {
+ const double q_val =
+ av1_convert_qindex_to_q(active_best_quality, bit_depth);
+ active_best_quality +=
+ av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+ }
+ }
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ // Use the lower of active_worst_quality and recent
+ // average Q as basis for GF/ARF best Q limit unless last frame was
+ // a key frame.
+ // NOTE(review): the fallback below is avg_frame_qindex[KEY_FRAME], whereas
+ // the CBR variant falls back to active_worst_quality - confirm intended.
+ q = (rc->frames_since_key > 1 &&
+ rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality)
+ ? rc->avg_frame_qindex[INTER_FRAME]
+ : rc->avg_frame_qindex[KEY_FRAME];
+ // For constrained quality don't allow Q less than the cq level
+ if (oxcf->rc_mode == AOM_CQ) {
+ if (q < cq_level) q = cq_level;
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ // Constrained quality use slightly lower active best.
+ active_best_quality = active_best_quality * 15 / 16;
+ } else if (oxcf->rc_mode == AOM_Q) {
+ // Fixed-Q mode: alt-ref refreshes target 0.40x the cq level's Q, other
+ // golden refreshes 0.50x.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const int delta_qindex =
+ (cpi->refresh_alt_ref_frame)
+ ? av1_compute_qdelta(rc, q_val, q_val * 0.40, bit_depth)
+ : av1_compute_qdelta(rc, q_val, q_val * 0.50, bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+ }
+ } else {
+ if (oxcf->rc_mode == AOM_Q) {
+ // Fixed-Q mode: the Q multiplier cycles through delta_rate[] according
+ // to the frame number modulo FIXED_GF_INTERVAL.
+ const int qindex = cq_level;
+ const double q_val = av1_convert_qindex_to_q(qindex, bit_depth);
+ const double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0,
+ 0.70, 1.0, 0.85, 1.0 };
+ const int delta_qindex = av1_compute_qdelta(
+ rc, q_val,
+ q_val * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL],
+ bit_depth);
+ active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+ } else {
+ // Use the lower of active_worst_quality and recent/average Q.
+ active_best_quality = (cm->current_video_frame > 1)
+ ? inter_minq[rc->avg_frame_qindex[INTER_FRAME]]
+ : inter_minq[rc->avg_frame_qindex[KEY_FRAME]];
+ // For the constrained quality mode we don't want
+ // q to fall below the cq level.
+ if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+ active_best_quality = cq_level;
+ }
+ }
+ }
+
+ // Clip the active best and worst quality values to limits
+ active_best_quality =
+ clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+ active_worst_quality =
+ clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+ *top_index = active_worst_quality;
+ *bottom_index = active_best_quality;
+
+ // Limit Q range for the adaptive loop.
+ // Non-forced key frames after the first frame use a rate ratio of 2.0,
+ // golden/alt-ref refreshes use 1.75, and all other frames get no delta.
+ {
+ int qdelta = 0;
+ aom_clear_system_state();
+ if (cm->frame_type == KEY_FRAME && !rc->this_key_frame_forced &&
+ !(cm->current_video_frame == 0)) {
+ qdelta = av1_compute_qdelta_by_rate(&cpi->rc, cm->frame_type,
+ active_worst_quality, 2.0, bit_depth);
+ } else if (!rc->is_src_frame_alt_ref &&
+ (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
+ qdelta = av1_compute_qdelta_by_rate(
+ &cpi->rc, cm->frame_type, active_worst_quality, 1.75, bit_depth);
+ }
+ *top_index = active_worst_quality + qdelta;
+ *top_index = AOMMAX(*top_index, *bottom_index);
+ }
+
+ if (oxcf->rc_mode == AOM_Q) {
+ q = active_best_quality;
+ // Special case code to try and match quality with forced key frames
+ } else if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced) {
+ q = rc->last_boosted_qindex;
+ } else {
+ q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+ active_worst_quality, width, height);
+ if (q > *top_index) {
+ // Special case when we are targeting the max allowed rate
+ // (raise the reported top instead of lowering q).
+ if (rc->this_frame_target >= rc->max_frame_bandwidth)
+ *top_index = q;
+ else
+ q = *top_index;
+ }
+ }
+
+ assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+ assert(*bottom_index <= rc->worst_quality &&
+ *bottom_index >= rc->best_quality);
+ assert(q <= rc->worst_quality && q >= rc->best_quality);
+ return q;
+}
+
+// Returns the qindex delta to apply to |q| for a frame of rate factor level
+// |rf_level|. The level selects both the frame type used for the bits-per-MB
+// lookup and the rate_factor_deltas[] entry passed as the rate target ratio.
+int av1_frame_type_qdelta(const AV1_COMP *cpi, int rf_level, int q) {
+  // Frame type to use for each rate factor level; only the last level is
+  // looked up as a key frame.
+  static const FRAME_TYPE level_frame_type[RATE_FACTOR_LEVELS] = {
+    INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME
+  };
+  return av1_compute_qdelta_by_rate(&cpi->rc, level_frame_type[rf_level], q,
+                                    rate_factor_deltas[rf_level],
+                                    cpi->common.seq_params.bit_depth);
+}
+
+#define STATIC_MOTION_THRESH 95
+// Picks the frame qindex for two-pass encoding and reports the qindex range
+// the caller may explore around it.
+//
+// width, height: frame dimensions; used for the small-format key frame
+//                adjustment and forwarded to av1_rc_regulate_q().
+// bottom_index:  out, the active best (lowest) qindex for this frame.
+// top_index:     out, the active worst (highest) qindex for this frame.
+// arf_q:         out, written only on some of the AOM_CQ / AOM_Q boosted
+//                frame paths below; left untouched on every other path.
+// Returns the chosen qindex, clamped to [*bottom_index, *top_index].
+static int rc_pick_q_and_bounds_two_pass(const AV1_COMP *cpi, int width,
+                                         int height, int *bottom_index,
+                                         int *top_index, int *arf_q) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  const int cq_level = get_active_cq_level(rc, oxcf);
+  int active_best_quality;
+  int active_worst_quality = cpi->twopass.active_worst_quality;
+  int q;
+  int *inter_minq;
+  const int bit_depth = cm->seq_params.bit_depth;
+  ASSIGN_MINQ_TABLE(bit_depth, inter_minq);
+
+#if CUSTOMIZED_GF
+  const int is_intrl_arf_boost =
+      gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE;
+#else
+  const int is_intrl_arf_boost = cpi->refresh_alt2_ref_frame;
+#endif  // CUSTOMIZED_GF
+
+  if (frame_is_intra_only(cm)) {
+    // Handle the special case for key frames forced when we have reached
+    // the maximum key frame interval. Here force the Q to a range
+    // based on the ambient Q to reduce the risk of popping.
+    if (rc->this_key_frame_forced) {
+      double last_boosted_q;
+      int delta_qindex;
+      int qindex;
+
+      if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+        qindex = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+        active_best_quality = qindex;
+        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+        delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                          last_boosted_q * 1.25, bit_depth);
+        active_worst_quality =
+            AOMMIN(qindex + delta_qindex, active_worst_quality);
+      } else {
+        qindex = rc->last_boosted_qindex;
+        last_boosted_q = av1_convert_qindex_to_q(qindex, bit_depth);
+        delta_qindex = av1_compute_qdelta(rc, last_boosted_q,
+                                          last_boosted_q * 0.5, bit_depth);
+        active_best_quality = AOMMAX(qindex + delta_qindex, rc->best_quality);
+      }
+    } else {
+      // Not forced keyframe.
+      double q_adj_factor = 1.0;
+      double q_val;
+
+      // Baseline value derived from cpi->active_worst_quality and kf boost.
+      active_best_quality =
+          get_kf_active_quality(rc, active_worst_quality, bit_depth);
+
+      // Allow somewhat lower kf minq with small image formats.
+      if ((width * height) <= (352 * 288)) {
+        q_adj_factor -= 0.25;
+      }
+
+      // Make a further adjustment based on the kf zero motion measure.
+      q_adj_factor += 0.05 - (0.001 * (double)cpi->twopass.kf_zeromotion_pct);
+
+      // Convert the adjustment factor to a qindex delta
+      // on active_best_quality.
+      q_val = av1_convert_qindex_to_q(active_best_quality, bit_depth);
+      active_best_quality +=
+          av1_compute_qdelta(rc, q_val, q_val * q_adj_factor, bit_depth);
+    }
+  } else if (!rc->is_src_frame_alt_ref &&
+             (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+              cpi->refresh_alt_ref_frame)) {
+    // Use the lower of active_worst_quality and recent
+    // average Q as basis for GF/ARF best Q limit unless last frame was
+    // a key frame.
+    if (rc->frames_since_key > 1 &&
+        rc->avg_frame_qindex[INTER_FRAME] < active_worst_quality) {
+      q = rc->avg_frame_qindex[INTER_FRAME];
+    } else {
+      q = active_worst_quality;
+    }
+    // For constrained quality don't allow Q less than the cq level
+    if (oxcf->rc_mode == AOM_CQ) {
+      if (q < cq_level) q = cq_level;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+      if (gf_group->update_type[gf_group->index] == ARF_UPDATE ||
+          (is_intrl_arf_boost && !cpi->new_bwdref_update_rule)) {
+#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+        active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+
+        // Constrained quality use slightly lower active best.
+        active_best_quality = active_best_quality * 15 / 16;
+#if REDUCE_LAST_ALT_BOOST
+        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+          const int boost = min_boost - active_best_quality;
+
+          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+        }
+#endif
+        *arf_q = active_best_quality;
+#if USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+      } else {
+        // Start from the stored ARF quality and move halfway towards
+        // cq_level once per pyramid level below the top.
+        active_best_quality = rc->arf_q;
+        int this_height = gf_group->pyramid_level[gf_group->index];
+        while (this_height < gf_group->pyramid_height) {
+          active_best_quality = (active_best_quality + cq_level + 1) / 2;
+          ++this_height;
+        }
+      }
+#endif  // USE_SYMM_MULTI_LAYER && MULTI_LVL_BOOST_VBR_CQ
+    } else if (oxcf->rc_mode == AOM_Q) {
+      if (!cpi->refresh_alt_ref_frame && !is_intrl_arf_boost) {
+        active_best_quality = cq_level;
+      } else {
+        if (gf_group->update_type[gf_group->index] == ARF_UPDATE) {
+          active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+          *arf_q = active_best_quality;
+#if REDUCE_LAST_ALT_BOOST
+          const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+          const int boost = min_boost - active_best_quality;
+
+          active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+        } else {
+          active_best_quality = rc->arf_q;
+        }
+#if USE_SYMM_MULTI_LAYER
+        if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+          int this_height = gf_group->pyramid_level[gf_group->index];
+          while (this_height < gf_group->pyramid_height) {
+            active_best_quality = (active_best_quality + cq_level + 1) / 2;
+            ++this_height;
+          }
+        } else {
+#endif
+          // Modify best quality for second level arfs. For mode AOM_Q this
+          // becomes the baseline frame q.
+          if (gf_group->rf_level[gf_group->index] == GF_ARF_LOW)
+            active_best_quality = (active_best_quality + cq_level + 1) / 2;
+#if USE_SYMM_MULTI_LAYER
+        }
+#endif
+      }
+    } else {
+      active_best_quality = get_gf_active_quality(rc, q, bit_depth);
+#if REDUCE_LAST_ALT_BOOST
+      const int min_boost = get_gf_high_motion_quality(q, bit_depth);
+      const int boost = min_boost - active_best_quality;
+
+      active_best_quality = min_boost - (int)(boost * rc->arf_boost_factor);
+#endif
+#if USE_SYMM_MULTI_LAYER
+      if (cpi->new_bwdref_update_rule && is_intrl_arf_boost) {
+        int this_height = gf_group->pyramid_level[gf_group->index];
+        while (this_height < gf_group->pyramid_height) {
+          active_best_quality =
+              (active_best_quality + active_worst_quality + 1) / 2;
+          ++this_height;
+        }
+      }
+#endif
+    }
+  } else {
+    if (oxcf->rc_mode == AOM_Q) {
+      active_best_quality = cq_level;
+    } else {
+      active_best_quality = inter_minq[active_worst_quality];
+
+      // For the constrained quality mode we don't want
+      // q to fall below the cq level.
+      if ((oxcf->rc_mode == AOM_CQ) && (active_best_quality < cq_level)) {
+        active_best_quality = cq_level;
+      }
+    }
+  }
+
+  // Extension to max or min Q if undershoot or overshoot is outside
+  // the permitted range.
+  if ((cpi->oxcf.rc_mode != AOM_Q) &&
+      (cpi->twopass.gf_zeromotion_pct < VLOW_MOTION_THRESHOLD)) {
+    if (frame_is_intra_only(cm) ||
+        (!rc->is_src_frame_alt_ref &&
+         (cpi->refresh_golden_frame || is_intrl_arf_boost ||
+          cpi->refresh_alt_ref_frame))) {
+      active_best_quality -=
+          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast);
+      active_worst_quality += (cpi->twopass.extend_maxq / 2);
+    } else {
+      active_best_quality -=
+          (cpi->twopass.extend_minq + cpi->twopass.extend_minq_fast) / 2;
+      active_worst_quality += cpi->twopass.extend_maxq;
+    }
+  }
+
+  aom_clear_system_state();
+  // Static forced key frames Q restrictions dealt with elsewhere.
+  if (!(frame_is_intra_only(cm)) || !rc->this_key_frame_forced ||
+      (cpi->twopass.last_kfgroup_zeromotion_pct < STATIC_MOTION_THRESH)) {
+    int qdelta = av1_frame_type_qdelta(cpi, gf_group->rf_level[gf_group->index],
+                                       active_worst_quality);
+    active_worst_quality =
+        AOMMAX(active_worst_quality + qdelta, active_best_quality);
+  }
+
+  // Modify active_best_quality for downscaled normal frames.
+  if (av1_frame_scaled(cm) && !frame_is_kf_gf_arf(cpi)) {
+    int qdelta = av1_compute_qdelta_by_rate(
+        rc, cm->frame_type, active_best_quality, 2.0, bit_depth);
+    active_best_quality =
+        AOMMAX(active_best_quality + qdelta, rc->best_quality);
+  }
+
+  active_best_quality =
+      clamp(active_best_quality, rc->best_quality, rc->worst_quality);
+  active_worst_quality =
+      clamp(active_worst_quality, active_best_quality, rc->worst_quality);
+
+  if (oxcf->rc_mode == AOM_Q) {
+    q = active_best_quality;
+    // Special case code to try and match quality with forced key frames.
+  } else if (frame_is_intra_only(cm) && rc->this_key_frame_forced) {
+    // If static since last kf use better of last boosted and last kf q.
+    if (cpi->twopass.last_kfgroup_zeromotion_pct >= STATIC_MOTION_THRESH) {
+      q = AOMMIN(rc->last_kf_qindex, rc->last_boosted_qindex);
+    } else {
+      q = AOMMIN(rc->last_boosted_qindex,
+                 (active_best_quality + active_worst_quality) / 2);
+    }
+  } else {
+    q = av1_rc_regulate_q(cpi, rc->this_frame_target, active_best_quality,
+                          active_worst_quality, width, height);
+    if (q > active_worst_quality) {
+      // Special case when we are targeting the max allowed rate.
+      if (rc->this_frame_target >= rc->max_frame_bandwidth)
+        active_worst_quality = q;
+      else
+        q = active_worst_quality;
+    }
+  }
+  // clamp() returns the clamped value rather than modifying its argument, so
+  // the result has to be stored back for q to be kept inside the range.
+  q = clamp(q, active_best_quality, active_worst_quality);
+
+  *top_index = active_worst_quality;
+  *bottom_index = active_best_quality;
+
+  assert(*top_index <= rc->worst_quality && *top_index >= rc->best_quality);
+  assert(*bottom_index <= rc->worst_quality &&
+         *bottom_index >= rc->best_quality);
+  assert(q <= rc->worst_quality && q >= rc->best_quality);
+  return q;
+}
+
+// Picks the frame qindex and its allowed range [*bottom_index, *top_index],
+// dispatching on the encode pass and, for one-pass encodes, on the rate
+// control mode. Returns the chosen qindex.
+int av1_rc_pick_q_and_bounds(AV1_COMP *cpi, int width, int height,
+                             int *bottom_index, int *top_index) {
+  if (cpi->oxcf.pass == 0) {
+    // One pass: CBR has its own picker, every other mode uses the VBR one.
+    return (cpi->oxcf.rc_mode == AOM_CBR)
+               ? rc_pick_q_and_bounds_one_pass_cbr(cpi, width, height,
+                                                   bottom_index, top_index)
+               : rc_pick_q_and_bounds_one_pass_vbr(cpi, width, height,
+                                                   bottom_index, top_index);
+  }
+
+  assert(cpi->oxcf.pass == 2 && "invalid encode pass");
+
+  int picked_arf_q = 0;
+  const int q = rc_pick_q_and_bounds_two_pass(cpi, width, height, bottom_index,
+                                              top_index, &picked_arf_q);
+
+  // Only an ARF update records its quality in rc->arf_q; the two-pass picker
+  // reads that value back for other boosted frames.
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  if (gf_group->update_type[gf_group->index] == ARF_UPDATE)
+    cpi->rc.arf_q = picked_arf_q;
+
+  return q;
+}
+
+// Computes the frame size window around |frame_target| and stores it in
+// *frame_under_shoot_limit / *frame_over_shoot_limit. In AOM_Q mode the
+// window is unbounded: [0, INT_MAX].
+void av1_rc_compute_frame_size_bounds(const AV1_COMP *cpi, int frame_target,
+                                      int *frame_under_shoot_limit,
+                                      int *frame_over_shoot_limit) {
+  if (cpi->oxcf.rc_mode == AOM_Q) {
+    *frame_under_shoot_limit = 0;
+    *frame_over_shoot_limit = INT_MAX;
+    return;
+  }
+
+  // The fixed 200 on each side guarantees a minimum range for very small
+  // rate targets, where the percentage tolerance may be tiny.
+  const int tolerance = (cpi->sf.recode_tolerance * frame_target) / 100;
+  const int lower = frame_target - tolerance - 200;
+  const int upper = frame_target + tolerance + 200;
+
+  *frame_under_shoot_limit = AOMMAX(lower, 0);
+  *frame_over_shoot_limit = AOMMIN(upper, cpi->rc.max_frame_bandwidth);
+}
+
+// Stores |target| as the bit target for the current frame, scales it by
+// resize_rate_factor() when the frame is coded at a scaled size, and derives
+// the per-64x64-superblock target rate from it.
+static void rc_set_frame_target(AV1_COMP *cpi, int target, int width,
+                                int height) {
+  const AV1_COMMON *const cm = &cpi->common;
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  rc->this_frame_target = target;
+
+  // Modify frame size target when down-scaled.
+  if (av1_frame_scaled(cm))
+    rc->this_frame_target =
+        (int)(rc->this_frame_target * resize_rate_factor(cpi, width, height));
+
+  // Target rate per SB64 (including partial SB64s).
+  // The whole quotient is computed in 64 bits and only then narrowed: casting
+  // the product itself to int would truncate it for large frame targets.
+  rc->sb64_target_rate =
+      (int)(((int64_t)rc->this_frame_target * 64 * 64) / (width * height));
+}
+
+// Rate control bookkeeping after a frame that refreshed the alt ref buffer.
+static void update_alt_ref_frame_stats(AV1_COMP *cpi) {
+  // The alternate reference is now active and none is pending any more
+  // (pending == 0 means no further alt refs are waiting).
+  cpi->rc.source_alt_ref_active = 1;
+  cpi->rc.source_alt_ref_pending = 0;
+
+  // This frame counts as a refresh, so following frames do not refresh
+  // unless specified by the user.
+  cpi->rc.frames_since_golden = 0;
+}
+
+// Rate control bookkeeping for the golden frame: resets or advances
+// rc->frames_since_golden and clears rc->source_alt_ref_active when no alt
+// ref is pending for the upcoming group.
+static void update_golden_frame_stats(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+  const TWO_PASS *const twopass = &cpi->twopass;
+  const GF_GROUP *const gf_group = &twopass->gf_group;
+  const int is_intrnl_arf =
+      cpi->oxcf.pass == 2
+          ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+          : cpi->refresh_alt2_ref_frame;
+#else
+  // Must use the same name as the CUSTOMIZED_GF branch: the code below reads
+  // is_intrnl_arf in both configurations.
+  const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif
+
+  // Update the Golden frame usage counts.
+  // NOTE(weitinglin): If we use show_existing_frame for an OVERLAY frame,
+  //                   only the virtual indices for the reference frame will be
+  //                   updated and cpi->refresh_golden_frame will still be zero.
+  if (cpi->refresh_golden_frame || rc->is_src_frame_alt_ref) {
+    // We will not use internal overlay frames to replace the golden frame
+    if (!rc->is_src_frame_ext_arf)
+      // this frame refreshes means next frames don't unless specified by user
+      rc->frames_since_golden = 0;
+
+    // If we are not using alt ref in the up and coming group clear the arf
+    // active flag. In multi arf group case, if the index is not 0 then
+    // we are overlaying a mid group arf so should not reset the flag.
+    if (cpi->oxcf.pass == 2) {
+      if (!rc->source_alt_ref_pending && (cpi->twopass.gf_group.index == 0))
+        rc->source_alt_ref_active = 0;
+    } else if (!rc->source_alt_ref_pending) {
+      rc->source_alt_ref_active = 0;
+    }
+  } else if (!cpi->refresh_alt_ref_frame && !is_intrnl_arf) {
+    // Neither golden, alt ref nor internal alt ref was refreshed.
+    rc->frames_since_golden++;
+  }
+}
+
+// Post-encode rate control update for a frame that was coded in |bytes_used|
+// bytes. Records the projected frame size in bits, updates the rate
+// correction factors, the last/average Q records, the buffer level, the
+// rolling and total bit counters, and the golden / alt ref frame stats.
+void av1_rc_postencode_update(AV1_COMP *cpi, uint64_t bytes_used) {
+ const AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+#if CUSTOMIZED_GF
+ const TWO_PASS *const twopass = &cpi->twopass;
+ const GF_GROUP *const gf_group = &twopass->gf_group;
+ const int is_intrnl_arf =
+ cpi->oxcf.pass == 2
+ ? gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE
+ : cpi->refresh_alt2_ref_frame;
+#else
+ const int is_intrnl_arf = cpi->refresh_alt2_ref_frame;
+#endif
+
+ const int qindex = cm->base_qindex;
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
+ av1_cyclic_refresh_postencode(cpi);
+ }
+
+ // Update rate control heuristics
+ // (bytes are converted to bits by the shift).
+ rc->projected_frame_size = (int)(bytes_used << 3);
+
+ // Post encode loop adjustment of Q prediction.
+ av1_rc_update_rate_correction_factors(cpi, cm->width, cm->height);
+
+ // Keep a record of last Q and ambient average Q.
+ // The averages are 3:1 weighted blends of the old average and this frame.
+ if (cm->frame_type == KEY_FRAME) {
+ rc->last_q[KEY_FRAME] = qindex;
+ rc->avg_frame_qindex[KEY_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2);
+ } else {
+ // Only frames that refresh none of golden / alt ref / internal alt ref
+ // and are not alt ref overlays feed the inter frame statistics.
+ if (!rc->is_src_frame_alt_ref &&
+ !(cpi->refresh_golden_frame || is_intrnl_arf ||
+ cpi->refresh_alt_ref_frame)) {
+ rc->last_q[INTER_FRAME] = qindex;
+ rc->avg_frame_qindex[INTER_FRAME] =
+ ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2);
+ rc->ni_frames++;
+ rc->tot_q += av1_convert_qindex_to_q(qindex, cm->seq_params.bit_depth);
+ rc->avg_q = rc->tot_q / rc->ni_frames;
+ // Calculate the average Q for normal inter frames (not key or GFU
+ // frames).
+ rc->ni_tot_qi += qindex;
+ rc->ni_av_qi = rc->ni_tot_qi / rc->ni_frames;
+ }
+ }
+
+ // Keep record of last boosted (KF/GF/ARF) Q value.
+ // If the current frame is coded at a lower Q then we also update it.
+ // If all mbs in this group are skipped only update if the Q value is
+ // better than that already stored.
+ // This is used to help set quality in forced key frames to reduce popping
+ if ((qindex < rc->last_boosted_qindex) || (cm->frame_type == KEY_FRAME) ||
+ (!rc->constrained_gf_group &&
+ (cpi->refresh_alt_ref_frame || is_intrnl_arf ||
+ (cpi->refresh_golden_frame && !rc->is_src_frame_alt_ref)))) {
+ rc->last_boosted_qindex = qindex;
+ }
+ if (cm->frame_type == KEY_FRAME) rc->last_kf_qindex = qindex;
+
+ update_buffer_level(cpi, rc->projected_frame_size);
+
+ // Rolling monitors of whether we are over or underspending used to help
+ // regulate min and Max Q in two pass.
+ // For scaled frames the target is first divided by resize_rate_factor(),
+ // the same factor rc_set_frame_target() multiplied it by.
+ if (av1_frame_scaled(cm))
+ rc->this_frame_target =
+ (int)(rc->this_frame_target /
+ resize_rate_factor(cpi, cm->width, cm->height));
+ if (cm->frame_type != KEY_FRAME) {
+ // Short windows weight history 3:1, long windows 31:1.
+ rc->rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_target_bits * 3 + rc->this_frame_target, 2);
+ rc->rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->rolling_actual_bits * 3 + rc->projected_frame_size, 2);
+ rc->long_rolling_target_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_target_bits * 31 + rc->this_frame_target, 5);
+ rc->long_rolling_actual_bits = ROUND_POWER_OF_TWO(
+ rc->long_rolling_actual_bits * 31 + rc->projected_frame_size, 5);
+ }
+
+ // Actual bits spent
+ rc->total_actual_bits += rc->projected_frame_size;
+ // TODO(zoeliu): To investigate whether we should treat BWDREF_FRAME
+ // differently here for rc->avg_frame_bandwidth.
+ rc->total_target_bits +=
+ (cm->show_frame || rc->is_bwd_ref_frame) ? rc->avg_frame_bandwidth : 0;
+
+ rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
+
+ if (is_altref_enabled(cpi) && cpi->refresh_alt_ref_frame &&
+ (cm->frame_type != KEY_FRAME))
+ // Update the alternate reference frame stats as appropriate.
+ update_alt_ref_frame_stats(cpi);
+ else
+ // Update the Golden frame stats as appropriate.
+ update_golden_frame_stats(cpi);
+
+ if (cm->frame_type == KEY_FRAME) rc->frames_since_key = 0;
+ // if (cm->current_video_frame == 1 && cm->show_frame)
+ /*
+ rc->this_frame_target =
+ (int)(rc->this_frame_target / resize_rate_factor(cpi, cm->width,
+ cm->height));
+ */
+}
+
+// Post-encode rate control update for a frame that was dropped.
+void av1_rc_postencode_update_drop_frame(AV1_COMP *cpi) {
+  RATE_CONTROL *const rc = &cpi->rc;
+
+  // The dropped frame adds zero bits to the buffer level.
+  update_buffer_level(cpi, 0);
+
+  // Frame counters still advance for a dropped frame.
+  ++rc->frames_since_key;
+  --rc->frames_to_key;
+
+  rc->rc_2_frame = 0;
+  rc->rc_1_frame = 0;
+}
+
+// Use this macro to turn on/off use of alt-refs in one-pass mode.
+#define USE_ALTREF_FOR_ONE_PASS 1
+
+// Returns the one-pass VBR bit target for a non-key frame, after clamping by
+// av1_rc_clamp_pframe_target_size(). With USE_ALTREF_FOR_ONE_PASS, a frame
+// that refreshes golden or alt ref (and is not an alt ref overlay) gets
+// af_ratio times the target of any other frame in the interval.
+static int calc_pframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+  static const int af_ratio = 10;
+  const RATE_CONTROL *const rc = &cpi->rc;
+  // Computed in 64 bits: bandwidth * interval * ratio can exceed INT_MAX at
+  // high bitrates, and signed int overflow is undefined behavior.
+  int64_t target;
+#if USE_ALTREF_FOR_ONE_PASS
+  const int is_boosted =
+      !rc->is_src_frame_alt_ref &&
+      (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame);
+  const int64_t interval_bits =
+      (int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval;
+  target = (is_boosted ? interval_bits * af_ratio : interval_bits) /
+           (rc->baseline_gf_interval + af_ratio - 1);
+#else
+  target = rc->avg_frame_bandwidth;
+#endif
+  // Saturate before narrowing back to the int the clamp helper expects.
+  if (target > INT_MAX) target = INT_MAX;
+  return av1_rc_clamp_pframe_target_size(cpi, (int)target);
+}
+
+// Returns the one-pass VBR bit target for a key frame: kf_ratio times the
+// average per-frame bandwidth, clamped by av1_rc_clamp_iframe_target_size().
+static int calc_iframe_target_size_one_pass_vbr(const AV1_COMP *const cpi) {
+  static const int kf_ratio = 25;
+  const RATE_CONTROL *rc = &cpi->rc;
+  // The product is formed in 64 bits and saturated to INT_MAX, because
+  // avg_frame_bandwidth * kf_ratio can overflow a signed int.
+  const int64_t wide_target = (int64_t)rc->avg_frame_bandwidth * kf_ratio;
+  const int target = (wide_target > INT_MAX) ? INT_MAX : (int)wide_target;
+  return av1_rc_clamp_iframe_target_size(cpi, target);
+}
+
+// Per-frame setup for one-pass VBR: decides the frame type (key, inter or
+// S-frame), starts a new golden frame interval when one is due, and sets the
+// frame's bit target through rc_set_frame_target().
+void av1_rc_get_one_pass_vbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ int altref_enabled = is_altref_enabled(cpi);
+ // NOTE(review): sframe_dist is used as a modulus below; presumably it is
+ // validated to be non-zero whenever sframe_enabled is set - confirm.
+ int sframe_dist = cpi->oxcf.sframe_dist;
+ int sframe_mode = cpi->oxcf.sframe_mode;
+ int sframe_enabled = cpi->oxcf.sframe_enabled;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ // Key frame: first frame, key frame flag set, or key frame interval
+ // exhausted - but never on a frame that refreshes the alt ref.
+ if (!cpi->refresh_alt_ref_frame &&
+ (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ // "Forced" means the interval ran out on a frame other than the first.
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ if (sframe_enabled) {
+ if (altref_enabled) {
+ if (sframe_mode == 1) {
+ // sframe_mode == 1: insert sframe if it matches altref frame.
+
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0 &&
+ cpi->refresh_alt_ref_frame) {
+ cm->frame_type = S_FRAME;
+ }
+ } else {
+ // sframe_mode != 1: if sframe will be inserted at the next available
+ // altref frame
+
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+ rc->sframe_due = 1;
+ }
+
+ if (rc->sframe_due && cpi->refresh_alt_ref_frame) {
+ cm->frame_type = S_FRAME;
+ rc->sframe_due = 0;
+ }
+ }
+ } else {
+ // No alt refs: the S-frame goes exactly on the sframe_dist multiple.
+ if (cm->current_video_frame % sframe_dist == 0 &&
+ cm->frame_type != KEY_FRAME && cm->current_video_frame != 0) {
+ cm->frame_type = S_FRAME;
+ }
+ }
+ }
+ }
+ // Start a new golden frame interval when the previous one has run out.
+ if (rc->frames_till_gf_update_due == 0) {
+ rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key) {
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ rc->constrained_gf_group = 1;
+ } else {
+ rc->constrained_gf_group = 0;
+ }
+ cpi->refresh_golden_frame = 1;
+ rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi);
+
+ // S-frames take the non-key-frame target, like inter frames.
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_vbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_vbr(cpi);
+ rc_set_frame_target(cpi, target, cm->width, cm->height);
+}
+
+// Returns the one-pass CBR bit target for a non-key frame. The base target is
+// the average frame bandwidth (optionally redistributed towards golden
+// frames), then adjusted by how far the buffer level is from its optimal
+// level, capped by rc_max_inter_bitrate_pct and floored at a minimum target.
+static int calc_pframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const AV1EncoderConfig *oxcf = &cpi->oxcf;
+  const RATE_CONTROL *rc = &cpi->rc;
+  const int64_t diff = rc->optimal_buffer_level - rc->buffer_level;
+  const int64_t one_pct_bits = 1 + rc->optimal_buffer_level / 100;
+  const int min_frame_target =
+      AOMMAX(rc->avg_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS);
+  int target = rc->avg_frame_bandwidth;
+
+  if (oxcf->gf_cbr_boost_pct) {
+    // Golden frames are weighted af_ratio_pct, every other frame 100; both
+    // cases share the same denominator.
+    const int af_ratio_pct = oxcf->gf_cbr_boost_pct + 100;
+    const int weight_pct = cpi->refresh_golden_frame ? af_ratio_pct : 100;
+    target =
+        (rc->avg_frame_bandwidth * rc->baseline_gf_interval * weight_pct) /
+        (rc->baseline_gf_interval * 100 + af_ratio_pct - 100);
+  }
+
+  if (diff > 0) {
+    // Buffer is below its optimal level: lower the target for this frame.
+    const int pct_low = (int)AOMMIN(diff / one_pct_bits, oxcf->under_shoot_pct);
+    target -= (target * pct_low) / 200;
+  } else if (diff < 0) {
+    // Buffer is above its optimal level: raise the target for this frame.
+    const int pct_high =
+        (int)AOMMIN(-diff / one_pct_bits, oxcf->over_shoot_pct);
+    target += (target * pct_high) / 200;
+  }
+
+  if (oxcf->rc_max_inter_bitrate_pct) {
+    const int max_rate =
+        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    target = AOMMIN(target, max_rate);
+  }
+
+  return AOMMAX(min_frame_target, target);
+}
+
+// Returns the one-pass CBR bit target for a key frame, clamped by
+// av1_rc_clamp_iframe_target_size().
+static int calc_iframe_target_size_one_pass_cbr(const AV1_COMP *cpi) {
+  const RATE_CONTROL *rc = &cpi->rc;
+
+  if (cpi->common.current_video_frame == 0) {
+    // Very first frame: half the starting buffer level, saturated to int.
+    const int first_target = ((rc->starting_buffer_level / 2) > INT_MAX)
+                                 ? INT_MAX
+                                 : (int)(rc->starting_buffer_level / 2);
+    return av1_rc_clamp_iframe_target_size(cpi, first_target);
+  }
+
+  const double framerate = cpi->framerate;
+  const double half_framerate = framerate / 2;
+
+  // Boost is at least 32 and grows with the frame rate.
+  int kf_boost = AOMMAX(32, (int)(2 * framerate - 16));
+
+  // Scale the boost down when fewer than framerate / 2 frames have been
+  // coded since the previous key frame.
+  if (rc->frames_since_key < half_framerate) {
+    kf_boost = (int)(kf_boost * rc->frames_since_key / half_framerate);
+  }
+
+  // Target is (16 + kf_boost) / 16 of the average frame bandwidth.
+  const int boosted_target = ((16 + kf_boost) * rc->avg_frame_bandwidth) >> 4;
+  return av1_rc_clamp_iframe_target_size(cpi, boosted_target);
+}
+
+// Per-frame setup for one-pass CBR: decides between key and inter frame,
+// starts a new golden frame interval when one is due, and sets the frame's
+// bit target through rc_set_frame_target().
+void av1_rc_get_one_pass_cbr_params(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ RATE_CONTROL *const rc = &cpi->rc;
+ int target;
+ // TODO(yaowu): replace the "auto_key && 0" below with proper decision logic.
+ // Key frame: first frame, key frame flag set, or key frame interval
+ // exhausted.
+ if ((cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) ||
+ rc->frames_to_key == 0 || (cpi->oxcf.auto_key && 0))) {
+ cm->frame_type = KEY_FRAME;
+ // "Forced" means the interval ran out on a frame other than the first.
+ rc->this_key_frame_forced =
+ cm->current_video_frame != 0 && rc->frames_to_key == 0;
+ rc->frames_to_key = cpi->oxcf.key_freq;
+ rc->kf_boost = DEFAULT_KF_BOOST;
+ rc->source_alt_ref_active = 0;
+ } else {
+ cm->frame_type = INTER_FRAME;
+ }
+ // Start a new golden frame interval when the previous one has run out.
+ if (rc->frames_till_gf_update_due == 0) {
+ // With cyclic refresh AQ the golden update is delegated to the cyclic
+ // refresh code; otherwise use the midpoint of the min/max interval.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_set_golden_update(cpi);
+ else
+ rc->baseline_gf_interval =
+ (rc->min_gf_interval + rc->max_gf_interval) / 2;
+ rc->frames_till_gf_update_due = rc->baseline_gf_interval;
+ // NOTE: frames_till_gf_update_due must be <= frames_to_key.
+ if (rc->frames_till_gf_update_due > rc->frames_to_key)
+ rc->frames_till_gf_update_due = rc->frames_to_key;
+ cpi->refresh_golden_frame = 1;
+ rc->gfu_boost = DEFAULT_GF_BOOST;
+ }
+
+ // Any update/change of global cyclic refresh parameters (amount/delta-qp)
+ // should be done here, before the frame qp is selected.
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+ av1_cyclic_refresh_update_parameters(cpi);
+
+ if (cm->frame_type == KEY_FRAME)
+ target = calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = calc_pframe_target_size_one_pass_cbr(cpi);
+
+ rc_set_frame_target(cpi, target, cm->width, cm->height);
+ // TODO(afergs): Decide whether to scale up, down, or not at all
+}
+
+// Returns the first qindex in [rc->best_quality, rc->worst_quality) whose
+// real Q value is >= |q|. If no index qualifies, the last index visited
+// (worst_quality - 1) is returned; if the range is empty, worst_quality.
+static int lowest_qindex_reaching_q(const RATE_CONTROL *rc, double q,
+                                    aom_bit_depth_t bit_depth) {
+  int index = rc->worst_quality;
+  for (int i = rc->best_quality; i < rc->worst_quality; ++i) {
+    index = i;
+    if (av1_convert_qindex_to_q(i, bit_depth) >= q) break;
+  }
+  return index;
+}
+
+// Returns the qindex delta that corresponds to moving from real Q value
+// |qstart| to real Q value |qtarget| at the given bit depth.
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+                       aom_bit_depth_t bit_depth) {
+  // Convert both Q values to indices, then take the difference.
+  const int start_index = lowest_qindex_reaching_q(rc, qstart, bit_depth);
+  const int target_index = lowest_qindex_reaching_q(rc, qtarget, bit_depth);
+  return target_index - start_index;
+}
+
+// Returns the qindex delta from |qindex| to the first qindex in
+// [rc->best_quality, rc->worst_quality) whose projected bits per MB is at
+// most |rate_target_ratio| times that of |qindex|. If no index qualifies, the
+// delta is taken to rc->worst_quality.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+                               int qindex, double rate_target_ratio,
+                               aom_bit_depth_t bit_depth) {
+  // Projected bits per MB at the base index, scaled by the requested ratio.
+  const int base_bits_per_mb =
+      av1_rc_bits_per_mb(frame_type, qindex, 1.0, bit_depth);
+  const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb);
+
+  // Search upwards for the first index that meets the target.
+  int target_index = rc->worst_quality;
+  for (int idx = rc->best_quality; idx < rc->worst_quality; ++idx) {
+    if (av1_rc_bits_per_mb(frame_type, idx, 1.0, bit_depth) >
+        target_bits_per_mb)
+      continue;
+    target_index = idx;
+    break;
+  }
+
+  return target_index - qindex;
+}
+
+// Sets rc->min_gf_interval, rc->max_gf_interval and
+// rc->static_scene_max_gf_interval from the encoder configuration.
+void av1_rc_set_gf_interval_range(const AV1_COMP *const cpi,
+                                  RATE_CONTROL *const rc) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+
+  // Special case code for 1 pass fixed Q mode tests
+  if (oxcf->pass == 0 && oxcf->rc_mode == AOM_Q) {
+    rc->min_gf_interval = FIXED_GF_INTERVAL;
+    rc->max_gf_interval = FIXED_GF_INTERVAL;
+    rc->static_scene_max_gf_interval = FIXED_GF_INTERVAL;
+    return;
+  }
+
+  // Take the configured limits; a value of 0 selects the default.
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = av1_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, cpi->framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = av1_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
+
+  // Extended interval for genuinely static scenes, limited by the lag when
+  // alt refs are enabled.
+  rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
+  if (is_altref_enabled(cpi))
+    rc->static_scene_max_gf_interval =
+        AOMMIN(rc->static_scene_max_gf_interval, oxcf->lag_in_frames - 1);
+
+  // max may not exceed the static scene limit, and min may not exceed max.
+  rc->max_gf_interval =
+      AOMMIN(rc->max_gf_interval, rc->static_scene_max_gf_interval);
+  rc->min_gf_interval = AOMMIN(rc->min_gf_interval, rc->max_gf_interval);
+}
+
+// Recomputes the average, minimum and maximum per-frame bandwidth from the
+// target bandwidth and the current frame rate, then refreshes the golden
+// frame interval range.
+void av1_rc_update_framerate(AV1_COMP *cpi, int width, int height) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int MBs = av1_get_MBs(width, height);
+
+  rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate);
+
+  // Minimum: the configured percentage of the average, but never less than
+  // the frame overhead.
+  const int vbr_min_bits =
+      (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100);
+  rc->min_frame_bandwidth = AOMMAX(vbr_min_bits, FRAME_OVERHEAD_BITS);
+
+  // A maximum bitrate for a frame is defined.
+  // The baseline for this aligns with HW implementations that
+  // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
+  // per 16x16 MB (averaged over a frame). However this limit is extended if
+  // a very high rate is given on the command line or the rate cannot
+  // be achieved because of a user specified max q (e.g. when the user
+  // specifies lossless encode).
+  const int vbr_max_bits =
+      (int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
+            100);
+  rc->max_frame_bandwidth =
+      AOMMAX(AOMMAX((MBs * MAX_MB_RATE), MAXRATE_1080P), vbr_max_bits);
+
+  av1_rc_set_gf_interval_range(cpi, rc);
+}
+
+#define VBR_PCT_ADJUSTMENT_LIMIT 50
+// VBR adjustment of *this_frame_target based on the bit error accumulated
+// over previous frames (rc->vbr_bits_off_target), plus a faster
+// redistribution of rc->vbr_bits_off_target_fast for ordinary frames.
+static void vbr_rate_correction(AV1_COMP *cpi, int *this_frame_target) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const int64_t bits_off_target = rc->vbr_bits_off_target;
+
+  // How far through the clip are we.
+  // This number is used to damp the per frame rate correction.
+  // Range 0 - 1.0
+  double position_factor = 1.0;
+  if (cpi->twopass.total_stats.count != 0.) {
+    position_factor = sqrt((double)cpi->common.current_video_frame /
+                           cpi->twopass.total_stats.count);
+  }
+
+  // The correction never exceeds VBR_PCT_ADJUSTMENT_LIMIT percent of the
+  // target, scaled by the position factor.
+  const int max_delta =
+      (int)(position_factor *
+            ((*this_frame_target * VBR_PCT_ADJUSTMENT_LIMIT) / 100));
+
+  if (bits_off_target > 0) {
+    // Extra bits are available to spend.
+    const int increase =
+        (bits_off_target > max_delta) ? max_delta : (int)bits_off_target;
+    *this_frame_target += increase;
+  } else {
+    // Bits have to be given back.
+    const int decrease =
+        (bits_off_target < -max_delta) ? max_delta : (int)-bits_off_target;
+    *this_frame_target -= decrease;
+  }
+
+  // Fast redistribution of bits arising from massive local undershoot.
+  // Don't do it for kf, arf, gf or overlay frames.
+  if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref &&
+      rc->vbr_bits_off_target_fast) {
+    const int one_frame_bits =
+        AOMMAX(rc->avg_frame_bandwidth, *this_frame_target);
+    int fast_extra_bits =
+        (int)AOMMIN(rc->vbr_bits_off_target_fast, one_frame_bits);
+    fast_extra_bits = (int)AOMMIN(
+        fast_extra_bits,
+        AOMMAX(one_frame_bits / 8, rc->vbr_bits_off_target_fast / 8));
+    *this_frame_target += fast_extra_bits;
+    rc->vbr_bits_off_target_fast -= fast_extra_bits;
+  }
+}
+
+// Sets the frame bit target from rc->base_frame_target, applying the VBR
+// over/undershoot correction first in AOM_VBR and AOM_CQ modes.
+void av1_set_target_rate(AV1_COMP *cpi, int width, int height) {
+  int target_rate = cpi->rc.base_frame_target;
+  const int wants_vbr_correction =
+      cpi->oxcf.rc_mode == AOM_VBR || cpi->oxcf.rc_mode == AOM_CQ;
+
+  // Correction to rate target based on prior over or under shoot.
+  if (wants_vbr_correction) {
+    vbr_rate_correction(cpi, &target_rate);
+  }
+
+  rc_set_frame_target(cpi, target_rate, width, height);
+}
diff --git a/third_party/aom/av1/encoder/ratectrl.h b/third_party/aom/av1/encoder/ratectrl.h
new file mode 100644
index 0000000000..49d1ae15c1
--- /dev/null
+++ b/third_party/aom/av1/encoder/ratectrl.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RATECTRL_H_
+#define AOM_AV1_ENCODER_RATECTRL_H_
+
+#include "aom/aom_codec.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS 9
+
+#define CUSTOMIZED_GF 1
+
+#if CONFIG_FIX_GF_LENGTH
+#define FIXED_GF_LENGTH 16
+#define MAX_PYRAMID_LVL 4
+// We allow a frame to have at most two left/right descendants before changing
+// them into a subtree, i.e., we allow the following structure:
+/* OUT_OF_ORDER_FRAME
+ / / \ \
+(two left children) F F F F (two right children) */
+// Therefore the max gf size supported by 4 layer structure is
+// 1 (KEY/OVERLAY) + 1 + 2 + 4 + 16 (two children on both sides of their parent)
+#define MAX_PYRAMID_SIZE 24
+#define USE_SYMM_MULTI_LAYER 1
+#define REDUCE_LAST_ALT_BOOST 1
+#define REDUCE_LAST_GF_LENGTH 1
+#define MULTI_LVL_BOOST_VBR_CQ 1
+#else
+#define USE_SYMM_MULTI_LAYER 0
+#define REDUCE_LAST_ALT_BOOST 0
+#define REDUCE_LAST_GF_LENGTH 0
+#define MULTI_LVL_BOOST_VBR_CQ 0
+#endif
+
+#if USE_SYMM_MULTI_LAYER
+#define USE_MANUAL_GF4_STRUCT 0
+#endif
+
+#define MIN_GF_INTERVAL 4
+#define MAX_GF_INTERVAL 16
+#define FIXED_GF_INTERVAL 8 // Used in some testing modes only
+
+typedef enum {
+ INTER_NORMAL = 0,
+ INTER_LOW = 1,
+ INTER_HIGH = 2,
+ GF_ARF_LOW = 3,
+ GF_ARF_STD = 4,
+ KF_STD = 5,
+ RATE_FACTOR_LEVELS = 6
+} RATE_FACTOR_LEVEL;
+
+static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
+ 1.00, // INTER_NORMAL
+ 0.80, // INTER_LOW
+ 1.50, // INTER_HIGH
+ 1.25, // GF_ARF_LOW
+ 2.00, // GF_ARF_STD
+ 2.00, // KF_STD
+};
+
+typedef struct {
+ int resize_width;
+ int resize_height;
+ uint8_t superres_denom;
+} size_params_type;
+
+typedef struct {
+ // Rate targeting variables
+ int base_frame_target; // A baseline frame target before adjustment
+ // for previous under or over shoot.
+ int this_frame_target; // Actual frame target after rc adjustment.
+ int projected_frame_size;
+ int sb64_target_rate;
+ int last_q[FRAME_TYPES]; // Separate values for Intra/Inter
+ int last_boosted_qindex; // Last boosted GF/KF/ARF q
+ int last_kf_qindex; // Q index of the last key frame coded.
+
+ int gfu_boost;
+ int last_boost;
+ int kf_boost;
+
+ double rate_correction_factors[RATE_FACTOR_LEVELS];
+
+ int frames_since_golden;
+ int frames_till_gf_update_due;
+ int min_gf_interval;
+ int max_gf_interval;
+ int static_scene_max_gf_interval;
+ int baseline_gf_interval;
+ int constrained_gf_group;
+ int frames_to_key;
+ int frames_since_key;
+ int this_key_frame_forced;
+ int next_key_frame_forced;
+ int source_alt_ref_pending;
+ int source_alt_ref_active;
+ int is_src_frame_alt_ref;
+ int sframe_due;
+
+ // Length of the bi-predictive frame group interval
+ int bipred_group_interval;
+
+ // NOTE: Different types of frames may have different bits allocated
+ // accordingly, aiming to achieve the overall optimal RD performance.
+ int is_bwd_ref_frame;
+ int is_last_bipred_frame;
+ int is_bipred_frame;
+ int is_src_frame_ext_arf;
+
+ int avg_frame_bandwidth; // Average frame size target for clip
+ int min_frame_bandwidth; // Minimum allocation used for any frame
+ int max_frame_bandwidth; // Maximum burst rate allowed for a frame.
+
+ int ni_av_qi;
+ int ni_tot_qi;
+ int ni_frames;
+ int avg_frame_qindex[FRAME_TYPES];
+ double tot_q;
+ double avg_q;
+
+ int64_t buffer_level;
+ int64_t bits_off_target;
+ int64_t vbr_bits_off_target;
+ int64_t vbr_bits_off_target_fast;
+
+ int decimation_factor;
+ int decimation_count;
+
+ int rolling_target_bits;
+ int rolling_actual_bits;
+
+ int long_rolling_target_bits;
+ int long_rolling_actual_bits;
+
+ int rate_error_estimate;
+
+ int64_t total_actual_bits;
+ int64_t total_target_bits;
+ int64_t total_target_vs_actual;
+
+ int worst_quality;
+ int best_quality;
+
+ int64_t starting_buffer_level;
+ int64_t optimal_buffer_level;
+ int64_t maximum_buffer_size;
+
+ // rate control history for last frame(1) and the frame before(2).
+ // -1: undershot
+ // 1: overshoot
+ // 0: not initialized.
+ int rc_1_frame;
+ int rc_2_frame;
+ int q_1_frame;
+ int q_2_frame;
+
+ // Auto frame-scaling variables.
+ int rf_level_maxq[RATE_FACTOR_LEVELS];
+ float arf_boost_factor;
+ // Q index used for ALT frame
+ int arf_q;
+} RATE_CONTROL;
+
+struct AV1_COMP;
+struct AV1EncoderConfig;
+
+void av1_rc_init(const struct AV1EncoderConfig *oxcf, int pass,
+ RATE_CONTROL *rc);
+
+int av1_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+double av1_convert_qindex_to_q(int qindex, aom_bit_depth_t bit_depth);
+
+void av1_rc_init_minq_luts(void);
+
+int av1_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note av1_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int av1_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
+// Generally at the high level, the following flow is expected
+// to be enforced for rate control:
+// First call per frame, one of:
+// av1_rc_get_one_pass_vbr_params()
+// av1_rc_get_one_pass_cbr_params()
+// av1_rc_get_first_pass_params()
+// av1_rc_get_second_pass_params()
+// depending on the usage to set the rate control encode parameters desired.
+//
+// Then, call encode_frame_to_data_rate() to perform the
+// actual encode. This function will in turn call encode_frame()
+// one or more times, followed by one of:
+// av1_rc_postencode_update()
+// av1_rc_postencode_update_drop_frame()
+//
+// The majority of rate control parameters are only expected
+// to be set in the av1_rc_get_..._params() functions and
+// updated during the av1_rc_postencode_update...() functions.
+// The only exceptions are av1_rc_drop_frame() and
+// av1_rc_update_rate_correction_factors() functions.
+
+// Functions to set parameters for encoding before the actual
+// encode_frame_to_data_rate() function.
+void av1_rc_get_one_pass_vbr_params(struct AV1_COMP *cpi);
+void av1_rc_get_one_pass_cbr_params(struct AV1_COMP *cpi);
+
+// Post encode update of the rate control parameters based
+// on bytes used
+void av1_rc_postencode_update(struct AV1_COMP *cpi, uint64_t bytes_used);
+// Post encode update of the rate control parameters for dropped frames
+void av1_rc_postencode_update_drop_frame(struct AV1_COMP *cpi);
+
+// Updates rate correction factors
+// Changes only the rate correction factors in the rate control structure.
+void av1_rc_update_rate_correction_factors(struct AV1_COMP *cpi, int width,
+ int height);
+
+// Decide if we should drop this frame: For 1-pass CBR.
+// Changes only the decimation count in the rate control structure
+int av1_rc_drop_frame(struct AV1_COMP *cpi);
+
+// Computes frame size bounds.
+void av1_rc_compute_frame_size_bounds(const struct AV1_COMP *cpi,
+ int this_frame_target,
+ int *frame_under_shoot_limit,
+ int *frame_over_shoot_limit);
+
+// Picks q and q bounds given the target for bits
+int av1_rc_pick_q_and_bounds(struct AV1_COMP *cpi, int width, int height,
+ int *bottom_index, int *top_index);
+
+// Estimates q to achieve a target bits per frame
+int av1_rc_regulate_q(const struct AV1_COMP *cpi, int target_bits_per_frame,
+ int active_best_quality, int active_worst_quality,
+ int width, int height);
+
+// Estimates bits per mb for a given qindex and correction factor.
+int av1_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex,
+ double correction_factor, aom_bit_depth_t bit_depth);
+
+// Clamping utilities for bitrate targets for iframes and pframes.
+int av1_rc_clamp_iframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+int av1_rc_clamp_pframe_target_size(const struct AV1_COMP *const cpi,
+ int target);
+// Utility to set frame_target into the RATE_CONTROL structure
+// This function is called only from the av1_rc_get_..._params() functions.
+void av1_rc_set_frame_target(struct AV1_COMP *cpi, int target);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a target q value
+int av1_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget,
+ aom_bit_depth_t bit_depth);
+
+// Computes a q delta (in "q index" terms) to get from a starting q value
+// to a value that should equate to the given rate ratio.
+int av1_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type,
+ int qindex, double rate_target_ratio,
+ aom_bit_depth_t bit_depth);
+
+int av1_frame_type_qdelta(const struct AV1_COMP *cpi, int rf_level, int q);
+
+void av1_rc_update_framerate(struct AV1_COMP *cpi, int width, int height);
+
+void av1_rc_set_gf_interval_range(const struct AV1_COMP *const cpi,
+ RATE_CONTROL *const rc);
+
+void av1_set_target_rate(struct AV1_COMP *cpi, int width, int height);
+
+int av1_resize_one_pass_cbr(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RATECTRL_H_
diff --git a/third_party/aom/av1/encoder/rd.c b/third_party/aom/av1/encoder/rd.c
new file mode 100644
index 0000000000..b87d89e50b
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.c
@@ -0,0 +1,1512 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/common.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/tokenize.h"
+
+#define RD_THRESH_POW 1.25
+
+// The baseline rd thresholds for breaking out of the rd loop for
+// certain modes are assumed to be based on 8x8 blocks.
+// This table is used to correct for block size.
+// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
+static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES_ALL] = {
+ 2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32, 48, 48, 64, 4, 4, 8, 8, 16, 16
+};
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][EXT_TX_SIZES] =
+ {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ };
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][EXT_TX_SIZES] =
+ {
+ { 1, 1, 1, 1 }, // unused
+ { 1, 1, 0, 0 },
+ { 0, 0, 1, 0 },
+ { 0, 0, 0, 1 },
+ };
+
+static const int av1_ext_tx_set_idx_to_type[2][AOMMAX(EXT_TX_SETS_INTRA,
+ EXT_TX_SETS_INTER)] = {
+ {
+ // Intra
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ EXT_TX_SET_DTT4_IDTX,
+ },
+ {
+ // Inter
+ EXT_TX_SET_DCTONLY,
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ EXT_TX_SET_DCT_IDTX,
+ },
+};
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+ FRAME_CONTEXT *fc) {  // Fills the mode/syntax cost tables in x from the CDFs in fc.
+ int i, j;
+
+ for (i = 0; i < PARTITION_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->partition_cost[i], fc->partition_cdf[i], NULL);
+
+ if (cm->skip_mode_flag) {  // skip_mode costs are filled only when the frame has skip_mode_flag set.
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->skip_mode_cost[i], fc->skip_mode_cdfs[i],
+ NULL);
+ }
+ }
+
+ for (i = 0; i < SKIP_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->skip_cost[i], fc->skip_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < KF_MODE_CONTEXTS; ++i)
+ for (j = 0; j < KF_MODE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(x->y_mode_costs[i][j], fc->kf_y_cdf[i][j], NULL);
+
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+ av1_cost_tokens_from_cdf(x->mbmode_cost[i], fc->y_mode_cdf[i], NULL);
+ for (i = 0; i < CFL_ALLOWED_TYPES; ++i)
+ for (j = 0; j < INTRA_MODES; ++j)
+ av1_cost_tokens_from_cdf(x->intra_uv_mode_cost[i][j],
+ fc->uv_mode_cdf[i][j], NULL);
+
+ av1_cost_tokens_from_cdf(x->filter_intra_mode_cost, fc->filter_intra_mode_cdf,
+ NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (av1_filter_intra_allowed_bsize(cm, i))  // Other block sizes keep whatever filter_intra_cost held before.
+ av1_cost_tokens_from_cdf(x->filter_intra_cost[i],
+ fc->filter_intra_cdfs[i], NULL);
+ }
+
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->switchable_interp_costs[i],
+ fc->switchable_interp_cdf[i], NULL);
+
+ for (i = 0; i < PALATTE_BSIZE_CTXS; ++i) {
+ av1_cost_tokens_from_cdf(x->palette_y_size_cost[i],
+ fc->palette_y_size_cdf[i], NULL);
+ av1_cost_tokens_from_cdf(x->palette_uv_size_cost[i],
+ fc->palette_uv_size_cdf[i], NULL);
+ for (j = 0; j < PALETTE_Y_MODE_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(x->palette_y_mode_cost[i][j],
+ fc->palette_y_mode_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < PALETTE_UV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->palette_uv_mode_cost[i],
+ fc->palette_uv_mode_cdf[i], NULL);
+ }
+
+ for (i = 0; i < PALETTE_SIZES; ++i) {
+ for (j = 0; j < PALETTE_COLOR_INDEX_CONTEXTS; ++j) {
+ av1_cost_tokens_from_cdf(x->palette_y_color_cost[i][j],
+ fc->palette_y_color_index_cdf[i][j], NULL);
+ av1_cost_tokens_from_cdf(x->palette_uv_color_cost[i][j],
+ fc->palette_uv_color_index_cdf[i][j], NULL);
+ }
+ }
+
+ int sign_cost[CFL_JOINT_SIGNS];  // Cost of each joint CfL sign symbol.
+ av1_cost_tokens_from_cdf(sign_cost, fc->cfl_sign_cdf, NULL);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ int *cost_u = x->cfl_cost[joint_sign][CFL_PRED_U];
+ int *cost_v = x->cfl_cost[joint_sign][CFL_PRED_V];
+ if (CFL_SIGN_U(joint_sign) == CFL_SIGN_ZERO) {  // Zero U sign: the U alpha costs are set to 0.
+ memset(cost_u, 0, CFL_ALPHABET_SIZE * sizeof(*cost_u));
+ } else {
+ const aom_cdf_prob *cdf_u = fc->cfl_alpha_cdf[CFL_CONTEXT_U(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_u, cdf_u, NULL);
+ }
+ if (CFL_SIGN_V(joint_sign) == CFL_SIGN_ZERO) {  // Zero V sign: the V alpha costs are set to 0.
+ memset(cost_v, 0, CFL_ALPHABET_SIZE * sizeof(*cost_v));
+ } else {
+ const aom_cdf_prob *cdf_v = fc->cfl_alpha_cdf[CFL_CONTEXT_V(joint_sign)];
+ av1_cost_tokens_from_cdf(cost_v, cdf_v, NULL);
+ }
+ for (int u = 0; u < CFL_ALPHABET_SIZE; u++)
+ cost_u[u] += sign_cost[joint_sign];  // The joint sign cost is added to the U table only.
+ }
+
+ for (i = 0; i < MAX_TX_CATS; ++i)
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+ av1_cost_tokens_from_cdf(x->tx_size_cost[i][j], fc->tx_size_cdf[i][j],
+ NULL);
+
+ for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->txfm_partition_cost[i],
+ fc->txfm_partition_cdf[i], NULL);
+ }
+
+ for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+ int s;
+ for (s = 1; s < EXT_TX_SETS_INTER; ++s) {  // Set 0 is skipped; only (set, size) pairs enabled in the table are filled.
+ if (use_inter_ext_tx_for_txsize[s][i]) {
+ av1_cost_tokens_from_cdf(
+ x->inter_tx_type_costs[s][i], fc->inter_ext_tx_cdf[s][i],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[1][s]]);
+ }
+ }
+ for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {  // Same for intra sets, with one table per intra mode.
+ if (use_intra_ext_tx_for_txsize[s][i]) {
+ for (j = 0; j < INTRA_MODES; ++j) {
+ av1_cost_tokens_from_cdf(
+ x->intra_tx_type_costs[s][i][j], fc->intra_ext_tx_cdf[s][i][j],
+ av1_ext_tx_inv[av1_ext_tx_set_idx_to_type[0][s]]);
+ }
+ }
+ }
+ }
+ for (i = 0; i < DIRECTIONAL_MODES; ++i) {
+ av1_cost_tokens_from_cdf(x->angle_delta_cost[i], fc->angle_delta_cdf[i],
+ NULL);
+ }
+ av1_cost_tokens_from_cdf(x->switchable_restore_cost,
+ fc->switchable_restore_cdf, NULL);
+ av1_cost_tokens_from_cdf(x->wiener_restore_cost, fc->wiener_restore_cdf,
+ NULL);
+ av1_cost_tokens_from_cdf(x->sgrproj_restore_cost, fc->sgrproj_restore_cdf,
+ NULL);
+ av1_cost_tokens_from_cdf(x->intrabc_cost, fc->intrabc_cdf, NULL);
+
+ if (!frame_is_intra_only(cm)) {  // Everything below is filled only for frames that are not intra-only.
+ for (i = 0; i < COMP_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_inter_cost[i], fc->comp_inter_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < SINGLE_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->single_ref_cost[i][j],
+ fc->single_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < COMP_REF_TYPE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_ref_type_cost[i],
+ fc->comp_ref_type_cdf[i], NULL);
+ }
+
+ for (i = 0; i < UNI_COMP_REF_CONTEXTS; ++i) {
+ for (j = 0; j < UNIDIR_COMP_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->uni_comp_ref_cost[i][j],
+ fc->uni_comp_ref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < FWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->comp_ref_cost[i][j], fc->comp_ref_cdf[i][j],
+ NULL);
+ }
+ }
+
+ for (i = 0; i < REF_CONTEXTS; ++i) {
+ for (j = 0; j < BWD_REFS - 1; ++j) {
+ av1_cost_tokens_from_cdf(x->comp_bwdref_cost[i][j],
+ fc->comp_bwdref_cdf[i][j], NULL);
+ }
+ }
+
+ for (i = 0; i < INTRA_INTER_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->intra_inter_cost[i], fc->intra_inter_cdf[i],
+ NULL);
+ }
+
+ for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->newmv_mode_cost[i], fc->newmv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < GLOBALMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->zeromv_mode_cost[i], fc->zeromv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->refmv_mode_cost[i], fc->refmv_cdf[i], NULL);
+ }
+
+ for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->drl_mode_cost0[i], fc->drl_cdf[i], NULL);
+ }
+ for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+ av1_cost_tokens_from_cdf(x->inter_compound_mode_cost[i],
+ fc->inter_compound_mode_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i)
+ av1_cost_tokens_from_cdf(x->compound_type_cost[i],
+ fc->compound_type_cdf[i], NULL);
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ if (get_interinter_wedge_bits(i)) {  // Filled only where get_interinter_wedge_bits() is non-zero.
+ av1_cost_tokens_from_cdf(x->wedge_idx_cost[i], fc->wedge_idx_cdf[i],
+ NULL);
+ }
+ }
+ for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+ av1_cost_tokens_from_cdf(x->interintra_cost[i], fc->interintra_cdf[i],
+ NULL);
+ av1_cost_tokens_from_cdf(x->interintra_mode_cost[i],
+ fc->interintra_mode_cdf[i], NULL);
+ }
+ for (i = 0; i < BLOCK_SIZES_ALL; ++i) {
+ av1_cost_tokens_from_cdf(x->wedge_interintra_cost[i],
+ fc->wedge_interintra_cdf[i], NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {  // Starts at BLOCK_8X8; smaller indices are not filled here.
+ av1_cost_tokens_from_cdf(x->motion_mode_cost[i], fc->motion_mode_cdf[i],
+ NULL);
+ }
+ for (i = BLOCK_8X8; i < BLOCK_SIZES_ALL; i++) {  // Same range; these costs come from the obmc CDFs.
+ av1_cost_tokens_from_cdf(x->motion_mode_cost1[i], fc->obmc_cdf[i], NULL);
+ }
+ for (i = 0; i < COMP_INDEX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_idx_cost[i], fc->compound_index_cdf[i],
+ NULL);
+ }
+ for (i = 0; i < COMP_GROUP_IDX_CONTEXTS; ++i) {
+ av1_cost_tokens_from_cdf(x->comp_group_idx_cost[i],
+ fc->comp_group_idx_cdf[i], NULL);
+ }
+ }
+}
+
+// Values are now correlated to quantizer.
+static int sad_per_bit16lut_8[QINDEX_RANGE];
+static int sad_per_bit4lut_8[QINDEX_RANGE];
+static int sad_per_bit16lut_10[QINDEX_RANGE];
+static int sad_per_bit4lut_10[QINDEX_RANGE];
+static int sad_per_bit16lut_12[QINDEX_RANGE];
+static int sad_per_bit4lut_12[QINDEX_RANGE];
+
+static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
+ aom_bit_depth_t bit_depth) {  // Fills bit16lut[0..range) and bit4lut[0..range) for the given bit depth.
+ int i;
+ // Initialize the sad lut tables using a formulaic calculation for now.
+ // This is to make it easier to resolve the impact of experimental changes
+ // to the quantizer tables.
+ for (i = 0; i < range; i++) {
+ const double q = av1_convert_qindex_to_q(i, bit_depth);  // The loop index is used as the qindex.
+ bit16lut[i] = (int)(0.0418 * q + 2.4107);  // Linear in q, truncated by the (int) cast.
+ bit4lut[i] = (int)(0.063 * q + 2.742);
+ }
+}
+
+void av1_init_me_luts(void) {  // Builds the six file-static SAD-per-bit tables (8-, 10- and 12-bit).
+ init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
+ AOM_BITS_8);
+ init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
+ AOM_BITS_10);
+ init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
+ AOM_BITS_12);
+}
+
+static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
+ 8, 8, 4, 4, 2, 2, 1, 0 };
+static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
+ 128, 144, 128, 128, 144,
+ // TODO(zoeliu): To adjust further following factor values.
+ 128, 128, 128,
+ // TODO(weitinglin): We should investigate if the values should be the same
+ // as the value used by OVERLAY frame
+ 144, // INTNL_OVERLAY_UPDATE
+ 128 // INTNL_ARF_UPDATE
+};
+
+int av1_compute_rd_mult(const AV1_COMP *cpi, int qindex) {  // Returns the RD multiplier for qindex (at least 1), or -1 for an unknown bit depth.
+ const int64_t q =
+ av1_dc_quant_Q3(qindex, 0, cpi->common.seq_params.bit_depth);
+ int64_t rdmult = 0;
+ switch (cpi->common.seq_params.bit_depth) {
+ case AOM_BITS_8: rdmult = 88 * q * q / 24; break;
+ case AOM_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;  // Same formula, rounded down by 2^4.
+ case AOM_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;  // Same formula, rounded down by 2^8.
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {  // Second pass, non-key frames: apply frame-type and boost scaling.
+ const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+ const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
+ const int boost_index = AOMMIN(15, (cpi->rc.gfu_boost / 100));  // Capped at the last index of the 16-entry rd_boost_factor table.
+
+ rdmult = (rdmult * rd_frame_type_factor[frame_type]) >> 7;  // Table factors are in units of 1/128.
+ rdmult += ((rdmult * rd_boost_factor[boost_index]) >> 7);
+ }
+ if (rdmult < 1) rdmult = 1;  // Never return a multiplier below 1.
+ return (int)rdmult;
+}
+
+static int compute_rd_thresh_factor(int qindex, aom_bit_depth_t bit_depth) {  // Returns max(8, (int)(5.12 * q^RD_THRESH_POW)), or -1 for an unknown bit depth.
+ double q;
+ switch (bit_depth) {
+ case AOM_BITS_8: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_8) / 4.0; break;  // The divisor grows 4x per bit-depth step: 4, 16, 64.
+ case AOM_BITS_10: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_10) / 16.0; break;
+ case AOM_BITS_12: q = av1_dc_quant_Q3(qindex, 0, AOM_BITS_12) / 64.0; break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+ return -1;
+ }
+ // TODO(debargha): Adjust the function below.
+ return AOMMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
+}
+
+void av1_initialize_me_consts(const AV1_COMP *cpi, MACROBLOCK *x, int qindex) {  // Loads x->sadperbit16 and x->sadperbit4 from the tables for the sequence bit depth.
+ switch (cpi->common.seq_params.bit_depth) {  // NOTE(review): qindex indexes the tables unchecked; presumably callers keep it below QINDEX_RANGE.
+ case AOM_BITS_8:
+ x->sadperbit16 = sad_per_bit16lut_8[qindex];
+ x->sadperbit4 = sad_per_bit4lut_8[qindex];
+ break;
+ case AOM_BITS_10:
+ x->sadperbit16 = sad_per_bit16lut_10[qindex];
+ x->sadperbit4 = sad_per_bit4lut_10[qindex];
+ break;
+ case AOM_BITS_12:
+ x->sadperbit16 = sad_per_bit16lut_12[qindex];
+ x->sadperbit4 = sad_per_bit4lut_12[qindex];
+ break;
+ default:
+ assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");  // x is left unmodified on this path.
+ }
+}
+
+static void set_block_thresholds(const AV1_COMMON *cm, RD_OPT *rd) {  // Fills rd->threshes[segment][bsize][mode] from rd->thresh_mult and each segment's qindex.
+ int i, bsize, segment_id;
+
+ for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
+ const int qindex =
+ clamp(av1_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
+ cm->y_dc_delta_q,
+ 0, MAXQ);  // Segment qindex plus the luma DC delta, clamped to [0, MAXQ].
+ const int q = compute_rd_thresh_factor(qindex, cm->seq_params.bit_depth);
+
+ for (bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+ // Threshold here seems unnecessarily harsh but fine given actual
+ // range of values used for cpi->sf.thresh_mult[].
+ const int t = q * rd_thresh_block_size_factor[bsize];
+ const int thresh_max = INT_MAX / t;  // Multipliers at or above this are saturated to INT_MAX below.
+
+ for (i = 0; i < MAX_MODES; ++i)
+ rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
+ ? rd->thresh_mult[i] * t / 4
+ : INT_MAX;
+ }
+ }
+}
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx) {  // Points x->mvcost and x->nmvjointcost at x's own cost tables.
+ (void)ref;  // ref and ref_mv_idx are unused.
+ (void)ref_mv_idx;
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+}
+
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+ const int num_planes) {  // Fills x->eob_costs and x->coeff_costs from the coefficient CDFs in fc.
+ const int nplanes = AOMMIN(num_planes, PLANE_TYPES);  // Never iterates more than PLANE_TYPES planes.
+ for (int eob_multi_size = 0; eob_multi_size < 7; ++eob_multi_size) {  // One iteration per eob_flag_cdf16..eob_flag_cdf1024 table.
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_EOB_COST *pcost = &x->eob_costs[eob_multi_size][plane];
+
+ for (int ctx = 0; ctx < 2; ++ctx) {
+ aom_cdf_prob *pcdf;
+ switch (eob_multi_size) {
+ case 0: pcdf = fc->eob_flag_cdf16[plane][ctx]; break;
+ case 1: pcdf = fc->eob_flag_cdf32[plane][ctx]; break;
+ case 2: pcdf = fc->eob_flag_cdf64[plane][ctx]; break;
+ case 3: pcdf = fc->eob_flag_cdf128[plane][ctx]; break;
+ case 4: pcdf = fc->eob_flag_cdf256[plane][ctx]; break;
+ case 5: pcdf = fc->eob_flag_cdf512[plane][ctx]; break;
+ case 6:
+ default: pcdf = fc->eob_flag_cdf1024[plane][ctx]; break;
+ }
+ av1_cost_tokens_from_cdf(pcost->eob_cost[ctx], pcdf, NULL);
+ }
+ }
+ }
+ for (int tx_size = 0; tx_size < TX_SIZES; ++tx_size) {
+ for (int plane = 0; plane < nplanes; ++plane) {
+ LV_MAP_COEFF_COST *pcost = &x->coeff_costs[tx_size][plane];
+
+ for (int ctx = 0; ctx < TXB_SKIP_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->txb_skip_cost[ctx],
+ fc->txb_skip_cdf[tx_size][ctx], NULL);  // txb_skip_cdf is not indexed by plane.
+
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS_EOB; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_eob_cost[ctx],
+ fc->coeff_base_eob_cdf[tx_size][plane][ctx],
+ NULL);
+ for (int ctx = 0; ctx < SIG_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->base_cost[ctx],
+ fc->coeff_base_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < EOB_COEF_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->eob_extra_cost[ctx],
+ fc->eob_extra_cdf[tx_size][plane][ctx], NULL);
+
+ for (int ctx = 0; ctx < DC_SIGN_CONTEXTS; ++ctx)
+ av1_cost_tokens_from_cdf(pcost->dc_sign_cost[ctx],
+ fc->dc_sign_cdf[plane][ctx], NULL);  // dc_sign_cdf is not indexed by tx_size.
+
+ for (int ctx = 0; ctx < LEVEL_CONTEXTS; ++ctx) {
+ int br_rate[BR_CDF_SIZE];
+ int prev_cost = 0;  // Accumulated cost of the last br symbol for all earlier groups.
+ int i, j;
+ av1_cost_tokens_from_cdf(br_rate, fc->coeff_br_cdf[tx_size][plane][ctx],
+ NULL);
+ // printf("br_rate: ");
+ // for(j = 0; j < BR_CDF_SIZE; j++)
+ // printf("%4d ", br_rate[j]);
+ // printf("\n");
+ for (i = 0; i < COEFF_BASE_RANGE; i += BR_CDF_SIZE - 1) {  // lps_cost is built in groups of BR_CDF_SIZE - 1 entries.
+ for (j = 0; j < BR_CDF_SIZE - 1; j++) {
+ pcost->lps_cost[ctx][i + j] = prev_cost + br_rate[j];
+ }
+ prev_cost += br_rate[j];  // j == BR_CDF_SIZE - 1 here: add the cost of the last symbol.
+ }
+ pcost->lps_cost[ctx][i] = prev_cost;  // Final entry, at the index where the group loop stopped.
+ // printf("lps_cost: %d %d %2d : ", tx_size, plane, ctx);
+ // for (i = 0; i <= COEFF_BASE_RANGE; i++)
+ // printf("%5d ", pcost->lps_cost[ctx][i]);
+ // printf("\n");
+ }
+ }
+ }
+}
+
+void av1_initialize_rd_consts(AV1_COMP *cpi) {  // Sets the RD multiplier, error-per-bit, block thresholds, MV/DV cost tables and global-motion type costs.
+ AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->td.mb;
+ RD_OPT *const rd = &cpi->rd;
+
+ aom_clear_system_state();
+
+ rd->RDMULT = av1_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
+
+ set_error_per_bit(x, rd->RDMULT);
+
+ set_block_thresholds(cm, rd);
+
+ if (cm->cur_frame_force_integer_mv) {  // Integer-MV frames build the table with MV_SUBPEL_NONE.
+ av1_build_nmv_cost_table(x->nmv_vec_cost, x->nmvcost, &cm->fc->nmvc,
+ MV_SUBPEL_NONE);
+ } else {
+ av1_build_nmv_cost_table(
+ x->nmv_vec_cost,
+ cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
+ cm->allow_high_precision_mv);  // Both the table and the precision argument follow allow_high_precision_mv.
+ }
+
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+
+ if (frame_is_intra_only(cm) && cm->allow_screen_content_tools &&
+ cpi->oxcf.pass != 1) {  // DV costs: intra-only frames with screen content tools, skipped when pass == 1.
+ int *dvcost[2] = { &cpi->dv_cost[0][MV_MAX], &cpi->dv_cost[1][MV_MAX] };  // Offset by MV_MAX, presumably so negative components can index the table - TODO confirm.
+ av1_build_nmv_cost_table(cpi->dv_joint_cost, dvcost, &cm->fc->ndvc,
+ MV_SUBPEL_NONE);
+ }
+
+ if (cpi->oxcf.pass != 1) {  // Global-motion type costs are skipped when pass == 1.
+ for (int i = 0; i < TRANS_TYPES; ++i)
+ // IDENTITY: 1 bit
+ // TRANSLATION: 3 bits
+ // ROTZOOM: 2 bits
+ // AFFINE: 3 bits
+ cpi->gmtype_cost[i] = (1 + (i > 0 ? (i == ROTZOOM ? 1 : 2) : 0))
+ << AV1_PROB_COST_SHIFT;  // Bit counts above, scaled by 1 << AV1_PROB_COST_SHIFT.
+ }
+}
+
+static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {  // Writes table-interpolated normalized rate (*r_q10) and distortion (*d_q10) for xsq_q10.
+ // NOTE: The tables below must be of the same size.
+
+ // The functions described below are sampled at the four most significant
+ // bits of x^2 + 8 / 256.
+
+ // Normalized rate:
+ // This table models the rate for a Laplacian source with given variance
+ // when quantized with a uniform quantizer with given stepsize. The
+ // closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
+ // and H(x) is the binary entropy function.
+ static const int rate_tab_q10[] = {
+ 65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142,
+ 4044, 3958, 3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186,
+ 3133, 3037, 2952, 2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353,
+ 2290, 2232, 2179, 2130, 2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651,
+ 1608, 1530, 1460, 1398, 1342, 1290, 1243, 1199, 1159, 1086, 1021, 963,
+ 911, 864, 821, 781, 745, 680, 623, 574, 530, 490, 455, 424,
+ 395, 345, 304, 269, 239, 213, 190, 171, 154, 126, 104, 87,
+ 73, 61, 52, 44, 38, 28, 21, 16, 12, 10, 8, 6,
+ 5, 3, 2, 1, 1, 1, 0, 0,
+ };
+ // Normalized distortion:
+ // This table models the normalized distortion for a Laplacian source
+ // with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance).
+ // Note the actual distortion is Dn * variance.
+ static const int dist_tab_q10[] = {
+ 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5,
+ 5, 6, 7, 7, 8, 9, 11, 12, 13, 15, 16, 17,
+ 18, 21, 24, 26, 29, 31, 34, 36, 39, 44, 49, 54,
+ 59, 64, 69, 73, 78, 88, 97, 106, 115, 124, 133, 142,
+ 151, 167, 184, 200, 215, 231, 245, 260, 274, 301, 327, 351,
+ 375, 397, 418, 439, 458, 495, 528, 559, 587, 613, 637, 659,
+ 680, 717, 749, 777, 801, 823, 842, 859, 874, 899, 919, 936,
+ 949, 960, 969, 977, 983, 994, 1001, 1006, 1010, 1013, 1015, 1017,
+ 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
+ };
+ static const int xsq_iq_q10[] = {
+ 0, 4, 8, 12, 16, 20, 24, 28, 32,
+ 40, 48, 56, 64, 72, 80, 88, 96, 112,
+ 128, 144, 160, 176, 192, 208, 224, 256, 288,
+ 320, 352, 384, 416, 448, 480, 544, 608, 672,
+ 736, 800, 864, 928, 992, 1120, 1248, 1376, 1504,
+ 1632, 1760, 1888, 2016, 2272, 2528, 2784, 3040, 3296,
+ 3552, 3808, 4064, 4576, 5088, 5600, 6112, 6624, 7136,
+ 7648, 8160, 9184, 10208, 11232, 12256, 13280, 14304, 15328,
+ 16352, 18400, 20448, 22496, 24544, 26592, 28640, 30688, 32736,
+ 36832, 40928, 45024, 49120, 53216, 57312, 61408, 65504, 73696,
+ 81888, 90080, 98272, 106464, 114656, 122848, 131040, 147424, 163808,
+ 180192, 196576, 212960, 229344, 245728,
+ };
+ const int tmp = (xsq_q10 >> 2) + 8;
+ const int k = get_msb(tmp) - 3;  // tmp >= 8 for non-negative input, so k >= 0 assuming get_msb() returns the index of the highest set bit.
+ const int xq = (k << 3) + ((tmp >> k) & 0x7);  // Table index: 8 entries per value of k, selected by the 3 bits of tmp starting at bit k.
+ const int one_q10 = 1 << 10;
+ const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);  // Q10 interpolation weight toward entry xq + 1.
+ const int b_q10 = one_q10 - a_q10;  // Complementary weight for entry xq.
+ *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
+ *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
+}
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n_log2,
+ unsigned int qstep, int *rate,
+ int64_t *dist) {  // Writes the modeled rate and distortion to *rate and *dist.
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expressions are in:
+ // Hang and Chen, "Source Model for transform video coder and its
+ // application - Part I: Fundamental Theory", IEEE Trans. Circ.
+ // Sys. for Video Tech., April 1997.
+ if (var == 0) {  // Zero variance: report zero rate and distortion; this also avoids the division by var below.
+ *rate = 0;
+ *dist = 0;
+ } else {
+ int d_q10, r_q10;
+ static const uint32_t MAX_XSQ_Q10 = 245727;  // One below the last xsq_iq_q10 entry in model_rd_norm(), which keeps its lookup at xq + 1 in range.
+ const uint64_t xsq_q10_64 =
+ (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;  // Rounded (qstep^2 << (n_log2 + 10)) / var.
+ const int xsq_q10 = (int)AOMMIN(xsq_q10_64, MAX_XSQ_Q10);
+ model_rd_norm(xsq_q10, &r_q10, &d_q10);
+ *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - AV1_PROB_COST_SHIFT);
+ *dist = (var * (int64_t)d_q10 + 512) >> 10;  // d_q10 is Q10; adding 512 rounds to nearest.
+ }
+}
+
+static double interp_cubic(const double *p, double x) {  // Catmull-Rom cubic through p[0..3]; x is the offset from p[1] towards p[2].
+ return p[1] + 0.5 * x *  // Evaluates to p[1] at x == 0 and to p[2] at x == 1.
+ (p[2] - p[0] +
+ x * (2.0 * p[0] - 5.0 * p[1] + 4.0 * p[2] - p[3] +
+ x * (3.0 * (p[1] - p[2]) + p[3] - p[0])));
+}
+
+static double interp_bicubic(const double *p, int p_stride, double x,
+ double y) {  // Bicubic interpolation over a 4x4 window whose rows start at p, p + p_stride, p + 2 * p_stride, p + 3 * p_stride.
+ double q[4];  // One x-interpolated value per window row.
+ q[0] = interp_cubic(p, x);
+ q[1] = interp_cubic(p + p_stride, x);
+ q[2] = interp_cubic(p + 2 * p_stride, x);
+ q[3] = interp_cubic(p + 3 * p_stride, x);
+ return interp_cubic(q, y);  // Interpolate the four row results along y.
+}
+
+static const double interp_rgrid_surf[65 * 18] = {  // Rate grid, 65 rows x 18 columns stored row-major; av1_model_rd_surffit() reads it with a row stride of 18.
+ 0.104019, 0.245714, 0.293686, 0.358635, 0.382167, 0.412446,
+ 0.419955, 0.421388, 0.426672, 0.427990, 0.428531, 0.456868,
+ 0.569880, 0.638822, 1.016319, 2.143453, 3.565229, 4.720880,
+ 0.124618, 0.294211, 0.352023, 0.429991, 0.458206, 0.494510,
+ 0.503513, 0.505232, 0.511566, 0.513234, 0.519365, 0.570225,
+ 0.697373, 0.840624, 1.462198, 3.289054, 6.256517, 6.852788,
+ 0.118630, 0.269669, 0.346620, 0.430999, 0.459385, 0.495783,
+ 0.504808, 0.506532, 0.512884, 0.514988, 0.543437, 0.662772,
+ 0.795876, 1.313596, 2.403841, 4.163098, 7.440589, 8.616275,
+ 0.093329, 0.168205, 0.321320, 0.430607, 0.459385, 0.495783,
+ 0.504813, 0.506548, 0.512975, 0.520662, 0.571659, 0.701841,
+ 1.010727, 2.138851, 3.460626, 6.317955, 10.098127, 14.418553,
+ 0.087021, 0.142905, 0.315011, 0.430509, 0.459385, 0.495787,
+ 0.505075, 0.507599, 0.513584, 0.543182, 0.669941, 0.825620,
+ 1.362800, 2.572187, 4.205047, 7.498399, 12.303118, 16.641735,
+ 0.086923, 0.142513, 0.314913, 0.430508, 0.459385, 0.495803,
+ 0.506126, 0.511816, 0.514810, 0.549705, 0.725350, 1.127334,
+ 2.168597, 3.463686, 6.318605, 10.162284, 18.556041, 19.847042,
+ 0.086923, 0.142513, 0.314913, 0.430506, 0.459376, 0.495805,
+ 0.506388, 0.512954, 0.520772, 0.580215, 0.810474, 1.391548,
+ 2.579442, 4.205160, 7.498399, 12.381597, 21.703618, 24.015457,
+ 0.086923, 0.142513, 0.314911, 0.430353, 0.458765, 0.495652,
+ 0.506391, 0.513406, 0.544098, 0.702950, 1.121860, 2.168961,
+ 3.463798, 6.318607, 10.162284, 18.685361, 28.188192, 37.638872,
+ 0.086923, 0.142513, 0.314901, 0.429742, 0.456313, 0.495045,
+ 0.506484, 0.519195, 0.580104, 0.810126, 1.391462, 2.579441,
+ 4.205160, 7.498399, 12.381597, 21.848607, 33.367199, 42.623190,
+ 0.086923, 0.142513, 0.314899, 0.429589, 0.455706, 0.495155,
+ 0.507882, 0.542426, 0.702360, 1.119921, 2.168478, 3.463791,
+ 6.318607, 10.162284, 18.685361, 28.345760, 47.802028, 49.163533,
+ 0.086924, 0.142548, 0.315086, 0.429842, 0.455870, 0.496336,
+ 0.512412, 0.556953, 0.773373, 1.266396, 2.548277, 4.204676,
+ 7.498399, 12.381597, 21.848607, 33.548250, 54.301011, 56.262859,
+ 0.087067, 0.144957, 0.327436, 0.446616, 0.466362, 0.505706,
+ 0.522077, 0.610747, 0.972543, 1.666916, 3.338812, 6.316669,
+ 10.162284, 18.685361, 28.345760, 48.065311, 66.145302, 78.396020,
+ 0.094295, 0.164235, 0.393722, 0.534219, 0.530922, 0.579308,
+ 0.603889, 0.760870, 1.229961, 2.423214, 4.173513, 7.497916,
+ 12.381597, 21.848607, 33.548250, 54.589585, 74.875848, 86.468182,
+ 0.124096, 0.213005, 0.497188, 0.665176, 0.685973, 0.800200,
+ 0.911394, 1.077971, 1.677290, 3.332129, 6.314960, 10.162257,
+ 18.685361, 28.345760, 48.065311, 66.453506, 98.275189, 96.862588,
+ 0.140999, 0.270140, 0.658212, 0.867661, 0.970183, 1.149516,
+ 1.480599, 1.664833, 2.421893, 3.857981, 7.418830, 12.380371,
+ 21.848607, 33.548250, 54.589585, 75.188867, 106.657971, 99.762997,
+ 0.178353, 0.398001, 0.988462, 1.241473, 1.340967, 1.713568,
+ 2.335030, 2.701432, 3.348532, 5.077158, 9.829903, 18.676528,
+ 28.345700, 48.065311, 66.453506, 98.588283, 117.057193, 101.130722,
+ 0.281079, 0.548300, 1.395825, 1.780770, 2.000508, 2.702964,
+ 3.638454, 4.573843, 5.051641, 7.079129, 11.293332, 21.594861,
+ 33.544335, 54.589585, 75.188867, 106.971065, 119.957601, 101.466632,
+ 0.476762, 0.842189, 2.019678, 2.723895, 3.188467, 4.011610,
+ 5.545111, 7.508984, 8.176339, 9.774504, 14.720782, 27.334416,
+ 48.049609, 66.453506, 98.588283, 117.370357, 121.329855, 101.509242,
+ 0.993999, 1.520111, 3.013605, 4.203530, 4.982992, 6.074944,
+ 8.583581, 11.818375, 14.192544, 14.937517, 21.258160, 33.305953,
+ 54.585735, 75.188867, 106.971135, 120.279824, 121.976055, 102.690130,
+ 1.776487, 2.613655, 4.356487, 6.161726, 7.622196, 9.464193,
+ 13.077233, 18.051656, 23.221051, 24.080068, 30.085038, 48.345269,
+ 66.457698, 98.588353, 117.379415, 121.976128, 124.356210, 107.713202,
+ 3.191085, 4.495201, 5.686033, 8.365566, 11.275339, 14.706437,
+ 20.300969, 28.152237, 35.688355, 39.341382, 41.030743, 55.752262,
+ 75.211764, 106.980285, 120.608403, 124.680746, 130.222528, 112.260098,
+ 6.136611, 7.305215, 7.272532, 10.646713, 15.630815, 22.383168,
+ 31.349131, 42.419822, 52.301680, 58.983454, 58.915405, 69.161305,
+ 98.992460, 117.713855, 124.344836, 130.623638, 138.442401, 127.846670,
+ 11.707980, 13.490761, 11.640845, 14.176132, 22.131124, 33.776462,
+ 47.365711, 61.603834, 75.281056, 83.463985, 85.510533, 86.026513,
+ 108.787480, 123.031136, 130.607284, 138.954406, 160.867784, 158.958882,
+ 27.062874, 32.195139, 24.147297, 22.114632, 35.580506, 52.551674,
+ 71.652956, 88.606776, 102.107193, 110.703186, 114.398733, 111.118539,
+ 121.503578, 132.455924, 139.490806, 161.412674, 193.563210, 172.203945,
+ 35.625692, 47.953028, 42.639820, 42.276254, 58.815664, 84.977282,
+ 110.656412, 126.168446, 134.658126, 140.604482, 144.006012, 141.702382,
+ 140.125323, 153.122630, 164.748041, 194.156197, 206.854650, 174.013079,
+ 49.516447, 65.335381, 71.738306, 81.872819, 98.400740, 136.840488,
+ 163.775802, 169.440078, 172.747876, 171.222919, 171.679604, 172.173550,
+ 168.200129, 187.617133, 199.683394, 207.768200, 210.062520, 175.478356,
+ 60.341673, 92.487135, 119.907299, 136.068010, 144.778950, 189.443534,
+ 220.120077, 219.641635, 214.616503, 205.894657, 198.453924, 200.013069,
+ 195.938103, 206.118661, 210.447375, 212.061379, 216.078218, 181.162805,
+ 78.422159, 112.242899, 158.416312, 181.404320, 193.188690, 229.296967,
+ 270.461799, 275.168977, 256.511701, 244.706786, 231.344608, 226.065087,
+ 222.248618, 218.662324, 217.966722, 218.248574, 218.818588, 182.740573,
+ 88.713664, 123.594164, 172.928179, 213.781414, 245.800351, 252.063414,
+ 313.283141, 331.703831, 305.866639, 285.177142, 269.759635, 251.988739,
+ 245.998388, 232.688076, 230.588702, 230.882657, 230.319053, 192.120741,
+ 102.540561, 152.905927, 189.137131, 241.806756, 273.868497, 284.258017,
+ 339.689853, 373.561104, 362.657463, 326.291984, 311.922687, 290.460189,
+ 276.774381, 273.012072, 277.751792, 279.123748, 278.820447, 233.813798,
+ 132.983118, 176.307242, 197.415684, 243.307787, 280.893995, 332.922370,
+ 340.329043, 404.530166, 419.475405, 375.775209, 351.300889, 340.042759,
+ 315.683832, 306.123530, 306.359319, 306.733063, 307.609556, 261.647847,
+ 149.579109, 185.925581, 207.937033, 245.159084, 301.890957, 350.040480,
+ 352.250771, 418.742329, 458.112686, 430.125208, 386.460441, 380.346839,
+ 354.679150, 337.305620, 334.504124, 335.889932, 341.060725, 286.898578,
+ 153.576812, 202.105624, 219.366967, 248.524506, 314.255692, 350.607526,
+ 390.567688, 408.629209, 488.000213, 480.563823, 432.461799, 410.412624,
+ 398.607371, 400.188740, 402.780916, 408.853470, 430.449735, 363.777088,
+ 161.353129, 214.848904, 231.549852, 258.536466, 313.163177, 368.140577,
+ 412.136393, 413.409032, 499.838438, 519.571063, 485.833867, 444.562715,
+ 435.738129, 442.358549, 450.166531, 453.208524, 458.424358, 385.823139,
+ 175.109034, 227.608058, 250.069563, 286.101747, 312.256740, 378.421485,
+ 413.344147, 435.058646, 476.960941, 542.448886, 530.189154, 495.408402,
+ 475.326752, 465.017144, 464.694045, 465.144689, 466.905382, 398.669138,
+ 184.750180, 240.766694, 283.240772, 305.480150, 322.409001, 374.526162,
+ 427.141326, 452.840323, 472.604139, 545.366105, 567.676694, 541.666203,
+ 509.591873, 492.044219, 492.778569, 493.765684, 493.235693, 413.684325,
+ 194.728357, 254.928927, 289.991157, 300.193195, 324.194589, 371.563147,
+ 439.226438, 468.295088, 495.654854, 533.506353, 587.476353, 578.298989,
+ 548.041942, 527.393885, 538.965146, 545.070442, 544.295454, 454.012211,
+ 205.195287, 283.135677, 297.921431, 319.295927, 355.621830, 392.466463,
+ 446.696167, 485.053519, 516.426615, 532.264584, 588.481600, 615.906737,
+ 589.319634, 555.754316, 558.389367, 569.094521, 569.779764, 475.384946,
+ 218.552054, 298.511016, 319.188338, 351.781666, 372.789510, 412.827434,
+ 464.569387, 506.270203, 533.049810, 553.347364, 580.644599, 632.759854,
+ 622.235843, 569.960552, 580.799340, 586.553714, 579.488366, 491.826482,
+ 244.803348, 299.790203, 324.187975, 363.280782, 403.710443, 441.724083,
+ 492.732682, 534.722691, 552.193622, 575.112647, 586.097705, 635.224970,
+ 644.642944, 606.017786, 640.321218, 642.316989, 616.397020, 548.300111,
+ 256.957358, 318.638991, 355.063346, 389.889307, 433.607315, 468.209001,
+ 515.178157, 573.556591, 578.113115, 587.246475, 601.762801, 638.454644,
+ 656.574853, 641.184609, 676.908189, 684.198162, 678.387412, 574.805864,
+ 251.211502, 323.448532, 364.227424, 411.792704, 462.226488, 503.572288,
+ 549.299249, 599.124071, 601.227977, 597.118176, 613.247552, 633.278532,
+ 658.074755, 664.930719, 685.731531, 693.632845, 693.076350, 578.326477,
+ 267.695377, 354.273736, 389.976833, 438.518178, 493.332686, 544.343027,
+ 588.895829, 620.206193, 628.327410, 606.067827, 620.998532, 657.985256,
+ 683.936059, 691.345257, 693.894723, 695.175306, 693.618786, 578.517148,
+ 274.290725, 363.465288, 411.808596, 463.369805, 515.310226, 581.009306,
+ 613.070738, 636.638714, 647.333929, 629.867603, 644.646319, 687.796202,
+ 702.859596, 713.495479, 704.068069, 704.991807, 704.188594, 587.283658,
+ 302.538449, 389.174737, 438.518422, 493.398902, 547.662399, 601.981814,
+ 624.773046, 641.629484, 644.699451, 645.848784, 668.033340, 703.643523,
+ 707.422408, 717.329600, 726.298973, 744.127507, 745.365167, 617.954068,
+ 310.328188, 410.984766, 463.369805, 515.315010, 581.309832, 613.787792,
+ 634.988538, 654.145284, 662.632978, 668.413496, 706.494057, 750.545471,
+ 730.724808, 730.002100, 743.625262, 750.801609, 745.308457, 606.505800,
+ 329.948756, 437.600191, 493.398902, 547.661910, 601.917884, 622.557745,
+ 633.244395, 644.055898, 648.224221, 665.062911, 763.555733, 812.391078,
+ 769.063582, 744.865168, 727.579796, 724.950408, 722.179707, 598.564510,
+ 350.848328, 462.437458, 515.315010, 581.309823, 613.779123, 634.465309,
+ 652.056257, 662.179143, 671.466297, 726.881256, 819.824030, 880.232789,
+ 810.371672, 754.246481, 725.053473, 724.253390, 723.503395, 603.394909,
+ 373.704088, 492.408266, 547.661910, 601.917884, 622.557620, 633.236320,
+ 644.023513, 648.232514, 666.381639, 785.498283, 929.441612, 999.772800,
+ 890.339033, 775.852504, 731.840181, 726.905100, 725.251844, 604.899901,
+ 394.473422, 514.261306, 581.309823, 613.779123, 634.465309, 652.056257,
+ 662.179143, 671.466557, 727.134512, 835.764144, 981.747089, 1018.462934,
+ 939.686967, 811.276731, 739.398459, 727.365647, 725.285425, 604.923525,
+ 419.976505, 546.538939, 601.917884, 622.557620, 633.236320, 644.023513,
+ 648.232514, 666.381639, 785.545191, 932.841398, 1036.609617, 1026.945092,
+ 963.822765, 840.827315, 755.532423, 730.241865, 725.366847, 604.924155,
+ 437.281359, 580.116337, 613.779123, 634.465309, 652.056257, 662.179143,
+ 671.466557, 727.134512, 835.764859, 981.996194, 1031.896881, 1002.544732,
+ 881.157178, 828.151494, 799.340975, 751.314325, 728.316587, 605.005504,
+ 464.713920, 600.649281, 622.557620, 633.236320, 644.023513, 648.232514,
+ 666.381639, 785.545191, 932.841398, 1036.735329, 1035.037004, 995.478339,
+ 858.093733, 823.471976, 819.881754, 798.749289, 749.440463, 607.955244,
+ 495.880237, 612.473139, 634.465309, 652.056257, 662.179143, 671.466557,
+ 727.134512, 835.764859, 981.996194, 1032.339788, 1031.105117, 995.303259,
+ 857.733663, 823.435877, 822.822791, 819.873050, 796.882480, 629.038445,
+ 510.391280, 621.158273, 633.236320, 644.023513, 648.232514, 666.381639,
+ 785.545191, 932.841398, 1036.735329, 1035.566013, 1029.599350, 994.926093,
+ 857.645648, 823.435143, 822.904139, 822.822791, 817.965681, 673.856962,
+ 514.588176, 632.947715, 652.056257, 662.179143, 671.466557, 727.134512,
+ 835.764859, 981.996194, 1032.339788, 1031.547475, 1023.835377, 972.158629,
+ 851.968626, 823.347128, 822.904770, 822.904139, 820.752301, 684.418900,
+ 520.013294, 631.668183, 644.023513, 648.232514, 666.381639, 785.545191,
+ 932.841398, 1036.735329, 1035.567378, 1029.776746, 1001.044108, 880.853721,
+ 829.201546, 822.994150, 822.904770, 822.904770, 820.792975, 684.582020,
+ 531.253628, 650.479606, 662.179143, 671.466557, 727.134512, 835.764859,
+ 981.996194, 1032.339788, 1031.636855, 1029.601779, 995.366703, 858.086641,
+ 823.524524, 822.906135, 822.904770, 822.904770, 820.792975, 684.582020,
+ 528.531744, 642.424501, 648.232514, 666.381639, 785.545191, 932.841398,
+ 1036.735329, 1035.567378, 1030.219103, 1029.576226, 995.278687, 857.733663,
+ 823.436508, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 545.401164, 660.550678, 671.508859, 727.304161, 835.807162, 981.996850,
+ 1032.339788, 1031.636855, 1030.130788, 1029.487827, 994.925709, 857.645648,
+ 823.435143, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 537.684760, 646.650947, 669.110131, 796.487512, 935.569890, 1036.777631,
+ 1035.567378, 1030.219103, 1030.018584, 1023.810805, 972.158629, 851.968626,
+ 823.347128, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 552.408370, 670.001885, 738.246482, 879.690154, 992.939171, 1032.509436,
+ 1031.636855, 1030.132153, 1029.665223, 1001.043724, 880.853721, 829.201546,
+ 822.994150, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 539.835902, 667.496388, 799.216004, 946.512211, 1039.506123, 1035.609680,
+ 1030.219103, 1030.107964, 1029.577207, 995.366703, 858.086641, 823.524524,
+ 822.906135, 822.904770, 822.904770, 822.904770, 820.792975, 684.582020,
+ 558.362529, 734.277451, 877.197218, 990.478243, 1029.908393, 1028.993978,
+ 1027.488620, 1027.464048, 1026.933674, 992.724534, 855.532488, 821.323349,
+ 820.792975, 820.792975, 820.792975, 820.792975, 818.686600, 682.825198,
+ 453.127195, 649.075095, 780.278390, 867.165890, 862.469711, 857.067460,
+ 856.956321, 856.955937, 856.513579, 827.981461, 713.556496, 685.024378,
+ 684.582020, 684.582020, 684.582020, 684.582020, 682.825198, 569.510056,
+};
+
+static const double interp_dgrid_surf[65 * 18] = {  // Distortion grid with the same 65 x 18 row-major layout as interp_rgrid_surf; read by av1_model_rd_surffit().
+ 10.650434, 12.204694, 12.040917, 11.843008, 11.845578, 12.051535, 12.103583,
+ 12.136780, 12.266709, 12.299107, 12.299673, 12.303120, 12.316337, 12.293431,
+ 12.092165, 11.602421, 11.141559, 8.864495, 12.770003, 14.634889, 14.437149,
+ 14.199413, 14.202487, 14.449423, 14.511827, 14.551629, 14.707410, 14.746265,
+ 14.747610, 14.753705, 14.762194, 14.699395, 14.390525, 13.690970, 12.874168,
+ 10.367121, 12.832328, 14.790730, 14.503765, 14.236403, 14.239028, 14.486600,
+ 14.549164, 14.589069, 14.745250, 14.784258, 14.788320, 14.801930, 14.762798,
+ 14.499088, 14.021544, 13.469684, 12.661560, 10.108384, 12.950520, 15.264726,
+ 14.621957, 14.238236, 14.239028, 14.486601, 14.549264, 14.589469, 14.745361,
+ 14.784949, 14.791572, 14.798652, 14.660251, 14.119394, 13.651131, 12.935657,
+ 12.176082, 9.228999, 12.979992, 15.382918, 14.651428, 14.238693, 14.239028,
+ 14.486701, 14.555710, 14.615321, 14.751849, 14.787700, 14.797104, 14.743189,
+ 14.475057, 13.944406, 13.450468, 12.687876, 11.824993, 8.906683, 12.980449,
+ 15.384750, 14.651885, 14.238700, 14.239028, 14.487102, 14.581562, 14.718998,
+ 14.777721, 14.788445, 14.778661, 14.582790, 14.099785, 13.649637, 12.935359,
+ 12.201859, 10.891931, 8.482221, 12.980449, 15.384750, 14.651886, 14.238801,
+ 14.239434, 14.487303, 14.588010, 14.744860, 14.784773, 14.786094, 14.735647,
+ 14.455704, 13.939591, 13.450393, 12.687876, 11.849334, 10.476658, 8.043672,
+ 12.980449, 15.384750, 14.651987, 14.245320, 14.265579, 14.493824, 14.588211,
+ 14.745312, 14.787263, 14.775934, 14.582036, 14.099475, 13.649563, 12.935358,
+ 12.201859, 10.911285, 9.730570, 6.696921, 12.980449, 15.384750, 14.652393,
+ 14.271466, 14.370434, 14.520069, 14.589027, 14.746028, 14.785482, 14.735605,
+ 14.455693, 13.939590, 13.450393, 12.687876, 11.849334, 10.494514, 9.195398,
+ 6.215460, 12.980449, 15.384750, 14.652494, 14.277985, 14.396679, 14.533035,
+ 14.615021, 14.754825, 14.775610, 14.582796, 14.099664, 13.649565, 12.935358,
+ 12.201859, 10.911285, 9.747361, 7.779960, 5.617541, 12.980448, 15.384731,
+ 14.652415, 14.278078, 14.397578, 14.559053, 14.718657, 14.776398, 14.747044,
+ 14.504690, 13.951810, 13.450583, 12.687876, 11.849334, 10.494514, 9.210817,
+ 7.210003, 5.164575, 12.980446, 15.383448, 14.647073, 14.277541, 14.403813,
+ 14.569546, 14.744956, 14.765103, 14.629073, 14.296161, 13.698573, 12.936118,
+ 12.201859, 10.911285, 9.747361, 7.790897, 6.322998, 3.931551, 12.981550,
+ 15.376916, 14.615597, 14.274820, 14.437479, 14.575942, 14.707492, 14.734111,
+ 14.515975, 14.000806, 13.462803, 12.688066, 11.849334, 10.494514, 9.210817,
+ 7.219566, 5.781392, 3.486081, 12.991899, 15.376201, 14.579444, 14.296898,
+ 14.473361, 14.522910, 14.491600, 14.543267, 14.288580, 13.700311, 12.936579,
+ 12.201867, 10.911285, 9.747361, 7.790897, 6.331506, 4.480348, 2.923138,
+ 13.019848, 15.383477, 14.582260, 14.385262, 14.452673, 14.436019, 14.238174,
+ 14.255993, 13.977481, 13.532342, 12.705591, 11.849605, 10.494514, 9.210817,
+ 7.219566, 5.789642, 4.018194, 2.766222, 13.028558, 15.315782, 14.439141,
+ 14.326286, 14.452429, 14.311731, 14.033235, 13.922587, 13.665868, 13.207897,
+ 12.274375, 10.912967, 9.747371, 7.790897, 6.331506, 4.488594, 3.454993,
+ 2.692682, 12.992752, 15.321471, 14.409573, 14.236340, 14.322969, 14.049072,
+ 13.764823, 13.479242, 13.250105, 12.759133, 12.019174, 10.532951, 9.211409,
+ 7.219566, 5.789642, 4.026440, 3.298077, 2.674624, 12.945493, 15.276596,
+ 14.315745, 14.026198, 14.085774, 13.844563, 13.447576, 12.964935, 12.735525,
+ 12.288592, 11.511693, 9.900227, 7.793270, 6.331506, 4.488594, 3.463236,
+ 3.224318, 2.672433, 12.757570, 15.056661, 14.095011, 13.722362, 13.812624,
+ 13.608480, 13.021206, 12.367627, 11.937931, 11.581049, 10.599552, 9.247860,
+ 7.220151, 5.789642, 4.026437, 3.305882, 3.191260, 2.615317, 12.581293,
+ 14.824658, 13.909074, 13.496158, 13.491402, 13.221550, 12.514140, 11.677229,
+ 10.936895, 10.619912, 9.634779, 7.763570, 6.331082, 4.488590, 3.462798,
+ 3.216460, 3.076315, 2.373499, 12.283499, 14.455760, 13.890593, 13.427587,
+ 13.183783, 12.763833, 11.861006, 10.740618, 9.820756, 9.354945, 8.669862,
+ 7.123268, 5.787860, 4.025994, 3.290000, 3.084410, 2.810905, 2.222916,
+ 12.010893, 14.300919, 13.986624, 13.484026, 13.025385, 12.224281, 11.064265,
+ 9.631040, 8.594396, 8.003736, 7.561587, 6.274418, 4.466637, 3.446574,
+ 3.102467, 2.816989, 2.598688, 1.951541, 11.581477, 13.831132, 13.632027,
+ 13.380414, 12.807880, 11.665651, 10.218236, 8.562237, 7.222614, 6.611808,
+ 6.261676, 5.402793, 3.938544, 3.174375, 2.818166, 2.602758, 2.213911,
+ 1.434763, 11.050735, 12.893449, 12.363152, 12.712829, 12.012961, 10.887854,
+ 9.109699, 7.421701, 5.965603, 5.272129, 4.991435, 4.423000, 3.369988,
+ 2.800371, 2.593901, 2.217431, 1.670917, 1.215265, 10.641194, 11.766277,
+ 10.777082, 10.972917, 10.689298, 9.701545, 7.719947, 6.145654, 4.872442,
+ 4.099600, 3.880934, 3.514159, 2.786474, 2.368963, 2.162376, 1.673670,
+ 1.450770, 1.185424, 10.071964, 11.107701, 9.172361, 8.551313, 8.412080,
+ 7.641397, 6.174246, 4.853916, 3.904549, 3.246810, 2.959903, 2.785066,
+ 2.240001, 1.793166, 1.585520, 1.449824, 1.405368, 1.168856, 9.213182,
+ 9.173278, 7.219231, 6.242951, 5.626013, 5.768007, 4.908666, 3.809589,
+ 3.115109, 2.617899, 2.274793, 2.172960, 1.838597, 1.505915, 1.414333,
+ 1.392666, 1.338173, 1.105611, 7.365015, 7.471370, 5.622346, 4.520127,
+ 3.936272, 4.208822, 3.623024, 2.977794, 2.450003, 2.097261, 1.824090,
+ 1.643270, 1.473525, 1.351388, 1.327504, 1.323865, 1.307894, 1.088234,
+ 6.198210, 6.580712, 4.682511, 3.416952, 2.941929, 2.766637, 2.650686,
+ 2.315439, 1.925838, 1.659784, 1.464419, 1.252806, 1.162722, 1.197518,
+ 1.199875, 1.197365, 1.194040, 0.995797, 5.402507, 5.055466, 3.728724,
+ 2.624359, 2.165810, 1.943189, 1.918190, 1.738078, 1.516328, 1.290520,
+ 1.155793, 1.015962, 0.881900, 0.807203, 0.754242, 0.743378, 0.740288,
+ 0.614158, 3.937867, 3.862507, 2.884664, 2.088147, 1.648496, 1.473584,
+ 1.340123, 1.291769, 1.165381, 1.000224, 0.893316, 0.821333, 0.691363,
+ 0.610501, 0.586766, 0.583762, 0.577840, 0.468733, 3.104660, 3.181078,
+ 2.420208, 1.747442, 1.297956, 1.109835, 0.970385, 0.943229, 0.876923,
+ 0.777584, 0.678183, 0.628623, 0.553745, 0.523430, 0.519490, 0.514394,
+ 0.492259, 0.403172, 2.593833, 2.533720, 2.010452, 1.480944, 1.060302,
+ 0.846383, 0.738703, 0.673144, 0.658010, 0.592449, 0.518236, 0.470335,
+ 0.425088, 0.393168, 0.378116, 0.355846, 0.275469, 0.213128, 2.176988,
+ 2.089575, 1.671284, 1.225008, 0.895382, 0.672008, 0.566241, 0.496746,
+ 0.488005, 0.449874, 0.400899, 0.354002, 0.318150, 0.281533, 0.238545,
+ 0.224159, 0.202399, 0.160681, 1.874679, 1.769165, 1.430124, 1.068727,
+ 0.780272, 0.557801, 0.441643, 0.377256, 0.352957, 0.338452, 0.304965,
+ 0.273172, 0.240052, 0.208724, 0.193431, 0.190845, 0.185025, 0.138166,
+ 1.590226, 1.502830, 1.193127, 0.917885, 0.670432, 0.474546, 0.355420,
+ 0.292305, 0.259035, 0.249937, 0.232079, 0.208943, 0.181936, 0.160038,
+ 0.152257, 0.151235, 0.149583, 0.120747, 1.331730, 1.255907, 1.012871,
+ 0.778422, 0.578977, 0.412432, 0.293155, 0.231824, 0.197187, 0.183921,
+ 0.174876, 0.157252, 0.140263, 0.127050, 0.110244, 0.105041, 0.104323,
+ 0.086944, 1.153994, 1.118771, 0.822355, 0.612321, 0.478249, 0.348222,
+ 0.247408, 0.186141, 0.152714, 0.135445, 0.129810, 0.119994, 0.115619,
+ 0.131626, 0.095612, 0.079343, 0.077502, 0.064550, 0.946317, 0.925894,
+ 0.677969, 0.499906, 0.397101, 0.297931, 0.214467, 0.152333, 0.120731,
+ 0.102686, 0.095062, 0.090361, 0.122319, 0.240194, 0.112687, 0.070690,
+ 0.070461, 0.054194, 0.824155, 0.787241, 0.581856, 0.419228, 0.313167,
+ 0.245582, 0.183500, 0.128101, 0.096577, 0.080267, 0.071022, 0.066851,
+ 0.085754, 0.154163, 0.075884, 0.052401, 0.054270, 0.026656, 0.716310,
+ 0.671378, 0.489580, 0.349569, 0.256155, 0.206343, 0.157853, 0.111950,
+ 0.079271, 0.062518, 0.053441, 0.049660, 0.051400, 0.063778, 0.039993,
+ 0.029133, 0.023382, 0.013725, 0.614125, 0.579096, 0.417126, 0.299465,
+ 0.217849, 0.165515, 0.129040, 0.093127, 0.065612, 0.049543, 0.041429,
+ 0.036850, 0.034416, 0.033989, 0.024216, 0.017377, 0.014833, 0.011987,
+ 0.520407, 0.487239, 0.349473, 0.251741, 0.184897, 0.135813, 0.107098,
+ 0.073607, 0.053938, 0.040531, 0.032931, 0.028876, 0.025759, 0.022168,
+ 0.016739, 0.014638, 0.014333, 0.011947, 0.449954, 0.415124, 0.299452,
+ 0.216942, 0.158874, 0.115334, 0.088821, 0.060105, 0.042610, 0.032566,
+ 0.026903, 0.023123, 0.019913, 0.016835, 0.014306, 0.013625, 0.013535,
+ 0.011284, 0.377618, 0.347773, 0.251741, 0.184839, 0.132857, 0.095439,
+ 0.070462, 0.052244, 0.036078, 0.026025, 0.021518, 0.018487, 0.015361,
+ 0.012905, 0.011470, 0.010569, 0.010283, 0.008297, 0.319953, 0.297976,
+ 0.216942, 0.158842, 0.113280, 0.080426, 0.057367, 0.041987, 0.030135,
+ 0.022295, 0.017901, 0.015121, 0.012224, 0.010035, 0.009353, 0.009108,
+ 0.008695, 0.006139, 0.267864, 0.250502, 0.184839, 0.132851, 0.095039,
+ 0.068220, 0.049135, 0.035315, 0.025144, 0.018237, 0.013857, 0.012094,
+ 0.009715, 0.007743, 0.006937, 0.006446, 0.006243, 0.004929, 0.230449,
+ 0.215895, 0.158842, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015673, 0.012133, 0.010083, 0.007801, 0.006053, 0.005401,
+ 0.003834, 0.003429, 0.002851, 0.193984, 0.183963, 0.132851, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013175, 0.010422,
+ 0.008491, 0.006397, 0.004567, 0.003494, 0.002933, 0.002825, 0.002355,
+ 0.167298, 0.158088, 0.113280, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009257, 0.007051, 0.005543, 0.003905,
+ 0.002984, 0.002825, 0.002814, 0.002347, 0.143228, 0.132220, 0.095039,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008403, 0.006661, 0.005378, 0.003545, 0.002876, 0.002818, 0.002814,
+ 0.002347, 0.122934, 0.112735, 0.080417, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007182, 0.006012, 0.003762,
+ 0.002866, 0.002739, 0.002788, 0.002810, 0.002347, 0.101934, 0.094569,
+ 0.068220, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006797, 0.005845, 0.003333, 0.002703, 0.002695, 0.002723,
+ 0.002781, 0.002343, 0.086702, 0.080014, 0.057174, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006533, 0.005839,
+ 0.003326, 0.002700, 0.002690, 0.002694, 0.002716, 0.002314, 0.073040,
+ 0.067886, 0.049133, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006468, 0.005831, 0.003325, 0.002700, 0.002690,
+ 0.002690, 0.002687, 0.002253, 0.061685, 0.056890, 0.041304, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006542, 0.006360,
+ 0.005416, 0.003221, 0.002698, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.052465, 0.048894, 0.035305, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006472, 0.005943, 0.003748, 0.002805, 0.002692,
+ 0.002690, 0.002690, 0.002683, 0.002238, 0.043838, 0.041101, 0.029959,
+ 0.021866, 0.015669, 0.011955, 0.009258, 0.007190, 0.006543, 0.006465,
+ 0.005839, 0.003333, 0.002702, 0.002690, 0.002690, 0.002690, 0.002683,
+ 0.002238, 0.037824, 0.035133, 0.025140, 0.018150, 0.013174, 0.010394,
+ 0.008405, 0.006807, 0.006480, 0.006464, 0.005838, 0.003326, 0.002700,
+ 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.031865, 0.029815,
+ 0.021866, 0.015668, 0.011955, 0.009258, 0.007190, 0.006543, 0.006475,
+ 0.006462, 0.005831, 0.003325, 0.002700, 0.002690, 0.002690, 0.002690,
+ 0.002683, 0.002238, 0.027150, 0.025016, 0.018128, 0.013083, 0.010371,
+ 0.008405, 0.006807, 0.006480, 0.006472, 0.006359, 0.005416, 0.003221,
+ 0.002698, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238, 0.023094,
+ 0.021760, 0.015577, 0.011590, 0.009167, 0.007188, 0.006543, 0.006475,
+ 0.006466, 0.005943, 0.003748, 0.002805, 0.002692, 0.002690, 0.002690,
+ 0.002690, 0.002683, 0.002238, 0.019269, 0.018038, 0.013060, 0.010280,
+ 0.008382, 0.006806, 0.006480, 0.006474, 0.006464, 0.005839, 0.003333,
+ 0.002702, 0.002690, 0.002690, 0.002690, 0.002690, 0.002683, 0.002238,
+ 0.016874, 0.015472, 0.011566, 0.009148, 0.007171, 0.006527, 0.006458,
+ 0.006457, 0.006447, 0.005823, 0.003318, 0.002693, 0.002683, 0.002683,
+ 0.002683, 0.002683, 0.002676, 0.002232, 0.011968, 0.011056, 0.008762,
+ 0.007219, 0.005717, 0.005391, 0.005386, 0.005386, 0.005377, 0.004856,
+ 0.002767, 0.002246, 0.002238, 0.002238, 0.002238, 0.002238, 0.002232,
+ 0.001862,
+};
+
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *dist_f) {  // Bicubic lookup into interp_rgrid_surf / interp_dgrid_surf; NOTE(review): the meaning and units of xm and yl are not visible here - confirm with callers.
+ const double x_start = -0.5;  // Grid columns cover xm from -0.5 to 16.5 in steps of 1.
+ const double x_end = 16.5;
+ const double x_step = 1;
+ const double y_start = -15.5;  // Grid rows cover yl from -15.5 to 16.5 in steps of 0.5.
+ const double y_end = 16.5;
+ const double y_step = 0.5;
+ const double epsilon = 1e-6;
+ const int stride = (int)rint((x_end - x_start) / x_step) + 1;  // (16.5 - (-0.5)) / 1 + 1 = 18 samples per grid row.
+ (void)y_end;  // y_end is also read by the AOMMIN clamp below.
+ // Keep the query at least one grid step (plus epsilon) inside every edge so the 4x4 window below stays on the grid.
+ xm = AOMMAX(xm, x_start + x_step + epsilon);
+ xm = AOMMIN(xm, x_end - x_step - epsilon);
+ yl = AOMMAX(yl, y_start + y_step + epsilon);
+ yl = AOMMIN(yl, y_end - y_step - epsilon);
+ // Convert to fractional grid coordinates: rows follow yl, columns follow xm.
+ const double y = (yl - y_start) / y_step;
+ const double x = (xm - x_start) / x_step;
+ // Split each coordinate into an integer cell index and an offset inside the cell.
+ const int yi = (int)floor(y);
+ const int xi = (int)floor(x);
+ assert(xi > 0);  // The window starts at index - 1, so both indices must be at least 1.
+ assert(yi > 0);
+ // Interpolate on the 4x4 window that starts one row and one column before (yi, xi).
+ const double yo = y - yi;
+ const double xo = x - xi;
+ const double *prate = &interp_rgrid_surf[(yi - 1) * stride + (xi - 1)];
+ const double *pdist = &interp_dgrid_surf[(yi - 1) * stride + (xi - 1)];
+ *rate_f = interp_bicubic(prate, stride, xo, yo);
+ *dist_f = interp_bicubic(pdist, stride, xo, yo);
+}
+
+static const double interp_rgrid_curv[65] = {  // Rate samples for av1_model_rd_curvfit(): entry i sits at x = -15.5 + 0.5 * i on that function's grid.
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 4.759876,
+ 8.132086, 13.651828, 21.908271, 33.522054, 48.782376, 71.530983,
+ 106.728649, 151.942795, 199.893011, 242.850965, 283.933923, 322.154203,
+ 360.684608, 394.801656, 426.879017, 460.234313, 484.103987, 508.261495,
+ 536.486763, 558.196737, 586.285894, 614.764511, 634.166333, 647.706472,
+ 658.211478, 681.360407, 701.052141, 727.007310, 768.663973, 804.407660,
+ 884.627751, 1065.658131, 1238.875214, 1440.185176, 1678.377931, 1962.243390,
+ 2300.571467, 2702.152072, 3175.775119, 3730.230519, 4374.308184, 5116.798028,
+ 5966.489961, 6932.173897, 8022.639747, 9246.677424, 10613.076839,
+};
+
+static const double interp_dgrid_curv[65] = {  // Samples behind av1_model_rd_curvfit()'s *distbysse_f output, on the same 65-point grid as interp_rgrid_curv.
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.604855,
+ 14.604855, 14.604855, 14.604855, 14.604855, 14.604855, 14.555776, 14.533692,
+ 14.439920, 14.257791, 13.977230, 13.623229, 13.064884, 12.355411, 11.560773,
+ 10.728960, 9.861975, 8.643612, 6.916021, 5.154769, 3.734940, 2.680051,
+ 1.925506, 1.408410, 1.042223, 0.767641, 0.565392, 0.420116, 0.310427,
+ 0.231711, 0.172999, 0.128293, 0.094992, 0.072171, 0.052972, 0.039354,
+ 0.029555, 0.022857, 0.016832, 0.013297, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,
+ 0.000000, 0.000000,
+};
+
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f) {  // Cubic lookup of xqr in interp_rgrid_curv / interp_dgrid_curv; NOTE(review): the meaning of xqr is not visible here - confirm with callers.
+ const double x_start = -15.5;  // The 65 samples cover xqr from -15.5 to 16.5 in steps of 0.5.
+ const double x_end = 16.5;
+ const double x_step = 0.5;
+ const double epsilon = 1e-6;
+ (void)x_end;  // x_end is also read by the AOMMIN clamp below.
+ // Keep the query one grid step (plus epsilon) inside both ends so the four-sample window stays inside the tables.
+ xqr = AOMMAX(xqr, x_start + x_step + epsilon);
+ xqr = AOMMIN(xqr, x_end - x_step - epsilon);
+ const double x = (xqr - x_start) / x_step;  // Fractional index into the tables.
+ const int xi = (int)floor(x);
+ const double xo = x - xi;  // Offset inside cell xi.
+ // The window starts at xi - 1, so xi must be at least 1.
+ assert(xi > 0);
+ // Interpolate between samples xi and xi + 1, using xi - 1 and xi + 2 as the outer points.
+ const double *prate = &interp_rgrid_curv[(xi - 1)];
+ const double *pdist = &interp_dgrid_curv[(xi - 1)];
+ *rate_f = interp_cubic(prate, xo);
+ *distbysse_f = interp_cubic(pdist, xo);
+}
+
+static void get_entropy_contexts_plane(BLOCK_SIZE plane_bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {  // Copies the plane's above/left entropy contexts for one block into t_above / t_left.
+ const int num_4x4_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];  // Block width in units of transform size 0 (presumably 4x4, per the variable name).
+ const int num_4x4_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];  // Block height in the same units.
+ const ENTROPY_CONTEXT *const above = pd->above_context;
+ const ENTROPY_CONTEXT *const left = pd->left_context;
+ // Only the first num_4x4_w / num_4x4_h entries are written; the rest of each output array is left as it was.
+ memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
+ memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
+}
+
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]) {  // Public wrapper: converts bsize to the plane's block size, then copies the contexts.
+ const BLOCK_SIZE plane_bsize =  // Block size after applying this plane's subsampling_x / subsampling_y.
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ get_entropy_contexts_plane(plane_bsize, pd, t_above, t_left);
+}
+
+void av1_mv_pred(const AV1_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
+ int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {  // Results are stored in x->max_mv_context[ref_frame] and x->pred_mv_sad[ref_frame].
+ int i;
+ int zero_seen = 0;  // Set once a candidate that rounds to (0, 0) has been evaluated.
+ int best_sad = INT_MAX;
+ int this_sad = INT_MAX;
+ int max_mv = 0;
+ uint8_t *src_y_ptr = x->plane[0].src.buf;
+ uint8_t *ref_y_ptr;
+ MV pred_mv[MAX_MV_REF_CANDIDATES + 1];  // Holds the candidates gathered below (at most three are pushed).
+ int num_mv_refs = 0;
+ const MV_REFERENCE_FRAME ref_frames[2] = { ref_frame, NONE_FRAME };
+ const int_mv ref_mv =
+ av1_get_ref_mv_from_stack(0, ref_frames, 0, x->mbmi_ext);
+ const int_mv ref_mv1 =
+ av1_get_ref_mv_from_stack(0, ref_frames, 1, x->mbmi_ext);
+ // Candidates: ref_mv always, ref_mv1 only when it differs, and x->pred_mv[ref_frame] under the adaptive-search condition.
+ pred_mv[num_mv_refs++] = ref_mv.as_mv;
+ if (ref_mv.as_int != ref_mv1.as_int) {
+ pred_mv[num_mv_refs++] = ref_mv1.as_mv;
+ }
+ if (cpi->sf.adaptive_motion_search && block_size < x->max_partition_size)
+ pred_mv[num_mv_refs++] = x->pred_mv[ref_frame];
+ // Sanity check that the pushes above did not exceed the array.
+ assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));
+
+ // Get the sad for each candidate reference mv.
+ for (i = 0; i < num_mv_refs; ++i) {
+ const MV *this_mv = &pred_mv[i];
+ int fp_row, fp_col;
+ fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;  // Divide by 8 rounding to nearest, ties away from zero (relies on arithmetic >> for negative values).
+ fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
+ max_mv = AOMMAX(max_mv, AOMMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);  // Tracked for every candidate, including ones skipped below.
+ // Evaluate the (0, 0) position only once.
+ if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
+ zero_seen |= (fp_row == 0 && fp_col == 0);
+ // Offset into the reference buffer by the rounded vector.
+ ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
+ // Find sad for current vector.
+ this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
+ ref_y_ptr, ref_y_stride);
+ // Note if it is the best so far.
+ if (this_sad < best_sad) {
+ best_sad = this_sad;
+ }
+ }
+
+ // Store the largest candidate component magnitude (after >> 3) and the lowest SAD found for this reference frame.
+ x->max_mv_context[ref_frame] = max_mv;
+ x->pred_mv_sad[ref_frame] = best_sad;
+}
+
+// Points the per-plane prediction buffers in |dst| at frame |src| and calls
+// setup_pred_plane() for the first |num_planes| planes at (mi_row, mi_col).
+// Plane 0 uses the luma crop dimensions and |scale|; the other planes use the
+// chroma crop dimensions and |scale_uv|. The buffer pointers and strides of
+// all three dst entries are written regardless of |num_planes|.
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+                          struct buf_2d dst[MAX_MB_PLANE],
+                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+                          const struct scale_factors *scale,
+                          const struct scale_factors *scale_uv,
+                          const int num_planes) {
+  dst[0].buf = src->y_buffer;
+  dst[0].stride = src->y_stride;
+  dst[1].buf = src->u_buffer;
+  dst[2].buf = src->v_buffer;
+  dst[2].stride = src->uv_stride;
+  dst[1].stride = dst[2].stride;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const int is_chroma = (plane != 0);
+    const int crop_width = is_chroma ? src->uv_crop_width : src->y_crop_width;
+    const int crop_height =
+        is_chroma ? src->uv_crop_height : src->y_crop_height;
+    const struct scale_factors *const sf = is_chroma ? scale_uv : scale;
+
+    setup_pred_plane(&dst[plane], xd->mi[0]->sb_type, dst[plane].buf,
+                     crop_width, crop_height, dst[plane].stride, mi_row,
+                     mi_col, sf, xd->plane[plane].subsampling_x,
+                     xd->plane[plane].subsampling_y);
+  }
+}
+
+// Converts a raster-order block index within |plane_bsize| into a sample
+// offset for a buffer with row pitch |stride|. The index is split into a row
+// and a column using mi_size_wide_log2[plane_bsize]; each step is 4 samples.
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+                            int stride) {
+  const int width_log2 = mi_size_wide_log2[plane_bsize];
+  const int col_mask = (1 << width_log2) - 1;
+  const int row_px = 4 * (raster_block >> width_log2);
+  const int col_px = 4 * (raster_block & col_mask);
+  return row_px * stride + col_px;
+}
+
+// Returns a pointer into the int16_t buffer |base| at the position of
+// |raster_block|, taking the row pitch to be the pixel width of |plane_bsize|.
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+                                       int16_t *base) {
+  const int pitch = block_size_wide[plane_bsize];
+  const int offset = av1_raster_block_offset(plane_bsize, raster_block, pitch);
+  return &base[offset];
+}
+
+// Returns the scaled reference buffer recorded for |ref_frame|, or NULL when
+// no separate scaled copy applies: either the scaled index is INVALID_IDX or
+// it equals the index of the reference frame's own buffer.
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const AV1_COMP *cpi,
+                                             int ref_frame) {
+  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
+  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+
+  if (scaled_idx == ref_idx || scaled_idx == INVALID_IDX) return NULL;
+
+  return &cpi->common.buffer_pool->frame_bufs[scaled_idx].buf;
+}
+
+// Returns the rate cost of the current block's interpolation filters. The
+// cost is 0 unless the frame-level filter is SWITCHABLE; otherwise it is the
+// sum over both directions of the context-dependent filter cost, scaled by
+// SWITCHABLE_INTERP_RATE_FACTOR.
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+                            const MACROBLOCKD *xd) {
+  if (cm->interp_filter != SWITCHABLE) return 0;
+
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  int cost_sum = 0;
+
+  for (int dir = 0; dir < 2; ++dir) {
+    const int ctx = av1_get_pred_context_switchable_interp(xd, dir);
+    const InterpFilter filter =
+        av1_extract_interp_filter(mbmi->interp_filters, dir);
+    cost_sum += x->switchable_interp_costs[ctx][filter];
+  }
+
+  return SWITCHABLE_INTERP_RATE_FACTOR * cost_sum;
+}
+
+/* Populates cpi->rd.thresh_mult[] with one threshold multiplier per THR_MODES
+ * entry. Every entry first receives a baseline of (cpi->oxcf.mode == 0), i.e.
+ * 1 or 0. The seven single-reference NEAREST entries are then overwritten
+ * with 300 or 0 depending on sf->adaptive_rd_thresh. All remaining entries
+ * have a per-mode constant added to the baseline, except three ALTREF2
+ * entries that are assigned outright (see the review notes below).
+ */
+void av1_set_rd_speed_thresholds(AV1_COMP *cpi) {
+ int i;
+ RD_OPT *const rd = &cpi->rd;
+ SPEED_FEATURES *const sf = &cpi->sf;
+
+ // Set baseline threshold values.
+ for (i = 0; i < MAX_MODES; ++i) rd->thresh_mult[i] = cpi->oxcf.mode == 0;
+
+ if (sf->adaptive_rd_thresh) {
+ rd->thresh_mult[THR_NEARESTMV] = 300;
+ rd->thresh_mult[THR_NEARESTL2] = 300;
+ rd->thresh_mult[THR_NEARESTL3] = 300;
+ rd->thresh_mult[THR_NEARESTB] = 300;
+ rd->thresh_mult[THR_NEARESTA2] = 300;
+ rd->thresh_mult[THR_NEARESTA] = 300;
+ rd->thresh_mult[THR_NEARESTG] = 300;
+ } else {
+ rd->thresh_mult[THR_NEARESTMV] = 0;
+ rd->thresh_mult[THR_NEARESTL2] = 0;
+ rd->thresh_mult[THR_NEARESTL3] = 0;
+ rd->thresh_mult[THR_NEARESTB] = 0;
+ rd->thresh_mult[THR_NEARESTA2] = 0;
+ rd->thresh_mult[THR_NEARESTA] = 0;
+ rd->thresh_mult[THR_NEARESTG] = 0;
+ }
+
+ rd->thresh_mult[THR_NEWMV] += 1000;
+ rd->thresh_mult[THR_NEWL2] += 1000;
+ rd->thresh_mult[THR_NEWL3] += 1000;
+ rd->thresh_mult[THR_NEWB] += 1000;
+ rd->thresh_mult[THR_NEWA2] = 1000; /* NOTE(review): '=' where siblings use '+=', so the baseline is dropped - confirm intended. */
+ rd->thresh_mult[THR_NEWA] += 1000;
+ rd->thresh_mult[THR_NEWG] += 1000;
+
+ rd->thresh_mult[THR_NEARMV] += 1000;
+ rd->thresh_mult[THR_NEARL2] += 1000;
+ rd->thresh_mult[THR_NEARL3] += 1000;
+ rd->thresh_mult[THR_NEARB] += 1000;
+ rd->thresh_mult[THR_NEARA2] = 1000; /* NOTE(review): '=' where siblings use '+=' - confirm intended. */
+ rd->thresh_mult[THR_NEARA] += 1000;
+ rd->thresh_mult[THR_NEARG] += 1000;
+
+ rd->thresh_mult[THR_GLOBALMV] += 2000;
+ rd->thresh_mult[THR_GLOBALL2] += 2000;
+ rd->thresh_mult[THR_GLOBALL3] += 2000;
+ rd->thresh_mult[THR_GLOBALB] += 2000;
+ rd->thresh_mult[THR_GLOBALA2] = 2000; /* NOTE(review): '=' where siblings use '+=' - confirm intended. */
+ rd->thresh_mult[THR_GLOBALG] += 2000;
+ rd->thresh_mult[THR_GLOBALA] += 2000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A2] += 1000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA2] += 1000;
+
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEAREST_NEARESTBA] += 2000;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3B] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGB] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLA2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARLA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWLA2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLA2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL2A2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL2A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL2A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL2A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL2A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL2A2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL2A2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARL3A2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWL3A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTL3A2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWL3A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARL3A2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWL3A2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALL3A2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARGA2] += 1200;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWGA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTGA2] += 1500;
+ rd->thresh_mult[THR_COMP_NEAR_NEWGA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEARGA2] += 1700;
+ rd->thresh_mult[THR_COMP_NEW_NEWGA2] += 2000;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALGA2] += 2500;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL2] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL2] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL2] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL2] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL2] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL2] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLL3] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLL3] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLL3] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLL3] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLL3] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLL3] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARLG] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTLG] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWLG] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARLG] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWLG] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALLG] += 3200;
+
+ rd->thresh_mult[THR_COMP_NEAR_NEARBA] += 1600;
+ rd->thresh_mult[THR_COMP_NEAREST_NEWBA] += 2000;
+ rd->thresh_mult[THR_COMP_NEW_NEARESTBA] += 2000;
+ rd->thresh_mult[THR_COMP_NEAR_NEWBA] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEARBA] += 2200;
+ rd->thresh_mult[THR_COMP_NEW_NEWBA] += 2400;
+ rd->thresh_mult[THR_COMP_GLOBAL_GLOBALBA] += 3200;
+
+ rd->thresh_mult[THR_DC] += 1000;
+ rd->thresh_mult[THR_PAETH] += 1000;
+ rd->thresh_mult[THR_SMOOTH] += 2000;
+ rd->thresh_mult[THR_SMOOTH_V] += 2000;
+ rd->thresh_mult[THR_SMOOTH_H] += 2000;
+ rd->thresh_mult[THR_H_PRED] += 2000;
+ rd->thresh_mult[THR_V_PRED] += 2000;
+ rd->thresh_mult[THR_D135_PRED] += 2500;
+ rd->thresh_mult[THR_D203_PRED] += 2500;
+ rd->thresh_mult[THR_D157_PRED] += 2500;
+ rd->thresh_mult[THR_D67_PRED] += 2500;
+ rd->thresh_mult[THR_D113_PRED] += 2500;
+ rd->thresh_mult[THR_D45_PRED] += 2500;
+}
+
+// Fills cpi->rd.thresh_mult_sub8x8[] from a fixed table with MAX_REFS
+// entries: 2500 for the first seven entries and the last one, 4500 for the
+// twelve entries in between.
+void av1_set_rd_speed_thresholds_sub8x8(AV1_COMP *cpi) {
+  static const int kSub8x8ThreshMult[MAX_REFS] = {
+    2500, 2500, 2500, 2500, 2500, 2500, 2500,  // entries 0..6
+    4500, 4500, 4500, 4500,                    // entries 7..10
+    4500, 4500, 4500, 4500,                    // entries 11..14
+    4500, 4500, 4500, 4500,                    // entries 15..18
+    2500                                       // entry 19
+  };
+  int *const out = cpi->rd.thresh_mult_sub8x8;
+
+  for (int ref = 0; ref < MAX_REFS; ++ref) out[ref] = kSub8x8ThreshMult[ref];
+}
+
+// Adapts the per-block-size, per-mode threshold factors in |factor_buf| after
+// a mode decision. Nothing happens unless |rd_thresh| is positive. For every
+// block size from max(bsize - 1, BLOCK_4X4) to min(bsize + 2, sb_size):
+//   - the factor of |best_mode_index| decays by 1/16 of its value;
+//   - every other mode's factor grows by RD_THRESH_INC, capped at
+//     rd_thresh * RD_THRESH_MAX_FACT.
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+                               int (*factor_buf)[MAX_MODES], int rd_thresh,
+                               int bsize, int best_mode_index) {
+  if (rd_thresh <= 0) return;
+
+  // The block-size window does not depend on the mode, so compute it once.
+  const BLOCK_SIZE first_bs = AOMMAX(bsize - 1, BLOCK_4X4);
+  const BLOCK_SIZE last_bs = AOMMIN(bsize + 2, (int)cm->seq_params.sb_size);
+
+  for (int mode = 0; mode < MAX_MODES; ++mode) {
+    const int is_best = (mode == best_mode_index);
+    for (BLOCK_SIZE bs = first_bs; bs <= last_bs; ++bs) {
+      int *const fact = &factor_buf[bs][mode];
+      if (!is_best) {
+        *fact = AOMMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
+      } else {
+        *fact -= (*fact >> 4);
+      }
+    }
+  }
+}
+
+// Returns a penalty derived from the DC quantizer for (qindex, qdelta) at the
+// given bit depth: 20*q for 8-bit, 5*q for 10-bit, and 5*q rounded down by
+// two bits for 12-bit. Any other bit depth asserts and returns -1.
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+                               aom_bit_depth_t bit_depth) {
+  const int q = av1_dc_quant_Q3(qindex, qdelta, bit_depth);
+
+  if (bit_depth == AOM_BITS_8) return 20 * q;
+  if (bit_depth == AOM_BITS_10) return 5 * q;
+  if (bit_depth == AOM_BITS_12) return ROUND_POWER_OF_TWO(5 * q, 2);
+
+  assert(0 && "bit_depth should be AOM_BITS_8, AOM_BITS_10 or AOM_BITS_12");
+  return -1;
+}
diff --git a/third_party/aom/av1/encoder/rd.h b/third_party/aom/av1/encoder/rd.h
new file mode 100644
index 0000000000..755b61df50
--- /dev/null
+++ b/third_party/aom/av1/encoder/rd.h
@@ -0,0 +1,464 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RD_H_
+#define AOM_AV1_ENCODER_RD_H_
+
+#include <limits.h>
+
+#include "av1/common/blockd.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/cost.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Distortion is scaled by 2^RDDIV_BITS in RDCOST and RDCOST_DBL. */
+#define RDDIV_BITS 7
+/* Right shift applied to the rd multiplier by set_error_per_bit(). */
+#define RD_EPB_SHIFT 6
+
+/* Integer rate-distortion cost: rate R times multiplier RM, rounded and
+ * shifted down by AV1_PROB_COST_SHIFT, plus distortion D scaled by
+ * 2^RDDIV_BITS. R is widened to int64_t before the multiply; D is used with
+ * whatever type the caller passes.
+ */
+#define RDCOST(RM, R, D) \
+ (ROUND_POWER_OF_TWO(((int64_t)(R)) * (RM), AV1_PROB_COST_SHIFT) + \
+ ((D) * (1 << RDDIV_BITS)))
+
+/* Floating-point counterpart of RDCOST (no rounding step). */
+#define RDCOST_DBL(RM, R, D) \
+ (((((double)(R)) * (RM)) / (double)(1 << AV1_PROB_COST_SHIFT)) + \
+ ((double)(D) * (1 << RDDIV_BITS)))
+
+#define QIDX_SKIP_THRESH 115
+
+#define MV_COST_WEIGHT 108
+#define MV_COST_WEIGHT_SUB 120
+
+/* Used by av1_update_rd_thresh_fact(): a factor grows by RD_THRESH_INC per
+ * update and is capped at rd_thresh * RD_THRESH_MAX_FACT.
+ */
+#define RD_THRESH_MAX_FACT 64
+#define RD_THRESH_INC 1
+
+// Factor to weigh the rate for switchable interp filters.
+#define SWITCHABLE_INTERP_RATE_FACTOR 1
+
+/* Index of every (prediction mode, reference frame(s)) combination that has
+ * its own rd threshold; used to index RD_OPT::thresh_mult.
+ * This enumerator type needs to be kept aligned with the mode order in
+ * const MODE_DEFINITION av1_mode_order[MAX_MODES] used in the rd code.
+ * Entries up to LAST_SINGLE_REF_MODES use one reference frame, entries up to
+ * LAST_COMP_REF_MODES use two, and the remaining entries (THR_DC onwards)
+ * are presumably the intra modes - TODO confirm against av1_mode_order.
+ */
+typedef enum {
+ THR_NEARESTMV,
+ THR_NEARESTL2,
+ THR_NEARESTL3,
+ THR_NEARESTB,
+ THR_NEARESTA2,
+ THR_NEARESTA,
+ THR_NEARESTG,
+
+ THR_NEWMV,
+ THR_NEWL2,
+ THR_NEWL3,
+ THR_NEWB,
+ THR_NEWA2,
+ THR_NEWA,
+ THR_NEWG,
+
+ THR_NEARMV,
+ THR_NEARL2,
+ THR_NEARL3,
+ THR_NEARB,
+ THR_NEARA2,
+ THR_NEARA,
+ THR_NEARG,
+
+ THR_GLOBALMV,
+ THR_GLOBALL2,
+ THR_GLOBALL3,
+ THR_GLOBALB,
+ THR_GLOBALA2,
+ THR_GLOBALA,
+ THR_GLOBALG,
+
+ THR_COMP_NEAREST_NEARESTLA,
+ THR_COMP_NEAREST_NEARESTL2A,
+ THR_COMP_NEAREST_NEARESTL3A,
+ THR_COMP_NEAREST_NEARESTGA,
+ THR_COMP_NEAREST_NEARESTLB,
+ THR_COMP_NEAREST_NEARESTL2B,
+ THR_COMP_NEAREST_NEARESTL3B,
+ THR_COMP_NEAREST_NEARESTGB,
+ THR_COMP_NEAREST_NEARESTLA2,
+ THR_COMP_NEAREST_NEARESTL2A2,
+ THR_COMP_NEAREST_NEARESTL3A2,
+ THR_COMP_NEAREST_NEARESTGA2,
+ THR_COMP_NEAREST_NEARESTLL2,
+ THR_COMP_NEAREST_NEARESTLL3,
+ THR_COMP_NEAREST_NEARESTLG,
+ THR_COMP_NEAREST_NEARESTBA,
+
+ THR_COMP_NEAR_NEARLA,
+ THR_COMP_NEW_NEARESTLA,
+ THR_COMP_NEAREST_NEWLA,
+ THR_COMP_NEW_NEARLA,
+ THR_COMP_NEAR_NEWLA,
+ THR_COMP_NEW_NEWLA,
+ THR_COMP_GLOBAL_GLOBALLA,
+
+ THR_COMP_NEAR_NEARL2A,
+ THR_COMP_NEW_NEARESTL2A,
+ THR_COMP_NEAREST_NEWL2A,
+ THR_COMP_NEW_NEARL2A,
+ THR_COMP_NEAR_NEWL2A,
+ THR_COMP_NEW_NEWL2A,
+ THR_COMP_GLOBAL_GLOBALL2A,
+
+ THR_COMP_NEAR_NEARL3A,
+ THR_COMP_NEW_NEARESTL3A,
+ THR_COMP_NEAREST_NEWL3A,
+ THR_COMP_NEW_NEARL3A,
+ THR_COMP_NEAR_NEWL3A,
+ THR_COMP_NEW_NEWL3A,
+ THR_COMP_GLOBAL_GLOBALL3A,
+
+ THR_COMP_NEAR_NEARGA,
+ THR_COMP_NEW_NEARESTGA,
+ THR_COMP_NEAREST_NEWGA,
+ THR_COMP_NEW_NEARGA,
+ THR_COMP_NEAR_NEWGA,
+ THR_COMP_NEW_NEWGA,
+ THR_COMP_GLOBAL_GLOBALGA,
+
+ THR_COMP_NEAR_NEARLB,
+ THR_COMP_NEW_NEARESTLB,
+ THR_COMP_NEAREST_NEWLB,
+ THR_COMP_NEW_NEARLB,
+ THR_COMP_NEAR_NEWLB,
+ THR_COMP_NEW_NEWLB,
+ THR_COMP_GLOBAL_GLOBALLB,
+
+ THR_COMP_NEAR_NEARL2B,
+ THR_COMP_NEW_NEARESTL2B,
+ THR_COMP_NEAREST_NEWL2B,
+ THR_COMP_NEW_NEARL2B,
+ THR_COMP_NEAR_NEWL2B,
+ THR_COMP_NEW_NEWL2B,
+ THR_COMP_GLOBAL_GLOBALL2B,
+
+ THR_COMP_NEAR_NEARL3B,
+ THR_COMP_NEW_NEARESTL3B,
+ THR_COMP_NEAREST_NEWL3B,
+ THR_COMP_NEW_NEARL3B,
+ THR_COMP_NEAR_NEWL3B,
+ THR_COMP_NEW_NEWL3B,
+ THR_COMP_GLOBAL_GLOBALL3B,
+
+ THR_COMP_NEAR_NEARGB,
+ THR_COMP_NEW_NEARESTGB,
+ THR_COMP_NEAREST_NEWGB,
+ THR_COMP_NEW_NEARGB,
+ THR_COMP_NEAR_NEWGB,
+ THR_COMP_NEW_NEWGB,
+ THR_COMP_GLOBAL_GLOBALGB,
+
+ THR_COMP_NEAR_NEARLA2,
+ THR_COMP_NEW_NEARESTLA2,
+ THR_COMP_NEAREST_NEWLA2,
+ THR_COMP_NEW_NEARLA2,
+ THR_COMP_NEAR_NEWLA2,
+ THR_COMP_NEW_NEWLA2,
+ THR_COMP_GLOBAL_GLOBALLA2,
+
+ THR_COMP_NEAR_NEARL2A2,
+ THR_COMP_NEW_NEARESTL2A2,
+ THR_COMP_NEAREST_NEWL2A2,
+ THR_COMP_NEW_NEARL2A2,
+ THR_COMP_NEAR_NEWL2A2,
+ THR_COMP_NEW_NEWL2A2,
+ THR_COMP_GLOBAL_GLOBALL2A2,
+
+ THR_COMP_NEAR_NEARL3A2,
+ THR_COMP_NEW_NEARESTL3A2,
+ THR_COMP_NEAREST_NEWL3A2,
+ THR_COMP_NEW_NEARL3A2,
+ THR_COMP_NEAR_NEWL3A2,
+ THR_COMP_NEW_NEWL3A2,
+ THR_COMP_GLOBAL_GLOBALL3A2,
+
+ THR_COMP_NEAR_NEARGA2,
+ THR_COMP_NEW_NEARESTGA2,
+ THR_COMP_NEAREST_NEWGA2,
+ THR_COMP_NEW_NEARGA2,
+ THR_COMP_NEAR_NEWGA2,
+ THR_COMP_NEW_NEWGA2,
+ THR_COMP_GLOBAL_GLOBALGA2,
+
+ THR_COMP_NEAR_NEARLL2,
+ THR_COMP_NEW_NEARESTLL2,
+ THR_COMP_NEAREST_NEWLL2,
+ THR_COMP_NEW_NEARLL2,
+ THR_COMP_NEAR_NEWLL2,
+ THR_COMP_NEW_NEWLL2,
+ THR_COMP_GLOBAL_GLOBALLL2,
+
+ THR_COMP_NEAR_NEARLL3,
+ THR_COMP_NEW_NEARESTLL3,
+ THR_COMP_NEAREST_NEWLL3,
+ THR_COMP_NEW_NEARLL3,
+ THR_COMP_NEAR_NEWLL3,
+ THR_COMP_NEW_NEWLL3,
+ THR_COMP_GLOBAL_GLOBALLL3,
+
+ THR_COMP_NEAR_NEARLG,
+ THR_COMP_NEW_NEARESTLG,
+ THR_COMP_NEAREST_NEWLG,
+ THR_COMP_NEW_NEARLG,
+ THR_COMP_NEAR_NEWLG,
+ THR_COMP_NEW_NEWLG,
+ THR_COMP_GLOBAL_GLOBALLG,
+
+ THR_COMP_NEAR_NEARBA,
+ THR_COMP_NEW_NEARESTBA,
+ THR_COMP_NEAREST_NEWBA,
+ THR_COMP_NEW_NEARBA,
+ THR_COMP_NEAR_NEWBA,
+ THR_COMP_NEW_NEWBA,
+ THR_COMP_GLOBAL_GLOBALBA,
+
+ THR_DC,
+ THR_PAETH,
+ THR_SMOOTH,
+ THR_SMOOTH_V,
+ THR_SMOOTH_H,
+ THR_H_PRED,
+ THR_V_PRED,
+ THR_D135_PRED,
+ THR_D203_PRED,
+ THR_D157_PRED,
+ THR_D67_PRED,
+ THR_D113_PRED,
+ THR_D45_PRED,
+
+ MAX_MODES, /* number of entries above; size of the per-mode arrays */
+
+ LAST_SINGLE_REF_MODES = THR_GLOBALG, /* aliases marking the end of each group */
+ MAX_SINGLE_REF_MODES = LAST_SINGLE_REF_MODES + 1,
+ LAST_COMP_REF_MODES = THR_COMP_GLOBAL_GLOBALBA,
+ MAX_COMP_REF_MODES = LAST_COMP_REF_MODES + 1
+} THR_MODES;
+
+/* Index into RD_OPT::thresh_mult_sub8x8: seven single-reference entries,
+ * twelve compound-reference entries, then THR_INTRA. MAX_REFS is the entry
+ * count.
+ */
+typedef enum {
+ THR_LAST,
+ THR_LAST2,
+ THR_LAST3,
+ THR_BWDR,
+ THR_ALTR2,
+ THR_GOLD,
+ THR_ALTR,
+
+ THR_COMP_LA,
+ THR_COMP_L2A,
+ THR_COMP_L3A,
+ THR_COMP_GA,
+
+ THR_COMP_LB,
+ THR_COMP_L2B,
+ THR_COMP_L3B,
+ THR_COMP_GB,
+
+ THR_COMP_LA2,
+ THR_COMP_L2A2,
+ THR_COMP_L3A2,
+ THR_COMP_GA2,
+
+ THR_INTRA,
+
+ MAX_REFS
+} THR_MODES_SUB8X8;
+
+/* Rate-distortion threshold state kept per encoder instance (cpi->rd). */
+typedef struct RD_OPT {
+ // Thresh_mult is used to set a threshold for the rd score. A higher value
+ // means that we will accept the best mode so far more often. This number
+ // is used in combination with the current block size, and thresh_freq_fact
+ // to pick a threshold.
+ int thresh_mult[MAX_MODES]; /* indexed by THR_MODES; set by av1_set_rd_speed_thresholds() */
+ int thresh_mult_sub8x8[MAX_REFS]; /* indexed by THR_MODES_SUB8X8; set by av1_set_rd_speed_thresholds_sub8x8() */
+
+ int threshes[MAX_SEGMENTS][BLOCK_SIZES_ALL][MAX_MODES]; /* per segment, block size and mode */
+
+ int64_t prediction_type_threshes[REF_FRAMES][REFERENCE_MODES];
+
+ int RDMULT;
+} RD_OPT;
+
+// Resets |rd_stats| to the starting point for accumulation: zero rate,
+// distortion, rdcost and sse, skip set, no zero/invalid rate flags, and
+// ref_rdcost at INT64_MAX. With CONFIG_RD_DEBUG the per-plane coefficient
+// cost totals and maps are zeroed as well.
+static INLINE void av1_init_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->rate = 0;
+  rd_stats->dist = 0;
+  rd_stats->rdcost = 0;
+  rd_stats->sse = 0;
+  rd_stats->skip = 1;
+  rd_stats->zero_rate = 0;
+  rd_stats->invalid_rate = 0;
+  rd_stats->ref_rdcost = INT64_MAX;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    rd_stats->txb_coeff_cost[plane] = 0;
+    for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+      for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+        rd_stats->txb_coeff_cost_map[plane][row][col] = 0;
+      }
+    }
+  }
+#endif
+}
+
+// Marks |rd_stats| as invalid: rate is INT_MAX, dist/rdcost/sse/ref_rdcost
+// are INT64_MAX, skip and zero_rate are cleared and invalid_rate is set.
+// With CONFIG_RD_DEBUG the per-plane coefficient cost totals and maps are
+// set to INT_MAX.
+static INLINE void av1_invalid_rd_stats(RD_STATS *rd_stats) {
+  rd_stats->rate = INT_MAX;
+  rd_stats->dist = INT64_MAX;
+  rd_stats->rdcost = INT64_MAX;
+  rd_stats->sse = INT64_MAX;
+  rd_stats->skip = 0;
+  rd_stats->zero_rate = 0;
+  rd_stats->invalid_rate = 1;
+  rd_stats->ref_rdcost = INT64_MAX;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    rd_stats->txb_coeff_cost[plane] = INT_MAX;
+    for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+      for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+        rd_stats->txb_coeff_cost_map[plane][row][col] = INT_MAX;
+      }
+    }
+  }
+#endif
+}
+
+// Accumulates |rd_stats_src| into |rd_stats_dst|: rate, dist and sse are
+// summed; zero_rate is taken from the source only while the destination's is
+// still zero; skip and invalid_rate are combined with bitwise AND.
+// NOTE(review): because invalid_rate is ANDed, the result stays flagged
+// invalid only if both inputs were flagged - confirm this is intended.
+static INLINE void av1_merge_rd_stats(RD_STATS *rd_stats_dst,
+                                      const RD_STATS *rd_stats_src) {
+  rd_stats_dst->rate += rd_stats_src->rate;
+  if (rd_stats_dst->zero_rate == 0) {
+    rd_stats_dst->zero_rate = rd_stats_src->zero_rate;
+  }
+  rd_stats_dst->dist += rd_stats_src->dist;
+  rd_stats_dst->sse += rd_stats_src->sse;
+  rd_stats_dst->skip &= rd_stats_src->skip;
+  rd_stats_dst->invalid_rate &= rd_stats_src->invalid_rate;
+#if CONFIG_RD_DEBUG
+  // This may run into problems when monochrome video is
+  // encoded, as there will only be 1 plane
+  for (int plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    rd_stats_dst->txb_coeff_cost[plane] += rd_stats_src->txb_coeff_cost[plane];
+
+    // TODO(angiebird): optimize this part
+    // Merge the per-position map and cross-check it against the plane total.
+    int map_total = 0;
+    for (int row = 0; row < TXB_COEFF_COST_MAP_SIZE; ++row) {
+      for (int col = 0; col < TXB_COEFF_COST_MAP_SIZE; ++col) {
+        rd_stats_dst->txb_coeff_cost_map[plane][row][col] +=
+            rd_stats_src->txb_coeff_cost_map[plane][row][col];
+        map_total += rd_stats_dst->txb_coeff_cost_map[plane][row][col];
+      }
+    }
+    assert(map_total == rd_stats_dst->txb_coeff_cost[plane]);
+  }
+#endif
+}
+
+/* Forward declarations of encoder types; struct AV1_COMP is taken by pointer
+ * in the prototypes below.
+ */
+struct TileInfo;
+struct TileDataEnc;
+struct AV1_COMP;
+struct macroblock;
+
+int av1_compute_rd_mult(const struct AV1_COMP *cpi, int qindex);
+
+void av1_initialize_rd_consts(struct AV1_COMP *cpi);
+
+void av1_initialize_me_consts(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ int qindex);
+
+void av1_model_rd_from_var_lapndz(int64_t var, unsigned int n,
+ unsigned int qstep, int *rate, int64_t *dist);
+
+/* Curve-fit model: writes interpolated factors for |xqr| to |rate_f| and
+ * |distbysse_f|.
+ */
+void av1_model_rd_curvfit(double xqr, double *rate_f, double *distbysse_f);
+void av1_model_rd_surffit(double xm, double yl, double *rate_f,
+ double *distbysse_f);
+
+/* Rate cost of the block's interpolation filters; 0 unless the frame-level
+ * filter is SWITCHABLE.
+ */
+int av1_get_switchable_rate(const AV1_COMMON *const cm, MACROBLOCK *x,
+ const MACROBLOCKD *xd);
+
+/* Sample offset of raster-order block |raster_block| inside |plane_bsize|
+ * for a buffer with row pitch |stride|.
+ */
+int av1_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
+ int stride);
+
+/* Same offset applied to |base|, with the pitch taken as the pixel width of
+ * |plane_bsize|.
+ */
+int16_t *av1_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
+ int16_t *base);
+
+/* Scaled reference buffer for |ref_frame|, or NULL when none applies. */
+YV12_BUFFER_CONFIG *av1_get_scaled_ref_frame(const struct AV1_COMP *cpi,
+ int ref_frame);
+
+void av1_init_me_luts(void);
+
+void av1_set_mvcost(MACROBLOCK *x, int ref, int ref_mv_idx);
+
+/* Copies the plane's above/left entropy contexts into |t_above|/|t_left|. */
+void av1_get_entropy_contexts(BLOCK_SIZE bsize,
+ const struct macroblockd_plane *pd,
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE],
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE]);
+
+/* Fills cpi->rd.thresh_mult[]. */
+void av1_set_rd_speed_thresholds(struct AV1_COMP *cpi);
+
+/* Fills cpi->rd.thresh_mult_sub8x8[]. */
+void av1_set_rd_speed_thresholds_sub8x8(struct AV1_COMP *cpi);
+
+/* Decays the factor of |best_mode_index| and grows all other modes' factors
+ * for block sizes near |bsize|; does nothing unless |rd_thresh| is positive.
+ */
+void av1_update_rd_thresh_fact(const AV1_COMMON *const cm,
+ int (*fact)[MAX_MODES], int rd_thresh, int bsize,
+ int best_mode_index);
+
+// Returns 1 when |best_rd| is below the scaled threshold
+// ((int64_t)thresh * thresh_fact) >> 5, or whenever |thresh| is INT_MAX;
+// returns 0 otherwise.
+static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
+                                      int thresh_fact) {
+  if (thresh == INT_MAX) return 1;
+  const int64_t scaled_thresh = ((int64_t)thresh * thresh_fact) >> 5;
+  return best_rd < scaled_thresh;
+}
+
+/* Scores the candidate predicted MVs for |ref_frame| against the luma source
+ * and stores the lowest SAD in x->pred_mv_sad[ref_frame] and the largest
+ * shifted MV magnitude in x->max_mv_context[ref_frame].
+ */
+void av1_mv_pred(const struct AV1_COMP *cpi, MACROBLOCK *x,
+ uint8_t *ref_y_buffer, int ref_y_stride, int ref_frame,
+ BLOCK_SIZE block_size);
+
+// Sets x->errorperbit to rdmult >> RD_EPB_SHIFT, replacing a zero result
+// with 1.
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  if (x->errorperbit == 0) x->errorperbit = 1;
+}
+
+/* Points |dst| at the planes of |src| and sets up the first |num_planes|
+ * prediction planes at (mi_row, mi_col); |scale| is used for plane 0 and
+ * |scale_uv| for the others.
+ */
+void av1_setup_pred_block(const MACROBLOCKD *xd,
+ struct buf_2d dst[MAX_MB_PLANE],
+ const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
+ const struct scale_factors *scale,
+ const struct scale_factors *scale_uv,
+ const int num_planes);
+
+/* Penalty derived from the DC quantizer; the multiplier depends on
+ * |bit_depth|. Returns -1 for an unsupported bit depth.
+ */
+int av1_get_intra_cost_penalty(int qindex, int qdelta,
+ aom_bit_depth_t bit_depth);
+
+void av1_fill_mode_rates(AV1_COMMON *const cm, MACROBLOCK *x,
+ FRAME_CONTEXT *fc);
+
+void av1_fill_coeff_costs(MACROBLOCK *x, FRAME_CONTEXT *fc,
+ const int num_planes);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RD_H_
diff --git a/third_party/aom/av1/encoder/rdopt.c b/third_party/aom/av1/encoder/rdopt.c
new file mode 100644
index 0000000000..c2d15534f0
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.c
@@ -0,0 +1,12199 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/system_state.h"
+
+#include "av1/common/cfl.h"
+#include "av1/common/common.h"
+#include "av1/common/common_data.h"
+#include "av1/common/entropy.h"
+#include "av1/common/entropymode.h"
+#include "av1/common/idct.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/obmc.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+#include "av1/common/txb_common.h"
+#include "av1/common/warped_motion.h"
+
+#include "av1/encoder/aq_variance.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encodemb.h"
+#include "av1/encoder/encodemv.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/hybrid_fwd_txfm.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/ml.h"
+#include "av1/encoder/palette.h"
+#include "av1/encoder/pustats.h"
+#include "av1/encoder/random.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/rd.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/tokenize.h"
+#include "av1/encoder/tx_prune_model_weights.h"
+
+/* Signature shared by the block-level rd model functions stored in
+ * model_rd_sb_fn[]. Results are returned through the out_* / skip_* / plane_*
+ * pointer arguments.
+ */
+typedef void (*model_rd_for_sb_type)(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+/* Signature shared by the SSE-based rd model functions stored in
+ * model_rd_sse_fn[]; results are written to |rate| and |dist|.
+ */
+typedef void (*model_rd_from_sse_type)(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
+/* Forward declarations of the rd model implementations, needed because the
+ * dispatch tables model_rd_sb_fn[] and model_rd_sse_fn[] reference them
+ * before their definitions. The first five match model_rd_for_sb_type, the
+ * last four match model_rd_from_sse_type.
+ */
+static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+ int plane_to, int mi_row, int mi_col,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse,
+ int64_t *plane_dist);
+static void model_rd_for_sb_with_curvfit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_surffit(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_dnn(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_for_sb_with_fullrdy(
+ const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+ int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+ int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+ int *plane_rate, int64_t *plane_sse, int64_t *plane_dist);
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist);
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x,
+ BLOCK_SIZE plane_bsize, int plane,
+ int64_t sse, int num_samples, int *rate,
+ int64_t *dist);
+
+/* Selects an rd model; used as the index into model_rd_sb_fn[] and
+ * model_rd_sse_fn[]. MODELRD_TYPES is the number of models.
+ * NOTE(review): MODELRD_SUFFIT looks like a misspelling of "SURFFIT" (it
+ * lines up with the *_surffit table entries); left as is because other code
+ * may reference the name.
+ */
+typedef enum {
+ MODELRD_LEGACY,
+ MODELRD_CURVFIT,
+ MODELRD_SUFFIT,
+ MODELRD_DNN,
+ MODELRD_FULLRDY,
+ MODELRD_TYPES
+} ModelRdType;
+
+/* Block-level rd model dispatch table, in ModelRdType order. */
+static model_rd_for_sb_type model_rd_sb_fn[MODELRD_TYPES] = {
+ model_rd_for_sb, model_rd_for_sb_with_curvfit, model_rd_for_sb_with_surffit,
+ model_rd_for_sb_with_dnn, model_rd_for_sb_with_fullrdy
+};
+
+/* SSE-based rd model dispatch table, in ModelRdType order. The last slot
+ * (MODELRD_FULLRDY) is NULL, so it must not be called through this table.
+ */
+static model_rd_from_sse_type model_rd_sse_fn[MODELRD_TYPES] = {
+ model_rd_from_sse, model_rd_with_curvfit, model_rd_with_surffit,
+ model_rd_with_dnn, NULL
+};
+
+/* Compile-time choice of rd model per use case. The values are ModelRdType
+ * indices:
+ */
+// 0: Legacy model
+// 1: Curve fit model
+// 2: Surface fit model
+// 3: DNN regression model
+// 4: Full rd model
+#define MODELRD_TYPE_INTERP_FILTER 1
+#define MODELRD_TYPE_TX_SEARCH_PRUNE 2
+#define MODELRD_TYPE_MASKED_COMPOUND 1
+#define MODELRD_TYPE_INTERINTRA 1
+#define MODELRD_TYPE_INTRA 1
+#define MODELRD_TYPE_JNT_COMPOUND 1
+
+/* All filter pairs: one entry per combination of two switchable filters.
+ * Each entry holds one filter index in the upper 16 bits and one in the
+ * lower 16 bits; the row labels below refer to the lower half as y.
+ * The nine initializers presumably mean SWITCHABLE_FILTERS is 3 - TODO
+ * confirm.
+ */
+#define DUAL_FILTER_SET_SIZE (SWITCHABLE_FILTERS * SWITCHABLE_FILTERS)
+static const InterpFilters filter_sets[DUAL_FILTER_SET_SIZE] = {
+ 0x00000000, 0x00010000, 0x00020000, // y = 0
+ 0x00000001, 0x00010001, 0x00020001, // y = 1
+ 0x00000002, 0x00010002, 0x00020002, // y = 2
+};
+
+/* Bit mask with one bit per reference frame index: ALTREF, ALTREF2, BWDREF,
+ * GOLDEN and LAST2, plus bit 0.
+ */
+#define SECOND_REF_FRAME_MASK \
+ ((1 << ALTREF_FRAME) | (1 << ALTREF2_FRAME) | (1 << BWDREF_FRAME) | \
+ (1 << GOLDEN_FRAME) | (1 << LAST2_FRAME) | 0x01)
+
+#define ANGLE_SKIP_THRESH 10
+
+/* Eight fixed coefficients: four for the vertical direction followed by four
+ * for the horizontal direction. Presumably the weights of a linear (SVM)
+ * classifier used when choosing ADST/flipped transforms - TODO confirm at
+ * the use site.
+ */
+static const double ADST_FLIP_SVM[8] = {
+ /* vertical */
+ -6.6623, -2.8062, -3.2531, 3.1671,
+ /* horizontal */
+ -7.7051, -3.2234, -3.6193, 3.4533
+};
+
+// One candidate of the mode search: a prediction mode together with its
+// reference frame pair. For the single-reference and intra entries of
+// av1_mode_order[] the second element is NONE_FRAME.
+typedef struct {
+ PREDICTION_MODE mode;
+ MV_REFERENCE_FRAME ref_frame[2];
+} MODE_DEFINITION;
+
+// A reference frame pair without an associated prediction mode.
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
+
+// Bit flags that restrict the transform search. Each enumerator is a distinct
+// bit, so values can be OR-ed together; FTXS_NONE means no restriction.
+typedef enum {
+ FTXS_NONE = 0,
+ FTXS_DCT_AND_1D_DCT_ONLY = 1 << 0,
+ FTXS_DISABLE_TRELLIS_OPT = 1 << 1,
+ FTXS_USE_TRANSFORM_DOMAIN = 1 << 2
+} FAST_TX_SEARCH_MODE;
+
+// Forward declaration: luma transform search for a block; results are
+// written to `rd_stats`. `ref_best_rd` is the best RD cost known to the
+// caller.
+// NOTE(review): exact early-termination semantics live in the definition,
+// which is not visible here.
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd);
+
+// Forward declaration: chroma RD evaluation for an inter block. Takes
+// separate best-so-far bounds for the skip and non-skip cases plus the
+// FAST_TX_SEARCH_MODE flags; returns an int status.
+// NOTE(review): the meaning of the return value is defined by the
+// implementation, which is not visible here.
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode);
+
+// State bundle for a per-transform-block RD cost pass.
+// NOTE(review): the field notes below are inferred from names and types; the
+// code that fills and reads them is outside this view.
+struct rdcost_block_args {
+ const AV1_COMP *cpi;  // encoder instance (read-only)
+ MACROBLOCK *x;  // macroblock being evaluated
+ ENTROPY_CONTEXT t_above[MAX_MIB_SIZE];  // above entropy contexts
+ ENTROPY_CONTEXT t_left[MAX_MIB_SIZE];  // left entropy contexts
+ RD_STATS rd_stats;  // accumulated rate/distortion statistics
+ int64_t this_rd;  // running RD cost
+ int64_t best_rd;  // best RD cost to beat
+ int exit_early;  // flag: stop processing further blocks
+ int incomplete_exit;  // flag: pass ended before all blocks were costed
+ int use_fast_coef_costing;  // flag: use the fast coefficient cost path
+ FAST_TX_SEARCH_MODE ftxs_mode;  // transform search restrictions
+};
+
+#define LAST_NEW_MV_INDEX 6
+// Order in which (prediction mode, reference frame pair) candidates are laid
+// out for the mode search. The position of an entry in this array is its
+// "mode index". The index ranges noted on each group below are counted from
+// the initializer order; any table that hard-codes positions in this array
+// has to be updated whenever entries are added, removed or moved.
+static const MODE_DEFINITION av1_mode_order[MAX_MODES] = {
+ // Indices 0-6: NEARESTMV, single reference.
+ { NEARESTMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARESTMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARESTMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARESTMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ // Indices 7-13: NEWMV, single reference.
+ { NEWMV, { LAST_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEWMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEWMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEWMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEWMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ // Indices 14-20: NEARMV, single reference.
+ { NEARMV, { LAST_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST2_FRAME, NONE_FRAME } },
+ { NEARMV, { LAST3_FRAME, NONE_FRAME } },
+ { NEARMV, { BWDREF_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { NEARMV, { ALTREF_FRAME, NONE_FRAME } },
+ { NEARMV, { GOLDEN_FRAME, NONE_FRAME } },
+
+ // Indices 21-27: GLOBALMV, single reference. Note that GOLDEN comes before
+ // ALTREF here, unlike the three groups above.
+ { GLOBALMV, { LAST_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { LAST3_FRAME, NONE_FRAME } },
+ { GLOBALMV, { BWDREF_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF2_FRAME, NONE_FRAME } },
+ { GLOBALMV, { GOLDEN_FRAME, NONE_FRAME } },
+ { GLOBALMV, { ALTREF_FRAME, NONE_FRAME } },
+
+ // TODO(zoeliu): May need to reconsider the order on the modes to check
+
+ // Indices 28-43: NEAREST_NEARESTMV for all 16 compound reference pairs.
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // Indices 44-155: the remaining seven compound modes, seven entries per
+ // reference pair, always in the order NEAR_NEARMV, NEW_NEARESTMV,
+ // NEAREST_NEWMV, NEW_NEARMV, NEAR_NEWMV, NEW_NEWMV, GLOBAL_GLOBALMV.
+
+ // Indices 44-50: LAST + ALTREF.
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF_FRAME } },
+
+ // Indices 51-57: LAST2 + ALTREF.
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF_FRAME } },
+
+ // Indices 58-64: LAST3 + ALTREF.
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF_FRAME } },
+
+ // Indices 65-71: GOLDEN + ALTREF.
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF_FRAME } },
+
+ // Indices 72-78: LAST + BWDREF.
+ { NEAR_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, BWDREF_FRAME } },
+
+ // Indices 79-85: LAST2 + BWDREF.
+ { NEAR_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, BWDREF_FRAME } },
+
+ // Indices 86-92: LAST3 + BWDREF.
+ { NEAR_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, BWDREF_FRAME } },
+
+ // Indices 93-99: GOLDEN + BWDREF.
+ { NEAR_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, BWDREF_FRAME } },
+
+ // Indices 100-106: LAST + ALTREF2.
+ { NEAR_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, ALTREF2_FRAME } },
+
+ // Indices 107-113: LAST2 + ALTREF2.
+ { NEAR_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST2_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST2_FRAME, ALTREF2_FRAME } },
+
+ // Indices 114-120: LAST3 + ALTREF2.
+ { NEAR_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { LAST3_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST3_FRAME, ALTREF2_FRAME } },
+
+ // Indices 121-127: GOLDEN + ALTREF2.
+ { NEAR_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARESTMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAREST_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEARMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEAR_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { NEW_NEWMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+ { GLOBAL_GLOBALMV, { GOLDEN_FRAME, ALTREF2_FRAME } },
+
+ // Indices 128-134: LAST + LAST2.
+ { NEAR_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST2_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST2_FRAME } },
+
+ // Indices 135-141: LAST + LAST3.
+ { NEAR_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, LAST3_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, LAST3_FRAME } },
+
+ // Indices 142-148: LAST + GOLDEN.
+ { NEAR_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARESTMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAREST_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEARMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEAR_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { NEW_NEWMV, { LAST_FRAME, GOLDEN_FRAME } },
+ { GLOBAL_GLOBALMV, { LAST_FRAME, GOLDEN_FRAME } },
+
+ // Indices 149-155: BWDREF + ALTREF.
+ { NEAR_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARESTMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAREST_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEARMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEAR_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { NEW_NEWMV, { BWDREF_FRAME, ALTREF_FRAME } },
+ { GLOBAL_GLOBALMV, { BWDREF_FRAME, ALTREF_FRAME } },
+
+ // intra modes (indices 156-168)
+ { DC_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { PAETH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { SMOOTH_H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { H_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { V_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D135_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D203_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D157_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D67_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D113_PRED, { INTRA_FRAME, NONE_FRAME } },
+ { D45_PRED, { INTRA_FRAME, NONE_FRAME } },
+};
+
+// Reverse lookup tables for av1_mode_order[]: each value is the position in
+// av1_mode_order[] of the entry with the given prediction mode and reference
+// frame(s), i.e.
+//   av1_mode_order[idx].mode == mode and av1_mode_order[idx].ref_frame == refs.
+// -1 marks a combination that has no entry in av1_mode_order[].
+//
+// These numbers are positional: whenever av1_mode_order[] is reordered they
+// must be regenerated. They were recomputed here because the previous values
+// described a layout with intra modes interleaved between the inter groups
+// (e.g. DC_PRED at 7), whereas av1_mode_order[] lists all intra modes last
+// (DC_PRED is entry 156 and entry 7 is NEWMV/LAST_FRAME).
+// NOTE(review): any other table indexed by the same mode index should be
+// checked against the same layout.
+
+// Intra modes, indexed by (mode - INTRA_MODE_START).
+static const int16_t intra_to_mode_idx[INTRA_MODE_NUM] = {
+  156,  // DC_PRED,
+  162,  // V_PRED,
+  161,  // H_PRED,
+  168,  // D45_PRED,
+  163,  // D135_PRED,
+  167,  // D113_PRED,
+  165,  // D157_PRED,
+  164,  // D203_PRED,
+  166,  // D67_PRED,
+  158,  // SMOOTH_PRED,
+  159,  // SMOOTH_V_PRED,
+  160,  // SMOOTH_H_PRED,
+  157,  // PAETH_PRED,
+};
+
+// Single-reference inter modes, indexed by
+// [mode - SINGLE_INTER_MODE_START][ref_frame].
+/* clang-format off */
+static const int16_t single_inter_to_mode_idx[SINGLE_INTER_MODE_NUM]
+                                             [REF_FRAMES] = {
+  // NEARESTMV,
+  { -1, 0, 1, 2, 6, 3, 4, 5, },
+  // NEARMV,
+  { -1, 14, 15, 16, 20, 17, 18, 19, },
+  // GLOBALMV,
+  { -1, 21, 22, 23, 26, 24, 25, 27, },
+  // NEWMV,
+  { -1, 7, 8, 9, 13, 10, 11, 12, },
+};
+/* clang-format on */
+
+// Compound inter modes, indexed by
+// [mode - COMP_INTER_MODE_START][ref_frame][second_ref_frame].
+/* clang-format off */
+static const int16_t comp_inter_to_mode_idx[COMP_INTER_MODE_NUM][REF_FRAMES]
+                                           [REF_FRAMES] = {
+  // NEAREST_NEARESTMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 40, 41, 42, 32, 36, 28, },
+    { -1, -1, -1, -1, -1, 33, 37, 29, },
+    { -1, -1, -1, -1, -1, 34, 38, 30, },
+    { -1, -1, -1, -1, -1, 35, 39, 31, },
+    { -1, -1, -1, -1, -1, -1, -1, 43, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAR_NEARMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 128, 135, 142, 72, 100, 44, },
+    { -1, -1, -1, -1, -1, 79, 107, 51, },
+    { -1, -1, -1, -1, -1, 86, 114, 58, },
+    { -1, -1, -1, -1, -1, 93, 121, 65, },
+    { -1, -1, -1, -1, -1, -1, -1, 149, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAREST_NEWMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 130, 137, 144, 74, 102, 46, },
+    { -1, -1, -1, -1, -1, 81, 109, 53, },
+    { -1, -1, -1, -1, -1, 88, 116, 60, },
+    { -1, -1, -1, -1, -1, 95, 123, 67, },
+    { -1, -1, -1, -1, -1, -1, -1, 151, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEARESTMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 129, 136, 143, 73, 101, 45, },
+    { -1, -1, -1, -1, -1, 80, 108, 52, },
+    { -1, -1, -1, -1, -1, 87, 115, 59, },
+    { -1, -1, -1, -1, -1, 94, 122, 66, },
+    { -1, -1, -1, -1, -1, -1, -1, 150, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEAR_NEWMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 132, 139, 146, 76, 104, 48, },
+    { -1, -1, -1, -1, -1, 83, 111, 55, },
+    { -1, -1, -1, -1, -1, 90, 118, 62, },
+    { -1, -1, -1, -1, -1, 97, 125, 69, },
+    { -1, -1, -1, -1, -1, -1, -1, 153, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEARMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 131, 138, 145, 75, 103, 47, },
+    { -1, -1, -1, -1, -1, 82, 110, 54, },
+    { -1, -1, -1, -1, -1, 89, 117, 61, },
+    { -1, -1, -1, -1, -1, 96, 124, 68, },
+    { -1, -1, -1, -1, -1, -1, -1, 152, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // GLOBAL_GLOBALMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 134, 141, 148, 78, 106, 50, },
+    { -1, -1, -1, -1, -1, 85, 113, 57, },
+    { -1, -1, -1, -1, -1, 92, 120, 64, },
+    { -1, -1, -1, -1, -1, 99, 127, 71, },
+    { -1, -1, -1, -1, -1, -1, -1, 155, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+  // NEW_NEWMV,
+  {
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, 133, 140, 147, 77, 105, 49, },
+    { -1, -1, -1, -1, -1, 84, 112, 56, },
+    { -1, -1, -1, -1, -1, 91, 119, 63, },
+    { -1, -1, -1, -1, -1, 98, 126, 70, },
+    { -1, -1, -1, -1, -1, -1, -1, 154, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+    { -1, -1, -1, -1, -1, -1, -1, -1, },
+  },
+};
+/* clang-format on */
+
+// Looks up the mode index recorded for a prediction mode and its reference
+// frame(s) in intra_to_mode_idx[], single_inter_to_mode_idx[] or
+// comp_inter_to_mode_idx[], depending on which range `this_mode` falls in.
+//
+// Intra modes require ref_frame == INTRA_FRAME and
+// second_ref_frame == NONE_FRAME; inter modes require each used reference to
+// lie in (INTRA_FRAME, ALTREF_FRAME]. These preconditions are only asserted.
+//
+// Returns the table value (which may be -1 for an unlisted combination), or
+// -1 after assert(0) when the mode belongs to none of the three ranges.
+static int get_prediction_mode_idx(PREDICTION_MODE this_mode,
+                                   MV_REFERENCE_FRAME ref_frame,
+                                   MV_REFERENCE_FRAME second_ref_frame) {
+  int mode_idx = -1;
+
+  if (this_mode < INTRA_MODE_END) {
+    assert(ref_frame == INTRA_FRAME);
+    assert(second_ref_frame == NONE_FRAME);
+    mode_idx = intra_to_mode_idx[this_mode - INTRA_MODE_START];
+  } else if (this_mode >= SINGLE_INTER_MODE_START &&
+             this_mode < SINGLE_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    const int mode_row = this_mode - SINGLE_INTER_MODE_START;
+    mode_idx = single_inter_to_mode_idx[mode_row][ref_frame];
+  } else if (this_mode >= COMP_INTER_MODE_START &&
+             this_mode < COMP_INTER_MODE_END) {
+    assert((ref_frame > INTRA_FRAME) && (ref_frame <= ALTREF_FRAME));
+    assert((second_ref_frame > INTRA_FRAME) &&
+           (second_ref_frame <= ALTREF_FRAME));
+    const int mode_row = this_mode - COMP_INTER_MODE_START;
+    mode_idx = comp_inter_to_mode_idx[mode_row][ref_frame][second_ref_frame];
+  } else {
+    // Not a known prediction mode.
+    assert(0);
+  }
+
+  return mode_idx;
+}
+
+// Order in which luma intra prediction modes are listed for the RD search:
+// DC first, then H/V, the smooth/Paeth family, and the directional modes
+// last. Contains each of the INTRA_MODES modes exactly once.
+static const PREDICTION_MODE intra_rd_search_mode_order[INTRA_MODES] = {
+ DC_PRED, H_PRED, V_PRED, SMOOTH_PRED, PAETH_PRED,
+ SMOOTH_V_PRED, SMOOTH_H_PRED, D135_PRED, D203_PRED, D157_PRED,
+ D67_PRED, D113_PRED, D45_PRED,
+};
+
+// Chroma counterpart of the table above. It follows the same order except
+// that UV_CFL_PRED is inserted right after UV_DC_PRED.
+static const UV_PREDICTION_MODE uv_rd_search_mode_order[UV_INTRA_MODES] = {
+ UV_DC_PRED, UV_CFL_PRED, UV_H_PRED, UV_V_PRED,
+ UV_SMOOTH_PRED, UV_PAETH_PRED, UV_SMOOTH_V_PRED, UV_SMOOTH_H_PRED,
+ UV_D135_PRED, UV_D203_PRED, UV_D157_PRED, UV_D67_PRED,
+ UV_D113_PRED, UV_D45_PRED,
+};
+
+// Result of evaluating one single-reference inter mode: its RD cost, the
+// reference frame it used, and a validity flag.
+typedef struct SingleInterModeState {
+ int64_t rd;
+ MV_REFERENCE_FRAME ref_frame;
+ int valid;
+} SingleInterModeState;
+
+// Working state carried through the inter mode search of one block.
+// NOTE(review): the per-field notes are inferred from names, types and array
+// dimensions only; the code that maintains them is outside this view.
+typedef struct InterModeSearchState {
+ // Best candidate found so far and its rate/skip details.
+ int64_t best_rd;
+ MB_MODE_INFO best_mbmode;
+ int best_rate_y;
+ int best_rate_uv;
+ int best_mode_skippable;
+ int best_skip2;
+ int best_mode_index;
+ int skip_intra_modes;
+ int num_available_refs;
+ // Per reference frame values (REF_FRAMES entries each).
+ int64_t dist_refs[REF_FRAMES];
+ int dist_order_refs[REF_FRAMES];
+ // One threshold per mode index (MAX_MODES entries).
+ int64_t mode_threshold[MAX_MODES];
+ PREDICTION_MODE best_intra_mode;
+ int64_t best_intra_rd;
+ int angle_stats_ready;
+ uint8_t directional_mode_skip_mask[INTRA_MODES];
+ unsigned int best_pred_sse;
+ // Chroma intra results, one slot per transform size (TX_SIZES_ALL).
+ int rate_uv_intra[TX_SIZES_ALL];
+ int rate_uv_tokenonly[TX_SIZES_ALL];
+ int64_t dist_uvs[TX_SIZES_ALL];
+ int skip_uvs[TX_SIZES_ALL];
+ UV_PREDICTION_MODE mode_uv[TX_SIZES_ALL];
+ PALETTE_MODE_INFO pmi_uv[TX_SIZES_ALL];
+ int8_t uv_angle_delta[TX_SIZES_ALL];
+ // One slot per reference mode (REFERENCE_MODES).
+ int64_t best_pred_rd[REFERENCE_MODES];
+ int64_t best_pred_diff[REFERENCE_MODES];
+ // Save a set of single_newmv for each checked ref_mv.
+ int_mv single_newmv[MAX_REF_MV_SERCH][REF_FRAMES];
+ int single_newmv_rate[MAX_REF_MV_SERCH][REF_FRAMES];
+ int single_newmv_valid[MAX_REF_MV_SERCH][REF_FRAMES];
+ // Indexed by [prediction mode][ref_mv index][reference frame].
+ int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+ // The rd of simple translation in single inter modes
+ int64_t simple_rd[MB_MODE_COUNT][MAX_REF_MV_SERCH][REF_FRAMES];
+
+ // Single search results by [directions][modes][reference frames]
+ SingleInterModeState single_state[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+ // Number of entries stored in single_state for each [direction][mode].
+ int single_state_cnt[2][SINGLE_INTER_MODE_NUM];
+ // Same layout as single_state/single_state_cnt, for the modelled results.
+ SingleInterModeState single_state_modelled[2][SINGLE_INTER_MODE_NUM]
+ [FWD_REFS];
+ int single_state_modelled_cnt[2][SINGLE_INTER_MODE_NUM];
+
+ // Reference frames per [direction][mode], in ranked order.
+ MV_REFERENCE_FRAME single_rd_order[2][SINGLE_INTER_MODE_NUM][FWD_REFS];
+} InterModeSearchState;
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+// Returns a small positive id for the block sizes that take part in
+// inter-mode RD modelling (8x8 -> 1, 16x16 -> 2, 32x32 -> 3) and -1 for
+// every other block size.
+int inter_mode_data_block_idx(BLOCK_SIZE bsize) {
+  switch (bsize) {
+    case BLOCK_8X8: return 1;
+    case BLOCK_16X16: return 2;
+    case BLOCK_32X32: return 3;
+    default: return -1;
+  }
+}
+
+// Resets the inter-mode RD model of every block size in `tile_data`: marks it
+// as not ready and clears the sample count and all running sums. The fitted
+// parameters and the means are left untouched.
+void av1_inter_mode_data_init(TileDataEnc *tile_data) {
+  int bsize = 0;
+  while (bsize < BLOCK_SIZES_ALL) {
+    InterModeRdModel *const model = &tile_data->inter_mode_rd_models[bsize];
+    model->ready = 0;
+    model->num = 0;
+    model->dist_sum = 0;
+    model->ld_sum = 0;
+    model->sse_sum = 0;
+    model->sse_sse_sum = 0;
+    model->sse_ld_sum = 0;
+    ++bsize;
+  }
+}
+
+// Estimates the residue rate and the distortion of a block from its SSE using
+// the fitted model for `bsize`.
+//
+// Returns 0 without touching the outputs when the model is not ready yet.
+// Otherwise returns 1 and:
+//   - if sse is below the model's mean distortion: rate 0, distortion = sse;
+//   - else: distortion = round(dist_mean) and
+//     rate = round((sse - dist_mean) / (a * sse + b)).
+static int get_est_rate_dist(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                             int64_t sse, int *est_residue_cost,
+                             int64_t *est_dist) {
+  aom_clear_system_state();
+  const InterModeRdModel *const model = &tile_data->inter_mode_rd_models[bsize];
+  if (!model->ready) return 0;
+
+  if (sse < model->dist_mean) {
+    // Nothing to gain from coding a residue.
+    *est_residue_cost = 0;
+    *est_dist = sse;
+    return 1;
+  }
+
+  // Modelled distortion reduction per unit of rate at this SSE.
+  const double est_ld = model->a * sse + model->b;
+  *est_residue_cost = (int)round((sse - model->dist_mean) / est_ld);
+  *est_dist = (int64_t)round(model->dist_mean);
+  return 1;
+}
+
+// Returns the estimated RD cost of a block: the modelled residue rate is added
+// to `curr_cost` and combined with the modelled distortion via RDCOST().
+// Returns 0 when no fitted model is available for `bsize`.
+static int64_t get_est_rd(TileDataEnc *tile_data, BLOCK_SIZE bsize, int rdmult,
+                          int64_t sse, int curr_cost) {
+  int residue_cost;
+  int64_t dist;
+  if (!get_est_rate_dist(tile_data, bsize, sse, &residue_cost, &dist)) {
+    return 0;
+  }
+  const int total_rate = residue_cost + curr_cost;
+  const int64_t rd = RDCOST(rdmult, total_rate, dist);
+  return rd;
+}
+
+// Refits the linear model `ld = a * sse + b` for every block size that has
+// collected enough samples, then clears the sample accumulators.
+//
+// A model that has never been fitted needs at least 200 samples and takes the
+// batch means directly. A model that is already ready needs at least 64
+// samples and blends the previous means (weight 3) with the batch means
+// (weight 1). `rdmult` is unused.
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult) {
+  (void)rdmult;
+  aom_clear_system_state();
+
+  for (int bsize = 0; bsize < BLOCK_SIZES_ALL; ++bsize) {
+    // Only block sizes with a model slot are fitted.
+    if (inter_mode_data_block_idx(bsize) == -1) continue;
+
+    InterModeRdModel *const model = &tile_data->inter_mode_rd_models[bsize];
+    if ((model->ready == 0 && model->num < 200) ||
+        (model->ready == 1 && model->num < 64)) {
+      continue;  // Not enough samples yet.
+    }
+
+    if (model->ready != 0) {
+      const double factor = 3;
+      model->dist_mean =
+          (model->dist_mean * factor + (model->dist_sum / model->num)) /
+          (factor + 1);
+      model->ld_mean =
+          (model->ld_mean * factor + (model->ld_sum / model->num)) /
+          (factor + 1);
+      model->sse_mean =
+          (model->sse_mean * factor + (model->sse_sum / model->num)) /
+          (factor + 1);
+      model->sse_sse_mean =
+          (model->sse_sse_mean * factor + (model->sse_sse_sum / model->num)) /
+          (factor + 1);
+      model->sse_ld_mean =
+          (model->sse_ld_mean * factor + (model->sse_ld_sum / model->num)) /
+          (factor + 1);
+    } else {
+      model->dist_mean = model->dist_sum / model->num;
+      model->ld_mean = model->ld_sum / model->num;
+      model->sse_mean = model->sse_sum / model->num;
+      model->sse_sse_mean = model->sse_sse_sum / model->num;
+      model->sse_ld_mean = model->sse_ld_sum / model->num;
+    }
+
+    // Least-squares line through (sse, ld):
+    //   a = (E[x*y] - E[x]*E[y]) / (E[x^2] - E[x]^2), b = E[y] - a * E[x].
+    const double mean_ld = model->ld_mean;
+    const double mean_sse = model->sse_mean;
+    const double rms_sse = sqrt(model->sse_sse_mean);
+    const double mean_sse_ld = model->sse_ld_mean;
+
+    model->a = (mean_sse_ld - mean_sse * mean_ld) /
+               (rms_sse * rms_sse - mean_sse * mean_sse);
+    model->b = mean_ld - model->a * mean_sse;
+    model->ready = 1;
+
+    // Start a fresh batch.
+    model->num = 0;
+    model->dist_sum = 0;
+    model->ld_sum = 0;
+    model->sse_sum = 0;
+    model->sse_sse_sum = 0;
+    model->sse_ld_sum = 0;
+  }
+}
+
+// Adds one (sse, dist, residue_cost) observation to the running sums of the
+// model for `bsize`.
+//
+// The sample is dropped when residue_cost is 0, when sse equals dist, when the
+// block size has no model slot, or when the model already holds
+// INTER_MODE_RD_DATA_OVERALL_SIZE samples.
+static void inter_mode_data_push(TileDataEnc *tile_data, BLOCK_SIZE bsize,
+                                 int64_t sse, int64_t dist, int residue_cost) {
+  if (residue_cost == 0 || sse == dist) return;
+  if (inter_mode_data_block_idx(bsize) == -1) return;
+
+  InterModeRdModel *const model = &tile_data->inter_mode_rd_models[bsize];
+  if (model->num >= INTER_MODE_RD_DATA_OVERALL_SIZE) return;
+
+  aom_clear_system_state();
+  // Distortion reduction bought per unit of residue rate.
+  const double ld = (sse - dist) * 1. / residue_cost;
+  model->num += 1;
+  model->dist_sum += dist;
+  model->ld_sum += ld;
+  model->sse_sum += sse;
+  model->sse_sse_sum += sse * sse;
+  model->sse_ld_sum += sse * ld;
+}
+
+// Appends one candidate (mode info copy, mode rate, SSE, estimated RD) to
+// `inter_modes_info`. The caller must ensure fewer than MAX_INTER_MODES
+// entries are stored; this is only asserted.
+static void inter_modes_info_push(InterModesInfo *inter_modes_info,
+                                  int mode_rate, int64_t sse, int64_t est_rd,
+                                  const MB_MODE_INFO *mbmi) {
+  const int slot = inter_modes_info->num;
+  assert(slot < MAX_INTER_MODES);
+
+  inter_modes_info->mbmi_arr[slot] = *mbmi;
+  inter_modes_info->mode_rate_arr[slot] = mode_rate;
+  inter_modes_info->sse_arr[slot] = sse;
+  inter_modes_info->est_rd_arr[slot] = est_rd;
+  inter_modes_info->num = slot + 1;
+}
+
+// qsort() comparator that orders RdIdxPair elements by ascending `rd`.
+// Returns 0 for equal costs, 1 when `a` is more expensive than `b`, and -1
+// otherwise. The arguments are only read, so they are accessed through
+// const-qualified pointers throughout (the previous equality test cast the
+// qualifier away).
+static int compare_rd_idx_pair(const void *a, const void *b) {
+  const RdIdxPair *const lhs = (const RdIdxPair *)a;
+  const RdIdxPair *const rhs = (const RdIdxPair *)b;
+
+  if (lhs->rd == rhs->rd) return 0;
+  return (lhs->rd > rhs->rd) ? 1 : -1;
+}
+
+// Fills `rd_idx_pair_arr` with one (index, estimated rd) pair per stored
+// candidate and sorts the pairs by ascending estimated rd. Does nothing when
+// no candidate is stored. `rd_idx_pair_arr` must have room for
+// inter_modes_info->num elements.
+static void inter_modes_info_sort(const InterModesInfo *inter_modes_info,
+                                  RdIdxPair *rd_idx_pair_arr) {
+  const int count = inter_modes_info->num;
+  if (count == 0) return;
+
+  for (int k = 0; k < count; ++k) {
+    RdIdxPair *const pair = &rd_idx_pair_arr[k];
+    pair->idx = k;
+    pair->rd = inter_modes_info->est_rd_arr[k];
+  }
+  qsort(rd_idx_pair_arr, count, sizeof(rd_idx_pair_arr[0]),
+        compare_rd_idx_pair);
+}
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+
+// Cost of coding symbol `v` out of `n` with a near-uniform code: with
+// l = get_unsigned_bits(n), the first (1 << l) - n symbols cost l - 1 literal
+// bits and the remaining ones cost l. Returns 0 when l is 0.
+static INLINE int write_uniform_cost(int n, int v) {
+  const int bits = get_unsigned_bits(n);
+  if (bits == 0) return 0;
+
+  const int num_short_codes = (1 << bits) - n;
+  const int code_len = (v < num_short_codes) ? bits - 1 : bits;
+  return av1_cost_literal(code_len);
+}
+
+// RDO-time variant of store_cfl_required(): decides whether the luma pixels
+// must be stored for CfL before it is known whether this block will use CfL.
+//
+// - Monochrome sequences, or blocks with chroma RD skipped: CFL_DISALLOWED.
+// - Blocks that are not chroma references: always CFL_ALLOWED, because the
+//   corresponding chroma-reference block may still choose CfL. This can only
+//   happen for block sizes which are <8 on their shortest side, as otherwise
+//   they would be chroma reference blocks.
+// - Chroma reference blocks: store iff CfL may be tried (is_cfl_allowed()).
+static INLINE CFL_ALLOWED_TYPE store_cfl_required_rdo(const AV1_COMMON *cm,
+                                                      const MACROBLOCK *x) {
+  if (cm->seq_params.monochrome || x->skip_chroma_rd) return CFL_DISALLOWED;
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (xd->cfl.is_chroma_reference) return is_cfl_allowed(xd);
+  return CFL_ALLOWED;
+}
+
+// constants for prune 1 and prune 2 decision boundaries
+// Each measure has a midpoint (*_MID) and a margin (*_MARGIN).
+// NOTE(review): CORR/EDST presumably refer to the correlation and
+// energy-distribution measures of the transform pruning code - confirm at the
+// point of use.
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
+
+// Forward declaration: luma RD evaluation for an inter block, bounded by
+// `ref_best_rd` and restricted by the FAST_TX_SEARCH_MODE flags. Results go to
+// `rd_stats`.
+// NOTE(review): the meaning of the int return value is defined by the
+// implementation, which is not visible here.
+static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode);
+
+// Sum of squared errors between `src` and `dst`, restricted to the visible
+// part of a transform block.
+//
+// When the whole block is visible the per-block-size variance function is used
+// and its SSE output returned. Otherwise the SSE is accumulated over the
+// visible_cols x visible_rows area only; for high-bitdepth buffers that SSE is
+// scaled down by a rounded right shift of 2 * (bd - 8).
+static unsigned pixel_dist_visible_only(
+    const AV1_COMP *const cpi, const MACROBLOCK *x, const uint8_t *src,
+    const int src_stride, const uint8_t *dst, const int dst_stride,
+    const BLOCK_SIZE tx_bsize, int txb_rows, int txb_cols, int visible_rows,
+    int visible_cols) {
+  const int fully_visible =
+      (txb_rows == visible_rows) && (txb_cols == visible_cols);
+  if (fully_visible) {
+    unsigned full_sse;
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &full_sse);
+    return full_sse;
+  }
+
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  if (!(xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)) {
+    const unsigned partial_sse = aom_sse_odd_size(
+        src, src_stride, dst, dst_stride, visible_cols, visible_rows);
+    return partial_sse;
+  }
+
+  const uint64_t hbd_sse = aom_highbd_sse_odd_size(
+      src, src_stride, dst, dst_stride, visible_cols, visible_rows);
+  return (unsigned int)ROUND_POWER_OF_TWO(hbd_sse, (xd->bd - 8) * 2);
+}
+
+#if CONFIG_DIST_8X8
+// CDEF-style perceptual distortion between two 8x8 blocks of 16-bit samples.
+//
+// dst/dstride and src/sstride address the two blocks (strides in samples).
+// coeff_shift scales the two stabilising constants: c1 is shifted left by
+// 2 * coeff_shift and c2 by 4 * coeff_shift.
+//
+// The result is the SSE weighted by (svar + dvar + c1) / sqrt(svar*dvar + c2),
+// halved, rounded, and finally multiplied by 0.75.
+//
+// Samples are widened to uint64_t before they are multiplied. A product of two
+// uint16_t operands is otherwise computed in (signed) int and overflows for
+// sample values above 46340.
+static uint64_t cdef_dist_8x8_16bit(uint16_t *dst, int dstride, uint16_t *src,
+                                    int sstride, int coeff_shift) {
+  uint64_t sum_s = 0;
+  uint64_t sum_d = 0;
+  uint64_t sum_s2 = 0;
+  uint64_t sum_d2 = 0;
+  uint64_t sum_sd = 0;
+
+  for (int i = 0; i < 8; i++) {
+    for (int j = 0; j < 8; j++) {
+      const uint64_t s = src[i * sstride + j];
+      const uint64_t d = dst[i * dstride + j];
+      sum_s += s;
+      sum_d += d;
+      sum_s2 += s * s;
+      sum_d2 += d * d;
+      sum_sd += s * d;
+    }
+  }
+  /* Compute the variance -- the calculation cannot go negative. */
+  const uint64_t svar = sum_s2 - ((sum_s * sum_s + 32) >> 6);
+  const uint64_t dvar = sum_d2 - ((sum_d * sum_d + 32) >> 6);
+
+  // Tuning of jm's original dering distortion metric used in CDEF tool,
+  // suggested by jm
+  const uint64_t a = 4;
+  const uint64_t b = 2;
+  const uint64_t c1 = (400 * a << 2 * coeff_shift);
+  const uint64_t c2 = (b * 20000 * a * a << 4 * coeff_shift);
+
+  uint64_t dist = (uint64_t)floor(.5 + (sum_d2 + sum_s2 - 2 * sum_sd) * .5 *
+                                           (svar + dvar + c1) /
+                                           (sqrt(svar * (double)dvar + c2)));
+
+  // Calibrate dist to have similar rate for the same QP with MSE only
+  // distortion (as in master branch)
+  dist = (uint64_t)((float)dist * 0.75);
+
+  return dist;
+}
+
+// Integer variance estimate of the 4x4 block at `x` (row pitch `stride`, in
+// samples): (sum of squares - ((sum * sum) >> 4)) >> 4.
+static int od_compute_var_4x4(uint16_t *x, int stride) {
+  int total = 0;
+  int total_sq = 0;
+
+  for (int row = 0; row < 4; row++) {
+    const uint16_t *const line = x + row * stride;
+    for (int col = 0; col < 4; col++) {
+      const int px = line[col];
+      total += px;
+      total_sq += px * px;
+    }
+  }
+
+  return (total_sq - (total * total >> 4)) >> 4;
+}
+
+/* OD_DIST_LP_MID controls the frequency weighting filter used for computing
+ the distortion. For a value X, the filter is [1 X 1]/(X + 2) and
+ is applied both horizontally and vertically. For X=5, the filter is
+ a good approximation for the OD_QM8_Q4_HVS quantization matrix.
+ OD_DIST_LP_NORM is the sum of the filter taps (X + 2), i.e. the gain of one
+ 1-D pass that has to be divided out afterwards. */
+#define OD_DIST_LP_MID (5)
+#define OD_DIST_LP_NORM (OD_DIST_LP_MID + 2)
+
+// Perceptual distortion of one 8x8 block.
+//
+// x, y:  the two 8x8 pixel blocks being compared.
+// e_lp:  the low-pass filtered (unnormalised) error for the same 8x8 block.
+// stride: row pitch, in elements, shared by x, y and e_lp.
+// use_activity_masking: selects which variance statistic and calibration
+//        constant drive the activity weight.
+//
+// Returns activity^2 * (normalised filtered error energy + variance
+// difference term).
+static double od_compute_dist_8x8(int use_activity_masking, uint16_t *x,
+ uint16_t *y, od_coeff *e_lp, int stride) {
+ double sum;
+ int min_var;
+ double mean_var;
+ double var_stat;
+ double activity;
+ double calibration;
+ int i;
+ int j;
+ double vardist;
+
+ vardist = 0;
+
+#if 1
+ min_var = INT_MAX;
+ mean_var = 0;
+ // Visit the nine overlapping 4x4 windows of the 8x8 block (step of 2
+ // pixels in each direction).
+ for (i = 0; i < 3; i++) {
+ for (j = 0; j < 3; j++) {
+ int varx;
+ int vary;
+ varx = od_compute_var_4x4(x + 2 * i * stride + 2 * j, stride);
+ vary = od_compute_var_4x4(y + 2 * i * stride + 2 * j, stride);
+ min_var = OD_MINI(min_var, varx);
+ // Accumulates sum(1 / (1 + varx)); turned into a harmonic-style mean below.
+ mean_var += 1. / (1 + varx);
+ /* The cast to (double) is to avoid an overflow before the sqrt.*/
+ // Equals (sqrt(varx) - sqrt(vary))^2.
+ vardist += varx - 2 * sqrt(varx * (double)vary) + vary;
+ }
+ }
+ /* We use a different variance statistic depending on whether activity
+ masking is used, since the harmonic mean appeared slightly worse with
+ masking off. The calibration constant just ensures that we preserve the
+ rate compared to activity=1. */
+ if (use_activity_masking) {
+ calibration = 1.95;
+ var_stat = 9. / mean_var;
+ } else {
+ calibration = 1.62;
+ var_stat = min_var;
+ }
+ /* 1.62 is a calibration constant, 0.25 is a noise floor and 1/6 is the
+ activity masking constant. */
+ activity = calibration * pow(.25 + var_stat, -1. / 6);
+#else
+ activity = 1;
+#endif // 1
+ // Energy of the filtered error over the 8x8 block.
+ sum = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++)
+ sum += e_lp[i * stride + j] * (double)e_lp[i * stride + j];
+ }
+ /* Normalize the filter to unit DC response. */
+ // Two 1-D passes, squared: NORM^4.
+ sum *= 1. / (OD_DIST_LP_NORM * OD_DIST_LP_NORM * OD_DIST_LP_NORM *
+ OD_DIST_LP_NORM);
+ return activity * activity * (sum + vardist);
+}
+
+// Note : Inputs x and y are in a pixel domain
+//
+// Shared tail of od_compute_dist() and od_compute_dist_diff().
+//
+// tmp:  error after the horizontal [1 mid 1] pass (bsize_w x bsize_h, packed
+//       with row pitch bsize_w).
+// e_lp: output buffer of the same layout; receives the vertically filtered
+//       error.
+// x, y: pixel blocks with row pitch bsize_w.
+//
+// Applies the vertical filter pass, sums od_compute_dist_8x8() over every 8x8
+// tile, then scales the total by a factor that depends on qindex.
+static double od_compute_dist_common(int activity_masking, uint16_t *x,
+ uint16_t *y, int bsize_w, int bsize_h,
+ int qindex, od_coeff *tmp,
+ od_coeff *e_lp) {
+ int i, j;
+ double sum = 0;
+ const int mid = OD_DIST_LP_MID;
+
+ // First and last rows: only one vertical neighbour exists, so it is
+ // counted twice.
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[j] = mid * tmp[j] + 2 * tmp[bsize_w + j];
+ e_lp[(bsize_h - 1) * bsize_w + j] = mid * tmp[(bsize_h - 1) * bsize_w + j] +
+ 2 * tmp[(bsize_h - 2) * bsize_w + j];
+ }
+ // Interior rows: regular [1 mid 1] vertical filter.
+ for (i = 1; i < bsize_h - 1; i++) {
+ for (j = 0; j < bsize_w; j++) {
+ e_lp[i * bsize_w + j] = mid * tmp[i * bsize_w + j] +
+ tmp[(i - 1) * bsize_w + j] +
+ tmp[(i + 1) * bsize_w + j];
+ }
+ }
+ // Accumulate the distortion tile by tile.
+ for (i = 0; i < bsize_h; i += 8) {
+ for (j = 0; j < bsize_w; j += 8) {
+ sum += od_compute_dist_8x8(activity_masking, &x[i * bsize_w + j],
+ &y[i * bsize_w + j], &e_lp[i * bsize_w + j],
+ bsize_w);
+ }
+ }
+ /* Scale according to linear regression against SSE, for 8x8 blocks. */
+ if (activity_masking) {
+ sum *= 2.2 + (1.7 - 2.2) * (qindex - 99) / (210 - 99) +
+ (qindex < 99 ? 2.5 * (qindex - 99) / 99 * (qindex - 99) / 99 : 0);
+ } else {
+ // Piecewise-linear in qindex with break points at 43 and 128.
+ sum *= qindex >= 128
+ ? 1.4 + (0.9 - 1.4) * (qindex - 128) / (209 - 128)
+ : qindex <= 43 ? 1.5 + (2.0 - 1.5) * (qindex - 43) / (16 - 43)
+ : 1.5 + (1.4 - 1.5) * (qindex - 43) / (128 - 43);
+ }
+
+ return sum;
+}
+
+// Daala-style distortion between two pixel blocks `x` and `y`, both packed
+// with row pitch bsize_w. Both dimensions must be at least 8 (asserted).
+//
+// Forms the error x - y, applies the horizontal [1 mid 1] filter (at the left
+// and right edges the single neighbour is counted twice) and hands the result
+// to od_compute_dist_common() with activity masking disabled.
+static double od_compute_dist(uint16_t *x, uint16_t *y, int bsize_w,
+                              int bsize_h, int qindex) {
+  assert(bsize_w >= 8 && bsize_h >= 8);
+
+  const int activity_masking = 0;
+  const int mid = OD_DIST_LP_MID;
+  const int last_col = bsize_w - 1;
+
+  DECLARE_ALIGNED(16, od_coeff, err[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
+
+  for (int row = 0; row < bsize_h; row++) {
+    const int base = row * bsize_w;
+    od_coeff *const in = &err[base];
+    od_coeff *const out = &tmp[base];
+
+    // Error for this row.
+    for (int col = 0; col < bsize_w; col++) {
+      in[col] = x[base + col] - y[base + col];
+    }
+
+    // Horizontal filter for this row; it only depends on the row's own error.
+    out[0] = mid * in[0] + 2 * in[1];
+    out[last_col] = mid * in[last_col] + 2 * in[last_col - 1];
+    for (int col = 1; col < last_col; col++) {
+      out[col] = mid * in[col] + in[col - 1] + in[col + 1];
+    }
+  }
+
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
+}
+
+// Variant of od_compute_dist() that takes the error signal `e` instead of the
+// second pixel block. The second block is rebuilt as y = x - e, the horizontal
+// [1 mid 1] filter is applied to `e` (edge neighbours counted twice), and the
+// result goes to od_compute_dist_common() with activity masking disabled.
+// All buffers are packed with row pitch bsize_w; both dimensions must be at
+// least 8 (asserted).
+static double od_compute_dist_diff(uint16_t *x, int16_t *e, int bsize_w,
+                                   int bsize_h, int qindex) {
+  assert(bsize_w >= 8 && bsize_h >= 8);
+
+  const int activity_masking = 0;
+  const int mid = OD_DIST_LP_MID;
+  const int last_col = bsize_w - 1;
+
+  DECLARE_ALIGNED(16, uint16_t, y[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, tmp[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, od_coeff, e_lp[MAX_SB_SQUARE]);
+
+  for (int row = 0; row < bsize_h; row++) {
+    const int base = row * bsize_w;
+    const int16_t *const in = &e[base];
+    od_coeff *const out = &tmp[base];
+
+    // Rebuild the second pixel block for this row.
+    for (int col = 0; col < bsize_w; col++) {
+      y[base + col] = x[base + col] - in[col];
+    }
+
+    // Horizontal filter of the error for this row.
+    out[0] = mid * in[0] + 2 * in[1];
+    out[last_col] = mid * in[last_col] + 2 * in[last_col - 1];
+    for (int col = 1; col < last_col; col++) {
+      out[col] = mid * in[col] + in[col - 1] + in[col + 1];
+    }
+  }
+
+  return od_compute_dist_common(activity_masking, x, y, bsize_w, bsize_h,
+                                qindex, tmp, e_lp);
+}
+
+// Distortion between a source block and its reconstruction, using the metric
+// selected by x->tune_metric.
+//
+// src/dst:  source and reconstructed pixels (8-bit, or 16-bit accessed through
+//           CONVERT_TO_SHORTPTR when the frame is high bitdepth).
+// bsw/bsh:  block width/height; both must be multiples of 8 and at least 8
+//           (asserted).
+// visible_w/visible_h: part of the block that lies inside the frame.
+// tx_bsize: block size used to pick the variance function on the MSE path.
+// qindex:   quantizer index, used by the Daala metric only.
+//
+// For AOM_TUNE_DAALA_DIST and AOM_TUNE_CDEF_DIST both blocks are first copied
+// into packed 16-bit buffers with row pitch bsw. Any other metric falls back
+// to SSE over the visible area.
+int64_t av1_dist_8x8(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
+ int bsh, int visible_w, int visible_h, int qindex) {
+ int64_t d = 0;
+ int i, j;
+ const MACROBLOCKD *xd = &x->e_mbd;
+
+ DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint16_t, rec[MAX_SB_SQUARE]);
+
+ assert(bsw >= 8);
+ assert(bsh >= 8);
+ assert((bsw & 0x07) == 0);
+ assert((bsh & 0x07) == 0);
+
+ if (x->tune_metric == AOM_TUNE_CDEF_DIST ||
+ x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // High bitdepth: samples are already 16-bit.
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ orig[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+
+ if ((bsw == visible_w) && (bsh == visible_h)) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+ } else {
+ // Partially visible block: take the reconstruction inside the visible
+ // area and fill the rest with source pixels, so the invisible part
+ // contributes no error.
+ for (j = 0; j < visible_h; j++)
+ for (i = 0; i < visible_w; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(dst)[j * dst_stride + i];
+
+ if (visible_w < bsw) {
+ for (j = 0; j < bsh; j++)
+ for (i = visible_w; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+ }
+
+ if (visible_h < bsh) {
+ for (j = visible_h; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = CONVERT_TO_SHORTPTR(src)[j * src_stride + i];
+ }
+ }
+ } else {
+ // 8-bit: same copies as above, widening each sample to 16 bits.
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) orig[j * bsw + i] = src[j * src_stride + i];
+
+ if ((bsw == visible_w) && (bsh == visible_h)) {
+ for (j = 0; j < bsh; j++)
+ for (i = 0; i < bsw; i++) rec[j * bsw + i] = dst[j * dst_stride + i];
+ } else {
+ for (j = 0; j < visible_h; j++)
+ for (i = 0; i < visible_w; i++)
+ rec[j * bsw + i] = dst[j * dst_stride + i];
+
+ if (visible_w < bsw) {
+ for (j = 0; j < bsh; j++)
+ for (i = visible_w; i < bsw; i++)
+ rec[j * bsw + i] = src[j * src_stride + i];
+ }
+
+ if (visible_h < bsh) {
+ for (j = visible_h; j < bsh; j++)
+ for (i = 0; i < bsw; i++)
+ rec[j * bsw + i] = src[j * src_stride + i];
+ }
+ }
+ }
+ }
+
+ if (x->tune_metric == AOM_TUNE_DAALA_DIST) {
+ d = (int64_t)od_compute_dist(orig, rec, bsw, bsh, qindex);
+ } else if (x->tune_metric == AOM_TUNE_CDEF_DIST) {
+ int coeff_shift = AOMMAX(xd->bd - 8, 0);
+
+ // Sum the CDEF metric over every 8x8 tile of the block.
+ for (i = 0; i < bsh; i += 8) {
+ for (j = 0; j < bsw; j += 8) {
+ d += cdef_dist_8x8_16bit(&rec[i * bsw + j], bsw, &orig[i * bsw + j],
+ bsw, coeff_shift);
+ }
+ }
+ // Bring the high-bitdepth result back to the 8-bit scale.
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ d = ((uint64_t)d) >> 2 * coeff_shift;
+ } else {
+ // Otherwise, MSE by default
+ d = pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+ tx_bsize, bsh, bsw, visible_h, visible_w);
+ }
+
+ return d;
+}
+
+ // Same metric selection as av1_dist_8x8(), but the reconstruction is given
+ // implicitly as a residual (diff) relative to the source:
+ //   AOM_TUNE_DAALA_DIST - od_compute_dist_diff() on source and residual.
+ //   AOM_TUNE_CDEF_DIST  - cdef_dist_8x8_16bit() per 8x8 tile on
+ //                         (source - residual) versus source.
+ //   anything else       - sum of squares of the visible residual.
+ // The residual is treated as zero outside the visible_w x visible_h area.
+ // bsw and bsh must be at least 8 and multiples of 8 (asserted).
+ static int64_t dist_8x8_diff(const MACROBLOCK *x, const uint8_t *src,
+                              int src_stride, const int16_t *diff,
+                              int diff_stride, int bsw, int bsh, int visible_w,
+                              int visible_h, int qindex) {
+   const MACROBLOCKD *xd = &x->e_mbd;
+   const int use_daala = x->tune_metric == AOM_TUNE_DAALA_DIST;
+   const int use_cdef = x->tune_metric == AOM_TUNE_CDEF_DIST;
+
+   DECLARE_ALIGNED(16, uint16_t, orig[MAX_SB_SQUARE]);
+   DECLARE_ALIGNED(16, int16_t, diff16[MAX_SB_SQUARE]);
+
+   assert(bsw >= 8);
+   assert(bsh >= 8);
+   assert((bsw & 0x07) == 0);
+   assert((bsh & 0x07) == 0);
+
+   if (!use_daala && !use_cdef) {
+     // Otherwise, MSE by default
+     return aom_sum_squares_2d_i16(diff, diff_stride, visible_w, visible_h);
+   }
+
+   // Widen the source block into orig[] (row stride bsw).
+   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
+     for (int r = 0; r < bsh; ++r) {
+       for (int c = 0; c < bsw; ++c) {
+         orig[r * bsw + c] = src16[r * src_stride + c];
+       }
+     }
+   } else {
+     for (int r = 0; r < bsh; ++r) {
+       for (int c = 0; c < bsw; ++c) {
+         orig[r * bsw + c] = src[r * src_stride + c];
+       }
+     }
+   }
+
+   // Copy the visible residual; everything outside the visible area is zero.
+   for (int r = 0; r < bsh; ++r) {
+     for (int c = 0; c < bsw; ++c) {
+       const int is_visible = r < visible_h && c < visible_w;
+       diff16[r * bsw + c] = is_visible ? diff[r * diff_stride + c] : 0;
+     }
+   }
+
+   if (use_daala) {
+     return (int64_t)od_compute_dist_diff(orig, diff16, bsw, bsh, qindex);
+   }
+
+   // AOM_TUNE_CDEF_DIST: rebuild the reconstruction, then sum per-8x8 results.
+   const int coeff_shift = AOMMAX(xd->bd - 8, 0);
+   DECLARE_ALIGNED(16, uint16_t, dst16[MAX_SB_SQUARE]);
+   int64_t d = 0;
+
+   for (int k = 0; k < bsh * bsw; ++k) {
+     dst16[k] = orig[k] - diff16[k];
+   }
+
+   for (int r = 0; r < bsh; r += 8) {
+     for (int c = 0; c < bsw; c += 8) {
+       d += cdef_dist_8x8_16bit(&dst16[r * bsw + c], bsw, &orig[r * bsw + c],
+                                bsw, coeff_shift);
+     }
+   }
+   // Don't scale 'd' for HBD since it will be done by caller side for diff
+   // input
+   return d;
+ }
+#endif // CONFIG_DIST_8X8
+
+static void get_energy_distribution_fine(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+ const uint8_t *src, int src_stride,
+ const uint8_t *dst, int dst_stride,
+ int need_4th, double *hordist,
+ double *verdist) {
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ unsigned int esq[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ if (bsize < BLOCK_16X16 || (bsize >= BLOCK_4X16 && bsize <= BLOCK_32X8)) {
+ // Special cases: calculate 'esq' values manually, as we don't have 'vf'
+ // functions for the 16 (very small) sub-blocks of this block.
+ const int w_shift = (bw == 4) ? 0 : (bw == 8) ? 1 : (bw == 16) ? 2 : 3;
+ const int h_shift = (bh == 4) ? 0 : (bh == 8) ? 1 : (bh == 16) ? 2 : 3;
+ assert(bw <= 32);
+ assert(bh <= 32);
+ assert(((bw - 1) >> w_shift) + (((bh - 1) >> h_shift) << 2) == 15);
+ if (cpi->common.seq_params.use_highbitdepth) {
+ const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] +=
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]) *
+ (src16[j + i * src_stride] - dst16[j + i * dst_stride]);
+ }
+ } else {
+ for (int i = 0; i < bh; ++i)
+ for (int j = 0; j < bw; ++j) {
+ const int index = (j >> w_shift) + ((i >> h_shift) << 2);
+ esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+ (src[j + i * src_stride] - dst[j + i * dst_stride]);
+ }
+ }
+ } else { // Calculate 'esq' values using 'vf' functions on the 16 sub-blocks.
+ const int f_index =
+ (bsize < BLOCK_SIZES) ? bsize - BLOCK_16X16 : bsize - BLOCK_8X16;
+ assert(f_index >= 0 && f_index < BLOCK_SIZES_ALL);
+ const BLOCK_SIZE subsize = (BLOCK_SIZE)f_index;
+ assert(block_size_wide[bsize] == 4 * block_size_wide[subsize]);
+ assert(block_size_high[bsize] == 4 * block_size_high[subsize]);
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[0]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[1]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[2]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[3]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[4]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[5]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[6]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[7]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[8]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[9]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[10]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[11]);
+ src += bh / 4 * src_stride;
+ dst += bh / 4 * dst_stride;
+
+ cpi->fn_ptr[subsize].vf(src, src_stride, dst, dst_stride, &esq[12]);
+ cpi->fn_ptr[subsize].vf(src + bw / 4, src_stride, dst + bw / 4, dst_stride,
+ &esq[13]);
+ cpi->fn_ptr[subsize].vf(src + bw / 2, src_stride, dst + bw / 2, dst_stride,
+ &esq[14]);
+ cpi->fn_ptr[subsize].vf(src + 3 * bw / 4, src_stride, dst + 3 * bw / 4,
+ dst_stride, &esq[15]);
+ }
+
+ double total = (double)esq[0] + esq[1] + esq[2] + esq[3] + esq[4] + esq[5] +
+ esq[6] + esq[7] + esq[8] + esq[9] + esq[10] + esq[11] +
+ esq[12] + esq[13] + esq[14] + esq[15];
+ if (total > 0) {
+ const double e_recip = 1.0 / total;
+ hordist[0] = ((double)esq[0] + esq[4] + esq[8] + esq[12]) * e_recip;
+ hordist[1] = ((double)esq[1] + esq[5] + esq[9] + esq[13]) * e_recip;
+ hordist[2] = ((double)esq[2] + esq[6] + esq[10] + esq[14]) * e_recip;
+ if (need_4th) {
+ hordist[3] = ((double)esq[3] + esq[7] + esq[11] + esq[15]) * e_recip;
+ }
+ verdist[0] = ((double)esq[0] + esq[1] + esq[2] + esq[3]) * e_recip;
+ verdist[1] = ((double)esq[4] + esq[5] + esq[6] + esq[7]) * e_recip;
+ verdist[2] = ((double)esq[8] + esq[9] + esq[10] + esq[11]) * e_recip;
+ if (need_4th) {
+ verdist[3] = ((double)esq[12] + esq[13] + esq[14] + esq[15]) * e_recip;
+ }
+ } else {
+ hordist[0] = verdist[0] = 0.25;
+ hordist[1] = verdist[1] = 0.25;
+ hordist[2] = verdist[2] = 0.25;
+ if (need_4th) {
+ hordist[3] = verdist[3] = 0.25;
+ }
+ }
+}
+
+ // Builds a pruning bitmask choosing between ADST and FLIPADST from the energy
+ // distribution of the prediction error. The first three vertical/horizontal
+ // distribution values are projected with the ADST_FLIP_SVM weights; a
+ // projection above MID + MARGIN sets the FLIPADST_1D bit, one below
+ // MID - MARGIN sets the ADST_1D bit. Vertical decisions use bits 0..7,
+ // horizontal decisions the same bit positions shifted left by 8.
+ static int adst_vs_flipadst(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                             const uint8_t *src, int src_stride,
+                             const uint8_t *dst, int dst_stride) {
+   double hdist[3] = { 0, 0, 0 };
+   double vdist[3] = { 0, 0, 0 };
+   get_energy_distribution_fine(cpi, bsize, src, src_stride, dst, dst_stride, 0,
+                                hdist, vdist);
+
+   const double proj_v = vdist[0] * ADST_FLIP_SVM[0] +
+                         vdist[1] * ADST_FLIP_SVM[1] +
+                         vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+   const double proj_h = hdist[0] * ADST_FLIP_SVM[4] +
+                         hdist[1] * ADST_FLIP_SVM[5] +
+                         hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+
+   int mask = 0;
+   if (proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) {
+     mask |= 1 << FLIPADST_1D;
+   } else if (proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) {
+     mask |= 1 << ADST_1D;
+   }
+
+   if (proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN) {
+     mask |= 1 << (FLIPADST_1D + 8);
+   } else if (proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN) {
+     mask |= 1 << (ADST_1D + 8);
+   }
+
+   return mask;
+ }
+
+ // Computes the correlation coefficient between each residual sample and its
+ // left neighbour (*hcorr) and its top neighbour (*vcorr), over all samples
+ // that have both neighbours (rows 1..h-1, columns 1..w-1).
+ // Negative coefficients are clamped to 0. When either variance involved is
+ // not positive the corresponding output stays at 1.
+ // Requires (h - 1) * (w - 1) > 0 (asserted).
+ static void get_horver_correlation(const int16_t *diff, int stride, int w,
+                                    int h, double *hcorr, double *vcorr) {
+   const int num = (h - 1) * (w - 1);
+   // x: current sample, y: left neighbour, z: top neighbour.
+   int64_t sum_x = 0, sum_y = 0, sum_z = 0;
+   int64_t sum_xx = 0, sum_yy = 0, sum_zz = 0;
+   int64_t sum_xy = 0, sum_xz = 0;
+
+   *hcorr = 1;
+   *vcorr = 1;
+
+   assert(num > 0);
+   const double num_r = 1.0 / num;
+
+   for (int r = 1; r < h; ++r) {
+     const int16_t *const cur = diff + r * stride;
+     const int16_t *const above = diff + (r - 1) * stride;
+     for (int c = 1; c < w; ++c) {
+       const int x = cur[c];
+       const int y = cur[c - 1];
+       const int z = above[c];
+       sum_x += x;
+       sum_y += y;
+       sum_z += z;
+       sum_xx += x * x;
+       sum_yy += y * y;
+       sum_zz += z * z;
+       sum_xy += x * y;
+       sum_xz += x * z;
+     }
+   }
+
+   // (Co)variances scaled by the sample count.
+   const double x_var_n = sum_xx - (sum_x * sum_x) * num_r;
+   const double y_var_n = sum_yy - (sum_y * sum_y) * num_r;
+   const double z_var_n = sum_zz - (sum_z * sum_z) * num_r;
+   const double xy_var_n = sum_xy - (sum_x * sum_y) * num_r;
+   const double xz_var_n = sum_xz - (sum_x * sum_z) * num_r;
+
+   if (x_var_n > 0 && y_var_n > 0) {
+     const double corr = xy_var_n / sqrt(x_var_n * y_var_n);
+     *hcorr = corr < 0 ? 0 : corr;
+   }
+   if (x_var_n > 0 && z_var_n > 0) {
+     const double corr = xz_var_n / sqrt(x_var_n * z_var_n);
+     *vcorr = corr < 0 ? 0 : corr;
+   }
+ }
+
+ // Builds a pruning bitmask choosing between DCT and IDTX from the residual's
+ // neighbour correlation (see get_horver_correlation). A correlation above
+ // MID + MARGIN sets the IDTX_1D bit, one below MID - MARGIN sets the DCT_1D
+ // bit. The vertical decision uses bits 0..7, the horizontal decision the same
+ // bit positions shifted left by 8.
+ static int dct_vs_idtx(const int16_t *diff, int stride, int w, int h) {
+   double corr_h, corr_v;
+   get_horver_correlation(diff, stride, w, h, &corr_h, &corr_v);
+
+   int mask = 0;
+   if (corr_v > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) {
+     mask |= 1 << IDTX_1D;
+   } else if (corr_v < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) {
+     mask |= 1 << DCT_1D;
+   }
+
+   if (corr_h > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN) {
+     mask |= 1 << (IDTX_1D + 8);
+   } else if (corr_h < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN) {
+     mask |= 1 << (DCT_1D + 8);
+   }
+   return mask;
+ }
+
+ // Performance drop: 0.5%, Speed improvement: 24%
+ // Combines up to two luma (plane 0) pruning tests into one bitmask:
+ //   adst_flipadst != 0 - adds the adst_vs_flipadst() decision on source
+ //                        versus prediction pixels.
+ //   dct_idtx != 0      - recomputes the plane-0 residual with
+ //                        av1_subtract_plane() and adds the dct_vs_idtx()
+ //                        decision on it.
+ static int prune_two_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, const MACROBLOCKD *xd,
+                              int adst_flipadst, int dct_idtx) {
+   int mask = 0;
+
+   if (adst_flipadst) {
+     const struct macroblock_plane *const src_plane = &x->plane[0];
+     const struct macroblockd_plane *const dst_plane = &xd->plane[0];
+     mask |= adst_vs_flipadst(cpi, bsize, src_plane->src.buf,
+                              src_plane->src.stride, dst_plane->dst.buf,
+                              dst_plane->dst.stride);
+   }
+
+   if (dct_idtx) {
+     av1_subtract_plane(x, bsize, 0);
+     const int16_t *const residual = x->plane[0].src_diff;
+     const int width = block_size_wide[bsize];
+     const int height = block_size_high[bsize];
+     // The residual buffer is stored with a row stride equal to the block width.
+     mask |= dct_vs_idtx(residual, width, width, height);
+   }
+
+   return mask;
+ }
+
+ // Performance drop: 0.3%, Speed improvement: 5%
+ // Returns the adst_vs_flipadst() pruning bitmask for the luma plane (plane 0),
+ // comparing the source pixels with the current prediction.
+ static int prune_one_for_sby(const AV1_COMP *cpi, BLOCK_SIZE bsize,
+                              const MACROBLOCK *x, const MACROBLOCKD *xd) {
+   const struct buf_2d *const source = &x->plane[0].src;
+   const struct buf_2d *const pred = &xd->plane[0].dst;
+   return adst_vs_flipadst(cpi, bsize, source->buf, source->stride, pred->buf,
+                           pred->stride);
+ }
+
+ // 1D transforms available in each inter transform set: rows are inter set
+ // indices, columns are 1D TX types. Keep in sync with ext_tx_used_inter.
+ static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+ { 1, 0, 0, 0 },
+ { 1, 1, 1, 1 },
+ { 1, 1, 1, 1 },
+ { 1, 0, 0, 1 },
+ };
+
+ // Computes normalised 1D projections of the residual energy of a bw x bh
+ // block. The squared residual is first accumulated on a grid that is halved
+ // in a dimension when that dimension exceeds 8 (esq_w x esq_h cells).
+ // hordist receives esq_w - 1 values (column sums / total) and verdist
+ // receives esq_h - 1 values (row sums / total); the last column and row are
+ // not output. If the total energy is zero the outputs are the uniform values
+ // 1 / esq_w and 1 / esq_h.
+ static void get_energy_distribution_finer(const int16_t *diff, int stride,
+                                           int bw, int bh, float *hordist,
+                                           float *verdist) {
+   // Downscaled block energy values; downscale factors are defined by w_shift
+   // and h_shift.
+   unsigned int esq[256];
+   const int w_shift = bw <= 8 ? 0 : 1;
+   const int h_shift = bh <= 8 ? 0 : 1;
+   const int esq_w = bw >> w_shift;
+   const int esq_h = bh >> h_shift;
+   const int esq_sz = esq_w * esq_h;
+   const int num_hor = esq_w - 1;
+   const int num_ver = esq_h - 1;
+
+   memset(esq, 0, esq_sz * sizeof(esq[0]));
+   for (int r = 0; r < bh; ++r) {
+     unsigned int *const esq_row = esq + (r >> h_shift) * esq_w;
+     const int16_t *const diff_row = diff + r * stride;
+     for (int c = 0; c < bw; ++c) {
+       esq_row[c >> w_shift] += diff_row[c] * diff_row[c];
+     }
+   }
+
+   uint64_t total = 0;
+   for (int k = 0; k < esq_sz; ++k) total += esq[k];
+
+   if (total == 0) {
+     const float hor_val = 1.0f / esq_w;
+     const float ver_val = 1.0f / esq_h;
+     for (int c = 0; c < num_hor; ++c) hordist[c] = hor_val;
+     for (int r = 0; r < num_ver; ++r) verdist[r] = ver_val;
+     return;
+   }
+
+   const float e_recip = 1.0f / (float)total;
+   memset(hordist, 0, num_hor * sizeof(hordist[0]));
+   memset(verdist, 0, num_ver * sizeof(verdist[0]));
+
+   // Each cell feeds its column sum and its row sum, except that the last
+   // column and the last row have no output slot of their own.
+   for (int r = 0; r < esq_h; ++r) {
+     const unsigned int *const esq_row = esq + r * esq_w;
+     for (int c = 0; c < esq_w; ++c) {
+       const float energy = (float)esq_row[c];
+       if (c < num_hor) hordist[c] += energy;
+       if (r < num_ver) verdist[r] += energy;
+     }
+   }
+
+   for (int c = 0; c < num_hor; ++c) hordist[c] *= e_recip;
+   for (int r = 0; r < num_ver; ++r) verdist[r] *= e_recip;
+ }
+
+ // Similar to get_horver_correlation, but also takes into account first
+ // row/column, when computing horizontal/vertical correlation.
+ // The horizontal statistics use every sample that has a left neighbour
+ // (h * (w - 1) pairs); the vertical statistics use every sample that has a
+ // top neighbour ((h - 1) * w pairs). Negative coefficients are clamped to 0;
+ // an output stays at 1 when either variance involved is not positive.
+ static void get_horver_correlation_full(const int16_t *diff, int stride, int w,
+                                         int h, float *hcorr, float *vcorr) {
+   const float num_hor = (float)(h * (w - 1));
+   const float num_ver = (float)((h - 1) * w);
+
+   // The following notation is used:
+   // x - current pixel
+   // y - left neighbor pixel
+   // z - top neighbor pixel
+   int64_t xy_sum = 0, xz_sum = 0;
+   int64_t xhor_sum = 0, xver_sum = 0, y_sum = 0, z_sum = 0;
+   int64_t x2hor_sum = 0, x2ver_sum = 0, y2_sum = 0, z2_sum = 0;
+
+   for (int r = 0; r < h; ++r) {
+     const int16_t *const cur = diff + r * stride;
+     for (int c = 0; c < w; ++c) {
+       const int16_t x = cur[c];
+       if (c > 0) {
+         // Horizontal pair: current sample and its left neighbour.
+         const int16_t y = cur[c - 1];
+         xy_sum += x * y;
+         xhor_sum += x;
+         y_sum += y;
+         x2hor_sum += x * x;
+         y2_sum += y * y;
+       }
+       if (r > 0) {
+         // Vertical pair: current sample and its top neighbour.
+         const int16_t z = diff[(r - 1) * stride + c];
+         xz_sum += x * z;
+         xver_sum += x;
+         z_sum += z;
+         x2ver_sum += x * x;
+         z2_sum += z * z;
+       }
+     }
+   }
+
+   const float xhor_var_n = x2hor_sum - (xhor_sum * xhor_sum) / num_hor;
+   const float y_var_n = y2_sum - (y_sum * y_sum) / num_hor;
+   const float xy_var_n = xy_sum - (xhor_sum * y_sum) / num_hor;
+   const float xver_var_n = x2ver_sum - (xver_sum * xver_sum) / num_ver;
+   const float z_var_n = z2_sum - (z_sum * z_sum) / num_ver;
+   const float xz_var_n = xz_sum - (xver_sum * z_sum) / num_ver;
+
+   *hcorr = 1;
+   *vcorr = 1;
+   if (xhor_var_n > 0 && y_var_n > 0) {
+     *hcorr = xy_var_n / sqrtf(xhor_var_n * y_var_n);
+     if (*hcorr < 0) *hcorr = 0;
+   }
+   if (xver_var_n > 0 && z_var_n > 0) {
+     *vcorr = xz_var_n / sqrtf(xver_var_n * z_var_n);
+     if (*vcorr < 0) *vcorr = 0;
+   }
+ }
+
+ // Transforms raw scores into a probability distribution across 16 TX types.
+ // Each score is shifted, clamped at zero from below, raised to the 8th power
+ // and finally divided by the sum of all 16 powered values. The array is
+ // updated in place.
+ static void score_2D_transform_pow8(float *scores_2D, float shift) {
+   float total = 0.0f;
+
+   for (int k = 0; k < 16; ++k) {
+     const float base = AOMMAX(scores_2D[k] + shift, 0.0f);
+     const float pow2 = base * base;
+     const float pow4 = pow2 * pow2;
+     scores_2D[k] = pow4 * pow4;
+     total += scores_2D[k];
+   }
+   for (int k = 0; k < 16; ++k) {
+     scores_2D[k] /= total;
+   }
+ }
+
+ // Score thresholds for 2D TX-type pruning, one row per TX size. They were
+ // calibrated so that the threshold at index i prunes i + 1 TX types on
+ // average. A NULL row means no thresholds are provided for that TX size.
+ static const float *prune_2D_adaptive_thresholds[] = {
+ // TX_4X4
+ (float[]){ 0.00549f, 0.01306f, 0.02039f, 0.02747f, 0.03406f, 0.04065f,
+ 0.04724f, 0.05383f, 0.06067f, 0.06799f, 0.07605f, 0.08533f,
+ 0.09778f, 0.11780f },
+ // TX_8X8
+ (float[]){ 0.00037f, 0.00183f, 0.00525f, 0.01038f, 0.01697f, 0.02502f,
+ 0.03381f, 0.04333f, 0.05286f, 0.06287f, 0.07434f, 0.08850f,
+ 0.10803f, 0.14124f },
+ // TX_16X16 (only 10 thresholds, unlike the 14 of the other non-NULL rows)
+ (float[]){ 0.01404f, 0.02820f, 0.04211f, 0.05164f, 0.05798f, 0.06335f,
+ 0.06897f, 0.07629f, 0.08875f, 0.11169f },
+ // TX_32X32 (no thresholds)
+ NULL,
+ // TX_64X64 (no thresholds)
+ NULL,
+ // TX_4X8
+ (float[]){ 0.00183f, 0.00745f, 0.01428f, 0.02185f, 0.02966f, 0.03723f,
+ 0.04456f, 0.05188f, 0.05920f, 0.06702f, 0.07605f, 0.08704f,
+ 0.10168f, 0.12585f },
+ // TX_8X4
+ (float[]){ 0.00085f, 0.00476f, 0.01135f, 0.01892f, 0.02698f, 0.03528f,
+ 0.04358f, 0.05164f, 0.05994f, 0.06848f, 0.07849f, 0.09021f,
+ 0.10583f, 0.13123f },
+ // TX_8X16
+ (float[]){ 0.00037f, 0.00232f, 0.00671f, 0.01257f, 0.01965f, 0.02722f,
+ 0.03552f, 0.04382f, 0.05237f, 0.06189f, 0.07336f, 0.08728f,
+ 0.10730f, 0.14221f },
+ // TX_16X8
+ (float[]){ 0.00061f, 0.00330f, 0.00818f, 0.01453f, 0.02185f, 0.02966f,
+ 0.03772f, 0.04578f, 0.05383f, 0.06262f, 0.07288f, 0.08582f,
+ 0.10339f, 0.13464f },
+ // TX_16X32 (no thresholds)
+ NULL,
+ // TX_32X16 (no thresholds)
+ NULL,
+ // TX_32X64 (no thresholds)
+ NULL,
+ // TX_64X32 (no thresholds)
+ NULL,
+ // TX_4X16
+ (float[]){ 0.00232f, 0.00671f, 0.01257f, 0.01941f, 0.02673f, 0.03430f,
+ 0.04211f, 0.04968f, 0.05750f, 0.06580f, 0.07507f, 0.08655f,
+ 0.10242f, 0.12878f },
+ // TX_16X4
+ (float[]){ 0.00110f, 0.00525f, 0.01208f, 0.01990f, 0.02795f, 0.03601f,
+ 0.04358f, 0.05115f, 0.05896f, 0.06702f, 0.07629f, 0.08752f,
+ 0.10217f, 0.12610f },
+ // TX_8X32 (no thresholds)
+ NULL,
+ // TX_32X8 (no thresholds)
+ NULL,
+ // TX_16X64 (no thresholds)
+ NULL,
+ // TX_64X16 (no thresholds)
+ NULL,
+ };
+
+ // Predicts which 2D TX types can be skipped for one transform block.
+ // Horizontal and vertical features (energy distribution plus one correlation
+ // value each) are computed from the luma residual and fed to two neural
+ // networks; the outer product of their 4 + 4 outputs gives 16 scores, which
+ // are turned into a distribution by score_2D_transform_pow8().
+ // Returns a bitmask with bit t set for every TX type t whose score is below
+ // the threshold selected by tx_size and prune_mode; the best-scoring TX type
+ // that is usable in tx_set_type is never pruned.
+ // Returns 0 (prune nothing) for TX sets other than EXT_TX_SET_ALL16 and
+ // EXT_TX_SET_DTT9_IDTX_1DDCT, or when no model exists for tx_size.
+ static uint16_t prune_tx_2D(MACROBLOCK *x, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int blk_row, int blk_col, TxSetType tx_set_type,
+                             TX_TYPE_PRUNE_MODE prune_mode) {
+   // TX type for score index 4 * v + h (v: vertical net output index,
+   // h: horizontal net output index).
+   static const int tx_type_table_2D[16] = {
+     DCT_DCT,      DCT_ADST,      DCT_FLIPADST,      V_DCT,
+     ADST_DCT,     ADST_ADST,     ADST_FLIPADST,     V_ADST,
+     FLIPADST_DCT, FLIPADST_ADST, FLIPADST_FLIPADST, V_FLIPADST,
+     H_DCT,        H_ADST,        H_FLIPADST,        IDTX
+   };
+   const int is_all16 = tx_set_type == EXT_TX_SET_ALL16;
+   if (!is_all16 && tx_set_type != EXT_TX_SET_DTT9_IDTX_1DDCT) return 0;
+
+   const NN_CONFIG *nn_config_hor = av1_tx_type_nnconfig_map_hor[tx_size];
+   const NN_CONFIG *nn_config_ver = av1_tx_type_nnconfig_map_ver[tx_size];
+   if (!nn_config_hor || !nn_config_ver) return 0;  // Model not established yet.
+
+   aom_clear_system_state();
+   const int bw = tx_size_wide[tx_size];
+   const int bh = tx_size_high[tx_size];
+   const int hfeatures_num = bw <= 8 ? bw : bw / 2;
+   const int vfeatures_num = bh <= 8 ? bh : bh / 2;
+   assert(hfeatures_num <= 16);
+   assert(vfeatures_num <= 16);
+
+   // Features: energy distribution values followed by one correlation value
+   // stored in the last slot.
+   float hfeatures[16], vfeatures[16];
+   const int diff_stride = block_size_wide[bsize];
+   const int16_t *const diff =
+       x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+   get_energy_distribution_finer(diff, diff_stride, bw, bh, hfeatures,
+                                 vfeatures);
+   get_horver_correlation_full(diff, diff_stride, bw, bh,
+                               &hfeatures[hfeatures_num - 1],
+                               &vfeatures[vfeatures_num - 1]);
+
+   float hscores[4], vscores[4];
+   av1_nn_predict(hfeatures, nn_config_hor, hscores);
+   av1_nn_predict(vfeatures, nn_config_ver, vscores);
+
+   // 2D scores are the outer product of the vertical and horizontal scores.
+   float scores_2D[16];
+   float score_2D_average = 0.0f;
+   for (int v = 0; v < 4; ++v) {
+     float *const row = scores_2D + v * 4;
+     float row_sum;
+     row[0] = vscores[v] * hscores[0];
+     row_sum = row[0];
+     for (int hh = 1; hh < 4; ++hh) {
+       row[hh] = vscores[v] * hscores[hh];
+       row_sum += row[hh];
+     }
+     score_2D_average += row_sum;
+   }
+   score_2D_average /= 16;
+
+   // Rows: prune mode relative to PRUNE_2D_ACCURATE; columns: ALL16 set,
+   // DTT9_IDTX_1DDCT set.
+   const int prune_aggr_table[2][2] = { { 6, 4 }, { 10, 7 } };
+   score_2D_transform_pow8(scores_2D,
+                           ((is_all16 ? 10 : 20) - score_2D_average));
+   const int pruning_aggressiveness =
+       prune_aggr_table[prune_mode - PRUNE_2D_ACCURATE][is_all16 ? 0 : 1];
+
+   // Always keep the TX type with the highest score, prune all others with
+   // score below score_thresh.
+   int best_idx = 0;
+   float best_score = 0.0f;
+   for (int k = 0; k < 16; ++k) {
+     if (scores_2D[k] > best_score &&
+         av1_ext_tx_used[tx_set_type][tx_type_table_2D[k]]) {
+       best_score = scores_2D[k];
+       best_idx = k;
+     }
+   }
+
+   const float score_thresh =
+       prune_2D_adaptive_thresholds[tx_size][pruning_aggressiveness - 1];
+
+   uint16_t prune_bitmask = 0;
+   for (int k = 0; k < 16; ++k) {
+     if (k == best_idx) continue;
+     if (scores_2D[k] < score_thresh)
+       prune_bitmask |= (1 << tx_type_table_2D[k]);
+   }
+   return prune_bitmask;
+ }
+
+ // Per-TX-type bitmask of ((prune >> vtx_tab[tx_type]) & 1), for each 4-bit vertical prune value.
+ static const uint16_t prune_v_mask[] = {
+ 0x0000, 0x0425, 0x108a, 0x14af, 0x4150, 0x4575, 0x51da, 0x55ff,
+ 0xaa00, 0xae25, 0xba8a, 0xbeaf, 0xeb50, 0xef75, 0xfbda, 0xffff,
+ };
+
+ // Per-TX-type bitmask of ((prune >> (htx_tab[tx_type] + 8)) & 1), for each 4-bit horizontal prune value.
+ static const uint16_t prune_h_mask[] = {
+ 0x0000, 0x0813, 0x210c, 0x291f, 0x80e0, 0x88f3, 0xa1ec, 0xa9ff,
+ 0x5600, 0x5e13, 0x770c, 0x7f1f, 0xd6e0, 0xdef3, 0xf7ec, 0xffff,
+ };
+
+ // Converts a packed 1D prune value (vertical bits in 0..3, horizontal bits in
+ // 8..11) into a 16-bit per-TX-type mask by looking both nibbles up in
+ // prune_v_mask / prune_h_mask and ANDing the two entries.
+ // NOTE(review): because the entries are ANDed, a zero nibble on either axis
+ // (table entry 0x0000) yields an empty mask - confirm this is intended.
+ static INLINE uint16_t gen_tx_search_prune_mask(int tx_search_prune) {
+   const uint8_t v_bits = tx_search_prune & 0x0F;
+   const uint8_t h_bits = (tx_search_prune >> 8) & 0x0F;
+   const uint16_t v_mask = prune_v_mask[v_bits];
+   const uint16_t h_mask = prune_h_mask[h_bits];
+   return (v_mask & h_mask);
+ }
+
+ // Resets and, for the 1D prune modes, recomputes the TX-type prune mask
+ // x->tx_search_prune[tx_set_type] for the current block; also clears
+ // x->tx_split_prune_flag.
+ // The mask is left at 0 for intra blocks, lossless segments, when pruning is
+ // disabled, when the default inter TX type is forced, during a partition
+ // scan, for the 2D prune modes (nothing is computed here for them), and when
+ // the TX set lacks the transform pairs a 1D test would decide between.
+ static void prune_tx(const AV1_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x,
+                      const MACROBLOCKD *const xd, int tx_set_type) {
+   x->tx_search_prune[tx_set_type] = 0;
+   x->tx_split_prune_flag = 0;
+
+   const MB_MODE_INFO *mbmi = xd->mi[0];
+   if (!is_inter_block(mbmi)) return;
+   if (cpi->sf.tx_type_search.prune_mode == NO_PRUNE) return;
+   if (x->use_default_inter_tx_type) return;
+   if (xd->lossless[mbmi->segment_id]) return;
+   if (x->cb_partition_scan) return;
+
+   const int tx_set = ext_tx_set_index[1][tx_set_type];
+   assert(tx_set >= 0);
+   const int *const tx_set_1D = ext_tx_used_inter_1D[tx_set];
+   // Non-zero when the set contains both transforms of the respective pair.
+   const int has_adst_pair = tx_set_1D[FLIPADST_1D] & tx_set_1D[ADST_1D];
+   const int has_dct_idtx_pair = tx_set_1D[DCT_1D] & tx_set_1D[IDTX_1D];
+   int prune;
+
+   switch (cpi->sf.tx_type_search.prune_mode) {
+     case NO_PRUNE: return;
+     case PRUNE_ONE:
+       if (!has_adst_pair) return;
+       prune = prune_one_for_sby(cpi, bsize, x, xd);
+       x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
+       break;
+     case PRUNE_TWO:
+       if (!has_adst_pair && !has_dct_idtx_pair) return;
+       // Run only the tests whose transform pair is present in the set.
+       prune = prune_two_for_sby(cpi, bsize, x, xd, has_adst_pair ? 1 : 0,
+                                 has_dct_idtx_pair ? 1 : 0);
+       x->tx_search_prune[tx_set_type] = gen_tx_search_prune_mask(prune);
+       break;
+     case PRUNE_2D_ACCURATE:
+     case PRUNE_2D_FAST: break;
+     default: assert(0);
+   }
+ }
+
+static void model_rd_from_sse(const AV1_COMP *const cpi,
+ const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+ int plane, int64_t sse, int num_samples,
+ int *rate, int64_t *dist) {
+ (void)num_samples;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+
+ // Fast approximate the modelling function.
+ if (cpi->sf.simple_model_rd_from_var) {
+ const int64_t square_error = sse;
+ int quantizer = pd->dequant_Q3[1] >> dequant_shift;
+ if (quantizer < 120)
+ *rate = (int)AOMMIN(
+ (square_error * (280 - quantizer)) >> (16 - AV1_PROB_COST_SHIFT),
+ INT_MAX);
+ else
+ *rate = 0;
+ assert(*rate >= 0);
+ *dist = (square_error * quantizer) >> 8;
+ } else {
+ av1_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[plane_bsize],
+ pd->dequant_Q3[1] >> dequant_shift, rate,
+ dist);
+ }
+ *dist <<= 4;
+}
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ // Sums the 'vf' SSE between source and prediction over all planes of the
+ // current block (chroma planes are skipped when x->skip_chroma_rd is set) and
+ // returns the sum shifted left by 4.
+ static int64_t get_sse(const AV1_COMP *cpi, const MACROBLOCK *x) {
+   const MACROBLOCKD *xd = &x->e_mbd;
+   const MB_MODE_INFO *mbmi = xd->mi[0];
+   const int num_planes = av1_num_planes(&cpi->common);
+   int64_t total_sse = 0;
+
+   for (int plane = 0; plane < num_planes; ++plane) {
+     const struct macroblock_plane *const p = &x->plane[plane];
+     const struct macroblockd_plane *const pd = &xd->plane[plane];
+     const BLOCK_SIZE bs = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                                pd->subsampling_y);
+     if (plane && x->skip_chroma_rd) continue;
+
+     unsigned int plane_sse;
+     cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                        &plane_sse);
+     total_sse += plane_sse;
+   }
+   return total_sse << 4;
+ }
+#endif
+
+ // Model-based rate/distortion estimate for planes plane_from..plane_to of a
+ // block. For each plane the SSE between source and prediction is computed,
+ // scaled down by 2 * (bd - 8) bits with rounding, and passed to
+ // model_rd_from_sse().
+ // Outputs:
+ //   *out_rate_sum, *out_dist_sum - totals over the processed planes (the
+ //                                  rate is capped at INT_MAX).
+ //   *skip_txfm_sb (optional)     - 1 when the total SSE is zero.
+ //   *skip_sse_sb (optional)      - total SSE shifted left by 4.
+ //   plane_rate/plane_sse/plane_dist (optional) - per-plane values.
+ // Also stores the luma SSE in x->pred_sse[] for the first reference frame.
+ // Chroma planes are skipped when x->skip_chroma_rd is set. mi_row and mi_col
+ // are unused.
+ static void model_rd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                             MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                             int plane_to, int mi_row, int mi_col,
+                             int *out_rate_sum, int64_t *out_dist_sum,
+                             int *skip_txfm_sb, int64_t *skip_sse_sb,
+                             int *plane_rate, int64_t *plane_sse,
+                             int64_t *plane_dist) {
+   // Note our transform coeffs are 8 times an orthogonal transform.
+   // Hence quantizer step is also 8 times. To get effective quantizer
+   // we need to divide by 8 before sending to modeling function.
+   (void)mi_row;
+   (void)mi_col;
+   const int ref = xd->mi[0]->ref_frame[0];
+
+   int64_t rate_total = 0;
+   int64_t dist_total = 0;
+   int64_t sse_total = 0;
+
+   for (int plane = plane_from; plane <= plane_to; ++plane) {
+     struct macroblock_plane *const p = &x->plane[plane];
+     struct macroblockd_plane *const pd = &xd->plane[plane];
+     const BLOCK_SIZE plane_bsize =
+         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+     const int bw = block_size_wide[plane_bsize];
+     const int bh = block_size_high[plane_bsize];
+
+     if (plane && x->skip_chroma_rd) continue;
+
+     int64_t sse;
+     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+       sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                            pd->dst.stride, bw, bh);
+     } else {
+       sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                     bh);
+     }
+     sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+     int rate;
+     int64_t dist;
+     model_rd_from_sse(cpi, x, plane_bsize, plane, sse, bw * bh, &rate, &dist);
+
+     if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+     sse_total += sse;
+     rate_total += rate;
+     dist_total += dist;
+     if (plane_rate) plane_rate[plane] = rate;
+     if (plane_sse) plane_sse[plane] = sse;
+     if (plane_dist) plane_dist[plane] = dist;
+     assert(rate_total >= 0);
+   }
+
+   if (skip_txfm_sb) *skip_txfm_sb = sse_total == 0;
+   if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+   *out_rate_sum = (int)AOMMIN(rate_total, INT_MAX);
+   *out_dist_sum = dist_total;
+ }
+
+ // Sets *skip_txfm_sb to 1 when the prediction matches the source exactly on
+ // every plane in plane_from..plane_to, and to 0 as soon as one plane has a
+ // non-zero SSE. Chroma planes are skipped when x->skip_chroma_rd is set.
+ static void check_block_skip(const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+                              MACROBLOCK *x, MACROBLOCKD *xd, int plane_from,
+                              int plane_to, int *skip_txfm_sb) {
+   const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+
+   *skip_txfm_sb = 1;
+   for (int plane = plane_from; plane <= plane_to; ++plane) {
+     struct macroblock_plane *const p = &x->plane[plane];
+     struct macroblockd_plane *const pd = &xd->plane[plane];
+     const BLOCK_SIZE bs =
+         get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+     if (plane && x->skip_chroma_rd) continue;
+
+     // Since fast HBD variance functions scale down sse by 4 bit, we first use
+     // fast vf implementation to rule out blocks with non-zero scaled sse. Then,
+     // only if the source is HBD and the scaled sse is 0, accurate sse
+     // computation is applied to determine if the sse is really 0. This step is
+     // necessary for HBD lossless coding.
+     unsigned int fast_sse;
+     cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                        &fast_sse);
+     int has_error = fast_sse != 0;
+     if (!has_error && is_hbd) {
+       const uint64_t exact_sse = aom_highbd_sse_odd_size(
+           p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+           block_size_wide[bs], block_size_high[bs]);
+       has_error = exact_sse != 0;
+     }
+
+     if (has_error) {
+       *skip_txfm_sb = 0;
+       return;
+     }
+   }
+ }
+
+ // Returns the sum of squared differences between the original (coeff) and
+ // dequantized (dqcoeff) transform coefficients of a block of block_size
+ // entries, and stores the sum of squared original coefficients in *ssz.
+ // All products are formed in 64-bit arithmetic: squaring in plain int would
+ // overflow (undefined behaviour) once a coefficient or difference exceeds
+ // 46340 in magnitude. This matches av1_highbd_block_error_c().
+ int64_t av1_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                           intptr_t block_size, int64_t *ssz) {
+   int64_t error = 0, sqcoeff = 0;
+
+   for (intptr_t i = 0; i < block_size; i++) {
+     const int64_t c = coeff[i];
+     const int64_t diff = c - dqcoeff[i];
+     error += diff * diff;
+     sqcoeff += c * c;
+   }
+
+   *ssz = sqcoeff;
+   return error;
+ }
+
+ // High-bit-depth variant of av1_block_error_c(): returns the sum of squared
+ // coefficient differences and stores the sum of squared original
+ // coefficients in *ssz, both scaled down by 2 * (bd - 8) bits with rounding.
+ int64_t av1_highbd_block_error_c(const tran_low_t *coeff,
+                                  const tran_low_t *dqcoeff, intptr_t block_size,
+                                  int64_t *ssz, int bd) {
+   const int shift = 2 * (bd - 8);
+   const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+   int64_t sum_err = 0;
+   int64_t sum_sq = 0;
+
+   for (int k = 0; k < block_size; k++) {
+     const int64_t delta = coeff[k] - dqcoeff[k];
+     sum_err += delta * delta;
+     sum_sq += (int64_t)coeff[k] * (int64_t)coeff[k];
+   }
+   assert(sum_err >= 0 && sum_sq >= 0);
+
+   *ssz = (sum_sq + rounding) >> shift;
+   return (sum_err + rounding) >> shift;
+ }
+
+ // Get transform block visible dimensions cropped to the MI units.
+ // *width / *height (both optional, may be NULL) receive the full transform
+ // block size; *visible_width / *visible_height receive the part of it that
+ // lies inside the frame edges given by xd->mb_to_right_edge and
+ // xd->mb_to_bottom_edge, clamped to [0, full size]. blk_row and blk_col are
+ // the block position in units of 1 << tx_size_wide_log2[0] samples.
+ static void get_txb_dimensions(const MACROBLOCKD *xd, int plane,
+                                BLOCK_SIZE plane_bsize, int blk_row, int blk_col,
+                                BLOCK_SIZE tx_bsize, int *width, int *height,
+                                int *visible_width, int *visible_height) {
+   assert(tx_bsize <= plane_bsize);
+   const struct macroblockd_plane *const pd = &xd->plane[plane];
+   const int txb_w = block_size_wide[tx_bsize];
+   const int txb_h = block_size_high[tx_bsize];
+   const int tx_unit_size = tx_size_wide_log2[0];
+
+   // Number of rows/columns of the plane block that are inside the frame.
+   // TODO(aconverse@google.com): Investigate using crop_width/height here rather
+   // than the MI size
+   int rows_in_frame = block_size_high[plane_bsize];
+   if (xd->mb_to_bottom_edge < 0) {
+     rows_in_frame += xd->mb_to_bottom_edge >> (3 + pd->subsampling_y);
+   }
+   int cols_in_frame = block_size_wide[plane_bsize];
+   if (xd->mb_to_right_edge < 0) {
+     cols_in_frame += xd->mb_to_right_edge >> (3 + pd->subsampling_x);
+   }
+
+   if (width) *width = txb_w;
+   if (height) *height = txb_h;
+   *visible_width = clamp(cols_in_frame - (blk_col << tx_unit_size), 0, txb_w);
+   *visible_height = clamp(rows_in_frame - (blk_row << tx_unit_size), 0, txb_h);
+ }
+
+ // Compute the pixel domain distortion from src and dst on all visible 4x4s in
+ // the transform block.
+ // With CONFIG_DIST_8X8, luma blocks use av1_dist_8x8() when
+ // x->using_dist_8x8 is set; otherwise pixel_dist_visible_only() is used.
+ static unsigned pixel_dist(const AV1_COMP *const cpi, const MACROBLOCK *x,
+                            int plane, const uint8_t *src, const int src_stride,
+                            const uint8_t *dst, const int dst_stride,
+                            int blk_row, int blk_col,
+                            const BLOCK_SIZE plane_bsize,
+                            const BLOCK_SIZE tx_bsize) {
+   const MACROBLOCKD *xd = &x->e_mbd;
+   int full_rows, full_cols, vis_rows, vis_cols;
+
+   get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize,
+                      &full_cols, &full_rows, &vis_cols, &vis_rows);
+   assert(vis_rows > 0);
+   assert(vis_cols > 0);
+
+ #if CONFIG_DIST_8X8
+   if (x->using_dist_8x8 && plane == 0) {
+     return (unsigned)av1_dist_8x8(cpi, x, src, src_stride, dst, dst_stride,
+                                   tx_bsize, full_cols, full_rows, vis_cols,
+                                   vis_rows, x->qindex);
+   }
+ #endif  // CONFIG_DIST_8X8
+
+   return pixel_dist_visible_only(cpi, x, src, src_stride, dst, dst_stride,
+                                  tx_bsize, full_rows, full_cols, vis_rows,
+                                  vis_cols);
+ }
+
+// Compute the pixel domain distortion from diff on all visible 4x4s in the
+// transform block.
+//
+// The residual is read from x->plane[plane].src_diff, whose row stride is the
+// width of plane_bsize. blk_row/blk_col are scaled by tx_size_wide_log2[0] to
+// get the offset of the transform block inside that buffer. The default path
+// returns the sum of squared residuals over the visible area only.
+static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
+ int blk_row, int blk_col,
+ const BLOCK_SIZE plane_bsize,
+ const BLOCK_SIZE tx_bsize) {
+ int visible_rows, visible_cols;
+ const MACROBLOCKD *xd = &x->e_mbd;
+ // Full transform dimensions are not needed here, hence the NULL outputs.
+ get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+ NULL, &visible_cols, &visible_rows);
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *diff = x->plane[plane].src_diff;
+#if CONFIG_DIST_8X8
+ int txb_height = block_size_high[tx_bsize];
+ int txb_width = block_size_wide[tx_bsize];
+ // Luma with dist-8x8 enabled: delegate to dist_8x8_diff(), which also takes
+ // the source pixels in addition to the residual.
+ if (x->using_dist_8x8 && plane == 0) {
+ const int src_stride = x->plane[plane].src.stride;
+ const int src_idx = (blk_row * src_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const int diff_idx = (blk_row * diff_stride + blk_col)
+ << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ return dist_8x8_diff(x, src, src_stride, diff + diff_idx, diff_stride,
+ txb_width, txb_height, visible_cols, visible_rows,
+ x->qindex);
+ }
+#endif
+ diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
+ return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
+}
+
+// Builds a histogram of the 8-bit samples in a rows x cols region of src and
+// returns how many distinct sample values occur.
+//
+// val_count must have room for 256 entries; it is cleared first and on return
+// holds the number of occurrences of each value.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+                     int *val_count) {
+  const int max_pix_val = 1 << 8;
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+
+  int num_colors = 0;
+  for (int r = 0; r < rows; ++r) {
+    const int row_offset = r * stride;
+    for (int c = 0; c < cols; ++c) {
+      const int pix = src[row_offset + c];
+      assert(pix < max_pix_val);
+      // A value is a new color the first time its bin becomes non-zero.
+      if (val_count[pix]++ == 0) ++num_colors;
+    }
+  }
+  return num_colors;
+}
+
+// High bit depth counterpart of av1_count_colors(): src8 is converted with
+// CONVERT_TO_SHORTPTR and read as 16-bit samples.
+//
+// val_count must have room for (1 << bit_depth) entries; it is cleared first.
+// Returns the number of distinct sample values, or 0 as soon as a sample that
+// does not fit in bit_depth bits is found (val_count is then left partially
+// filled).
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+                            int bit_depth, int *val_count) {
+  assert(bit_depth <= 12);
+  const int max_pix_val = 1 << bit_depth;
+  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  memset(val_count, 0, max_pix_val * sizeof(val_count[0]));
+
+  int num_colors = 0;
+  for (int r = 0; r < rows; ++r) {
+    const int row_offset = r * stride;
+    for (int c = 0; c < cols; ++c) {
+      const int pix = src[row_offset + c];
+      assert(pix < max_pix_val);
+      // Release builds have no assert; refuse to index past the histogram.
+      if (pix >= max_pix_val) return 0;
+      if (val_count[pix]++ == 0) ++num_colors;
+    }
+  }
+  return num_colors;
+}
+
+// Convenience wrapper around av1_inverse_transform_block(): looks up the
+// transform size and type for the given position, then runs the inverse
+// transform of the block's dequantized coefficients into the plane's dst
+// buffer at (blk_row, blk_col).
+static void inverse_transform_block_facade(MACROBLOCKD *xd, int plane,
+                                           int block, int blk_row, int blk_col,
+                                           int eob, int reduced_tx_set) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const TX_SIZE tx_size = av1_get_tx_size(plane, xd);
+  const TX_TYPE tx_type = av1_get_tx_type(get_plane_type(plane), xd, blk_row,
+                                          blk_col, tx_size, reduced_tx_set);
+
+  // blk_row/blk_col are in transform units; scale the offset to pixels.
+  const int stride = pd->dst.stride;
+  const int dst_offset = (blk_row * stride + blk_col) << tx_size_wide_log2[0];
+
+  av1_inverse_transform_block(xd, BLOCK_OFFSET(pd->dqcoeff, block), plane,
+                              tx_type, tx_size, &pd->dst.buf[dst_offset],
+                              stride, eob, reduced_tx_set);
+}
+
+// Forward declaration; the definition is not in this part of the file. The
+// result is used below as an index into TXB_RD_RECORD::tx_rd_info.
+static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record, const uint32_t hash);
+
+// Hashes the residual of one transform block with CRC32C and folds tx_size
+// into the result: (crc << 5) + tx_size.
+//
+// The residual is taken from x->plane[plane].src_diff (row stride = width of
+// plane_bsize) at 4 * (blk_row, blk_col). When the transform block is narrower
+// than the stride, its rows are first packed contiguously into a local buffer
+// so that only the block's own samples are hashed.
+static uint32_t get_intra_txb_hash(MACROBLOCK *x, int plane, int blk_row,
+                                   int blk_col, BLOCK_SIZE plane_bsize,
+                                   TX_SIZE tx_size) {
+  int16_t packed[64 * 64];
+  const int stride = block_size_wide[plane_bsize];
+  const int w = tx_size_wide[tx_size];
+  const int h = tx_size_high[tx_size];
+  const int16_t *const block_diff =
+      x->plane[plane].src_diff + 4 * blk_row * stride + 4 * blk_col;
+
+  uint8_t *hash_data;
+  if (w == stride) {
+    // Rows are already contiguous; hash them in place.
+    hash_data = (uint8_t *)block_diff;
+  } else {
+    for (int r = 0; r < h; ++r) {
+      memcpy(packed + r * w, block_diff + r * stride, sizeof(*block_diff) * w);
+    }
+    hash_data = (uint8_t *)packed;
+  }
+
+  CRC32C *crc = &x->mb_rd_record.crc_calculator;
+  // Length is in bytes: two bytes per int16_t sample.
+  const uint32_t hash = av1_get_crc32c_value(crc, hash_data, 2 * w * h);
+  return (hash << 5) + tx_size;
+}
+
+// Transform-domain distortion for one transform block.
+//
+// Compares the block's coefficients against its dequantized coefficients
+// without running an inverse transform. This is cheaper than the pixel-domain
+// measurement but less accurate. *out_dist receives the block error and
+// *out_sse the energy reported by the block-error kernel, both scaled down to
+// match pixel-domain distortion values.
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        int64_t *out_sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int num_coeffs = av1_get_max_eob(tx_size);
+  // TX-domain results need to shift down to Q2/D10 to match pixel
+  // domain distortion values which are in Q2^2
+  const int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+  int64_t raw_sse;
+  int64_t raw_dist;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    raw_dist =
+        av1_highbd_block_error(coeff, dqcoeff, num_coeffs, &raw_sse, xd->bd);
+  } else {
+    raw_dist = av1_block_error(coeff, dqcoeff, num_coeffs, &raw_sse);
+  }
+
+  *out_dist = RIGHT_SIGNED_SHIFT(raw_dist, shift);
+  *out_sse = RIGHT_SIGNED_SHIFT(raw_sse, shift);
+}
+
+// Pixel-domain distortion for one transform block.
+//
+// Copies the block currently held in the plane's dst buffer into a local
+// scratch buffer (row stride MAX_TX_SIZE), applies the inverse transform of
+// the block's dequantized coefficients on top of that copy, and returns
+// 16 * pixel_dist() between the source block and the scratch reconstruction.
+// The dst buffer itself is only read here.
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+ int plane, BLOCK_SIZE plane_bsize,
+ int block, int blk_row, int blk_col,
+ TX_SIZE tx_size) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const uint16_t eob = p->eobs[block];
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const int bsw = block_size_wide[tx_bsize];
+ const int bsh = block_size_high[tx_bsize];
+ const int src_stride = x->plane[plane].src.stride;
+ const int dst_stride = xd->plane[plane].dst.stride;
+ // Scale the transform block index to pixel unit.
+ const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
+ const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
+ const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+ const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+ const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+ assert(cpi != NULL);
+ assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
+
+ // One scratch buffer of 16-bit samples backs both paths; the 8-bit path
+ // simply views it as bytes.
+ uint8_t *recon;
+ DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ recon = CONVERT_TO_BYTEPTR(recon16);
+ av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
+ CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
+ bsh, NULL, NULL, 0, 0, NULL, xd->bd);
+ } else {
+ recon = (uint8_t *)recon16;
+ av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+ NULL, 0, 0, NULL);
+ }
+
+ // The copy above must come first: the inverse transform writes into recon.
+ const PLANE_TYPE plane_type = get_plane_type(plane);
+ TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
+ cpi->common.reduced_tx_set_used);
+ av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+ MAX_TX_SIZE, eob,
+ cpi->common.reduced_tx_set_used);
+
+ // NOTE(review): pixel_dist() returns unsigned, so the multiplication by 16
+ // is done in unsigned arithmetic before widening to int64_t.
+ return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+ blk_row, blk_col, plane_bsize, tx_bsize);
+}
+
+// Returns the arithmetic mean of a w x h block of 16-bit samples stored with
+// the given row stride. w and h must be positive.
+static double get_mean(const int16_t *diff, int stride, int w, int h) {
+  double total = 0.0;
+  for (int row = 0; row < h; ++row) {
+    const int row_offset = row * stride;
+    for (int col = 0; col < w; ++col) total += diff[row_offset + col];
+  }
+  assert(w > 0 && h > 0);
+  return total / (w * h);
+}
+
+// Returns the mean of the squared samples of a w x h block of 16-bit values
+// stored with the given row stride. w and h must be positive.
+static double get_sse_norm(const int16_t *diff, int stride, int w, int h) {
+  double total = 0.0;
+  for (int row = 0; row < h; ++row) {
+    const int row_offset = row * stride;
+    for (int col = 0; col < w; ++col) {
+      const int sample = diff[row_offset + col];
+      total += sample * sample;
+    }
+  }
+  assert(w > 0 && h > 0);
+  return total / (w * h);
+}
+
+// Returns the mean absolute value of a w x h block of 16-bit samples stored
+// with the given row stride. w and h must be positive.
+static double get_sad_norm(const int16_t *diff, int stride, int w, int h) {
+  double total = 0.0;
+  for (int row = 0; row < h; ++row) {
+    const int row_offset = row * stride;
+    for (int col = 0; col < w; ++col) total += abs(diff[row_offset + col]);
+  }
+  assert(w > 0 && h > 0);
+  return total / (w * h);
+}
+
+// Splits a block into its four quadrants and writes the per-sample SSE and/or
+// SAD of each quadrant to sse_norm_arr / sad_norm_arr (either may be NULL).
+// Results are stored in raster order: index = row * 2 + col.
+//
+// If the block size has a valid PARTITION_SPLIT sub-size, the statistics come
+// from cpi->fn_ptr[] applied to src and dst. Otherwise they are computed
+// directly from the residual src_diff.
+static void get_2x2_normalized_sses_and_sads(
+    const AV1_COMP *const cpi, BLOCK_SIZE tx_bsize, const uint8_t *const src,
+    int src_stride, const uint8_t *const dst, int dst_stride,
+    const int16_t *const src_diff, int diff_stride, double *const sse_norm_arr,
+    double *const sad_norm_arr) {
+  const BLOCK_SIZE tx_bsize_half =
+      get_partition_subsize(tx_bsize, PARTITION_SPLIT);
+  const int use_fn_ptr = (tx_bsize_half != BLOCK_INVALID);
+  const int half_width = use_fn_ptr ? block_size_wide[tx_bsize_half]
+                                    : block_size_wide[tx_bsize] / 2;
+  const int half_height = use_fn_ptr ? block_size_high[tx_bsize_half]
+                                     : block_size_high[tx_bsize] / 2;
+
+  for (int quad = 0; quad < 4; ++quad) {
+    const int row = quad >> 1;
+    const int col = quad & 1;
+
+    if (!use_fn_ptr) {  // manually calculate stats
+      const int16_t *const quad_diff =
+          src_diff + row * half_height * diff_stride + col * half_width;
+      if (sse_norm_arr) {
+        sse_norm_arr[quad] =
+            get_sse_norm(quad_diff, diff_stride, half_width, half_height);
+      }
+      if (sad_norm_arr) {
+        sad_norm_arr[quad] =
+            get_sad_norm(quad_diff, diff_stride, half_width, half_height);
+      }
+      continue;
+    }
+
+    // use function pointers to calculate stats
+    const int num_samples_half = half_width * half_height;
+    const uint8_t *const quad_src =
+        src + row * half_height * src_stride + col * half_width;
+    const uint8_t *const quad_dst =
+        dst + row * half_height * dst_stride + col * half_width;
+
+    if (sse_norm_arr) {
+      unsigned int quad_sse;
+      cpi->fn_ptr[tx_bsize_half].vf(quad_src, src_stride, quad_dst, dst_stride,
+                                    &quad_sse);
+      sse_norm_arr[quad] = (double)quad_sse / num_samples_half;
+    }
+
+    if (sad_norm_arr) {
+      const unsigned int quad_sad = cpi->fn_ptr[tx_bsize_half].sdf(
+          quad_src, src_stride, quad_dst, dst_stride);
+      sad_norm_arr[quad] = (double)quad_sad / num_samples_half;
+    }
+  }
+}
+
+// NOTE: CONFIG_COLLECT_RD_STATS has 3 possible values
+// 0: Do not collect any RD stats
+// 1: Collect RD stats for transform units
+// 2: Collect RD stats for partition units
+#if CONFIG_COLLECT_RD_STATS
+
+#if CONFIG_COLLECT_RD_STATS == 1
+// Debug helper (CONFIG_COLLECT_RD_STATS == 1): appends one line of statistics
+// about a luma transform unit to "tu_stats.txt".
+//
+// Nothing is written when rd_stats holds the INT_MAX / INT64_MAX sentinels,
+// when the sampling test below rejects the call, or when the file cannot be
+// opened. The order of the fields on the line is fixed by the sequence of
+// fprintf() calls below.
+static void PrintTransformUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const RD_STATS *const rd_stats, int blk_row,
+ int blk_col, BLOCK_SIZE plane_bsize,
+ TX_SIZE tx_size, TX_TYPE tx_type,
+ int64_t rd) {
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ // The seed is function-static, so the sampling sequence carries over from
+ // one call to the next.
+ static unsigned int seed = 21743;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "tu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int txw = tx_size_wide[tx_size];
+ const int txh = tx_size_high[tx_size];
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+ const double num_samples = txw * txh;
+
+ // Fields 1-2: rate and distortion per sample.
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+
+ fprintf(fout, "%g %g", rate_norm, dist_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src =
+ &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst =
+ &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+ unsigned int sse;
+ cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &sse);
+ const double sse_norm = (double)sse / num_samples;
+
+ const unsigned int sad =
+ cpi->fn_ptr[tx_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm = (double)sad / num_samples;
+
+ // Per-sample SSE and SAD between src and dst for the whole block.
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ const int diff_stride = block_size_wide[plane_bsize];
+ const int16_t *const src_diff =
+ &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+
+ // Per-quadrant SSE values, then per-quadrant SAD values.
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, tx_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ const TX_TYPE_1D tx_type_1d_row = htx_tab[tx_type];
+ const TX_TYPE_1D tx_type_1d_col = vtx_tab[tx_type];
+
+ // Quantizer step, transform width/height, and the 1-D row/column types.
+ fprintf(fout, " %d %d %d %d %d", q_step, tx_size_wide[tx_size],
+ tx_size_high[tx_size], tx_type_1d_row, tx_type_1d_col);
+
+ // Rate/distortion predicted by the curve-fit model, for comparison.
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, tx_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g", model_rate_norm, model_dist_norm);
+
+ // Residual mean and horizontal/vertical correlation.
+ const double mean = get_mean(src_diff, diff_stride, txw, txh);
+ double hor_corr, vert_corr;
+ get_horver_correlation(src_diff, diff_stride, txw, txh, &hor_corr,
+ &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, tx_bsize, src, src_stride, dst, dst_stride,
+ 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, " %d %" PRId64, x->rdmult, rd);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS == 1
+
+#if CONFIG_COLLECT_RD_STATS >= 2
+// Debug helper (CONFIG_COLLECT_RD_STATS >= 2): appends one line of statistics
+// about the luma plane of a prediction unit to "pu_stats.txt".
+//
+// Nothing is written when rd_stats is flagged invalid or holds the INT_MAX /
+// INT64_MAX sentinels, when the sampling test below rejects the call, or when
+// the file cannot be opened. The order of the fields on the line is fixed by
+// the sequence of fprintf() calls below.
+static void PrintPredictionUnitStats(const AV1_COMP *const cpi, MACROBLOCK *x,
+ const RD_STATS *const rd_stats,
+ BLOCK_SIZE plane_bsize) {
+ if (rd_stats->invalid_rate) return;
+ if (rd_stats->rate == INT_MAX || rd_stats->dist == INT64_MAX) return;
+
+ // Generate small sample to restrict output size.
+ // The seed is function-static, so the sampling sequence carries over from
+ // one call to the next.
+ static unsigned int seed = 95014;
+ if (lcg_rand16(&seed) % 256 > 0) return;
+
+ const char output_file[] = "pu_stats.txt";
+ FILE *fout = fopen(output_file, "a");
+ if (!fout) return;
+
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const int plane = 0;
+ struct macroblock_plane *const p = &x->plane[plane];
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int diff_stride = block_size_wide[plane_bsize];
+ // bw/bh are the visible dimensions of the block, not its nominal size.
+ int bw, bh;
+ get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+ &bh);
+ const int num_samples = bw * bh;
+ const int dequant_shift =
+ (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+ const int q_step = pd->dequant_Q3[1] >> dequant_shift;
+
+ // Rate, distortion and RD cost per visible sample.
+ const double rate_norm = (double)rd_stats->rate / num_samples;
+ const double dist_norm = (double)rd_stats->dist / num_samples;
+ const double rdcost_norm =
+ (double)RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) / num_samples;
+
+ fprintf(fout, "%g %g %g", rate_norm, dist_norm, rdcost_norm);
+
+ const int src_stride = p->src.stride;
+ const uint8_t *const src = p->src.buf;
+ const int dst_stride = pd->dst.stride;
+ const uint8_t *const dst = pd->dst.buf;
+ const int16_t *const src_diff = p->src_diff;
+ const int shift = (xd->bd - 8);
+
+ // SSE of the visible residual, scaled down by 2 * (bd - 8) bits.
+ int64_t sse = aom_sum_squares_2d_i16(src_diff, diff_stride, bw, bh);
+ sse = ROUND_POWER_OF_TWO(sse, shift * 2);
+ const double sse_norm = (double)sse / num_samples;
+
+ // NOTE(review): the SAD is taken over the full plane_bsize block and is
+ // normalized by its nominal sample count, unlike sse_norm above which uses
+ // the visible sample count.
+ const unsigned int sad =
+ cpi->fn_ptr[plane_bsize].sdf(src, src_stride, dst, dst_stride);
+ const double sad_norm =
+ (double)sad / (1 << num_pels_log2_lookup[plane_bsize]);
+
+ fprintf(fout, " %g %g", sse_norm, sad_norm);
+
+ // Per-quadrant SSE values, then per-quadrant SAD values, rescaled by the
+ // bit-depth shift when it is non-zero.
+ double sse_norm_arr[4], sad_norm_arr[4];
+ get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, src_diff, diff_stride,
+ sse_norm_arr, sad_norm_arr);
+ if (shift) {
+ for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+ for (int k = 0; k < 4; ++k) sad_norm_arr[k] /= (1 << shift);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sse_norm_arr[i]);
+ }
+ for (int i = 0; i < 4; ++i) {
+ fprintf(fout, " %g", sad_norm_arr[i]);
+ }
+
+ fprintf(fout, " %d %d %d %d", q_step, x->rdmult, bw, bh);
+
+ // Rate/distortion/RD cost predicted by the curve-fit model, for comparison.
+ int model_rate;
+ int64_t model_dist;
+ model_rd_sse_fn[MODELRD_CURVFIT](cpi, x, plane_bsize, plane, sse, num_samples,
+ &model_rate, &model_dist);
+ const double model_rdcost_norm =
+ (double)RDCOST(x->rdmult, model_rate, model_dist) / num_samples;
+ const double model_rate_norm = (double)model_rate / num_samples;
+ const double model_dist_norm = (double)model_dist / num_samples;
+ fprintf(fout, " %g %g %g", model_rate_norm, model_dist_norm,
+ model_rdcost_norm);
+
+ // Residual mean (rescaled by the bit-depth shift) and correlations.
+ double mean = get_mean(src_diff, diff_stride, bw, bh);
+ mean /= (1 << shift);
+ double hor_corr, vert_corr;
+ get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+ fprintf(fout, " %g %g %g", mean, hor_corr, vert_corr);
+
+ double hdist[4] = { 0 }, vdist[4] = { 0 };
+ get_energy_distribution_fine(cpi, plane_bsize, src, src_stride, dst,
+ dst_stride, 1, hdist, vdist);
+ fprintf(fout, " %g %g %g %g %g %g %g %g", hdist[0], hdist[1], hdist[2],
+ hdist[3], vdist[0], vdist[1], vdist[2], vdist[3]);
+
+ fprintf(fout, "\n");
+ fclose(fout);
+}
+#endif // CONFIG_COLLECT_RD_STATS >= 2
+#endif // CONFIG_COLLECT_RD_STATS
+
+// Estimates rate and distortion for one plane from its prediction SSE.
+//
+// sse is the prediction error energy and num_samples the number of samples it
+// was measured over. rate and dist are optional outputs (may be NULL).
+//  - sse == 0 yields rate 0 and distortion 0.
+//  - Chroma planes (plane != 0) are delegated to model_rd_with_curvfit().
+//  - Luma builds a feature vector from the residual in x->plane[plane].src_diff
+//    and feeds it to the av1_pustats_{rate,dist}_nnconfig networks.
+// If the modeled RD cost is not better than coding nothing (rate 0,
+// distortion sse << 4), the skip values are returned instead.
+static void model_rd_with_dnn(const AV1_COMP *const cpi,
+                              const MACROBLOCK *const x, BLOCK_SIZE plane_bsize,
+                              int plane, int64_t sse, int num_samples,
+                              int *rate, int64_t *dist) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int log_numpels = num_pels_log2_lookup[plane_bsize];
+
+  const int dequant_shift =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : 3;
+  const int q_step = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+  const struct macroblock_plane *const p = &x->plane[plane];
+  // bw/bh are the visible dimensions of the block, which may be smaller than
+  // the nominal block size.
+  int bw, bh;
+  get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL, &bw,
+                     &bh);
+  const int src_stride = p->src.stride;
+  const uint8_t *const src = p->src.buf;
+  const int dst_stride = pd->dst.stride;
+  const uint8_t *const dst = pd->dst.buf;
+  const int16_t *const src_diff = p->src_diff;
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int shift = (xd->bd - 8);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+  if (plane) {
+    int model_rate;
+    int64_t model_dist;
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, num_samples,
+                          &model_rate, &model_dist);
+    if (rate) *rate = model_rate;
+    if (dist) *dist = model_dist;
+    return;
+  }
+
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+
+  double sse_norm_arr[4];
+  get_2x2_normalized_sses_and_sads(cpi, plane_bsize, src, src_stride, dst,
+                                   dst_stride, src_diff, diff_stride,
+                                   sse_norm_arr, NULL);
+  // The residual rows are diff_stride apart. Using the visible width bw as
+  // the stride would read the wrong samples whenever the block is clipped
+  // (bw < diff_stride).
+  double mean = get_mean(src_diff, diff_stride, bw, bh);
+  if (shift) {
+    for (int k = 0; k < 4; ++k) sse_norm_arr[k] /= (1 << (2 * shift));
+    mean /= (1 << shift);
+  }
+  // Fraction of the energy in each of the first three quadrants; the fourth
+  // is implied. Falls back to a uniform split when there is no energy.
+  double sse_norm_sum = 0.0, sse_frac_arr[3];
+  for (int k = 0; k < 4; ++k) sse_norm_sum += sse_norm_arr[k];
+  for (int k = 0; k < 3; ++k)
+    sse_frac_arr[k] =
+        sse_norm_sum > 0.0 ? sse_norm_arr[k] / sse_norm_sum : 0.25;
+  const double q_sqr = (double)(q_step * q_step);
+  const double q_sqr_by_sse_norm = q_sqr / (sse_norm + 1.0);
+  const double mean_sqr_by_sse_norm = mean * mean / (sse_norm + 1.0);
+  double hor_corr, vert_corr;
+  get_horver_correlation(src_diff, diff_stride, bw, bh, &hor_corr, &vert_corr);
+
+  float features[NUM_FEATURES_PUSTATS];
+  features[0] = (float)hor_corr;
+  features[1] = (float)log_numpels;
+  features[2] = (float)mean_sqr_by_sse_norm;
+  features[3] = (float)q_sqr_by_sse_norm;
+  features[4] = (float)sse_frac_arr[0];
+  features[5] = (float)sse_frac_arr[1];
+  features[6] = (float)sse_frac_arr[2];
+  features[7] = (float)vert_corr;
+
+  float rate_f, dist_by_sse_norm_f;
+  av1_nn_predict(features, &av1_pustats_dist_nnconfig, &dist_by_sse_norm_f);
+  av1_nn_predict(features, &av1_pustats_rate_nnconfig, &rate_f);
+  const float dist_f = (float)((double)dist_by_sse_norm_f * (1.0 + sse_norm));
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  if (rate_i == 0) {
+    dist_i = sse << 4;
+  } else if (RDCOST(x->rdmult, rate_i, dist_i) >=
+             RDCOST(x->rdmult, 0, sse << 4)) {
+    rate_i = 0;
+    dist_i = sse << 4;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+// Accumulates modeled rate and distortion over planes plane_from..plane_to,
+// using model_rd_with_dnn() for each plane.
+//
+// out_rate_sum and out_dist_sum are always written. skip_txfm_sb, skip_sse_sb
+// and the per-plane arrays plane_rate/plane_sse/plane_dist are optional (may
+// be NULL). Chroma planes are left out when x->skip_chroma_rd is set. The
+// luma SSE is also stored in x->pred_sse[] for the block's first reference
+// frame. mi_row and mi_col are unused.
+static void model_rd_for_sb_with_dnn(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_total = 0;
+  int64_t dist_total = 0;
+  int64_t sse_total = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    const struct macroblock_plane *const p = &x->plane[plane];
+    // Only the visible part of the block contributes to the SSE.
+    int vis_w, vis_h;
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &vis_w, &vis_h);
+
+    int64_t sse;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, vis_w, vis_h);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                    vis_w, vis_h);
+    }
+    // Scale down by 2 * (bd - 8) bits.
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+    int rate;
+    int64_t dist;
+    model_rd_with_dnn(cpi, x, plane_bsize, plane, sse, vis_w * vis_h, &rate,
+                      &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    sse_total += sse;
+    rate_total += rate;
+    dist_total += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = sse_total == 0;
+  if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+  *out_rate_sum = (int)rate_total;
+  *out_dist_sum = dist_total;
+}
+
+// Fits a surface for rate and distortion using as features:
+// log2(sse_norm + 1) and log2(sse_norm/qstep^2)
+//
+// rate and dist are optional outputs (may be NULL). sse == 0 yields rate 0
+// and distortion 0. If the modeled RD cost is not better than coding nothing
+// (rate 0, distortion sse << 4), the skip values are returned instead.
+// cpi and plane_bsize are unused.
+static void model_rd_with_surffit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int dequant_shift = is_hbd ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double log_of_2 = log(2.0);
+  const double xm = log(sse_norm + 1.0) / log_of_2;
+  const double yl = log(sse_norm / qstepsqr) / log_of_2;
+
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_surffit(xm, yl, &rate_f, &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  const int64_t skip_dist = sse << 4;
+  if (rate_i == 0 || RDCOST(x->rdmult, rate_i, dist_i) >=
+                         RDCOST(x->rdmult, 0, skip_dist)) {
+    rate_i = 0;
+    dist_i = skip_dist;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+// Accumulates modeled rate and distortion over planes plane_from..plane_to,
+// using model_rd_with_surffit() for each plane.
+//
+// out_rate_sum and out_dist_sum are always written. skip_txfm_sb, skip_sse_sb
+// and the per-plane arrays plane_rate/plane_sse/plane_dist are optional (may
+// be NULL). Chroma planes are left out when x->skip_chroma_rd is set. The
+// luma SSE is also stored in x->pred_sse[] for the block's first reference
+// frame. mi_row and mi_col are unused.
+static void model_rd_for_sb_with_surffit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_total = 0;
+  int64_t dist_total = 0;
+  int64_t sse_total = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    const struct macroblock_plane *const p = &x->plane[plane];
+    // Only the visible part of the block contributes to the SSE.
+    int vis_w, vis_h;
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &vis_w, &vis_h);
+
+    int64_t sse;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, vis_w, vis_h);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                    vis_w, vis_h);
+    }
+    // Scale down by 2 * (bd - 8) bits.
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+    int rate;
+    int64_t dist;
+    model_rd_with_surffit(cpi, x, plane_bsize, plane, sse, vis_w * vis_h, &rate,
+                          &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    sse_total += sse;
+    rate_total += rate;
+    dist_total += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = sse_total == 0;
+  if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+  *out_rate_sum = (int)rate_total;
+  *out_dist_sum = dist_total;
+}
+
+// Fits a curve for rate and distortion using as feature:
+// log2(sse_norm/qstep^2)
+//
+// rate and dist are optional outputs (may be NULL). sse == 0 yields rate 0
+// and distortion 0. If the modeled RD cost is not better than coding nothing
+// (rate 0, distortion sse << 4), the skip values are returned instead.
+// cpi and plane_bsize are unused.
+static void model_rd_with_curvfit(const AV1_COMP *const cpi,
+                                  const MACROBLOCK *const x,
+                                  BLOCK_SIZE plane_bsize, int plane,
+                                  int64_t sse, int num_samples, int *rate,
+                                  int64_t *dist) {
+  (void)cpi;
+  (void)plane_bsize;
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int dequant_shift = is_hbd ? xd->bd - 5 : 3;
+  const int qstep = AOMMAX(pd->dequant_Q3[1] >> dequant_shift, 1);
+
+  if (sse == 0) {
+    if (rate) *rate = 0;
+    if (dist) *dist = 0;
+    return;
+  }
+
+  aom_clear_system_state();
+  const double sse_norm = (double)sse / num_samples;
+  const double qstepsqr = (double)qstep * qstep;
+  const double xqr = log(sse_norm / qstepsqr) / log(2.0);
+
+  double rate_f, dist_by_sse_norm_f;
+  av1_model_rd_curvfit(xqr, &rate_f, &dist_by_sse_norm_f);
+
+  const double dist_f = dist_by_sse_norm_f * sse_norm;
+  int rate_i = (int)(AOMMAX(0.0, rate_f * num_samples) + 0.5);
+  int64_t dist_i = (int64_t)(AOMMAX(0.0, dist_f * num_samples) + 0.5);
+  aom_clear_system_state();
+
+  // Check if skip is better
+  const int64_t skip_dist = sse << 4;
+  if (rate_i == 0 || RDCOST(x->rdmult, rate_i, dist_i) >=
+                         RDCOST(x->rdmult, 0, skip_dist)) {
+    rate_i = 0;
+    dist_i = skip_dist;
+  }
+
+  if (rate) *rate = rate_i;
+  if (dist) *dist = dist_i;
+}
+
+// Accumulates modeled rate and distortion over planes plane_from..plane_to,
+// using model_rd_with_curvfit() for each plane.
+//
+// out_rate_sum and out_dist_sum are always written. skip_txfm_sb, skip_sse_sb
+// and the per-plane arrays plane_rate/plane_sse/plane_dist are optional (may
+// be NULL). Chroma planes are left out when x->skip_chroma_rd is set. The
+// luma SSE is also stored in x->pred_sse[] for the block's first reference
+// frame. mi_row and mi_col are unused.
+static void model_rd_for_sb_with_curvfit(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  (void)mi_row;
+  (void)mi_col;
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_total = 0;
+  int64_t dist_total = 0;
+  int64_t sse_total = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    const struct macroblock_plane *const p = &x->plane[plane];
+    // Only the visible part of the block contributes to the SSE.
+    int vis_w, vis_h;
+    get_txb_dimensions(xd, plane, plane_bsize, 0, 0, plane_bsize, NULL, NULL,
+                       &vis_w, &vis_h);
+
+    int64_t sse;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, vis_w, vis_h);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+                    vis_w, vis_h);
+    }
+    // Scale down by 2 * (bd - 8) bits.
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+    int rate;
+    int64_t dist;
+    model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, vis_w * vis_h, &rate,
+                          &dist);
+
+    if (plane == 0) x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+
+    sse_total += sse;
+    rate_total += rate;
+    dist_total += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = sse_total == 0;
+  if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+  *out_rate_sum = (int)rate_total;
+  *out_dist_sum = dist_total;
+}
+
+// Accumulates rate and distortion over planes plane_from..plane_to. Luma
+// (plane 0) takes its numbers from select_tx_type_yrd(); the other planes use
+// model_rd_with_curvfit().
+//
+// Unlike the other model_rd_for_sb_with_*() variants visible nearby, the SSE
+// here is measured over the nominal plane block size taken from the
+// block_size_{wide,high} tables. out_rate_sum and out_dist_sum are always
+// written; skip_txfm_sb, skip_sse_sb and the per-plane arrays are optional
+// (may be NULL). Chroma planes are left out when x->skip_chroma_rd is set.
+static void model_rd_for_sb_with_fullrdy(
+    const AV1_COMP *const cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd,
+    int plane_from, int plane_to, int mi_row, int mi_col, int *out_rate_sum,
+    int64_t *out_dist_sum, int *skip_txfm_sb, int64_t *skip_sse_sb,
+    int *plane_rate, int64_t *plane_sse, int64_t *plane_dist) {
+  const int ref = xd->mi[0]->ref_frame[0];
+
+  int64_t rate_total = 0;
+  int64_t dist_total = 0;
+  int64_t sse_total = 0;
+
+  for (int plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int bw = block_size_wide[plane_bsize];
+    const int bh = block_size_high[plane_bsize];
+
+    if (x->skip_chroma_rd && plane) continue;
+
+    int64_t sse;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      sse = aom_highbd_sse(p->src.buf, p->src.stride, pd->dst.buf,
+                           pd->dst.stride, bw, bh);
+    } else {
+      sse = aom_sse(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, bw,
+                    bh);
+    }
+    // Scale down by 2 * (bd - 8) bits.
+    sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+
+    int rate;
+    int64_t dist;
+    if (plane != 0) {
+      model_rd_with_curvfit(cpi, x, plane_bsize, plane, sse, bw * bh, &rate,
+                            &dist);
+    } else {
+      RD_STATS rd_stats;
+      select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+      if (rd_stats.invalid_rate) {
+        // Fall back to the skip estimate: no rate, distortion sse << 4.
+        rate = 0;
+        dist = sse << 4;
+      } else {
+        rate = rd_stats.rate;
+        dist = rd_stats.dist;
+      }
+      x->pred_sse[ref] = (unsigned int)AOMMIN(sse, UINT_MAX);
+    }
+
+    sse_total += sse;
+    rate_total += rate;
+    dist_total += dist;
+
+    if (plane_rate) plane_rate[plane] = rate;
+    if (plane_sse) plane_sse[plane] = sse;
+    if (plane_dist) plane_dist[plane] = dist;
+  }
+
+  if (skip_txfm_sb) *skip_txfm_sb = sse_total == 0;
+  if (skip_sse_sb) *skip_sse_sb = sse_total << 4;
+  *out_rate_sum = (int)rate_total;
+  *out_dist_sum = dist_total;
+}
+
+/* Picks the transform type for one transform block by RD search.
+ *
+ * Every candidate type between txk_start and txk_end that is set in
+ * allowed_tx_mask is transformed and quantized, its rate and distortion are
+ * measured, and the candidate with the lowest RDCOST is kept. The winner's
+ * stats are written to *best_rd_stats; its eob and entropy context are stored
+ * in x->plane[plane], and for luma its tx type is stored in mbmi->txk_type.
+ * When the intra-hash conditions below hold, a cached result may be reused
+ * and the search skipped. Returns the best RD cost.
+ */ static int64_t search_txk_type(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
+                               int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               const TXB_CTX *const txb_ctx,
+                               FAST_TX_SEARCH_MODE ftxs_mode,
+                               int use_fast_coef_costing, int64_t ref_best_rd,
+                               RD_STATS *best_rd_stats) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_inter = is_inter_block(mbmi);
+  int64_t best_rd = INT64_MAX;
+  uint16_t best_eob = 0;
+  TX_TYPE best_tx_type = DCT_DCT;
+  TX_TYPE last_tx_type = TX_TYPES;  // Stays TX_TYPES until the loop finds a best.
+  const int fast_tx_search = ftxs_mode & FTXS_DCT_AND_1D_DCT_ONLY;
+  // Scratch buffer that is swapped with pd->dqcoeff during the search so the
+  // dqcoeff of the best tx_type is kept while later candidates are tried.
+  DECLARE_ALIGNED(32, tran_low_t, this_dqcoeff[MAX_SB_SQUARE]);
+  tran_low_t *orig_dqcoeff = pd->dqcoeff;
+  tran_low_t *best_dqcoeff = this_dqcoeff;
+  const int txk_type_idx =
+      av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+  av1_invalid_rd_stats(best_rd_stats);
+
+  TXB_RD_INFO *intra_txb_rd_info = NULL;  // Non-NULL only if the hash is used.
+  uint16_t cur_joint_ctx = 0;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+  const int within_border =
+      mi_row >= xd->tile.mi_row_start &&
+      (mi_row + mi_size_high[plane_bsize] < xd->tile.mi_row_end) &&
+      mi_col >= xd->tile.mi_col_start &&
+      (mi_col + mi_size_wide[plane_bsize] < xd->tile.mi_col_end);
+  if (within_border && cpi->sf.use_intra_txb_hash && frame_is_intra_only(cm) &&
+      !is_inter && plane == 0 &&
+      tx_size_wide[tx_size] == tx_size_high[tx_size]) {
+    const uint32_t intra_hash =
+        get_intra_txb_hash(x, plane, blk_row, blk_col, plane_bsize, tx_size);
+    const int intra_hash_idx =
+        find_tx_size_rd_info(&x->txb_rd_record_intra, intra_hash);
+    intra_txb_rd_info = &x->txb_rd_record_intra.tx_rd_info[intra_hash_idx];
+
+    cur_joint_ctx = (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+    if (intra_txb_rd_info->entropy_context == cur_joint_ctx &&
+        x->txb_rd_record_intra.tx_rd_info[intra_hash_idx].valid) {
+      mbmi->txk_type[txk_type_idx] = intra_txb_rd_info->tx_type;
+      const TX_TYPE ref_tx_type =
+          av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+                          tx_size, cpi->common.reduced_tx_set_used);
+      if (ref_tx_type == intra_txb_rd_info->tx_type) {
+        best_rd_stats->rate = intra_txb_rd_info->rate;
+        best_rd_stats->dist = intra_txb_rd_info->dist;
+        best_rd_stats->sse = intra_txb_rd_info->sse;
+        best_rd_stats->skip = intra_txb_rd_info->eob == 0;
+        x->plane[plane].eobs[block] = intra_txb_rd_info->eob;
+        x->plane[plane].txb_entropy_ctx[block] =
+            intra_txb_rd_info->txb_entropy_ctx;
+        best_rd = RDCOST(x->rdmult, best_rd_stats->rate, best_rd_stats->dist);
+        best_eob = intra_txb_rd_info->eob;
+        best_tx_type = intra_txb_rd_info->tx_type;
+        update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                         best_tx_type);
+        goto RECON_INTRA;  // Cached entry reused; skip the search loop.
+      }
+    }
+  }
+
+  int rate_cost = 0;
+  TX_TYPE txk_start = DCT_DCT;
+  TX_TYPE txk_end = TX_TYPES - 1;
+  if ((!is_inter && x->use_default_intra_tx_type) ||
+      (is_inter && x->use_default_inter_tx_type)) {
+    txk_start = txk_end = get_default_tx_type(0, xd, tx_size);
+  } else if (x->rd_model == LOW_TXFM_RD || x->cb_partition_scan) {
+    if (plane == 0) txk_end = DCT_DCT;
+  }
+
+  uint8_t best_txb_ctx = 0;
+  const TxSetType tx_set_type =
+      av1_get_ext_tx_set_type(tx_size, is_inter, cm->reduced_tx_set_used);
+
+  TX_TYPE uv_tx_type = DCT_DCT;
+  if (plane) {
+    // tx_type of PLANE_TYPE_UV should be the same as PLANE_TYPE_Y
+    uv_tx_type = txk_start = txk_end =
+        av1_get_tx_type(get_plane_type(plane), xd, blk_row, blk_col, tx_size,
+                        cm->reduced_tx_set_used);
+  }
+  const uint16_t ext_tx_used_flag = av1_ext_tx_used_flag[tx_set_type];
+  if (xd->lossless[mbmi->segment_id] || txsize_sqr_up_map[tx_size] > TX_32X32 ||
+      ext_tx_used_flag == 0x0001) {
+    txk_start = txk_end = DCT_DCT;
+  }
+  uint16_t allowed_tx_mask = 0;  // 1: allow; 0: skip.
+  if (txk_start == txk_end) {
+    allowed_tx_mask = 1 << txk_start;
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else if (fast_tx_search) {
+    allowed_tx_mask = 0x0c01;  // V_DCT, H_DCT, DCT_DCT
+    allowed_tx_mask &= ext_tx_used_flag;
+  } else {
+    assert(plane == 0);
+    allowed_tx_mask = ext_tx_used_flag;
+    // !fast_tx_search && txk_end != txk_start && plane == 0
+    const int do_prune = cpi->sf.tx_type_search.prune_mode > NO_PRUNE;
+    if (do_prune && is_inter) {
+      if (cpi->sf.tx_type_search.prune_mode >= PRUNE_2D_ACCURATE) {
+        const uint16_t prune =
+            prune_tx_2D(x, plane_bsize, tx_size, blk_row, blk_col, tx_set_type,
+                        cpi->sf.tx_type_search.prune_mode);
+        allowed_tx_mask &= (~prune);
+      } else {
+        allowed_tx_mask &= (~x->tx_search_prune[tx_set_type]);
+      }
+    }
+  }
+  // Need to have at least one transform type allowed.
+  if (allowed_tx_mask == 0) {
+    txk_start = txk_end = (plane ? uv_tx_type : DCT_DCT);
+    allowed_tx_mask = (1 << txk_start);
+  }
+
+  int use_transform_domain_distortion =
+      (cpi->sf.use_transform_domain_distortion > 0) &&
+      // Any 64-pt transforms only preserves half the coefficients.
+      // Therefore transform domain distortion is not valid for these
+      // transform sizes.
+      txsize_sqr_up_map[tx_size] != TX_64X64;
+#if CONFIG_DIST_8X8
+  if (x->using_dist_8x8) use_transform_domain_distortion = 0;
+#endif
+  int calc_pixel_domain_distortion_final =
+      cpi->sf.use_transform_domain_distortion == 1 &&
+      use_transform_domain_distortion && x->rd_model != LOW_TXFM_RD &&
+      !x->cb_partition_scan;
+  if (calc_pixel_domain_distortion_final &&
+      (txk_start == txk_end || allowed_tx_mask == 0x0001))
+    calc_pixel_domain_distortion_final = use_transform_domain_distortion = 0;
+
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  int64_t block_sse =
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+  block_sse *= 16;  // NOTE(review): presumably matches the dist scale; confirm.
+
+  for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
+    if (!(allowed_tx_mask & (1 << tx_type))) continue;
+    if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
+    RD_STATS this_rd_stats;
+    av1_invalid_rd_stats(&this_rd_stats);
+
+    if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+      av1_xform_quant(
+          cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size, tx_type,
+          USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+      rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+                                  txb_ctx, use_fast_coef_costing);
+    } else {
+      av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                      tx_size, tx_type, AV1_XFORM_QUANT_FP);
+      if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX &&
+          eobs_ptr[block] >= 4) {
+        // Calculate distortion quickly in transform domain.
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
+
+        const int64_t best_rd_ = AOMMIN(best_rd, ref_best_rd);
+        const int64_t dist_cost_estimate =
+            RDCOST(x->rdmult, 0, AOMMIN(this_rd_stats.dist, this_rd_stats.sse));
+        if (dist_cost_estimate - (dist_cost_estimate >> 3) > best_rd_) continue;
+
+        rate_cost = av1_cost_coeffs(cm, x, plane, block, tx_size, tx_type,
+                                    txb_ctx, use_fast_coef_costing);
+        const int64_t rd_estimate =
+            AOMMIN(RDCOST(x->rdmult, rate_cost, this_rd_stats.dist),
+                   RDCOST(x->rdmult, 0, this_rd_stats.sse));
+        if (rd_estimate - (rd_estimate >> 3) > best_rd_) continue;
+      }
+      av1_optimize_b(cpi, x, plane, block, tx_size, tx_type, txb_ctx, 1,
+                     &rate_cost);
+    }
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+    } else {
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+      this_rd_stats.sse = block_sse;
+    }
+
+    this_rd_stats.rate = rate_cost;
+
+    const int64_t rd =
+        RDCOST(x->rdmult, this_rd_stats.rate, this_rd_stats.dist);
+
+    if (rd < best_rd) {
+      best_rd = rd;
+      *best_rd_stats = this_rd_stats;
+      best_tx_type = tx_type;
+      best_txb_ctx = x->plane[plane].txb_entropy_ctx[block];
+      best_eob = x->plane[plane].eobs[block];
+      last_tx_type = best_tx_type;
+
+      // Swap the dqcoeff buffers so the new best dqcoeff is not overwritten.
+      tran_low_t *const tmp_dqcoeff = best_dqcoeff;
+      best_dqcoeff = pd->dqcoeff;
+      pd->dqcoeff = tmp_dqcoeff;
+    }
+
+#if CONFIG_COLLECT_RD_STATS == 1
+    if (plane == 0) {
+      PrintTransformUnitStats(cpi, x, &this_rd_stats, blk_row, blk_col,
+                              plane_bsize, tx_size, tx_type, rd);
+    }
+#endif  // CONFIG_COLLECT_RD_STATS == 1
+
+    if (cpi->sf.adaptive_txb_search_level) {
+      if ((best_rd - (best_rd >> cpi->sf.adaptive_txb_search_level)) >
+          ref_best_rd) {
+        break;  // Best so far is still well above the reference; stop early.
+      }
+    }
+
+    // Skip transform type search when we found the block has been quantized to
+    // all zero and at the same time, it has better rdcost than doing transform.
+    if (cpi->sf.tx_type_search.skip_tx_search && !best_eob) break;
+  }
+
+  assert(best_rd != INT64_MAX);
+
+  best_rd_stats->skip = best_eob == 0;
+  if (plane == 0) {
+    update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                     best_tx_type);
+  }
+  x->plane[plane].txb_entropy_ctx[block] = best_txb_ctx;
+  x->plane[plane].eobs[block] = best_eob;
+
+  pd->dqcoeff = best_dqcoeff;  // Use the best candidate's dqcoeff from here on.
+
+  if (calc_pixel_domain_distortion_final && best_eob) {
+    best_rd_stats->dist = dist_block_px_domain(
+        cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+    best_rd_stats->sse = block_sse;
+  }
+
+  if (intra_txb_rd_info != NULL) {  // Record the result in the hash entry.
+    intra_txb_rd_info->valid = 1;
+    intra_txb_rd_info->entropy_context = cur_joint_ctx;
+    intra_txb_rd_info->rate = best_rd_stats->rate;
+    intra_txb_rd_info->dist = best_rd_stats->dist;
+    intra_txb_rd_info->sse = best_rd_stats->sse;
+    intra_txb_rd_info->eob = best_eob;
+    intra_txb_rd_info->txb_entropy_ctx = best_txb_ctx;
+    if (plane == 0) intra_txb_rd_info->tx_type = best_tx_type;
+  }
+
+RECON_INTRA:
+  if (!is_inter && best_eob &&
+      (blk_row + tx_size_high_unit[tx_size] < mi_size_high[plane_bsize] ||
+       blk_col + tx_size_wide_unit[tx_size] < mi_size_wide[plane_bsize])) {
+    // Intra mode needs the decoded result so that the next transform block
+    // can use it for prediction.
+    // If the last searched tx_type is the best tx_type, the transform and
+    // quantization do not need to be run again.
+    if (best_tx_type != last_tx_type) {
+      if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
+        av1_xform_quant(
+            cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+            best_tx_type,
+            USE_B_QUANT_NO_TRELLIS ? AV1_XFORM_QUANT_B : AV1_XFORM_QUANT_FP);
+      } else {
+        av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
+                        tx_size, best_tx_type, AV1_XFORM_QUANT_FP);
+        av1_optimize_b(cpi, x, plane, block, tx_size, best_tx_type, txb_ctx, 1,
+                       &rate_cost);
+      }
+    }
+
+    inverse_transform_block_facade(xd, plane, block, blk_row, blk_col,
+                                   x->plane[plane].eobs[block],
+                                   cm->reduced_tx_set_used);
+
+    // This may happen because of hash collision. The eob stored in the hash
+    // table is non-zero, but the real eob is zero. We need to make sure tx_type
+    // is DCT_DCT in this case.
+    if (plane == 0 && x->plane[plane].eobs[block] == 0 &&
+        best_tx_type != DCT_DCT) {
+      update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                       DCT_DCT);
+    }
+  }
+  pd->dqcoeff = orig_dqcoeff;  // Restore the buffer pointer saved on entry.
+
+  return best_rd;
+}
+
+// Per-transform-block visitor used by txfm_rd_in_plane(). Runs the tx type
+// search for one block, folds its stats into args->rd_stats and adds the
+// cheaper of the coded / all-zero RD cost to args->this_rd. Once the running
+// cost exceeds args->best_rd, exit_early is raised and any later block only
+// marks incomplete_exit.
+static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
+                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) {
+  struct rdcost_block_args *const args = arg;
+
+  // The budget was already exceeded by an earlier block: record that this
+  // block was left unevaluated and do nothing else.
+  if (args->exit_early) {
+    args->incomplete_exit = 1;
+    return;
+  }
+
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const AV1_COMP *const cpi = args->cpi;
+  const AV1_COMMON *const cm = &cpi->common;
+  ENTROPY_CONTEXT *const above_ctx = args->t_above + blk_col;
+  ENTROPY_CONTEXT *const left_ctx = args->t_left + blk_row;
+
+  RD_STATS blk_stats;
+  av1_init_rd_stats(&blk_stats);
+
+  // Intra blocks are predicted per transform block, so build the prediction
+  // and residual here before searching.
+  if (!is_inter_block(mbmi)) {
+    av1_predict_intra_block_facade(cm, xd, plane, blk_col, blk_row, tx_size);
+    av1_subtract_txb(x, plane, plane_bsize, blk_col, blk_row, tx_size);
+  }
+
+  TXB_CTX txb_ctx;
+  get_txb_ctx(plane_bsize, tx_size, plane, above_ctx, left_ctx, &txb_ctx);
+  search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                  &txb_ctx, args->ftxs_mode, args->use_fast_coef_costing,
+                  args->best_rd - args->this_rd, &blk_stats);
+
+  if (plane == AOM_PLANE_Y && xd->cfl.store_y) {
+    assert(!is_inter_block(mbmi) || plane_bsize < BLOCK_8X8);
+    cfl_store_tx(xd, blk_row, blk_col, tx_size, plane_bsize);
+  }
+
+#if CONFIG_RD_DEBUG
+  av1_update_txb_coeff_cost(&blk_stats, plane, tx_size, blk_row, blk_col,
+                            blk_stats.rate);
+#endif  // CONFIG_RD_DEBUG
+  av1_set_txb_context(x, plane, block, tx_size, above_ctx, left_ctx);
+
+  // Only luma records a real skip flag; chroma always stores 0.
+  const int blocks_per_row =
+      block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+  const int blk_idx = blk_row * blocks_per_row + blk_col;
+  const int eob_is_zero = x->plane[plane].eobs[block] == 0;
+  set_blk_skip(x, plane, blk_idx, (plane == 0) ? eob_is_zero : 0);
+
+  const int64_t coded_rd = RDCOST(x->rdmult, blk_stats.rate, blk_stats.dist);
+  const int64_t zeroed_rd = RDCOST(x->rdmult, 0, blk_stats.sse);
+
+  // TODO(jingning): temporarily enabled only for luma component
+  const int64_t blk_rd = AOMMIN(coded_rd, zeroed_rd);
+
+  blk_stats.skip &= !x->plane[plane].eobs[block];
+
+  av1_merge_rd_stats(&args->rd_stats, &blk_stats);
+
+  args->this_rd += blk_rd;
+
+  if (args->this_rd > args->best_rd) args->exit_early = 1;
+}
+
+// Accumulates RD stats over every transform block of one plane by visiting
+// them with block_rd_txfm(). On success *rd_stats receives the merged stats;
+// if the walk was cut short (incomplete_exit for inter blocks, exit_early for
+// intra blocks) *rd_stats is marked invalid instead.
+static void txfm_rd_in_plane(MACROBLOCK *x, const AV1_COMP *cpi,
+                             RD_STATS *rd_stats, int64_t ref_best_rd, int plane,
+                             BLOCK_SIZE bsize, TX_SIZE tx_size,
+                             int use_fast_coef_casting,
+                             FAST_TX_SEARCH_MODE ftxs_mode) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  // Shared state handed to the per-block visitor.
+  struct rdcost_block_args walk;
+  av1_zero(walk);
+  walk.x = x;
+  walk.cpi = cpi;
+  walk.best_rd = ref_best_rd;
+  walk.use_fast_coef_costing = use_fast_coef_casting;
+  walk.ftxs_mode = ftxs_mode;
+  av1_init_rd_stats(&walk.rd_stats);
+
+  // The luma pass fixes the block's transform size.
+  if (plane == 0) xd->mi[0]->tx_size = tx_size;
+
+  av1_get_entropy_contexts(bsize, pd, walk.t_above, walk.t_left);
+
+  av1_foreach_transformed_block_in_plane(xd, bsize, plane, block_rd_txfm,
+                                         &walk);
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int walk_aborted =
+      is_inter_block(mbmi) ? walk.incomplete_exit : walk.exit_early;
+
+  if (!walk_aborted) {
+    *rd_stats = walk.rd_stats;
+    return;
+  }
+  av1_invalid_rd_stats(rd_stats);
+}
+
+// Returns the rate for signalling tx_size for a block of size bsize, or 0
+// when the transform size is not signalled (tx_mode is not TX_MODE_SELECT or
+// the block's sb_type does not signal a transform size).
+static int tx_size_cost(const AV1_COMMON *const cm, const MACROBLOCK *const x,
+                        BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  if (cm->tx_mode != TX_MODE_SELECT || !block_signals_txsize(mbmi->sb_type)) {
+    return 0;
+  }
+
+  const int32_t size_cat = bsize_to_tx_size_cat(bsize);
+  const int depth = tx_size_to_depth(tx_size, bsize);
+  const int size_ctx = get_tx_size_context(xd);
+  return x->tx_size_cost[size_cat][size_ctx][depth];
+}
+
+// Computes luma RD stats for block size bs using the single transform size
+// tx_size and returns the resulting RD cost (INT64_MAX if the stats are
+// invalid). The transform-size rate is added to rd_stats->rate when the size
+// is signalled. For non-lossless inter blocks the result is capped by the
+// cost of coding the block as skipped.
+static int64_t txfm_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                        RD_STATS *rd_stats, int64_t ref_best_rd, BLOCK_SIZE bs,
+                        TX_SIZE tx_size, FAST_TX_SEARCH_MODE ftxs_mode) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int skip_ctx = av1_get_skip_context(xd);
+  const int is_inter = is_inter_block(mbmi);
+  const int tx_select =
+      cm->tx_mode == TX_MODE_SELECT && block_signals_txsize(mbmi->sb_type);
+  const int partition_ctx = txfm_partition_context(
+      xd->above_txfm_context, xd->left_txfm_context, mbmi->sb_type, tx_size);
+  const int r_tx_size = is_inter ? x->txfm_partition_cost[partition_ctx][0]
+                                 : tx_size_cost(cm, x, bs, tx_size);
+  // Transform-size rate that actually enters the RD cost (0 if not signalled).
+  const int signalled_tx_rate = tx_select ? r_tx_size : 0;
+
+  assert(IMPLIES(is_rect_tx(tx_size), is_rect_tx_allowed_bsize(bs)));
+
+  const int no_skip_rate = x->skip_cost[skip_ctx][0];
+  const int skip_rate = x->skip_cost[skip_ctx][1];
+
+  mbmi->tx_size = tx_size;
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs, tx_size,
+                   cpi->sf.use_fast_coef_costing, ftxs_mode);
+  if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+  int64_t rd;
+  if (!rd_stats->skip) {
+    rd = RDCOST(x->rdmult, rd_stats->rate + no_skip_rate + signalled_tx_rate,
+                rd_stats->dist);
+  } else if (is_inter) {
+    rd = RDCOST(x->rdmult, skip_rate, rd_stats->sse);
+  } else {
+    rd = RDCOST(x->rdmult, skip_rate + signalled_tx_rate, rd_stats->sse);
+  }
+
+  if (tx_select) rd_stats->rate += r_tx_size;
+
+  // A coded inter block may still be cheaper when signalled as skipped.
+  if (is_inter && !(rd_stats->skip) && !xd->lossless[xd->mi[0]->segment_id]) {
+    const int64_t skip_rd = RDCOST(x->rdmult, skip_rate, rd_stats->sse);
+    rd = AOMMIN(rd, skip_rd);
+  }
+
+  return rd;
+}
+
+// Estimates the luma RD cost of block size bs with the largest rectangular
+// transform size, running txfm_yrd() with x->rd_model temporarily set to
+// LOW_TXFM_RD. Rate, distortion, skip flag and sse are returned through r, d,
+// s and sse; the RD cost is the return value. x->rd_model is left at
+// FULL_TXFM_RD.
+static int64_t estimate_yrd_for_sb(const AV1_COMP *const cpi, BLOCK_SIZE bs,
+                                   MACROBLOCK *x, int *r, int64_t *d, int *s,
+                                   int64_t *sse, int64_t ref_best_rd) {
+  RD_STATS y_stats;
+
+  av1_subtract_plane(x, bs, 0);
+
+  x->rd_model = LOW_TXFM_RD;
+  const int64_t rd_estimate =
+      txfm_yrd(cpi, x, &y_stats, ref_best_rd, bs, max_txsize_rect_lookup[bs],
+               FTXS_NONE);
+  x->rd_model = FULL_TXFM_RD;
+
+  *r = y_stats.rate;
+  *d = y_stats.dist;
+  *s = y_stats.skip;
+  *sse = y_stats.sse;
+  return rd_estimate;
+}
+
+// Evaluates luma RD stats using the transform size given by
+// tx_size_from_tx_mode() for the frame's tx_mode, with no transform size
+// search. The tx pruning state set up for the evaluation is cleared again
+// before returning.
+static void choose_largest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                   RD_STATS *rd_stats, int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const AV1_COMMON *const cm = &cpi->common;
+  const int is_inter = is_inter_block(mbmi);
+
+  mbmi->tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
+
+  const TxSetType set_type =
+      av1_get_ext_tx_set_type(mbmi->tx_size, is_inter, cm->reduced_tx_set_used);
+  prune_tx(cpi, bs, x, xd, set_type);
+
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
+}
+
+// Evaluates luma RD stats with the transform size forced to TX_4X4.
+static void choose_smallest_tx_size(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    RD_STATS *rd_stats, int64_t ref_best_rd,
+                                    BLOCK_SIZE bs) {
+  MB_MODE_INFO *const mbmi = x->e_mbd.mi[0];
+
+  mbmi->tx_size = TX_4X4;
+  txfm_rd_in_plane(x, cpi, rd_stats, ref_best_rd, AOM_PLANE_Y, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+}
+
+// Returns 2^(log2(pixel count of bsize) - 2 * tx_size_wide_log2[0]), i.e. the
+// pixel count of the block divided by the area of the transform size at
+// index 0.
+static INLINE int bsize_to_num_blk(BLOCK_SIZE bsize) {
+  const int unit_area_log2 = 2 * tx_size_wide_log2[0];
+  return 1 << (num_pels_log2_lookup[bsize] - unit_area_log2);
+}
+
+// Returns the depth at which the transform size search starts.
+// MAX_VARTX_DEPTH is returned when the search method is USE_LARGESTALL, or
+// when tx_size_search_lgr_block is set and the block is wider or taller than
+// BLOCK_64X64. Otherwise the speed-feature depth for the block's
+// inter/intra and square/rectangular class is returned.
+static int get_search_init_depth(int mi_width, int mi_height, int is_inter,
+                                 const SPEED_FEATURES *sf) {
+  if (sf->tx_size_search_method == USE_LARGESTALL) return MAX_VARTX_DEPTH;
+
+  const int larger_than_64 = mi_width > mi_size_wide[BLOCK_64X64] ||
+                             mi_height > mi_size_high[BLOCK_64X64];
+  if (sf->tx_size_search_lgr_block && larger_than_64) return MAX_VARTX_DEPTH;
+
+  const int is_rect = mi_height != mi_width;
+  if (is_inter) {
+    return is_rect ? sf->inter_tx_size_search_init_depth_rect
+                   : sf->inter_tx_size_search_init_depth_sqr;
+  }
+  return is_rect ? sf->intra_tx_size_search_init_depth_rect
+                 : sf->intra_tx_size_search_init_depth_sqr;
+}
+
+// Searches luma transform sizes by RD cost. Starting from start_tx, each
+// iteration evaluates one size with txfm_yrd() and then steps to the next
+// smaller size via sub_tx_size_map, until MAX_TX_DEPTH or TX_4X4 is reached.
+// The best size's rd stats, tx types and block skip flags are kept and, if a
+// valid result was found, written back to mbmi / x.
+static void choose_tx_size_type_from_rd(const AV1_COMP *const cpi,
+                                        MACROBLOCK *x, RD_STATS *rd_stats,
+                                        int64_t ref_best_rd, BLOCK_SIZE bs) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const TX_SIZE max_rect_tx_size = max_txsize_rect_lookup[bs];
+  const int num_blks = bsize_to_num_blk(bs);
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+
+  // Snapshot of the best candidate seen so far.
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx_size = max_rect_tx_size;
+  TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+
+  av1_invalid_rd_stats(rd_stats);
+
+  int start_tx;
+  int depth;
+  if (!tx_select) {
+    // Fixed transform mode: evaluate only the size implied by tx_mode.
+    const TX_SIZE chosen_tx_size = tx_size_from_tx_mode(bs, cm->tx_mode);
+    start_tx = chosen_tx_size;
+    depth = MAX_TX_DEPTH;
+  } else {
+    start_tx = max_rect_tx_size;
+    depth = get_search_init_depth(mi_size_wide[bs], mi_size_high[bs],
+                                  is_inter_block(mbmi), &cpi->sf);
+  }
+
+  prune_tx(cpi, bs, x, xd, EXT_TX_SET_ALL16);
+
+  for (int cur_tx = start_tx; depth <= MAX_TX_DEPTH;
+       depth++, cur_tx = sub_tx_size_map[cur_tx]) {
+#if CONFIG_DIST_8X8
+    if (x->using_dist_8x8) {
+      if (tx_size_wide[cur_tx] < 8 || tx_size_high[cur_tx] < 8) continue;
+    }
+#endif
+    RD_STATS cand_stats;
+    if (mbmi->ref_mv_idx > 0) x->rd_model = LOW_TXFM_RD;
+    const int64_t cand_rd =
+        txfm_yrd(cpi, x, &cand_stats, ref_best_rd, bs, cur_tx, FTXS_NONE);
+    x->rd_model = FULL_TXFM_RD;
+
+    if (cand_rd < best_rd) {
+      memcpy(best_txk_type, mbmi->txk_type, sizeof(best_txk_type));
+      memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * num_blks);
+      best_tx_size = cur_tx;
+      best_rd = cand_rd;
+      *rd_stats = cand_stats;
+    }
+    if (cur_tx == TX_4X4) break;
+  }
+
+  // Restore the winning configuration only if some candidate was valid.
+  if (rd_stats->rate != INT_MAX) {
+    mbmi->tx_size = best_tx_size;
+    memcpy(mbmi->txk_type, best_txk_type, sizeof(best_txk_type));
+    memcpy(x->blk_skip, best_blk_skip, sizeof(best_blk_skip[0]) * num_blks);
+  }
+
+  // Reset the pruning flags.
+  av1_zero(x->tx_search_prune);
+  x->tx_split_prune_flag = 0;
+}
+
+// Computes luma RD stats for the block, dispatching on how the transform
+// size is chosen: TX_4X4 for lossless segments, the tx_mode-implied size when
+// the search method is USE_LARGESTALL, and an RD search otherwise.
+static void super_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            RD_STATS *rd_stats, BLOCK_SIZE bs,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  av1_init_rd_stats(rd_stats);
+
+  assert(bs == xd->mi[0]->sb_type);
+
+  if (xd->lossless[xd->mi[0]->segment_id]) {
+    choose_smallest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+    return;
+  }
+  if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
+    choose_largest_tx_size(cpi, x, rd_stats, ref_best_rd, bs);
+    return;
+  }
+  choose_tx_size_type_from_rd(cpi, x, rd_stats, ref_best_rd, bs);
+}
+
+// Return the rate cost for luma prediction mode info. of intra blocks.
+// Starting from mode_cost, this adds the palette signalling (flag, size,
+// colors and color map), the filter-intra flag and mode, the angle delta and
+// the intrabc flag, each only where the corresponding tool applies.
+static int intra_mode_info_cost_y(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                  const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                  int mode_cost) {
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int use_palette = pmi->palette_size[0] > 0;
+  const int use_filter_intra = mbmi->filter_intra_mode_info.use_filter_intra;
+  const int use_intrabc = mbmi->use_intrabc;
+  // Can only activate one mode.
+  assert(((mbmi->mode != DC_PRED) + use_palette + use_intrabc +
+          use_filter_intra) <= 1);
+
+  int total_rate = mode_cost;
+
+  // Palette signalling.
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mbmi->mode == DC_PRED) {
+    const MACROBLOCKD *xd = &x->e_mbd;
+    const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+    const int mode_ctx = av1_get_palette_mode_ctx(xd);
+    total_rate += x->palette_y_mode_cost[bsize_ctx][mode_ctx][use_palette];
+    if (use_palette) {
+      const uint8_t *const color_map = xd->plane[0].color_index_map;
+      int block_width, block_height, rows, cols;
+      av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                               &cols);
+      const int plt_size = pmi->palette_size[0];
+      int palette_rate =
+          x->palette_y_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+      palette_rate += av1_palette_color_cost_y(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_rate +=
+          av1_cost_color_map(x, 0, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_rate;
+    }
+  }
+
+  // Filter-intra flag and, when used, its mode.
+  if (av1_filter_intra_allowed(&cpi->common, mbmi)) {
+    total_rate += x->filter_intra_cost[mbmi->sb_type][use_filter_intra];
+    if (use_filter_intra) {
+      const int fi_mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      total_rate += x->filter_intra_mode_cost[fi_mode];
+    }
+  }
+
+  // Angle delta of directional modes.
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    const int delta_idx = MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y];
+    total_rate += x->angle_delta_cost[mbmi->mode - V_PRED][delta_idx];
+  }
+
+  if (av1_allow_intrabc(&cpi->common)) {
+    total_rate += x->intrabc_cost[use_intrabc];
+  }
+  return total_rate;
+}
+
+// Return the rate cost for chroma prediction mode info. of intra blocks.
+// Starting from mode_cost, this adds the chroma palette signalling (flag,
+// size, colors and color map) and the angle delta, each only where the
+// corresponding tool applies.
+static int intra_mode_info_cost_uv(const AV1_COMP *cpi, const MACROBLOCK *x,
+                                   const MB_MODE_INFO *mbmi, BLOCK_SIZE bsize,
+                                   int mode_cost) {
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const int use_palette = pmi->palette_size[1] > 0;
+  const UV_PREDICTION_MODE mode = mbmi->uv_mode;
+  // Can only activate one mode.
+  assert(((mode != UV_DC_PRED) + use_palette + mbmi->use_intrabc) <= 1);
+
+  int total_rate = mode_cost;
+
+  const int try_palette =
+      av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+  if (try_palette && mode == UV_DC_PRED) {
+    // The chroma palette flag is conditioned on whether luma uses a palette.
+    const int luma_has_palette = pmi->palette_size[0] > 0;
+    total_rate += x->palette_uv_mode_cost[luma_has_palette][use_palette];
+    if (use_palette) {
+      const int bsize_ctx = av1_get_palette_bsize_ctx(bsize);
+      const int plt_size = pmi->palette_size[1];
+      const MACROBLOCKD *xd = &x->e_mbd;
+      const uint8_t *const color_map = xd->plane[1].color_index_map;
+      int palette_rate =
+          x->palette_uv_size_cost[bsize_ctx][plt_size - PALETTE_MIN_SIZE] +
+          write_uniform_cost(plt_size, color_map[0]);
+      uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+      const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+      palette_rate += av1_palette_color_cost_uv(
+          pmi, color_cache, n_cache, cpi->common.seq_params.bit_depth);
+      palette_rate +=
+          av1_cost_color_map(x, 1, bsize, mbmi->tx_size, PALETTE_MAP);
+      total_rate += palette_rate;
+    }
+  }
+
+  if (av1_is_directional_mode(get_uv_mode(mode)) &&
+      av1_use_angle_delta(bsize)) {
+    const int delta_idx = mbmi->angle_delta[PLANE_TYPE_UV] + MAX_ANGLE_DELTA;
+    total_rate += x->angle_delta_cost[mode - V_PRED][delta_idx];
+  }
+  return total_rate;
+}
+
+// Returns 1 if the diagonal intra mode 'mode' should be skipped given the
+// best intra mode found so far, 0 otherwise. Each of D113, D67, D203 and D157
+// is skipped unless best_intra_mode is one of the two modes paired with it
+// below; every other mode is never skipped.
+static int conditional_skipintra(PREDICTION_MODE mode,
+                                 PREDICTION_MODE best_intra_mode) {
+  switch (mode) {
+    case D113_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D135_PRED;
+    case D67_PRED:
+      return best_intra_mode != V_PRED && best_intra_mode != D45_PRED;
+    case D203_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D45_PRED;
+    case D157_PRED:
+      return best_intra_mode != H_PRED && best_intra_mode != D135_PRED;
+    default: return 0;
+  }
+}
+
+// Model based RD estimation for luma intra blocks.
+// Builds the intra prediction for every transform block of the tx_mode
+// implied size, runs the MODELRD_TYPE_INTRA model on the result, and returns
+// the RD cost of the modelled rate plus mode_cost (including angle delta and
+// filter-intra signalling where applicable).
+static int64_t intra_model_yrd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                               BLOCK_SIZE bsize, int mode_cost, int mi_row,
+                               int mi_col) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(!is_inter_block(mbmi));
+
+  TX_SIZE tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+  const int row_step = tx_size_high_unit[tx_size];
+  const int col_step = tx_size_wide_unit[tx_size];
+  const int blocks_wide = max_block_wide(xd, bsize, 0);
+  const int blocks_high = max_block_high(xd, bsize, 0);
+  mbmi->tx_size = tx_size;
+
+  // Prediction.
+  for (int r = 0; r < blocks_high; r += row_step) {
+    for (int c = 0; c < blocks_wide; c += col_step) {
+      av1_predict_intra_block_facade(cm, xd, 0, c, r, tx_size);
+    }
+  }
+
+  // RD estimation.
+  RD_STATS model_stats;
+  int64_t model_sse;
+  model_rd_sb_fn[MODELRD_TYPE_INTRA](
+      cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_stats.rate,
+      &model_stats.dist, &model_stats.skip, &model_sse, NULL, NULL, NULL);
+
+  // Mode side information not yet included in mode_cost.
+  if (av1_is_directional_mode(mbmi->mode) && av1_use_angle_delta(bsize)) {
+    const int delta_idx = MAX_ANGLE_DELTA + mbmi->angle_delta[PLANE_TYPE_Y];
+    mode_cost += x->angle_delta_cost[mbmi->mode - V_PRED][delta_idx];
+  }
+  if (mbmi->mode == DC_PRED &&
+      av1_filter_intra_allowed_bsize(cm, mbmi->sb_type)) {
+    if (!mbmi->filter_intra_mode_info.use_filter_intra) {
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][0];
+    } else {
+      const int fi_mode = mbmi->filter_intra_mode_info.filter_intra_mode;
+      mode_cost += x->filter_intra_cost[mbmi->sb_type][1] +
+                   x->filter_intra_mode_cost[fi_mode];
+    }
+  }
+
+  return RDCOST(x->rdmult, model_stats.rate + mode_cost, model_stats.dist);
+}
+
+// Extends 'color_map' array from 'orig_width x orig_height' to 'new_width x
+// new_height', in place. Rows are re-strided from the bottom up so that no
+// row is overwritten before it has been moved. Extra columns repeat the last
+// valid column of their row and extra rows repeat the last valid row.
+static void extend_palette_color_map(uint8_t *const color_map, int orig_width,
+                                     int orig_height, int new_width,
+                                     int new_height) {
+  assert(new_width >= orig_width);
+  assert(new_height >= orig_height);
+  if (new_width == orig_width && new_height == orig_height) return;
+
+  const int extra_cols = new_width - orig_width;
+  for (int row = orig_height - 1; row >= 0; --row) {
+    uint8_t *const dst_row = color_map + row * new_width;
+    memmove(dst_row, color_map + row * orig_width, orig_width);
+    // Copy last column to extra columns.
+    memset(dst_row + orig_width, dst_row[orig_width - 1], extra_cols);
+  }
+
+  // Copy last row to extra rows.
+  for (int row = orig_height; row < new_height; ++row) {
+    const uint8_t *const last_row = color_map + (orig_height - 1) * new_width;
+    memcpy(color_map + row * new_width, last_row, new_width);
+  }
+}
+
+// Bias toward using colors in the cache: every centroid (one per 'stride'
+// entries, n_colors in total) that is within 1 of its nearest cached color is
+// replaced by that cached color. Ties go to the earliest cache entry.
+// TODO(huisu): Try other schemes to improve compression.
+static void optimize_palette_colors(uint16_t *color_cache, int n_cache,
+                                    int n_colors, int stride, int *centroids) {
+  if (n_cache <= 0) return;
+
+  const int end = n_colors * stride;
+  for (int pos = 0; pos < end; pos += stride) {
+    // Linear scan for the closest cached color.
+    int nearest = 0;
+    int nearest_diff = abs(centroids[pos] - (int)color_cache[0]);
+    for (int c = 1; c < n_cache; ++c) {
+      const int diff = abs(centroids[pos] - color_cache[c]);
+      if (diff < nearest_diff) {
+        nearest_diff = diff;
+        nearest = c;
+      }
+    }
+    if (nearest_diff <= 1) centroids[pos] = color_cache[nearest];
+  }
+}
+
+// Given the base colors as specified in centroids[], calculate the RD cost
+// of palette mode. The candidate is dropped early when it has fewer than
+// PALETTE_MIN_SIZE unique colors, when its model RD is more than 1.5x the
+// best model RD so far, or when the full luma RD is invalid. If it beats
+// *best_rd, the best mode info, color map, block skip flags and the optional
+// rate / distortion outputs are updated.
+static void palette_rd_y(const AV1_COMP *const cpi, MACROBLOCK *x,
+                         MB_MODE_INFO *mbmi, BLOCK_SIZE bsize, int mi_row,
+                         int mi_col, int dc_mode_cost, const int *data,
+                         int *centroids, int n, uint16_t *color_cache,
+                         int n_cache, MB_MODE_INFO *best_mbmi,
+                         uint8_t *best_palette_color_map, int64_t *best_rd,
+                         int64_t *best_model_rd, int *rate, int *rate_tokenonly,
+                         int *rate_overhead, int64_t *distortion,
+                         int *skippable, PICK_MODE_CONTEXT *ctx,
+                         uint8_t *blk_skip) {
+  optimize_palette_colors(color_cache, n_cache, n, 1, centroids);
+  const int num_colors = av1_remove_duplicates(centroids, n);
+  // Too few unique colors to create a palette. And DC_PRED will work
+  // well for that case anyway. So skip.
+  if (num_colors < PALETTE_MIN_SIZE) return;
+
+  // Store the clipped palette colors in the mode info.
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  if (cpi->common.seq_params.use_highbitdepth) {
+    for (int i = 0; i < num_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel_highbd(
+          (int)centroids[i], cpi->common.seq_params.bit_depth);
+    }
+  } else {
+    for (int i = 0; i < num_colors; ++i) {
+      pmi->palette_colors[i] = clip_pixel(centroids[i]);
+    }
+  }
+  pmi->palette_size[0] = num_colors;
+
+  // Build the color index map for the whole block.
+  MACROBLOCKD *const xd = &x->e_mbd;
+  uint8_t *const color_map = xd->plane[0].color_index_map;
+  int block_width, block_height, rows, cols;
+  av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                           &cols);
+  av1_calc_indices(data, centroids, color_map, rows * cols, num_colors, 1);
+  extend_palette_color_map(color_map, cols, rows, block_width, block_height);
+
+  // Model-based pre-check against the best model RD seen so far.
+  const int palette_mode_cost =
+      intra_mode_info_cost_y(cpi, x, mbmi, bsize, dc_mode_cost);
+  const int64_t model_rd =
+      intra_model_yrd(cpi, x, bsize, palette_mode_cost, mi_row, mi_col);
+  if (*best_model_rd != INT64_MAX &&
+      model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+    return;
+  }
+  if (model_rd < *best_model_rd) *best_model_rd = model_rd;
+
+  // Full luma RD.
+  RD_STATS y_stats;
+  super_block_yrd(cpi, x, &y_stats, bsize, *best_rd);
+  if (y_stats.rate == INT_MAX) return;
+  const int this_rate = y_stats.rate + palette_mode_cost;
+  const int64_t this_rd = RDCOST(x->rdmult, this_rate, y_stats.dist);
+  // Take the transform size rate back out of the token-only rate when the
+  // block signals its transform size.
+  if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type)) {
+    y_stats.rate -= tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+  }
+  if (!(this_rd < *best_rd)) return;
+
+  // New best palette candidate.
+  *best_rd = this_rd;
+  memcpy(best_palette_color_map, color_map,
+         block_width * block_height * sizeof(color_map[0]));
+  *best_mbmi = *mbmi;
+  memcpy(blk_skip, x->blk_skip, sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+  *rate_overhead = this_rate - y_stats.rate;
+  if (rate) *rate = this_rate;
+  if (rate_tokenonly) *rate_tokenonly = y_stats.rate;
+  if (distortion) *distortion = y_stats.dist;
+  if (skippable) *skippable = y_stats.skip;
+}
+
+// Palette search for the luma plane of the current intra block.
+// Candidate palettes are built first from the most frequent source colors and
+// then from k-means centroids, for every size from
+// AOMMIN(colors, PALETTE_MAX_SIZE) down to 2. Each candidate is handed to
+// palette_rd_y(), which updates *best_mbmi, best_palette_color_map, *best_rd
+// and the optional output stats whenever the candidate wins.
+// The search only runs when the block has between 2 and 64 distinct colors.
+// On return *mbmi is a copy of *best_mbmi and, if that mode uses a palette,
+// the plane's color index map holds the winning map.
+// Returns the rate overhead reported for the last winning candidate (0 if no
+// candidate won).
+ static int rd_pick_palette_intra_sby(
+     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+     int mi_col, int dc_mode_cost, MB_MODE_INFO *best_mbmi,
+     uint8_t *best_palette_color_map, int64_t *best_rd, int64_t *best_model_rd,
+     int *rate, int *rate_tokenonly, int64_t *distortion, int *skippable,
+     PICK_MODE_CONTEXT *ctx, uint8_t *best_blk_skip) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   assert(!is_inter_block(mbmi));
+   assert(av1_allow_palette(cpi->common.allow_screen_content_tools, bsize));
+
+   const SequenceHeader *const seq_params = &cpi->common.seq_params;
+   const int is_hbd = seq_params->use_highbitdepth;
+   const uint8_t *const src = x->plane[0].src.buf;
+   const int src_stride = x->plane[0].src.stride;
+   uint8_t *const color_map = xd->plane[0].color_index_map;
+   int rate_overhead = 0;
+
+   int block_width, block_height, rows, cols;
+   av1_get_block_dimensions(bsize, 0, xd, &block_width, &block_height, &rows,
+                            &cols);
+
+   int count_buf[1 << 12];  // Maximum (1 << 12) color levels.
+   const int colors =
+       is_hbd ? av1_count_colors_highbd(src, src_stride, rows, cols,
+                                        seq_params->bit_depth, count_buf)
+              : av1_count_colors(src, src_stride, rows, cols, count_buf);
+   mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+   if (colors > 1 && colors <= 64) {
+     const int max_itr = 50;
+     const int max_n = AOMMIN(colors, PALETTE_MAX_SIZE);
+     int *const data = x->palette_buffer->kmeans_data_buf;
+     int centroids[PALETTE_MAX_SIZE];
+
+     // Copy the source pixels into the k-means buffer while tracking the
+     // smallest (lb) and largest (ub) pixel value.
+     const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src);
+     int lb = is_hbd ? src16[0] : src[0];
+     int ub = lb;
+     for (int r = 0; r < rows; ++r) {
+       for (int c = 0; c < cols; ++c) {
+         const int pos = r * src_stride + c;
+         const int val = is_hbd ? src16[pos] : src[pos];
+         data[r * cols + c] = val;
+         if (val < lb) {
+           lb = val;
+         } else if (val > ub) {
+           ub = val;
+         }
+       }
+     }
+
+     mbmi->mode = DC_PRED;
+
+     uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+     const int n_cache = av1_get_palette_cache(xd, 0, color_cache);
+
+     // Find the dominant colors, stored in top_colors[]. Each pass picks the
+     // color with the highest remaining count and then zeroes that count.
+     int top_colors[PALETTE_MAX_SIZE] = { 0 };
+     for (int i = 0; i < max_n; ++i) {
+       int max_count = 0;
+       for (int j = 0; j < (1 << seq_params->bit_depth); ++j) {
+         if (count_buf[j] > max_count) {
+           max_count = count_buf[j];
+           top_colors[i] = j;
+         }
+       }
+       assert(max_count > 0);
+       count_buf[top_colors[i]] = 0;
+     }
+
+     // Try the dominant colors directly.
+     // TODO(huisu@google.com): Try to avoid duplicate computation in cases
+     // where the dominant colors and the k-means results are similar.
+     for (int n = max_n; n >= 2; --n) {
+       for (int i = 0; i < n; ++i) centroids[i] = top_colors[i];
+       palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+                    centroids, n, color_cache, n_cache, best_mbmi,
+                    best_palette_color_map, best_rd, best_model_rd, rate,
+                    rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+                    best_blk_skip);
+     }
+
+     // K-means clustering.
+     for (int n = max_n; n >= 2; --n) {
+       if (colors == PALETTE_MIN_SIZE) {
+         // Special case: These colors automatically become the centroids.
+         assert(colors == n);
+         assert(colors == 2);
+         centroids[0] = lb;
+         centroids[1] = ub;
+       } else {
+         // Seed the centroids evenly across [lb, ub] before clustering.
+         for (int i = 0; i < n; ++i) {
+           centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+         }
+         av1_k_means(data, centroids, color_map, rows * cols, n, 1, max_itr);
+       }
+       palette_rd_y(cpi, x, mbmi, bsize, mi_row, mi_col, dc_mode_cost, data,
+                    centroids, n, color_cache, n_cache, best_mbmi,
+                    best_palette_color_map, best_rd, best_model_rd, rate,
+                    rate_tokenonly, &rate_overhead, distortion, skippable, ctx,
+                    best_blk_skip);
+     }
+   }
+
+   // Restore the winning color index map (if any) and mode info.
+   if (best_mbmi->palette_mode_info.palette_size[0] > 0) {
+     memcpy(color_map, best_palette_color_map,
+            block_width * block_height * sizeof(best_palette_color_map[0]));
+   }
+   *mbmi = *best_mbmi;
+   return rate_overhead;
+ }
+
+// Tries every filter intra mode on top of DC_PRED for the luma plane.
+// Whenever a mode beats *best_rd, the output stats, *best_rd and
+// ctx->blk_skip are updated. If at least one mode won, the winning tx size,
+// tx types and filter intra info are written back to the block's mode info.
+// Returns 1 if a filter intra mode is selected; returns 0 otherwise.
+ static int rd_pick_filter_intra_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                     int mi_row, int mi_col, int *rate,
+                                     int *rate_tokenonly, int64_t *distortion,
+                                     int *skippable, BLOCK_SIZE bsize,
+                                     int mode_cost, int64_t *best_rd,
+                                     int64_t *best_model_rd,
+                                     PICK_MODE_CONTEXT *ctx) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *mbmi = xd->mi[0];
+   FILTER_INTRA_MODE_INFO best_fi_info;
+   TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+   TX_SIZE best_tx_size = TX_8X8;
+   int found = 0;
+
+   av1_zero(best_fi_info);
+   mbmi->filter_intra_mode_info.use_filter_intra = 1;
+   mbmi->mode = DC_PRED;
+   mbmi->palette_mode_info.palette_size[0] = 0;
+
+   for (FILTER_INTRA_MODE mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+     mbmi->filter_intra_mode_info.filter_intra_mode = mode;
+
+     // Model-based pruning: skip modes whose model RD exceeds 1.5x the best.
+     const int64_t model_rd =
+         intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
+     if (*best_model_rd != INT64_MAX &&
+         model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+       continue;
+     }
+     if (model_rd < *best_model_rd) *best_model_rd = model_rd;
+
+     RD_STATS tokenonly_rd_stats;
+     super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+     if (tokenonly_rd_stats.rate == INT_MAX) continue;
+
+     const int this_rate =
+         tokenonly_rd_stats.rate +
+         intra_mode_info_cost_y(cpi, x, mbmi, bsize, mode_cost);
+     const int64_t this_rd =
+         RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+     if (this_rd >= *best_rd) continue;
+
+     // New best: remember everything needed to restore this mode later.
+     *best_rd = this_rd;
+     best_tx_size = mbmi->tx_size;
+     best_fi_info = mbmi->filter_intra_mode_info;
+     memcpy(best_txk_type, mbmi->txk_type,
+            sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+     memcpy(ctx->blk_skip, x->blk_skip,
+            sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+     *rate = this_rate;
+     *rate_tokenonly = tokenonly_rd_stats.rate;
+     *distortion = tokenonly_rd_stats.dist;
+     *skippable = tokenonly_rd_stats.skip;
+     found = 1;
+   }
+
+   if (!found) return 0;
+
+   mbmi->mode = DC_PRED;
+   mbmi->tx_size = best_tx_size;
+   mbmi->filter_intra_mode_info = best_fi_info;
+   memcpy(mbmi->txk_type, best_txk_type,
+          sizeof(best_txk_type[0]) * TXK_TYPE_BUF_LEN);
+   return 1;
+ }
+
+// Runs the RD calculation for the given luma intra prediction angle delta and
+// returns the RD cost. INT64_MAX is returned when the candidate is pruned by
+// the model RD check or when super_block_yrd() reports an invalid rate.
+// If the RD cost is the best so far, the best-mode outputs (*best_rd,
+// *best_angle_delta, *best_tx_size, best_txk_type, best_blk_skip, *rate and
+// rd_stats) are updated.
+ static int64_t calc_rd_given_intra_angle(
+     const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+     int mi_col, int mode_cost, int64_t best_rd_in, int8_t angle_delta,
+     int max_angle_delta, int *rate, RD_STATS *rd_stats, int *best_angle_delta,
+     TX_SIZE *best_tx_size, int64_t *best_rd, int64_t *best_model_rd,
+     TX_TYPE *best_txk_type, uint8_t *best_blk_skip) {
+   MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+   const int n4 = bsize_to_num_blk(bsize);
+   assert(!is_inter_block(mbmi));
+   mbmi->angle_delta[PLANE_TYPE_Y] = angle_delta;
+
+   // Model-based pruning: give up when the model RD exceeds 1.5x the best.
+   const int64_t model_rd =
+       intra_model_yrd(cpi, x, bsize, mode_cost, mi_row, mi_col);
+   if (*best_model_rd != INT64_MAX &&
+       model_rd > *best_model_rd + (*best_model_rd >> 1)) {
+     return INT64_MAX;
+   }
+   if (model_rd < *best_model_rd) *best_model_rd = model_rd;
+
+   RD_STATS tokenonly_rd_stats;
+   super_block_yrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd_in);
+   if (tokenonly_rd_stats.rate == INT_MAX) return INT64_MAX;
+
+   // Total rate = mode cost + token rate + cost of signaling the angle delta.
+   const int this_rate =
+       mode_cost + tokenonly_rd_stats.rate +
+       x->angle_delta_cost[mbmi->mode - V_PRED][max_angle_delta + angle_delta];
+   const int64_t this_rd =
+       RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+   if (this_rd >= *best_rd) return this_rd;
+
+   memcpy(best_txk_type, mbmi->txk_type,
+          sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+   memcpy(best_blk_skip, x->blk_skip, sizeof(best_blk_skip[0]) * n4);
+   *best_rd = this_rd;
+   *best_angle_delta = mbmi->angle_delta[PLANE_TYPE_Y];
+   *best_tx_size = mbmi->tx_size;
+   *rate = this_rate;
+   rd_stats->rate = tokenonly_rd_stats.rate;
+   rd_stats->dist = tokenonly_rd_stats.dist;
+   rd_stats->skip = tokenonly_rd_stats.skip;
+   return this_rd;
+ }
+
+// For the given luma directional intra prediction mode, picks the best angle
+// delta and returns the corresponding RD cost.
+// Even deltas (0, +/-2, ...) are searched first. An odd delta is then searched
+// only if at least one of its two even neighbours on the same side has an RD
+// cost within best_rd + best_rd / 32.
+// If any candidate succeeded (rd_stats->rate != INT_MAX), the winning tx size,
+// angle delta, tx types and blk_skip are restored into the block state.
+ static int64_t rd_pick_intra_angle_sby(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                        int mi_row, int mi_col, int *rate,
+                                        RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                                        int mode_cost, int64_t best_rd,
+                                        int64_t *best_model_rd) {
+   MB_MODE_INFO *mbmi = x->e_mbd.mi[0];
+   assert(!is_inter_block(mbmi));
+
+   TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+   uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+   TX_SIZE best_tx_size = mbmi->tx_size;
+   int best_angle_delta = 0;
+
+   // rd_cost[2 * delta + side] caches the RD cost of each searched candidate.
+   int64_t rd_cost[2 * (MAX_ANGLE_DELTA + 2)];
+   for (int k = 0; k < 2 * (MAX_ANGLE_DELTA + 2); ++k) rd_cost[k] = INT64_MAX;
+
+   // Pass 1: even angle deltas.
+   int first_try = 1;
+   for (int delta = 0; delta <= MAX_ANGLE_DELTA; delta += 2) {
+     for (int side = 0; side < 2; ++side) {
+       // Allow a looser early-termination bound on the very first candidate.
+       const int64_t best_rd_in =
+           (best_rd == INT64_MAX)
+               ? INT64_MAX
+               : (best_rd + (best_rd >> (first_try ? 3 : 5)));
+       const int64_t this_rd = calc_rd_given_intra_angle(
+           cpi, x, bsize, mi_row, mi_col, mode_cost, best_rd_in,
+           (1 - 2 * side) * delta, MAX_ANGLE_DELTA, rate, rd_stats,
+           &best_angle_delta, &best_tx_size, &best_rd, best_model_rd,
+           best_txk_type, best_blk_skip);
+       rd_cost[2 * delta + side] = this_rd;
+       if (first_try && this_rd == INT64_MAX) return best_rd;
+       first_try = 0;
+       if (delta == 0) {
+         // Delta 0 has no sign; share its cost with both sides.
+         rd_cost[1] = this_rd;
+         break;
+       }
+     }
+   }
+
+   assert(best_rd != INT64_MAX);
+
+   // Pass 2: odd angle deltas, skipped when both even neighbours are poor.
+   for (int delta = 1; delta <= MAX_ANGLE_DELTA; delta += 2) {
+     for (int side = 0; side < 2; ++side) {
+       const int64_t rd_thresh = best_rd + (best_rd >> 5);
+       const int neighbours_poor =
+           rd_cost[2 * (delta + 1) + side] > rd_thresh &&
+           rd_cost[2 * (delta - 1) + side] > rd_thresh;
+       if (neighbours_poor) continue;
+       calc_rd_given_intra_angle(cpi, x, bsize, mi_row, mi_col, mode_cost,
+                                 best_rd, (1 - 2 * side) * delta,
+                                 MAX_ANGLE_DELTA, rate, rd_stats,
+                                 &best_angle_delta, &best_tx_size, &best_rd,
+                                 best_model_rd, best_txk_type, best_blk_skip);
+     }
+   }
+
+   if (rd_stats->rate != INT_MAX) {
+     mbmi->tx_size = best_tx_size;
+     mbmi->angle_delta[PLANE_TYPE_Y] = best_angle_delta;
+     memcpy(mbmi->txk_type, best_txk_type,
+            sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+     memcpy(x->blk_skip, best_blk_skip,
+            sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+   }
+   return best_rd;
+ }
+
+/* Maps a quantized pixel gradient to an angle-bin (histogram) index.
+ * Indices are sign, integer, and fractional part of the gradient value
+ * |dx| / |dy|: the sign flag is (dx > 0) ^ (dy > 0), the integer part is
+ * clamped to 6 and the fractional part is expressed in 1/16 steps. */
+ static const uint8_t gradient_to_angle_bin[2][7][16] = {
+ {
+ { 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ {
+ { 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4 },
+ { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 },
+ { 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 },
+ },
+ };
+
+/* clang-format off */
+ static const uint8_t mode_to_angle_bin[INTRA_MODES] = {  // Indexed by intra mode; the estimators below read it only for directional modes.
+ 0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+ 0,
+ };
+/* clang-format on */
+
+// Builds a gradient-orientation histogram of an 8-bit source block and uses it
+// to flag directional intra modes that are unlikely to be useful.
+// directional_mode_skip_mask must have INTRA_MODES entries; it is cleared
+// first, and entry i is set to 1 when directional mode i should be skipped.
+// Nothing is flagged when angle deltas are not used for this block size.
+ static void angle_estimation(const uint8_t *src, int src_stride, int rows,
+                             int cols, BLOCK_SIZE bsize,
+                             uint8_t *directional_mode_skip_mask) {
+   memset(directional_mode_skip_mask, 0,
+          INTRA_MODES * sizeof(*directional_mode_skip_mask));
+   // Check if angle_delta is used
+   if (!av1_use_angle_delta(bsize)) return;
+
+   // Accumulate squared gradient magnitude per angle bin. The first row and
+   // column are skipped because the gradients need a left and an above pixel.
+   uint64_t hist[DIRECTIONAL_MODES] = { 0 };
+   const uint8_t *row_ptr = src + src_stride;
+   for (int r = 1; r < rows; ++r, row_ptr += src_stride) {
+     for (int c = 1; c < cols; ++c) {
+       const int grad_x = row_ptr[c] - row_ptr[c - 1];
+       const int grad_y = row_ptr[c] - row_ptr[c - src_stride];
+       int bin = 2;  // Bin used when the vertical gradient is zero.
+       if (grad_y != 0) {
+         const int sn = (grad_x > 0) ^ (grad_y > 0);
+         const int abs_x = abs(grad_x);
+         const int abs_y = abs(grad_y);
+         const int quot = abs_x / abs_y;
+         const int remd = (abs_x % abs_y) * 16 / abs_y;
+         bin = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+       }
+       hist[bin] += grad_x * grad_x + grad_y * grad_y;
+     }
+   }
+
+   uint64_t hist_sum = 0;
+   for (int b = 0; b < DIRECTIONAL_MODES; ++b) hist_sum += hist[b];
+
+   // Score each directional mode by its own bin (weight 2) plus the adjacent
+   // bins (weight 1 each), and flag modes whose share falls below threshold.
+   for (int mode = 0; mode < INTRA_MODES; ++mode) {
+     if (!av1_is_directional_mode(mode)) continue;
+     const uint8_t angle_bin = mode_to_angle_bin[mode];
+     uint64_t score = 2 * hist[angle_bin];
+     int weight = 2;
+     if (angle_bin > 0) {
+       score += hist[angle_bin - 1];
+       ++weight;
+     }
+     if (angle_bin < DIRECTIONAL_MODES - 1) {
+       score += hist[angle_bin + 1];
+       ++weight;
+     }
+     if (score * ANGLE_SKIP_THRESH < hist_sum * weight) {
+       directional_mode_skip_mask[mode] = 1;
+     }
+   }
+ }
+
+// High bit-depth counterpart of angle_estimation(): src8 is converted with
+// CONVERT_TO_SHORTPTR() and read as 16-bit samples.
+// directional_mode_skip_mask must have INTRA_MODES entries; it is cleared
+// first, and entry i is set to 1 when directional mode i should be skipped.
+// Nothing is flagged when angle deltas are not used for this block size.
+ static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+                                    int rows, int cols, BLOCK_SIZE bsize,
+                                    uint8_t *directional_mode_skip_mask) {
+   memset(directional_mode_skip_mask, 0,
+          INTRA_MODES * sizeof(*directional_mode_skip_mask));
+   // Check if angle_delta is used
+   if (!av1_use_angle_delta(bsize)) return;
+
+   // Accumulate squared gradient magnitude per angle bin. The first row and
+   // column are skipped because the gradients need a left and an above pixel.
+   uint64_t hist[DIRECTIONAL_MODES] = { 0 };
+   const uint16_t *row_ptr = CONVERT_TO_SHORTPTR(src8);
+   row_ptr += src_stride;
+   for (int r = 1; r < rows; ++r, row_ptr += src_stride) {
+     for (int c = 1; c < cols; ++c) {
+       const int grad_x = row_ptr[c] - row_ptr[c - 1];
+       const int grad_y = row_ptr[c] - row_ptr[c - src_stride];
+       int bin = 2;  // Bin used when the vertical gradient is zero.
+       if (grad_y != 0) {
+         const int sn = (grad_x > 0) ^ (grad_y > 0);
+         const int abs_x = abs(grad_x);
+         const int abs_y = abs(grad_y);
+         const int quot = abs_x / abs_y;
+         const int remd = (abs_x % abs_y) * 16 / abs_y;
+         bin = gradient_to_angle_bin[sn][AOMMIN(quot, 6)][AOMMIN(remd, 15)];
+       }
+       hist[bin] += grad_x * grad_x + grad_y * grad_y;
+     }
+   }
+
+   uint64_t hist_sum = 0;
+   for (int b = 0; b < DIRECTIONAL_MODES; ++b) hist_sum += hist[b];
+
+   // Score each directional mode by its own bin (weight 2) plus the adjacent
+   // bins (weight 1 each), and flag modes whose share falls below threshold.
+   for (int mode = 0; mode < INTRA_MODES; ++mode) {
+     if (!av1_is_directional_mode(mode)) continue;
+     const uint8_t angle_bin = mode_to_angle_bin[mode];
+     uint64_t score = 2 * hist[angle_bin];
+     int weight = 2;
+     if (angle_bin > 0) {
+       score += hist[angle_bin - 1];
+       ++weight;
+     }
+     if (angle_bin < DIRECTIONAL_MODES - 1) {
+       score += hist[angle_bin + 1];
+       ++weight;
+     }
+     if (score * ANGLE_SKIP_THRESH < hist_sum * weight) {
+       directional_mode_skip_mask[mode] = 1;
+     }
+   }
+ }
+
+// Given the already selected prediction mode, searches for the best tx type
+// and size. If the resulting RD cost beats *best_rd, then *best_mbmi,
+// *best_rd, the output stats and ctx->blk_skip are updated; otherwise all
+// outputs are left untouched.
+ static void intra_block_yrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                            BLOCK_SIZE bsize, const int *bmode_costs,
+                            int64_t *best_rd, int *rate, int *rate_tokenonly,
+                            int64_t *distortion, int *skippable,
+                            MB_MODE_INFO *best_mbmi, PICK_MODE_CONTEXT *ctx) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+
+   RD_STATS rd_stats;
+   super_block_yrd(cpi, x, &rd_stats, bsize, *best_rd);
+   if (rd_stats.rate == INT_MAX) return;
+
+   // super_block_yrd above includes the cost of the tx_size in the
+   // tokenonly rate, but for intra blocks, tx_size is always coded
+   // (prediction granularity), so we account for it in the full rate,
+   // not the tokenonly rate.
+   const int txsize_is_signaled =
+       !xd->lossless[mbmi->segment_id] && block_signals_txsize(mbmi->sb_type);
+   const int this_rate_tokenonly =
+       txsize_is_signaled
+           ? rd_stats.rate - tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size)
+           : rd_stats.rate;
+
+   const int this_rate =
+       rd_stats.rate +
+       intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+   const int64_t this_rd = RDCOST(x->rdmult, this_rate, rd_stats.dist);
+   if (this_rd >= *best_rd) return;
+
+   *best_mbmi = *mbmi;
+   *best_rd = this_rd;
+   *rate = this_rate;
+   *rate_tokenonly = this_rate_tokenonly;
+   *distortion = rd_stats.dist;
+   *skippable = rd_stats.skip;
+   memcpy(ctx->blk_skip, x->blk_skip,
+          sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+
+// This function is used only for intra_only frames.
+// Searches the luma intra prediction modes in intra_rd_search_mode_order,
+// then (when allowed) palette and filter intra, and finally reruns the tx
+// search with the full tx type set if the earlier searches used only the
+// default tx type. The winning mode info is stored in the block's mode info,
+// the output stats describe it, and the best RD cost is returned.
+ static int64_t rd_pick_intra_sby_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                       int mi_row, int mi_col, int *rate,
+                                       int *rate_tokenonly, int64_t *distortion,
+                                       int *skippable, BLOCK_SIZE bsize,
+                                       int64_t best_rd, PICK_MODE_CONTEXT *ctx) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   assert(!is_inter_block(mbmi));
+
+   PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+   const int rows = block_size_high[bsize];
+   const int cols = block_size_wide[bsize];
+   const uint8_t *src = x->plane[0].src.buf;
+   const int src_stride = x->plane[0].src.stride;
+   const int try_palette =
+       av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+   uint8_t *best_palette_color_map =
+       try_palette ? x->palette_buffer->best_palette_color_map : NULL;
+
+   // Mode costs are conditioned on the above and left neighbours' modes.
+   const PREDICTION_MODE A = av1_above_block_mode(xd->above_mbmi);
+   const PREDICTION_MODE L = av1_left_block_mode(xd->left_mbmi);
+   const int *const bmode_costs =
+       x->y_mode_costs[intra_mode_context[A]][intra_mode_context[L]];
+
+   // Use the source gradients to decide which directional modes to skip.
+   uint8_t directional_mode_skip_mask[INTRA_MODES];
+   mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+     highbd_angle_estimation(src, src_stride, rows, cols, bsize,
+                             directional_mode_skip_mask);
+   } else {
+     angle_estimation(src, src_stride, rows, cols, bsize,
+                      directional_mode_skip_mask);
+   }
+   mbmi->filter_intra_mode_info.use_filter_intra = 0;
+   pmi->palette_size[0] = 0;
+
+   x->use_default_intra_tx_type =
+       cpi->sf.tx_type_search.fast_intra_tx_type_search ? 1 : 0;
+
+   MB_MODE_INFO best_mbmi = *mbmi;
+   int64_t best_model_rd = INT64_MAX;
+   int beat_best_rd = 0;
+
+   /* Y Search for intra prediction mode */
+   for (int mode_idx = INTRA_MODE_START; mode_idx < INTRA_MODE_END; ++mode_idx) {
+     mbmi->mode = intra_rd_search_mode_order[mode_idx];
+     mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+
+     // Model-based pruning: skip modes whose model RD exceeds 1.5x the best.
+     const int64_t this_model_rd =
+         intra_model_yrd(cpi, x, bsize, bmode_costs[mbmi->mode], mi_row, mi_col);
+     if (best_model_rd != INT64_MAX &&
+         this_model_rd > best_model_rd + (best_model_rd >> 1)) {
+       continue;
+     }
+     if (this_model_rd < best_model_rd) best_model_rd = this_model_rd;
+
+     const int is_directional_mode = av1_is_directional_mode(mbmi->mode);
+     if (is_directional_mode && directional_mode_skip_mask[mbmi->mode]) continue;
+
+     RD_STATS this_rd_stats;
+     int this_rate;
+     if (is_directional_mode && av1_use_angle_delta(bsize)) {
+       // The rate stays INT_MAX unless some angle delta succeeds.
+       this_rd_stats.rate = INT_MAX;
+       rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &this_rate,
+                               &this_rd_stats, bsize, bmode_costs[mbmi->mode],
+                               best_rd, &best_model_rd);
+     } else {
+       super_block_yrd(cpi, x, &this_rd_stats, bsize, best_rd);
+     }
+     if (this_rd_stats.rate == INT_MAX) continue;
+
+     int this_rate_tokenonly = this_rd_stats.rate;
+     const int64_t this_distortion = this_rd_stats.dist;
+     const int s = this_rd_stats.skip;
+
+     if (!xd->lossless[mbmi->segment_id] &&
+         block_signals_txsize(mbmi->sb_type)) {
+       // super_block_yrd above includes the cost of the tx_size in the
+       // tokenonly rate, but for intra blocks, tx_size is always coded
+       // (prediction granularity), so we account for it in the full rate,
+       // not the tokenonly rate.
+       this_rate_tokenonly -=
+           tx_size_cost(&cpi->common, x, bsize, mbmi->tx_size);
+     }
+     this_rate =
+         this_rd_stats.rate +
+         intra_mode_info_cost_y(cpi, x, mbmi, bsize, bmode_costs[mbmi->mode]);
+     const int64_t this_rd = RDCOST(x->rdmult, this_rate, this_distortion);
+     if (this_rd < best_rd) {
+       best_mbmi = *mbmi;
+       best_rd = this_rd;
+       beat_best_rd = 1;
+       *rate = this_rate;
+       *rate_tokenonly = this_rate_tokenonly;
+       *distortion = this_distortion;
+       *skippable = s;
+       memcpy(ctx->blk_skip, x->blk_skip,
+              sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+     }
+   }
+
+   if (try_palette) {
+     rd_pick_palette_intra_sby(
+         cpi, x, bsize, mi_row, mi_col, bmode_costs[DC_PRED], &best_mbmi,
+         best_palette_color_map, &best_rd, &best_model_rd, rate, rate_tokenonly,
+         distortion, skippable, ctx, ctx->blk_skip);
+   }
+
+   // Filter intra is tried only if a regular mode beat the incoming best_rd.
+   if (beat_best_rd && av1_filter_intra_allowed_bsize(&cpi->common, bsize)) {
+     if (rd_pick_filter_intra_sby(
+             cpi, x, mi_row, mi_col, rate, rate_tokenonly, distortion, skippable,
+             bsize, bmode_costs[DC_PRED], &best_rd, &best_model_rd, ctx)) {
+       best_mbmi = *mbmi;
+     }
+   }
+
+   // If previous searches use only the default tx type, do an extra search for
+   // the best tx type.
+   if (x->use_default_intra_tx_type) {
+     *mbmi = best_mbmi;
+     x->use_default_intra_tx_type = 0;
+     intra_block_yrd(cpi, x, bsize, bmode_costs, &best_rd, rate, rate_tokenonly,
+                     distortion, skippable, &best_mbmi, ctx);
+   }
+
+   *mbmi = best_mbmi;
+   return best_rd;
+ }
+
+// Accumulates the RD stats of both chroma planes into *rd_stats.
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+// When x->skip_chroma_rd is set the function returns right after initializing
+// *rd_stats, without touching the chroma planes.
+ static int super_block_uvrd(const AV1_COMP *const cpi, MACROBLOCK *x,
+                             RD_STATS *rd_stats, BLOCK_SIZE bsize,
+                             int64_t ref_best_rd) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   struct macroblockd_plane *const pd = &xd->plane[AOM_PLANE_U];
+   const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+
+   av1_init_rd_stats(rd_stats);
+   // A negative reference cost can never be beaten.
+   int is_cost_valid = (ref_best_rd < 0) ? 0 : 1;
+
+   if (x->skip_chroma_rd) return is_cost_valid;
+
+   bsize = scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
+
+   if (is_cost_valid) {
+     if (is_inter_block(mbmi)) {
+       for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+         av1_subtract_plane(x, bsize, plane);
+       }
+     }
+
+     for (int plane = 1; plane < MAX_MB_PLANE; ++plane) {
+       RD_STATS pn_rd_stats;
+       txfm_rd_in_plane(x, cpi, &pn_rd_stats, ref_best_rd, plane, bsize,
+                        uv_tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+       if (pn_rd_stats.rate == INT_MAX) {
+         is_cost_valid = 0;
+         break;
+       }
+       av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+       // Stop once both coding and skipping the residual exceed the budget.
+       if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) > ref_best_rd &&
+           RDCOST(x->rdmult, 0, rd_stats->sse) > ref_best_rd) {
+         is_cost_valid = 0;
+         break;
+       }
+     }
+   }
+
+   if (!is_cost_valid) {
+     // reset cost value
+     av1_invalid_rd_stats(rd_stats);
+   }
+
+   return is_cost_valid;
+ }
+
+// Computes the RD stats of one transform block and merges them into *rd_stats.
+// When rd_info_array holds a valid record computed under the same entropy
+// context and the cached tx type matches the one currently in effect, the
+// cached rate/dist/sse/eob values are reused instead of running
+// search_txk_type(). After a fresh search the result is stored in
+// rd_info_array (if non-NULL) for later reuse.
+ static void tx_block_rd_b(const AV1_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                          int blk_row, int blk_col, int plane, int block,
+                          int plane_bsize, TXB_CTX *txb_ctx, RD_STATS *rd_stats,
+                          FAST_TX_SEARCH_MODE ftxs_mode, int64_t ref_rdcost,
+                          TXB_RD_INFO *rd_info_array) {
+   const struct macroblock_plane *const p = &x->plane[plane];
+   // Pack both context values into one key for the cache comparison.
+   const uint16_t cur_joint_ctx =
+       (txb_ctx->dc_sign_ctx << 8) + txb_ctx->txb_skip_ctx;
+   const int txk_type_idx =
+       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+   const int has_cache = rd_info_array != NULL;
+
+   // Look up RD and terminate early in case when we've already processed exactly
+   // the same residual with exactly the same entropy context.
+   if (has_cache && rd_info_array->valid &&
+       rd_info_array->entropy_context == cur_joint_ctx) {
+     if (plane == 0) {
+       x->e_mbd.mi[0]->txk_type[txk_type_idx] = rd_info_array->tx_type;
+     }
+     const TX_TYPE ref_tx_type =
+         av1_get_tx_type(get_plane_type(plane), &x->e_mbd, blk_row, blk_col,
+                         tx_size, cpi->common.reduced_tx_set_used);
+     if (ref_tx_type == rd_info_array->tx_type) {
+       rd_stats->rate += rd_info_array->rate;
+       rd_stats->dist += rd_info_array->dist;
+       rd_stats->sse += rd_info_array->sse;
+       rd_stats->skip &= rd_info_array->eob == 0;
+       p->eobs[block] = rd_info_array->eob;
+       p->txb_entropy_ctx[block] = rd_info_array->txb_entropy_ctx;
+       return;
+     }
+   }
+
+   RD_STATS this_rd_stats;
+   search_txk_type(cpi, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                   txb_ctx, ftxs_mode, 0, ref_rdcost, &this_rd_stats);
+   av1_merge_rd_stats(rd_stats, &this_rd_stats);
+
+   if (!has_cache) return;
+
+   // Save RD results for possible reuse in future.
+   rd_info_array->valid = 1;
+   rd_info_array->entropy_context = cur_joint_ctx;
+   rd_info_array->rate = this_rd_stats.rate;
+   rd_info_array->dist = this_rd_stats.dist;
+   rd_info_array->sse = this_rd_stats.sse;
+   rd_info_array->eob = p->eobs[block];
+   rd_info_array->txb_entropy_ctx = p->txb_entropy_ctx[block];
+   if (plane == 0) {
+     rd_info_array->tx_type = x->e_mbd.mi[0]->txk_type[txk_type_idx];
+   }
+ }
+
+// Computes the mean and the (population) standard deviation of a bw x bh
+// block of int16_t samples whose rows are `stride` elements apart.
+// *dev is 0 when the computed variance is not positive.
+ static void get_mean_and_dev(const int16_t *data, int stride, int bw, int bh,
+                             float *mean, float *dev) {
+   int sum = 0;
+   uint64_t sq_sum = 0;
+   const int16_t *row = data;
+   for (int r = 0; r < bh; ++r, row += stride) {
+     for (int c = 0; c < bw; ++c) {
+       const int sample = row[c];
+       sum += sample;
+       sq_sum += sample * sample;
+     }
+   }
+
+   const int num = bw * bh;
+   const float e_x = (float)sum / num;
+   const float e_x2 = (float)((double)sq_sum / num);
+   // Variance = E[x^2] - E[x]^2.
+   const float variance = e_x2 - e_x * e_x;
+   *dev = (variance > 0) ? sqrtf(variance) : 0;
+   *mean = e_x;
+ }
+
+// Float counterpart of get_mean_and_dev(): computes the mean and the
+// (population) standard deviation of a bw x bh block of float samples whose
+// rows are `stride` elements apart. *dev is 0 when the computed variance is
+// not positive.
+ static void get_mean_and_dev_float(const float *data, int stride, int bw,
+                                   int bh, float *mean, float *dev) {
+   float sum = 0;
+   float sq_sum = 0;
+   const float *row = data;
+   for (int r = 0; r < bh; ++r, row += stride) {
+     for (int c = 0; c < bw; ++c) {
+       const float sample = row[c];
+       sum += sample;
+       sq_sum += sample * sample;
+     }
+   }
+
+   const int num = bw * bh;
+   const float e_x = sum / num;
+   const float e_x2 = sq_sum / num;
+   // Variance = E[x^2] - E[x]^2.
+   const float variance = e_x2 - e_x * e_x;
+   *dev = (variance > 0) ? sqrtf(variance) : 0;
+   *mean = e_x;
+ }
+
+// Features used by the model to predict tx split: the mean and standard
+// deviation values of the block and sub-blocks.
+// For each of up to `levels` levels the bw x bh block is tiled with sub-blocks
+// of the current size; (mean, dev) of every sub-block is appended to
+// `feature`. When a level has more than one sub-block, the deviation of the
+// means and the mean of the deviations are appended too. The sub-block size
+// shrinks between levels; the loop stops early once either dimension drops
+// below 2.
+ static void get_mean_dev_features(const int16_t *data, int stride, int bw,
+                                  int bh, int levels, float *feature) {
+   float *out = feature;
+   int sub_w = bw;
+   int sub_h = bh;
+   for (int lv = 0; lv < levels; ++lv) {
+     if (sub_w < 2 || sub_h < 2) break;
+
+     float mean_buf[16];
+     float dev_buf[16];
+     int num_blks = 0;
+     for (int row = 0; row < bh; row += sub_h) {
+       for (int col = 0; col < bw; col += sub_w) {
+         float mean, dev;
+         get_mean_and_dev(data + row * stride + col, stride, sub_w, sub_h,
+                          &mean, &dev);
+         *out++ = mean;
+         *out++ = dev;
+         mean_buf[num_blks] = mean;
+         dev_buf[num_blks] = dev;
+         ++num_blks;
+       }
+     }
+
+     if (num_blks > 1) {
+       float mean, dev;
+       // Deviation of means.
+       get_mean_and_dev_float(mean_buf, 1, 1, num_blks, &mean, &dev);
+       *out++ = dev;
+       // Mean of deviations.
+       get_mean_and_dev_float(dev_buf, 1, 1, num_blks, &mean, &dev);
+       *out++ = mean;
+     }
+
+     // Reduce the block size when proceeding to the next level: halve the
+     // longer side, or both sides when the sub-block is square.
+     const int shrink_h = sub_h >= sub_w;
+     const int shrink_w = sub_w >= sub_h;
+     if (shrink_h) sub_h >>= 1;
+     if (shrink_w) sub_w >>= 1;
+   }
+ }
+
+// Runs the neural-network model registered for tx_size on mean/deviation
+// features of the luma residual at (blk_row, blk_col).
+// Returns -1 when no model exists for tx_size; otherwise a value in [0, 100]
+// obtained by passing the network output through a sigmoid (outputs above 8
+// or below -8 short-circuit to 100 and 0 respectively).
+ static int ml_predict_tx_split(MACROBLOCK *x, BLOCK_SIZE bsize, int blk_row,
+                                int blk_col, TX_SIZE tx_size) {
+   const NN_CONFIG *nn_config = av1_tx_split_nnconfig_map[tx_size];
+   if (!nn_config) return -1;
+
+   // Locate the residual of this transform block; blk_row/blk_col are scaled
+   // by 4 to get sample offsets.
+   const int diff_stride = block_size_wide[bsize];
+   const int16_t *diff =
+       x->plane[0].src_diff + 4 * blk_row * diff_stride + 4 * blk_col;
+   const int bw = tx_size_wide[tx_size];
+   const int bh = tx_size_high[tx_size];
+   aom_clear_system_state();
+
+   float features[64] = { 0.0f };
+   get_mean_dev_features(diff, diff_stride, bw, bh, 2, features);
+
+   float score = 0.0f;
+   av1_nn_predict(features, nn_config, &score);
+   if (score > 8.0f) return 100;
+   if (score < -8.0f) return 0;
+
+   const float prob = 1.0f / (1.0f + (float)exp(-score));
+   return (int)(prob * 100);
+ }
+
+ typedef struct {  // Result record filled in by try_tx_block_no_split().
+ int64_t rd;  // RD cost of the candidate; reset to INT64_MAX before evaluation.
+ int txb_entropy_ctx;  // Copy of the plane's txb_entropy_ctx[] entry for the block; reset to 0.
+ TX_TYPE tx_type;  // Tx type read back from mbmi->txk_type[]; reset to TX_TYPES.
+ } TxCandidateInfo;
+
+// Evaluates coding the luma transform block at (blk_row, blk_col) with
+// tx_size as a single, unsplit transform.
+// The RD stats of the candidate are accumulated into *rd_stats and summarized
+// in *no_split (RD cost, entropy context and tx type). If coding the
+// coefficients is not better than signaling an all-zero block, or the search
+// already reported a skip, and the segment is not lossless, the block is
+// forced to skip: rate becomes the zero-block rate, dist becomes sse, the eob
+// is cleared and the tx type is reset to DCT_DCT.
+// The cost of signaling "no split" is added when a split would be allowed at
+// this size and depth.
+ static void try_tx_block_no_split(
+     const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize,
+     const ENTROPY_CONTEXT *ta, const ENTROPY_CONTEXT *tl,
+     int txfm_partition_ctx, RD_STATS *rd_stats, int64_t ref_best_rd,
+     FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+     TxCandidateInfo *no_split) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   MB_MODE_INFO *const mbmi = xd->mi[0];
+   struct macroblock_plane *const p = &x->plane[0];
+   const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+
+   // Start from an "invalid" result so callers can detect early exits.
+   no_split->rd = INT64_MAX;
+   no_split->txb_entropy_ctx = 0;
+   no_split->tx_type = TX_TYPES;
+
+   const ENTROPY_CONTEXT *const pta = ta + blk_col;
+   const ENTROPY_CONTEXT *const ptl = tl + blk_row;
+
+   const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+   TXB_CTX txb_ctx;
+   get_txb_ctx(plane_bsize, tx_size, 0, pta, ptl, &txb_ctx);
+   // Rate of signaling that the whole transform block is zero.
+   const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_Y]
+                                 .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+
+   rd_stats->ref_rdcost = ref_best_rd;
+   rd_stats->zero_rate = zero_blk_rate;
+   const int index = av1_get_txb_size_index(plane_bsize, blk_row, blk_col);
+   mbmi->inter_tx_size[index] = tx_size;
+   tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+                 &txb_ctx, rd_stats, ftxs_mode, ref_best_rd,
+                 rd_info_node != NULL ? rd_info_node->rd_info_array : NULL);
+   assert(rd_stats->rate < INT_MAX);
+
+   if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+            RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+        rd_stats->skip == 1) &&
+       !xd->lossless[mbmi->segment_id]) {
+#if CONFIG_RD_DEBUG
+     // This function only handles the luma plane, so the plane index is 0.
+     // (There is no `plane` variable in scope here.)
+     av1_update_txb_coeff_cost(rd_stats, 0, tx_size, blk_row, blk_col,
+                               zero_blk_rate - rd_stats->rate);
+#endif  // CONFIG_RD_DEBUG
+     rd_stats->rate = zero_blk_rate;
+     rd_stats->dist = rd_stats->sse;
+     rd_stats->skip = 1;
+     set_blk_skip(x, 0, blk_row * bw + blk_col, 1);
+     p->eobs[block] = 0;
+     update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+                      DCT_DCT);
+   } else {
+     set_blk_skip(x, 0, blk_row * bw + blk_col, 0);
+     rd_stats->skip = 0;
+   }
+
+   // Account for the "no split" flag only when a split could be signaled.
+   if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+     rd_stats->rate += x->txfm_partition_cost[txfm_partition_ctx][0];
+
+   no_split->rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+   no_split->txb_entropy_ctx = p->txb_entropy_ctx[block];
+   const int txk_type_idx =
+       av1_get_txk_type_index(plane_bsize, blk_row, blk_col);
+   no_split->tx_type = mbmi->txk_type[txk_type_idx];
+ }
+
+ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int depth,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+ TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+ int64_t ref_best_rd, int *is_cost_valid,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_node);  // Forward declaration: try_tx_block_split() below calls this before its definition.
+
+// Evaluates splitting the transform block at (blk_row, blk_col) into
+// sub-blocks of size sub_tx_size_map[tx_size], recursing through
+// select_tx_block() for each sub-block inside the visible area.
+// The sub-block stats are accumulated into *split_rd_stats on top of the cost
+// of signaling the split. The search stops as soon as a sub-block is invalid
+// or the running RD cost exceeds no_split_rd; *split_rd is written only when
+// every sub-block was evaluated successfully.
+ static void try_tx_block_split(
+     const AV1_COMP *cpi, MACROBLOCK *x, int blk_row, int blk_col, int block,
+     TX_SIZE tx_size, int depth, BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+     ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+     int txfm_partition_ctx, int64_t no_split_rd, int64_t ref_best_rd,
+     FAST_TX_SEARCH_MODE ftxs_mode, TXB_RD_INFO_NODE *rd_info_node,
+     RD_STATS *split_rd_stats, int64_t *split_rd) {
+   MACROBLOCKD *const xd = &x->e_mbd;
+   const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+   const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+   const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+   const int bsw = tx_size_wide_unit[sub_txs];
+   const int bsh = tx_size_high_unit[sub_txs];
+   const int sub_step = bsw * bsh;
+   int64_t running_rd = 0;
+   int all_valid = 1;
+
+   split_rd_stats->rate = x->txfm_partition_cost[txfm_partition_ctx][1];
+
+   assert(tx_size < TX_SIZES_ALL);
+
+   int blk_idx = 0;
+   // The outer loop also stops once a sub-block invalidated the split.
+   for (int r = 0; all_valid && r < tx_size_high_unit[tx_size]; r += bsh) {
+     for (int c = 0; c < tx_size_wide_unit[tx_size]; c += bsw, ++blk_idx) {
+       const int offsetr = blk_row + r;
+       const int offsetc = blk_col + c;
+       // Sub-blocks outside the visible area are not coded.
+       if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+       assert(blk_idx < 4);
+
+       RD_STATS sub_rd_stats;
+       select_tx_block(
+           cpi, x, offsetr, offsetc, block, sub_txs, depth + 1, plane_bsize, ta,
+           tl, tx_above, tx_left, &sub_rd_stats, ref_best_rd - running_rd,
+           &all_valid, ftxs_mode,
+           (rd_info_node != NULL) ? rd_info_node->children[blk_idx] : NULL);
+       if (!all_valid) break;
+
+       av1_merge_rd_stats(split_rd_stats, &sub_rd_stats);
+       running_rd =
+           RDCOST(x->rdmult, split_rd_stats->rate, split_rd_stats->dist);
+
+       // Splitting is already worse than not splitting: give up.
+       if (no_split_rd < running_rd) {
+         all_valid = 0;
+         break;
+       }
+       block += sub_step;
+     }
+   }
+
+   if (all_valid) *split_rd = running_rd;
+ }
+
+ /* Search for the best tx partition/type for a given luma block.
+ * Compares coding the block with one transform of tx_size (no split) against
+ * splitting it via try_tx_block_split() and keeps the option with the lower
+ * RD cost. On return rd_stats holds the stats of the chosen option.
+ * *is_cost_valid is only ever cleared here (never set to 1), so the caller
+ * has to initialize it.
+ */
+ static void select_tx_block(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size, int depth,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *ta,
+ ENTROPY_CONTEXT *tl, TXFM_CONTEXT *tx_above,
+ TXFM_CONTEXT *tx_left, RD_STATS *rd_stats,
+ int64_t ref_best_rd, int *is_cost_valid,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_node) {
+ assert(tx_size < TX_SIZES_ALL);
+ av1_init_rd_stats(rd_stats);
+ if (ref_best_rd < 0) { // no RD budget left
+ *is_cost_valid = 0;
+ return;
+ }
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; // rd_stats keeps its initial value
+
+ const int bw = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->sb_type, tx_size);
+ struct macroblock_plane *const p = &x->plane[0];
+
+ const int try_no_split = 1; // the no-split candidate is always evaluated
+ int try_split = tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH;
+#if CONFIG_DIST_8X8
+ if (x->using_dist_8x8)
+ try_split &= tx_size_wide[tx_size] >= 16 && tx_size_high[tx_size] >= 16;
+#endif
+ TxCandidateInfo no_split = { INT64_MAX, 0, TX_TYPES };
+
+ // TX no split
+ if (try_no_split) {
+ try_tx_block_no_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, ctx, rd_stats, ref_best_rd,
+ ftxs_mode, rd_info_node, &no_split);
+
+ if (cpi->sf.adaptive_txb_search_level && // give up if no_split.rd, reduced by a level-dependent fraction, still exceeds the budget
+ (no_split.rd -
+ (no_split.rd >> (1 + cpi->sf.adaptive_txb_search_level))) >
+ ref_best_rd) {
+ *is_cost_valid = 0;
+ return;
+ }
+
+ if (cpi->sf.txb_split_cap) {
+ if (p->eobs[block] == 0) try_split = 0; // eob is 0 without a split: do not try splitting
+ }
+ }
+
+ if (x->e_mbd.bd == 8 && !x->cb_partition_scan && try_split) {
+ const int threshold = cpi->sf.tx_type_search.ml_tx_split_thresh;
+ if (threshold >= 0) { // a negative threshold disables this pruning
+ const int split_score =
+ ml_predict_tx_split(x, plane_bsize, blk_row, blk_col, tx_size);
+ if (split_score >= 0 && split_score < threshold) try_split = 0;
+ }
+ }
+
+ // TX split
+ int64_t split_rd = INT64_MAX;
+ RD_STATS split_rd_stats;
+ av1_init_rd_stats(&split_rd_stats);
+ if (try_split) {
+ try_tx_block_split(cpi, x, blk_row, blk_col, block, tx_size, depth,
+ plane_bsize, ta, tl, tx_above, tx_left, ctx, no_split.rd,
+ AOMMIN(no_split.rd, ref_best_rd), ftxs_mode,
+ rd_info_node, &split_rd_stats, &split_rd);
+ }
+
+ if (no_split.rd < split_rd) { // keep the unsplit transform and record it in the block state
+ ENTROPY_CONTEXT *pta = ta + blk_col;
+ ENTROPY_CONTEXT *ptl = tl + blk_row;
+ const TX_SIZE tx_size_selected = tx_size;
+ p->txb_entropy_ctx[block] = no_split.txb_entropy_ctx;
+ av1_set_txb_context(x, 0, block, tx_size_selected, pta, ptl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ for (int idy = 0; idy < tx_size_high_unit[tx_size]; ++idy) {
+ for (int idx = 0; idx < tx_size_wide_unit[tx_size]; ++idx) {
+ const int index =
+ av1_get_txb_size_index(plane_bsize, blk_row + idy, blk_col + idx);
+ mbmi->inter_tx_size[index] = tx_size_selected;
+ }
+ }
+ mbmi->tx_size = tx_size_selected;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ no_split.tx_type);
+ set_blk_skip(x, 0, blk_row * bw + blk_col, rd_stats->skip);
+ } else {
+ *rd_stats = split_rd_stats;
+ if (split_rd == INT64_MAX) *is_cost_valid = 0; // neither candidate produced a cost
+ }
+ }
+ /* Runs the transform partition search (select_tx_block) over plane 0 of the
+ * block in steps of max_txsize_rect_lookup[plane_bsize] and merges the
+ * per-unit results into rd_stats. rd_stats is invalidated when ref_best_rd
+ * is negative or when any unit fails to return a valid cost. If signalling
+ * skip costs no more than coding the residual, the result is turned into a
+ * skip block (rate 0, dist = sse).
+ */
+ static void select_inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode,
+ TXB_RD_INFO_NODE *rd_info_tree) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0, skip_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = max_txsize_rect_lookup[plane_bsize];
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; // local copies: the search updates them as it goes
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+
+ RD_STATS pn_rd_stats;
+ const int init_depth =
+ get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+ av1_init_rd_stats(&pn_rd_stats);
+
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+
+ skip_rd = RDCOST(x->rdmult, s1, 0);
+ this_rd = RDCOST(x->rdmult, s0, 0);
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ int64_t best_rd_sofar = (ref_best_rd - (AOMMIN(skip_rd, this_rd))); // budget left for this unit
+ select_tx_block(cpi, x, idy, idx, block, max_tx_size, init_depth,
+ plane_bsize, ctxa, ctxl, tx_above, tx_left,
+ &pn_rd_stats, best_rd_sofar, &is_cost_valid, ftxs_mode,
+ rd_info_tree);
+ if (!is_cost_valid || pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ block += step;
+ if (rd_info_tree != NULL) rd_info_tree += 1; // next unit uses the next root node
+ }
+ }
+ if (skip_rd <= this_rd) { // skipping the residual is at least as good
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ } else {
+ rd_stats->skip = 0;
+ }
+ }
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+ }
+
+ // Picks the transform partition for plane 0 of the current block and returns
+ // the resulting RD cost, or INT64_MAX when no valid cost was produced.
+ // When cpi->sf.tx_size_search_method is above USE_FULL_RD the partition search
+ // runs with FTXS_DCT_AND_1D_DCT_ONLY under a threshold relaxed by 1/8, and
+ // inter_block_yrd() then re-evaluates the chosen partition with FTXS_NONE.
+ static int64_t select_tx_size_fix_type(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd,
+ TXB_RD_INFO_NODE *rd_info_tree) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const int use_fast_search = cpi->sf.tx_size_search_method > USE_FULL_RD;
+ const int inter_coded = is_inter_block(xd->mi[0]);
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int no_skip_cost = x->skip_cost[skip_ctx][0];
+ const int skip_cost = x->skip_cost[skip_ctx][1];
+
+ // TODO(debargha): enable this as a speed feature where the
+ // select_inter_block_yrd() function above will use a simplified search
+ // such as not using full optimize, but the inter_block_yrd() function
+ // will use more complex search given that the transform partitions have
+ // already been decided.
+
+ // Relax the threshold by 1/8 for the fast search, unless that would overflow.
+ int64_t search_thresh = ref_best_rd;
+ if (use_fast_search && search_thresh < INT64_MAX &&
+ INT64_MAX - search_thresh > (search_thresh >> 3)) {
+ search_thresh += (search_thresh >> 3);
+ }
+ assert(search_thresh > 0);
+
+ const FAST_TX_SEARCH_MODE partition_search_mode =
+ use_fast_search ? FTXS_DCT_AND_1D_DCT_ONLY : FTXS_NONE;
+ select_inter_block_yrd(cpi, x, rd_stats, bsize, search_thresh,
+ partition_search_mode, rd_info_tree);
+ if (rd_stats->rate == INT_MAX) return INT64_MAX;
+
+ // The fast search only tested DCT and 1D DCT; redo the tx type search with
+ // the tx sizes that were just decided.
+ if (use_fast_search &&
+ !inter_block_yrd(cpi, x, rd_stats, bsize, ref_best_rd, FTXS_NONE)) {
+ return INT64_MAX;
+ }
+
+ if (rd_stats->skip) return RDCOST(x->rdmult, skip_cost, rd_stats->sse);
+
+ int64_t coded_rd =
+ RDCOST(x->rdmult, rd_stats->rate + no_skip_cost, rd_stats->dist);
+ // A non-lossless inter block may still be cheaper when signalled as skip.
+ if (inter_coded && !xd->lossless[xd->mi[0]->segment_id]) {
+ coded_rd = AOMMIN(coded_rd, RDCOST(x->rdmult, skip_cost, rd_stats->sse));
+ }
+ return coded_rd;
+ }
+
+ /* Finds rd cost for a y block, given the transform size partitions
+ * (mbmi->inter_tx_size). When tx_size equals the size stored for this
+ * position the block is evaluated with tx_block_rd_b(); otherwise the
+ * function recurses into the sub_tx_size_map[tx_size] sub-blocks. rd_stats
+ * is invalidated as soon as a sub-block reports rate == INT_MAX.
+ */
+ static void tx_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, int depth,
+ ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+ TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+ int64_t ref_best_rd, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, 0);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, 0);
+
+ assert(tx_size < TX_SIZES_ALL);
+
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return; // rd_stats is not touched on this path
+
+ const TX_SIZE plane_tx_size = mbmi->inter_tx_size[av1_get_txb_size_index(
+ plane_bsize, blk_row, blk_col)];
+
+ int ctx = txfm_partition_context(tx_above + blk_col, tx_left + blk_row,
+ mbmi->sb_type, tx_size);
+
+ av1_init_rd_stats(rd_stats);
+ if (tx_size == plane_tx_size) { // this is the stored transform size: evaluate it
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, 0, ta, tl, &txb_ctx);
+
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][get_plane_type(0)]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];
+ rd_stats->zero_rate = zero_blk_rate;
+ rd_stats->ref_rdcost = ref_best_rd;
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, 0, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, ref_best_rd, NULL);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ if (RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) { // zeroing the block is no worse, or skip was already chosen
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 1);
+ x->plane[0].eobs[block] = 0;
+ x->plane[0].txb_entropy_ctx[block] = 0;
+ update_txk_array(mbmi->txk_type, plane_bsize, blk_row, blk_col, tx_size,
+ DCT_DCT);
+ } else {
+ rd_stats->skip = 0;
+ set_blk_skip(x, 0, blk_row * mi_width + blk_col, 0);
+ }
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[ctx][0]; // cost of signalling "no split"
+ av1_set_txb_context(x, 0, block, tx_size, ta, tl);
+ txfm_partition_update(tx_above + blk_col, tx_left + blk_row, tx_size,
+ tx_size);
+ } else {
+ const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+ const int bsw = tx_size_wide_unit[sub_txs];
+ const int bsh = tx_size_high_unit[sub_txs];
+ const int step = bsh * bsw;
+ RD_STATS pn_rd_stats;
+ int64_t this_rd = 0; // RD cost accumulated over the sub-blocks so far
+ assert(bsw > 0 && bsh > 0);
+
+ for (int row = 0; row < tx_size_high_unit[tx_size]; row += bsh) {
+ for (int col = 0; col < tx_size_wide_unit[tx_size]; col += bsw) {
+ const int offsetr = blk_row + row;
+ const int offsetc = blk_col + col;
+
+ if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide) continue;
+
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, offsetr, offsetc, block, sub_txs, plane_bsize,
+ depth + 1, above_ctx, left_ctx, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist);
+ block += step;
+ }
+ }
+
+ if (tx_size > TX_4X4 && depth < MAX_VARTX_DEPTH)
+ rd_stats->rate += x->txfm_partition_cost[ctx][1]; // cost of signalling "split"
+ }
+ }
+
+ /* Computes the plane-0 RD stats of the block by running tx_block_yrd() over
+ * every get_vartx_max_txsize()-sized unit, then applies the block-level skip
+ * decision. rd_stats is invalidated whenever 0 is returned.
+ */
+ /* Return value 0: early termination triggered, no valid rd cost available;
+ * 1: rd cost values are valid.
+ */
+ static int inter_block_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t ref_best_rd, FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+
+ if (ref_best_rd < 0) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (is_cost_valid) {
+ const struct macroblockd_plane *const pd = &xd->plane[0];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = mi_size_wide[plane_bsize];
+ const int mi_height = mi_size_high[plane_bsize];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, 0);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ const int init_depth =
+ get_search_init_depth(mi_width, mi_height, 1, &cpi->sf);
+ int idx, idy;
+ int block = 0;
+ int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+ ENTROPY_CONTEXT ctxa[MAX_MIB_SIZE]; // local copies: tx_block_yrd() updates them as it goes
+ ENTROPY_CONTEXT ctxl[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+ TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+ RD_STATS pn_rd_stats;
+
+ av1_get_entropy_contexts(bsize, pd, ctxa, ctxl);
+ memcpy(tx_above, xd->above_txfm_context, sizeof(TXFM_CONTEXT) * mi_width);
+ memcpy(tx_left, xd->left_txfm_context, sizeof(TXFM_CONTEXT) * mi_height);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_yrd(cpi, x, idy, idx, block, max_tx_size, plane_bsize,
+ init_depth, ctxa, ctxl, tx_above, tx_left,
+ ref_best_rd - this_rd, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd += // charge the cheaper of coding the unit and zeroing it
+ AOMMIN(RDCOST(x->rdmult, pn_rd_stats.rate, pn_rd_stats.dist),
+ RDCOST(x->rdmult, pn_rd_stats.zero_rate, pn_rd_stats.sse));
+ block += step;
+ }
+ }
+ }
+
+ const int skip_ctx = av1_get_skip_context(xd); // this part also runs when is_cost_valid is already 0
+ const int s0 = x->skip_cost[skip_ctx][0];
+ const int s1 = x->skip_cost[skip_ctx][1];
+ int64_t skip_rd = RDCOST(x->rdmult, s1, rd_stats->sse);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate + s0, rd_stats->dist);
+ if (skip_rd < this_rd) { // signalling skip is strictly cheaper
+ this_rd = skip_rd;
+ rd_stats->rate = 0;
+ rd_stats->dist = rd_stats->sse;
+ rd_stats->skip = 1;
+ }
+ if (this_rd > ref_best_rd) is_cost_valid = 0;
+
+ if (!is_cost_valid) {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+ return is_cost_valid;
+ }
+
+ // Returns a key for the plane-0 residual of the block: the CRC32C of the
+ // first 2 * width * height bytes of src_diff, shifted left by 5 bits, plus
+ // bsize.
+ static INLINE uint32_t get_block_residue_hash(MACROBLOCK *x, BLOCK_SIZE bsize) {
+ const int num_bytes = 2 * block_size_high[bsize] * block_size_wide[bsize];
+ uint8_t *const residual_bytes = (uint8_t *)x->plane[0].src_diff;
+ const uint32_t crc = av1_get_crc32c_value(&x->mb_rd_record.crc_calculator,
+ residual_bytes, num_bytes);
+ return (crc << 5) + bsize;
+ }
+
+ // Stores the current transform decision of the block (tx_size, the first n4
+ // blk_skip entries, inter_tx_size, txk_type) and rd_stats under `hash` in the
+ // circular buffer tx_rd_record. Once the buffer holds RD_RECORD_BUFFER_LEN
+ // entries, the entry at index_start is overwritten and index_start advances.
+ static void save_tx_rd_info(int n4, uint32_t hash, const MACROBLOCK *const x,
+ const RD_STATS *const rd_stats,
+ MB_RD_RECORD *tx_rd_record) {
+ int slot;
+ if (tx_rd_record->num >= RD_RECORD_BUFFER_LEN) {
+ // Buffer full: reuse the slot at index_start and move the start forward.
+ slot = tx_rd_record->index_start;
+ tx_rd_record->index_start =
+ (tx_rd_record->index_start + 1) % RD_RECORD_BUFFER_LEN;
+ } else {
+ slot =
+ (tx_rd_record->index_start + tx_rd_record->num) % RD_RECORD_BUFFER_LEN;
+ ++tx_rd_record->num;
+ }
+
+ const MB_MODE_INFO *const mode_info = x->e_mbd.mi[0];
+ MB_RD_INFO *const entry = &tx_rd_record->tx_rd_info[slot];
+ entry->hash_value = hash;
+ entry->tx_size = mode_info->tx_size;
+ memcpy(entry->blk_skip, x->blk_skip, sizeof(entry->blk_skip[0]) * n4);
+ av1_copy(entry->inter_tx_size, mode_info->inter_tx_size);
+ av1_copy(entry->txk_type, mode_info->txk_type);
+ entry->rd_stats = *rd_stats;
+ }
+
+ // Restores a saved transform decision: copies tx_size, the first n4 blk_skip
+ // entries, inter_tx_size and txk_type from tx_rd_info into the current block
+ // state, and copies the saved stats into *rd_stats.
+ static void fetch_tx_rd_info(int n4, const MB_RD_INFO *const tx_rd_info,
+ RD_STATS *const rd_stats, MACROBLOCK *const x) {
+ MB_MODE_INFO *const mode_info = x->e_mbd.mi[0];
+ mode_info->tx_size = tx_rd_info->tx_size;
+ memcpy(x->blk_skip, tx_rd_info->blk_skip,
+ n4 * sizeof(tx_rd_info->blk_skip[0]));
+ av1_copy(mode_info->inter_tx_size, tx_rd_info->inter_tx_size);
+ av1_copy(mode_info->txk_type, tx_rd_info->txk_type);
+ *rd_stats = tx_rd_info->rd_stats;
+ }
+
+ // Returns the index of the entry in cur_record whose hash equals `hash`.
+ // If there is none, a slot is claimed in the circular buffer (overwriting the
+ // entry at index_start once the buffer is full), tagged with `hash`, its RD
+ // info is zeroed, and the index of that slot is returned.
+ static int find_tx_size_rd_info(TXB_RD_RECORD *cur_record,
+ const uint32_t hash) {
+ // Scan downwards from index_start - 1 to 0 first ...
+ int pos = cur_record->index_start;
+ while (--pos >= 0) {
+ if (cur_record->hash_vals[pos] == hash) return pos;
+ }
+ // ... then downwards from num - 1 to index_start.
+ pos = cur_record->num;
+ while (--pos >= cur_record->index_start) {
+ if (cur_record->hash_vals[pos] == hash) return pos;
+ }
+
+ // No match: claim a slot for a new record.
+ int slot;
+ if (cur_record->num >= TX_SIZE_RD_RECORD_BUFFER_LEN) {
+ slot = cur_record->index_start;
+ cur_record->index_start =
+ (cur_record->index_start + 1) % TX_SIZE_RD_RECORD_BUFFER_LEN;
+ } else {
+ slot = (cur_record->index_start + cur_record->num) %
+ TX_SIZE_RD_RECORD_BUFFER_LEN;
+ cur_record->num++;
+ }
+
+ cur_record->hash_vals[slot] = hash;
+ av1_zero(cur_record->tx_rd_info[slot]);
+ return slot;
+ }
+
+ typedef struct {
+ int leaf; // nonzero: init_rd_record_tree() clears all child links of this node
+ int8_t children[4]; // child node indices; only values > 0 get linked
+ } RD_RECORD_IDX_NODE;
+ /* Node layouts consumed by init_rd_record_tree() to link a flat array of
+ * TXB_RD_INFO_NODE entries: entry i describes node i, and a child value
+ * greater than 0 is the array index of that child (any other value yields a
+ * NULL link). Entries flagged as leaf get all child links cleared.
+ */
+ static const RD_RECORD_IDX_NODE rd_record_tree_8x8[] = {
+ { 1, { 0 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_8x16[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0, 0, 0, 0 } },
+ { 1, { 0, 0, 0, 0 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_16x8[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 1, { 0 } },
+ { 1, { 0 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_16x16[] = {
+ { 0, { 1, 2, 3, 4 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } }, { 1, { 0 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_1_2[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 5, 6 } },
+ { 0, { 7, 8, 9, 10 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_2_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, 7, 8 } },
+ { 0, { 5, 6, 9, 10 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_sqr[] = {
+ { 0, { 1, 2, 3, 4 } }, { 0, { 5, 6, 9, 10 } }, { 0, { 7, 8, 11, 12 } },
+ { 0, { 13, 14, 17, 18 } }, { 0, { 15, 16, 19, 20 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_64x128[] = {
+ { 0, { 2, 3, 4, 5 } }, { 0, { 6, 7, 8, 9 } },
+ { 0, { 10, 11, 14, 15 } }, { 0, { 12, 13, 16, 17 } },
+ { 0, { 18, 19, 22, 23 } }, { 0, { 20, 21, 24, 25 } },
+ { 0, { 26, 27, 30, 31 } }, { 0, { 28, 29, 32, 33 } },
+ { 0, { 34, 35, 38, 39 } }, { 0, { 36, 37, 40, 41 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_128x64[] = {
+ { 0, { 2, 3, 6, 7 } }, { 0, { 4, 5, 8, 9 } },
+ { 0, { 10, 11, 18, 19 } }, { 0, { 12, 13, 20, 21 } },
+ { 0, { 14, 15, 22, 23 } }, { 0, { 16, 17, 24, 25 } },
+ { 0, { 26, 27, 34, 35 } }, { 0, { 28, 29, 36, 37 } },
+ { 0, { 30, 31, 38, 39 } }, { 0, { 32, 33, 40, 41 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_128x128[] = {
+ { 0, { 4, 5, 8, 9 } }, { 0, { 6, 7, 10, 11 } },
+ { 0, { 12, 13, 16, 17 } }, { 0, { 14, 15, 18, 19 } },
+ { 0, { 20, 21, 28, 29 } }, { 0, { 22, 23, 30, 31 } },
+ { 0, { 24, 25, 32, 33 } }, { 0, { 26, 27, 34, 35 } },
+ { 0, { 36, 37, 44, 45 } }, { 0, { 38, 39, 46, 47 } },
+ { 0, { 40, 41, 48, 49 } }, { 0, { 42, 43, 50, 51 } },
+ { 0, { 52, 53, 60, 61 } }, { 0, { 54, 55, 62, 63 } },
+ { 0, { 56, 57, 64, 65 } }, { 0, { 58, 59, 66, 67 } },
+ { 0, { 68, 69, 76, 77 } }, { 0, { 70, 71, 78, 79 } },
+ { 0, { 72, 73, 80, 81 } }, { 0, { 74, 75, 82, 83 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_1_4[] = {
+ { 0, { 1, -1, 2, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+ };
+
+ static const RD_RECORD_IDX_NODE rd_record_tree_4_1[] = {
+ { 0, { 1, 2, -1, -1 } },
+ { 0, { 3, 4, -1, -1 } },
+ { 0, { 5, 6, -1, -1 } },
+ };
+ /* Node layout table for each BLOCK_SIZE; NULL where no layout is defined. */
+ static const RD_RECORD_IDX_NODE *rd_record_tree[BLOCK_SIZES_ALL] = {
+ NULL, // BLOCK_4X4
+ NULL, // BLOCK_4X8
+ NULL, // BLOCK_8X4
+ rd_record_tree_8x8, // BLOCK_8X8
+ rd_record_tree_8x16, // BLOCK_8X16
+ rd_record_tree_16x8, // BLOCK_16X8
+ rd_record_tree_16x16, // BLOCK_16X16
+ rd_record_tree_1_2, // BLOCK_16X32
+ rd_record_tree_2_1, // BLOCK_32X16
+ rd_record_tree_sqr, // BLOCK_32X32
+ rd_record_tree_1_2, // BLOCK_32X64
+ rd_record_tree_2_1, // BLOCK_64X32
+ rd_record_tree_sqr, // BLOCK_64X64
+ rd_record_tree_64x128, // BLOCK_64X128
+ rd_record_tree_128x64, // BLOCK_128X64
+ rd_record_tree_128x128, // BLOCK_128X128
+ NULL, // BLOCK_4X16
+ NULL, // BLOCK_16X4
+ rd_record_tree_1_4, // BLOCK_8X32
+ rd_record_tree_4_1, // BLOCK_32X8
+ rd_record_tree_1_4, // BLOCK_16X64
+ rd_record_tree_4_1, // BLOCK_64X16
+ };
+ /* Number of entries in the matching rd_record_tree[] layout (0 if none). */
+ static const int rd_record_tree_size[BLOCK_SIZES_ALL] = {
+ 0, // BLOCK_4X4
+ 0, // BLOCK_4X8
+ 0, // BLOCK_8X4
+ sizeof(rd_record_tree_8x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X8
+ sizeof(rd_record_tree_8x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X16
+ sizeof(rd_record_tree_16x8) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X8
+ sizeof(rd_record_tree_16x16) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X16
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X32
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X16
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X32
+ sizeof(rd_record_tree_1_2) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X64
+ sizeof(rd_record_tree_2_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X32
+ sizeof(rd_record_tree_sqr) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X64
+ sizeof(rd_record_tree_64x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X128
+ sizeof(rd_record_tree_128x64) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X64
+ sizeof(rd_record_tree_128x128) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_128X128
+ 0, // BLOCK_4X16
+ 0, // BLOCK_16X4
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_8X32
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_32X8
+ sizeof(rd_record_tree_1_4) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_16X64
+ sizeof(rd_record_tree_4_1) / sizeof(RD_RECORD_IDX_NODE), // BLOCK_64X16
+ };
+
+ // Links the flat node array `tree` according to the layout registered for
+ // bsize in rd_record_tree[]: leaf nodes get all child pointers cleared, other
+ // nodes point at &tree[idx] for every child index idx > 0 and at NULL
+ // otherwise. Does nothing when rd_record_tree_size[bsize] is 0.
+ static INLINE void init_rd_record_tree(TXB_RD_INFO_NODE *tree,
+ BLOCK_SIZE bsize) {
+ const RD_RECORD_IDX_NODE *const layout = rd_record_tree[bsize];
+ const int num_nodes = rd_record_tree_size[bsize];
+ for (int node = 0; node < num_nodes; ++node) {
+ TXB_RD_INFO_NODE *const dst = &tree[node];
+ if (layout[node].leaf) {
+ av1_zero(dst->children);
+ continue;
+ }
+ for (int k = 0; k < 4; ++k) {
+ const int8_t child = layout[node].children[k];
+ dst->children[k] = (child > 0) ? &tree[child] : NULL;
+ }
+ }
+ }
+
+ // Go through all TX blocks that could be used in TX size search, compute
+ // residual hash values for them and find matching RD info that stores previous
+ // RD search results for these TX blocks. The idea is to prevent repeated
+ // rate/distortion computations that happen because of the combination of
+ // partition and TX size search. The resulting RD info records are returned in
+ // the form of a quadtree for easier access in actual TX size search.
+ //
+ // dst_rd_info is first linked with init_rd_record_tree(); its nodes are then
+ // filled level by level (largest TX size first, raster order within a level).
+ // Nodes for rectangular TX sizes get rd_info_array = NULL.
+ // Returns 0 without touching dst_rd_info when max_txsize_lookup[bsize] is
+ // smaller than TX_8X8, and 1 otherwise.
+ static int find_tx_size_rd_records(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, TXB_RD_INFO_NODE *dst_rd_info) {
+ TXB_RD_RECORD *rd_records_table[4] = { x->txb_rd_record_8X8,
+ x->txb_rd_record_16X16,
+ x->txb_rd_record_32X32,
+ x->txb_rd_record_64X64 };
+ const TX_SIZE max_square_tx_size = max_txsize_lookup[bsize];
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+
+ // Hashing is performed only for square TX sizes larger than TX_4X4
+ if (max_square_tx_size < TX_8X8) return 0;
+ const int diff_stride = bw;
+ const struct macroblock_plane *const p = &x->plane[0];
+ const int16_t *diff = &p->src_diff[0];
+ init_rd_record_tree(dst_rd_info, bsize);
+ // Coordinates of the top-left corner of current block within the superblock
+ // measured in pixels:
+ const int mi_row_in_sb = (mi_row % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+ const int mi_col_in_sb = (mi_col % MAX_MIB_SIZE) << MI_SIZE_LOG2;
+ int cur_rd_info_idx = 0;
+ int cur_tx_depth = 0;
+ TX_SIZE cur_tx_size = max_txsize_rect_lookup[bsize];
+ while (cur_tx_depth <= MAX_VARTX_DEPTH) {
+ const int cur_tx_bw = tx_size_wide[cur_tx_size];
+ const int cur_tx_bh = tx_size_high[cur_tx_size];
+ if (cur_tx_bw < 8 || cur_tx_bh < 8) break;
+ const TX_SIZE next_tx_size = sub_tx_size_map[cur_tx_size];
+ const int tx_size_idx = cur_tx_size - TX_8X8;
+ for (int row = 0; row < bh; row += cur_tx_bh) {
+ for (int col = 0; col < bw; col += cur_tx_bw) {
+ if (cur_tx_bw != cur_tx_bh) {
+ // Use dummy nodes for all rectangular transforms within the
+ // TX size search tree.
+ dst_rd_info[cur_rd_info_idx].rd_info_array = NULL;
+ } else {
+ // Get spatial location of this TX block within the superblock
+ // (measured in units of the current TX block size).
+ const int row_in_sb = (mi_row_in_sb + row) / cur_tx_bh;
+ const int col_in_sb = (mi_col_in_sb + col) / cur_tx_bw;
+
+ // Pack the TX block's residual rows contiguously before hashing.
+ int16_t hash_data[MAX_SB_SQUARE];
+ int16_t *cur_hash_row = hash_data;
+ const int16_t *cur_diff_row = diff + row * diff_stride + col;
+ for (int i = 0; i < cur_tx_bh; i++) {
+ memcpy(cur_hash_row, cur_diff_row, sizeof(*hash_data) * cur_tx_bw);
+ cur_hash_row += cur_tx_bw;
+ cur_diff_row += diff_stride;
+ }
+ // Keep the CRC unsigned: find_tx_size_rd_info() takes a uint32_t, and
+ // storing it in an int would be an implementation-defined conversion.
+ const uint32_t hash = av1_get_crc32c_value(
+ &x->mb_rd_record.crc_calculator, (uint8_t *)hash_data,
+ 2 * cur_tx_bw * cur_tx_bh);
+ // Find corresponding RD info based on the hash value.
+ const int record_idx =
+ row_in_sb * (MAX_MIB_SIZE >> (tx_size_idx + 1)) + col_in_sb;
+ TXB_RD_RECORD *records = &rd_records_table[tx_size_idx][record_idx];
+ int idx = find_tx_size_rd_info(records, hash);
+ dst_rd_info[cur_rd_info_idx].rd_info_array =
+ &records->tx_rd_info[idx];
+ }
+ ++cur_rd_info_idx;
+ }
+ }
+ cur_tx_size = next_tx_size;
+ ++cur_tx_depth;
+ }
+ return 1;
+ }
+
+ // origin_threshold * 128 / 100
+ static const uint32_t skip_pred_threshold[3][BLOCK_SIZES_ALL] = { // indexed as [bd_idx][bsize] in predict_skip_flag()
+ { // bd_idx 0: bit depth 8
+ 64, 64, 64, 70, 60, 60, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 64, 64, 70, 70, 68, 68,
+ },
+ { // bd_idx 1: bit depth 10
+ 88, 88, 88, 86, 87, 87, 68, 68, 68, 68, 68,
+ 68, 68, 68, 68, 68, 88, 88, 86, 86, 68, 68,
+ },
+ { // bd_idx 2: any other bit depth
+ 90, 93, 93, 90, 93, 93, 74, 74, 74, 74, 74,
+ 74, 74, 74, 74, 74, 90, 90, 90, 90, 74, 74,
+ },
+ };
+
+ // lookup table for predict_skip_flag
+ // int max_tx_size = max_txsize_rect_lookup[bsize];
+ // if (tx_size_high[max_tx_size] > 16 || tx_size_wide[max_tx_size] > 16)
+ // max_tx_size = AOMMIN(max_txsize_lookup[bsize], TX_16X16);
+ static const TX_SIZE max_predict_sf_tx_size[BLOCK_SIZES_ALL] = { // indexed by bsize
+ TX_4X4, TX_4X8, TX_8X4, TX_8X8, TX_8X16, TX_16X8,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_16X16,
+ TX_16X16, TX_16X16, TX_16X16, TX_16X16, TX_4X16, TX_16X4,
+ TX_8X8, TX_8X8, TX_16X16, TX_16X16,
+ };
+
+ // Quick predictor for "skipping the residual is the best RD choice".
+ // Writes the sse of the block to *dist, then returns 0 (do not skip) if the
+ // mean squared error exceeds a DC-quantizer based threshold or if any DCT
+ // coefficient of any max_predict_sf_tx_size[bsize] transform block reaches its
+ // DC/AC threshold. Returns 1 when every coefficient stays below threshold.
+ static int predict_skip_flag(MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *dist,
+ int reduced_tx_set) {
+ const MACROBLOCKD *xd = &x->e_mbd;
+ const int blk_w = block_size_wide[bsize];
+ const int blk_h = block_size_high[bsize];
+ const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
+
+ *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
+ // Normalized quantizer takes the transform upscaling factor (8 for tx size
+ // smaller than 32) into account.
+ const int16_t dc_q_norm = dc_q >> 3;
+ // Predict not to skip when mse is larger than threshold.
+ if (*dist / blk_w / blk_h > (int64_t)dc_q_norm * dc_q_norm / 8) return 0;
+
+ const int max_tx_size = max_predict_sf_tx_size[bsize];
+ const int tx_h = tx_size_high[max_tx_size];
+ const int tx_w = tx_size_wide[max_tx_size];
+ DECLARE_ALIGNED(32, tran_low_t, coefs[32 * 32]);
+ TxfmParam param;
+ param.tx_type = DCT_DCT;
+ param.tx_size = max_tx_size;
+ param.bd = xd->bd;
+ param.is_hbd = get_bitdepth_data_path_index(xd);
+ param.lossless = 0;
+ param.tx_set_type = av1_get_ext_tx_set_type(
+ param.tx_size, is_inter_block(xd->mi[0]), reduced_tx_set);
+
+ int bd_idx = 2;
+ if (xd->bd == 8) {
+ bd_idx = 0;
+ } else if (xd->bd == 10) {
+ bd_idx = 1;
+ }
+ const uint32_t max_qcoef_thresh = skip_pred_threshold[bd_idx][bsize];
+ const int16_t *const residual = x->plane[0].src_diff;
+ const int n_coeff = tx_w * tx_h;
+ const int16_t ac_q = av1_ac_quant_QTX(x->qindex, 0, xd->bd);
+ const uint32_t dc_thresh = max_qcoef_thresh * dc_q;
+ const uint32_t ac_thresh = max_qcoef_thresh * ac_q;
+
+ for (int row = 0; row < blk_h; row += tx_h) {
+ const int16_t *const diff_row = residual + row * blk_w;
+ for (int col = 0; col < blk_w; col += tx_w) {
+ av1_fwd_txfm(diff_row + col, coefs, blk_w, &param);
+ // Operating on TX domain, not pixels; we want the QTX quantizers.
+ // Coefficient 0 is checked against the DC threshold, the rest against AC.
+ for (int i = 0; i < n_coeff; ++i) {
+ const uint32_t scaled_coef = (((uint32_t)abs(coefs[i])) << 7);
+ const uint32_t limit = (i == 0) ? dc_thresh : ac_thresh;
+ if (scaled_coef >= limit) return 0;
+ }
+ }
+ }
+ return 1;
+ }
+
+ // Sets up the block state for early termination with skip = 1: every
+ // txk_type entry becomes DCT_DCT, tx_size and inter_tx_size take the largest
+ // rectangular transform for bsize, the first bsize_to_num_blk(bsize) blk_skip
+ // flags of plane 0 are set, and rd_stats gets rate 0 with dist = sse =
+ // dist << 4 (dist is first rounded down to 8-bit scale for high bit depth).
+ static void set_skip_flag(MACROBLOCK *x, RD_STATS *rd_stats, int bsize,
+ int64_t dist) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mode_info = xd->mi[0];
+ const int num_blks = bsize_to_num_blk(bsize);
+ const TX_SIZE largest_tx = max_txsize_rect_lookup[bsize];
+
+ memset(mode_info->txk_type, DCT_DCT,
+ sizeof(mode_info->txk_type[0]) * TXK_TYPE_BUF_LEN);
+ memset(mode_info->inter_tx_size, largest_tx,
+ sizeof(mode_info->inter_tx_size));
+ mode_info->tx_size = largest_tx;
+
+ int blk = 0;
+ while (blk < num_blks) {
+ set_blk_skip(x, 0, blk, 1);
+ ++blk;
+ }
+
+ rd_stats->skip = 1;
+ rd_stats->rate = 0;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ dist = ROUND_POWER_OF_TWO(dist, (xd->bd - 8) * 2);
+ }
+ rd_stats->dist = rd_stats->sse = (dist << 4);
+ }
+
+static void select_tx_type_yrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize, int mi_row,
+ int mi_col, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int64_t rd = INT64_MAX;
+ int64_t best_rd = INT64_MAX;
+ const int is_inter = is_inter_block(mbmi);
+ const int n4 = bsize_to_num_blk(bsize);
+ // Get the tx_size 1 level down
+ const TX_SIZE min_tx_size = sub_tx_size_map[max_txsize_rect_lookup[bsize]];
+ const TxSetType tx_set_type =
+ av1_get_ext_tx_set_type(min_tx_size, is_inter, cm->reduced_tx_set_used);
+ const int within_border =
+ mi_row >= xd->tile.mi_row_start &&
+ (mi_row + mi_size_high[bsize] < xd->tile.mi_row_end) &&
+ mi_col >= xd->tile.mi_col_start &&
+ (mi_col + mi_size_wide[bsize] < xd->tile.mi_col_end);
+
+ av1_invalid_rd_stats(rd_stats);
+
+ if (cpi->sf.model_based_prune_tx_search_level && ref_best_rd != INT64_MAX) {
+ int model_rate;
+ int64_t model_dist;
+ int model_skip;
+ model_rd_sb_fn[MODELRD_TYPE_TX_SEARCH_PRUNE](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &model_rate, &model_dist,
+ &model_skip, NULL, NULL, NULL, NULL);
+ const int64_t model_rd = RDCOST(x->rdmult, model_rate, model_dist);
+ // If the modeled rd is a lot worse than the best so far, breakout.
+ // TODO(debargha, urvang): Improve the model and make the check below
+ // tighter.
+ assert(cpi->sf.model_based_prune_tx_search_level >= 0 &&
+ cpi->sf.model_based_prune_tx_search_level <= 2);
+ static const int prune_factor_by8[] = { 2 + MODELRD_TYPE_TX_SEARCH_PRUNE,
+ 4 + MODELRD_TYPE_TX_SEARCH_PRUNE };
+ if (!model_skip &&
+ ((model_rd *
+ prune_factor_by8[cpi->sf.model_based_prune_tx_search_level - 1]) >>
+ 3) > ref_best_rd)
+ return;
+ }
+
+ const uint32_t hash = get_block_residue_hash(x, bsize);
+ MB_RD_RECORD *mb_rd_record = &x->mb_rd_record;
+
+ if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_mb_rd_hash) {
+ for (int i = 0; i < mb_rd_record->num; ++i) {
+ const int index = (mb_rd_record->index_start + i) % RD_RECORD_BUFFER_LEN;
+ // If there is a match in the tx_rd_record, fetch the RD decision and
+ // terminate early.
+ if (mb_rd_record->tx_rd_info[index].hash_value == hash) {
+ MB_RD_INFO *tx_rd_info = &mb_rd_record->tx_rd_info[index];
+ fetch_tx_rd_info(n4, tx_rd_info, rd_stats, x);
+ return;
+ }
+ }
+ }
+
+ // If we predict that skip is the optimal RD decision - set the respective
+ // context and terminate early.
+ int64_t dist;
+ if (is_inter && cpi->sf.tx_type_search.use_skip_flag_prediction &&
+ predict_skip_flag(x, bsize, &dist, cm->reduced_tx_set_used)) {
+ set_skip_flag(x, rd_stats, bsize, dist);
+ // Save the RD search results into tx_rd_record.
+ if (within_border) save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+ return;
+ }
+
+ // Precompute residual hashes and find existing or add new RD records to
+ // store and reuse rate and distortion values to speed up TX size search.
+ TXB_RD_INFO_NODE matched_rd_info[4 + 16 + 64];
+ int found_rd_info = 0;
+ if (ref_best_rd != INT64_MAX && within_border && cpi->sf.use_inter_txb_hash) {
+ found_rd_info =
+ find_tx_size_rd_records(x, bsize, mi_row, mi_col, matched_rd_info);
+ }
+
+ prune_tx(cpi, bsize, x, xd, tx_set_type);
+
+ int found = 0;
+
+ RD_STATS this_rd_stats;
+ av1_init_rd_stats(&this_rd_stats);
+
+ rd = select_tx_size_fix_type(cpi, x, &this_rd_stats, bsize, ref_best_rd,
+ found_rd_info ? matched_rd_info : NULL);
+ assert(IMPLIES(this_rd_stats.skip && !this_rd_stats.invalid_rate,
+ this_rd_stats.rate == 0));
+
+ ref_best_rd = AOMMIN(rd, ref_best_rd);
+ if (rd < best_rd) {
+ *rd_stats = this_rd_stats;
+ found = 1;
+ }
+
+ // Reset the pruning flags.
+ av1_zero(x->tx_search_prune);
+ x->tx_split_prune_flag = 0;
+
+ // We should always find at least one candidate unless ref_best_rd is less
+ // than INT64_MAX (in which case, all the calls to select_tx_size_fix_type
+ // might have failed to find something better)
+ assert(IMPLIES(!found, ref_best_rd != INT64_MAX));
+ if (!found) return;
+
+ // Save the RD search results into tx_rd_record.
+ if (within_border && cpi->sf.use_mb_rd_hash)
+ save_tx_rd_info(n4, hash, x, rd_stats, mb_rd_record);
+}
+/* Computes RD stats for a single chroma transform block into *rd_stats.
+ *
+ * Returns immediately, leaving *rd_stats untouched, when (blk_row, blk_col)
+ * is at or beyond the limits reported by max_block_high()/max_block_wide().
+ * Otherwise it runs tx_block_rd_b() on the block, updates the entropy
+ * contexts at above_ctx[blk_col] / left_ctx[blk_row] through
+ * av1_set_txb_context(), and, for non-lossless segments, replaces the rate
+ * with the zero-block rate and the distortion with the SSE when coding the
+ * coefficients is at least as expensive as zeroing them or when
+ * tx_block_rd_b() reported skip. The blk_skip entry of this block for the
+ * given plane is always written as 0.
+ *
+ * plane must be a chroma plane (> 0) and tx_size must be < TX_SIZES_ALL.
+ */
+static void tx_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x, int blk_row,
+ int blk_col, int plane, int block, TX_SIZE tx_size,
+ BLOCK_SIZE plane_bsize, ENTROPY_CONTEXT *above_ctx,
+ ENTROPY_CONTEXT *left_ctx, RD_STATS *rd_stats,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ assert(plane > 0);
+ assert(tx_size < TX_SIZES_ALL);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int max_blocks_high = max_block_high(xd, plane_bsize, plane);
+ const int max_blocks_wide = max_block_wide(xd, plane_bsize, plane);
+ if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide) return;  // out-of-range block: nothing to do
+
+ ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+ ENTROPY_CONTEXT *tl = left_ctx + blk_row;
+ TXB_CTX txb_ctx;
+ get_txb_ctx(plane_bsize, tx_size, plane, ta, tl, &txb_ctx);
+ const TX_SIZE txs_ctx = get_txsize_entropy_ctx(tx_size);
+ const int zero_blk_rate = x->coeff_costs[txs_ctx][PLANE_TYPE_UV]
+ .txb_skip_cost[txb_ctx.txb_skip_ctx][1];  // entry [1] is used below as the all-zero-block rate
+ tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block, plane_bsize,
+ &txb_ctx, rd_stats, ftxs_mode, INT64_MAX, NULL);
+
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int blk_idx = blk_row * mi_width + blk_col;
+
+ av1_set_txb_context(x, plane, block, tx_size, ta, tl);
+ if ((RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist) >=
+ RDCOST(x->rdmult, zero_blk_rate, rd_stats->sse) ||
+ rd_stats->skip == 1) &&
+ !xd->lossless[mbmi->segment_id]) {  // never substitute the zero block in lossless segments
+ rd_stats->rate = zero_blk_rate;
+ rd_stats->dist = rd_stats->sse;
+ }
+
+ // Set chroma blk_skip to 0
+ set_blk_skip(x, plane, blk_idx, 0);
+}
+/* Accumulates chroma RD stats (planes 1 .. MAX_MB_PLANE - 1) for a block
+ * into *rd_stats, one maximum-size transform block at a time.
+ *
+ * The search is abandoned (stats invalidated, 0 returned) as soon as a
+ * transform block reports rate == INT_MAX, or once the running cost
+ * RDCOST(rate, dist) exceeds non_skip_ref_best_rd AND the running zero-rate
+ * cost RDCOST(0, sse) exceeds skip_ref_best_rd. If both thresholds are
+ * negative on entry no search is done and the stats are invalidated. When
+ * x->skip_chroma_rd is set no chroma work is done: the stats keep their
+ * initialised values unless the thresholds were negative.
+ */
+// Return value 0: early termination triggered, no valid rd cost available;
+// 1: rd cost values are valid.
+static int inter_block_uvrd(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_stats, BLOCK_SIZE bsize,
+ int64_t non_skip_ref_best_rd,
+ int64_t skip_ref_best_rd,
+ FAST_TX_SEARCH_MODE ftxs_mode) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int plane;
+ int is_cost_valid = 1;
+ int64_t this_rd = 0;
+ int64_t skip_rd = 0;
+
+ if ((non_skip_ref_best_rd < 0) && (skip_ref_best_rd < 0)) is_cost_valid = 0;
+
+ av1_init_rd_stats(rd_stats);
+
+ if (x->skip_chroma_rd) {
+ if (!is_cost_valid) av1_invalid_rd_stats(rd_stats);
+
+ return is_cost_valid;
+ }
+
+ const BLOCK_SIZE bsizec = scale_chroma_bsize(
+ bsize, xd->plane[1].subsampling_x, xd->plane[1].subsampling_y);
+
+ if (is_inter_block(mbmi) && is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+ av1_subtract_plane(x, bsizec, plane);
+ }
+
+ if (is_cost_valid) {
+ for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+ const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const BLOCK_SIZE plane_bsize =
+ get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
+ const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+ const int mi_height =
+ block_size_high[plane_bsize] >> tx_size_high_log2[0];
+ const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+ const int bh = tx_size_high_unit[max_tx_size];
+ const int bw = tx_size_wide_unit[max_tx_size];
+ int idx, idy;
+ int block = 0;
+ const int step = bh * bw;  // advance of the running block index per transform block
+ ENTROPY_CONTEXT ta[MAX_MIB_SIZE];
+ ENTROPY_CONTEXT tl[MAX_MIB_SIZE];
+ av1_get_entropy_contexts(bsizec, pd, ta, tl);
+
+ for (idy = 0; idy < mi_height; idy += bh) {
+ for (idx = 0; idx < mi_width; idx += bw) {
+ RD_STATS pn_rd_stats;
+ av1_init_rd_stats(&pn_rd_stats);
+ tx_block_uvrd(cpi, x, idy, idx, plane, block, max_tx_size,
+ plane_bsize, ta, tl, &pn_rd_stats, ftxs_mode);
+ if (pn_rd_stats.rate == INT_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ av1_merge_rd_stats(rd_stats, &pn_rd_stats);
+ this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ skip_rd = RDCOST(x->rdmult, 0, rd_stats->sse);
+ if ((this_rd > non_skip_ref_best_rd) &&
+ (skip_rd > skip_ref_best_rd)) {  // both coding and skipping already exceed their limits
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+ block += step;
+ }
+ }
+ }
+ } else {
+ // reset cost value
+ av1_invalid_rd_stats(rd_stats);
+ }
+
+ return is_cost_valid;
+}
+/* Searches chroma palette candidates for the current intra block.
+ *
+ * Counts the distinct U and V colours in the source; if the larger count is
+ * in the range 2..64, palette sizes from min(colors, PALETTE_MAX_SIZE) down
+ * to 2 are tried. For each size the centroids are seeded evenly across the
+ * observed [min, max] range of each channel, av1_k_means() is run on the
+ * interleaved (U, V) samples, the resulting colours are sorted by U, the
+ * colour index map is built and the candidate is evaluated with
+ * super_block_uvrd(). Whenever a candidate beats *best_rd, the values
+ * *best_rd, *best_mbmi, best_palette_color_map, *rate, *rate_tokenonly,
+ * *distortion and *skippable are updated.
+ *
+ * Side effects: mbmi->uv_mode is set to UV_DC_PRED, and the chroma palette
+ * fields of mbmi and the plane-1 color_index_map are overwritten during the
+ * search. On exit the colour map is restored from best_palette_color_map if
+ * best_mbmi records a non-empty chroma palette.
+ */
+static void rd_pick_palette_intra_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int dc_mode_cost,
+ uint8_t *best_palette_color_map,
+ MB_MODE_INFO *const best_mbmi,
+ int64_t *best_rd, int *rate,
+ int *rate_tokenonly, int64_t *distortion,
+ int *skippable) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ assert(
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type));
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+ const SequenceHeader *const seq_params = &cpi->common.seq_params;
+ int this_rate;
+ int64_t this_rd;
+ int colors_u, colors_v, colors;
+ const int src_stride = x->plane[1].src.stride;
+ const uint8_t *const src_u = x->plane[1].src.buf;
+ const uint8_t *const src_v = x->plane[2].src.buf;
+ uint8_t *const color_map = xd->plane[1].color_index_map;
+ RD_STATS tokenonly_rd_stats;
+ int plane_block_width, plane_block_height, rows, cols;
+ av1_get_block_dimensions(bsize, 1, xd, &plane_block_width,
+ &plane_block_height, &rows, &cols);
+
+ mbmi->uv_mode = UV_DC_PRED;
+
+ int count_buf[1 << 12]; // Maximum (1 << 12) color levels.
+ if (seq_params->use_highbitdepth) {
+ colors_u = av1_count_colors_highbd(src_u, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf);
+ colors_v = av1_count_colors_highbd(src_v, src_stride, rows, cols,
+ seq_params->bit_depth, count_buf);
+ } else {
+ colors_u = av1_count_colors(src_u, src_stride, rows, cols, count_buf);
+ colors_v = av1_count_colors(src_v, src_stride, rows, cols, count_buf);
+ }
+
+ uint16_t color_cache[2 * PALETTE_MAX_SIZE];
+ const int n_cache = av1_get_palette_cache(xd, 1, color_cache);
+
+ colors = colors_u > colors_v ? colors_u : colors_v;
+ if (colors > 1 && colors <= 64) {
+ int r, c, n, i, j;
+ const int max_itr = 50;
+ int lb_u, ub_u, val_u;
+ int lb_v, ub_v, val_v;
+ int *const data = x->palette_buffer->kmeans_data_buf;
+ int centroids[2 * PALETTE_MAX_SIZE];
+
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);  // only dereferenced on the high-bit-depth paths below
+ uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+ if (seq_params->use_highbitdepth) {
+ lb_u = src_u16[0];
+ ub_u = src_u16[0];
+ lb_v = src_v16[0];
+ ub_v = src_v16[0];
+ } else {
+ lb_u = src_u[0];
+ ub_u = src_u[0];
+ lb_v = src_v[0];
+ ub_v = src_v[0];
+ }
+
+ for (r = 0; r < rows; ++r) {  // gather interleaved (U, V) samples and track per-channel min/max
+ for (c = 0; c < cols; ++c) {
+ if (seq_params->use_highbitdepth) {
+ val_u = src_u16[r * src_stride + c];
+ val_v = src_v16[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ } else {
+ val_u = src_u[r * src_stride + c];
+ val_v = src_v[r * src_stride + c];
+ data[(r * cols + c) * 2] = val_u;
+ data[(r * cols + c) * 2 + 1] = val_v;
+ }
+ if (val_u < lb_u)
+ lb_u = val_u;
+ else if (val_u > ub_u)
+ ub_u = val_u;
+ if (val_v < lb_v)
+ lb_v = val_v;
+ else if (val_v > ub_v)
+ ub_v = val_v;
+ }
+ }
+
+ for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors; n >= 2;
+ --n) {
+ for (i = 0; i < n; ++i) {  // seed centroid i at the midpoint of the i-th of n equal sub-ranges
+ centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+ centroids[i * 2 + 1] = lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+ }
+ av1_k_means(data, centroids, color_map, rows * cols, n, 2, max_itr);
+ optimize_palette_colors(color_cache, n_cache, n, 2, centroids);
+ // Sort the U channel colors in ascending order.
+ for (i = 0; i < 2 * (n - 1); i += 2) {
+ int min_idx = i;
+ int min_val = centroids[i];
+ for (j = i + 2; j < 2 * n; j += 2)
+ if (centroids[j] < min_val) min_val = centroids[j], min_idx = j;
+ if (min_idx != i) {
+ int temp_u = centroids[i], temp_v = centroids[i + 1];
+ centroids[i] = centroids[min_idx];
+ centroids[i + 1] = centroids[min_idx + 1];
+ centroids[min_idx] = temp_u, centroids[min_idx + 1] = temp_v;
+ }
+ }
+ av1_calc_indices(data, centroids, color_map, rows * cols, n, 2);
+ extend_palette_color_map(color_map, cols, rows, plane_block_width,
+ plane_block_height);
+ pmi->palette_size[1] = n;
+ for (i = 1; i < 3; ++i) {  // i = 1 stores U colours, i = 2 stores V colours
+ for (j = 0; j < n; ++j) {
+ if (seq_params->use_highbitdepth)
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] = clip_pixel_highbd(
+ (int)centroids[j * 2 + i - 1], seq_params->bit_depth);
+ else
+ pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+ clip_pixel((int)centroids[j * 2 + i - 1]);
+ }
+ }
+
+ super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, *best_rd);
+ if (tokenonly_rd_stats.rate == INT_MAX) continue;  // no valid RD result for this palette size
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, dc_mode_cost);
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+ if (this_rd < *best_rd) {
+ *best_rd = this_rd;
+ *best_mbmi = *mbmi;
+ memcpy(best_palette_color_map, color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ *rate = this_rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+ }
+ if (best_mbmi->palette_mode_info.palette_size[1] > 0) {
+ memcpy(color_map, best_palette_color_map,
+ plane_block_width * plane_block_height *
+ sizeof(best_palette_color_map[0]));
+ }
+}
+
+// Evaluates the chroma RD cost for the angle delta currently stored in the
+// block's mode info and returns it, or INT64_MAX when super_block_uvrd()
+// reports failure. If the cost beats *best_rd, the best-so-far outputs
+// (*best_rd, *best_angle_delta, *rate and the rate/dist/skip fields of
+// *rd_stats) are refreshed.
+static int64_t pick_intra_angle_routine_sbuv(
+    const AV1_COMP *const cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+    int rate_overhead, int64_t best_rd_in, int *rate, RD_STATS *rd_stats,
+    int *best_angle_delta, int64_t *best_rd) {
+  MB_MODE_INFO *mode_info = x->e_mbd.mi[0];
+  assert(!is_inter_block(mode_info));
+
+  RD_STATS uv_stats;
+  if (!super_block_uvrd(cpi, x, &uv_stats, bsize, best_rd_in)) {
+    return INT64_MAX;
+  }
+
+  const int total_rate =
+      uv_stats.rate +
+      intra_mode_info_cost_uv(cpi, x, mode_info, bsize, rate_overhead);
+  const int64_t cost = RDCOST(x->rdmult, total_rate, uv_stats.dist);
+
+  // Not an improvement: report the cost but leave the best-so-far alone.
+  if (!(cost < *best_rd)) return cost;
+
+  *best_rd = cost;
+  *best_angle_delta = mode_info->angle_delta[PLANE_TYPE_UV];
+  *rate = total_rate;
+  rd_stats->rate = uv_stats.rate;
+  rd_stats->dist = uv_stats.dist;
+  rd_stats->skip = uv_stats.skip;
+  return cost;
+}
+/* Two-pass angle-delta search for a chroma directional mode.
+ *
+ * Pass 1 tries the even deltas (delta 0 once, other even deltas with both
+ * signs) and records each cost in rd_cost[]. Pass 2 tries an odd delta of a
+ * given sign only if at least one of its even neighbours of the same sign
+ * had a recorded cost no greater than best_rd + (best_rd >> 5). The winning
+ * delta is left in mbmi->angle_delta[PLANE_TYPE_UV]; *rate and *rd_stats
+ * describe it. Returns 0 straight away if delta 0 itself yields INT64_MAX.
+ */
+// With given chroma directional intra prediction mode, pick the best angle
+// delta. Return true if a RD cost that is smaller than the input one is found.
+static int rd_pick_intra_angle_sbuv(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int rate_overhead,
+ int64_t best_rd, int *rate,
+ RD_STATS *rd_stats) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ int i, angle_delta, best_angle_delta = 0;
+ int64_t this_rd, best_rd_in, rd_cost[2 * (MAX_ANGLE_DELTA + 2)];  // indexed as [2 * delta + sign_index]
+
+ rd_stats->rate = INT_MAX;
+ rd_stats->skip = 0;
+ rd_stats->dist = INT64_MAX;
+ for (i = 0; i < 2 * (MAX_ANGLE_DELTA + 2); ++i) rd_cost[i] = INT64_MAX;
+
+ for (angle_delta = 0; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ for (i = 0; i < 2; ++i) {
+ best_rd_in = (best_rd == INT64_MAX)
+ ? INT64_MAX
+ : (best_rd + (best_rd >> ((angle_delta == 0) ? 3 : 5)));  // looser budget for delta 0
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;  // i = 0: +delta, i = 1: -delta
+ this_rd = pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead,
+ best_rd_in, rate, rd_stats,
+ &best_angle_delta, &best_rd);
+ rd_cost[2 * angle_delta + i] = this_rd;
+ if (angle_delta == 0) {
+ if (this_rd == INT64_MAX) return 0;
+ rd_cost[1] = this_rd;  // delta 0 has no sign: mirror its cost into the second slot
+ break;
+ }
+ }
+ }
+
+ assert(best_rd != INT64_MAX);
+ for (angle_delta = 1; angle_delta <= MAX_ANGLE_DELTA; angle_delta += 2) {
+ int64_t rd_thresh;
+ for (i = 0; i < 2; ++i) {
+ int skip_search = 0;
+ rd_thresh = best_rd + (best_rd >> 5);
+ if (rd_cost[2 * (angle_delta + 1) + i] > rd_thresh &&
+ rd_cost[2 * (angle_delta - 1) + i] > rd_thresh)
+ skip_search = 1;
+ if (!skip_search) {
+ mbmi->angle_delta[PLANE_TYPE_UV] = (1 - 2 * i) * angle_delta;
+ pick_intra_angle_routine_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ rate, rd_stats, &best_angle_delta,
+ &best_rd);
+ }
+ }
+ }
+
+ mbmi->angle_delta[PLANE_TYPE_UV] = best_angle_delta;
+ return rd_stats->rate != INT_MAX;
+}
+/* PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) builds a joint sign index from two
+ * per-plane signs: for CFL_PRED_U it is a * CFL_SIGNS + b - 1, otherwise the
+ * roles of a and b are swapped, so a is always the sign of `plane` and b the
+ * sign of the other plane. The arguments are not parenthesised in the
+ * expansion; pass plain identifiers or constants only.
+ *
+ * cfl_rd_pick_alpha() searches alpha magnitudes per plane and per sign using
+ * txfm_rd_in_plane(), keeps the best cost per (joint sign, plane) and picks
+ * the joint sign whose combined U + V + mode cost is lowest and below
+ * best_rd. It writes mbmi->cfl_alpha_idx and mbmi->cfl_alpha_signs and
+ * returns the summed x->cfl_cost[] rate of the chosen U and V alphas, or
+ * INT_MAX (with idx = 0 and signs = 0 stored) when no combination beat
+ * best_rd. xd->cfl.use_dc_pred_cache is set for the duration of the search
+ * and the cache flags are cleared before returning.
+ */
+#define PLANE_SIGN_TO_JOINT_SIGN(plane, a, b) \
+ (plane == CFL_PRED_U ? a * CFL_SIGNS + b - 1 : b * CFL_SIGNS + a - 1)
+static int cfl_rd_pick_alpha(MACROBLOCK *const x, const AV1_COMP *const cpi,
+ TX_SIZE tx_size, int64_t best_rd) {
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+
+ const BLOCK_SIZE bsize = mbmi->sb_type;
+#if CONFIG_DEBUG
+ assert(is_cfl_allowed(xd));
+ const int ssx = xd->plane[AOM_PLANE_U].subsampling_x;
+ const int ssy = xd->plane[AOM_PLANE_U].subsampling_y;
+ const BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, ssx, ssy);
+ (void)plane_bsize;
+ assert(plane_bsize < BLOCK_SIZES_ALL);
+ if (!xd->lossless[mbmi->segment_id]) {
+ assert(block_size_wide[plane_bsize] == tx_size_wide[tx_size]);
+ assert(block_size_high[plane_bsize] == tx_size_high[tx_size]);
+ }
+#endif // CONFIG_DEBUG
+
+ xd->cfl.use_dc_pred_cache = 1;
+ const int64_t mode_rd =
+ RDCOST(x->rdmult,
+ x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED], 0);
+ int64_t best_rd_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+ int best_c[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#if CONFIG_DEBUG
+ int best_rate_uv[CFL_JOINT_SIGNS][CFL_PRED_PLANES];
+#endif // CONFIG_DEBUG
+
+ for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+ RD_STATS rd_stats;
+ av1_init_rd_stats(&rd_stats);
+ for (int joint_sign = 0; joint_sign < CFL_JOINT_SIGNS; joint_sign++) {
+ best_rd_uv[joint_sign][plane] = INT64_MAX;
+ best_c[joint_sign][plane] = 0;
+ }
+ // Collect RD stats for an alpha value of zero in this plane.
+ // Skip i == CFL_SIGN_ZERO as (0, 0) is invalid.
+ for (int i = CFL_SIGN_NEG; i < CFL_SIGNS; i++) {
+ const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, CFL_SIGN_ZERO, i);
+ if (i == CFL_SIGN_NEG) {  // the transform search runs once; later i reuse rd_stats
+ mbmi->cfl_alpha_idx = 0;
+ mbmi->cfl_alpha_signs = joint_sign;
+ txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize, tx_size,
+ cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ if (rd_stats.rate == INT_MAX) break;
+ }
+ const int alpha_rate = x->cfl_cost[joint_sign][plane][0];
+ best_rd_uv[joint_sign][plane] =
+ RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+#if CONFIG_DEBUG
+ best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif // CONFIG_DEBUG
+ }
+ }
+
+ int best_joint_sign = -1;
+
+ for (int plane = 0; plane < CFL_PRED_PLANES; plane++) {
+ for (int pn_sign = CFL_SIGN_NEG; pn_sign < CFL_SIGNS; pn_sign++) {
+ int progress = 0;
+ for (int c = 0; c < CFL_ALPHABET_SIZE; c++) {
+ int flag = 0;
+ RD_STATS rd_stats;
+ if (c > 2 && progress < c) break;  // early-out: progress grows by 2 for each c that improved some joint sign
+ av1_init_rd_stats(&rd_stats);
+ for (int i = 0; i < CFL_SIGNS; i++) {
+ const int joint_sign = PLANE_SIGN_TO_JOINT_SIGN(plane, pn_sign, i);
+ if (i == 0) {  // the transform search runs once per (plane, sign, c); later i reuse rd_stats
+ mbmi->cfl_alpha_idx = (c << CFL_ALPHABET_SIZE_LOG2) + c;
+ mbmi->cfl_alpha_signs = joint_sign;
+ txfm_rd_in_plane(x, cpi, &rd_stats, best_rd, plane + 1, bsize,
+ tx_size, cpi->sf.use_fast_coef_costing, FTXS_NONE);
+ if (rd_stats.rate == INT_MAX) break;
+ }
+ const int alpha_rate = x->cfl_cost[joint_sign][plane][c];
+ int64_t this_rd =
+ RDCOST(x->rdmult, rd_stats.rate + alpha_rate, rd_stats.dist);
+ if (this_rd >= best_rd_uv[joint_sign][plane]) continue;
+ best_rd_uv[joint_sign][plane] = this_rd;
+ best_c[joint_sign][plane] = c;
+#if CONFIG_DEBUG
+ best_rate_uv[joint_sign][plane] = rd_stats.rate;
+#endif // CONFIG_DEBUG
+ flag = 2;
+ if (best_rd_uv[joint_sign][!plane] == INT64_MAX) continue;  // other plane has no valid cost for this joint sign yet
+ this_rd += mode_rd + best_rd_uv[joint_sign][!plane];
+ if (this_rd >= best_rd) continue;
+ best_rd = this_rd;
+ best_joint_sign = joint_sign;
+ }
+ progress += flag;
+ }
+ }
+ }
+
+ int best_rate_overhead = INT_MAX;
+ int ind = 0;
+ if (best_joint_sign >= 0) {
+ const int u = best_c[best_joint_sign][CFL_PRED_U];
+ const int v = best_c[best_joint_sign][CFL_PRED_V];
+ ind = (u << CFL_ALPHABET_SIZE_LOG2) + v;
+ best_rate_overhead = x->cfl_cost[best_joint_sign][CFL_PRED_U][u] +
+ x->cfl_cost[best_joint_sign][CFL_PRED_V][v];
+#if CONFIG_DEBUG
+ xd->cfl.rate = x->intra_uv_mode_cost[CFL_ALLOWED][mbmi->mode][UV_CFL_PRED] +
+ best_rate_overhead +
+ best_rate_uv[best_joint_sign][CFL_PRED_U] +
+ best_rate_uv[best_joint_sign][CFL_PRED_V];
+#endif // CONFIG_DEBUG
+ } else {
+ best_joint_sign = 0;
+ }
+
+ mbmi->cfl_alpha_idx = ind;
+ mbmi->cfl_alpha_signs = best_joint_sign;
+ xd->cfl.use_dc_pred_cache = 0;
+ xd->cfl.dc_pred_is_cached[0] = 0;
+ xd->cfl.dc_pred_is_cached[1] = 0;
+ return best_rate_overhead;
+}
+
+// Resets the chroma intra state of a block: chroma palette size 0 and
+// UV_DC_PRED as the chroma prediction mode.
+static void init_sbuv_mode(MB_MODE_INFO *const mbmi) {
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  mbmi->uv_mode = UV_DC_PRED;
+}
+/* Picks the best chroma intra mode for the current block.
+ *
+ * Iterates uv_rd_search_mode_order[], skipping modes masked out by
+ * cpi->sf.intra_uv_mode_mask and skipping CfL when is_cfl_allowed() is
+ * false. CfL first selects its alphas with cfl_rd_pick_alpha(); directional
+ * modes run the angle-delta search when av1_use_angle_delta() allows it;
+ * every other case (including CfL after alpha selection) is evaluated with
+ * super_block_uvrd(). If av1_allow_palette() permits, the chroma palette
+ * search runs last. On return *mbmi holds the winning mode info and *rate,
+ * *rate_tokenonly, *distortion and *skippable describe it. Returns the best
+ * RD cost found.
+ */
+static int64_t rd_pick_intra_sbuv_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+ int *rate, int *rate_tokenonly,
+ int64_t *distortion, int *skippable,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ assert(!is_inter_block(mbmi));
+ MB_MODE_INFO best_mbmi = *mbmi;
+ int64_t best_rd = INT64_MAX, this_rd;
+
+ for (int mode_idx = 0; mode_idx < UV_INTRA_MODES; ++mode_idx) {
+ int this_rate;
+ RD_STATS tokenonly_rd_stats;
+ UV_PREDICTION_MODE mode = uv_rd_search_mode_order[mode_idx];
+ const int is_directional_mode = av1_is_directional_mode(get_uv_mode(mode));
+ if (!(cpi->sf.intra_uv_mode_mask[txsize_sqr_up_map[max_tx_size]] &
+ (1 << mode)))
+ continue;
+
+ mbmi->uv_mode = mode;
+ int cfl_alpha_rate = 0;
+ if (mode == UV_CFL_PRED) {
+ if (!is_cfl_allowed(xd)) continue;
+ assert(!is_directional_mode);
+ const TX_SIZE uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+ cfl_alpha_rate = cfl_rd_pick_alpha(x, cpi, uv_tx_size, best_rd);
+ if (cfl_alpha_rate == INT_MAX) continue;  // no alpha pair beat best_rd
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ if (is_directional_mode && av1_use_angle_delta(mbmi->sb_type)) {
+ const int rate_overhead =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode];
+ if (!rd_pick_intra_angle_sbuv(cpi, x, bsize, rate_overhead, best_rd,
+ &this_rate, &tokenonly_rd_stats))
+ continue;
+ } else {
+ if (!super_block_uvrd(cpi, x, &tokenonly_rd_stats, bsize, best_rd)) {
+ continue;
+ }
+ }
+ const int mode_cost =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mode] +
+ cfl_alpha_rate;
+ this_rate = tokenonly_rd_stats.rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, mode_cost);  // overwrites any this_rate set by the angle search
+ if (mode == UV_CFL_PRED) {
+ assert(is_cfl_allowed(xd));
+#if CONFIG_DEBUG
+ if (!xd->lossless[mbmi->segment_id])
+ assert(xd->cfl.rate == tokenonly_rd_stats.rate + mode_cost);
+#endif // CONFIG_DEBUG
+ }
+ this_rd = RDCOST(x->rdmult, this_rate, tokenonly_rd_stats.dist);
+
+ if (this_rd < best_rd) {
+ best_mbmi = *mbmi;
+ best_rd = this_rd;
+ *rate = this_rate;
+ *rate_tokenonly = tokenonly_rd_stats.rate;
+ *distortion = tokenonly_rd_stats.dist;
+ *skippable = tokenonly_rd_stats.skip;
+ }
+ }
+
+ const int try_palette =
+ av1_allow_palette(cpi->common.allow_screen_content_tools, mbmi->sb_type);
+ if (try_palette) {
+ uint8_t *best_palette_color_map = x->palette_buffer->best_palette_color_map;
+ rd_pick_palette_intra_sbuv(
+ cpi, x,
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][UV_DC_PRED],
+ best_palette_color_map, &best_mbmi, &best_rd, rate, rate_tokenonly,
+ distortion, skippable);
+ }
+
+ *mbmi = best_mbmi;
+ // Make sure we actually chose a mode
+ assert(best_rd < INT64_MAX);
+ return best_rd;
+}
+/* Chooses the chroma intra mode for a block and reports its costs.
+ *
+ * Resets the chroma mode state with init_sbuv_mode(). When
+ * x->skip_chroma_rd is set it reports zero rates and distortion, skip = 1
+ * and UV_DC_PRED without searching. Otherwise it records whether the block
+ * is a chroma reference, rescales bsize with scale_chroma_bsize(),
+ * re-encodes the luma plane when store_cfl_required_rdo() asks for it, and
+ * runs rd_pick_intra_sbuv_mode(). *mode_uv receives the selected mode.
+ */
+static void choose_intra_uv_mode(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, TX_SIZE max_tx_size,
+ int *rate_uv, int *rate_uv_tokenonly,
+ int64_t *dist_uv, int *skip_uv,
+ UV_PREDICTION_MODE *mode_uv) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);  // block position derived from the edge distances
+ const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
+ // Use an estimated rd for uv_intra based on DC_PRED if the
+ // appropriate speed flag is set.
+ init_sbuv_mode(mbmi);
+ if (x->skip_chroma_rd) {
+ *rate_uv = 0;
+ *rate_uv_tokenonly = 0;
+ *dist_uv = 0;
+ *skip_uv = 1;
+ *mode_uv = UV_DC_PRED;
+ return;
+ }
+ xd->cfl.is_chroma_reference =
+ is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+ cm->seq_params.subsampling_y);
+ bsize = scale_chroma_bsize(bsize, xd->plane[AOM_PLANE_U].subsampling_x,
+ xd->plane[AOM_PLANE_U].subsampling_y);
+ // Only store reconstructed luma when there's chroma RDO. When there's no
+ // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+ xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+ if (xd->cfl.store_y) {
+ // Restore reconstructed luma values.
+ av1_encode_intra_block_plane(cpi, x, mbmi->sb_type, AOM_PLANE_Y,
+ cpi->optimize_seg_arr[mbmi->segment_id],
+ mi_row, mi_col);
+ xd->cfl.store_y = 0;  // flag is cleared again once luma has been re-encoded
+ }
+ rd_pick_intra_sbuv_mode(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
+ bsize, max_tx_size);
+ *mode_uv = mbmi->uv_mode;
+}
+
+// Returns the signalling cost of an inter prediction mode under the given
+// mode context. Compound modes are looked up directly; single-reference
+// modes accumulate the cost of each successive binary decision
+// (NEWMV or not, then GLOBALMV or not, then NEARESTMV or not).
+static int cost_mv_ref(const MACROBLOCK *const x, PREDICTION_MODE mode,
+                       int16_t mode_context) {
+  if (is_inter_compound_mode(mode)) {
+    return x
+        ->inter_compound_mode_cost[mode_context][INTER_COMPOUND_OFFSET(mode)];
+  }
+
+  assert(is_inter_mode(mode));
+
+  // First decision: NEWMV vs. everything else.
+  const int16_t newmv_ctx = mode_context & NEWMV_CTX_MASK;
+  if (mode == NEWMV) return x->newmv_mode_cost[newmv_ctx][0];
+  int cost = x->newmv_mode_cost[newmv_ctx][1];
+
+  // Second decision: GLOBALMV vs. the reference-MV modes.
+  const int16_t zeromv_ctx =
+      (mode_context >> GLOBALMV_OFFSET) & GLOBALMV_CTX_MASK;
+  if (mode == GLOBALMV) return cost + x->zeromv_mode_cost[zeromv_ctx][0];
+  cost += x->zeromv_mode_cost[zeromv_ctx][1];
+
+  // Third decision: NEARESTMV vs. the remaining mode(s).
+  const int16_t refmv_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+  return cost + x->refmv_mode_cost[refmv_ctx][mode != NEARESTMV];
+}
+
+// Returns the rate for signalling the inter-inter compound mask of a block:
+// nothing for plain averaging, one literal bit for difference-weighted, and
+// one literal bit plus the wedge index cost for wedge (or nothing when the
+// block size has no wedge bits).
+static int get_interinter_compound_mask_rate(const MACROBLOCK *const x,
+                                             const MB_MODE_INFO *const mbmi) {
+  switch (mbmi->interinter_comp.type) {
+    case COMPOUND_WEDGE:
+      if (!(get_interinter_wedge_bits(mbmi->sb_type) > 0)) return 0;
+      return av1_cost_literal(1) +
+             x->wedge_idx_cost[mbmi->sb_type]
+                              [mbmi->interinter_comp.wedge_index];
+    case COMPOUND_DIFFWTD: return av1_cost_literal(1);
+    case COMPOUND_AVERAGE: return 0;
+    default: assert(0); return 0;
+  }
+}
+/* Per-candidate RD record stored in BEST_SEG_INFO::rdstat[][]: rate,
+ * distortion, SSE and RD-cost fields, three pairs of motion vectors and two
+ * pairs of entropy contexts.
+ * NOTE(review): no code in this part of the file reads or writes these
+ * fields; the field meanings are presumed from their names (eobs = end of
+ * block count, brate/byrate = total / luma rate, bdist/bsse = distortion /
+ * SSE, brdcost = RD cost) - confirm against the users.
+ */
+typedef struct {
+ int eobs;
+ int brate;
+ int byrate;
+ int64_t bdist;
+ int64_t bsse;
+ int64_t brdcost;
+ int_mv mvs[2];
+ int_mv pred_mv[2];
+ int_mv ref_mv[2];
+
+ ENTROPY_CONTEXT ta[2];
+ ENTROPY_CONTEXT tl[2];
+} SEG_RDSTAT;
+/* Aggregate search state holding, for each of 4 sub-blocks, one SEG_RDSTAT
+ * per inter mode (INTER_MODES + INTER_COMPOUND_MODES entries) together with
+ * the chosen mode per sub-block and summary rate / distortion / SSE fields.
+ * NOTE(review): not used in this part of the file; the roles of r, d, sse,
+ * segment_rd, segment_yrate and mvthresh are presumed from their names -
+ * confirm against the users.
+ */
+typedef struct {
+ int_mv *ref_mv[2];
+ int_mv mvp;
+
+ int64_t segment_rd;
+ int r;
+ int64_t d;
+ int64_t sse;
+ int segment_yrate;
+ PREDICTION_MODE modes[4];
+ SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
+ int mvthresh;
+} BEST_SEG_INFO;
+
+// Returns non-zero when the motion vector, reduced to its integer part by a
+// right shift of 3, lies outside the inclusive row/column limits.
+static INLINE int mv_check_bounds(const MvLimits *mv_limits, const MV *mv) {
+  const int full_row = mv->row >> 3;
+  const int full_col = mv->col >> 3;
+  if (full_row < mv_limits->row_min) return 1;
+  if (full_row > mv_limits->row_max) return 1;
+  if (full_col < mv_limits->col_min) return 1;
+  return full_col > mv_limits->col_max;
+}
+
+// Maps a prediction mode to the single-reference mode used for reference
+// ref_idx: for compound prediction the per-reference component is extracted,
+// otherwise the mode is returned unchanged.
+static INLINE PREDICTION_MODE get_single_mode(PREDICTION_MODE this_mode,
+                                              int ref_idx, int is_comp_pred) {
+  if (!is_comp_pred) return this_mode;
+  if (ref_idx) return compound_ref1_mode(this_mode);
+  return compound_ref0_mode(this_mode);
+}
+
+// Iteratively refines the two motion vectors of a compound-predicted block.
+//
+// On each of up to four iterations one reference is refined (even
+// iterations: reference 0, odd iterations: reference 1) while the prediction
+// from the other reference is held fixed in second_pred. Each iteration runs
+// a small-range full-pel search followed, unless integer MVs are forced for
+// the frame, by a sub-pel search. The loop stops as soon as an iteration
+// fails to lower the error of the reference it refined, or when neither MV
+// has moved (at full-pel resolution) since the start.
+//
+//   cur_mv         in/out: the two MVs in 1/8-pel units, updated in place.
+//   ref_mv_sub8x8  unused.
+//   mask           optional blend mask; NULL selects the averaging variance.
+//   rate_mv        out: summed bit cost of both final MVs.
+//   block          sub8x8 index used to offset the prediction position.
+//
+// x->best_mv and the MV cost tables selected through av1_set_mvcost() are
+// modified as a side effect; x->mv_limits and the prediction buffer pointers
+// in xd->plane[] are restored before returning.
+static void joint_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+                                BLOCK_SIZE bsize, int_mv *cur_mv, int mi_row,
+                                int mi_col, int_mv *ref_mv_sub8x8[2],
+                                const uint8_t *mask, int mask_stride,
+                                int *rate_mv, const int block) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const int pw = block_size_wide[bsize];
+  const int ph = block_size_high[bsize];
+  const int plane = 0;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(mbmi));
+  const int_mv init_mv[2] = { cur_mv[0], cur_mv[1] };
+  const int refs[2] = { mbmi->ref_frame[0], mbmi->ref_frame[1] };
+  int_mv ref_mv[2];
+  int ite, ref;
+  // ic and ir are the 4x4 coordinates of the sub8x8 at index "block"
+  const int ic = block & 1;
+  const int ir = (block - ic) >> 1;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * ic;
+  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * ir;
+
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+  conv_params.use_jnt_comp_avg = 0;
+  WarpTypesAllowed warp_types[2];
+  for (ref = 0; ref < 2; ++ref) {
+    const WarpedMotionParams *const wm =
+        &xd->global_motion[xd->mi[0]->ref_frame[ref]];
+    const int is_global = is_global_mv_block(xd->mi[0], wm->wmtype);
+    warp_types[ref].global_warp_allowed = is_global;
+    warp_types[ref].local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+  }
+
+  // Do joint motion search in compound mode to get more accurate mv.
+  struct buf_2d backup_yv12[2][MAX_MB_PLANE];
+  int last_besterr[2] = { INT_MAX, INT_MAX };
+  const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
+    av1_get_scaled_ref_frame(cpi, refs[0]),
+    av1_get_scaled_ref_frame(cpi, refs[1])
+  };
+
+  // Prediction buffer from second frame.
+  DECLARE_ALIGNED(16, uint8_t, second_pred16[MAX_SB_SQUARE * sizeof(uint16_t)]);
+  uint8_t *second_pred = get_buf_by_bd(xd, second_pred16);
+  (void)ref_mv_sub8x8;
+
+  const int have_newmv = have_nearmv_in_inter_mode(mbmi->mode);
+  const int ref_mv_idx = mbmi->ref_mv_idx + (have_newmv ? 1 : 0);
+  MV *const best_mv = &x->best_mv.as_mv;
+  const int search_range = SEARCH_RANGE_8P;
+  const int sadpb = x->sadperbit16;
+  // Allow joint search multiple times iteratively for each reference frame
+  // and break out of the search loop if it couldn't find a better mv.
+  for (ite = 0; ite < 4; ite++) {
+    struct buf_2d ref_yv12[2];
+    int bestsme = INT_MAX;
+    MvLimits tmp_mv_limits = x->mv_limits;
+    int id = ite % 2;  // Even iterations search in the first reference frame,
+                       // odd iterations search in the second. The predictor
+                       // found for the 'other' reference frame is factored in.
+    if (ite >= 2 && cur_mv[!id].as_int == init_mv[!id].as_int) {
+      if (cur_mv[id].as_int == init_mv[id].as_int) {
+        break;
+      } else {
+        // The MVs differ at sub-pel precision; stop anyway if they agree
+        // once both are reduced to full-pel resolution.
+        int_mv cur_int_mv, init_int_mv;
+        cur_int_mv.as_mv.col = cur_mv[id].as_mv.col >> 3;
+        // Fixed: the row component was previously derived from .col, so the
+        // full-pel comparison below tested the wrong value.
+        cur_int_mv.as_mv.row = cur_mv[id].as_mv.row >> 3;
+        init_int_mv.as_mv.row = init_mv[id].as_mv.row >> 3;
+        init_int_mv.as_mv.col = init_mv[id].as_mv.col >> 3;
+        if (cur_int_mv.as_int == init_int_mv.as_int) {
+          break;
+        }
+      }
+    }
+    for (ref = 0; ref < 2; ++ref) {
+      ref_mv[ref] = av1_get_ref_mv(x, ref);
+      // Swap out the reference frame for a version that's been scaled to
+      // match the resolution of the current frame, allowing the existing
+      // motion search code to be used without additional modifications.
+      if (scaled_ref_frame[ref]) {
+        int i;
+        for (i = 0; i < num_planes; i++)
+          backup_yv12[ref][i] = xd->plane[i].pre[ref];
+        av1_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
+                             NULL, num_planes);
+      }
+    }
+
+    assert(IMPLIES(scaled_ref_frame[0] != NULL,
+                   cm->width == scaled_ref_frame[0]->y_crop_width &&
+                       cm->height == scaled_ref_frame[0]->y_crop_height));
+    assert(IMPLIES(scaled_ref_frame[1] != NULL,
+                   cm->width == scaled_ref_frame[1]->y_crop_width &&
+                       cm->height == scaled_ref_frame[1]->y_crop_height));
+
+    // Initialize based on (possibly scaled) prediction buffers.
+    ref_yv12[0] = xd->plane[plane].pre[0];
+    ref_yv12[1] = xd->plane[plane].pre[1];
+
+    // Get the prediction block from the 'other' reference frame.
+    const InterpFilters interp_filters = EIGHTTAP_REGULAR;
+
+    // Since we have scaled the reference frames to match the size of the
+    // current frame we must use a unit scaling factor during mode selection.
+    av1_build_inter_predictor(ref_yv12[!id].buf, ref_yv12[!id].stride,
+                              second_pred, pw, &cur_mv[!id].as_mv,
+                              &cm->sf_identity, pw, ph, &conv_params,
+                              interp_filters, &warp_types[!id], p_col, p_row,
+                              plane, !id, MV_PRECISION_Q3, mi_col * MI_SIZE,
+                              mi_row * MI_SIZE, xd, cm->allow_warped_motion);
+
+    const int order_idx = id != 0;
+    av1_jnt_comp_weight_assign(cm, mbmi, order_idx, &xd->jcp_param.fwd_offset,
+                               &xd->jcp_param.bck_offset,
+                               &xd->jcp_param.use_jnt_comp_avg, 1);
+
+    // Do full-pixel compound motion search on the current reference frame.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+    av1_set_mv_search_range(&x->mv_limits, &ref_mv[id].as_mv);
+
+    // Use the mv result from the single mode as mv predictor.
+    *best_mv = cur_mv[id].as_mv;
+
+    best_mv->col >>= 3;
+    best_mv->row >>= 3;
+
+    av1_set_mvcost(x, id, ref_mv_idx);
+
+    // Small-range full-pixel motion search.
+    bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
+                                       &cpi->fn_ptr[bsize], mask, mask_stride,
+                                       id, &ref_mv[id].as_mv, second_pred);
+    if (bestsme < INT_MAX) {
+      if (mask)
+        bestsme = av1_get_mvpred_mask_var(x, best_mv, &ref_mv[id].as_mv,
+                                          second_pred, mask, mask_stride, id,
+                                          &cpi->fn_ptr[bsize], 1);
+      else
+        bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+                                        second_pred, &cpi->fn_ptr[bsize], 1);
+    }
+
+    x->mv_limits = tmp_mv_limits;
+
+    // Restore the pointer to the first (possibly scaled) prediction buffer.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+
+    for (ref = 0; ref < 2; ++ref) {
+      if (scaled_ref_frame[ref]) {
+        // Swap back the original buffers for subpel motion search.
+        for (int i = 0; i < num_planes; i++) {
+          xd->plane[i].pre[ref] = backup_yv12[ref][i];
+        }
+        // Re-initialize based on unscaled prediction buffers.
+        ref_yv12[ref] = xd->plane[plane].pre[ref];
+      }
+    }
+
+    // Do sub-pixel compound motion search on the current reference frame.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[id];
+
+    if (cpi->common.cur_frame_force_integer_mv) {
+      // Integer-only frames: scale the full-pel result back to 1/8-pel units.
+      x->best_mv.as_mv.row *= 8;
+      x->best_mv.as_mv.col *= 8;
+    }
+    if (bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0) {
+      int dis; /* TODO: use dis in distortion calculation later. */
+      unsigned int sse;
+      bestsme = cpi->find_fractional_mv_step(
+          x, cm, mi_row, mi_col, &ref_mv[id].as_mv,
+          cpi->common.allow_high_precision_mv, x->errorperbit,
+          &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+          x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask,
+          mask_stride, id, pw, ph, cpi->sf.use_accurate_subpel_search);
+    }
+
+    // Restore the pointer to the first prediction buffer.
+    if (id) xd->plane[plane].pre[0] = ref_yv12[0];
+    if (bestsme < last_besterr[id]) {
+      cur_mv[id].as_mv = *best_mv;
+      last_besterr[id] = bestsme;
+    } else {
+      break;
+    }
+  }
+
+  *rate_mv = 0;
+
+  for (ref = 0; ref < 2; ++ref) {
+    av1_set_mvcost(x, ref, ref_mv_idx);
+    const int_mv curr_ref_mv = av1_get_ref_mv(x, ref);
+    *rate_mv += av1_mv_bit_cost(&cur_mv[ref].as_mv, &curr_ref_mv.as_mv,
+                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+  }
+}
+
+// Computes the cost of signalling each reference-frame choice for the
+// current block.
+//   ref_costs_single[f]     - cost of using f as a single reference
+//                             (index INTRA_FRAME holds the intra cost).
+//   ref_costs_comp[f0][f1]  - cost of the compound pair (f0, f1).
+// If the SEG_LVL_REF_FRAME feature is active for segment_id, all costs are
+// zeroed. Otherwise the single-reference costs are accumulated from
+// x->intra_inter_cost and x->single_ref_cost using prediction contexts
+// derived from xd. Compound costs are derived from the compound cost tables
+// only when cm->reference_mode != SINGLE_REFERENCE; otherwise each compound
+// pair written here gets the fixed value 512.
+// Only these compound pairs are written in the non-segment path: every
+// (LAST..GOLDEN, BWDREF..ALTREF) pair, (LAST, LAST2), (LAST, LAST3),
+// (LAST, GOLDEN) and (BWDREF, ALTREF).
+static void estimate_ref_frame_costs(
+ const AV1_COMMON *cm, const MACROBLOCKD *xd, const MACROBLOCK *x,
+ int segment_id, unsigned int *ref_costs_single,
+ unsigned int (*ref_costs_comp)[REF_FRAMES]) {
+ int seg_ref_active =
+ segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME);
+ if (seg_ref_active) {
+ memset(ref_costs_single, 0, REF_FRAMES * sizeof(*ref_costs_single));
+ int ref_frame;
+ // Zero the compound table one row of REF_FRAMES entries at a time.
+ for (ref_frame = 0; ref_frame < REF_FRAMES; ++ref_frame)
+ memset(ref_costs_comp[ref_frame], 0,
+ REF_FRAMES * sizeof((*ref_costs_comp)[0]));
+ } else {
+ int intra_inter_ctx = av1_get_intra_inter_context(xd);
+ ref_costs_single[INTRA_FRAME] = x->intra_inter_cost[intra_inter_ctx][0];
+ // Every inter reference starts from the cost of choosing "inter".
+ unsigned int base_cost = x->intra_inter_cost[intra_inter_ctx][1];
+
+ for (int i = LAST_FRAME; i <= ALTREF_FRAME; ++i)
+ ref_costs_single[i] = base_cost;
+
+ const int ctx_p1 = av1_get_pred_context_single_ref_p1(xd);
+ const int ctx_p2 = av1_get_pred_context_single_ref_p2(xd);
+ const int ctx_p3 = av1_get_pred_context_single_ref_p3(xd);
+ const int ctx_p4 = av1_get_pred_context_single_ref_p4(xd);
+ const int ctx_p5 = av1_get_pred_context_single_ref_p5(xd);
+ const int ctx_p6 = av1_get_pred_context_single_ref_p6(xd);
+
+ // Determine cost of a single ref frame, where frame types are represented
+ // by a tree:
+ // Level 0: add cost whether this ref is a forward or backward ref
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p1][0][0];
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+ ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p1][0][1];
+
+ // Level 1: if this ref is forward ref,
+ // add cost whether it is last/last2 or last3/golden
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p3][2][0];
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p3][2][1];
+
+ // Level 1: if this ref is backward ref
+ // then add cost whether this ref is bwdref/altref2 or altref
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p2][1][0];
+ ref_costs_single[ALTREF_FRAME] += x->single_ref_cost[ctx_p2][1][1];
+
+ // Level 2: further add cost whether this ref is last or last2
+ ref_costs_single[LAST_FRAME] += x->single_ref_cost[ctx_p4][3][0];
+ ref_costs_single[LAST2_FRAME] += x->single_ref_cost[ctx_p4][3][1];
+
+ // Level 2: last3 or golden
+ ref_costs_single[LAST3_FRAME] += x->single_ref_cost[ctx_p5][4][0];
+ ref_costs_single[GOLDEN_FRAME] += x->single_ref_cost[ctx_p5][4][1];
+
+ // Level 2: bwdref or altref2
+ ref_costs_single[BWDREF_FRAME] += x->single_ref_cost[ctx_p6][5][0];
+ ref_costs_single[ALTREF2_FRAME] += x->single_ref_cost[ctx_p6][5][1];
+
+ if (cm->reference_mode != SINGLE_REFERENCE) {
+ // Similar to single ref, determine cost of compound ref frames.
+ // cost_compound_refs = cost_first_ref + cost_second_ref
+ const int bwdref_comp_ctx_p = av1_get_pred_context_comp_bwdref_p(xd);
+ const int bwdref_comp_ctx_p1 = av1_get_pred_context_comp_bwdref_p1(xd);
+ const int ref_comp_ctx_p = av1_get_pred_context_comp_ref_p(xd);
+ const int ref_comp_ctx_p1 = av1_get_pred_context_comp_ref_p1(xd);
+ const int ref_comp_ctx_p2 = av1_get_pred_context_comp_ref_p2(xd);
+
+ const int comp_ref_type_ctx = av1_get_comp_reference_type_context(xd);
+ // Per-frame partial costs for pairs with one forward and one backward
+ // reference; the two halves are summed further below.
+ unsigned int ref_bicomp_costs[REF_FRAMES] = { 0 };
+
+ // The forward half carries base_cost plus comp_ref_type_cost[..][1];
+ // the backward half starts at zero so neither is counted twice.
+ ref_bicomp_costs[LAST_FRAME] = ref_bicomp_costs[LAST2_FRAME] =
+ ref_bicomp_costs[LAST3_FRAME] = ref_bicomp_costs[GOLDEN_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][1];
+ ref_bicomp_costs[BWDREF_FRAME] = ref_bicomp_costs[ALTREF2_FRAME] = 0;
+ ref_bicomp_costs[ALTREF_FRAME] = 0;
+
+ // cost of first ref frame
+ ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][0];
+ ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+ ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[LAST_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[LAST2_FRAME] += x->comp_ref_cost[ref_comp_ctx_p1][1][1];
+
+ ref_bicomp_costs[LAST3_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][0];
+ ref_bicomp_costs[GOLDEN_FRAME] += x->comp_ref_cost[ref_comp_ctx_p2][2][1];
+
+ // cost of second ref frame
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][0];
+ ref_bicomp_costs[ALTREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p][0][1];
+
+ ref_bicomp_costs[BWDREF_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][0];
+ ref_bicomp_costs[ALTREF2_FRAME] +=
+ x->comp_bwdref_cost[bwdref_comp_ctx_p1][1][1];
+
+ // cost: if one ref frame is forward ref, the other ref is backward ref
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1) {
+ ref_costs_comp[ref0][ref1] =
+ ref_bicomp_costs[ref0] + ref_bicomp_costs[ref1];
+ }
+ }
+
+ // cost: if both ref frames are the same side.
+ // These pairs use comp_ref_type_cost[..][0] and the uni_comp_ref_cost
+ // tables instead of the per-frame partial costs above.
+ const int uni_comp_ref_ctx_p = av1_get_pred_context_uni_comp_ref_p(xd);
+ const int uni_comp_ref_ctx_p1 = av1_get_pred_context_uni_comp_ref_p1(xd);
+ const int uni_comp_ref_ctx_p2 = av1_get_pred_context_uni_comp_ref_p2(xd);
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][0];
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][0];
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p1][1][1] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p2][2][1];
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] =
+ base_cost + x->comp_ref_type_cost[comp_ref_type_ctx][0] +
+ x->uni_comp_ref_cost[uni_comp_ref_ctx_p][0][1];
+ } else {
+ // Compound prediction is not in use for this frame: give the same pairs
+ // a fixed placeholder cost of 512.
+ int ref0, ref1;
+ for (ref0 = LAST_FRAME; ref0 <= GOLDEN_FRAME; ++ref0) {
+ for (ref1 = BWDREF_FRAME; ref1 <= ALTREF_FRAME; ++ref1)
+ ref_costs_comp[ref0][ref1] = 512;
+ }
+ ref_costs_comp[LAST_FRAME][LAST2_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][LAST3_FRAME] = 512;
+ ref_costs_comp[LAST_FRAME][GOLDEN_FRAME] = 512;
+ ref_costs_comp[BWDREF_FRAME][ALTREF_FRAME] = 512;
+ }
+ }
+}
+
+// Snapshots the current coding decision into ctx so that it can be restored
+// later if this way of encoding the block is selected.
+//   mode_index     - stored as ctx->best_mode_index.
+//   comp_pred_diff - per-reference-mode differences; each entry is narrowed
+//                    to int when stored.
+//   skippable      - stored as ctx->skippable.
+static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                                 int mode_index,
+                                 int64_t comp_pred_diff[REFERENCE_MODES],
+                                 int skippable) {
+  MACROBLOCKD *const mbd = &x->e_mbd;
+
+  // Mode info and its extension are copied by value.
+  ctx->mic = *mbd->mi[0];
+  ctx->mbmi_ext = *x->mbmi_ext;
+
+  ctx->best_mode_index = mode_index;
+  ctx->skippable = skippable;
+  ctx->skip = x->skip;
+
+  ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
+  ctx->comp_pred_diff = (int)comp_pred_diff[COMPOUND_REFERENCE];
+  ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
+}
+
+// Prepares everything needed to evaluate inter modes against ref_frame:
+//   1. points yv12_mb[ref_frame] at the reference buffer for the block at
+//      (mi_row, mi_col),
+//   2. fills the candidate MV lists in x->mbmi_ext for ref_frame, and
+//   3. runs av1_mv_pred() on the luma plane for block_size.
+// ref_frame is used as `ref_frame - 1` to index cm->frame_refs, so it must
+// be an inter reference (>= 1).
+static void setup_buffer_ref_mvs_inter(
+    const AV1_COMP *const cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
+    BLOCK_SIZE block_size, int mi_row, int mi_col,
+    struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+
+  assert(yv12 != NULL);
+
+  // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
+  // use the UV scaling factors.
+  av1_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf,
+                       num_planes);
+
+  // Gets an initial list of candidate vectors from neighbours and orders them
+  av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+                   mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                   mi_col, mbmi_ext->mode_context);
+
+  // Further refinement that is encode side only to test the top few candidates
+  // in full and choose the best as the centre point for subsequent searches.
+  // The current implementation doesn't support scaling.
+  // (block_size is consumed here, so no unused-parameter cast is needed.)
+  av1_mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, ref_frame,
+              block_size);
+}
+
+// Motion search for the reference at ref_idx of the current block.
+// Runs a full-pixel search (regular or OBMC, selected by mbmi->motion_mode)
+// and then, unless integer MVs are forced for the frame or the full-pixel
+// search returned INT_MAX, a sub-pixel refinement.
+// Outputs:
+//   x->best_mv - the chosen motion vector.
+//   *rate_mv   - bit cost of x->best_mv relative to the reference MV.
+// NOTE: on the adaptive-motion-search early exit below, x->best_mv is set to
+// INVALID_MV and *rate_mv is NOT written; callers must check x->best_mv.
+static void single_motion_search(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                 int ref_idx, int *rate_mv) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0, 0, 0, 0 } };
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[ref_idx];
+  MV ref_mv = av1_get_ref_mv(x, ref_idx).as_mv;
+
+  MvLimits tmp_mv_limits = x->mv_limits;
+  int cost_list[5];
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      av1_get_scaled_ref_frame(cpi, ref);
+
+  if (scaled_ref_frame) {
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // full-pixel motion search code to be used without additional
+    // modifications.
+    for (int i = 0; i < num_planes; i++) {
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+    }
+    av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+                         num_planes);
+  }
+
+  av1_set_mvcost(
+      x, ref_idx,
+      mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is AOMMAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take the weighted average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param =
+        (av1_init_search_range(x->max_mv_context[ref]) + cpi->mv_step_param) /
+        2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  if (cpi->sf.adaptive_motion_search && bsize < cm->seq_params.sb_size) {
+    int boffset =
+        2 * (mi_size_wide_log2[cm->seq_params.sb_size] -
+             AOMMIN(mi_size_high_log2[bsize], mi_size_wide_log2[bsize]));
+    step_param = AOMMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = mi_size_wide_log2[bsize];
+    int bhl = mi_size_high_log2[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5) {
+      step_param += 2;
+      step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 1);
+    }
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_RANDOM) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        // Give up on this reference when its predicted-MV SAD is more than
+        // 8x that of some other reference.
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          x->best_mv.as_int = INVALID_MV;
+
+          if (scaled_ref_frame) {
+            // Swap back the original buffers before returning.
+            for (int j = 0; j < num_planes; ++j)
+              xd->plane[j].pre[ref_idx] = backup_yv12[j];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  // Note: MV limits are modified here. Always restore the original values
+  // after full-pixel motion search.
+  av1_set_mv_search_range(&x->mv_limits, &ref_mv);
+
+  if (mbmi->motion_mode != SIMPLE_TRANSLATION)
+    mvp_full = mbmi->mv[0].as_mv;
+  else
+    mvp_full = ref_mv;
+
+  // Drop the 3 fractional bits to get the full-pixel starting point.
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  x->best_mv.as_int = x->second_best_mv.as_int = INVALID_MV;
+
+  switch (mbmi->motion_mode) {
+    case SIMPLE_TRANSLATION:
+      bestsme = av1_full_pixel_search(
+          cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+          sadpb, cond_cost_list(cpi, cost_list), &ref_mv, INT_MAX, 1,
+          (MI_SIZE * mi_col), (MI_SIZE * mi_row), 0);
+      break;
+    case OBMC_CAUSAL:
+      bestsme = av1_obmc_full_pixel_search(cpi, x, &mvp_full, step_param, sadpb,
+                                           MAX_MVSEARCH_STEPS - 1 - step_param,
+                                           1, &cpi->fn_ptr[bsize], &ref_mv,
+                                           &(x->best_mv.as_mv), 0);
+      break;
+    default: assert(0 && "Invalid motion mode!\n");
+  }
+
+  if (scaled_ref_frame) {
+    // Swap back the original buffers for subpel motion search.
+    for (int i = 0; i < num_planes; i++) {
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+    }
+  }
+
+  x->mv_limits = tmp_mv_limits;
+
+  // With forced integer MVs there is no sub-pixel step, so the full-pixel
+  // result is scaled up by 8 directly.
+  if (cpi->common.cur_frame_force_integer_mv) {
+    x->best_mv.as_mv.row *= 8;
+    x->best_mv.as_mv.col *= 8;
+  }
+  const int use_fractional_mv =
+      bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+  if (use_fractional_mv) {
+    int dis; /* TODO: use dis in distortion calculation later. */
+    switch (mbmi->motion_mode) {
+      case SIMPLE_TRANSLATION:
+        if (cpi->sf.use_accurate_subpel_search) {
+          int best_mv_var;
+          const int try_second = x->second_best_mv.as_int != INVALID_MV &&
+                                 x->second_best_mv.as_int != x->best_mv.as_int;
+          const int pw = block_size_wide[bsize];
+          const int ph = block_size_high[bsize];
+
+          best_mv_var = cpi->find_fractional_mv_step(
+              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+              0, 0, pw, ph, 1);
+
+          if (try_second) {
+            // Allowed range for the second candidate: x->mv_limits scaled by
+            // 8, intersected with ref_mv +/- MV_MAX.
+            const int minc =
+                AOMMAX(x->mv_limits.col_min * 8, ref_mv.col - MV_MAX);
+            const int maxc =
+                AOMMIN(x->mv_limits.col_max * 8, ref_mv.col + MV_MAX);
+            const int minr =
+                AOMMAX(x->mv_limits.row_min * 8, ref_mv.row - MV_MAX);
+            const int maxr =
+                AOMMIN(x->mv_limits.row_max * 8, ref_mv.row + MV_MAX);
+            int this_var;
+            MV best_mv = x->best_mv.as_mv;
+
+            // find_fractional_mv_step() refines x->best_mv in place, so the
+            // second candidate is temporarily installed there.
+            x->best_mv = x->second_best_mv;
+            if (x->best_mv.as_mv.row * 8 <= maxr &&
+                x->best_mv.as_mv.row * 8 >= minr &&
+                x->best_mv.as_mv.col * 8 <= maxc &&
+                x->best_mv.as_mv.col * 8 >= minc) {
+              this_var = cpi->find_fractional_mv_step(
+                  x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list), x->nmvjointcost, x->mvcost,
+                  &dis, &x->pred_sse[ref], NULL, NULL, 0, 0, pw, ph, 1);
+              if (this_var < best_mv_var) best_mv = x->best_mv.as_mv;
+            }
+            // Always write back the winner. Previously this happened only
+            // inside the range check, so an out-of-range second candidate
+            // was left in x->best_mv unrefined.
+            x->best_mv.as_mv = best_mv;
+          }
+        } else {
+          cpi->find_fractional_mv_step(
+              x, cm, mi_row, mi_col, &ref_mv, cm->allow_high_precision_mv,
+              x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.mv.subpel_force_stop,
+              cpi->sf.mv.subpel_iters_per_step, cond_cost_list(cpi, cost_list),
+              x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], NULL, NULL,
+              0, 0, 0, 0, 0);
+        }
+        break;
+      case OBMC_CAUSAL:
+        av1_find_best_obmc_sub_pixel_tree_up(
+            x, cm, mi_row, mi_col, &x->best_mv.as_mv, &ref_mv,
+            cm->allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize],
+            cpi->sf.mv.subpel_force_stop, cpi->sf.mv.subpel_iters_per_step,
+            x->nmvjointcost, x->mvcost, &dis, &x->pred_sse[ref], 0,
+            cpi->sf.use_accurate_subpel_search);
+        break;
+      default: assert(0 && "Invalid motion mode!\n");
+    }
+  }
+  *rate_mv = av1_mv_bit_cost(&x->best_mv.as_mv, &ref_mv, x->nmvjointcost,
+                             x->mvcost, MV_COST_WEIGHT);
+
+  // Remember the result as the predictor for later searches on this ref.
+  if (cpi->sf.adaptive_motion_search && mbmi->motion_mode == SIMPLE_TRANSLATION)
+    x->pred_mv[ref] = x->best_mv.as_mv;
+}
+
+// Points the destination buffer of the first num_planes planes of xd back at
+// the buffers and strides recorded in dst.
+static INLINE void restore_dst_buf(MACROBLOCKD *xd, BUFFER_SET dst,
+                                   const int num_planes) {
+  for (int plane = 0; plane < num_planes; ++plane) {
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    pd->dst.buf = dst.plane[plane];
+    pd->dst.stride = dst.stride[plane];
+  }
+}
+
+// Builds the plane-0 prediction from the *other* reference of a compound
+// block (the one at index !ref_idx) using other_mv, and writes it to
+// second_pred with a stride equal to the block width. Also refreshes the
+// compound weights in xd->jcp_param.
+// Must only be called for blocks that have a second reference.
+static void build_second_inter_pred(const AV1_COMP *cpi, MACROBLOCK *x,
+                                    BLOCK_SIZE bsize, const MV *other_mv,
+                                    int mi_row, int mi_col, const int block,
+                                    int ref_idx, uint8_t *second_pred) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(mbmi));
+
+  const int plane = 0;
+  const int other_idx = !ref_idx;
+  const int block_w = block_size_wide[bsize];
+  const int block_h = block_size_high[bsize];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+
+  // blk_col and blk_row are the 4x4 coordinates of the sub8x8 at index "block"
+  const int blk_col = block & 1;
+  const int blk_row = (block - blk_col) >> 1;
+  const int p_col = ((mi_col * MI_SIZE) >> pd->subsampling_x) + 4 * blk_col;
+  const int p_row = ((mi_row * MI_SIZE) >> pd->subsampling_y) + 4 * blk_row;
+
+  // Scale factors are derived from the other reference's own dimensions.
+  struct buf_2d other_buf = pd->pre[other_idx];
+  struct scale_factors sf;
+  av1_setup_scale_factors_for_frame(&sf, other_buf.width, other_buf.height,
+                                    cm->width, cm->height);
+
+  const WarpedMotionParams *const wm =
+      &xd->global_motion[mbmi->ref_frame[other_idx]];
+  WarpTypesAllowed warp_types;
+  warp_types.global_warp_allowed = is_global_mv_block(mbmi, wm->wmtype);
+  warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+
+  // Get the prediction block from the 'other' reference frame.
+  av1_build_inter_predictor(other_buf.buf, other_buf.stride, second_pred,
+                            block_w, other_mv, &sf, block_w, block_h,
+                            &conv_params, mbmi->interp_filters, &warp_types,
+                            p_col, p_row, plane, other_idx, MV_PRECISION_Q3,
+                            mi_col * MI_SIZE, mi_row * MI_SIZE, xd,
+                            cm->allow_warped_motion);
+
+  av1_jnt_comp_weight_assign(cm, mbmi, 0, &xd->jcp_param.fwd_offset,
+                             &xd->jcp_param.bck_offset,
+                             &xd->jcp_param.use_jnt_comp_avg, 1);
+}
+
+// Search for the best mv for one component of a compound,
+// given that the other component is fixed.
+//   this_mv     - in: starting MV for the component at ref_idx;
+//                 out: the refined MV (only updated if the search produced a
+//                 result, i.e. bestsme < INT_MAX).
+//   second_pred - prediction from the fixed component.
+//   mask        - optional blending mask (NULL selects the averaging
+//                 variance function); mask_stride is its stride.
+//   rate_mv     - out: bit cost of the final *this_mv relative to the
+//                 reference MV.
+// x->best_mv is used as scratch and x->mv_limits is modified and restored.
+static void compound_single_motion_search(const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, MV *this_mv,
+ int mi_row, int mi_col,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ int *rate_mv, int ref_idx) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const int pw = block_size_wide[bsize];
+ const int ph = block_size_high[bsize];
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int ref = mbmi->ref_frame[ref_idx];
+ const int_mv ref_mv = av1_get_ref_mv(x, ref_idx);
+ struct macroblockd_plane *const pd = &xd->plane[0];
+
+ struct buf_2d backup_yv12[MAX_MB_PLANE];
+ const YV12_BUFFER_CONFIG *const scaled_ref_frame =
+ av1_get_scaled_ref_frame(cpi, ref);
+
+ // Check that this is either an interinter or an interintra block
+ assert(has_second_ref(mbmi) || (ref_idx == 0 && is_interintra_mode(mbmi)));
+
+ // Store the first prediction buffer.
+ // For ref_idx != 0 the searched reference is temporarily installed in
+ // pre[0]; the original pre[0] is put back before returning.
+ struct buf_2d orig_yv12;
+ if (ref_idx) {
+ orig_yv12 = pd->pre[0];
+ pd->pre[0] = pd->pre[ref_idx];
+ }
+
+ if (scaled_ref_frame) {
+ int i;
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // full-pixel motion search code to be used without additional
+ // modifications.
+ // NOTE(review): for ref_idx != 0, pre[0] was copied from pre[ref_idx]
+ // before this swap, so pre[0] does not appear to receive the scaled
+ // buffers set up here - confirm this is intended.
+ for (i = 0; i < num_planes; i++) backup_yv12[i] = xd->plane[i].pre[ref_idx];
+ av1_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL,
+ num_planes);
+ }
+
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ MV *const best_mv = &x->best_mv.as_mv;
+ int search_range = SEARCH_RANGE_8P;
+
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ // Do compound motion search on the current reference frame.
+ av1_set_mv_search_range(&x->mv_limits, &ref_mv.as_mv);
+
+ // Use the mv result from the single mode as mv predictor.
+ *best_mv = *this_mv;
+
+ // Drop the 3 fractional bits to get the full-pixel starting point.
+ best_mv->col >>= 3;
+ best_mv->row >>= 3;
+
+ av1_set_mvcost(
+ x, ref_idx,
+ mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+
+ // Small-range full-pixel motion search.
+ bestsme = av1_refining_search_8p_c(x, sadpb, search_range,
+ &cpi->fn_ptr[bsize], mask, mask_stride,
+ ref_idx, &ref_mv.as_mv, second_pred);
+ if (bestsme < INT_MAX) {
+ // Re-evaluate the winner with the variance-based cost, masked or
+ // averaged depending on whether a mask was supplied.
+ if (mask)
+ bestsme =
+ av1_get_mvpred_mask_var(x, best_mv, &ref_mv.as_mv, second_pred, mask,
+ mask_stride, ref_idx, &cpi->fn_ptr[bsize], 1);
+ else
+ bestsme = av1_get_mvpred_av_var(x, best_mv, &ref_mv.as_mv, second_pred,
+ &cpi->fn_ptr[bsize], 1);
+ }
+
+ x->mv_limits = tmp_mv_limits;
+
+ if (scaled_ref_frame) {
+ // Swap back the original buffers for subpel motion search.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[ref_idx] = backup_yv12[i];
+ }
+ }
+
+ // With forced integer MVs there is no sub-pixel step, so the full-pixel
+ // result is scaled up by 8 directly.
+ if (cpi->common.cur_frame_force_integer_mv) {
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+ }
+ const int use_fractional_mv =
+ bestsme < INT_MAX && cpi->common.cur_frame_force_integer_mv == 0;
+ if (use_fractional_mv) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+ bestsme = cpi->find_fractional_mv_step(
+ x, cm, mi_row, mi_col, &ref_mv.as_mv,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[bsize], 0, cpi->sf.mv.subpel_iters_per_step, NULL,
+ x->nmvjointcost, x->mvcost, &dis, &sse, second_pred, mask, mask_stride,
+ ref_idx, pw, ph, cpi->sf.use_accurate_subpel_search);
+ }
+
+ // Restore the pointer to the first unscaled prediction buffer.
+ if (ref_idx) pd->pre[0] = orig_yv12;
+
+ // Only commit the result if the search produced a valid error.
+ if (bestsme < INT_MAX) *this_mv = *best_mv;
+
+ *rate_mv = 0;
+
+ av1_set_mvcost(
+ x, ref_idx,
+ mbmi->ref_mv_idx + (have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0));
+ *rate_mv += av1_mv_bit_cost(this_mv, &ref_mv.as_mv, x->nmvjointcost,
+ x->mvcost, MV_COST_WEIGHT);
+}
+
+// Convenience wrapper around compound_single_motion_search() for inter-inter
+// compounds: builds the prediction of the fixed component (cur_mv[!ref_idx])
+// into a local buffer and then refines cur_mv[ref_idx] against it.
+static void compound_single_motion_search_interinter(
+    const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *cur_mv,
+    int mi_row, int mi_col, const uint8_t *mask, int mask_stride, int *rate_mv,
+    const int block, int ref_idx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  // This function should only ever be called for compound modes
+  assert(has_second_ref(xd->mi[0]));
+
+  // Backing store for the second prediction. It is declared as 16-bit
+  // samples and viewed through a byte pointer in both bit-depth paths.
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
+  const int is_hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  uint8_t *const second_pred = is_hbd
+                                   ? CONVERT_TO_BYTEPTR(second_pred_alloc_16)
+                                   : (uint8_t *)second_pred_alloc_16;
+
+  build_second_inter_pred(cpi, x, bsize, &cur_mv[!ref_idx].as_mv, mi_row,
+                          mi_col, block, ref_idx, second_pred);
+
+  compound_single_motion_search(cpi, x, bsize, &cur_mv[ref_idx].as_mv, mi_row,
+                                mi_col, second_pred, mask, mask_stride,
+                                rate_mv, ref_idx);
+}
+
+// Masked compound motion search. Copies cur_mv[0..1] into tmp_mv[0..1] and
+// then refines, using the mask derived from comp_data:
+//   which == 0 - only tmp_mv[0]
+//   which == 1 - only tmp_mv[1]
+//   which == 2 - both, via joint_motion_search()
+// Any other value leaves tmp_mv as a plain copy and *rate_mv untouched.
+static void do_masked_motion_search_indexed(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, int_mv *tmp_mv, int *rate_mv, int which) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int mask_stride = block_size_wide[bsize];
+  // The mask is looked up with the block's own sb_type, not bsize.
+  const uint8_t *const mask =
+      av1_get_compound_type_mask(comp_data, xd->mi[0]->sb_type);
+
+  for (int i = 0; i < 2; ++i) tmp_mv[i].as_int = cur_mv[i].as_int;
+
+  switch (which) {
+    case 0:
+    case 1:
+      compound_single_motion_search_interinter(cpi, x, bsize, tmp_mv, mi_row,
+                                               mi_col, mask, mask_stride,
+                                               rate_mv, 0, which);
+      break;
+    case 2:
+      joint_motion_search(cpi, x, bsize, tmp_mv, mi_row, mi_col, NULL, mask,
+                          mask_stride, rate_mv, 0);
+      break;
+    default: break;
+  }
+}
+
+// Set to 1 to compile in discount_newmv_test(); with the value 0 everything
+// up to the matching #endif is excluded from the build.
+#define USE_DISCOUNT_NEWMV_TEST 0
+#if USE_DISCOUNT_NEWMV_TEST
+// In some situations we want to discount the apparent cost of a new motion
+// vector. Where there is a subtle motion field and especially where there is
+// low spatial complexity then it can be hard to cover the cost of a new motion
+// vector in a single block, even if that motion vector reduces distortion.
+// However, once established that vector may be usable through the nearest and
+// near mv modes to reduce distortion in subsequent blocks and also improve
+// visual quality.
+#define NEW_MV_DISCOUNT_FACTOR 8
+// Forward declaration; the definition is not in this part of the file.
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+ int ref_idx, int ref_mv_idx,
+ const MV_REFERENCE_FRAME *ref_frame,
+ const MB_MODE_INFO_EXT *mbmi_ext);
+// Returns non-zero when the NEWMV cost for this_mv should be discounted:
+// the mode is NEWMV with a non-zero MV, the source frame is not an alt-ref,
+// the nearest MV and every near MV for the block's first reference are zero,
+// and (for global motion types up to TRANSLATION) this_mv differs from the
+// global MV. Returns 0 otherwise.
+static int discount_newmv_test(const AV1_COMP *const cpi, const MACROBLOCK *x,
+ PREDICTION_MODE this_mode, int_mv this_mv) {
+ if (this_mode == NEWMV && this_mv.as_int != 0 &&
+ !cpi->rc.is_src_frame_alt_ref) {
+ // Only discount new_mv when nearest_mv and all near_mv are zero, and the
+ // new_mv is not equal to global_mv
+ const AV1_COMMON *const cm = &cpi->common;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ // Evaluate as a single-reference block on the first reference only.
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { mbmi->ref_frame[0],
+ NONE_FRAME };
+ const uint8_t ref_frame_type = av1_ref_frame_type(tmp_ref_frames);
+ int_mv nearest_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ int ret = nearest_mv.as_int == 0;
+ for (int ref_mv_idx = 0;
+ ref_mv_idx < x->mbmi_ext->ref_mv_count[ref_frame_type]; ++ref_mv_idx) {
+ int_mv near_mv;
+ get_this_mv(&near_mv, NEARMV, 0, ref_mv_idx, tmp_ref_frames, x->mbmi_ext);
+ ret &= near_mv.as_int == 0;
+ }
+ if (cm->global_motion[tmp_ref_frames[0]].wmtype <= TRANSLATION) {
+ int_mv global_mv;
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ ret &= global_mv.as_int != this_mv.as_int;
+ }
+ return ret;
+ }
+ return 0;
+}
+#endif
+
+// Distance a motion vector may reach beyond each frame edge: the border size
+// minus the interpolation extension, shifted left by 3 (the same factor of 8
+// used for motion vectors elsewhere in this file).
+#define LEFT_TOP_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+#define RIGHT_BOTTOM_MARGIN ((AOM_BORDER_IN_PIXELS - AOM_INTERP_EXTEND) << 3)
+
+// Clamps *mv to the block's edge distances in xd, widened by the margins
+// above on every side.
+// TODO(jingning): this mv clamping function should be block size dependent.
+static INLINE void clamp_mv2(MV *mv, const MACROBLOCKD *xd) {
+  const int min_col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+  const int max_col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+  const int min_row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+  const int max_row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+  clamp_mv(mv, min_col, max_col, min_row, max_row);
+}
+
+// Fast estimate of the wedge sign for a compound block.
+// The block is split into four quadrants (0: top-left, 1: top-right,
+// 2: bottom-left, 3: bottom-right) and the variance function's SSE output is
+// collected for each quadrant of each predictor against the source.
+//   pred0/stride0 - first predictor and its stride.
+//   pred1/stride1 - second predictor and its stride.
+// Returns 1 if tl + br > 0, else 0. The top-right and bottom-left terms
+// cancel in that sum, so the decision reduces to
+//   (esq0[TL] - esq1[TL]) + (esq1[BR] - esq0[BR]) > 0.
+// bsize must be a size with a valid quarter size (asserted).
+static int estimate_wedge_sign(const AV1_COMP *cpi, const MACROBLOCK *x,
+                               const BLOCK_SIZE bsize, const uint8_t *pred0,
+                               int stride0, const uint8_t *pred1, int stride1) {
+  // Block size covering one quadrant of each block size.
+  static const BLOCK_SIZE split_qtr[BLOCK_SIZES_ALL] = {
+    // 4X4
+    BLOCK_INVALID,
+    // 4X8, 8X4, 8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16, 16X8, 16X16
+    BLOCK_4X8, BLOCK_8X4, BLOCK_8X8,
+    // 16X32, 32X16, 32X32
+    BLOCK_8X16, BLOCK_16X8, BLOCK_16X16,
+    // 32X64, 64X32, 64X64
+    BLOCK_16X32, BLOCK_32X16, BLOCK_32X32,
+    // 64x128, 128x64, 128x128
+    BLOCK_32X64, BLOCK_64X32, BLOCK_64X64,
+    // 4X16, 16X4, 8X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X16,
+    // 32X8, 16X64, 64X16
+    BLOCK_16X4, BLOCK_8X32, BLOCK_32X8
+  };
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int bw = block_size_wide[bsize];
+  const int bh = block_size_high[bsize];
+  uint32_t esq[2][4];
+  int64_t tl, br;
+
+  const BLOCK_SIZE f_index = split_qtr[bsize];
+  assert(f_index != BLOCK_INVALID);
+
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+
+  // Quadrant errors of the first predictor.
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred0, stride0, &esq[0][0]);
+  cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred0 + bw / 2, stride0,
+                          &esq[0][1]);
+  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+                          pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+                          pred0 + bh / 2 * stride0 + bw / 2, stride0,
+                          &esq[0][3]);
+
+  // Quadrant errors of the second predictor. pred1 must be walked with
+  // stride1 throughout; the bottom-half calls previously passed stride0.
+  cpi->fn_ptr[f_index].vf(src, src_stride, pred1, stride1, &esq[1][0]);
+  cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride, pred1 + bw / 2, stride1,
+                          &esq[1][1]);
+  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride, src_stride,
+                          pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+  cpi->fn_ptr[f_index].vf(src + bh / 2 * src_stride + bw / 2, src_stride,
+                          pred1 + bh / 2 * stride1 + bw / 2, stride1,
+                          &esq[1][3]);
+
+  tl = ((int64_t)esq[0][0] + esq[0][1] + esq[0][2]) -
+       ((int64_t)esq[1][0] + esq[1][1] + esq[1][2]);
+  br = ((int64_t)esq[1][3] + esq[1][1] + esq[1][2]) -
+       ((int64_t)esq[0][3] + esq[0][1] + esq[0][2]);
+  return (tl + br > 0);
+}
+
+// Choose the best wedge index and sign
+//   p0               - first predictor, read with a stride equal to the
+//                      block width.
+//   residual1        - passed to the wedge helpers together with the locally
+//                      computed residual0 (src - pred0).
+//   diff10           - passed through to av1_wedge_sse_from_residuals().
+//   best_wedge_sign  - out: sign chosen for the best wedge.
+//   best_wedge_index - out: index of the best wedge.
+// Returns the best RD cost with the cost of signalling the chosen wedge
+// index subtracted again.
+static int64_t pick_wedge(const AV1_COMP *const cpi, const MACROBLOCK *const x,
+ const BLOCK_SIZE bsize, const uint8_t *const p0,
+ const int16_t *const residual1,
+ const int16_t *const diff10,
+ int *const best_wedge_sign,
+ int *const best_wedge_index) {
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const struct buf_2d *const src = &x->plane[0].src;
+ const int bw = block_size_wide[bsize];
+ const int bh = block_size_high[bsize];
+ const int N = bw * bh;
+ assert(N >= 64);
+ int rate;
+ int64_t dist;
+ int64_t rd, best_rd = INT64_MAX;
+ int wedge_index;
+ int wedge_sign;
+ int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+ const uint8_t *mask;
+ uint64_t sse;
+ const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+ // For high bit depth the SSE is rounded down by 2 * (bd - 8) bits below.
+ const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+
+ DECLARE_ALIGNED(32, int16_t, residual0[MAX_SB_SQUARE]); // src - pred0
+ if (hbd) {
+ aom_highbd_subtract_block(bh, bw, residual0, bw, src->buf, src->stride,
+ CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+ } else {
+ aom_subtract_block(bh, bw, residual0, bw, src->buf, src->stride, p0, bw);
+ }
+
+ // Must be computed before residual0 is overwritten below.
+ int64_t sign_limit = ((int64_t)aom_sum_squares_i16(residual0, N) -
+ (int64_t)aom_sum_squares_i16(residual1, N)) *
+ (1 << WEDGE_WEIGHT_BITS) / 2;
+ // ds aliases residual0: the delta squares are written in place, so
+ // residual0 no longer holds src - pred0 after the next call.
+ int16_t *ds = residual0;
+
+ av1_wedge_compute_delta_squares(ds, residual0, residual1, N);
+
+ for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+ // The sign is derived using the sign-0 mask; the SSE is then evaluated
+ // with the mask for the derived sign.
+ mask = av1_get_contiguous_soft_mask(wedge_index, 0, bsize);
+
+ wedge_sign = av1_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+ mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+ sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+ model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+ &rate, &dist);
+ // Disabled debug code comparing against the curve-fit model:
+ // int rate2;
+ // int64_t dist2;
+ // model_rd_with_curvfit(cpi, x, bsize, 0, sse, N, &rate2, &dist2);
+ // printf("sse %"PRId64": leagacy: %d %"PRId64", curvfit %d %"PRId64"\n",
+ // sse, rate, dist, rate2, dist2); dist = dist2;
+ // rate = rate2;
+
+ rate += x->wedge_idx_cost[bsize][wedge_index];
+ rd = RDCOST(x->rdmult, rate, dist);
+
+ if (rd < best_rd) {
+ *best_wedge_index = wedge_index;
+ *best_wedge_sign = wedge_sign;
+ best_rd = rd;
+ }
+ }
+
+ // The index cost was needed to rank the wedges but is removed from the
+ // value returned to the caller.
+ return best_rd -
+ RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Choose the best wedge index for the specified sign.
+// Every wedge shape available for |bsize| is evaluated with the sign held at
+// |wedge_sign|; the index with the lowest modelled RD cost is written to
+// |*best_wedge_index|. The returned value is that cost minus the RD cost of
+// the rate used to signal the chosen index.
+static int64_t pick_wedge_fixed_sign(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const int16_t *const residual1,
+                                     const int16_t *const diff10,
+                                     const int wedge_sign,
+                                     int *const best_wedge_index) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const int N = block_size_wide[bsize] * block_size_high[bsize];
+  assert(N >= 64);
+  const int num_wedges = (1 << get_wedge_bits_lookup(bsize));
+  // For high bit-depth buffers the SSE is rounded down by 2 * (bd - 8) bits.
+  const int is_hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int sse_shift = is_hbd ? (xd->bd - 8) * 2 : 0;
+  int64_t lowest_rd = INT64_MAX;
+
+  for (int idx = 0; idx < num_wedges; ++idx) {
+    const uint8_t *const wedge_mask =
+        av1_get_contiguous_soft_mask(idx, wedge_sign, bsize);
+    uint64_t sse =
+        av1_wedge_sse_from_residuals(residual1, diff10, wedge_mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, sse_shift);
+
+    int rate;
+    int64_t dist;
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    rate += x->wedge_idx_cost[bsize][idx];
+
+    const int64_t this_rd = RDCOST(x->rdmult, rate, dist);
+    if (this_rd < lowest_rd) {
+      *best_wedge_index = idx;
+      lowest_rd = this_rd;
+    }
+  }
+
+  return lowest_rd -
+         RDCOST(x->rdmult, x->wedge_idx_cost[bsize][*best_wedge_index], 0);
+}
+
+// Picks the wedge index and sign for an inter-inter wedge compound block,
+// stores both in mbmi->interinter_comp and returns the RD value reported by
+// the search. When cpi->sf.fast_wedge_sign_estimate is set the sign comes from
+// estimate_wedge_sign() and only the index is searched; otherwise pick_wedge()
+// chooses both.
+static int64_t pick_interinter_wedge(
+    const AV1_COMP *const cpi, MACROBLOCK *const x, const BLOCK_SIZE bsize,
+    const uint8_t *const p0, const uint8_t *const p1,
+    const int16_t *const residual1, const int16_t *const diff10) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  assert(is_interinter_compound_used(COMPOUND_WEDGE, bsize));
+  assert(cpi->common.seq_params.enable_masked_compound);
+
+  int chosen_index = -1;
+  int chosen_sign = 0;
+  int64_t chosen_rd;
+
+  if (!cpi->sf.fast_wedge_sign_estimate) {
+    chosen_rd = pick_wedge(cpi, x, bsize, p0, residual1, diff10, &chosen_sign,
+                           &chosen_index);
+  } else {
+    // Both predictors are passed with a stride equal to the block width.
+    const int pred_stride = block_size_wide[bsize];
+    chosen_sign =
+        estimate_wedge_sign(cpi, x, bsize, p0, pred_stride, p1, pred_stride);
+    chosen_rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1, diff10,
+                                      chosen_sign, &chosen_index);
+  }
+
+  mbmi->interinter_comp.wedge_sign = chosen_sign;
+  mbmi->interinter_comp.wedge_index = chosen_index;
+  return chosen_rd;
+}
+
+// Evaluates every difference-weighted (DIFFWTD) mask type for the predictor
+// pair |p0|/|p1|, records the type with the lowest modelled RD cost in
+// mbmi->interinter_comp.mask_type and returns that cost. The mask for type
+// index 0 is built directly in xd->seg_mask and the one for index 1 in a local
+// buffer; the local buffer is copied into xd->seg_mask when DIFFWTD_38_INV is
+// selected.
+static int64_t pick_interinter_seg(const AV1_COMP *const cpi,
+                                   MACROBLOCK *const x, const BLOCK_SIZE bsize,
+                                   const uint8_t *const p0,
+                                   const uint8_t *const p1,
+                                   const int16_t *const residual1,
+                                   const int16_t *const diff10) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  const int N = 1 << num_pels_log2_lookup[bsize];
+  const int is_hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int sse_shift = is_hbd ? (xd->bd - 8) * 2 : 0;
+
+  DECLARE_ALIGNED(16, uint8_t, seg_mask[2 * MAX_SB_SQUARE]);
+  uint8_t *mask_bufs[2] = { xd->seg_mask, seg_mask };
+
+  DIFFWTD_MASK_TYPE winner = 0;
+  int64_t lowest_rd = INT64_MAX;
+
+  // Try each mask type and its inverse.
+  for (DIFFWTD_MASK_TYPE type = 0; type < DIFFWTD_MASK_TYPES; type++) {
+    uint8_t *const mask = mask_bufs[type];
+
+    // Build the mask for this type.
+    if (is_hbd) {
+      av1_build_compound_diffwtd_mask_highbd(
+          mask, type, CONVERT_TO_BYTEPTR(p0), width, CONVERT_TO_BYTEPTR(p1),
+          width, height, width, xd->bd);
+    } else {
+      av1_build_compound_diffwtd_mask(mask, type, p0, width, p1, width, height,
+                                      width);
+    }
+
+    // Model the RD cost of blending with this mask.
+    uint64_t sse = av1_wedge_sse_from_residuals(residual1, diff10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, sse_shift);
+
+    int rate;
+    int64_t dist;
+    model_rd_sse_fn[MODELRD_TYPE_MASKED_COMPOUND](cpi, x, bsize, 0, sse, N,
+                                                  &rate, &dist);
+    const int64_t this_rd = RDCOST(x->rdmult, rate, dist);
+
+    if (this_rd < lowest_rd) {
+      winner = type;
+      lowest_rd = this_rd;
+    }
+  }
+
+  mbmi->interinter_comp.mask_type = winner;
+  if (winner == DIFFWTD_38_INV) {
+    memcpy(xd->seg_mask, seg_mask, N * 2);
+  }
+  return lowest_rd;
+}
+
+// Picks the wedge index for an inter-intra wedge block. The residual
+// (source - p1) and the predictor difference (p1 - p0) are computed locally,
+// the index search runs with the sign fixed at 0, and the result is stored in
+// mbmi->interintra_wedge_index (with interintra_wedge_sign set to 0). Returns
+// the value produced by pick_wedge_fixed_sign().
+static int64_t pick_interintra_wedge(const AV1_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  assert(is_interintra_wedge_used(bsize));
+  assert(cpi->common.seq_params.enable_interintra_compound);
+
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  DECLARE_ALIGNED(32, int16_t, residual1[MAX_SB_SQUARE]);  // src - pred1
+  DECLARE_ALIGNED(32, int16_t, diff10[MAX_SB_SQUARE]);     // pred1 - pred0
+
+  if (!get_bitdepth_data_path_index(xd)) {
+    aom_subtract_block(height, width, residual1, width, src->buf, src->stride,
+                       p1, width);
+    aom_subtract_block(height, width, diff10, width, p1, width, p0, width);
+  } else {
+    aom_highbd_subtract_block(height, width, residual1, width, src->buf,
+                              src->stride, CONVERT_TO_BYTEPTR(p1), width,
+                              xd->bd);
+    aom_highbd_subtract_block(height, width, diff10, width,
+                              CONVERT_TO_BYTEPTR(p1), width,
+                              CONVERT_TO_BYTEPTR(p0), width, xd->bd);
+  }
+
+  int chosen_index = -1;
+  const int64_t chosen_rd = pick_wedge_fixed_sign(cpi, x, bsize, residual1,
+                                                  diff10, 0, &chosen_index);
+
+  mbmi->interintra_wedge_sign = 0;
+  mbmi->interintra_wedge_index = chosen_index;
+  return chosen_rd;
+}
+
+// Dispatches mask selection on the block's current inter-inter compound type:
+// COMPOUND_WEDGE goes to pick_interinter_wedge() and COMPOUND_DIFFWTD to
+// pick_interinter_seg(). Any other type trips an assert and yields 0.
+static int64_t pick_interinter_mask(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                    const BLOCK_SIZE bsize,
+                                    const uint8_t *const p0,
+                                    const uint8_t *const p1,
+                                    const int16_t *const residual1,
+                                    const int16_t *const diff10) {
+  const COMPOUND_TYPE type = x->e_mbd.mi[0]->interinter_comp.type;
+
+  if (type == COMPOUND_WEDGE) {
+    return pick_interinter_wedge(cpi, x, bsize, p0, p1, residual1, diff10);
+  }
+  if (type == COMPOUND_DIFFWTD) {
+    return pick_interinter_seg(cpi, x, bsize, p0, p1, residual1, diff10);
+  }
+
+  assert(0);
+  return 0;
+}
+
+// Runs the masked motion search for a compound mode that contains NEWMV and
+// copies the resulting vector(s) into mbmi->mv[]. mbmi->interinter_comp.seg_mask
+// is always pointed at xd->seg_mask first. Returns the MV rate reported by the
+// search, or 0 when |this_mode| is none of the handled modes (no search runs).
+static int interinter_compound_motion_search(const AV1_COMP *const cpi,
+                                             MACROBLOCK *x,
+                                             const int_mv *const cur_mv,
+                                             const BLOCK_SIZE bsize,
+                                             const PREDICTION_MODE this_mode,
+                                             int mi_row, int mi_col) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *compound_data = &mbmi->interinter_comp;
+
+  // Selector handed to the search as its last argument. Afterwards 0 updates
+  // mv[0], 1 updates mv[1] and 2 updates both.
+  int which;
+  switch (this_mode) {
+    case NEW_NEWMV: which = 2; break;
+    case NEW_NEARESTMV:
+    case NEW_NEARMV: which = 0; break;
+    case NEAREST_NEWMV:
+    case NEAR_NEWMV: which = 1; break;
+    default: return 0;
+  }
+
+  int_mv searched_mv[2];
+  int searched_rate = 0;
+  do_masked_motion_search_indexed(cpi, x, cur_mv, compound_data, bsize, mi_row,
+                                  mi_col, searched_mv, &searched_rate, which);
+
+  if (which != 1) mbmi->mv[0].as_int = searched_mv[0].as_int;
+  if (which != 0) mbmi->mv[1].as_int = searched_mv[1].as_int;
+  return searched_rate;
+}
+
+// Builds the two single-reference predictors used by the masked compound
+// modes into |preds0| / |preds1| and derives from their first buffers
+//   residual1 = source - pred1
+//   diff10    = pred1 - pred0
+// Both outputs are written with a stride equal to the block width.
+static void get_inter_predictors_masked_compound(
+    const AV1_COMP *const cpi, MACROBLOCK *x, const BLOCK_SIZE bsize,
+    int mi_row, int mi_col, uint8_t **preds0, uint8_t **preds1,
+    int16_t *residual1, int16_t *diff10, int *strides) {
+  const AV1_COMMON *cm = &cpi->common;
+  MACROBLOCKD *xd = &x->e_mbd;
+  const int width = block_size_wide[bsize];
+  const int height = block_size_high[bsize];
+  const int can_use_previous = cm->allow_warped_motion;
+
+  // Get the inter predictors to use for masked compound modes, one per ref.
+  uint8_t **const pred_bufs[2] = { preds0, preds1 };
+  for (int ref = 0; ref < 2; ++ref) {
+    av1_build_inter_predictors_for_planes_single_buf(
+        xd, bsize, 0, 0, mi_row, mi_col, ref, pred_bufs[ref], strides,
+        can_use_previous);
+  }
+
+  const struct buf_2d *const src = &x->plane[0].src;
+  if (!get_bitdepth_data_path_index(xd)) {
+    aom_subtract_block(height, width, residual1, width, src->buf, src->stride,
+                       *preds1, width);
+    aom_subtract_block(height, width, diff10, width, *preds1, width, *preds0,
+                       width);
+  } else {
+    aom_highbd_subtract_block(height, width, residual1, width, src->buf,
+                              src->stride, CONVERT_TO_BYTEPTR(*preds1), width,
+                              xd->bd);
+    aom_highbd_subtract_block(height, width, diff10, width,
+                              CONVERT_TO_BYTEPTR(*preds1), width,
+                              CONVERT_TO_BYTEPTR(*preds0), width, xd->bd);
+  }
+}
+
+// Selects the mask for the block's current masked compound type
+// (mbmi->interinter_comp.type), builds the resulting predictor and returns an
+// RD cost for it.
+//
+// Side effects visible to the caller:
+//  - *rs2 is incremented by get_interinter_compound_mask_rate(); this happens
+//    even when the function then returns early.
+//  - *calc_pred_masked_compound: when nonzero, preds0/preds1/residual1/diff10
+//    are recomputed and the flag is cleared; when zero the buffers passed in
+//    are used as they are.
+//  - *out_rate_mv is written only on the COMPOUND_WEDGE + NEWMV path.
+//  - mbmi->mv[] may be changed by the motion search on that same path.
+//
+// Returns INT64_MAX when the RD cost of (*rs2 + mode_rate) alone exceeds
+// ref_best_rd, or when estimate_yrd_for_sb() returns INT64_MAX.
+static int64_t build_and_cost_compound_type(
+ const AV1_COMP *const cpi, MACROBLOCK *x, const int_mv *const cur_mv,
+ const BLOCK_SIZE bsize, const PREDICTION_MODE this_mode, int *rs2,
+ int rate_mv, BUFFER_SET *ctx, int *out_rate_mv, uint8_t **preds0,
+ uint8_t **preds1, int16_t *residual1, int16_t *diff10, int *strides,
+ int mi_row, int mi_col, int mode_rate, int64_t ref_best_rd,
+ int *calc_pred_masked_compound) {
+ const AV1_COMMON *const cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int rate_sum;
+ int64_t dist_sum;
+ int64_t best_rd_cur = INT64_MAX;
+ int64_t rd = INT64_MAX;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ const COMPOUND_TYPE compound_type = mbmi->interinter_comp.type;
+
+ // Compute the single-reference predictors and residuals once; the cleared
+ // flag lets later calls reuse them.
+ if (*calc_pred_masked_compound) {
+ get_inter_predictors_masked_compound(cpi, x, bsize, mi_row, mi_col, preds0,
+ preds1, residual1, diff10, strides);
+ *calc_pred_masked_compound = 0;
+ }
+
+ // Modelled cost of the best mask, plus the RD cost of the mask and MV rate.
+ best_rd_cur =
+ pick_interinter_mask(cpi, x, bsize, *preds0, *preds1, residual1, diff10);
+ *rs2 += get_interinter_compound_mask_rate(x, mbmi);
+ best_rd_cur += RDCOST(x->rdmult, *rs2 + rate_mv, 0);
+
+ // Although the true rate_mv might be different after motion search, but it
+ // is unlikely to be the best mode considering the transform rd cost and other
+ // mode overhead cost
+ int64_t mode_rd = RDCOST(x->rdmult, *rs2 + mode_rate, 0);
+ if (mode_rd > ref_best_rd) return INT64_MAX;
+
+ if (have_newmv_in_inter_mode(this_mode) && compound_type == COMPOUND_WEDGE) {
+ // Refine the new MV(s) under the chosen wedge mask and model the result.
+ *out_rate_mv = interinter_compound_motion_search(cpi, x, cur_mv, bsize,
+ this_mode, mi_row, mi_col);
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ // The refined MVs did not beat the modelled cost of the unrefined ones:
+ // restore the input MVs and rate, and rebuild the predictor from the
+ // single-reference buffers.
+ if (rd >= best_rd_cur) {
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ *out_rate_mv = rate_mv;
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ }
+ // The modelled value is replaced by the estimate_yrd_for_sb() result.
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rs2 + *out_rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+
+ } else {
+ // No motion search: blend the cached predictors and estimate the cost with
+ // the incoming rate_mv.
+ av1_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0, preds0, strides,
+ preds1, strides);
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rs2 + rate_mv + rate_sum, dist_sum);
+ best_rd_cur = rd;
+ }
+ return best_rd_cur;
+}
+
+// Arguments and shared scratch state passed to the inter-mode handling
+// helpers in this file (for example handle_newmv() and
+// interpolation_filter_search()).
+typedef struct {
+ // OBMC secondary prediction buffers and respective strides
+ uint8_t *above_pred_buf[MAX_MB_PLANE];
+ int above_pred_stride[MAX_MB_PLANE];
+ uint8_t *left_pred_buf[MAX_MB_PLANE];
+ int left_pred_stride[MAX_MB_PLANE];
+ // Motion vectors found by the single-reference search. handle_newmv()
+ // accesses this as [ref_mv_idx][ref_frame].
+ int_mv (*single_newmv)[REF_FRAMES];
+ // Pointer to array of motion vectors to use for each ref and their rates
+ // Should point to first of 2 arrays in 2D array
+ // handle_newmv() stores the MV rate here with the same indexing as
+ // single_newmv.
+ int (*single_newmv_rate)[REF_FRAMES];
+ // Set to 1 by handle_newmv() for each [ref_mv_idx][ref_frame] entry it
+ // fills in.
+ int (*single_newmv_valid)[REF_FRAMES];
+ // Pointer to array of predicted rate-distortion
+ // Should point to first of 2 arrays in 2D array
+ // interpolation_filter_search() reads this as [mode][ref_mv_idx][ref_frame]
+ // and tolerates a NULL pointer.
+ int64_t (*modelled_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ InterpFilter single_filter[MB_MODE_COUNT][REF_FRAMES];
+ int ref_frame_cost;
+ int single_comp_cost;
+ // NOTE(review): has the same shape as modelled_rd; its use is not visible
+ // in this part of the file - confirm against the callers.
+ int64_t (*simple_rd)[MAX_REF_MV_SERCH][REF_FRAMES];
+ int skip_motion_mode;
+ INTERINTRA_MODE *inter_intra_mode;
+} HandleInterModeArgs;
+
+/* If the current mode shares the same mv with other modes with higher cost,
+ * skip this mode.
+ *
+ * Only single-reference NEARMV and GLOBALMV are candidates. When the mode
+ * they duplicate has already been searched (its modelled_rd entry is not
+ * INT64_MAX) and is cheaper to signal, that entry is copied to |this_mode|
+ * and 1 is returned. Returns 0 in every other case. */
+static int skip_repeated_mv(const AV1_COMMON *const cm,
+                            const MACROBLOCK *const x,
+                            PREDICTION_MODE this_mode,
+                            const MV_REFERENCE_FRAME ref_frames[2],
+                            InterModeSearchState *search_state) {
+  const int is_comp_pred = ref_frames[1] > INTRA_FRAME;
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frames);
+  const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int ref_mv_count = mbmi_ext->ref_mv_count[ref_frame_type];
+
+  if (is_comp_pred) return 0;
+
+  // Work out which mode, if any, yields the same motion vector.
+  PREDICTION_MODE duplicate_of = MB_MODE_COUNT;
+  switch (this_mode) {
+    case NEARMV:
+      if (ref_mv_count == 0) {
+        // NEARMV has the same motion vector as NEARESTMV
+        duplicate_of = NEARESTMV;
+      } else if (ref_mv_count == 1 &&
+                 cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // NEARMV has the same motion vector as GLOBALMV
+        duplicate_of = GLOBALMV;
+      }
+      break;
+    case GLOBALMV:
+      if (ref_mv_count == 0 &&
+          cm->global_motion[ref_frames[0]].wmtype <= TRANSLATION) {
+        // GLOBALMV has the same motion vector as NEARESTMV
+        duplicate_of = NEARESTMV;
+      } else if (ref_mv_count == 1) {
+        // GLOBALMV has the same motion vector as NEARMV
+        duplicate_of = NEARMV;
+      }
+      break;
+    default: break;
+  }
+  if (duplicate_of == MB_MODE_COUNT) return 0;
+
+  // Use modelled_rd to check whether the duplicated mode was searched.
+  const int64_t duplicate_rd =
+      search_state->modelled_rd[duplicate_of][0][ref_frames[0]];
+  if (duplicate_rd == INT64_MAX) return 0;
+
+  // Only skip if this mode costs more to signal than the duplicated mode.
+  const int16_t mode_ctx =
+      av1_mode_context_analyzer(mbmi_ext->mode_context, ref_frames);
+  const int duplicate_cost = cost_mv_ref(x, duplicate_of, mode_ctx);
+  const int this_cost = cost_mv_ref(x, this_mode, mode_ctx);
+  if (this_cost <= duplicate_cost) return 0;
+
+  search_state->modelled_rd[this_mode][0][ref_frames[0]] = duplicate_rd;
+  return 1;
+}
+
+// Copies |in_mv| to |*out_mv|, lowers its precision according to the frame's
+// allow_high_precision_mv / cur_frame_force_integer_mv settings and clamps it
+// with clamp_mv2(). Returns the logical negation of mv_check_bounds() for the
+// resulting vector against x->mv_limits.
+static INLINE int clamp_and_check_mv(int_mv *out_mv, int_mv in_mv,
+                                     const AV1_COMMON *cm,
+                                     const MACROBLOCK *x) {
+  MV *const mv = &out_mv->as_mv;
+
+  *out_mv = in_mv;
+  lower_mv_precision(mv, cm->allow_high_precision_mv,
+                     cm->cur_frame_force_integer_mv);
+  clamp_mv2(mv, &x->e_mbd);
+
+  const int bounds_check = mv_check_bounds(&x->mv_limits, mv);
+  return !bounds_check;
+}
+
+// Determines the motion vector(s) and the MV rate for a mode that contains a
+// NEWMV component.
+//
+// Single reference: runs single_motion_search(); the resulting x->best_mv, its
+// rate and a valid flag are cached in |args| under [ref_mv_idx][ref_frame],
+// and cur_mv[0] receives the vector. Returns INT64_MAX when the search leaves
+// x->best_mv equal to INVALID_MV.
+//
+// Compound: the NEWMV component(s) of |cur_mv| are seeded from
+// args->single_newmv. When cpi->sf.comp_inter_joint_search_thresh <= bsize a
+// joint / compound-single motion search refines them and sets *rate_mv;
+// otherwise *rate_mv is computed directly with av1_mv_bit_cost().
+//
+// Returns 0 on success. |args| is dereferenced on every path and must not be
+// NULL.
+static int64_t handle_newmv(const AV1_COMP *const cpi, MACROBLOCK *const x,
+                            const BLOCK_SIZE bsize, int_mv *cur_mv,
+                            const int mi_row, const int mi_col,
+                            int *const rate_mv,
+                            HandleInterModeArgs *const args) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  // A negative second reference is mapped to index 0 so that refs[1] is
+  // always non-negative.
+  const int refs[2] = { mbmi->ref_frame[0],
+                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
+  const int ref_mv_idx = mbmi->ref_mv_idx;
+
+  if (is_comp_pred) {
+    if (this_mode == NEW_NEWMV) {
+      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+
+      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+        joint_motion_search(cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, NULL,
+                            0, rate_mv, 0);
+      } else {
+        // No joint search: sum the bit cost of both seeded vectors.
+        *rate_mv = 0;
+        for (int i = 0; i < 2; ++i) {
+          const int_mv ref_mv = av1_get_ref_mv(x, i);
+          av1_set_mvcost(x, i, mbmi->ref_mv_idx);
+          *rate_mv +=
+              av1_mv_bit_cost(&cur_mv[i].as_mv, &ref_mv.as_mv, x->nmvjointcost,
+                              x->mvcost, MV_COST_WEIGHT);
+        }
+      }
+    } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+      // Only the second vector is new.
+      cur_mv[1].as_int = args->single_newmv[ref_mv_idx][refs[1]].as_int;
+      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+        compound_single_motion_search_interinter(
+            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 1);
+      } else {
+        av1_set_mvcost(x, 1,
+                       mbmi->ref_mv_idx + (this_mode == NEAR_NEWMV ? 1 : 0));
+        const int_mv ref_mv = av1_get_ref_mv(x, 1);
+        *rate_mv = av1_mv_bit_cost(&cur_mv[1].as_mv, &ref_mv.as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      }
+    } else {
+      // Only the first vector is new.
+      assert(this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV);
+      cur_mv[0].as_int = args->single_newmv[ref_mv_idx][refs[0]].as_int;
+      if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+        compound_single_motion_search_interinter(
+            cpi, x, bsize, cur_mv, mi_row, mi_col, NULL, 0, rate_mv, 0, 0);
+      } else {
+        const int_mv ref_mv = av1_get_ref_mv(x, 0);
+        av1_set_mvcost(x, 0,
+                       mbmi->ref_mv_idx + (this_mode == NEW_NEARMV ? 1 : 0));
+        *rate_mv = av1_mv_bit_cost(&cur_mv[0].as_mv, &ref_mv.as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      }
+    }
+  } else {
+    single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, rate_mv);
+    if (x->best_mv.as_int == INVALID_MV) return INT64_MAX;
+
+    // Cache the result so the compound modes above can reuse it.
+    args->single_newmv[ref_mv_idx][refs[0]] = x->best_mv;
+    args->single_newmv_rate[ref_mv_idx][refs[0]] = *rate_mv;
+    args->single_newmv_valid[ref_mv_idx][refs[0]] = 1;
+
+    cur_mv[0].as_int = x->best_mv.as_int;
+
+#if USE_DISCOUNT_NEWMV_TEST
+    // Estimate the rate implications of a new mv but discount this
+    // under certain circumstances where we want to help initiate a weak
+    // motion field, where the distortion gain for a single block may not
+    // be enough to overcome the cost of a new mv.
+    if (discount_newmv_test(cpi, x, this_mode, x->best_mv)) {
+      *rate_mv = AOMMAX(*rate_mv / NEW_MV_DISCOUNT_FACTOR, 1);
+    }
+#endif
+  }
+
+  return 0;
+}
+
+// Exchanges the two entries of |dst_bufs| and then calls restore_dst_buf() on
+// |xd| with the buffer set that is now in slot 0.
+static INLINE void swap_dst_buf(MACROBLOCKD *xd, const BUFFER_SET *dst_bufs[2],
+                                int num_planes) {
+  const BUFFER_SET *const new_front = dst_bufs[1];
+  dst_bufs[1] = dst_bufs[0];
+  dst_bufs[0] = new_front;
+  restore_dst_buf(xd, *dst_bufs[0], num_planes);
+}
+
+// Returns the cost of signalling the filter pair |filters|: the two
+// per-direction entries of x->switchable_interp_costs, looked up with the
+// matching context from |ctx|, are summed and scaled by
+// SWITCHABLE_INTERP_RATE_FACTOR.
+static INLINE int get_switchable_rate(MACROBLOCK *const x,
+                                      const InterpFilters filters,
+                                      const int ctx[2]) {
+  const InterpFilter dir0_filter = av1_extract_interp_filter(filters, 0);
+  const InterpFilter dir1_filter = av1_extract_interp_filter(filters, 1);
+  int total_cost = x->switchable_interp_costs[ctx[0]][dir0_filter];
+  total_cost += x->switchable_interp_costs[ctx[1]][dir1_filter];
+  return SWITCHABLE_INTERP_RATE_FACTOR * total_cost;
+}
+
+// Calculates the modelled RD cost of the interpolation filter pair
+// filter_sets[filter_idx] and keeps it if it beats the current best.
+//
+// rate, dist, skip_txfm_sb and skip_sse_sb are two-element arrays that hold
+// the data of the best filter so far: index 0 is read and written as luma
+// data, index 1 as the total over all planes.
+//
+// skip_pred selects how much motion compensation is redone:
+//  - equal to cpi->default_interp_skip_flags: nothing is rebuilt and
+//    rate[1] / dist[1] are reused;
+//  - equal to DEFAULT_LUMA_INTERP_SKIP_FLAG: luma is reused (rate[0] /
+//    dist[0]) and only chroma is rebuilt and modelled;
+//  - otherwise: luma is rebuilt and modelled, then chroma.
+//
+// Returns 1 when the filter lowers *rd. In that case *rd, *switchable_rate and
+// the stat arrays are updated, mbmi->interp_filters keeps the new filter and
+// the destination buffers are swapped if anything was rebuilt. Returns 0
+// otherwise, with mbmi->interp_filters restored to its previous value.
+static INLINE int64_t interpolation_filter_rd(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+ int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2], int filter_idx,
+ const int switchable_ctx[2], const int skip_pred, int *rate,
+ int64_t *dist) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int tmp_rate[2], tmp_skip_sb[2] = { 1, 1 };
+ int64_t tmp_dist[2], tmp_skip_sse[2] = { 0, 0 };
+
+ // Install the candidate filter; last_best is put back on every path that
+ // rejects it.
+ const InterpFilters last_best = mbmi->interp_filters;
+ mbmi->interp_filters = filter_sets[filter_idx];
+ const int tmp_rs =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+
+ assert(skip_pred != 2);
+ assert((skip_pred >= 0) && (skip_pred <= cpi->default_interp_skip_flags));
+ assert(rate[0] >= 0);
+ assert(dist[0] >= 0);
+ assert((skip_txfm_sb[0] == 0) || (skip_txfm_sb[0] == 1));
+ assert(skip_sse_sb[0] >= 0);
+ assert(rate[1] >= 0);
+ assert(dist[1] >= 0);
+ assert((skip_txfm_sb[1] == 0) || (skip_txfm_sb[1] == 1));
+ assert(skip_sse_sb[1] >= 0);
+
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred != DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ // Luma MC is not skipped: rebuild the luma predictor and model it.
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &tmp_skip_sb[0], &tmp_skip_sse[0], NULL, NULL, NULL);
+ tmp_rate[1] = tmp_rate[0];
+ tmp_dist[1] = tmp_dist[0];
+ } else {
+ // only luma MC is skipped
+ tmp_rate[1] = rate[0];
+ tmp_dist[1] = dist[0];
+ }
+ if (num_planes > 1) {
+ for (int plane = 1; plane < num_planes; ++plane) {
+ int tmp_rate_uv, tmp_skip_sb_uv;
+ int64_t tmp_dist_uv, tmp_skip_sse_uv;
+ // Early exit: the cost accumulated so far already fails to beat the
+ // best, so the remaining planes are not evaluated.
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+ if (tmp_rd >= *rd) {
+ mbmi->interp_filters = last_best;
+ return 0;
+ }
+ av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, orig_dst, bsize,
+ plane);
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, plane, plane, mi_row, mi_col, &tmp_rate_uv,
+ &tmp_dist_uv, &tmp_skip_sb_uv, &tmp_skip_sse_uv, NULL, NULL, NULL);
+ // The rate sum is formed in 64 bits and capped at INT_MAX.
+ tmp_rate[1] =
+ (int)AOMMIN(((int64_t)tmp_rate[1] + (int64_t)tmp_rate_uv), INT_MAX);
+ tmp_dist[1] += tmp_dist_uv;
+ tmp_skip_sb[1] &= tmp_skip_sb_uv;
+ tmp_skip_sse[1] += tmp_skip_sse_uv;
+ }
+ }
+ } else {
+ // both luma and chroma MC is skipped
+ tmp_rate[1] = rate[1];
+ tmp_dist[1] = dist[1];
+ }
+ int64_t tmp_rd = RDCOST(x->rdmult, tmp_rs + tmp_rate[1], tmp_dist[1]);
+
+ if (tmp_rd < *rd) {
+ *rd = tmp_rd;
+ *switchable_rate = tmp_rs;
+ if (skip_pred != cpi->default_interp_skip_flags) {
+ if (skip_pred == 0) {
+ // Overwrite the data as current filter is the best one
+ tmp_skip_sb[1] = tmp_skip_sb[0] & tmp_skip_sb[1];
+ tmp_skip_sse[1] = tmp_skip_sse[0] + tmp_skip_sse[1];
+ memcpy(rate, tmp_rate, sizeof(*rate) * 2);
+ memcpy(dist, tmp_dist, sizeof(*dist) * 2);
+ memcpy(skip_txfm_sb, tmp_skip_sb, sizeof(*skip_txfm_sb) * 2);
+ memcpy(skip_sse_sb, tmp_skip_sse, sizeof(*skip_sse_sb) * 2);
+ // As luma MC data is computed, no need to recompute after the search
+ x->recalc_luma_mc_data = 0;
+ } else if (skip_pred == DEFAULT_LUMA_INTERP_SKIP_FLAG) {
+ // As luma MC data is not computed, update of luma data can be skipped
+ rate[1] = tmp_rate[1];
+ dist[1] = tmp_dist[1];
+ skip_txfm_sb[1] = skip_txfm_sb[0] & tmp_skip_sb[1];
+ skip_sse_sb[1] = skip_sse_sb[0] + tmp_skip_sse[1];
+ // As luma MC data is not recomputed and current filter is the best,
+ // indicate the possibility of recomputing MC data
+ // If current buffer contains valid MC data, toggle to indicate that
+ // luma MC data needs to be recomputed
+ x->recalc_luma_mc_data ^= 1;
+ }
+ // Slot 0 of dst_bufs is swapped with slot 1 and restored on xd.
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ }
+ return 1;
+ }
+ mbmi->interp_filters = last_best;
+ return 0;
+}
+
+// Find the best rd filter in horizontal direction.
+// Evaluates filter_sets[] entries 1 .. SWITCHABLE_FILTERS - 1 through
+// interpolation_filter_rd() and returns the index of the last entry that
+// lowered *rd, or the incoming |best_dual_mode| (asserted to be 0) when none
+// did.
+static INLINE int find_best_horiz_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+    const int switchable_ctx[2], const int skip_hor, int *rate, int64_t *dist,
+    int best_dual_mode) {
+  const int block_width = block_size_wide[bsize];
+  assert(best_dual_mode == 0);
+
+  const int use_reverse_order =
+      (block_width <= 4) && (skip_hor != cpi->default_interp_skip_flags);
+
+  if (!use_reverse_order) {
+    for (int idx = 1; idx < SWITCHABLE_FILTERS; ++idx) {
+      const int improved = interpolation_filter_rd(
+          x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+          skip_txfm_sb, skip_sse_sb, dst_bufs, idx, switchable_ctx, skip_hor,
+          rate, dist);
+      if (improved) best_dual_mode = idx;
+    }
+    return best_dual_mode;
+  }
+
+  // Process the filters in reverse order to enable reusing rate and
+  // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP.
+  // Only the first candidate runs with the default skip flags; the rest use
+  // |skip_hor|.
+  int skip_pred = cpi->default_interp_skip_flags;
+  for (int idx = SWITCHABLE_FILTERS - 1; idx >= 1; --idx) {
+    const int improved = interpolation_filter_rd(
+        x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+        skip_txfm_sb, skip_sse_sb, dst_bufs, idx, switchable_ctx, skip_pred,
+        rate, dist);
+    if (improved) best_dual_mode = idx;
+    skip_pred = skip_hor;
+  }
+  return best_dual_mode;
+}
+
+// Find the best rd filter in vertical direction.
+// Starting from |best_dual_mode|, evaluates every filter_sets[] entry that
+// lies a multiple of SWITCHABLE_FILTERS above it (and below |filter_set_size|)
+// through interpolation_filter_rd(), which updates *rd and the stat arrays
+// whenever a candidate is better.
+static INLINE void find_best_vert_interp_filter_rd(
+    MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+    int mi_row, int mi_col, BUFFER_SET *const orig_dst, int64_t *const rd,
+    int *const switchable_rate, int *const skip_txfm_sb,
+    int64_t *const skip_sse_sb, const BUFFER_SET *dst_bufs[2],
+    const int switchable_ctx[2], const int skip_ver, int *rate, int64_t *dist,
+    int best_dual_mode, int filter_set_size) {
+  const int block_height = block_size_high[bsize];
+  const int first_idx = best_dual_mode + SWITCHABLE_FILTERS;
+  const int use_reverse_order =
+      (block_height <= 4) && (skip_ver != cpi->default_interp_skip_flags);
+
+  if (!use_reverse_order) {
+    for (int idx = first_idx; idx < filter_set_size;
+         idx += SWITCHABLE_FILTERS) {
+      interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                              switchable_rate, skip_txfm_sb, skip_sse_sb,
+                              dst_bufs, idx, switchable_ctx, skip_ver, rate,
+                              dist);
+    }
+    return;
+  }
+
+  // Process the filters in reverse order to enable reusing rate and
+  // distortion (calculated during EIGHTTAP_REGULAR) for MULTITAP_SHARP.
+  // Only the first candidate runs with the default skip flags; the rest use
+  // |skip_ver|.
+  int skip_pred = cpi->default_interp_skip_flags;
+  assert(filter_set_size == DUAL_FILTER_SET_SIZE);
+  for (int idx = filter_set_size - SWITCHABLE_FILTERS + best_dual_mode;
+       idx >= first_idx; idx -= SWITCHABLE_FILTERS) {
+    interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+                            switchable_rate, skip_txfm_sb, skip_sse_sb,
+                            dst_bufs, idx, switchable_ctx, skip_pred, rate,
+                            dist);
+    skip_pred = skip_ver;
+  }
+}
+
+// Returns 1 when the saved search result |st| was produced for the same
+// reference frames and motion vectors as |mi| and, for compound blocks, the
+// same inter-inter compound type. Returns 0 otherwise.
+static INLINE int is_interp_filter_match(const INTERPOLATION_FILTER_STATS *st,
+                                         MB_MODE_INFO *const mi) {
+  int ref = 0;
+  while (ref < 2) {
+    const int same_ref = (st->ref_frames[ref] == mi->ref_frame[ref]);
+    const int same_mv = (st->mv[ref].as_int == mi->mv[ref].as_int);
+    if (!(same_ref && same_mv)) return 0;
+    ++ref;
+  }
+
+  if (!has_second_ref(mi)) return 1;
+  return st->comp_type == mi->interinter_comp.type;
+}
+
+// Looks through the results saved for mbmi->compound_idx for one that matches
+// |mbmi| (see is_interp_filter_match()). On the first match its filters are
+// copied into mbmi->interp_filters and its position is returned; -1 means no
+// saved result matches.
+static INLINE int find_interp_filter_in_stats(MACROBLOCK *x,
+                                              MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int num_saved = x->interp_filter_stats_idx[comp_idx];
+  const INTERPOLATION_FILTER_STATS *const saved =
+      x->interp_filter_stats[comp_idx];
+
+  for (int pos = 0; pos < num_saved; ++pos) {
+    if (!is_interp_filter_match(&saved[pos], mbmi)) continue;
+    mbmi->interp_filters = saved[pos].filters;
+    return pos;
+  }
+  return -1;  // no match result found
+}
+
+// Appends the block's current filters, motion vectors, reference frames and
+// inter-inter compound type to the saved results for mbmi->compound_idx.
+// Nothing is stored once MAX_INTERP_FILTER_STATS entries are in use.
+static INLINE void save_interp_filter_search_stat(MACROBLOCK *x,
+                                                  MB_MODE_INFO *const mbmi) {
+  const int comp_idx = mbmi->compound_idx;
+  const int next_slot = x->interp_filter_stats_idx[comp_idx];
+  if (next_slot >= MAX_INTERP_FILTER_STATS) return;
+
+  const INTERPOLATION_FILTER_STATS entry = { mbmi->interp_filters,
+                                             { mbmi->mv[0], mbmi->mv[1] },
+                                             { mbmi->ref_frame[0],
+                                               mbmi->ref_frame[1] },
+                                             mbmi->interinter_comp.type };
+  x->interp_filter_stats[comp_idx][next_slot] = entry;
+  x->interp_filter_stats_idx[comp_idx]++;
+}
+
+static int64_t interpolation_filter_search(
+ MACROBLOCK *const x, const AV1_COMP *const cpi, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, const BUFFER_SET *const tmp_dst,
+ BUFFER_SET *const orig_dst, InterpFilter (*const single_filter)[REF_FRAMES],
+ int64_t *const rd, int *const switchable_rate, int *const skip_txfm_sb,
+ int64_t *const skip_sse_sb, const int skip_build_pred,
+ HandleInterModeArgs *args, int64_t ref_best_rd) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int need_search =
+ av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+ int i;
+ // Index 0 corresponds to luma rd data and index 1 corresponds to cumulative
+ // data of all planes
+ int tmp_rate[2] = { 0, 0 };
+ int64_t tmp_dist[2] = { 0, 0 };
+ int best_skip_txfm_sb[2] = { 1, 1 };
+ int64_t best_skip_sse_sb[2] = { 0, 0 };
+ const int ref_frame = xd->mi[0]->ref_frame[0];
+
+ (void)single_filter;
+ int match_found = -1;
+ const InterpFilter assign_filter = cm->interp_filter;
+ if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+ match_found = find_interp_filter_in_stats(x, mbmi);
+ }
+ if (!need_search || match_found == -1) {
+ set_default_interp_filters(mbmi, assign_filter);
+ }
+ int switchable_ctx[2];
+ switchable_ctx[0] = av1_get_pred_context_switchable_interp(xd, 0);
+ switchable_ctx[1] = av1_get_pred_context_switchable_interp(xd, 1);
+ *switchable_rate =
+ get_switchable_rate(x, mbmi->interp_filters, switchable_ctx);
+ if (!skip_build_pred)
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+#if CONFIG_COLLECT_RD_STATS == 3
+ RD_STATS rd_stats_y;
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col, INT64_MAX);
+ PrintPredictionUnitStats(cpi, x, &rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 3
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &tmp_rate[0], &tmp_dist[0],
+ &best_skip_txfm_sb[0], &best_skip_sse_sb[0], NULL, NULL, NULL);
+ if (num_planes > 1)
+ model_rd_sb_fn[MODELRD_TYPE_INTERP_FILTER](
+ cpi, bsize, x, xd, 1, num_planes - 1, mi_row, mi_col, &tmp_rate[1],
+ &tmp_dist[1], &best_skip_txfm_sb[1], &best_skip_sse_sb[1], NULL, NULL,
+ NULL);
+ tmp_rate[1] =
+ (int)AOMMIN((int64_t)tmp_rate[0] + (int64_t)tmp_rate[1], INT_MAX);
+ assert(tmp_rate[1] >= 0);
+ tmp_dist[1] = tmp_dist[0] + tmp_dist[1];
+ best_skip_txfm_sb[1] = best_skip_txfm_sb[0] & best_skip_txfm_sb[1];
+ best_skip_sse_sb[1] = best_skip_sse_sb[0] + best_skip_sse_sb[1];
+ *rd = RDCOST(x->rdmult, (*switchable_rate + tmp_rate[1]), tmp_dist[1]);
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
+ if (assign_filter != SWITCHABLE || match_found != -1) {
+ return 0;
+ }
+ if (!need_search) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(EIGHTTAP_REGULAR));
+ return 0;
+ }
+ if (args->modelled_rd != NULL) {
+ if (has_second_ref(mbmi)) {
+ const int ref_mv_idx = mbmi->ref_mv_idx;
+ int refs[2] = { mbmi->ref_frame[0],
+ (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+ const int mode0 = compound_ref0_mode(mbmi->mode);
+ const int mode1 = compound_ref1_mode(mbmi->mode);
+ const int64_t mrd = AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+ args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+ if ((*rd >> 1) > mrd && ref_best_rd < INT64_MAX) {
+ return INT64_MAX;
+ }
+ }
+ }
+
+ x->recalc_luma_mc_data = 0;
+ // skip_flag=xx (in binary form)
+ // Setting the 0th bit corresponds to skipping luma MC and setting the 1st
+ // bit corresponds to skipping chroma MC.
+ // skip_flag=0 corresponds to "Don't skip luma and chroma MC"
+ // skip_flag=1 corresponds to "Skip luma MC only"
+ // skip_flag=2 is not a valid case
+ // skip_flag=3 corresponds to "Skip both luma and chroma MC"
+ int skip_hor = cpi->default_interp_skip_flags;
+ int skip_ver = cpi->default_interp_skip_flags;
+ const int is_compound = has_second_ref(mbmi);
+ assert(is_intrabc_block(mbmi) == 0);
+ for (int j = 0; j < 1 + is_compound; ++j) {
+ const RefBuffer *ref_buf = &cm->frame_refs[mbmi->ref_frame[j] - LAST_FRAME];
+ const struct scale_factors *const sf = &ref_buf->sf;
+ // TODO(any): Refine skip flag calculation considering scaling
+ if (av1_is_scaled(sf)) {
+ skip_hor = 0;
+ skip_ver = 0;
+ break;
+ }
+ const MV mv = mbmi->mv[j].as_mv;
+ int skip_hor_plane = 0;
+ int skip_ver_plane = 0;
+ for (int k = 0; k < AOMMAX(1, (num_planes - 1)); ++k) {
+ struct macroblockd_plane *const pd = &xd->plane[k];
+ const int bw = pd->width;
+ const int bh = pd->height;
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ const int sub_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ const int sub_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ skip_hor_plane |= ((sub_x == 0) << k);
+ skip_ver_plane |= ((sub_y == 0) << k);
+ }
+ skip_hor = skip_hor & skip_hor_plane;
+ skip_ver = skip_ver & skip_ver_plane;
+ // It is not valid that "luma MV is sub-pel, whereas chroma MV is not"
+ assert(skip_hor != 2);
+ assert(skip_ver != 2);
+ }
+ // When the compound prediction type is compound segment wedge, luma MC and
+ // chroma MC need to go hand in hand, as the mask generated during luma MC is
+ // required for chroma MC. If skip_hor = 0 and skip_ver = 1, the mask used for
+ // chroma MC during the vertical filter decision may be incorrect, as the
+ // temporary MC evaluation overwrites the mask. Set skip_ver to 0 for this
+ // case so that the mask is populated during luma MC.
+ if (is_compound && mbmi->compound_idx == 1 &&
+ mbmi->interinter_comp.type == COMPOUND_DIFFWTD) {
+ assert(mbmi->comp_group_idx == 1);
+ if (skip_hor == 0 && skip_ver == 1) skip_ver = 0;
+ }
+ // do interp_filter search
+ const int filter_set_size = DUAL_FILTER_SET_SIZE;
+ restore_dst_buf(xd, *tmp_dst, num_planes);
+ const BUFFER_SET *dst_bufs[2] = { tmp_dst, orig_dst };
+ if (cpi->sf.use_fast_interpolation_filter_search &&
+ cm->seq_params.enable_dual_filter) {
+ // default to (R,R): EIGHTTAP_REGULARxEIGHTTAP_REGULAR
+ int best_dual_mode = 0;
+ // Find best of {R}x{R,Sm,Sh}
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ best_dual_mode = find_best_horiz_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_hor,
+ tmp_rate, tmp_dist, best_dual_mode);
+
+ // From best of horizontal EIGHTTAP_REGULAR modes, check vertical modes
+ find_best_vert_interp_filter_rd(
+ x, cpi, bsize, mi_row, mi_col, orig_dst, rd, switchable_rate,
+ best_skip_txfm_sb, best_skip_sse_sb, dst_bufs, switchable_ctx, skip_ver,
+ tmp_rate, tmp_dist, best_dual_mode, filter_set_size);
+ } else {
+ // EIGHTTAP_REGULAR mode is calculated beforehand
+ for (i = 1; i < filter_set_size; ++i) {
+ if (cm->seq_params.enable_dual_filter == 0) {
+ const int16_t filter_y = filter_sets[i] & 0xffff;
+ const int16_t filter_x = filter_sets[i] >> 16;
+ if (filter_x != filter_y) continue;
+ }
+ interpolation_filter_rd(x, cpi, bsize, mi_row, mi_col, orig_dst, rd,
+ switchable_rate, best_skip_txfm_sb,
+ best_skip_sse_sb, dst_bufs, i, switchable_ctx, 0,
+ tmp_rate, tmp_dist);
+ assert(x->recalc_luma_mc_data == 0);
+ }
+ }
+ swap_dst_buf(xd, dst_bufs, num_planes);
+ // Recompute final MC data if required
+ if (x->recalc_luma_mc_data == 1) {
+ // Recomputing final luma MC data is required only if the same was skipped
+ // in either of the directions. The condition below is necessary, but not
+ // sufficient.
+ assert((skip_hor == 1) || (skip_ver == 1));
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ *skip_txfm_sb = best_skip_txfm_sb[1];
+ *skip_sse_sb = best_skip_sse_sb[1];
+ x->pred_sse[ref_frame] = (unsigned int)(best_skip_sse_sb[0] >> 4);
+
+ // save search results
+ if (cpi->sf.skip_repeat_interpolation_filter_search) {
+ assert(match_found == -1);
+ save_interp_filter_search_stat(x, mbmi);
+ }
+ return 0;
+}
+
+static int txfm_search(const AV1_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int mode_rate, int64_t ref_best_rd) {
+ /*
+ * This function combines the y and uv planes' transform search processes
+ * once the prediction has been generated. It first does subtraction to
+ * obtain the prediction error. Then it calls
+ * select_tx_type_yrd/super_block_yrd and inter_block_uvrd sequentially and
+ * handles the early terminations that happen in those functions. At the end,
+ * it computes the rd_stats/_y/_uv accordingly.
+ *
+ * mode_rate seeds rd_stats->rate, and ref_best_rd is the rd cost that every
+ * pruning check below compares against.
+ *
+ * Returns 0 on every early exit and 1 once rd_stats has been filled in. All
+ * early exits after the initial header-cost check write the entry value of
+ * mbmi->ref_frame[1] back before returning.
+ */
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ int skip_txfm_sb = 0;
+ const int num_planes = av1_num_planes(cm);
+ const int ref_frame_1 = mbmi->ref_frame[1];  // entry value, written back on early exits
+ const int64_t mode_rd = RDCOST(x->rdmult, mode_rate, 0);
+ const int64_t rd_thresh =
+ ref_best_rd == INT64_MAX ? INT64_MAX : ref_best_rd - mode_rd;  // rd budget left after the mode-only cost
+ const int skip_ctx = av1_get_skip_context(xd);
+ const int64_t min_header_rate =
+ mode_rate + AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ // Account for minimum skip and non_skip rd.
+ // Eventually either one of them will be added to mode_rate
+ const int64_t min_header_rd_possible = RDCOST(x->rdmult, min_header_rate, 0);
+
+ if (min_header_rd_possible > ref_best_rd) {  // the header cost alone already exceeds ref_best_rd
+ av1_invalid_rd_stats(rd_stats_y);
+ av1_invalid_rd_stats(rd_stats);
+ return 0;
+ }
+
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_y);
+ av1_init_rd_stats(rd_stats_uv);
+ rd_stats->rate = mode_rate;
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+ if (!skip_txfm_sb) {
+ int64_t non_skip_rdcosty = INT64_MAX;
+ int64_t skip_rdcosty = INT64_MAX;
+ int64_t min_rdcosty = INT64_MAX;
+ int is_cost_valid_uv = 0;
+
+ // cost and distortion
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Motion mode
+ select_tx_type_yrd(cpi, x, rd_stats_y, bsize, mi_row, mi_col, rd_thresh);
+#if CONFIG_COLLECT_RD_STATS == 2
+ PrintPredictionUnitStats(cpi, x, rd_stats_y, bsize);
+#endif // CONFIG_COLLECT_RD_STATS == 2
+ } else {
+ super_block_yrd(cpi, x, rd_stats_y, bsize, rd_thresh);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));  // every entry gets mbmi->tx_size
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y->skip);
+ }
+
+ if (rd_stats_y->rate == INT_MAX) {  // presumably marks an invalid luma result -- confirm
+ av1_invalid_rd_stats(rd_stats);
+ // TODO(angiebird): check if we need this
+ // restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ av1_merge_rd_stats(rd_stats, rd_stats_y);
+
+ non_skip_rdcosty = RDCOST(
+ x->rdmult, rd_stats->rate + x->skip_cost[skip_ctx][0], rd_stats->dist);  // coefficients coded: uses dist
+ skip_rdcosty =
+ RDCOST(x->rdmult, mode_rate + x->skip_cost[skip_ctx][1], rd_stats->sse);  // skip signalled: uses sse
+ min_rdcosty = AOMMIN(non_skip_rdcosty, skip_rdcosty);
+
+ if (min_rdcosty > ref_best_rd) {
+ int64_t tokenonly_rdy =
+ AOMMIN(RDCOST(x->rdmult, rd_stats_y->rate, rd_stats_y->dist),
+ RDCOST(x->rdmult, 0, rd_stats_y->sse));
+ // Invalidate rd_stats_y to skip the rest of the motion modes search
+ if (tokenonly_rdy - (tokenonly_rdy >> cpi->sf.adaptive_txb_search_level) >
+ rd_thresh)
+ av1_invalid_rd_stats(rd_stats_y);
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+
+ if (num_planes > 1) {
+ /* clang-format off */
+ is_cost_valid_uv =
+ inter_block_uvrd(cpi, x, rd_stats_uv, bsize,
+ ref_best_rd - non_skip_rdcosty,
+ ref_best_rd - skip_rdcosty, FTXS_NONE);
+ if (!is_cost_valid_uv) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ /* clang-format on */
+ av1_merge_rd_stats(rd_stats, rd_stats_uv);
+ } else {
+ av1_init_rd_stats(rd_stats_uv);
+ }
+ if (rd_stats->skip) {  // merged stats report skip: drop coefficient rates, charge the skip flag
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ mbmi->skip = 0;
+ // here mbmi->skip temporarily plays a role as what this_skip2 does
+
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ } else if (!xd->lossless[mbmi->segment_id] &&
+ (RDCOST(x->rdmult,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][0],
+ rd_stats->dist) >=
+ RDCOST(x->rdmult, x->skip_cost[skip_ctx][1], rd_stats->sse))) {  // skipping costs no more than coding
+ rd_stats->rate -= rd_stats_uv->rate + rd_stats_y->rate;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ rd_stats->dist = rd_stats->sse;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ mbmi->skip = 1;
+ } else {
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ mbmi->skip = 0;
+ }
+ } else {  // skip_txfm_sb was set by check_block_skip()
+ x->skip = 1;
+ mbmi->tx_size = tx_size_from_tx_mode(bsize, cm->tx_mode);
+ // The cost of skip bit needs to be added.
+ mbmi->skip = 0;
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ rd_stats->skip = 1;
+ int64_t tmprd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (tmprd > ref_best_rd) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ return 0;
+ }
+ }
+ return 1;
+}
+
+static int handle_inter_intra_mode(const AV1_COMP *const cpi,
+ MACROBLOCK *const x, BLOCK_SIZE bsize,
+ int mi_row, int mi_col, MB_MODE_INFO *mbmi,
+ HandleInterModeArgs *args,
+ int64_t ref_best_rd, int *rate_mv,
+ int *tmp_rate2, BUFFER_SET *orig_dst) { /*
+ * Evaluates inter-intra prediction for the luma plane of the current block.
+ *
+ * Steps:
+ * 1. Build the single-reference luma inter predictor into tmp_buf.
+ * 2. Pick an inter-intra mode: either search all INTERINTRA_MODES with the
+ * model-based rd, or reuse the mode stored in args->inter_intra_mode[].
+ * 3. Estimate the rd of the chosen mode; return -1 when ref_best_rd is not
+ * INT64_MAX and half of that estimate already exceeds it.
+ * 4. When wedge inter-intra is available for bsize and the source variance
+ * is above the speed-feature threshold, evaluate the wedge variant
+ * (refining the MV for new-MV modes) and keep whichever estimate is lower.
+ * 5. Build the chroma inter predictors when there is more than one plane.
+ *
+ * mbmi->ref_frame[1] is left as INTRA_FRAME on both return paths. When the
+ * wedge variant wins, mbmi->mv[0], *rate_mv and *tmp_rate2 are updated from
+ * the wedge search result.
+ *
+ * Returns 0 on success, -1 when the mode is pruned in step 3.
+ */
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+ int64_t rd, best_interintra_rd = INT64_MAX;
+ int rmode, rate_sum;
+ int64_t dist_sum;
+ int tmp_rate_mv = 0;
+ int tmp_skip_txfm_sb;
+ int bw = block_size_wide[bsize];
+ int64_t tmp_skip_sse_sb;
+ DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_INTERINTRA_SB_SQUARE]);
+ uint8_t *tmp_buf = get_buf_by_bd(xd, tmp_buf_);
+ uint8_t *intrapred = get_buf_by_bd(xd, intrapred_);
+ const int *const interintra_mode_cost =
+ x->interintra_mode_cost[size_group_lookup[bsize]];
+ const int_mv mv0 = mbmi->mv[0];  // MV on entry, restored if the wedge variant loses
+ const int is_wedge_used = is_interintra_wedge_used(bsize);
+ int rwedge = is_wedge_used ? x->wedge_interintra_cost[bsize][0] : 0;
+ mbmi->ref_frame[1] = NONE_FRAME;  // build a single-reference predictor first
+ xd->plane[0].dst.buf = tmp_buf;  // redirect the luma output to the scratch buffer
+ xd->plane[0].dst.stride = bw;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ mbmi->ref_frame[1] = INTRA_FRAME;
+ mbmi->use_wedge_interintra = 0;
+ best_interintra_mode = args->inter_intra_mode[mbmi->ref_frame[0]];
+ int j = 0;
+ if (cpi->sf.reuse_inter_intra_mode == 0 ||
+ best_interintra_mode == INTERINTRA_MODES) {  // full search; INTERINTRA_MODES presumably means "nothing stored yet"
+ for (j = 0; j < INTERINTRA_MODES; ++j) {
+ mbmi->interintra_mode = (INTERINTRA_MODE)j;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ model_rd_sb_fn[MODELRD_TYPE_INTERINTRA](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rate_sum + rmode, dist_sum);
+ if (rd < best_interintra_rd) {
+ best_interintra_rd = rd;
+ best_interintra_mode = mbmi->interintra_mode;
+ }
+ }
+ args->inter_intra_mode[mbmi->ref_frame[0]] = best_interintra_mode;
+ }
+ if (j == 0 || best_interintra_mode != II_SMOOTH_PRED) {  // j == 0: the search loop above did not run
+ mbmi->interintra_mode = best_interintra_mode;
+ rmode = interintra_mode_cost[mbmi->interintra_mode];
+ av1_build_intra_predictors_for_interintra(cm, xd, bsize, 0, orig_dst,
+ intrapred, bw);
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, *rate_mv + rmode + rate_sum + rwedge, dist_sum);
+ best_interintra_rd = rd;
+ if (ref_best_rd < INT64_MAX && (best_interintra_rd >> 1) > ref_best_rd) {  // prune this mode
+ return -1;
+ }
+ if (is_wedge_used) {
+ int64_t best_interintra_rd_nowedge = rd;
+ int64_t best_interintra_rd_wedge = INT64_MAX;
+ int_mv tmp_mv;
+ // Disable wedge search if source variance is small
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+ mbmi->use_wedge_interintra = 1;
+
+ rwedge = av1_cost_literal(get_interintra_wedge_bits(bsize)) +
+ x->wedge_interintra_cost[bsize][1];
+
+ best_interintra_rd_wedge =
+ pick_interintra_wedge(cpi, x, bsize, intrapred_, tmp_buf_);
+
+ best_interintra_rd_wedge +=
+ RDCOST(x->rdmult, rmode + *rate_mv + rwedge, 0);
+ rd = INT64_MAX;
+ // Refine motion vector.
+ if (have_newmv_in_inter_mode(mbmi->mode)) {
+ // get negative of mask
+ const uint8_t *mask = av1_get_contiguous_soft_mask(
+ mbmi->interintra_wedge_index, 1, bsize);
+ tmp_mv = mbmi->mv[0];
+ compound_single_motion_search(cpi, x, bsize, &tmp_mv.as_mv, mi_row,
+ mi_col, intrapred, mask, bw, &tmp_rate_mv,
+ 0);
+ if (mbmi->mv[0].as_int != tmp_mv.as_int) {
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst,
+ bsize);
+ model_rd_sb_fn[MODELRD_TYPE_MASKED_COMPOUND](
+ cpi, bsize, x, xd, 0, 0, mi_row, mi_col, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, NULL, NULL, NULL);
+ rd = RDCOST(x->rdmult, tmp_rate_mv + rmode + rate_sum + rwedge,
+ dist_sum);
+ }
+ }
+ if (rd >= best_interintra_rd_wedge) {  // no refined MV, or it did not help: fall back to mv0
+ tmp_mv.as_int = mv0.as_int;
+ tmp_rate_mv = *rate_mv;
+ av1_combine_interintra(xd, bsize, 0, tmp_buf, bw, intrapred, bw);
+ }
+ // Evaluate closer to true rd
+ rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (rd != INT64_MAX)
+ rd = RDCOST(x->rdmult, rmode + tmp_rate_mv + rwedge + rate_sum,
+ dist_sum);
+ best_interintra_rd_wedge = rd;
+ if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+ mbmi->use_wedge_interintra = 1;
+ mbmi->mv[0].as_int = tmp_mv.as_int;
+ *tmp_rate2 += tmp_rate_mv - *rate_mv;  // swap the old MV rate for the new one
+ *rate_mv = tmp_rate_mv;
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ mbmi->mv[0].as_int = mv0.as_int;
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ } else {
+ mbmi->use_wedge_interintra = 0;
+ }
+ } // if (is_interintra_wedge_used(bsize))
+ if (num_planes > 1) {
+ av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ return 0;
+}
+
+// TODO(afergs): Refactor the MBMI references in here - there's four
+// TODO(afergs): Refactor optional args - add them to a struct or remove
+static int64_t motion_mode_rd(const AV1_COMP *const cpi, MACROBLOCK *const x,
+ BLOCK_SIZE bsize, RD_STATS *rd_stats,
+ RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+ int *disable_skip, int mi_row, int mi_col,
+ HandleInterModeArgs *const args,
+ int64_t ref_best_rd, const int *refs,
+ int *rate_mv, BUFFER_SET *orig_dst
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ ,
+ TileDataEnc *tile_data, int64_t *best_est_rd,
+ int do_tx_search, InterModesInfo *inter_modes_info
+#endif
+) { /*
+ * Tries each motion mode permitted for the current block and keeps the best
+ * candidate. The first candidate (mode_index 0) is always recorded; later
+ * ones replace it only when their rd cost is lower.
+ *
+ * mode_index runs from SIMPLE_TRANSLATION up to last_motion_mode_allowed; one
+ * extra index past that is used for inter-intra (evaluated with
+ * SIMPLE_TRANSLATION) when interintra_allowed is set. Every candidate starts
+ * from base_mbmi, a copy of *mbmi taken just before the loop.
+ *
+ * On success the best candidate's mbmi, rd_stats, rd_stats_y, rd_stats_uv
+ * (only when there is more than one plane), x->blk_skip, x->skip,
+ * *disable_skip and *rate_mv are written back and 0 is returned. INT64_MAX is
+ * returned when no candidate produced a valid rd, or when txfm_search() fails
+ * for mode_index 0 with rd_stats_y->rate == INT_MAX.
+ */
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int is_comp_pred = has_second_ref(mbmi);
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int rate2_nocoeff = rd_stats->rate;  // rate on entry, before any residual cost is added here
+ int best_xskip, best_disable_skip = 0;
+ RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+ MB_MODE_INFO base_mbmi, best_mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int rate_mv0 = *rate_mv;
+
+ int interintra_allowed = cm->seq_params.enable_interintra_compound &&
+ is_interintra_allowed(mbmi) && mbmi->compound_idx;
+ int pts0[SAMPLES_ARRAY_SIZE], pts_inref0[SAMPLES_ARRAY_SIZE];
+
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ const MV_REFERENCE_FRAME ref_frame_1 = mbmi->ref_frame[1];
+ av1_invalid_rd_stats(&best_rd_stats);
+ aom_clear_system_state();
+ mbmi->num_proj_ref = 1; // assume num_proj_ref >=1
+ MOTION_MODE last_motion_mode_allowed = SIMPLE_TRANSLATION;
+ if (cm->switchable_motion_mode) {
+ last_motion_mode_allowed = motion_mode_allowed(xd->global_motion, xd, mbmi,
+ cm->allow_warped_motion);
+ }
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts0, pts_inref0);
+ }
+ int total_samples = mbmi->num_proj_ref;
+ if (total_samples == 0) {  // findSamples() returned no samples: cap the search at OBMC_CAUSAL
+ last_motion_mode_allowed = OBMC_CAUSAL;
+ }
+ base_mbmi = *mbmi;
+
+ const int switchable_rate =
+ av1_is_interp_needed(xd) ? av1_get_switchable_rate(cm, x, xd) : 0;
+ int64_t best_rd = INT64_MAX;
+ int best_rate_mv = rate_mv0;
+ for (int mode_index = (int)SIMPLE_TRANSLATION;
+ mode_index <= (int)last_motion_mode_allowed + interintra_allowed;
+ mode_index++) {
+ if (args->skip_motion_mode && mode_index) continue;  // only mode_index 0 is evaluated
+ int64_t tmp_rd = INT64_MAX;
+ int tmp_rate2 = rate2_nocoeff;
+ int is_interintra_mode = mode_index > (int)last_motion_mode_allowed;  // the one index past the motion modes
+ int skip_txfm_sb = 0;
+ int tmp_rate_mv = rate_mv0;
+
+ *mbmi = base_mbmi;
+ if (is_interintra_mode) {
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ } else {
+ mbmi->motion_mode = (MOTION_MODE)mode_index;
+ assert(mbmi->ref_frame[1] != INTRA_FRAME);
+ }
+
+ if (mbmi->motion_mode == SIMPLE_TRANSLATION && !is_interintra_mode) {
+ // SIMPLE_TRANSLATION mode: no need to recalculate.
+ // The prediction is calculated before motion_mode_rd() is called in
+ // handle_inter_mode()
+ } else if (mbmi->motion_mode == OBMC_CAUSAL) {
+ uint32_t cur_mv = mbmi->mv[0].as_int;
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ single_motion_search(cpi, x, bsize, mi_row, mi_col, 0, &tmp_rate_mv);
+ mbmi->mv[0].as_int = x->best_mv.as_int;
+#if USE_DISCOUNT_NEWMV_TEST
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+#endif
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;  // swap the old MV rate for the new one
+ }
+ if (mbmi->mv[0].as_int != cur_mv) {  // rebuild the prediction only if the MV changed
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ }
+ av1_build_obmc_inter_prediction(
+ cm, xd, mi_row, mi_col, args->above_pred_buf, args->above_pred_stride,
+ args->left_pred_buf, args->left_pred_stride);
+ } else if (mbmi->motion_mode == WARPED_CAUSAL) {
+ int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+ mbmi->motion_mode = WARPED_CAUSAL;
+ mbmi->wm_params.wmtype = DEFAULT_WMTYPE;
+ mbmi->interp_filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter));
+
+ memcpy(pts, pts0, total_samples * 2 * sizeof(*pts0));  // work on copies; pts0/pts_inref0 stay intact
+ memcpy(pts_inref, pts_inref0, total_samples * 2 * sizeof(*pts_inref0));
+ // Select the samples according to motion vector difference
+ if (mbmi->num_proj_ref > 1) {
+ mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+ mbmi->num_proj_ref, bsize);
+ }
+
+ if (!find_projection(mbmi->num_proj_ref, pts, pts_inref, bsize,
+ mbmi->mv[0].as_mv.row, mbmi->mv[0].as_mv.col,
+ &mbmi->wm_params, mi_row, mi_col)) {
+ // Refine MV for NEWMV mode
+ assert(!is_comp_pred);
+ if (have_newmv_in_inter_mode(this_mode)) {
+ const int_mv mv0 = mbmi->mv[0];
+ const WarpedMotionParams wm_params0 = mbmi->wm_params;
+ int num_proj_ref0 = mbmi->num_proj_ref;
+
+ // Refine MV in a small range.
+ av1_refine_warped_mv(cpi, x, bsize, mi_row, mi_col, pts0, pts_inref0,
+ total_samples);
+
+ // Keep the refined MV and WM parameters.
+ if (mv0.as_int != mbmi->mv[0].as_int) {
+ const int ref = refs[0];
+ const int_mv ref_mv = av1_get_ref_mv(x, 0);
+ tmp_rate_mv =
+ av1_mv_bit_cost(&mbmi->mv[0].as_mv, &ref_mv.as_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+ if (cpi->sf.adaptive_motion_search)
+ x->pred_mv[ref] = mbmi->mv[0].as_mv;
+
+#if USE_DISCOUNT_NEWMV_TEST
+ if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+ tmp_rate_mv = AOMMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+ }
+#endif
+ tmp_rate2 = rate2_nocoeff - rate_mv0 + tmp_rate_mv;
+ } else {
+ // Restore the old MV and WM parameters.
+ mbmi->mv[0] = mv0;
+ mbmi->wm_params = wm_params0;
+ mbmi->num_proj_ref = num_proj_ref0;
+ }
+ }
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ } else {  // find_projection() returned nonzero: skip this mode
+ continue;
+ }
+ } else if (is_interintra_mode) {
+ const int ret = handle_inter_intra_mode(
+ cpi, x, bsize, mi_row, mi_col, mbmi, args, ref_best_rd, &tmp_rate_mv,
+ &tmp_rate2, orig_dst);
+ if (ret < 0) continue;
+ }
+
+ if (!cpi->common.all_lossless)
+ check_block_skip(cpi, bsize, x, xd, 0, num_planes - 1, &skip_txfm_sb);
+
+ x->skip = 0;
+
+ rd_stats->dist = 0;
+ rd_stats->sse = 0;
+ rd_stats->skip = 1;
+ rd_stats->rate = tmp_rate2;
+ if (mbmi->motion_mode != WARPED_CAUSAL) rd_stats->rate += switchable_rate;
+ if (interintra_allowed) {
+ rd_stats->rate += x->interintra_cost[size_group_lookup[bsize]]
+ [mbmi->ref_frame[1] == INTRA_FRAME];
+ if (mbmi->ref_frame[1] == INTRA_FRAME) {
+ rd_stats->rate += x->interintra_mode_cost[size_group_lookup[bsize]]
+ [mbmi->interintra_mode];
+ if (is_interintra_wedge_used(bsize)) {
+ rd_stats->rate +=
+ x->wedge_interintra_cost[bsize][mbmi->use_wedge_interintra];
+ if (mbmi->use_wedge_interintra) {
+ rd_stats->rate +=
+ av1_cost_literal(get_interintra_wedge_bits(bsize));
+ }
+ }
+ }
+ }
+ if ((last_motion_mode_allowed > SIMPLE_TRANSLATION) &&
+ (mbmi->ref_frame[1] != INTRA_FRAME)) {
+ if (last_motion_mode_allowed == WARPED_CAUSAL) {
+ rd_stats->rate += x->motion_mode_cost[bsize][mbmi->motion_mode];
+ } else {
+ rd_stats->rate += x->motion_mode_cost1[bsize][mbmi->motion_mode];
+ }
+ }
+
+ if (!skip_txfm_sb) {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ int64_t est_rd = 0;
+ int est_skip = 0;
+ if (cpi->sf.inter_mode_rd_model_estimation && cm->tile_cols == 1 &&
+ cm->tile_rows == 1) {
+ InterModeRdModel *md = &tile_data->inter_mode_rd_models[mbmi->sb_type];
+ if (md->ready) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ est_rd = get_est_rd(tile_data, mbmi->sb_type, x->rdmult, curr_sse,
+ rd_stats->rate);
+ est_skip = est_rd * 0.8 > *best_est_rd;  // prune when 80% of the estimate still exceeds the best estimate
+ if (est_skip) {
+ mbmi->ref_frame[1] = ref_frame_1;
+ continue;
+ } else {
+ if (est_rd < *best_est_rd) {
+ *best_est_rd = est_rd;
+ }
+ }
+ }
+ }
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (!do_tx_search) {
+ const int64_t curr_sse = get_sse(cpi, x);
+ int est_residue_cost = 0;
+ int64_t est_dist = 0;
+ const int has_est_rd = get_est_rate_dist(tile_data, bsize, curr_sse,
+ &est_residue_cost, &est_dist);
+ (void)has_est_rd;
+ assert(has_est_rd);
+ const int mode_rate = rd_stats->rate;
+ rd_stats->rate += est_residue_cost;
+ rd_stats->dist = est_dist;
+ rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (cm->reference_mode == SINGLE_REFERENCE) {
+ if (!is_comp_pred) {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
+ } else {
+ inter_modes_info_push(inter_modes_info, mode_rate, curr_sse,
+ rd_stats->rdcost, mbmi);
+ }
+ } else {
+#endif
+ int mode_rate = rd_stats->rate;
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, rd_stats, rd_stats_y,
+ rd_stats_uv, mode_rate, ref_best_rd)) {
+ if (rd_stats_y->rate == INT_MAX && mode_index == 0) {
+ return INT64_MAX;
+ }
+ continue;
+ }
+ if (!skip_txfm_sb) {
+ const int64_t curr_rd =
+ RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (curr_rd < ref_best_rd) {  // tighten the threshold used for the remaining candidates
+ ref_best_rd = curr_rd;
+ }
+ *disable_skip = 0;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (cpi->sf.inter_mode_rd_model_estimation) {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats->sse,
+ rd_stats->dist,
+ rd_stats_y->rate + rd_stats_uv->rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ } else {
+ *disable_skip = 1;
+ }
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ }
+#endif
+
+ if (this_mode == GLOBALMV || this_mode == GLOBAL_GLOBALMV) {
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ mbmi->interp_filters = av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter));
+ }
+ }
+
+ tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ if (mode_index == 0)
+ args->simple_rd[this_mode][mbmi->ref_mv_idx][mbmi->ref_frame[0]] = tmp_rd;
+ if ((mode_index == 0) || (tmp_rd < best_rd)) {  // snapshot the new best candidate
+ best_mbmi = *mbmi;
+ best_rd = tmp_rd;
+ best_rd_stats = *rd_stats;
+ best_rd_stats_y = *rd_stats_y;
+ best_rate_mv = tmp_rate_mv;
+ if (num_planes > 1) best_rd_stats_uv = *rd_stats_uv;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ best_xskip = x->skip;
+ best_disable_skip = *disable_skip;
+ if (best_xskip) break;  // x->skip is set for the best candidate: stop searching
+ }
+ }
+ mbmi->ref_frame[1] = ref_frame_1;
+ *rate_mv = best_rate_mv;
+ if (best_rd == INT64_MAX) {
+ av1_invalid_rd_stats(rd_stats);
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return INT64_MAX;
+ }
+ *mbmi = best_mbmi;
+ *rd_stats = best_rd_stats;
+ *rd_stats_y = best_rd_stats_y;
+ if (num_planes > 1) *rd_stats_uv = best_rd_stats_uv;
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ x->skip = best_xskip;
+ *disable_skip = best_disable_skip;
+
+ restore_dst_buf(xd, *orig_dst, num_planes);
+ return 0;
+}
+
+// Computes the rd cost of coding the block in skip mode. The inter prediction
+// is built for the whole block, the squared prediction error of every plane is
+// summed (scaled by 16), and the only rate charged is the skip-mode flag. The
+// result is written to rd_stats; the return value is always 0.
+static int64_t skip_mode_rd(RD_STATS *rd_stats, const AV1_COMP *const cpi,
+                            MACROBLOCK *const x, BLOCK_SIZE bsize, int mi_row,
+                            int mi_col, BUFFER_SET *const orig_dst) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, orig_dst, bsize);
+
+  int64_t sse_all_planes = 0;
+  int plane = 0;
+  while (plane < num_planes) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int plane_w = block_size_wide[plane_bsize];
+    const int plane_h = block_size_high[plane_bsize];
+
+    // The residual must be refreshed before its energy is measured.
+    av1_subtract_plane(x, bsize, plane);
+    const int64_t plane_sse = aom_sum_squares_2d_i16(
+        x->plane[plane].src_diff, plane_w, plane_w, plane_h);
+    sse_all_planes += plane_sse << 4;
+    ++plane;
+  }
+
+  const int ctx = av1_get_skip_mode_context(xd);
+  rd_stats->dist = rd_stats->sse = sse_all_planes;
+  rd_stats->rate = x->skip_mode_cost[ctx][1];
+  rd_stats->rdcost = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+
+  restore_dst_buf(xd, *orig_dst, num_planes);
+  return 0;
+}
+
+// Maps a single-reference prediction mode to an index into the reference MV
+// stack: 0 for NEARESTMV, ref_mv_idx + 1 for NEARMV, and -1 for every other
+// mode.
+static INLINE int get_ref_mv_offset(PREDICTION_MODE single_mode,
+                                    uint8_t ref_mv_idx) {
+  assert(is_inter_singleref_mode(single_mode));
+  if (single_mode == NEARESTMV) return 0;
+  if (single_mode == NEARMV) return ref_mv_idx + 1;
+  return -1;
+}
+
+// Writes to *this_mv the motion vector that this_mode implies for reference
+// ref_idx:
+//   NEWMV     -> INVALID_MV
+//   GLOBALMV  -> the global MV of that reference frame
+//   NEAR/NEARESTMV -> the matching reference MV stack entry (this_mv for
+//                ref_idx 0, comp_mv otherwise), or the global MV when the
+//                stack holds too few candidates.
+static INLINE void get_this_mv(int_mv *this_mv, PREDICTION_MODE this_mode,
+                               int ref_idx, int ref_mv_idx,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               const MB_MODE_INFO_EXT *mbmi_ext) {
+  const uint8_t ref_frame_type = av1_ref_frame_type(ref_frame);
+  const int is_comp_pred = ref_frame[1] > INTRA_FRAME;
+  const PREDICTION_MODE single_mode =
+      get_single_mode(this_mode, ref_idx, is_comp_pred);
+  assert(is_inter_singleref_mode(single_mode));
+
+  if (single_mode == NEWMV) {
+    this_mv->as_int = INVALID_MV;
+    return;
+  }
+  if (single_mode == GLOBALMV) {
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    return;
+  }
+
+  assert(single_mode == NEARMV || single_mode == NEARESTMV);
+  const int ref_mv_offset = get_ref_mv_offset(single_mode, ref_mv_idx);
+  if (ref_mv_offset >= mbmi_ext->ref_mv_count[ref_frame_type]) {
+    // Not enough candidates in the stack: fall back to the global MV.
+    *this_mv = mbmi_ext->global_mvs[ref_frame[ref_idx]];
+    return;
+  }
+
+  assert(ref_mv_offset >= 0);
+  *this_mv = (ref_idx == 0)
+                 ? mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].this_mv
+                 : mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_offset].comp_mv;
+}
+
+// Fills cur_mv[] with the motion vector of each reference used by this_mode.
+// NEWMV components are copied as returned by get_this_mv(); every other
+// component goes through clamp_and_check_mv(). Returns the bitwise AND of all
+// clamp_and_check_mv() results (1 when no component needed that check).
+static INLINE int build_cur_mv(int_mv *cur_mv, PREDICTION_MODE this_mode,
+                               const AV1_COMMON *cm, const MACROBLOCK *x) {
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const MB_MODE_INFO *mbmi = xd->mi[0];
+  const int is_comp_pred = has_second_ref(mbmi);
+  const int num_refs = 1 + is_comp_pred;
+  int all_valid = 1;
+
+  for (int ref_idx = 0; ref_idx < num_refs; ++ref_idx) {
+    int_mv candidate;
+    get_this_mv(&candidate, this_mode, ref_idx, mbmi->ref_mv_idx,
+                mbmi->ref_frame, x->mbmi_ext);
+    if (get_single_mode(this_mode, ref_idx, is_comp_pred) == NEWMV) {
+      cur_mv[ref_idx] = candidate;
+      continue;
+    }
+    // Deliberately not short-circuited: every non-new MV must be written.
+    all_valid &= clamp_and_check_mv(&cur_mv[ref_idx], candidate, cm, x);
+  }
+  return all_valid;
+}
+
+// Returns the accumulated drl_mode_cost0 entries needed to reach
+// mbmi->ref_mv_idx. Up to two stack positions are visited: starting at
+// position 0 for NEWMV / NEW_NEWMV, and at position 1 for modes containing
+// NEARMV. Other modes cost 0.
+static INLINE int get_drl_cost(const MB_MODE_INFO *mbmi,
+                               const MB_MODE_INFO_EXT *mbmi_ext,
+                               int (*drl_mode_cost0)[2],
+                               int8_t ref_frame_type) {
+  int first_idx;
+  if (mbmi->mode == NEWMV || mbmi->mode == NEW_NEWMV) {
+    first_idx = 0;
+  } else if (have_nearmv_in_inter_mode(mbmi->mode)) {
+    first_idx = 1;
+  } else {
+    return 0;
+  }
+
+  int total = 0;
+  for (int idx = first_idx; idx < first_idx + 2; ++idx) {
+    // A position costs something only if the stack extends past it.
+    if (mbmi_ext->ref_mv_count[ref_frame_type] <= idx + 1) continue;
+    const uint8_t drl_ctx =
+        av1_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+    const int is_selected = (mbmi->ref_mv_idx == idx - first_idx);
+    total += drl_mode_cost0[drl_ctx][!is_selected];
+    if (is_selected) break;
+  }
+  return total;
+}
+
+// Scratch buffers used by the compound_type_rd() function.
+// For the sizes and alignment of these arrays, refer to the
+// alloc_compound_type_rd_buffers() function.
+typedef struct {
+ uint8_t *pred0;  // wrapped into preds0[] by compound_type_rd()
+ uint8_t *pred1;  // wrapped into preds1[] by compound_type_rd()
+ int16_t *residual1; // src - pred1
+ int16_t *diff10; // pred1 - pred0
+ uint8_t *tmp_best_mask_buf; // backup of the best segmentation mask
+} CompoundTypeRdBuffers;
+
+static int compound_type_rd(const AV1_COMP *const cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_col, int mi_row,
+ int_mv *cur_mv, int masked_compound_used,
+ BUFFER_SET *orig_dst, const BUFFER_SET *tmp_dst,
+ CompoundTypeRdBuffers *buffers, int *rate_mv,
+ int64_t *rd, RD_STATS *rd_stats,
+ int64_t ref_best_rd) { /*
+ * Searches the inter-inter compound types for the current block and leaves
+ * the best one selected in mbmi.
+ *
+ * Note the parameter order: mi_col comes before mi_row in this signature.
+ *
+ * The loop starts at COMPOUND_AVERAGE; later types are considered only when
+ * masked_compound_used is set, and types rejected by
+ * is_interinter_compound_used() are skipped. The lowest estimated rd is
+ * stored in *rd (INT64_MAX if no type produced an estimate).
+ *
+ * On return mbmi->interinter_comp and mbmi->comp_group_idx describe the best
+ * type. For modes with a new MV, mbmi->mv[] holds the best MVs, and
+ * rd_stats->rate and *rate_mv are adjusted when the best type is
+ * COMPOUND_WEDGE.
+ *
+ * Returns the signalling cost (rs2) recorded for the best type, or 0 when no
+ * type improved on INT64_MAX.
+ */
+ const AV1_COMMON *cm = &cpi->common;
+ MACROBLOCKD *xd = &x->e_mbd;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const PREDICTION_MODE this_mode = mbmi->mode;
+ const int bw = block_size_wide[bsize];
+ int rate_sum, rs2;
+ int64_t dist_sum;
+
+ int_mv best_mv[2];
+ int best_tmp_rate_mv = *rate_mv;
+ int tmp_skip_txfm_sb;
+ int64_t tmp_skip_sse_sb;
+ INTERINTER_COMPOUND_DATA best_compound_data;
+ best_compound_data.type = COMPOUND_AVERAGE;
+ uint8_t *preds0[1] = { buffers->pred0 };
+ uint8_t *preds1[1] = { buffers->pred1 };
+ int strides[1] = { bw };
+ int tmp_rate_mv;
+ const int num_pix = 1 << num_pels_log2_lookup[bsize];
+ const int mask_len = 2 * num_pix * sizeof(uint8_t);  // byte count used for the seg_mask copies below
+ COMPOUND_TYPE cur_type;
+ int best_compmode_interinter_cost = 0;
+ int calc_pred_masked_compound = 1;
+
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ *rd = INT64_MAX;
+ for (cur_type = COMPOUND_AVERAGE; cur_type < COMPOUND_TYPES; cur_type++) {
+ if (cur_type != COMPOUND_AVERAGE && !masked_compound_used) break;
+ if (!is_interinter_compound_used(cur_type, bsize)) continue;
+ tmp_rate_mv = *rate_mv;
+ int64_t best_rd_cur = INT64_MAX;
+ mbmi->interinter_comp.type = cur_type;
+ int masked_type_cost = 0;
+
+ const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+ const int comp_index_ctx = get_comp_index_context(cm, xd);
+ mbmi->compound_idx = 1;
+ if (cur_type == COMPOUND_AVERAGE) {
+ mbmi->comp_group_idx = 0;
+ if (masked_compound_used) {
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][0];
+ }
+ masked_type_cost += x->comp_idx_cost[comp_index_ctx][1];
+ rs2 = masked_type_cost;
+ const int64_t mode_rd = RDCOST(x->rdmult, rs2 + rd_stats->rate, 0);
+ if (mode_rd < ref_best_rd) {  // estimate only if the rate alone does not already exceed ref_best_rd
+ av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, orig_dst, bsize);
+ int64_t est_rd =
+ estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+ &tmp_skip_txfm_sb, &tmp_skip_sse_sb, INT64_MAX);
+ if (est_rd != INT64_MAX)
+ best_rd_cur = RDCOST(x->rdmult, rs2 + *rate_mv + rate_sum, dist_sum);
+ }
+ // use spare buffer for following compound type try
+ restore_dst_buf(xd, *tmp_dst, 1);
+ } else {
+ mbmi->comp_group_idx = 1;
+ masked_type_cost += x->comp_group_idx_cost[comp_group_idx_ctx][1];
+ masked_type_cost += x->compound_type_cost[bsize][cur_type - 1];
+ rs2 = masked_type_cost;
+ if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+ *rd / 3 < ref_best_rd) {
+ best_rd_cur = build_and_cost_compound_type(
+ cpi, x, cur_mv, bsize, this_mode, &rs2, *rate_mv, orig_dst,
+ &tmp_rate_mv, preds0, preds1, buffers->residual1, buffers->diff10,
+ strides, mi_row, mi_col, rd_stats->rate, ref_best_rd,
+ &calc_pred_masked_compound);
+ }
+ }
+ if (best_rd_cur < *rd) {
+ *rd = best_rd_cur;
+ best_compound_data = mbmi->interinter_comp;
+ if (masked_compound_used && cur_type != COMPOUND_TYPES - 1) {  // back up the mask unless this is the last type
+ memcpy(buffers->tmp_best_mask_buf, xd->seg_mask, mask_len);
+ }
+ best_compmode_interinter_cost = rs2;
+ if (have_newmv_in_inter_mode(this_mode)) {
+ if (cur_type == COMPOUND_WEDGE) {
+ best_tmp_rate_mv = tmp_rate_mv;
+ best_mv[0].as_int = mbmi->mv[0].as_int;
+ best_mv[1].as_int = mbmi->mv[1].as_int;
+ } else {
+ best_mv[0].as_int = cur_mv[0].as_int;
+ best_mv[1].as_int = cur_mv[1].as_int;
+ }
+ }
+ }
+ // reset to original mvs for next iteration
+ mbmi->mv[0].as_int = cur_mv[0].as_int;
+ mbmi->mv[1].as_int = cur_mv[1].as_int;
+ }
+ if (mbmi->interinter_comp.type != best_compound_data.type) {  // the last type tried is not the best: restore the best
+ mbmi->comp_group_idx =
+ (best_compound_data.type == COMPOUND_AVERAGE) ? 0 : 1;
+ mbmi->interinter_comp = best_compound_data;
+ memcpy(xd->seg_mask, buffers->tmp_best_mask_buf, mask_len);
+ }
+ if (have_newmv_in_inter_mode(this_mode)) {
+ mbmi->mv[0].as_int = best_mv[0].as_int;
+ mbmi->mv[1].as_int = best_mv[1].as_int;
+ if (mbmi->interinter_comp.type == COMPOUND_WEDGE) {
+ rd_stats->rate += best_tmp_rate_mv - *rate_mv;
+ *rate_mv = best_tmp_rate_mv;
+ }
+ }
+ restore_dst_buf(xd, *orig_dst, 1);
+ return best_compmode_interinter_cost;
+}
+
+// Checks both reference slots of `this_mode`. For every slot whose
+// per-reference mode (as reported by get_single_mode()) is NEWMV, the
+// matching entry of args->single_newmv_valid for the current
+// mbmi->ref_mv_idx and that slot's reference frame must be non-zero.
+//
+// Returns 1 when no NEWMV slot is flagged invalid, 0 otherwise.
+static INLINE int is_single_newmv_valid(HandleInterModeArgs *args,
+                                        MB_MODE_INFO *mbmi,
+                                        PREDICTION_MODE this_mode) {
+  int slot = 0;
+  while (slot < 2) {
+    if (get_single_mode(this_mode, slot, 1) == NEWMV) {
+      const MV_REFERENCE_FRAME frame = mbmi->ref_frame[slot];
+      // A zero entry means there is no usable single-reference NEWMV result.
+      if (!args->single_newmv_valid[mbmi->ref_mv_idx][frame]) return 0;
+    }
+    ++slot;
+  }
+  return 1;
+}
+
+// Returns how many ref_mv_idx candidates should be searched for `mode` with
+// the given reference frame pair.
+//
+// The count is 1 unless either
+//   - the mode contains a NEARMV component and more than two reference MV
+//     candidates exist, or
+//   - the mode is NEWMV / NEW_NEWMV and more than one candidate exists.
+// In those cases it is the candidate count (minus one when a NEARMV
+// component is present), capped at MAX_REF_MV_SERCH.
+static int get_drl_refmv_count(const MACROBLOCK *const x,
+                               const MV_REFERENCE_FRAME *ref_frame,
+                               PREDICTION_MODE mode) {
+  const int8_t frame_type = av1_ref_frame_type(ref_frame);
+  const int near_offset = have_nearmv_in_inter_mode(mode) ? 1 : 0;
+  const int num_candidates = x->mbmi_ext->ref_mv_count[frame_type];
+  const int newmv_only = (mode == NEWMV || mode == NEW_NEWMV);
+
+  if (!((near_offset && num_candidates > 2) ||
+        (newmv_only && num_candidates > 1))) {
+    return 1;
+  }
+  return AOMMIN(MAX_REF_MV_SERCH, num_candidates - near_offset);
+}
+
+typedef struct {  // One record per ref_mv_idx candidate tried in handle_inter_mode().
+ int64_t rd;  // RD cost stored for the candidate; INT64_MAX until a valid cost is known.
+ int drl_cost;  // Rate returned by get_drl_cost() for this ref_mv_idx.
+ int rate_mv;  // Motion-vector rate recorded for the candidate.
+ int_mv mv;  // Copy of mbmi->mv[0] for the candidate; INVALID_MV until it is set.
+} inter_mode_info;
+
+// Rate-distortion search for the inter mode already stored in xd->mi[0]
+// (mbmi->mode and mbmi->ref_frame).
+//
+// The outer loop walks the ref_mv_idx candidates counted by
+// get_drl_refmv_count(). For each candidate the inner loop tries
+// compound_idx = 1 and, when search_jnt_comp is set, compound_idx = 0.
+// Each try adds the signalling rates, runs the NEWMV search when the mode
+// needs one, picks a compound type (compound predictions with
+// compound_idx == 1), searches interpolation filters and finally obtains the
+// rate/distortion either from a model (jnt_comp_fast_tx_search) or from
+// motion_mode_rd(). Several breakouts compare partial costs against
+// ref_best_rd, which is tightened as better candidates are found.
+//
+// Outputs on success: *rd_stats, *rd_stats_y, *rd_stats_uv, *mbmi,
+// *disable_skip, x->skip and x->blk_skip are restored to the best candidate.
+//
+// Returns the RD cost of the best candidate, or INT64_MAX when no candidate
+// produced a valid cost (the outputs are then left in their last-tried
+// state).
+static int64_t handle_inter_mode(const AV1_COMP *const cpi, MACROBLOCK *x,
+                                 BLOCK_SIZE bsize, RD_STATS *rd_stats,
+                                 RD_STATS *rd_stats_y, RD_STATS *rd_stats_uv,
+                                 int *disable_skip, int mi_row, int mi_col,
+                                 HandleInterModeArgs *args, int64_t ref_best_rd,
+                                 uint8_t *const tmp_buf,
+                                 CompoundTypeRdBuffers *rd_buffers
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+                                 ,
+                                 TileDataEnc *tile_data, int64_t *best_est_rd,
+                                 const int do_tx_search,
+                                 InterModesInfo *inter_modes_info
+#endif
+) {
+  const AV1_COMMON *cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = xd->mi[0];
+  MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  int i;
+  // A negative second reference is mapped to 0 so refs[1] is a usable index.
+  int refs[2] = { mbmi->ref_frame[0],
+                  (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+  int rate_mv = 0;
+  int64_t rd = INT64_MAX;
+
+  // do first prediction into the destination buffer. Do the next
+  // prediction into a temporary buffer. Then keep track of which one
+  // of these currently holds the best predictor, and use the other
+  // one for future predictions. In the end, copy from tmp_buf to
+  // dst if necessary.
+  struct macroblockd_plane *p = xd->plane;
+  BUFFER_SET orig_dst = {
+    { p[0].dst.buf, p[1].dst.buf, p[2].dst.buf },
+    { p[0].dst.stride, p[1].dst.stride, p[2].dst.stride },
+  };
+  const BUFFER_SET tmp_dst = { { tmp_buf, tmp_buf + 1 * MAX_SB_SQUARE,
+                                 tmp_buf + 2 * MAX_SB_SQUARE },
+                               { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE } };
+
+  int skip_txfm_sb = 0;
+  int64_t skip_sse_sb = INT64_MAX;
+  int16_t mode_ctx;
+  const int masked_compound_used = is_any_masked_compound_used(bsize) &&
+                                   cm->seq_params.enable_masked_compound;
+  int64_t ret_val = INT64_MAX;
+  const int8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  // Snapshot of the best candidate seen so far. The best_* values other than
+  // best_mbmi and best_rd are only meaningful once best_rd != INT64_MAX.
+  RD_STATS best_rd_stats, best_rd_stats_y, best_rd_stats_uv;
+  int64_t best_rd = INT64_MAX;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+  MB_MODE_INFO best_mbmi = *mbmi;
+  int best_disable_skip;
+  int best_xskip;
+  int64_t newmv_ret_val = INT64_MAX;
+  // NEWMV search result kept from the compound_idx == 1 pass so that the
+  // compound_idx == 0 pass can reuse it.
+  int_mv backup_mv[2] = { { 0 } };
+  int backup_rate_mv = 0;
+  inter_mode_info mode_info[MAX_REF_MV_SERCH];
+
+  int comp_idx;
+  const int search_jnt_comp = is_comp_pred & cm->seq_params.enable_jnt_comp &
+                              (mbmi->mode != GLOBAL_GLOBALMV);
+
+  // TODO(jingning): This should be deprecated shortly.
+  const int has_nearmv = have_nearmv_in_inter_mode(mbmi->mode) ? 1 : 0;
+  const int ref_set = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+  for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+    mode_info[ref_mv_idx].mv.as_int = INVALID_MV;
+    mode_info[ref_mv_idx].rd = INT64_MAX;
+
+    // Speed feature: for LAST2/LAST3 references, drop non-first candidates
+    // whose reference-MV-stack weight is below REF_CAT_LEVEL.
+    if (cpi->sf.reduce_inter_modes && ref_mv_idx > 0) {
+      if (mbmi->ref_frame[0] == LAST2_FRAME ||
+          mbmi->ref_frame[0] == LAST3_FRAME ||
+          mbmi->ref_frame[1] == LAST2_FRAME ||
+          mbmi->ref_frame[1] == LAST3_FRAME) {
+        if (mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx + has_nearmv]
+                .weight < REF_CAT_LEVEL) {
+          continue;
+        }
+      }
+    }
+
+    av1_init_rd_stats(rd_stats);
+
+    mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+    mbmi->comp_group_idx = 0;
+    mbmi->compound_idx = 1;
+    if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+
+    mode_ctx =
+        av1_mode_context_analyzer(mbmi_ext->mode_context, mbmi->ref_frame);
+
+    mbmi->num_proj_ref = 0;
+    mbmi->motion_mode = SIMPLE_TRANSLATION;
+    mbmi->ref_mv_idx = ref_mv_idx;
+
+    if (is_comp_pred && (!is_single_newmv_valid(args, mbmi, this_mode))) {
+      continue;
+    }
+
+    rd_stats->rate += args->ref_frame_cost + args->single_comp_cost;
+    const int drl_cost =
+        get_drl_cost(mbmi, mbmi_ext, x->drl_mode_cost0, ref_frame_type);
+    rd_stats->rate += drl_cost;
+    mode_info[ref_mv_idx].drl_cost = drl_cost;
+
+    // Rate-only breakout; the NEAREST modes are always evaluated.
+    if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+        mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+      continue;
+    }
+
+    const RD_STATS backup_rd_stats = *rd_stats;
+    // If !search_jnt_comp, we need to force mbmi->compound_idx = 1.
+    for (comp_idx = 1; comp_idx >= !search_jnt_comp; --comp_idx) {
+      int rs = 0;
+      int compmode_interinter_cost = 0;
+      mbmi->compound_idx = comp_idx;
+      if (is_comp_pred && comp_idx == 0) {
+        // Second pass: start again from the rates accumulated before the
+        // first pass and cost the compound_idx == 0 signalling.
+        *rd_stats = backup_rd_stats;
+        mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+        if (mbmi->ref_frame[1] == INTRA_FRAME) mbmi->ref_frame[1] = NONE_FRAME;
+        mbmi->num_proj_ref = 0;
+        mbmi->motion_mode = SIMPLE_TRANSLATION;
+        mbmi->comp_group_idx = 0;
+
+        const int comp_group_idx_ctx = get_comp_group_idx_context(xd);
+        const int comp_index_ctx = get_comp_index_context(cm, xd);
+        if (masked_compound_used) {
+          compmode_interinter_cost +=
+              x->comp_group_idx_cost[comp_group_idx_ctx][0];
+        }
+        compmode_interinter_cost += x->comp_idx_cost[comp_index_ctx][0];
+      }
+
+      int_mv cur_mv[2];
+      if (!build_cur_mv(cur_mv, this_mode, cm, x)) {
+        continue;
+      }
+      if (have_newmv_in_inter_mode(this_mode)) {
+        if (comp_idx == 0) {
+          cur_mv[0] = backup_mv[0];
+          cur_mv[1] = backup_mv[1];
+          rate_mv = backup_rate_mv;
+        }
+
+        // when jnt_comp_skip_mv_search flag is on, new mv will be searched once
+        if (!(search_jnt_comp && cpi->sf.jnt_comp_skip_mv_search &&
+              comp_idx == 0)) {
+          newmv_ret_val = handle_newmv(cpi, x, bsize, cur_mv, mi_row, mi_col,
+                                       &rate_mv, args);
+
+          // Store cur_mv and rate_mv so that they can be restored in the next
+          // iteration of the loop
+          backup_mv[0] = cur_mv[0];
+          backup_mv[1] = cur_mv[1];
+          backup_rate_mv = rate_mv;
+        }
+
+        if (newmv_ret_val != 0) {
+          continue;
+        } else {
+          rd_stats->rate += rate_mv;
+        }
+
+        if (cpi->sf.skip_repeated_newmv) {
+          if (!is_comp_pred && this_mode == NEWMV && ref_mv_idx > 0) {
+            int skip = 0;
+            int this_rate_mv = 0;
+            // On an early break, i stays at the index of the earlier
+            // candidate that matched; the skip handling below relies on it.
+            for (i = 0; i < ref_mv_idx; ++i) {
+              // Check if the motion search result same as previous results
+              if (cur_mv[0].as_int == args->single_newmv[i][refs[0]].as_int) {
+                // If the compared mode has no valid rd, it is unlikely this
+                // mode will be the best mode
+                if (mode_info[i].rd == INT64_MAX) {
+                  skip = 1;
+                  break;
+                }
+                // Compare the cost difference including drl cost and mv cost
+                if (mode_info[i].mv.as_int != INVALID_MV) {
+                  const int compare_cost =
+                      mode_info[i].rate_mv + mode_info[i].drl_cost;
+                  const int_mv ref_mv = av1_get_ref_mv(x, 0);
+                  this_rate_mv = av1_mv_bit_cost(&mode_info[i].mv.as_mv,
+                                                 &ref_mv.as_mv, x->nmvjointcost,
+                                                 x->mvcost, MV_COST_WEIGHT);
+                  const int this_cost = this_rate_mv + drl_cost;
+
+                  if (compare_cost < this_cost) {
+                    skip = 1;
+                    break;
+                  } else {
+                    // If the cost is less than current best result, make this
+                    // the best and update corresponding variables
+                    if (best_mbmi.ref_mv_idx == i) {
+                      assert(best_rd != INT64_MAX);
+                      best_mbmi.ref_mv_idx = ref_mv_idx;
+                      best_rd_stats.rate += this_cost - compare_cost;
+                      best_rd = RDCOST(x->rdmult, best_rd_stats.rate,
+                                       best_rd_stats.dist);
+                      if (best_rd < ref_best_rd) ref_best_rd = best_rd;
+
+                      skip = 1;
+                      break;
+                    }
+                  }
+                }
+              }
+            }
+            if (skip) {
+              // Inherit the bookkeeping of the matching earlier candidate.
+              // modelled_rd may be NULL (every other use in this function
+              // checks for it), so guard the copy the same way.
+              // NOTE(review): simple_rd is not NULL-checked anywhere in this
+              // function; confirm it is always allocated.
+              if (args->modelled_rd != NULL) {
+                args->modelled_rd[this_mode][ref_mv_idx][refs[0]] =
+                    args->modelled_rd[this_mode][i][refs[0]];
+              }
+              args->simple_rd[this_mode][ref_mv_idx][refs[0]] =
+                  args->simple_rd[this_mode][i][refs[0]];
+              mode_info[ref_mv_idx].rd = mode_info[i].rd;
+              mode_info[ref_mv_idx].rate_mv = this_rate_mv;
+              mode_info[ref_mv_idx].mv.as_int = mode_info[i].mv.as_int;
+
+              restore_dst_buf(xd, orig_dst, num_planes);
+              continue;
+            }
+          }
+        }
+      }
+      for (i = 0; i < is_comp_pred + 1; ++i) {
+        mbmi->mv[i].as_int = cur_mv[i].as_int;
+      }
+      const int ref_mv_cost = cost_mv_ref(x, this_mode, mode_ctx);
+#if USE_DISCOUNT_NEWMV_TEST
+      // We don't include the cost of the second reference here, because there
+      // are only three options: Last/Golden, ARF/Last or Golden/ARF, or in
+      // other words if you present them in that order, the second one is always
+      // known if the first is known.
+      //
+      // Under some circumstances we discount the cost of new mv mode to
+      // encourage initiation of a motion field.
+      if (discount_newmv_test(cpi, x, this_mode, mbmi->mv[0])) {
+        // discount_newmv_test only applies discount on NEWMV mode.
+        assert(this_mode == NEWMV);
+        rd_stats->rate += AOMMIN(cost_mv_ref(x, this_mode, mode_ctx),
+                                 cost_mv_ref(x, NEARESTMV, mode_ctx));
+      } else {
+        rd_stats->rate += ref_mv_cost;
+      }
+#else
+      rd_stats->rate += ref_mv_cost;
+#endif
+
+      // Second rate-only breakout, now including the MV and mode rates.
+      if (RDCOST(x->rdmult, rd_stats->rate, 0) > ref_best_rd &&
+          mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV) {
+        continue;
+      }
+
+      int skip_build_pred = 0;
+      if (is_comp_pred && comp_idx) {
+        // Find matching interp filter or set to default interp filter
+        const int need_search =
+            av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd);
+        int match_found = -1;
+        const InterpFilter assign_filter = cm->interp_filter;
+        if (cpi->sf.skip_repeat_interpolation_filter_search && need_search) {
+          match_found = find_interp_filter_in_stats(x, mbmi);
+        }
+        if (!need_search || match_found == -1) {
+          set_default_interp_filters(mbmi, assign_filter);
+        }
+
+        int64_t best_rd_compound;
+        // NOTE(review): the position arguments are passed as (mi_col, mi_row),
+        // unlike the (mi_row, mi_col) order used elsewhere in this function;
+        // confirm this matches compound_type_rd()'s parameter list.
+        compmode_interinter_cost = compound_type_rd(
+            cpi, x, bsize, mi_col, mi_row, cur_mv, masked_compound_used,
+            &orig_dst, &tmp_dst, rd_buffers, &rate_mv, &best_rd_compound,
+            rd_stats, ref_best_rd);
+        if (ref_best_rd < INT64_MAX && best_rd_compound / 3 > ref_best_rd) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+        // No need to call av1_build_inter_predictors_sby if
+        // COMPOUND_AVERAGE is selected because it is the first
+        // candidate in compound_type_rd, and the following
+        // compound types searching uses tmp_dst buffer
+        if (mbmi->interinter_comp.type == COMPOUND_AVERAGE) {
+          if (num_planes > 1)
+            av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, &orig_dst,
+                                            bsize);
+          skip_build_pred = 1;
+        }
+      }
+
+      ret_val = interpolation_filter_search(
+          x, cpi, bsize, mi_row, mi_col, &tmp_dst, &orig_dst,
+          args->single_filter, &rd, &rs, &skip_txfm_sb, &skip_sse_sb,
+          skip_build_pred, args, ref_best_rd);
+      if (args->modelled_rd != NULL && !is_comp_pred) {
+        args->modelled_rd[this_mode][ref_mv_idx][refs[0]] = rd;
+      }
+      if (ret_val != 0) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        continue;
+      } else if (cpi->sf.model_based_post_interp_filter_breakout &&
+                 ref_best_rd != INT64_MAX && (rd >> 3) * 3 > ref_best_rd) {
+        restore_dst_buf(xd, orig_dst, num_planes);
+        // Far above the reference cost: give up on this ref_mv_idx entirely.
+        if ((rd >> 3) * 2 > ref_best_rd) break;
+        continue;
+      }
+
+      if (search_jnt_comp) {
+        // if 1/2 model rd is larger than best_rd in jnt_comp mode,
+        // use jnt_comp mode, save additional search
+        if ((rd >> 3) * 4 > best_rd) {
+          restore_dst_buf(xd, orig_dst, num_planes);
+          continue;
+        }
+      }
+
+      if (!is_comp_pred)
+        args->single_filter[this_mode][refs[0]] =
+            av1_extract_interp_filter(mbmi->interp_filters, 0);
+
+      if (args->modelled_rd != NULL) {
+        if (is_comp_pred) {
+          // Prune the compound mode when its modelled cost is well above the
+          // better of the two single-reference modelled costs.
+          const int mode0 = compound_ref0_mode(this_mode);
+          const int mode1 = compound_ref1_mode(this_mode);
+          const int64_t mrd =
+              AOMMIN(args->modelled_rd[mode0][ref_mv_idx][refs[0]],
+                     args->modelled_rd[mode1][ref_mv_idx][refs[1]]);
+          if ((rd >> 3) * 6 > mrd && ref_best_rd < INT64_MAX) {
+            restore_dst_buf(xd, orig_dst, num_planes);
+            continue;
+          }
+        }
+      }
+      rd_stats->rate += compmode_interinter_cost;
+
+      if (search_jnt_comp && cpi->sf.jnt_comp_fast_tx_search && comp_idx == 0) {
+        // TODO(chengchen): this speed feature introduces big loss.
+        // Need better estimation of rate distortion.
+        int dummy_rate;
+        int64_t dummy_dist;
+        int plane_rate[MAX_MB_PLANE] = { 0 };
+        int64_t plane_sse[MAX_MB_PLANE] = { 0 };
+        int64_t plane_dist[MAX_MB_PLANE] = { 0 };
+
+        model_rd_sb_fn[MODELRD_TYPE_JNT_COMPOUND](
+            cpi, bsize, x, xd, 0, num_planes - 1, mi_row, mi_col, &dummy_rate,
+            &dummy_dist, &skip_txfm_sb, &skip_sse_sb, plane_rate, plane_sse,
+            plane_dist);
+
+        rd_stats->rate += rs;
+        rd_stats->rate += plane_rate[0] + plane_rate[1] + plane_rate[2];
+        rd_stats_y->rate = plane_rate[0];
+        rd_stats_uv->rate = plane_rate[1] + plane_rate[2];
+        rd_stats->sse = plane_sse[0] + plane_sse[1] + plane_sse[2];
+        rd_stats_y->sse = plane_sse[0];
+        rd_stats_uv->sse = plane_sse[1] + plane_sse[2];
+        rd_stats->dist = plane_dist[0] + plane_dist[1] + plane_dist[2];
+        rd_stats_y->dist = plane_dist[0];
+        rd_stats_uv->dist = plane_dist[1] + plane_dist[2];
+      } else {
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+        ret_val = motion_mode_rd(
+            cpi, x, bsize, rd_stats, rd_stats_y, rd_stats_uv, disable_skip,
+            mi_row, mi_col, args, ref_best_rd, refs, &rate_mv, &orig_dst,
+            tile_data, best_est_rd, do_tx_search, inter_modes_info);
+#else
+        ret_val = motion_mode_rd(cpi, x, bsize, rd_stats, rd_stats_y,
+                                 rd_stats_uv, disable_skip, mi_row, mi_col,
+                                 args, ref_best_rd, refs, &rate_mv, &orig_dst);
+#endif
+      }
+      mode_info[ref_mv_idx].mv.as_int = mbmi->mv[0].as_int;
+      mode_info[ref_mv_idx].rate_mv = rate_mv;
+      if (ret_val != INT64_MAX) {
+        int64_t tmp_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+        mode_info[ref_mv_idx].rd = tmp_rd;
+        if (tmp_rd < best_rd) {
+          best_rd_stats = *rd_stats;
+          best_rd_stats_y = *rd_stats_y;
+          best_rd_stats_uv = *rd_stats_uv;
+          best_rd = tmp_rd;
+          best_mbmi = *mbmi;
+          best_disable_skip = *disable_skip;
+          best_xskip = x->skip;
+          memcpy(best_blk_skip, x->blk_skip,
+                 sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
+        }
+
+        // Tighten the threshold used by the breakouts above.
+        if (tmp_rd < ref_best_rd) {
+          ref_best_rd = tmp_rd;
+        }
+      }
+      restore_dst_buf(xd, orig_dst, num_planes);
+    }
+  }
+
+  if (best_rd == INT64_MAX) return INT64_MAX;
+
+  // re-instate status of the best choice
+  *rd_stats = best_rd_stats;
+  *rd_stats_y = best_rd_stats_y;
+  *rd_stats_uv = best_rd_stats_uv;
+  *mbmi = best_mbmi;
+  *disable_skip = best_disable_skip;
+  x->skip = best_xskip;
+  assert(IMPLIES(mbmi->comp_group_idx == 1,
+                 mbmi->interinter_comp.type != COMPOUND_AVERAGE));
+  memcpy(x->blk_skip, best_blk_skip,
+         sizeof(best_blk_skip[0]) * xd->n4_h * xd->n4_w);
+
+  return RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+}
+
+static int64_t rd_pick_intrabc_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ int64_t best_rd) {  /* Intra block copy (IntraBC) search for the current block.
+ * Runs one full-pixel search in the area above and one in the area to the
+ * left, costs every valid displacement vector with and without skip, and
+ * keeps whichever candidate beats best_rd.
+ * Returns INT64_MAX at once when av1_allow_intrabc(cm) is false; otherwise
+ * returns the (possibly lowered) best_rd. On the normal exit path *mbmi,
+ * *rd_cost and x->skip hold the best candidate, or their entry values when
+ * nothing beat best_rd; x->blk_skip is always overwritten from
+ * best_blk_skip, which starts out zeroed.
+ */
+ const AV1_COMMON *const cm = &cpi->common;
+ if (!av1_allow_intrabc(cm)) return INT64_MAX;
+ const int num_planes = av1_num_planes(cm);
+
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const TileInfo *tile = &xd->tile;
+ MB_MODE_INFO *mbmi = xd->mi[0];
+ const int mi_row = -xd->mb_to_top_edge / (8 * MI_SIZE);  // Block position derived from the edge distances stored in xd.
+ const int mi_col = -xd->mb_to_left_edge / (8 * MI_SIZE);
+ const int w = block_size_wide[bsize];
+ const int h = block_size_high[bsize];
+ const int sb_row = mi_row >> cm->seq_params.mib_size_log2;
+ const int sb_col = mi_col >> cm->seq_params.mib_size_log2;
+
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ MV_REFERENCE_FRAME ref_frame = INTRA_FRAME;
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+
+ int_mv nearestmv, nearmv;
+ av1_find_best_ref_mvs_from_stack(0, mbmi_ext, ref_frame, &nearestmv, &nearmv,
+ 0);
+
+ if (nearestmv.as_int == INVALID_MV) {
+ nearestmv.as_int = 0;
+ }
+ if (nearmv.as_int == INVALID_MV) {
+ nearmv.as_int = 0;
+ }
+
+ int_mv dv_ref = nearestmv.as_int == 0 ? nearmv : nearestmv;  // Prefer nearestmv; use nearmv when nearestmv is zero.
+ if (dv_ref.as_int == 0)
+ av1_find_ref_dv(&dv_ref, tile, cm->seq_params.mib_size, mi_row, mi_col);
+ // Ref DV should not have sub-pel.
+ assert((dv_ref.as_mv.col & 7) == 0);
+ assert((dv_ref.as_mv.row & 7) == 0);
+ mbmi_ext->ref_mv_stack[INTRA_FRAME][0].this_mv = dv_ref;
+
+ struct buf_2d yv12_mb[MAX_MB_PLANE];
+ av1_setup_pred_block(xd, yv12_mb, xd->cur_buf, mi_row, mi_col, NULL, NULL,
+ num_planes);  // The prediction source is the current frame buffer (xd->cur_buf).
+ for (int i = 0; i < num_planes; ++i) {
+ xd->plane[i].pre[0] = yv12_mb[i];
+ }
+
+ enum IntrabcMotionDirection {
+ IBC_MOTION_ABOVE,
+ IBC_MOTION_LEFT,
+ IBC_MOTION_DIRECTIONS
+ };
+
+ MB_MODE_INFO best_mbmi = *mbmi;  // Best-so-far state starts as the state on entry.
+ RD_STATS best_rdcost = *rd_cost;
+ int best_skip = x->skip;
+
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE] = { 0 };
+ for (enum IntrabcMotionDirection dir = IBC_MOTION_ABOVE;
+ dir < IBC_MOTION_DIRECTIONS; ++dir) {
+ const MvLimits tmp_mv_limits = x->mv_limits;  // Saved so the limits can be restored after each search.
+ switch (dir) {
+ case IBC_MOTION_ABOVE:
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max = (tile->mi_col_end - mi_col) * MI_SIZE - w;
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ x->mv_limits.row_max =
+ (sb_row * cm->seq_params.mib_size - mi_row) * MI_SIZE - h;  // Bottom limit: the top edge of the current superblock row.
+ break;
+ case IBC_MOTION_LEFT:
+ x->mv_limits.col_min = (tile->mi_col_start - mi_col) * MI_SIZE;
+ x->mv_limits.col_max =
+ (sb_col * cm->seq_params.mib_size - mi_col) * MI_SIZE - w;  // Right limit: the left edge of the current superblock column.
+ // TODO(aconverse@google.com): Minimize the overlap between above and
+ // left areas.
+ x->mv_limits.row_min = (tile->mi_row_start - mi_row) * MI_SIZE;
+ int bottom_coded_mi_edge =
+ AOMMIN((sb_row + 1) * cm->seq_params.mib_size, tile->mi_row_end);
+ x->mv_limits.row_max = (bottom_coded_mi_edge - mi_row) * MI_SIZE - h;
+ break;
+ default: assert(0);
+ }
+ assert(x->mv_limits.col_min >= tmp_mv_limits.col_min);
+ assert(x->mv_limits.col_max <= tmp_mv_limits.col_max);
+ assert(x->mv_limits.row_min >= tmp_mv_limits.row_min);
+ assert(x->mv_limits.row_max <= tmp_mv_limits.row_max);
+ av1_set_mv_search_range(&x->mv_limits, &dv_ref.as_mv);
+
+ if (x->mv_limits.col_max < x->mv_limits.col_min ||
+ x->mv_limits.row_max < x->mv_limits.row_min) {  // Empty search window for this direction.
+ x->mv_limits = tmp_mv_limits;
+ continue;
+ }
+
+ int step_param = cpi->mv_step_param;
+ MV mvp_full = dv_ref.as_mv;
+ mvp_full.col >>= 3;  // dv_ref is in 1/8-pel units (see the asserts above); convert to full-pel.
+ mvp_full.row >>= 3;
+ int sadpb = x->sadperbit16;
+ int cost_list[5];
+ int bestsme = av1_full_pixel_search(
+ cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, 0,
+ sadpb, cond_cost_list(cpi, cost_list), &dv_ref.as_mv, INT_MAX, 1,
+ (MI_SIZE * mi_col), (MI_SIZE * mi_row), 1);
+
+ x->mv_limits = tmp_mv_limits;  // Limits are restored before the validity checks below.
+ if (bestsme == INT_MAX) continue;
+ mvp_full = x->best_mv.as_mv;
+ MV dv = { .row = mvp_full.row * 8, .col = mvp_full.col * 8 };
+ if (mv_check_bounds(&x->mv_limits, &dv)) continue;
+ if (!av1_is_dv_valid(dv, cm, xd, mi_row, mi_col, bsize,
+ cm->seq_params.mib_size_log2))
+ continue;
+
+ // DV should not have sub-pel.
+ assert((dv.col & 7) == 0);
+ assert((dv.row & 7) == 0);
+ memset(&mbmi->palette_mode_info, 0, sizeof(mbmi->palette_mode_info));
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->use_intrabc = 1;
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->mv[0].as_mv = dv;
+ mbmi->interp_filters = av1_broadcast_interp_filter(BILINEAR);
+ mbmi->skip = 0;
+ x->skip = 0;
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+
+ int *dvcost[2] = { (int *)&cpi->dv_cost[0][MV_MAX],
+ (int *)&cpi->dv_cost[1][MV_MAX] };  // Point at the middle (index MV_MAX) of each cost table.
+ // TODO(aconverse@google.com): The full motion field defining discount
+ // in MV_COST_WEIGHT is too large. Explore other values.
+ int rate_mv = av1_mv_bit_cost(&dv, &dv_ref.as_mv, cpi->dv_joint_cost,
+ dvcost, MV_COST_WEIGHT_SUB);
+ const int rate_mode = x->intrabc_cost[1];
+ RD_STATS rd_stats, rd_stats_uv;
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // Intrabc
+ select_tx_type_yrd(cpi, x, &rd_stats, bsize, mi_row, mi_col, INT64_MAX);
+ } else {
+ super_block_yrd(cpi, x, &rd_stats, bsize, INT64_MAX);
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats.skip);
+ }
+ if (num_planes > 1) {
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ av1_merge_rd_stats(&rd_stats, &rd_stats_uv);
+ }
+#if CONFIG_RD_DEBUG
+ mbmi->rd_stats = rd_stats;
+#endif
+
+ const int skip_ctx = av1_get_skip_context(xd);
+
+ RD_STATS rdc_noskip;  // Candidate 1: code the residual (skip flag 0).
+ av1_init_rd_stats(&rdc_noskip);
+ rdc_noskip.rate =
+ rate_mode + rate_mv + rd_stats.rate + x->skip_cost[skip_ctx][0];
+ rdc_noskip.dist = rd_stats.dist;
+ rdc_noskip.rdcost = RDCOST(x->rdmult, rdc_noskip.rate, rdc_noskip.dist);
+ if (rdc_noskip.rdcost < best_rd) {
+ best_rd = rdc_noskip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_noskip;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ }
+
+ if (!xd->lossless[mbmi->segment_id]) {  // Candidate 2: skip the residual; only tried when not lossless.
+ x->skip = 1;
+ mbmi->skip = 1;
+ RD_STATS rdc_skip;
+ av1_init_rd_stats(&rdc_skip);
+ rdc_skip.rate = rate_mode + rate_mv + x->skip_cost[skip_ctx][1];
+ rdc_skip.dist = rd_stats.sse;  // With no residual coded the distortion is the prediction SSE.
+ rdc_skip.rdcost = RDCOST(x->rdmult, rdc_skip.rate, rdc_skip.dist);
+ if (rdc_skip.rdcost < best_rd) {
+ best_rd = rdc_skip.rdcost;
+ best_mbmi = *mbmi;
+ best_skip = x->skip;
+ best_rdcost = rdc_skip;
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);
+ }
+ }
+ }
+ *mbmi = best_mbmi;  // Re-instate the best state found (or the entry state).
+ *rd_cost = best_rdcost;
+ x->skip = best_skip;
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * xd->n4_h * xd->n4_w);  // NOTE(review): copies zeros when no candidate won — confirm callers tolerate this.
+ return best_rd;
+}
+
+// Chooses the best intra coding of the block at (mi_row, mi_col).
+//
+// Luma is searched first with rd_pick_intra_sby_mode(). When its cost beats
+// best_rd, chroma is searched (if the frame has chroma planes and chroma RD
+// is not skipped) and the combined cost is written to *rd_cost; otherwise
+// rd_cost->rate is set to INT_MAX. Intra block copy is then tried through
+// rd_pick_intrabc_mode_sb() against the best cost so far.
+//
+// When a valid mode exists (rd_cost->rate != INT_MAX) the chosen mode info
+// and its extension data are saved into ctx->mic and ctx->mbmi_ext.
+void av1_rd_pick_intra_mode_sb(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+                               int mi_col, RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                               PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_planes = av1_num_planes(cm);
+  int rate_y = 0, rate_y_tokenonly = 0, y_skip = 0;
+  int rate_uv = 0, rate_uv_tokenonly = 0, uv_skip = 0;
+  int64_t dist_y = 0, dist_uv = 0;
+
+  // Start from a plain intra block.
+  ctx->skip = 0;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->use_intrabc = 0;
+  mbmi->mv[0].as_int = 0;
+
+  const int64_t intra_yrd =
+      rd_pick_intra_sby_mode(cpi, x, mi_row, mi_col, &rate_y, &rate_y_tokenonly,
+                             &dist_y, &y_skip, bsize, best_rd, ctx);
+
+  if (intra_yrd >= best_rd) {
+    // Luma alone is already no better than the budget: mark as invalid.
+    rd_cost->rate = INT_MAX;
+  } else {
+    // Only store reconstructed luma when there's chroma RDO. When there's no
+    // chroma RDO, the reconstructed luma will be stored in encode_superblock().
+    xd->cfl.is_chroma_reference =
+        is_chroma_reference(mi_row, mi_col, bsize, cm->seq_params.subsampling_x,
+                            cm->seq_params.subsampling_y);
+    xd->cfl.store_y = store_cfl_required_rdo(cm, x);
+    if (xd->cfl.store_y) {
+      // Restore reconstructed luma values.
+      memcpy(x->blk_skip, ctx->blk_skip,
+             sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+      av1_encode_intra_block_plane(cpi, x, bsize, AOM_PLANE_Y,
+                                   cpi->optimize_seg_arr[mbmi->segment_id],
+                                   mi_row, mi_col);
+      xd->cfl.store_y = 0;
+    }
+
+    if (num_planes > 1) {
+      const TX_SIZE max_uv_tx_size = av1_get_tx_size(AOM_PLANE_U, xd);
+      init_sbuv_mode(mbmi);
+      if (!x->skip_chroma_rd) {
+        rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly, &dist_uv,
+                                &uv_skip, bsize, max_uv_tx_size);
+      }
+    }
+
+    // When everything is skippable the token-only rates are removed and the
+    // "skip" flag cost is charged instead of the "no skip" one.
+    const int all_skippable = y_skip && (uv_skip || x->skip_chroma_rd);
+    const int skip_ctx = av1_get_skip_context(xd);
+    rd_cost->rate = all_skippable
+                        ? rate_y + rate_uv - rate_y_tokenonly -
+                              rate_uv_tokenonly + x->skip_cost[skip_ctx][1]
+                        : rate_y + rate_uv + x->skip_cost[skip_ctx][0];
+    rd_cost->dist = dist_y + dist_uv;
+    rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+  }
+
+  if (rd_cost->rate != INT_MAX && rd_cost->rdcost < best_rd) {
+    best_rd = rd_cost->rdcost;
+  }
+
+  const int64_t intrabc_rd =
+      rd_pick_intrabc_mode_sb(cpi, x, rd_cost, bsize, best_rd);
+  if (intrabc_rd < best_rd) {
+    // Intra block copy won: keep its skip decisions in the context.
+    ctx->skip = x->skip;
+    memcpy(ctx->blk_skip, x->blk_skip,
+           sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+    assert(rd_cost->rate != INT_MAX);
+  }
+
+  if (rd_cost->rate != INT_MAX) {
+    ctx->mic = *xd->mi[0];
+    ctx->mbmi_ext = *x->mbmi_ext;
+  }
+}
+
+// Rebuilds the chroma palette colour-index map of the current block.
+//
+// Every chroma sample position contributes one (U, V) pair to the k-means
+// data buffer; the stored U/V palette colours are used as centroids;
+// av1_calc_indices() then assigns an index to each pair and the map is
+// extended to the full plane block size by extend_palette_color_map().
+static void restore_uv_color_map(const AV1_COMP *const cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  // The U-plane stride is used for both chroma planes.
+  const int uv_stride = x->plane[1].src.stride;
+  const uint8_t *const u8 = x->plane[1].src.buf;
+  const uint8_t *const v8 = x->plane[2].src.buf;
+  int *const kmeans_data = x->palette_buffer->kmeans_data_buf;
+  uint8_t *const index_map = xd->plane[1].color_index_map;
+  int centroids[2 * PALETTE_MAX_SIZE];
+  int blk_w, blk_h, rows, cols;
+
+  av1_get_block_dimensions(mbmi->sb_type, 1, xd, &blk_w, &blk_h, &rows, &cols);
+
+  // Interleave the samples as U0 V0 U1 V1 ... in raster order.
+  int *out = kmeans_data;
+  if (cpi->common.seq_params.use_highbitdepth) {
+    const uint16_t *const u16 = CONVERT_TO_SHORTPTR(u8);
+    const uint16_t *const v16 = CONVERT_TO_SHORTPTR(v8);
+    for (int row = 0; row < rows; ++row) {
+      for (int col = 0; col < cols; ++col) {
+        *out++ = u16[row * uv_stride + col];
+        *out++ = v16[row * uv_stride + col];
+      }
+    }
+  } else {
+    for (int row = 0; row < rows; ++row) {
+      for (int col = 0; col < cols; ++col) {
+        *out++ = u8[row * uv_stride + col];
+        *out++ = v8[row * uv_stride + col];
+      }
+    }
+  }
+
+  // palette_colors holds the planes back to back; slots 1 and 2 are U and V.
+  for (int color = 0; color < pmi->palette_size[1]; ++color) {
+    centroids[color * 2] = pmi->palette_colors[PALETTE_MAX_SIZE + color];
+    centroids[color * 2 + 1] =
+        pmi->palette_colors[2 * PALETTE_MAX_SIZE + color];
+  }
+
+  av1_calc_indices(kmeans_data, centroids, index_map, rows * cols,
+                   pmi->palette_size[1], 2);
+  extend_palette_color_map(index_map, cols, rows, blk_w, blk_h);
+}
+
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+ const MACROBLOCKD *xd, int mi_row,
+ int mi_col, const uint8_t *above,
+ int above_stride, const uint8_t *left,
+ int left_stride);  // Forward declaration only; the function body is not defined at this point.
+
+static const int ref_frame_flag_list[REF_FRAMES] = { 0,  // Slot 0 carries no flag.
+ AOM_LAST_FLAG,
+ AOM_LAST2_FLAG,
+ AOM_LAST3_FLAG,
+ AOM_GOLD_FLAG,
+ AOM_BWD_FLAG,
+ AOM_ALT2_FLAG,
+ AOM_ALT_FLAG };  // NOTE(review): presumably indexed by MV_REFERENCE_FRAME — confirm at the use sites.
+
+// Evaluates skip mode for the current block and adopts it when its RD cost
+// is no worse than the best intra/inter mode found so far.
+//
+// Skip mode is tried as NEAREST_NEARESTMV with the two reference frames
+// LAST_FRAME + cm->ref_frame_idx_0 and LAST_FRAME + cm->ref_frame_idx_1.
+// The function returns without changing *rd_cost or *search_state when
+// either index is INVALID_IDX, when that mode/reference combination has no
+// prediction mode index, when the needed reference MV data is unavailable,
+// or when build_cur_mv() fails. Note that x->compound_idx is set to 1 and
+// *mbmi may be partly modified even on those early returns.
+//
+// rd_cost       in: cost of the best mode so far; out: skip mode cost if
+//               skip mode is adopted.
+// search_state  best-mode bookkeeping, updated if skip mode is adopted.
+// yv12_mb       per-reference-frame prediction buffers.
+static void rd_pick_skip_mode(RD_STATS *rd_cost,
+                              InterModeSearchState *search_state,
+                              const AV1_COMP *const cpi, MACROBLOCK *const x,
+                              BLOCK_SIZE bsize, int mi_row, int mi_col,
+                              struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+  const AV1_COMMON *const cm = &cpi->common;
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  x->compound_idx = 1;  // COMPOUND_AVERAGE
+  RD_STATS skip_mode_rd_stats;
+  av1_invalid_rd_stats(&skip_mode_rd_stats);
+
+  if (cm->ref_frame_idx_0 == INVALID_IDX ||
+      cm->ref_frame_idx_1 == INVALID_IDX) {
+    return;
+  }
+
+  const MV_REFERENCE_FRAME ref_frame = LAST_FRAME + cm->ref_frame_idx_0;
+  const MV_REFERENCE_FRAME second_ref_frame = LAST_FRAME + cm->ref_frame_idx_1;
+  const PREDICTION_MODE this_mode = NEAREST_NEARESTMV;
+  const int mode_index =
+      get_prediction_mode_idx(this_mode, ref_frame, second_ref_frame);
+
+  if (mode_index == -1) {
+    return;
+  }
+
+  mbmi->mode = this_mode;
+  mbmi->uv_mode = UV_DC_PRED;
+  mbmi->ref_frame[0] = ref_frame;
+  mbmi->ref_frame[1] = second_ref_frame;
+  const uint8_t ref_frame_type = av1_ref_frame_type(mbmi->ref_frame);
+  // A count of UINT8_MAX is treated as "not available". The compound
+  // reference MV list is built here only if both single-reference lists
+  // are available.
+  if (x->mbmi_ext->ref_mv_count[ref_frame_type] == UINT8_MAX) {
+    if (x->mbmi_ext->ref_mv_count[ref_frame] == UINT8_MAX ||
+        x->mbmi_ext->ref_mv_count[second_ref_frame] == UINT8_MAX) {
+      return;
+    }
+    MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+    av1_find_mv_refs(cm, xd, mbmi, ref_frame_type, mbmi_ext->ref_mv_count,
+                     mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+                     mi_col, mbmi_ext->mode_context);
+  }
+
+  assert(this_mode == NEAREST_NEARESTMV);
+  if (!build_cur_mv(mbmi->mv, this_mode, cm, x)) {
+    return;
+  }
+
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+  mbmi->comp_group_idx = 0;
+  mbmi->compound_idx = x->compound_idx;
+  mbmi->interinter_comp.type = COMPOUND_AVERAGE;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->ref_mv_idx = 0;
+  mbmi->skip_mode = mbmi->skip = 1;
+
+  set_default_interp_filters(mbmi, cm->interp_filter);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+  for (int i = 0; i < num_planes; i++) {
+    xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+    xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+  }
+
+  BUFFER_SET orig_dst;
+  for (int i = 0; i < num_planes; i++) {
+    orig_dst.plane[i] = xd->plane[i].dst.buf;
+    orig_dst.stride[i] = xd->plane[i].dst.stride;
+  }
+
+  // Obtain the rdcost for skip_mode.
+  skip_mode_rd(&skip_mode_rd_stats, cpi, x, bsize, mi_row, mi_col, &orig_dst);
+
+  // Compare the use of skip_mode with the best intra/inter mode obtained.
+  // The competing mode additionally pays for signalling skip_mode == 0.
+  const int skip_mode_ctx = av1_get_skip_mode_context(xd);
+  const int64_t best_intra_inter_mode_cost =
+      (rd_cost->dist < INT64_MAX && rd_cost->rate < INT32_MAX)
+          ? RDCOST(x->rdmult,
+                   rd_cost->rate + x->skip_mode_cost[skip_mode_ctx][0],
+                   rd_cost->dist)
+          : INT64_MAX;
+
+  if (skip_mode_rd_stats.rdcost <= best_intra_inter_mode_cost) {
+    assert(mode_index != -1);
+    // Start from the skip-mode block info; the fields below are then set
+    // explicitly. (A store to best_mbmode.skip_mode used to precede this
+    // copy; it was dead because the copy overwrites the whole struct.)
+    search_state->best_mbmode = *mbmi;
+
+    search_state->best_mbmode.skip_mode = search_state->best_mbmode.skip = 1;
+    search_state->best_mbmode.mode = NEAREST_NEARESTMV;
+    search_state->best_mbmode.ref_frame[0] = mbmi->ref_frame[0];
+    search_state->best_mbmode.ref_frame[1] = mbmi->ref_frame[1];
+    search_state->best_mbmode.mv[0].as_int = mbmi->mv[0].as_int;
+    search_state->best_mbmode.mv[1].as_int = mbmi->mv[1].as_int;
+    search_state->best_mbmode.ref_mv_idx = 0;
+
+    // Set up tx_size related variables for skip-specific loop filtering.
+    search_state->best_mbmode.tx_size =
+        block_signals_txsize(bsize) ? tx_size_from_tx_mode(bsize, cm->tx_mode)
+                                    : max_txsize_rect_lookup[bsize];
+    memset(search_state->best_mbmode.inter_tx_size,
+           search_state->best_mbmode.tx_size,
+           sizeof(search_state->best_mbmode.inter_tx_size));
+    set_txfm_ctxs(search_state->best_mbmode.tx_size, xd->n4_w, xd->n4_h,
+                  search_state->best_mbmode.skip && is_inter_block(mbmi), xd);
+
+    // Set up color-related variables for skip mode.
+    search_state->best_mbmode.uv_mode = UV_DC_PRED;
+    search_state->best_mbmode.palette_mode_info.palette_size[0] = 0;
+    search_state->best_mbmode.palette_mode_info.palette_size[1] = 0;
+
+    search_state->best_mbmode.comp_group_idx = 0;
+    search_state->best_mbmode.compound_idx = x->compound_idx;
+    search_state->best_mbmode.interinter_comp.type = COMPOUND_AVERAGE;
+    search_state->best_mbmode.motion_mode = SIMPLE_TRANSLATION;
+
+    search_state->best_mbmode.interintra_mode =
+        (INTERINTRA_MODE)(II_DC_PRED - 1);
+    search_state->best_mbmode.filter_intra_mode_info.use_filter_intra = 0;
+
+    set_default_interp_filters(&search_state->best_mbmode, cm->interp_filter);
+
+    search_state->best_mode_index = mode_index;
+
+    // Update rd_cost
+    rd_cost->rate = skip_mode_rd_stats.rate;
+    rd_cost->dist = rd_cost->sse = skip_mode_rd_stats.dist;
+    rd_cost->rdcost = skip_mode_rd_stats.rdcost;
+
+    search_state->best_rd = rd_cost->rdcost;
+    search_state->best_skip2 = 1;
+    search_state->best_mode_skippable = (skip_mode_rd_stats.sse == 0);
+
+    x->skip = 1;
+  }
+}
+
+// speed feature: fast intra/inter transform type search
+// Used for speed >= 2
+// When this speed feature is on, in rd mode search, only DCT is used.
+// After the mode is determined, this function is called, to select
+// transform types and get accurate rdcost.
+//
+// The refinement only runs when the block is not lossless, a best mode was
+// found (best_mode_index >= 0) and the fast tx-type search flag matching the
+// kind of the best mode (inter or intra) equals 1.
+//
+// In that case the luma/chroma RD stats of *best_mbmode are recomputed with
+// x->use_default_{inter,intra}_tx_type cleared. If the recomputed RD cost is
+// lower than the one implied by best_rate_y + best_rate_uv and rd_cost->dist,
+// the transform fields of *best_mbmode, ctx->blk_skip, *rd_cost and
+// *best_skip2 are updated; otherwise they are left untouched.
+//
+// Side effects even when nothing is adopted: *xd->mi[0] is overwritten with
+// *best_mbmode, the reference pointers and xd->plane[].pre[] are re-set, and
+// the use_default_*_tx_type flags stay at 0.
+static void sf_refine_fast_tx_type_search(
+ const AV1_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int best_mode_index, MB_MODE_INFO *best_mbmode,
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE], int best_rate_y,
+ int best_rate_uv, int *best_skip2) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int num_planes = av1_num_planes(cm);
+
+ if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+ ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+ is_inter_mode(best_mbmode->mode)) ||
+ (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+ !is_inter_mode(best_mbmode->mode)))) {
+ int skip_blk = 0;
+ RD_STATS rd_stats_y, rd_stats_uv;
+
+ // Clear the flags that set_params_rd_pick_inter_mode() raises when the
+ // fast tx-type search speed features are enabled.
+ x->use_default_inter_tx_type = 0;
+ x->use_default_intra_tx_type = 0;
+
+ // Re-evaluate the winning mode: make it the current block's mode info.
+ *mbmi = *best_mbmode;
+
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ for (int i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (has_second_ref(mbmi))
+ xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ if (is_inter_mode(mbmi->mode)) {
+ // Inter: rebuild the predictor (plus the OBMC one when the motion mode
+ // is OBMC_CAUSAL) and the luma residual before the transform search.
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ av1_subtract_plane(x, bsize, 0);
+ if (cm->tx_mode == TX_MODE_SELECT && !xd->lossless[mbmi->segment_id]) {
+ // av1_rd_pick_inter_mode_sb
+ select_tx_type_yrd(cpi, x, &rd_stats_y, bsize, mi_row, mi_col,
+ INT64_MAX);
+ assert(rd_stats_y.rate != INT_MAX);
+ } else {
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ // A single tx_size was chosen: replicate it into inter_tx_size and
+ // give every luma block the same skip flag.
+ memset(mbmi->inter_tx_size, mbmi->tx_size, sizeof(mbmi->inter_tx_size));
+ for (int i = 0; i < xd->n4_h * xd->n4_w; ++i)
+ set_blk_skip(x, 0, i, rd_stats_y.skip);
+ }
+ if (num_planes > 1) {
+ inter_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX, INT64_MAX,
+ FTXS_NONE);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+ } else {
+ // Intra: luma and (if present) chroma are evaluated without an RD limit.
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, INT64_MAX);
+ if (num_planes > 1) {
+ super_block_uvrd(cpi, x, &rd_stats_uv, bsize, INT64_MAX);
+ } else {
+ av1_init_rd_stats(&rd_stats_uv);
+ }
+ }
+
+ // If coding the residual costs more, in RD terms, than dropping it
+ // (rate 0, distortion equal to the sse), treat the block as skipped and
+ // pay only the skip flag cost.
+ if (RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist)) >
+ RDCOST(x->rdmult, 0, (rd_stats_y.sse + rd_stats_uv.sse))) {
+ skip_blk = 1;
+ rd_stats_y.rate = x->skip_cost[av1_get_skip_context(xd)][1];
+ rd_stats_uv.rate = 0;
+ rd_stats_y.dist = rd_stats_y.sse;
+ rd_stats_uv.dist = rd_stats_uv.sse;
+ } else {
+ skip_blk = 0;
+ rd_stats_y.rate += x->skip_cost[av1_get_skip_context(xd)][0];
+ }
+
+ // Adopt the refined transform decisions only if they beat the RD cost of
+ // the previous best; rd_cost->rate is adjusted by the rate difference.
+ if (RDCOST(x->rdmult, best_rate_y + best_rate_uv, rd_cost->dist) >
+ RDCOST(x->rdmult, rd_stats_y.rate + rd_stats_uv.rate,
+ (rd_stats_y.dist + rd_stats_uv.dist))) {
+ best_mbmode->tx_size = mbmi->tx_size;
+ av1_copy(best_mbmode->inter_tx_size, mbmi->inter_tx_size);
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ av1_copy(best_mbmode->txk_type, mbmi->txk_type);
+ rd_cost->rate +=
+ (rd_stats_y.rate + rd_stats_uv.rate - best_rate_y - best_rate_uv);
+ rd_cost->dist = rd_stats_y.dist + rd_stats_uv.dist;
+ rd_cost->rdcost = RDCOST(x->rdmult, rd_cost->rate, rd_cost->dist);
+ *best_skip2 = skip_blk;
+ }
+ }
+}
+
+// Please add/modify parameter setting in this function, making it consistent
+// and easy to read and maintain.
+//
+// Prepares the per-block state used by the inter mode search:
+//  - resets args->single_filter and points args->above_pred_buf /
+//    args->left_pred_buf into x->above_pred_buf / x->left_pred_buf;
+//  - fills ref_costs_single / ref_costs_comp via estimate_ref_frame_costs();
+//  - resets per-reference entries of x->pred_mv_sad and x->mbmi_ext, then
+//    sets up buffers / MV candidates for the references that are enabled in
+//    cpi->ref_frame_flags (writing yv12_mb);
+//  - when the block has overlappable neighbors and its size allows motion
+//    variation, builds the above/left predictions and calls
+//    calc_target_weighted_pred();
+//  - initializes the outputs ref_frame_skip_mask[2] and
+//    mode_skip_mask[REF_FRAMES] from reference availability, segmentation
+//    and speed features;
+//  - sets x->use_default_{intra,inter}_tx_type and, if requested, resets
+//    x->interp_filter_stats_idx.
+// skip_ref_frame_mask is only consulted when mbmi->partition is neither
+// PARTITION_NONE nor PARTITION_SPLIT.
+static void set_params_rd_pick_inter_mode(
+ const AV1_COMP *cpi, MACROBLOCK *x, HandleInterModeArgs *args,
+ BLOCK_SIZE bsize, int mi_row, int mi_col, uint16_t ref_frame_skip_mask[2],
+ uint32_t mode_skip_mask[REF_FRAMES], int skip_ref_frame_mask,
+ unsigned int ref_costs_single[REF_FRAMES],
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES],
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE]) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
+ const struct segmentation *const seg = &cm->seg;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ unsigned char segment_id = mbmi->segment_id;
+ // Dimensions handed to the above (1) and left (2) prediction builders:
+ // full width / half height for above, half width / full height for left.
+ int dst_width1[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+ int dst_width2[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height1[MAX_MB_PLANE] = { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1,
+ MAX_SB_SIZE >> 1 };
+ int dst_height2[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+ for (int i = 0; i < MB_MODE_COUNT; ++i)
+ for (int k = 0; k < REF_FRAMES; ++k) args->single_filter[i][k] = SWITCHABLE;
+
+ // Carve the three per-plane regions out of the shared above/left buffers.
+ // In the high bit depth case the plane offsets are scaled by
+ // sizeof(uint16_t) and the pointers go through CONVERT_TO_BYTEPTR.
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int len = sizeof(uint16_t);
+ args->above_pred_buf[0] = CONVERT_TO_BYTEPTR(x->above_pred_buf);
+ args->above_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->above_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->above_pred_buf + MAX_SB_SQUARE * len);
+ args->left_pred_buf[0] = CONVERT_TO_BYTEPTR(x->left_pred_buf);
+ args->left_pred_buf[1] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + (MAX_SB_SQUARE >> 1) * len);
+ args->left_pred_buf[2] =
+ CONVERT_TO_BYTEPTR(x->left_pred_buf + MAX_SB_SQUARE * len);
+ } else {
+ args->above_pred_buf[0] = x->above_pred_buf;
+ args->above_pred_buf[1] = x->above_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->above_pred_buf[2] = x->above_pred_buf + MAX_SB_SQUARE;
+ args->left_pred_buf[0] = x->left_pred_buf;
+ args->left_pred_buf[1] = x->left_pred_buf + (MAX_SB_SQUARE >> 1);
+ args->left_pred_buf[2] = x->left_pred_buf + MAX_SB_SQUARE;
+ }
+
+ av1_collect_neighbors_ref_counts(xd);
+
+ estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+ ref_costs_comp);
+
+ // Single references: reset the per-reference state, then set up buffers and
+ // MV candidates for every reference enabled in cpi->ref_frame_flags.
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ x->pred_mv_sad[ref_frame] = INT_MAX;
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ if (cpi->ref_frame_flags & ref_frame_flag_list[ref_frame]) {
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ // A reference flagged in skip_ref_frame_mask is still set up when a
+ // non-skipped reference pair (looked up in ref_frame_map) uses it.
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ int skip = 1;
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_frame || rf[1] == ref_frame) {
+ skip = 0;
+ break;
+ }
+ }
+ }
+ if (skip) continue;
+ }
+ }
+ assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
+ setup_buffer_ref_mvs_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
+ }
+ // ref_frame is now ALTREF_FRAME + 1: continue with the reference-pair types
+ // whose two frames are looked up in ref_frame_map.
+ for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+ x->mbmi_ext->mode_context[ref_frame] = 0;
+ mbmi_ext->ref_mv_count[ref_frame] = UINT8_MAX;
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[ref_frame - REF_FRAMES];
+ // Both frames of the pair must be enabled.
+ if (!((cpi->ref_frame_flags & ref_frame_flag_list[rf[0]]) &&
+ (cpi->ref_frame_flags & ref_frame_flag_list[rf[1]]))) {
+ continue;
+ }
+
+ if (mbmi->partition != PARTITION_NONE &&
+ mbmi->partition != PARTITION_SPLIT) {
+ if (skip_ref_frame_mask & (1 << ref_frame)) {
+ continue;
+ }
+ }
+ av1_find_mv_refs(cm, xd, mbmi, ref_frame, mbmi_ext->ref_mv_count,
+ mbmi_ext->ref_mv_stack, NULL, mbmi_ext->global_mvs, mi_row,
+ mi_col, mbmi_ext->mode_context);
+ }
+
+ av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+
+ // Build the above/left neighbor predictions only when the block has
+ // overlappable neighbors and its size allows motion variation. The dst
+ // planes are re-pointed at the new frame buffer afterwards.
+ if (check_num_overlappable_neighbors(mbmi) &&
+ is_motion_variation_allowed_bsize(bsize)) {
+ av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+ args->above_pred_buf, dst_width1,
+ dst_height1, args->above_pred_stride);
+ av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+ args->left_pred_buf, dst_width2,
+ dst_height2, args->left_pred_stride);
+ av1_setup_dst_planes(xd->plane, bsize, get_frame_new_buffer(cm), mi_row,
+ mi_col, 0, num_planes);
+ calc_target_weighted_pred(
+ cm, x, xd, mi_row, mi_col, args->above_pred_buf[0],
+ args->above_pred_stride[0], args->left_pred_buf[0],
+ args->left_pred_stride[0]);
+ }
+
+ // Smallest pred_mv_sad over the single references; entries that were not
+ // set up above are still INT_MAX.
+ int min_pred_mv_sad = INT_MAX;
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame)
+ min_pred_mv_sad = AOMMIN(min_pred_mv_sad, x->pred_mv_sad[ref_frame]);
+
+ // Start from empty masks and add restrictions below.
+ for (int i = 0; i < 2; ++i) {
+ ref_frame_skip_mask[i] = 0;
+ }
+ memset(mode_skip_mask, 0, REF_FRAMES * sizeof(*mode_skip_mask));
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame])) {
+ // Skip checking missing references in both single and compound reference
+ // modes. Note that a mode will be skipped iff both reference frames
+ // are masked out.
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ } else {
+ // Skip fixed mv modes for poor references, i.e. those whose
+ // pred_mv_sad shifted right by 2 still exceeds the minimum.
+ if ((x->pred_mv_sad[ref_frame] >> 2) > min_pred_mv_sad) {
+ mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
+ }
+ }
+ // If the segment reference frame feature is enabled....
+ // then do nothing if the current ref frame is not allowed..
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
+ ref_frame_skip_mask[0] |= (1 << ref_frame);
+ ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // Disable this drop out case if the ref frame
+ // segment level feature is enabled for this segment. This is to
+ // prevent the possibility that we end up unable to pick any mode.
+ if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
+ // Only consider GLOBALMV/ALTREF_FRAME for alt ref frame,
+ // unless ARNR filtering is enabled in which case we want
+ // an unfiltered alternative. We allow near/nearest as well
+ // because they may result in zero-zero MVs but be cheaper.
+ if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
+ ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << LAST2_FRAME) |
+ (1 << LAST3_FRAME) | (1 << BWDREF_FRAME) |
+ (1 << ALTREF2_FRAME) | (1 << GOLDEN_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ // TODO(zoeliu): To further explore whether following needs to be done for
+ // BWDREF_FRAME as well.
+ mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
+ const MV_REFERENCE_FRAME tmp_ref_frames[2] = { ALTREF_FRAME, NONE_FRAME };
+ int_mv near_mv, nearest_mv, global_mv;
+ get_this_mv(&nearest_mv, NEARESTMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&near_mv, NEARMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+ get_this_mv(&global_mv, GLOBALMV, 0, 0, tmp_ref_frames, x->mbmi_ext);
+
+ // NEARMV / NEARESTMV are masked out when their MV differs from the
+ // global MV.
+ if (near_mv.as_int != global_mv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
+ if (nearest_mv.as_int != global_mv.as_int)
+ mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+ }
+ }
+
+ // With alt_ref_search_fp on a source alt-ref frame: allow every mode on
+ // ALTREF_FRAME and mask out all other first references.
+ if (cpi->rc.is_src_frame_alt_ref) {
+ if (sf->alt_ref_search_fp) {
+ assert(cpi->ref_frame_flags & ref_frame_flag_list[ALTREF_FRAME]);
+ mode_skip_mask[ALTREF_FRAME] = 0;
+ ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+ ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+ }
+ }
+
+ // On non-shown frames, drop all inter modes on ALTREF_FRAME when its
+ // pred_mv_sad is more than twice that of GOLDEN_FRAME.
+ if (sf->alt_ref_search_fp)
+ if (!cm->show_frame && x->pred_mv_sad[GOLDEN_FRAME] < INT_MAX)
+ if (x->pred_mv_sad[ALTREF_FRAME] > (x->pred_mv_sad[GOLDEN_FRAME] << 1))
+ mode_skip_mask[ALTREF_FRAME] |= INTER_ALL;
+
+ // Drop all inter modes on GOLDEN_FRAME when half its pred_mv_sad still
+ // exceeds that of LAST_FRAME (shown, non-alt-ref-source frames with
+ // frames_since_golden >= 3 only).
+ if (sf->adaptive_mode_search) {
+ if (cm->show_frame && !cpi->rc.is_src_frame_alt_ref &&
+ cpi->rc.frames_since_golden >= 3)
+ if ((x->pred_mv_sad[GOLDEN_FRAME] >> 1) > x->pred_mv_sad[LAST_FRAME])
+ mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL;
+ }
+
+ // Blocks larger than sf->max_intra_bsize do not search INTRA_FRAME.
+ if (bsize > sf->max_intra_bsize) {
+ ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+ ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+ }
+
+ // Mask out every intra mode not present in the speed feature's per-tx-size
+ // luma mode mask.
+ mode_skip_mask[INTRA_FRAME] |=
+ ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
+ if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+ x->use_default_intra_tx_type = 1;
+ else
+ x->use_default_intra_tx_type = 0;
+
+ if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+ x->use_default_inter_tx_type = 1;
+ else
+ x->use_default_inter_tx_type = 0;
+ if (cpi->sf.skip_repeat_interpolation_filter_search) {
+ x->interp_filter_stats_idx[0] = 0;
+ x->interp_filter_stats_idx[1] = 0;
+ }
+}
+
+// Evaluates a luma palette candidate (mode DC_PRED on INTRA_FRAME) for the
+// current block and records it as the new best mode when its RD cost is lower
+// than search_state->best_rd.
+//
+// On a win, updates *rd_cost, ctx->blk_skip and the best_* fields of
+// *search_state (best_rd, best_mbmode, best_mode_index, best_skip2,
+// best_mode_skippable). Returns early, without recording anything, when no
+// luma palette was selected (pmi->palette_size[0] == 0) or when
+// super_block_yrd() reports rate == INT_MAX.
+//
+// *mbmi is modified in all cases (mode, uv_mode and ref_frame are set up
+// front). NOTE(review): pmi is presumably &mbmi->palette_mode_info - confirm
+// against the caller.
+static void search_palette_mode(const AV1_COMP *cpi, MACROBLOCK *x, int mi_row,
+ int mi_col, RD_STATS *rd_cost,
+ PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+ MB_MODE_INFO *const mbmi,
+ PALETTE_MODE_INFO *const pmi,
+ unsigned int *ref_costs_single,
+ InterModeSearchState *search_state) {
+ const AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int rate2 = 0;
+ int64_t distortion2 = 0, best_rd_palette = search_state->best_rd, this_rd,
+ best_model_rd_palette = INT64_MAX;
+ int skippable = 0, rate_overhead_palette = 0;
+ RD_STATS rd_stats_y;
+ TX_SIZE uv_tx = TX_4X4;
+ uint8_t *const best_palette_color_map =
+ x->palette_buffer->best_palette_color_map;
+ uint8_t *const color_map = xd->plane[0].color_index_map;
+ MB_MODE_INFO best_mbmi_palette = *mbmi;
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+
+ mbmi->mode = DC_PRED;
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->ref_frame[0] = INTRA_FRAME;
+ mbmi->ref_frame[1] = NONE_FRAME;
+ rate_overhead_palette = rd_pick_palette_intra_sby(
+ cpi, x, bsize, mi_row, mi_col, intra_mode_cost[DC_PRED],
+ &best_mbmi_palette, best_palette_color_map, &best_rd_palette,
+ &best_model_rd_palette, NULL, NULL, NULL, NULL, ctx, best_blk_skip);
+ // No luma palette was selected: nothing to evaluate.
+ if (pmi->palette_size[0] == 0) return;
+
+ // Restore the block-skip flags and color map of the best palette before
+ // re-running the luma RD evaluation.
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(best_blk_skip[0]) * bsize_to_num_blk(bsize));
+
+ memcpy(color_map, best_palette_color_map,
+ rows * cols * sizeof(best_palette_color_map[0]));
+ super_block_yrd(cpi, x, &rd_stats_y, bsize, search_state->best_rd);
+ if (rd_stats_y.rate == INT_MAX) return;
+
+ skippable = rd_stats_y.skip;
+ distortion2 = rd_stats_y.dist;
+ rate2 = rd_stats_y.rate + rate_overhead_palette;
+ rate2 += ref_costs_single[INTRA_FRAME];
+ if (num_planes > 1) {
+ uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ // Chroma intra results are cached per uv_tx in search_state; a
+ // rate_uv_intra of INT_MAX marks an entry that has not been computed yet.
+ if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+ choose_intra_uv_mode(
+ cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+ &search_state->rate_uv_tokenonly[uv_tx],
+ &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+ &search_state->mode_uv[uv_tx]);
+ search_state->pmi_uv[uv_tx] = *pmi;
+ search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+ }
+ // Apply the cached chroma decision to the current mode info.
+ mbmi->uv_mode = search_state->mode_uv[uv_tx];
+ pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+ if (pmi->palette_size[1] > 0) {
+ // Copy the two chroma color sets, which follow the first
+ // PALETTE_MAX_SIZE entries of palette_colors.
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+ skippable = skippable && search_state->skip_uvs[uv_tx];
+ distortion2 += search_state->dist_uvs[uv_tx];
+ rate2 += search_state->rate_uv_intra[uv_tx];
+ }
+
+ if (skippable) {
+ // Skipped block: remove the token rates and pay only the skip flag cost.
+ rate2 -= rd_stats_y.rate;
+ if (num_planes > 1) rate2 -= search_state->rate_uv_tokenonly[uv_tx];
+ rate2 += x->skip_cost[av1_get_skip_context(xd)][1];
+ } else {
+ rate2 += x->skip_cost[av1_get_skip_context(xd)][0];
+ }
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ if (this_rd < search_state->best_rd) {
+ // NOTE(review): hard-coded mode index; presumably the av1_mode_order
+ // entry that stands in for the palette candidate - confirm.
+ search_state->best_mode_index = 3;
+ mbmi->mv[0].as_int = 0;
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ search_state->best_rd = this_rd;
+ search_state->best_mbmode = *mbmi;
+ search_state->best_skip2 = 0;
+ search_state->best_mode_skippable = skippable;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+}
+
+// Resets *search_state to its starting values ahead of the inter mode search
+// for one block.
+//  - best_rd starts at best_rd_so_far; the remaining "best" fields are
+//    cleared or set to INT_MAX / INT64_MAX / UINT_MAX, and best_mode_index
+//    to -1.
+//  - mode_threshold[] is 0 for indices up to LAST_NEW_MV_INDEX and otherwise
+//    (cpi->rd.threshes * tile_data->thresh_freq_fact) >> 5 for the block's
+//    segment and bsize.
+//  - cached intra-uv rates, per-mode RD tables and single-reference state are
+//    reset to their "not computed" values.
+static void init_inter_mode_search_state(InterModeSearchState *search_state,
+ const AV1_COMP *cpi,
+ const TileDataEnc *tile_data,
+ const MACROBLOCK *x, BLOCK_SIZE bsize,
+ int64_t best_rd_so_far) {
+ const unsigned char seg_id = x->e_mbd.mi[0]->segment_id;
+ const int *const thresh_tab = cpi->rd.threshes[seg_id][bsize];
+
+ // Best-candidate bookkeeping.
+ search_state->best_rd = best_rd_so_far;
+ search_state->best_mode_index = -1;
+ av1_zero(search_state->best_mbmode);
+ search_state->best_rate_y = INT_MAX;
+ search_state->best_rate_uv = INT_MAX;
+ search_state->best_skip2 = 0;
+ search_state->best_mode_skippable = 0;
+ search_state->best_pred_sse = UINT_MAX;
+ for (int m = 0; m < REFERENCE_MODES; ++m) {
+ search_state->best_pred_rd[m] = INT64_MAX;
+ }
+
+ // Intra-related state.
+ search_state->skip_intra_modes = 0;
+ search_state->angle_stats_ready = 0;
+ search_state->best_intra_mode = DC_PRED;
+ search_state->best_intra_rd = INT64_MAX;
+ av1_zero(search_state->pmi_uv);
+ for (int tx = 0; tx < TX_SIZES_ALL; tx++) {
+ search_state->rate_uv_intra[tx] = INT_MAX;
+ }
+
+ // Reference distance tables: every byte set to 0xff.
+ search_state->num_available_refs = 0;
+ memset(search_state->dist_refs, -1, sizeof(search_state->dist_refs));
+ memset(search_state->dist_order_refs, -1,
+ sizeof(search_state->dist_order_refs));
+
+ // Per-mode thresholds.
+ for (int idx = 0; idx <= LAST_NEW_MV_INDEX; ++idx) {
+ search_state->mode_threshold[idx] = 0;
+ }
+ for (int idx = LAST_NEW_MV_INDEX + 1; idx < MAX_MODES; ++idx) {
+ search_state->mode_threshold[idx] =
+ ((int64_t)thresh_tab[idx] * tile_data->thresh_freq_fact[bsize][idx]) >> 5;
+ }
+
+ // New-MV caches and per (mode, ref_mv index, reference) RD estimates.
+ av1_zero(search_state->single_newmv);
+ av1_zero(search_state->single_newmv_rate);
+ av1_zero(search_state->single_newmv_valid);
+ for (int mode = 0; mode < MB_MODE_COUNT; ++mode) {
+ for (int mv_idx = 0; mv_idx < MAX_REF_MV_SERCH; ++mv_idx) {
+ for (int ref = 0; ref < REF_FRAMES; ++ref) {
+ search_state->modelled_rd[mode][mv_idx][ref] = INT64_MAX;
+ search_state->simple_rd[mode][mv_idx][ref] = INT64_MAX;
+ }
+ }
+ }
+
+ // Single-reference inter mode state, per direction / mode / reference slot.
+ for (int dir = 0; dir < 2; ++dir) {
+ for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+ for (int ref = 0; ref < FWD_REFS; ++ref) {
+ SingleInterModeState *const actual =
+ &search_state->single_state[dir][mode][ref];
+ SingleInterModeState *const modelled =
+ &search_state->single_state_modelled[dir][mode][ref];
+
+ actual->ref_frame = NONE_FRAME;
+ actual->rd = INT64_MAX;
+ modelled->ref_frame = NONE_FRAME;
+ modelled->rd = INT64_MAX;
+ search_state->single_rd_order[dir][mode][ref] = NONE_FRAME;
+ }
+ }
+ }
+ av1_zero(search_state->single_state_cnt);
+ av1_zero(search_state->single_state_modelled_cnt);
+}
+
+// Decides, independently of the order in which modes are searched, whether
+// candidate av1_mode_order[mode_index] can be skipped for this block.
+//
+// Case 1: return 0, means don't skip this mode
+// Case 2: return 1, means skip this mode completely
+// Case 3: return 2, means skip compound only, but still try single motion modes
+//
+// mode_skip_mask is indexed by the first reference frame and holds one bit per
+// prediction mode; ref_frame_skip_mask[0] / [1] hold one bit per first /
+// second reference frame. ctx->skip_ref_frame_mask is only consulted when
+// mbmi->partition is neither PARTITION_NONE nor PARTITION_SPLIT.
+static int inter_mode_search_order_independent_skip(
+ const AV1_COMP *cpi, const PICK_MODE_CONTEXT *ctx, const MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mode_index, int mi_row, int mi_col,
+ uint32_t *mode_skip_mask, uint16_t *ref_frame_skip_mask,
+ InterModeSearchState *search_state) {
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1_COMMON *const cm = &cpi->common;
+ const struct segmentation *const seg = &cm->seg;
+ const MACROBLOCKD *const xd = &x->e_mbd;
+ const MB_MODE_INFO *const mbmi = xd->mi[0];
+ const unsigned char segment_id = mbmi->segment_id;
+ const MV_REFERENCE_FRAME *ref_frame = av1_mode_order[mode_index].ref_frame;
+ const PREDICTION_MODE this_mode = av1_mode_order[mode_index].mode;
+ int skip_motion_mode = 0;
+ if (mbmi->partition != PARTITION_NONE && mbmi->partition != PARTITION_SPLIT) {
+ const int ref_type = av1_ref_frame_type(ref_frame);
+ int skip_ref = ctx->skip_ref_frame_mask & (1 << ref_type);
+ if (ref_type <= ALTREF_FRAME && skip_ref) {
+ // Since the compound ref modes depend on the motion estimation result of
+ // two single ref modes (best mv of single ref modes as the start point),
+ // if the current single ref mode is marked skip, we need to check if it
+ // will be used in compound ref modes.
+ for (int r = ALTREF_FRAME + 1; r < MODE_CTX_REF_FRAMES; ++r) {
+ if (!(ctx->skip_ref_frame_mask & (1 << r))) {
+ const MV_REFERENCE_FRAME *rf = ref_frame_map[r - REF_FRAMES];
+ if (rf[0] == ref_type || rf[1] == ref_type) {
+ // Found a non-skipped compound ref mode which contains the current
+ // single ref, so this single ref can't be skipped completely.
+ // Just skip its motion mode search, but still try its simple
+ // translation mode.
+ skip_motion_mode = 1;
+ skip_ref = 0;
+ break;
+ }
+ }
+ }
+ }
+ if (skip_ref) return 1;
+ }
+
+ if (cpi->sf.mode_pruning_based_on_two_pass_partition_search &&
+ !x->cb_partition_scan) {
+ const int mi_width = mi_size_wide[bsize];
+ const int mi_height = mi_size_high[bsize];
+ int found = 0;
+ // Search in the stats table to see if the ref frames have been used in the
+ // first pass of partition search. Rows span the block height and columns
+ // span the block width, so that non-square blocks sample exactly their own
+ // area.
+ for (int row = mi_row; row < mi_row + mi_height && !found;
+ row += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ for (int col = mi_col; col < mi_col + mi_width && !found;
+ col += FIRST_PARTITION_PASS_SAMPLE_REGION) {
+ const int index = av1_first_partition_pass_stats_index(row, col);
+ const FIRST_PARTITION_PASS_STATS *const stats =
+ &x->first_partition_pass_stats[index];
+ if (stats->ref0_counts[ref_frame[0]] &&
+ (ref_frame[1] < 0 || stats->ref1_counts[ref_frame[1]])) {
+ found = 1;
+ break;
+ }
+ }
+ }
+ if (!found) return 1;
+ }
+
+ if (ref_frame[0] > INTRA_FRAME && ref_frame[1] == INTRA_FRAME) {
+ // Mode must be compatible
+ if (!is_interintra_allowed_mode(this_mode)) return 1;
+ if (!is_interintra_allowed_bsize(bsize)) return 1;
+ }
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test && ref_frame[0] == INTRA_FRAME)
+ return 1;
+
+ if (ref_frame[0] == INTRA_FRAME) {
+ if (this_mode != DC_PRED) {
+ // Disable intra modes other than DC_PRED for blocks with low variance
+ // Threshold for intra skipping based on source variance
+ // TODO(debargha): Specialize the threshold for super block sizes
+ const unsigned int skip_intra_var_thresh = 64;
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
+ x->source_variance < skip_intra_var_thresh)
+ return 1;
+ }
+ } else {
+ if (!is_comp_ref_allowed(bsize) && ref_frame[1] > INTRA_FRAME) return 1;
+ }
+
+ const int comp_pred = ref_frame[1] > INTRA_FRAME;
+ if (comp_pred) {
+ if (!cpi->allow_comp_inter_inter) return 1;
+
+ if (cm->reference_mode == SINGLE_REFERENCE) return 1;
+
+ // Skip compound inter modes if the second reference is not available.
+ if (!(cpi->ref_frame_flags & ref_frame_flag_list[ref_frame[1]])) return 1;
+
+ // Do not allow compound prediction if the segment level reference frame
+ // feature is in use as in this case there can only be one reference.
+ if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) return 1;
+ }
+
+ // Prune references based on their frame offsets relative to the current
+ // frame (ALTREF2 / BWDREF) or to GOLDEN_FRAME (LAST3 / LAST2).
+ if (sf->selective_ref_frame) {
+ if (sf->selective_ref_frame >= 2 || x->cb_partition_scan) {
+ if (ref_frame[0] == ALTREF2_FRAME || ref_frame[1] == ALTREF2_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[ALTREF2_FRAME - LAST_FRAME],
+ cm->frame_offset) < 0)
+ return 1;
+ if (ref_frame[0] == BWDREF_FRAME || ref_frame[1] == BWDREF_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[BWDREF_FRAME - LAST_FRAME],
+ cm->frame_offset) < 0)
+ return 1;
+ }
+ if (ref_frame[0] == LAST3_FRAME || ref_frame[1] == LAST3_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[LAST3_FRAME - LAST_FRAME],
+ cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+ return 1;
+ if (ref_frame[0] == LAST2_FRAME || ref_frame[1] == LAST2_FRAME)
+ if (get_relative_dist(
+ cm, cm->cur_frame->ref_frame_offset[LAST2_FRAME - LAST_FRAME],
+ cm->cur_frame->ref_frame_offset[GOLDEN_FRAME - LAST_FRAME]) <= 0)
+ return 1;
+ }
+
+ // One-sided compound is used only when all reference frames are one-sided.
+ if (sf->selective_ref_frame && comp_pred && !cpi->all_one_sided_refs) {
+ unsigned int ref_offsets[2];
+ for (int i = 0; i < 2; ++i) {
+ const int buf_idx = cm->frame_refs[ref_frame[i] - LAST_FRAME].idx;
+ assert(buf_idx >= 0);
+ ref_offsets[i] = cm->buffer_pool->frame_bufs[buf_idx].cur_frame_offset;
+ }
+ // Skip when both references lie on the same side of the current frame.
+ if ((get_relative_dist(cm, ref_offsets[0], cm->frame_offset) <= 0 &&
+ get_relative_dist(cm, ref_offsets[1], cm->frame_offset) <= 0) ||
+ (get_relative_dist(cm, ref_offsets[0], cm->frame_offset) > 0 &&
+ get_relative_dist(cm, ref_offsets[1], cm->frame_offset) > 0))
+ return 1;
+ }
+
+ if (mode_skip_mask[ref_frame[0]] & (1 << this_mode)) {
+ return 1;
+ }
+
+ // A mode is skipped only if both of its references are masked out; a
+ // negative second reference is looked up as bit 0.
+ if ((ref_frame_skip_mask[0] & (1 << ref_frame[0])) &&
+ (ref_frame_skip_mask[1] & (1 << AOMMAX(0, ref_frame[1])))) {
+ return 1;
+ }
+
+ if (skip_repeated_mv(cm, x, this_mode, ref_frame, search_state)) {
+ return 1;
+ }
+ if (skip_motion_mode) {
+ return 2;
+ }
+ return 0;
+}
+
+// Puts *mbmi into the starting state for candidate av1_mode_order[mode_index]:
+// the mode and reference pair come from the table, everything else touched
+// here is reset (no palette, no filter-intra, zero MVs, SIMPLE_TRANSLATION,
+// II_DC_PRED - 1 as interintra mode) and the interpolation filters are set
+// from cm->interp_filter.
+static INLINE void init_mbmi(MB_MODE_INFO *mbmi, int mode_index,
+ const AV1_COMMON *cm) {
+ // Candidate taken from the mode-order table.
+ mbmi->mode = av1_mode_order[mode_index].mode;
+ mbmi->ref_frame[0] = av1_mode_order[mode_index].ref_frame[0];
+ mbmi->ref_frame[1] = av1_mode_order[mode_index].ref_frame[1];
+
+ // Motion-related defaults.
+ mbmi->ref_mv_idx = 0;
+ mbmi->mv[1].as_int = 0;
+ mbmi->mv[0].as_int = 0;
+ mbmi->motion_mode = SIMPLE_TRANSLATION;
+ mbmi->interintra_mode = (INTERINTRA_MODE)(II_DC_PRED - 1);
+
+ // Intra tool defaults.
+ mbmi->uv_mode = UV_DC_PRED;
+ mbmi->palette_mode_info.palette_size[0] = 0;
+ mbmi->palette_mode_info.palette_size[1] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+
+ set_default_interp_filters(mbmi, cm->interp_filter);
+}
+
+static int64_t handle_intra_mode(InterModeSearchState *search_state,
+ const AV1_COMP *cpi, MACROBLOCK *x,
+ BLOCK_SIZE bsize, int mi_row, int mi_col,
+ int ref_frame_cost,
+ const PICK_MODE_CONTEXT *ctx, int disable_skip,
+ RD_STATS *rd_stats, RD_STATS *rd_stats_y,
+ RD_STATS *rd_stats_uv) {
+ const AV1_COMMON *cm = &cpi->common;
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ assert(mbmi->ref_frame[0] == INTRA_FRAME);
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const int try_palette =
+ av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+ const int *const intra_mode_cost = x->mbmode_cost[size_group_lookup[bsize]];
+ const int intra_cost_penalty = av1_get_intra_cost_penalty(
+ cm->base_qindex, cm->y_dc_delta_q, cm->seq_params.bit_depth);
+ const int rows = block_size_high[bsize];
+ const int cols = block_size_wide[bsize];
+ const int num_planes = av1_num_planes(cm);
+ const int skip_ctx = av1_get_skip_context(xd);
+
+ int known_rate = intra_mode_cost[mbmi->mode];
+ known_rate += ref_frame_cost;
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ known_rate += intra_cost_penalty;
+ known_rate += AOMMIN(x->skip_cost[skip_ctx][0], x->skip_cost[skip_ctx][1]);
+ const int64_t known_rd = RDCOST(x->rdmult, known_rate, 0);
+ if (known_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+
+ TX_SIZE uv_tx;
+ int is_directional_mode = av1_is_directional_mode(mbmi->mode);
+ if (is_directional_mode && av1_use_angle_delta(bsize)) {
+ int rate_dummy;
+ int64_t model_rd = INT64_MAX;
+ if (!search_state->angle_stats_ready) {
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *src = x->plane[0].src.buf;
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+ highbd_angle_estimation(src, src_stride, rows, cols, bsize,
+ search_state->directional_mode_skip_mask);
+ else
+ angle_estimation(src, src_stride, rows, cols, bsize,
+ search_state->directional_mode_skip_mask);
+ search_state->angle_stats_ready = 1;
+ }
+ if (search_state->directional_mode_skip_mask[mbmi->mode]) return INT64_MAX;
+ av1_init_rd_stats(rd_stats_y);
+ rd_stats_y->rate = INT_MAX;
+ rd_pick_intra_angle_sby(cpi, x, mi_row, mi_col, &rate_dummy, rd_stats_y,
+ bsize, intra_mode_cost[mbmi->mode],
+ search_state->best_rd, &model_rd);
+ } else {
+ av1_init_rd_stats(rd_stats_y);
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ super_block_yrd(cpi, x, rd_stats_y, bsize, search_state->best_rd);
+ }
+ uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE];
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ int try_filter_intra = 0;
+ int64_t best_rd_tmp = INT64_MAX;
+ if (mbmi->mode == DC_PRED && av1_filter_intra_allowed_bsize(cm, bsize)) {
+ if (rd_stats_y->rate != INT_MAX) {
+ const int tmp_rate = rd_stats_y->rate + x->filter_intra_cost[bsize][0] +
+ intra_mode_cost[mbmi->mode];
+ best_rd_tmp = RDCOST(x->rdmult, tmp_rate, rd_stats_y->dist);
+ try_filter_intra = !((best_rd_tmp / 2) > search_state->best_rd);
+ } else {
+ try_filter_intra = !(search_state->best_mbmode.skip);
+ }
+ }
+ if (try_filter_intra) {
+ RD_STATS rd_stats_y_fi;
+ int filter_intra_selected_flag = 0;
+ TX_SIZE best_tx_size = mbmi->tx_size;
+ TX_TYPE best_txk_type[TXK_TYPE_BUF_LEN];
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ FILTER_INTRA_MODE best_fi_mode = FILTER_DC_PRED;
+
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ for (FILTER_INTRA_MODE fi_mode = FILTER_DC_PRED;
+ fi_mode < FILTER_INTRA_MODES; ++fi_mode) {
+ int64_t this_rd_tmp;
+ mbmi->filter_intra_mode_info.filter_intra_mode = fi_mode;
+ super_block_yrd(cpi, x, &rd_stats_y_fi, bsize, search_state->best_rd);
+ if (rd_stats_y_fi.rate == INT_MAX) {
+ continue;
+ }
+ const int this_rate_tmp =
+ rd_stats_y_fi.rate +
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize,
+ intra_mode_cost[mbmi->mode]);
+ this_rd_tmp = RDCOST(x->rdmult, this_rate_tmp, rd_stats_y_fi.dist);
+
+ if (this_rd_tmp != INT64_MAX && this_rd_tmp / 2 > search_state->best_rd) {
+ break;
+ }
+ if (this_rd_tmp < best_rd_tmp) {
+ best_tx_size = mbmi->tx_size;
+ memcpy(best_txk_type, mbmi->txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(best_blk_skip, x->blk_skip,
+ sizeof(best_blk_skip[0]) * ctx->num_4x4_blk);
+ best_fi_mode = fi_mode;
+ *rd_stats_y = rd_stats_y_fi;
+ filter_intra_selected_flag = 1;
+ best_rd_tmp = this_rd_tmp;
+ }
+ }
+
+ mbmi->tx_size = best_tx_size;
+ memcpy(mbmi->txk_type, best_txk_type,
+ sizeof(*best_txk_type) * TXK_TYPE_BUF_LEN);
+ memcpy(x->blk_skip, best_blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+
+ if (filter_intra_selected_flag) {
+ mbmi->filter_intra_mode_info.use_filter_intra = 1;
+ mbmi->filter_intra_mode_info.filter_intra_mode = best_fi_mode;
+ } else {
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ }
+ }
+ if (rd_stats_y->rate == INT_MAX) return INT64_MAX;
+ const int mode_cost_y =
+ intra_mode_info_cost_y(cpi, x, mbmi, bsize, intra_mode_cost[mbmi->mode]);
+ av1_init_rd_stats(rd_stats);
+ av1_init_rd_stats(rd_stats_uv);
+ if (num_planes > 1) {
+ uv_tx = av1_get_tx_size(AOM_PLANE_U, xd);
+ if (search_state->rate_uv_intra[uv_tx] == INT_MAX) {
+ int rate_y =
+ rd_stats_y->skip ? x->skip_cost[skip_ctx][1] : rd_stats_y->rate;
+ const int64_t rdy =
+ RDCOST(x->rdmult, rate_y + mode_cost_y, rd_stats_y->dist);
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ rdy > (search_state->best_rd + (search_state->best_rd >> 2))) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+ choose_intra_uv_mode(
+ cpi, x, bsize, uv_tx, &search_state->rate_uv_intra[uv_tx],
+ &search_state->rate_uv_tokenonly[uv_tx],
+ &search_state->dist_uvs[uv_tx], &search_state->skip_uvs[uv_tx],
+ &search_state->mode_uv[uv_tx]);
+ if (try_palette) search_state->pmi_uv[uv_tx] = *pmi;
+ search_state->uv_angle_delta[uv_tx] = mbmi->angle_delta[PLANE_TYPE_UV];
+
+ const int uv_rate = search_state->rate_uv_tokenonly[uv_tx];
+ const int64_t uv_dist = search_state->dist_uvs[uv_tx];
+ const int64_t uv_rd = RDCOST(x->rdmult, uv_rate, uv_dist);
+ if (uv_rd > search_state->best_rd) {
+ search_state->skip_intra_modes = 1;
+ return INT64_MAX;
+ }
+ }
+
+ rd_stats_uv->rate = search_state->rate_uv_tokenonly[uv_tx];
+ rd_stats_uv->dist = search_state->dist_uvs[uv_tx];
+ rd_stats_uv->skip = search_state->skip_uvs[uv_tx];
+ rd_stats->skip = rd_stats_y->skip && rd_stats_uv->skip;
+ mbmi->uv_mode = search_state->mode_uv[uv_tx];
+ if (try_palette) {
+ pmi->palette_size[1] = search_state->pmi_uv[uv_tx].palette_size[1];
+ memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+ search_state->pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+ 2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+ }
+ mbmi->angle_delta[PLANE_TYPE_UV] = search_state->uv_angle_delta[uv_tx];
+ }
+ rd_stats->rate = rd_stats_y->rate + mode_cost_y;
+ if (!xd->lossless[mbmi->segment_id] && block_signals_txsize(bsize)) {
+ // super_block_yrd above includes the cost of the tx_size in the
+ // tokenonly rate, but for intra blocks, tx_size is always coded
+ // (prediction granularity), so we account for it in the full rate,
+ // not the tokenonly rate.
+ rd_stats_y->rate -= tx_size_cost(cm, x, bsize, mbmi->tx_size);
+ }
+ if (num_planes > 1 && !x->skip_chroma_rd) {
+ const int uv_mode_cost =
+ x->intra_uv_mode_cost[is_cfl_allowed(xd)][mbmi->mode][mbmi->uv_mode];
+ rd_stats->rate +=
+ rd_stats_uv->rate +
+ intra_mode_info_cost_uv(cpi, x, mbmi, bsize, uv_mode_cost);
+ }
+ if (mbmi->mode != DC_PRED && mbmi->mode != PAETH_PRED)
+ rd_stats->rate += intra_cost_penalty;
+ rd_stats->dist = rd_stats_y->dist + rd_stats_uv->dist;
+
+ // Estimate the reference frame signaling cost and add it
+ // to the rolling cost variable.
+ rd_stats->rate += ref_frame_cost;
+ if (rd_stats->skip) {
+ // Back out the coefficient coding costs
+ rd_stats->rate -= (rd_stats_y->rate + rd_stats_uv->rate);
+ rd_stats_y->rate = 0;
+ rd_stats_uv->rate = 0;
+ // Cost the skip mb case
+ rd_stats->rate += x->skip_cost[skip_ctx][1];
+ } else {
+ // Add in the cost of the no skip flag.
+ rd_stats->rate += x->skip_cost[skip_ctx][0];
+ }
+ // Calculate the final RD estimate for this mode.
+ const int64_t this_rd = RDCOST(x->rdmult, rd_stats->rate, rd_stats->dist);
+ // Keep record of best intra rd
+ if (this_rd < search_state->best_intra_rd) {
+ search_state->best_intra_rd = this_rd;
+ search_state->best_intra_mode = mbmi->mode;
+ }
+
+ if (sf->skip_intra_in_interframe) {
+ if (search_state->best_rd < (INT64_MAX / 2) &&
+ this_rd > (search_state->best_rd + (search_state->best_rd >> 1)))
+ search_state->skip_intra_modes = 1;
+ }
+
+ if (!disable_skip) {
+ for (int i = 0; i < REFERENCE_MODES; ++i)
+ search_state->best_pred_rd[i] =
+ AOMMIN(search_state->best_pred_rd[i], this_rd);
+ }
+ return this_rd;
+}
+
+// Inserts entry into list[0..count), which is ordered by ascending rd, and
+// keeps that ordering. Existing entries whose rd equals the new one stay in
+// front of it. The caller must guarantee room for count + 1 elements.
+static void insert_single_state_sorted(SingleInterModeState *list, int count,
+                                       SingleInterModeState entry) {
+  int pos = count;
+  while (pos > 0 && list[pos - 1].rd > entry.rd) {
+    list[pos] = list[pos - 1];
+    --pos;
+  }
+  list[pos] = entry;
+}
+
+// Records the result of the single-reference inter mode currently held in
+// mbmi so that compound modes can later be compared against it.
+//
+// The lowest simple rd and the lowest modelled rd over all ref-mv indices
+// reported by get_drl_refmv_count() are each inserted, together with the
+// reference frame, into the matching per-direction / per-mode list of
+// search_state. Both lists are kept sorted by ascending rd and their counters
+// are incremented.
+static void collect_single_states(MACROBLOCK *x,
+                                  InterModeSearchState *search_state,
+                                  const MB_MODE_INFO *const mbmi) {
+  const MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame[0];
+  const PREDICTION_MODE this_mode = mbmi->mode;
+  const int mode_offset = INTER_OFFSET(this_mode);
+  // Direction 0 holds references up to GOLDEN_FRAME, direction 1 the others.
+  const int dir = (ref_frame > GOLDEN_FRAME) ? 1 : 0;
+  const int num_ref_mv = get_drl_refmv_count(x, mbmi->ref_frame, this_mode);
+
+  // Lowest simple rd over the ref-mv candidates.
+  int64_t best_simple = search_state->simple_rd[this_mode][0][ref_frame];
+  for (int idx = 1; idx < num_ref_mv; ++idx) {
+    const int64_t rd = search_state->simple_rd[this_mode][idx][ref_frame];
+    best_simple = (rd < best_simple) ? rd : best_simple;
+  }
+
+  // The trailing 1 is presumably the `valid` flag (third member).
+  const SingleInterModeState simple_entry = { best_simple, ref_frame, 1 };
+  insert_single_state_sorted(search_state->single_state[dir][mode_offset],
+                             search_state->single_state_cnt[dir][mode_offset],
+                             simple_entry);
+  search_state->single_state_cnt[dir][mode_offset]++;
+
+  // Lowest modelled rd over the ref-mv candidates.
+  int64_t best_modelled = search_state->modelled_rd[this_mode][0][ref_frame];
+  for (int idx = 1; idx < num_ref_mv; ++idx) {
+    const int64_t rd = search_state->modelled_rd[this_mode][idx][ref_frame];
+    best_modelled = (rd < best_modelled) ? rd : best_modelled;
+  }
+
+  const SingleInterModeState modelled_entry = { best_modelled, ref_frame, 1 };
+  insert_single_state_sorted(
+      search_state->single_state_modelled[dir][mode_offset],
+      search_state->single_state_modelled_cnt[dir][mode_offset],
+      modelled_entry);
+  search_state->single_state_modelled_cnt[dir][mode_offset]++;
+}
+
+// Prepares the collected single-reference results for compound-mode pruning.
+//
+// Step 1 (only when sf.prune_comp_search_by_single_result >= 1): in both the
+// simple-rd and the modelled-rd lists, every entry except the first one of
+// each mode is marked invalid when half of its rd exceeds the smaller of the
+// leading NEWMV and GLOBALMV rd of that direction.
+//
+// Step 2: for every direction and mode, single_rd_order is filled with the
+// valid reference frames in simple-rd order and then, while fewer than
+// max(simple count, modelled count) frames are listed, with frames from the
+// modelled-rd list that are neither listed yet nor invalidated in the
+// simple-rd list.
+static void analyze_single_states(const AV1_COMP *cpi,
+                                  InterModeSearchState *search_state) {
+  if (cpi->sf.prune_comp_search_by_single_result >= 1) {
+    for (int dir = 0; dir < 2; ++dir) {
+      // The better of GLOBALMV and NEWMV serves as the yardstick for all
+      // modes (NEARESTMV and NEARMV may not share motion vectors). Index 0
+      // of each mode is never pruned because it may still form the best
+      // combination with another mode.
+      SingleInterModeState(*table)[FWD_REFS] = search_state->single_state[dir];
+      int64_t rd_limit = AOMMIN(table[INTER_OFFSET(NEWMV)][0].rd,
+                                table[INTER_OFFSET(GLOBALMV)][0].rd);
+      for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+        const int num = search_state->single_state_cnt[dir][mode];
+        for (int k = 1; k < num; ++k) {
+          SingleInterModeState *const entry = &table[mode][k];
+          if (entry->rd == INT64_MAX) continue;
+          if ((entry->rd >> 1) > rd_limit) entry->valid = 0;
+        }
+      }
+
+      // Same pruning on the modelled-rd lists.
+      table = search_state->single_state_modelled[dir];
+      rd_limit = AOMMIN(table[INTER_OFFSET(NEWMV)][0].rd,
+                        table[INTER_OFFSET(GLOBALMV)][0].rd);
+      for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+        const int num = search_state->single_state_modelled_cnt[dir][mode];
+        for (int k = 1; k < num; ++k) {
+          SingleInterModeState *const entry = &table[mode][k];
+          if (entry->rd == INT64_MAX) continue;
+          if ((entry->rd >> 1) > rd_limit) entry->valid = 0;
+        }
+      }
+    }
+  }
+
+  // Build the reference order: simple rd first, then modelled rd.
+  for (int dir = 0; dir < 2; ++dir) {
+    for (int mode = 0; mode < SINGLE_INTER_MODE_NUM; ++mode) {
+      const int num_simple = search_state->single_state_cnt[dir][mode];
+      const int num_modelled =
+          search_state->single_state_modelled_cnt[dir][mode];
+      const int limit = AOMMAX(num_simple, num_modelled);
+      SingleInterModeState *const simple = search_state->single_state[dir][mode];
+      SingleInterModeState *const modelled =
+          search_state->single_state_modelled[dir][mode];
+      MV_REFERENCE_FRAME *const order =
+          search_state->single_rd_order[dir][mode];
+      int listed = 0;
+
+      for (int k = 0; k < num_simple; ++k) {
+        if (simple[k].rd == INT64_MAX) break;
+        if (simple[k].valid) order[listed++] = simple[k].ref_frame;
+      }
+
+      // Top up from the modelled list until the cap is reached.
+      for (int k = 0; k < num_modelled && listed < limit; ++k) {
+        if (modelled[k].rd == INT64_MAX) break;
+        if (!modelled[k].valid) continue;
+
+        const int ref_frame = modelled[k].ref_frame;
+
+        // Skip frames that are already in the order.
+        int already_listed = 0;
+        for (int n = 0; n < listed; ++n) {
+          if (order[n] == ref_frame) {
+            already_listed = 1;
+            break;
+          }
+        }
+        if (already_listed) continue;
+
+        // Skip frames that were invalidated in the simple-rd list.
+        int pruned_in_simple = 0;
+        for (int n = 0; n < num_simple; ++n) {
+          if (ref_frame == simple[n].ref_frame && !simple[n].valid) {
+            pruned_in_simple = 1;
+            break;
+          }
+        }
+        if (!pruned_in_simple) order[listed++] = ref_frame;
+      }
+    }
+  }
+}
+
+// Returns how many leading entries of single_rd_order[dir][mode] a compound
+// mode may use for the given direction and single mode.
+//
+// The base value is the number of entries before the first NONE_FRAME (at
+// most FWD_REFS). With sf.prune_comp_search_by_single_result >= 2 it is
+// capped at 2. With level >= 3 it becomes 1 when mode is NEARMV or GLOBALMV,
+// or when the best simple-rd and best modelled-rd entries both have a usable
+// rd and name the same reference frame.
+static int compound_skip_get_candidates(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const int dir, const PREDICTION_MODE mode) {
+  const int mode_offset = INTER_OFFSET(mode);
+  const int prune_level = cpi->sf.prune_comp_search_by_single_result;
+  const MV_REFERENCE_FRAME *const order =
+      search_state->single_rd_order[dir][mode_offset];
+
+  int num_listed = 0;
+  while (num_listed < FWD_REFS && order[num_listed] != NONE_FRAME) {
+    ++num_listed;
+  }
+
+  if (prune_level >= 3) {
+    if (mode == NEARMV || mode == GLOBALMV) return 1;
+
+    const SingleInterModeState *const top_simple =
+        &search_state->single_state[dir][mode_offset][0];
+    const SingleInterModeState *const top_modelled =
+        &search_state->single_state_modelled[dir][mode_offset][0];
+    if (top_simple->rd != INT64_MAX && top_modelled->rd != INT64_MAX &&
+        top_simple->ref_frame == top_modelled->ref_frame) {
+      return 1;
+    }
+  }
+
+  if (prune_level >= 2) return AOMMIN(2, num_listed);
+  return num_listed;
+}
+
+// Decides whether a compound mode can be skipped, judging by the results of
+// the single-reference modes it is composed of.
+//
+// A reference takes part in the decision only if (a) it already appears in
+// the single_state list of its component mode and (b) for NEARESTMV / NEARMV
+// components, get_this_mv() yields the same motion vector for the single
+// mode and for the compound mode at every ref-mv index. Such a reference
+// must be among the first compound_skip_get_candidates() entries of
+// single_rd_order; otherwise the compound mode is skipped.
+//
+// Returns 1 to skip the compound mode, 0 to keep it.
+static int compound_skip_by_single_states(
+    const AV1_COMP *cpi, const InterModeSearchState *search_state,
+    const PREDICTION_MODE this_mode, const MV_REFERENCE_FRAME ref_frame,
+    const MV_REFERENCE_FRAME second_ref_frame, const MACROBLOCK *x) {
+  const MV_REFERENCE_FRAME refs[2] = { ref_frame, second_ref_frame };
+  const int mode[2] = { compound_ref0_mode(this_mode),
+                        compound_ref1_mode(this_mode) };
+  const int mode_offset[2] = { INTER_OFFSET(mode[0]), INTER_OFFSET(mode[1]) };
+  const int mode_dir[2] = { refs[0] > GOLDEN_FRAME, refs[1] > GOLDEN_FRAME };
+  int ref_searched[2] = { 0, 0 };
+  int ref_mv_match[2] = { 1, 1 };
+
+  // (a) Was each reference already searched with its single mode?
+  for (int r = 0; r < 2; ++r) {
+    const SingleInterModeState *const list =
+        search_state->single_state[mode_dir[r]][mode_offset[r]];
+    const int num =
+        search_state->single_state_cnt[mode_dir[r]][mode_offset[r]];
+    for (int k = 0; k < num && !ref_searched[r]; ++k) {
+      ref_searched[r] = (list[k].ref_frame == refs[r]);
+    }
+  }
+
+  // (b) Do single-mode and compound-mode motion vectors agree?
+  const int ref_set = get_drl_refmv_count(x, refs, this_mode);
+  for (int r = 0; r < 2; ++r) {
+    if (mode[r] != NEARESTMV && mode[r] != NEARMV) continue;
+
+    const MV_REFERENCE_FRAME single_refs[2] = { refs[r], NONE_FRAME };
+    for (int ref_mv_idx = 0; ref_mv_idx < ref_set; ++ref_mv_idx) {
+      int_mv mv_single;
+      int_mv mv_comp;
+      get_this_mv(&mv_single, mode[r], 0, ref_mv_idx, single_refs,
+                  x->mbmi_ext);
+      get_this_mv(&mv_comp, this_mode, r, ref_mv_idx, refs, x->mbmi_ext);
+
+      if (mv_single.as_int != mv_comp.as_int) {
+        ref_mv_match[r] = 0;
+        break;
+      }
+    }
+  }
+
+  // A qualifying reference outside the candidate window means "skip".
+  for (int r = 0; r < 2; ++r) {
+    if (!ref_searched[r] || !ref_mv_match[r]) continue;
+
+    const int candidates =
+        compound_skip_get_candidates(cpi, search_state, mode_dir[r], mode[r]);
+    const MV_REFERENCE_FRAME *ref_order =
+        search_state->single_rd_order[mode_dir[r]][mode_offset[r]];
+    int k = 0;
+    while (k < candidates && ref_order[k] != refs[r]) ++k;
+    if (k == candidates) return 1;
+  }
+
+  return 0;
+}
+
+// Returns 1 when the compound mode's two reference frames are, in either
+// order, the first two entries of search_state->dist_order_refs and more
+// than two references are available; returns 0 otherwise. A return value of
+// 1 tells the caller to drop this pair of references.
+static INLINE int sf_check_is_drop_ref(const MODE_DEFINITION *mode,
+                                       InterModeSearchState *search_state) {
+  if (search_state->num_available_refs <= 2) return 0;
+
+  const MV_REFERENCE_FRAME first = mode->ref_frame[0];
+  const MV_REFERENCE_FRAME second = mode->ref_frame[1];
+  const int same_order = first == search_state->dist_order_refs[0] &&
+                         second == search_state->dist_order_refs[1];
+  const int swapped_order = first == search_state->dist_order_refs[1] &&
+                            second == search_state->dist_order_refs[0];
+  return same_order || swapped_order;
+}
+
+// Tracks, per reference frame, the largest distortion seen among the
+// single-reference modes, and ranks the references once the last
+// single-reference mode (ALTREF_FRAME with GLOBALMV) has been reached.
+//
+// The slot for LAST_FRAME (index 0) is never updated here. When the ranking
+// runs, dist_refs and dist_order_refs are sorted together by descending
+// distortion and num_available_refs is derived from the number of leading
+// dist_refs entries that differ from -1.
+static INLINE void sf_drop_ref_analyze(InterModeSearchState *search_state,
+                                       const MODE_DEFINITION *mode,
+                                       int64_t distortion2) {
+  const MV_REFERENCE_FRAME ref_frame = mode->ref_frame[0];
+  const int slot = ref_frame - LAST_FRAME;
+  if (slot != 0 && distortion2 > search_state->dist_refs[slot]) {
+    search_state->dist_refs[slot] = distortion2;
+    search_state->dist_order_refs[slot] = ref_frame;
+  }
+
+  // Nothing more to do until the last single ref prediction mode is reached.
+  if (ref_frame != ALTREF_FRAME || mode->mode != GLOBALMV) return;
+
+  // Exchange sort of dist_refs (descending) with the order index in lockstep.
+  for (int a = 0; a < REF_FRAMES; ++a) {
+    for (int b = a + 1; b < REF_FRAMES; ++b) {
+      if (search_state->dist_refs[a] >= search_state->dist_refs[b]) continue;
+
+      const int64_t dist_a = search_state->dist_refs[a];
+      search_state->dist_refs[a] = search_state->dist_refs[b];
+      search_state->dist_refs[b] = dist_a;
+
+      const int order_a = search_state->dist_order_refs[a];
+      search_state->dist_order_refs[a] = search_state->dist_order_refs[b];
+      search_state->dist_order_refs[b] = order_a;
+    }
+  }
+
+  // Count the leading entries that are not -1. If there is none, the previous
+  // num_available_refs value is kept and only incremented, exactly as the
+  // original loop-and-increment sequence behaved.
+  int num_set = 0;
+  while (num_set < REF_FRAMES && search_state->dist_refs[num_set] != -1) {
+    ++num_set;
+  }
+  if (num_set > 0) search_state->num_available_refs = num_set - 1;
+  search_state->num_available_refs++;
+}
+
+// Allocates the five scratch buffers of a CompoundTypeRdBuffers.
+//
+// pred0 and pred1 get 2 * MAX_SB_SQUARE elements with 16-byte alignment,
+// residual1 and diff10 get MAX_SB_SQUARE elements with 32-byte alignment, and
+// tmp_best_mask_buf gets 2 * MAX_SB_SQUARE elements from aom_malloc().
+// Each allocation goes through CHECK_MEM_ERROR with cm as its first argument.
+// Release the buffers with release_compound_type_rd_buffers().
+static void alloc_compound_type_rd_buffers(AV1_COMMON *const cm,
+                                           CompoundTypeRdBuffers *const bufs) {
+  const size_t pred0_bytes = 2 * MAX_SB_SQUARE * sizeof(*bufs->pred0);
+  const size_t pred1_bytes = 2 * MAX_SB_SQUARE * sizeof(*bufs->pred1);
+  const size_t residual1_bytes = MAX_SB_SQUARE * sizeof(*bufs->residual1);
+  const size_t diff10_bytes = MAX_SB_SQUARE * sizeof(*bufs->diff10);
+  const size_t mask_bytes =
+      2 * MAX_SB_SQUARE * sizeof(*bufs->tmp_best_mask_buf);
+
+  CHECK_MEM_ERROR(cm, bufs->pred0, (uint8_t *)aom_memalign(16, pred0_bytes));
+  CHECK_MEM_ERROR(cm, bufs->pred1, (uint8_t *)aom_memalign(16, pred1_bytes));
+  CHECK_MEM_ERROR(cm, bufs->residual1,
+                  (int16_t *)aom_memalign(32, residual1_bytes));
+  CHECK_MEM_ERROR(cm, bufs->diff10, (int16_t *)aom_memalign(32, diff10_bytes));
+  CHECK_MEM_ERROR(cm, bufs->tmp_best_mask_buf,
+                  (uint8_t *)aom_malloc(mask_bytes));
+}
+
+// Frees every buffer held by bufs and then zeroes the whole structure, so
+// that no stale pointer is left behind.
+static void release_compound_type_rd_buffers(
+    CompoundTypeRdBuffers *const bufs) {
+  // The buffers are independent of each other; free them in reverse order of
+  // their declaration in alloc_compound_type_rd_buffers().
+  aom_free(bufs->tmp_best_mask_buf);
+  aom_free(bufs->diff10);
+  aom_free(bufs->residual1);
+  aom_free(bufs->pred1);
+  aom_free(bufs->pred0);
+  av1_zero(*bufs);  // Leaves all pointers NULL.
+}
+
+// Rate-distortion mode decision for one block of an inter frame.
+//
+// Walks the MAX_MODES entries of tile_data->mode_map[bsize]. Inter modes that
+// survive the skip / pruning checks are evaluated with handle_inter_mode();
+// intra modes that survive are only queued in the main loop and evaluated
+// afterwards with handle_intra_mode(). The winner is then passed through
+// sf_refine_fast_tx_type_search(), an optional palette search and an optional
+// skip-mode check.
+//
+// On success the best mode is copied into xd->mi[0], its cost is reported in
+// rd_cost and the coding context is saved with store_coding_context().
+// If no mode was selected, or the best rd is not below best_rd_so_far,
+// rd_cost->rate is set to INT_MAX and rd_cost->rdcost to INT64_MAX and the
+// function returns before store_coding_context() is called.
+void av1_rd_pick_inter_mode_sb(AV1_COMP *cpi, TileDataEnc *tile_data,
+ MACROBLOCK *x, int mi_row, int mi_col,
+ RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ const SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ MB_MODE_INFO *const mbmi = xd->mi[0];
+ const int try_palette =
+ av1_allow_palette(cm->allow_screen_content_tools, mbmi->sb_type);
+ PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+ const struct segmentation *const seg = &cm->seg;
+ PREDICTION_MODE this_mode;
+ unsigned char segment_id = mbmi->segment_id;
+ int i;
+ struct buf_2d yv12_mb[REF_FRAMES][MAX_MB_PLANE];
+ unsigned int ref_costs_single[REF_FRAMES];
+ unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+ int *comp_inter_cost = x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+ int *mode_map = tile_data->mode_map[bsize];
+ uint32_t mode_skip_mask[REF_FRAMES];
+ uint16_t ref_frame_skip_mask[2];
+
+ InterModeSearchState search_state;
+ init_inter_mode_search_state(&search_state, cpi, tile_data, x, bsize,
+ best_rd_so_far);
+ INTERINTRA_MODE interintra_modes[REF_FRAMES] = {
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES,
+ INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES, INTERINTRA_MODES
+ };
+ HandleInterModeArgs args = {
+ { NULL }, { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE },
+ { NULL }, { MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1, MAX_SB_SIZE >> 1 },
+ NULL, NULL,
+ NULL, search_state.modelled_rd,
+ { { 0 } }, INT_MAX,
+ INT_MAX, search_state.simple_rd,
+ 0, interintra_modes
+ };
+ for (i = 0; i < REF_FRAMES; ++i) x->pred_sse[i] = INT_MAX;
+
+ av1_invalid_rd_stats(rd_cost);
+
+ // init params, set frame modes, speed features
+ set_params_rd_pick_inter_mode(
+ cpi, x, &args, bsize, mi_row, mi_col, ref_frame_skip_mask, mode_skip_mask,
+ ctx->skip_ref_frame_mask, ref_costs_single, ref_costs_comp, yv12_mb);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ int64_t best_est_rd = INT64_MAX;
+ // TODO(angiebird): Turn this on when this speed feature is well tested
+#if 1
+ // Once the rd model for this block size is ready, do_tx_search is 0: the
+ // candidates recorded in inter_modes_info are then re-evaluated with
+ // txfm_search() after the main loop.
+ const InterModeRdModel *md = &tile_data->inter_mode_rd_models[bsize];
+ const int do_tx_search = !md->ready;
+#else
+ const int do_tx_search = 1;
+#endif
+ InterModesInfo *inter_modes_info = &tile_data->inter_modes_info;
+ inter_modes_info->num = 0;
+#endif
+
+ // Intra modes are queued in intra_mode_idx_ls by the main loop and
+ // evaluated in a separate pass further below.
+ int intra_mode_num = 0;
+ int intra_mode_idx_ls[MAX_MODES];
+ int reach_first_comp_mode = 0;
+
+ // Temporary buffers used by handle_inter_mode().
+ // We allocate them once and reuse it in every call to that function.
+ // Note: Must be allocated on the heap due to large size of the arrays.
+ uint8_t *tmp_buf_orig;
+ CHECK_MEM_ERROR(
+ cm, tmp_buf_orig,
+ (uint8_t *)aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE));
+ uint8_t *const tmp_buf = get_buf_by_bd(xd, tmp_buf_orig);
+
+ CompoundTypeRdBuffers rd_buffers;
+ alloc_compound_type_rd_buffers(cm, &rd_buffers);
+
+ // Main loop over the candidate modes, in mode_map order.
+ for (int midx = 0; midx < MAX_MODES; ++midx) {
+ int mode_index = mode_map[midx];
+ int64_t this_rd = INT64_MAX;
+ int disable_skip = 0;
+ int rate2 = 0, rate_y = 0, rate_uv = 0;
+ int64_t distortion2 = 0;
+ int skippable = 0;
+ int this_skip2 = 0;
+ const MODE_DEFINITION *mode_order = &av1_mode_order[mode_index];
+ const MV_REFERENCE_FRAME ref_frame = mode_order->ref_frame[0];
+ const MV_REFERENCE_FRAME second_ref_frame = mode_order->ref_frame[1];
+ const int comp_pred = second_ref_frame > INTRA_FRAME;
+ this_mode = mode_order->mode;
+
+ init_mbmi(mbmi, mode_index, cm);
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
+
+ // Reach the first compound prediction mode
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred &&
+ reach_first_comp_mode == 0) {
+ analyze_single_states(cpi, &search_state);
+ reach_first_comp_mode = 1;
+ }
+ // A return value of 1 skips the mode entirely; 2 keeps the mode but sets
+ // args.skip_motion_mode for handle_inter_mode().
+ const int ret = inter_mode_search_order_independent_skip(
+ cpi, ctx, x, bsize, mode_index, mi_row, mi_col, mode_skip_mask,
+ ref_frame_skip_mask, &search_state);
+ if (ret == 1) continue;
+ args.skip_motion_mode = (ret == 2);
+
+ if (sf->drop_ref && comp_pred) {
+ if (sf_check_is_drop_ref(mode_order, &search_state)) {
+ continue;
+ }
+ }
+
+ if (search_state.best_rd < search_state.mode_threshold[mode_index])
+ continue;
+
+ if (sf->prune_comp_search_by_single_result > 0 && comp_pred) {
+ if (compound_skip_by_single_states(cpi, &search_state, this_mode,
+ ref_frame, second_ref_frame, x))
+ continue;
+ }
+
+ const int ref_frame_cost = comp_pred
+ ? ref_costs_comp[ref_frame][second_ref_frame]
+ : ref_costs_single[ref_frame];
+ const int compmode_cost =
+ is_comp_ref_allowed(mbmi->sb_type) ? comp_inter_cost[comp_pred] : 0;
+ const int real_compmode_cost =
+ cm->reference_mode == REFERENCE_MODE_SELECT ? compmode_cost : 0;
+
+ if (comp_pred) {
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
+ search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] == INTRA_FRAME)
+ continue;
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ if (sf->adaptive_mode_search)
+ if ((x->source_variance << num_pels_log2_lookup[bsize]) >
+ search_state.best_pred_sse)
+ continue;
+
+ if (this_mode != DC_PRED) {
+ // Only search the oblique modes if the best so far is
+ // one of the neighboring directional modes
+ if ((sf->mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
+ (this_mode >= D45_PRED && this_mode <= PAETH_PRED)) {
+ if (search_state.best_mode_index >= 0 &&
+ search_state.best_mbmode.ref_frame[0] > INTRA_FRAME)
+ continue;
+ }
+ if (sf->mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
+ if (conditional_skipintra(this_mode, search_state.best_intra_mode))
+ continue;
+ }
+ }
+ }
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
+ }
+
+ if (ref_frame == INTRA_FRAME) {
+ // Defer the intra mode: remember its index and move on.
+ intra_mode_idx_ls[intra_mode_num++] = mode_index;
+ continue;
+ } else {
+ mbmi->angle_delta[PLANE_TYPE_Y] = 0;
+ mbmi->angle_delta[PLANE_TYPE_UV] = 0;
+ mbmi->filter_intra_mode_info.use_filter_intra = 0;
+ mbmi->ref_mv_idx = 0;
+ int64_t ref_best_rd = search_state.best_rd;
+ {
+ RD_STATS rd_stats, rd_stats_y, rd_stats_uv;
+ av1_init_rd_stats(&rd_stats);
+ rd_stats.rate = rate2;
+
+ // Point to variables that are maintained between loop iterations
+ args.single_newmv = search_state.single_newmv;
+ args.single_newmv_rate = search_state.single_newmv_rate;
+ args.single_newmv_valid = search_state.single_newmv_valid;
+ args.single_comp_cost = real_compmode_cost;
+ args.ref_frame_cost = ref_frame_cost;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ this_rd = handle_inter_mode(
+ cpi, x, bsize, &rd_stats, &rd_stats_y, &rd_stats_uv, &disable_skip,
+ mi_row, mi_col, &args, ref_best_rd, tmp_buf, &rd_buffers, tile_data,
+ &best_est_rd, do_tx_search, inter_modes_info);
+#else
+ this_rd = handle_inter_mode(cpi, x, bsize, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, &disable_skip, mi_row, mi_col,
+ &args, ref_best_rd, tmp_buf, &rd_buffers);
+#endif
+ rate2 = rd_stats.rate;
+ skippable = rd_stats.skip;
+ distortion2 = rd_stats.dist;
+ rate_y = rd_stats_y.rate;
+ rate_uv = rd_stats_uv.rate;
+ }
+
+ // Single-reference results are recorded even when this_rd is INT64_MAX,
+ // because the check below comes after this call.
+ if (sf->prune_comp_search_by_single_result > 0 &&
+ is_inter_singleref_mode(this_mode)) {
+ collect_single_states(x, &search_state, mbmi);
+ }
+
+ if (this_rd == INT64_MAX) continue;
+
+ this_skip2 = mbmi->skip;
+ this_rd = RDCOST(x->rdmult, rate2, distortion2);
+ if (this_skip2) {
+ rate_y = 0;
+ rate_uv = 0;
+ }
+ }
+
+ // Did this mode help.. i.e. is it the new best mode
+ if (this_rd < search_state.best_rd || x->skip) {
+ int mode_excluded = 0;
+ if (comp_pred) {
+ mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
+ }
+ if (!mode_excluded) {
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+
+ if (ref_frame == INTRA_FRAME) {
+ /* required for left and above block mv */
+ mbmi->mv[0].as_int = 0;
+ } else {
+ search_state.best_pred_sse = x->pred_sse[ref_frame];
+ }
+
+ rd_cost->rate = rate2;
+ rd_cost->dist = distortion2;
+ rd_cost->rdcost = this_rd;
+ search_state.best_rd = this_rd;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = this_skip2;
+ search_state.best_mode_skippable = skippable;
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ if (do_tx_search) {
+ // When do_tx_search == 0, handle_inter_mode won't provide correct
+ // rate_y and rate_uv because txfm_search process is replaced by
+ // rd estimation.
+ // Therefore, we should avoid updating best_rate_y and best_rate_uv
+ // here. These two values will be updated when txfm_search is called
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+ }
+#else // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ search_state.best_rate_y =
+ rate_y +
+ x->skip_cost[av1_get_skip_context(xd)][this_skip2 || skippable];
+ search_state.best_rate_uv = rate_uv;
+#endif // CONFIG_COLLECT_INTER_MODE_RD_STATS
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ /* keep record of best compound/single-only prediction */
+ if (!disable_skip && ref_frame != INTRA_FRAME) {
+ int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
+
+ if (cm->reference_mode == REFERENCE_MODE_SELECT) {
+ single_rate = rate2 - compmode_cost;
+ hybrid_rate = rate2;
+ } else {
+ single_rate = rate2;
+ hybrid_rate = rate2 + compmode_cost;
+ }
+
+ single_rd = RDCOST(x->rdmult, single_rate, distortion2);
+ hybrid_rd = RDCOST(x->rdmult, hybrid_rate, distortion2);
+
+ if (!comp_pred) {
+ if (single_rd < search_state.best_pred_rd[SINGLE_REFERENCE])
+ search_state.best_pred_rd[SINGLE_REFERENCE] = single_rd;
+ } else {
+ if (single_rd < search_state.best_pred_rd[COMPOUND_REFERENCE])
+ search_state.best_pred_rd[COMPOUND_REFERENCE] = single_rd;
+ }
+ if (hybrid_rd < search_state.best_pred_rd[REFERENCE_MODE_SELECT])
+ search_state.best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
+ }
+ if (sf->drop_ref && second_ref_frame == NONE_FRAME) {
+ // Collect data from single ref mode, and analyze data.
+ sf_drop_ref_analyze(&search_state, mode_order, distortion2);
+ }
+
+ // A single-reference mode that left x->skip set ends the search early.
+ if (x->skip && !comp_pred) break;
+ }
+
+ aom_free(tmp_buf_orig);
+ tmp_buf_orig = NULL;
+ release_compound_type_rd_buffers(&rd_buffers);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+ // Deferred transform search: the candidates are sorted, best_rd is reset,
+ // and each remaining candidate is rebuilt and run through txfm_search().
+ if (!do_tx_search) {
+ inter_modes_info_sort(inter_modes_info, inter_modes_info->rd_idx_pair_arr);
+ search_state.best_rd = INT64_MAX;
+
+ // NOTE(review): element 0 is read before the loop checks
+ // inter_modes_info->num; confirm this is safe when num == 0.
+ int64_t top_est_rd =
+ inter_modes_info->est_rd_arr[inter_modes_info->rd_idx_pair_arr[0].idx];
+ for (int j = 0; j < inter_modes_info->num; ++j) {
+ const int data_idx = inter_modes_info->rd_idx_pair_arr[j].idx;
+ *mbmi = inter_modes_info->mbmi_arr[data_idx];
+ int64_t curr_est_rd = inter_modes_info->est_rd_arr[data_idx];
+ // Skip candidates whose estimate, scaled by 0.9, still exceeds the
+ // estimate of the first sorted candidate.
+ if (curr_est_rd * 0.9 > top_est_rd) {
+ continue;
+ }
+ const int mode_rate = inter_modes_info->mode_rate_arr[data_idx];
+
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+ // Select prediction reference frames.
+ const int is_comp_pred = mbmi->ref_frame[1] > INTRA_FRAME;
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+ if (is_comp_pred) xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+ }
+
+ RD_STATS rd_stats;
+ RD_STATS rd_stats_y;
+ RD_STATS rd_stats_uv;
+
+ av1_build_inter_predictors_sb(cm, xd, mi_row, mi_col, NULL, bsize);
+ if (mbmi->motion_mode == OBMC_CAUSAL)
+ av1_build_obmc_inter_predictors_sb(cm, xd, mi_row, mi_col);
+
+ if (!txfm_search(cpi, x, bsize, mi_row, mi_col, &rd_stats, &rd_stats_y,
+ &rd_stats_uv, mode_rate, search_state.best_rd)) {
+ continue;
+ } else {
+ const int skip_ctx = av1_get_skip_context(xd);
+ inter_mode_data_push(tile_data, mbmi->sb_type, rd_stats.sse,
+ rd_stats.dist,
+ rd_stats_y.rate + rd_stats_uv.rate +
+ x->skip_cost[skip_ctx][mbmi->skip]);
+ }
+ rd_stats.rdcost = RDCOST(x->rdmult, rd_stats.rate, rd_stats.dist);
+
+ if (rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = rd_stats.rdcost;
+ // Note index of best mode so far
+ const int mode_index = get_prediction_mode_idx(
+ mbmi->mode, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+ search_state.best_mode_index = mode_index;
+ *rd_cost = rd_stats;
+ // NOTE(review): best_rd was already assigned a few lines above; this
+ // second store is redundant.
+ search_state.best_rd = rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = mbmi->skip;
+ search_state.best_mode_skippable = rd_stats.skip;
+ search_state.best_rate_y =
+ rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][rd_stats.skip || mbmi->skip];
+ search_state.best_rate_uv = rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+ }
+#endif
+
+ // Evaluate the intra modes that were queued by the main loop.
+ for (int j = 0; j < intra_mode_num; ++j) {
+ const int mode_index = intra_mode_idx_ls[j];
+ const MV_REFERENCE_FRAME ref_frame =
+ av1_mode_order[mode_index].ref_frame[0];
+ assert(av1_mode_order[mode_index].ref_frame[1] == NONE_FRAME);
+ assert(ref_frame == INTRA_FRAME);
+ if (sf->skip_intra_in_interframe && search_state.skip_intra_modes) break;
+ init_mbmi(mbmi, mode_index, cm);
+ x->skip = 0;
+ set_ref_ptrs(cm, xd, INTRA_FRAME, NONE_FRAME);
+
+ // Select prediction reference frames.
+ for (i = 0; i < num_planes; i++) {
+ xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
+ }
+
+ RD_STATS intra_rd_stats, intra_rd_stats_y, intra_rd_stats_uv;
+
+ const int ref_frame_cost = ref_costs_single[ref_frame];
+ intra_rd_stats.rdcost = handle_intra_mode(
+ &search_state, cpi, x, bsize, mi_row, mi_col, ref_frame_cost, ctx, 0,
+ &intra_rd_stats, &intra_rd_stats_y, &intra_rd_stats_uv);
+ if (intra_rd_stats.rdcost < search_state.best_rd) {
+ search_state.best_rd = intra_rd_stats.rdcost;
+ // Note index of best mode so far
+ search_state.best_mode_index = mode_index;
+ *rd_cost = intra_rd_stats;
+ // NOTE(review): redundant with the assignment at the top of this branch.
+ search_state.best_rd = intra_rd_stats.rdcost;
+ search_state.best_mbmode = *mbmi;
+ search_state.best_skip2 = 0;
+ search_state.best_mode_skippable = intra_rd_stats.skip;
+ search_state.best_rate_y =
+ intra_rd_stats_y.rate +
+ x->skip_cost[av1_get_skip_context(xd)][intra_rd_stats.skip];
+ search_state.best_rate_uv = intra_rd_stats_uv.rate;
+ memcpy(ctx->blk_skip, x->blk_skip,
+ sizeof(x->blk_skip[0]) * ctx->num_4x4_blk);
+ }
+ }
+
+ // In effect only when speed >= 2.
+ sf_refine_fast_tx_type_search(
+ cpi, x, mi_row, mi_col, rd_cost, bsize, ctx, search_state.best_mode_index,
+ &search_state.best_mbmode, yv12_mb, search_state.best_rate_y,
+ search_state.best_rate_uv, &search_state.best_skip2);
+
+ // Only try palette mode when the best mode so far is an intra mode.
+ if (try_palette && !is_inter_mode(search_state.best_mbmode.mode)) {
+ search_palette_mode(cpi, x, mi_row, mi_col, rd_cost, ctx, bsize, mbmi, pmi,
+ ref_costs_single, &search_state);
+ }
+
+ search_state.best_mbmode.skip_mode = 0;
+ if (cm->skip_mode_flag &&
+ !segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
+ is_comp_ref_allowed(bsize)) {
+ rd_pick_skip_mode(rd_cost, &search_state, cpi, x, bsize, mi_row, mi_col,
+ yv12_mb);
+ }
+
+ // Make sure that the ref_mv_idx is only nonzero when we're
+ // using a mode which can support ref_mv_idx
+ if (search_state.best_mbmode.ref_mv_idx != 0 &&
+ !(search_state.best_mbmode.mode == NEWMV ||
+ search_state.best_mbmode.mode == NEW_NEWMV ||
+ have_nearmv_in_inter_mode(search_state.best_mbmode.mode))) {
+ search_state.best_mbmode.ref_mv_idx = 0;
+ }
+
+ // Nothing usable found, or nothing better than the caller's bound: report
+ // an invalid cost and leave.
+ if (search_state.best_mode_index < 0 ||
+ search_state.best_rd >= best_rd_so_far) {
+ rd_cost->rate = INT_MAX;
+ rd_cost->rdcost = INT64_MAX;
+ return;
+ }
+
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 0)) ||
+ !is_inter_block(&search_state.best_mbmode));
+ assert(
+ (cm->interp_filter == SWITCHABLE) ||
+ (cm->interp_filter ==
+ av1_extract_interp_filter(search_state.best_mbmode.interp_filters, 1)) ||
+ !is_inter_block(&search_state.best_mbmode));
+
+ if (!cpi->rc.is_src_frame_alt_ref)
+ av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+ sf->adaptive_rd_thresh, bsize,
+ search_state.best_mode_index);
+
+ // macroblock modes
+ *mbmi = search_state.best_mbmode;
+ x->skip |= search_state.best_skip2;
+
+ // Note: this section is needed since the mode may have been forced to
+ // GLOBALMV by the all-zero mode handling of ref-mv.
+ if (mbmi->mode == GLOBALMV || mbmi->mode == GLOBAL_GLOBALMV) {
+ // Correct the interp filters for GLOBALMV
+ if (is_nontrans_global_motion(xd, xd->mi[0])) {
+ assert(mbmi->interp_filters ==
+ av1_broadcast_interp_filter(
+ av1_unswitchable_filter(cm->interp_filter)));
+ }
+ }
+
+ // Difference between the overall best rd and the best rd of each reference
+ // mode; INT_MIN marks a reference mode for which no rd was recorded.
+ for (i = 0; i < REFERENCE_MODES; ++i) {
+ if (search_state.best_pred_rd[i] == INT64_MAX)
+ search_state.best_pred_diff[i] = INT_MIN;
+ else
+ search_state.best_pred_diff[i] =
+ search_state.best_rd - search_state.best_pred_rd[i];
+ }
+
+ x->skip |= search_state.best_mode_skippable;
+
+ assert(search_state.best_mode_index >= 0);
+
+ store_coding_context(x, ctx, search_state.best_mode_index,
+ search_state.best_pred_diff,
+ search_state.best_mode_skippable);
+
+ if (pmi->palette_size[1] > 0) {
+ assert(try_palette);
+ restore_uv_color_map(cpi, x);
+ }
+}
+
+// Mode decision for a block whose segment has SEG_LVL_SKIP active.
+//
+// The block is forced to single-reference GLOBALMV with simple translation
+// and x->skip = 1, so only the signalling rate is costed and the distortion
+// is taken as zero. When the resulting RD cost is below 'best_rd_so_far' the
+// choice is recorded in 'ctx' and 'rd_cost' carries rate/dist/rdcost.
+// Otherwise rd_cost->rate is INT_MAX, rd_cost->rdcost is INT64_MAX and
+// nothing is stored in 'ctx'.
+void av1_rd_pick_inter_mode_sb_seg_skip(const AV1_COMP *cpi,
+                                        TileDataEnc *tile_data, MACROBLOCK *x,
+                                        int mi_row, int mi_col,
+                                        RD_STATS *rd_cost, BLOCK_SIZE bsize,
+                                        PICK_MODE_CONTEXT *ctx,
+                                        int64_t best_rd_so_far) {
+  const AV1_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const unsigned char segment_id = mbmi->segment_id;
+  unsigned int ref_costs_single[REF_FRAMES];
+  unsigned int ref_costs_comp[REF_FRAMES][REF_FRAMES];
+  int *const comp_inter_cost =
+      x->comp_inter_cost[av1_get_reference_mode_context(xd)];
+
+  av1_collect_neighbors_ref_counts(xd);
+
+  estimate_ref_frame_costs(cm, xd, x, segment_id, ref_costs_single,
+                           ref_costs_comp);
+
+  for (int ref = 0; ref < REF_FRAMES; ++ref) x->pred_sse[ref] = INT_MAX;
+  for (int ref = LAST_FRAME; ref < REF_FRAMES; ++ref)
+    x->pred_mv_sad[ref] = INT_MAX;
+
+  rd_cost->rate = INT_MAX;
+
+  assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
+
+  // Fixed mode settings: no palette, no filter-intra, GLOBALMV on one ref.
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  mbmi->filter_intra_mode_info.use_filter_intra = 0;
+  mbmi->mode = GLOBALMV;
+  mbmi->motion_mode = SIMPLE_TRANSLATION;
+  mbmi->uv_mode = UV_DC_PRED;
+  // The segment may pin the reference frame; LAST_FRAME is the fallback.
+  mbmi->ref_frame[0] =
+      segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)
+          ? get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME)
+          : LAST_FRAME;
+  mbmi->ref_frame[1] = NONE_FRAME;
+  mbmi->mv[0].as_int =
+      gm_get_motion_vector(&cm->global_motion[mbmi->ref_frame[0]],
+                           cm->allow_high_precision_mv, bsize, mi_col, mi_row,
+                           cm->cur_frame_force_integer_mv)
+          .as_int;
+  mbmi->tx_size = max_txsize_lookup[bsize];
+  x->skip = 1;
+
+  mbmi->ref_mv_idx = 0;
+
+  // Gather the projection samples for this block and its neighbours.
+  av1_count_overlappable_neighbors(cm, xd, mi_row, mi_col);
+  if (is_motion_variation_allowed_bsize(bsize) && !has_second_ref(mbmi)) {
+    int pts[SAMPLES_ARRAY_SIZE], pts_inref[SAMPLES_ARRAY_SIZE];
+    mbmi->num_proj_ref = findSamples(cm, xd, mi_row, mi_col, pts, pts_inref);
+    // Keep only the samples selected by motion vector difference.
+    if (mbmi->num_proj_ref > 1)
+      mbmi->num_proj_ref = selectSamples(&mbmi->mv[0].as_mv, pts, pts_inref,
+                                         mbmi->num_proj_ref, bsize);
+  }
+
+  set_default_interp_filters(mbmi, cm->interp_filter);
+
+  // A frame-level filter is used as is; with SWITCHABLE, pick the filter that
+  // is cheapest to signal (EIGHTTAP_REGULAR when no search is warranted).
+  InterpFilter best_filter = cm->interp_filter;
+  if (best_filter == SWITCHABLE) {
+    best_filter = EIGHTTAP_REGULAR;
+    if (av1_is_interp_needed(xd) && av1_is_interp_search_needed(xd) &&
+        x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
+      int lowest_rate = INT_MAX;
+      for (int f = 0; f < SWITCHABLE_FILTERS; ++f) {
+        mbmi->interp_filters = av1_broadcast_interp_filter(f);
+        const int filter_rate = av1_get_switchable_rate(cm, x, xd);
+        if (filter_rate >= lowest_rate) continue;
+        lowest_rate = filter_rate;
+        best_filter = av1_extract_interp_filter(mbmi->interp_filters, 0);
+      }
+    }
+  }
+  mbmi->interp_filters = av1_broadcast_interp_filter(best_filter);
+
+  // Rate = filter signalling + (optional) single/compound flag + ref frame.
+  int rate = 0;
+  rate += av1_get_switchable_rate(cm, x, xd);
+  if (cm->reference_mode == REFERENCE_MODE_SELECT) rate += comp_inter_cost[0];
+  rate += ref_costs_single[LAST_FRAME];
+
+  const int64_t dist = 0;
+  const int64_t rd = RDCOST(x->rdmult, rate, dist);
+
+  rd_cost->rate = rate;
+  rd_cost->dist = dist;
+  rd_cost->rdcost = rd;
+
+  if (rd >= best_rd_so_far) {
+    rd_cost->rate = INT_MAX;
+    rd_cost->rdcost = INT64_MAX;
+    return;
+  }
+
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter ==
+          av1_extract_interp_filter(mbmi->interp_filters, 0)));
+
+  av1_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                            cpi->sf.adaptive_rd_thresh, bsize, THR_GLOBALMV);
+
+  int64_t best_pred_diff[REFERENCE_MODES] = { 0 };
+  store_coding_context(x, ctx, THR_GLOBALMV, best_pred_diff, 0);
+}
+
+struct calc_target_weighted_pred_ctxt {  // state handed to the per-neighbour callbacks below via 'fun_ctxt'
+ const MACROBLOCK *x;  // supplies the wsrc_buf / mask_buf that the callbacks write into
+ const uint8_t *tmp;  // neighbour prediction samples; reinterpreted via CONVERT_TO_SHORTPTR on high-bitdepth frames
+ int tmp_stride;  // row stride of 'tmp', in samples
+ int overlap;  // number of rows (above) or columns (left) that are blended
+};
+
+// Per-neighbour callback for the row of blocks above the current block.
+// For the first 'overlap' rows of the columns covered by this neighbour it
+// stores (MAX_ALPHA - m) * prediction in wsrc_buf and m in mask_buf, where m
+// is the OBMC mask weight for the row. 'nb_mi' and 'num_planes' are unused.
+static INLINE void calc_target_weighted_pred_above(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t nb_mi_width, MB_MODE_INFO *nb_mi,
+    void *fun_ctxt, const int num_planes) {
+  (void)nb_mi;
+  (void)num_planes;
+
+  struct calc_target_weighted_pred_ctxt *const state =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int buf_stride = xd->n4_w << MI_SIZE_LOG2;
+  const int first_col = rel_mi_col * MI_SIZE;
+  const int num_cols = nb_mi_width * MI_SIZE;
+  const int num_rows = state->overlap;
+  const uint8_t *const weights = av1_get_obmc_mask(num_rows);
+
+  int32_t *wsrc_row = state->x->wsrc_buf + first_col;
+  int32_t *mask_row = state->x->mask_buf + first_col;
+  const uint8_t *pred8 = state->tmp + first_col;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bitdepth: the byte pointer actually addresses 16-bit samples.
+    const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred8);
+
+    for (int r = 0; r < num_rows; ++r) {
+      const uint8_t keep = weights[r];
+      const uint8_t blend = AOM_BLEND_A64_MAX_ALPHA - keep;
+      for (int c = 0; c < num_cols; ++c) {
+        wsrc_row[c] = blend * pred16[c];
+        mask_row[c] = keep;
+      }
+      wsrc_row += buf_stride;
+      mask_row += buf_stride;
+      pred16 += state->tmp_stride;
+    }
+    return;
+  }
+
+  for (int r = 0; r < num_rows; ++r) {
+    const uint8_t keep = weights[r];
+    const uint8_t blend = AOM_BLEND_A64_MAX_ALPHA - keep;
+    for (int c = 0; c < num_cols; ++c) {
+      wsrc_row[c] = blend * pred8[c];
+      mask_row[c] = keep;
+    }
+    wsrc_row += buf_stride;
+    mask_row += buf_stride;
+    pred8 += state->tmp_stride;
+  }
+}
+
+// Per-neighbour callback for the column of blocks left of the current block.
+// For the first 'overlap' columns of the rows covered by this neighbour it
+// folds the left prediction into the already accumulated values:
+//   wsrc = (wsrc >> ROUND_BITS) * m + (pred << ROUND_BITS) * (MAX_ALPHA - m)
+//   mask = (mask >> ROUND_BITS) * m
+// where m is the OBMC mask weight for the column. 'nb_mi' and 'num_planes'
+// are unused.
+static INLINE void calc_target_weighted_pred_left(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t nb_mi_height, MB_MODE_INFO *nb_mi,
+    void *fun_ctxt, const int num_planes) {
+  (void)nb_mi;
+  (void)num_planes;
+
+  struct calc_target_weighted_pred_ctxt *const state =
+      (struct calc_target_weighted_pred_ctxt *)fun_ctxt;
+
+  const int buf_stride = xd->n4_w << MI_SIZE_LOG2;
+  const int num_rows = nb_mi_height * MI_SIZE;
+  const int num_cols = state->overlap;
+  const uint8_t *const weights = av1_get_obmc_mask(num_cols);
+
+  int32_t *wsrc_row = state->x->wsrc_buf + (rel_mi_row * MI_SIZE * buf_stride);
+  int32_t *mask_row = state->x->mask_buf + (rel_mi_row * MI_SIZE * buf_stride);
+  const uint8_t *pred8 =
+      state->tmp + (rel_mi_row * MI_SIZE * state->tmp_stride);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    // High bitdepth: the byte pointer actually addresses 16-bit samples.
+    const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred8);
+
+    for (int r = 0; r < num_rows; ++r) {
+      for (int c = 0; c < num_cols; ++c) {
+        const uint8_t keep = weights[c];
+        const uint8_t blend = AOM_BLEND_A64_MAX_ALPHA - keep;
+        wsrc_row[c] = (wsrc_row[c] >> AOM_BLEND_A64_ROUND_BITS) * keep +
+                      (pred16[c] << AOM_BLEND_A64_ROUND_BITS) * blend;
+        mask_row[c] = (mask_row[c] >> AOM_BLEND_A64_ROUND_BITS) * keep;
+      }
+      wsrc_row += buf_stride;
+      mask_row += buf_stride;
+      pred16 += state->tmp_stride;
+    }
+    return;
+  }
+
+  for (int r = 0; r < num_rows; ++r) {
+    for (int c = 0; c < num_cols; ++c) {
+      const uint8_t keep = weights[c];
+      const uint8_t blend = AOM_BLEND_A64_MAX_ALPHA - keep;
+      wsrc_row[c] = (wsrc_row[c] >> AOM_BLEND_A64_ROUND_BITS) * keep +
+                    (pred8[c] << AOM_BLEND_A64_ROUND_BITS) * blend;
+      mask_row[c] = (mask_row[c] >> AOM_BLEND_A64_ROUND_BITS) * keep;
+    }
+    wsrc_row += buf_stride;
+    mask_row += buf_stride;
+    pred8 += state->tmp_stride;
+  }
+}
+
+// This function has a structure similar to av1_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+// PObmc(x,y) =
+// AOM_BLEND_A64(Mh(x),
+// AOM_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+// PLeft(x, y))
+//
+// Scaling up by AOM_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+// Mh(x) * Mv(y) * P(x,y) +
+// Mh(x) * Cv(y) * Pabove(x,y) +
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where :
+//
+// Cv(y) = AOM_BLEND_A64_MAX_ALPHA - Mv(y)
+// Ch(x) = AOM_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+// wsrc(x, y) =
+// AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA * src(x, y) -
+// Mh(x) * Cv(y) * Pabove(x,y) -
+// AOM_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+// error(x, y) =
+// (wsrc(x, y) - mask(x, y) * P(x, y)) / (AOM_BLEND_A64_MAX_ALPHA ** 2)
+//
+// Fills x->wsrc_buf and x->mask_buf for the luma plane of the current block.
+// 'above' / 'left' hold the neighbour predictions (with their strides); they
+// are only consulted when xd->up_available / xd->left_available is set.
+// On return mask_buf holds the combined blend weight of the block's own
+// predictor, and wsrc_buf holds the scaled source minus the weighted
+// neighbour predictions.
+static void calc_target_weighted_pred(const AV1_COMMON *cm, const MACROBLOCK *x,
+                                      const MACROBLOCKD *xd, int mi_row,
+                                      int mi_col, const uint8_t *above,
+                                      int above_stride, const uint8_t *left,
+                                      int left_stride) {
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  const int blk_w = xd->n4_w << MI_SIZE_LOG2;
+  const int blk_h = xd->n4_h << MI_SIZE_LOG2;
+  const int num_px = blk_w * blk_h;
+  int32_t *const mask = x->mask_buf;
+  int32_t *const wsrc = x->wsrc_buf;
+
+  // plane 0 should not be subsampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  // Start from "no neighbour contribution": zero residual, full own weight.
+  av1_zero_array(wsrc, num_px);
+  for (int i = 0; i < num_px; ++i) mask[i] = AOM_BLEND_A64_MAX_ALPHA;
+
+  // Vertical blend with the row of neighbours above.
+  if (xd->up_available) {
+    const int overlap =
+        AOMMIN(block_size_high[bsize], block_size_high[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, above, above_stride,
+                                                   overlap };
+    foreach_overlappable_nb_above(cm, (MACROBLOCKD *)xd, mi_col,
+                                  max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                  calc_target_weighted_pred_above, &ctxt);
+  }
+
+  // Bring both buffers to the MAX_ALPHA ** 2 scale used by the left pass.
+  for (int i = 0; i < num_px; ++i) {
+    wsrc[i] *= AOM_BLEND_A64_MAX_ALPHA;
+    mask[i] *= AOM_BLEND_A64_MAX_ALPHA;
+  }
+
+  // Horizontal blend with the column of neighbours to the left.
+  if (xd->left_available) {
+    const int overlap =
+        AOMMIN(block_size_wide[bsize], block_size_wide[BLOCK_64X64]) >> 1;
+    struct calc_target_weighted_pred_ctxt ctxt = { x, left, left_stride,
+                                                   overlap };
+    foreach_overlappable_nb_left(cm, (MACROBLOCKD *)xd, mi_row,
+                                 max_neighbor_obmc[mi_size_high_log2[bsize]],
+                                 calc_target_weighted_pred_left, &ctxt);
+  }
+
+  // Finally turn the accumulated neighbour terms into the weighted source.
+  const int scale = AOM_BLEND_A64_MAX_ALPHA * AOM_BLEND_A64_MAX_ALPHA;
+  const int src_stride = x->plane[0].src.stride;
+  int32_t *out_row = wsrc;
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    const uint16_t *src_row = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+    for (int r = 0; r < blk_h; ++r, out_row += blk_w, src_row += src_stride) {
+      for (int c = 0; c < blk_w; ++c)
+        out_row[c] = src_row[c] * scale - out_row[c];
+    }
+  } else {
+    const uint8_t *src_row = x->plane[0].src.buf;
+    for (int r = 0; r < blk_h; ++r, out_row += blk_w, src_row += src_stride) {
+      for (int c = 0; c < blk_w; ++c)
+        out_row[c] = src_row[c] * scale - out_row[c];
+    }
+  }
+}
diff --git a/third_party/aom/av1/encoder/rdopt.h b/third_party/aom/av1/encoder/rdopt.h
new file mode 100644
index 0000000000..4c11f90b8f
--- /dev/null
+++ b/third_party/aom/av1/encoder/rdopt.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RDOPT_H_
+#define AOM_AV1_ENCODER_RDOPT_H_
+
+#include "av1/common/blockd.h"
+#include "av1/common/txb_common.h"
+
+#include "av1/encoder/block.h"
+#include "av1/encoder/context_tree.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_REF_MV_SERCH 3  // NOTE(review): "SERCH" looks like a misspelling of "SEARCH"; name left as is since other files may reference it
+#define DEFAULT_LUMA_INTERP_SKIP_FLAG 1  // bit 0 of the skip-flag mask
+#define DEFAULT_CHROMA_INTERP_SKIP_FLAG 2  // bit 1 of the skip-flag mask
+#define DEFAULT_INTERP_SKIP_FLAG \
+ (DEFAULT_LUMA_INTERP_SKIP_FLAG | DEFAULT_CHROMA_INTERP_SKIP_FLAG)  // luma and chroma bits both set
+
+struct TileInfo;
+struct macroblock;
+struct RD_STATS;
+
+#if CONFIG_RD_DEBUG
+// Debug-only bookkeeping (CONFIG_RD_DEBUG) of transform-block coefficient
+// cost.
+//
+// Adds 'txb_coeff_cost' to the per-plane total in rd_stats, clears the cost
+// map cells covered by the transform block (tx_size_high_unit x
+// tx_size_wide_unit cells starting at blk_row/blk_col), and records the cost
+// in the block's top-left cell.
+//
+// The position checks run before any map cell is written, so that an
+// out-of-range position is reported before the map is touched rather than
+// after the out-of-bounds writes have already happened.
+static INLINE void av1_update_txb_coeff_cost(RD_STATS *rd_stats, int plane,
+                                             TX_SIZE tx_size, int blk_row,
+                                             int blk_col, int txb_coeff_cost) {
+  assert(blk_row < TXB_COEFF_COST_MAP_SIZE);
+  assert(blk_col < TXB_COEFF_COST_MAP_SIZE);
+
+  rd_stats->txb_coeff_cost[plane] += txb_coeff_cost;
+
+  const int txb_h = tx_size_high_unit[tx_size];
+  const int txb_w = tx_size_wide_unit[tx_size];
+  for (int idy = 0; idy < txb_h; ++idy)
+    for (int idx = 0; idx < txb_w; ++idx)
+      rd_stats->txb_coeff_cost_map[plane][blk_row + idy][blk_col + idx] = 0;
+
+  // Only the top-left cell of the block carries the cost.
+  rd_stats->txb_coeff_cost_map[plane][blk_row][blk_col] = txb_coeff_cost;
+}
+#endif
+
+// Returns the number of colors in 'src'.
+int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
+ int *val_count);
+// Same as av1_count_colors(), but for high-bitdepth mode.
+int av1_count_colors_highbd(const uint8_t *src8, int stride, int rows, int cols,
+ int bit_depth, int *val_count);
+
+#if CONFIG_DIST_8X8
+int64_t av1_dist_8x8(const struct AV1_COMP *const cpi, const MACROBLOCK *x,
+ const uint8_t *src, int src_stride, const uint8_t *dst,
+ int dst_stride, const BLOCK_SIZE tx_bsize, int bsw,
+ int bsh, int visible_w, int visible_h, int qindex);
+#endif
+
+// Returns the cost of signalling "transform block skipped" (entry [1] of the
+// txb_skip_cost table) for the given plane and transform size, using the
+// skip context carried in 'txb_ctx'.
+static INLINE int av1_cost_skip_txb(MACROBLOCK *x, const TXB_CTX *const txb_ctx,
+                                    int plane, TX_SIZE tx_size) {
+  const LV_MAP_COEFF_COST *const costs =
+      &x->coeff_costs[get_txsize_entropy_ctx(tx_size)][get_plane_type(plane)];
+  const int skip_ctx = txb_ctx->txb_skip_ctx;
+  return costs->txb_skip_cost[skip_ctx][1];
+}
+
+// Returns the coefficient cost of one transform block by forwarding to
+// av1_cost_coeffs_txb(). 'use_fast_coef_costing' is accepted for interface
+// compatibility and ignored.
+//
+// When TXCOEFF_COST_TIMER is enabled the elapsed time and call count are
+// accumulated in the common state.
+static INLINE int av1_cost_coeffs(const AV1_COMMON *const cm, MACROBLOCK *x,
+                                  int plane, int block, TX_SIZE tx_size,
+                                  const TX_TYPE tx_type,
+                                  const TXB_CTX *const txb_ctx,
+                                  int use_fast_coef_costing) {
+#if TXCOEFF_COST_TIMER
+  struct aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+#endif
+  (void)use_fast_coef_costing;
+  const int cost =
+      av1_cost_coeffs_txb(cm, x, plane, block, tx_size, tx_type, txb_ctx);
+#if TXCOEFF_COST_TIMER
+  // There is no 'cpi' in this function, so the statistics are reached
+  // through the 'cm' parameter. The const is cast away because only the
+  // profiling counters are modified.
+  // NOTE(review): assumes 'cm' is the encoder's own common state; confirm
+  // against callers.
+  AV1_COMMON *tmp_cm = (AV1_COMMON *)cm;
+  aom_usec_timer_mark(&timer);
+  const int64_t elapsed_time = aom_usec_timer_elapsed(&timer);
+  tmp_cm->txcoeff_cost_timer += elapsed_time;
+  ++tmp_cm->txcoeff_cost_count;
+#endif
+  return cost;
+}
+
+void av1_rd_pick_intra_mode_sb(const struct AV1_COMP *cpi, struct macroblock *x,
+ int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+ int64_t best_rd);
+
+unsigned int av1_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs);
+unsigned int av1_high_get_sby_perpixel_variance(const struct AV1_COMP *cpi,
+ const struct buf_2d *ref,
+ BLOCK_SIZE bs, int bd);
+
+void av1_rd_pick_inter_mode_sb(struct AV1_COMP *cpi,
+ struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col,
+ struct RD_STATS *rd_cost, BLOCK_SIZE bsize,
+ PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+void av1_rd_pick_inter_mode_sb_seg_skip(
+ const struct AV1_COMP *cpi, struct TileDataEnc *tile_data,
+ struct macroblock *x, int mi_row, int mi_col, struct RD_STATS *rd_cost,
+ BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, int64_t best_rd_so_far);
+
+#if CONFIG_COLLECT_INTER_MODE_RD_STATS
+void av1_inter_mode_data_init(struct TileDataEnc *tile_data);
+void av1_inter_mode_data_fit(TileDataEnc *tile_data, int rdmult);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RDOPT_H_
diff --git a/third_party/aom/av1/encoder/reconinter_enc.c b/third_party/aom/av1/encoder/reconinter_enc.c
new file mode 100644
index 0000000000..23d920fc32
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.c
@@ -0,0 +1,627 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_scale_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+
+#include "av1/common/blockd.h"
+#include "av1/common/mvref_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/reconintra.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/obmc.h"
+#include "av1/encoder/reconinter_enc.h"
+/*
+ * Computes where to read reference samples for one prediction block.
+ *
+ * Outputs:
+ *   *pre           - pointer into the reference buffer at the block's
+ *                    integer sample position
+ *   *subpel_params - fractional position (subpel_x/subpel_y) and step sizes
+ *                    (xs/ys)
+ *
+ * When 'sf' describes a scaled reference, the position is built from
+ * (pre_x + x, pre_y + y) plus the motion vector, mapped through
+ * sf->scale_value_x/y, clamped, and addressed relative to pre_buf->buf0.
+ * Otherwise the motion vector is clamped with clamp_mv_to_umv_border_sb()
+ * and the position is addressed relative to pre_buf->buf; pre_x and pre_y
+ * are not used on that path.
+ */
+static INLINE void calc_subpel_params(
+ MACROBLOCKD *xd, const struct scale_factors *const sf, const MV mv,
+ int plane, const int pre_x, const int pre_y, int x, int y,
+ struct buf_2d *const pre_buf, uint8_t **pre, SubpelParams *subpel_params,
+ int bw, int bh) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int is_scaled = av1_is_scaled(sf);
+ if (is_scaled) {
+ int ssx = pd->subsampling_x;
+ int ssy = pd->subsampling_y;
+ int orig_pos_y = (pre_y + y) << SUBPEL_BITS;  // block position in sub-pel units
+ orig_pos_y += mv.row * (1 << (1 - ssy));  // mv is doubled on non-subsampled planes
+ int orig_pos_x = (pre_x + x) << SUBPEL_BITS;
+ orig_pos_x += mv.col * (1 << (1 - ssx));
+ int pos_y = sf->scale_value_y(orig_pos_y, sf);  // map into the scaled reference
+ int pos_x = sf->scale_value_x(orig_pos_x, sf);
+ pos_x += SCALE_EXTRA_OFF;
+ pos_y += SCALE_EXTRA_OFF;
+
+ const int top = -AOM_LEFT_TOP_MARGIN_SCALED(ssy);  // clamp window around the reference
+ const int left = -AOM_LEFT_TOP_MARGIN_SCALED(ssx);
+ const int bottom = (pre_buf->height + AOM_INTERP_EXTEND)
+ << SCALE_SUBPEL_BITS;
+ const int right = (pre_buf->width + AOM_INTERP_EXTEND) << SCALE_SUBPEL_BITS;
+ pos_y = clamp(pos_y, top, bottom);
+ pos_x = clamp(pos_x, left, right);
+
+ *pre = pre_buf->buf0 + (pos_y >> SCALE_SUBPEL_BITS) * pre_buf->stride +
+ (pos_x >> SCALE_SUBPEL_BITS);  // integer part selects the sample
+ subpel_params->subpel_x = pos_x & SCALE_SUBPEL_MASK;  // fractional part
+ subpel_params->subpel_y = pos_y & SCALE_SUBPEL_MASK;
+ subpel_params->xs = sf->x_step_q4;
+ subpel_params->ys = sf->y_step_q4;
+ } else {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(
+ xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+ subpel_params->xs = subpel_params->ys = SCALE_SUBPEL_SHIFTS;  // same fixed step in both directions
+ subpel_params->subpel_x = (mv_q4.col & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ subpel_params->subpel_y = (mv_q4.row & SUBPEL_MASK) << SCALE_EXTRA_BITS;
+ *pre = pre_buf->buf + (y + (mv_q4.row >> SUBPEL_BITS)) * pre_buf->stride +
+ (x + (mv_q4.col >> SUBPEL_BITS));
+ }
+}
+/*
+ * Builds the inter prediction of one plane for the block described by 'mi'
+ * and writes it to xd->plane[plane].dst.
+ *
+ *   cm             - common state (frame_refs, sf_identity,
+ *                    allow_warped_motion are read here)
+ *   xd             - block state; pd->pre[] is temporarily retargeted on the
+ *                    sub-8x8 chroma path and restored before returning
+ *   plane          - plane index
+ *   mi             - mode info supplying reference frames, motion vectors,
+ *                    interpolation filters and compound type
+ *   build_for_obmc - when non-zero, the sub-8x8 chroma path and the -1
+ *                    row/column start offsets are disabled
+ *   bw, bh         - prediction block size
+ *   mi_x, mi_y     - block position (callers in this file pass mi units
+ *                    scaled by MI_SIZE)
+ *
+ * Two paths exist. The sub-8x8 chroma path predicts each covered luma
+ * block's share of the chroma block with that luma block's own motion
+ * vector and reference. The general path predicts the whole block once per
+ * reference (twice for compound prediction).
+ */
+static INLINE void build_inter_predictors(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int plane, const MB_MODE_INFO *mi,
+ int build_for_obmc, int bw, int bh,
+ int mi_x, int mi_y) {
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ int is_compound = has_second_ref(mi);
+ int ref;
+ const int is_intrabc = is_intrabc_block(mi);
+ assert(IMPLIES(is_intrabc, !is_compound));
+ int is_global[2] = { 0, 0 };  // per reference: does the block use that reference's global motion?
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const WarpedMotionParams *const wm = &xd->global_motion[mi->ref_frame[ref]];
+ is_global[ref] = is_global_mv_block(mi, wm->wmtype);
+ }
+
+ const BLOCK_SIZE bsize = mi->sb_type;
+ const int ss_x = pd->subsampling_x;
+ const int ss_y = pd->subsampling_y;
+ int sub8x8_inter = (block_size_wide[bsize] < 8 && ss_x) ||
+ (block_size_high[bsize] < 8 && ss_y);  // block narrower/shorter than 8 in a subsampled dimension
+
+ if (is_intrabc) sub8x8_inter = 0;
+
+ // For sub8x8 chroma blocks, we may be covering more than one luma block's
+ // worth of pixels. Thus (mi_x, mi_y) may not be the correct coordinates for
+ // the top-left corner of the prediction source - the correct top-left corner
+ // is at (pre_x, pre_y).
+ const int row_start =
+ (block_size_high[bsize] == 4) && ss_y && !build_for_obmc ? -1 : 0;
+ const int col_start =
+ (block_size_wide[bsize] == 4) && ss_x && !build_for_obmc ? -1 : 0;
+ const int pre_x = (mi_x + MI_SIZE * col_start) >> ss_x;
+ const int pre_y = (mi_y + MI_SIZE * row_start) >> ss_y;
+
+ sub8x8_inter = sub8x8_inter && !build_for_obmc;
+ if (sub8x8_inter) {
+ for (int row = row_start; row <= 0 && sub8x8_inter; ++row) {  // every covered block must be inter and not intrabc
+ for (int col = col_start; col <= 0; ++col) {
+ const MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ if (!is_inter_block(this_mbmi)) sub8x8_inter = 0;
+ if (is_intrabc_block(this_mbmi)) sub8x8_inter = 0;
+ }
+ }
+ }
+
+ if (sub8x8_inter) {
+ // block size
+ const int b4_w = block_size_wide[bsize] >> ss_x;
+ const int b4_h = block_size_high[bsize] >> ss_y;
+ const BLOCK_SIZE plane_bsize = scale_chroma_bsize(bsize, ss_x, ss_y);
+ const int b8_w = block_size_wide[plane_bsize] >> ss_x;
+ const int b8_h = block_size_high[plane_bsize] >> ss_y;
+ assert(!is_compound);
+
+ const struct buf_2d orig_pred_buf[2] = { pd->pre[0], pd->pre[1] };  // saved so pd->pre can be restored below
+
+ int row = row_start;
+ for (int y = 0; y < b8_h; y += b4_h) {
+ int col = col_start;
+ for (int x = 0; x < b8_w; x += b4_w) {
+ MB_MODE_INFO *this_mbmi = xd->mi[row * xd->mi_stride + col];
+ is_compound = has_second_ref(this_mbmi);
+ int tmp_dst_stride = 8;
+ assert(bw < 8 || bh < 8);
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, tmp_dst_stride, is_compound, xd->bd);
+ conv_params.use_jnt_comp_avg = 0;
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *dst = dst_buf->buf + dst_buf->stride * y + x;
+
+ ref = 0;  // only the first reference of each covered block is used on this path
+ const RefBuffer *ref_buf =
+ &cm->frame_refs[this_mbmi->ref_frame[ref] - LAST_FRAME];
+
+ pd->pre[ref].buf0 =
+ (plane == 1) ? ref_buf->buf->u_buffer : ref_buf->buf->v_buffer;  // NOTE(review): assumes plane is 1 or 2 here - confirm
+ pd->pre[ref].buf =
+ pd->pre[ref].buf0 + scaled_buffer_offset(pre_x, pre_y,
+ ref_buf->buf->uv_stride,
+ &ref_buf->sf);
+ pd->pre[ref].width = ref_buf->buf->uv_crop_width;
+ pd->pre[ref].height = ref_buf->buf->uv_crop_height;
+ pd->pre[ref].stride = ref_buf->buf->uv_stride;
+
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &ref_buf->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];
+
+ const MV mv = this_mbmi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = this_mbmi->motion_mode == WARPED_CAUSAL;
+
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, x, y, pre_buf, &pre,
+ &subpel_params, bw, bh);
+ conv_params.do_average = ref;
+ if (is_masked_compound_type(mi->interinter_comp.type)) {  // NOTE(review): reads 'mi', not 'this_mbmi' - confirm intended
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ }
+
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf,
+ b4_w, b4_h, &conv_params, this_mbmi->interp_filters, &warp_types,
+ (mi_x >> pd->subsampling_x) + x, (mi_y >> pd->subsampling_y) + y,
+ plane, ref, mi, build_for_obmc, xd, cm->allow_warped_motion);
+
+ ++col;
+ }
+ ++row;
+ }
+
+ for (ref = 0; ref < 2; ++ref) pd->pre[ref] = orig_pred_buf[ref];  // undo the temporary retargeting
+ return;
+ }
+
+ {
+ ConvolveParams conv_params = get_conv_params_no_round(
+ 0, plane, xd->tmp_conv_dst, MAX_SB_SIZE, is_compound, xd->bd);
+ av1_jnt_comp_weight_assign(cm, mi, 0, &conv_params.fwd_offset,
+ &conv_params.bck_offset,
+ &conv_params.use_jnt_comp_avg, is_compound);
+
+ struct buf_2d *const dst_buf = &pd->dst;
+ uint8_t *const dst = dst_buf->buf;
+ for (ref = 0; ref < 1 + is_compound; ++ref) {  // one pass per reference
+ const struct scale_factors *const sf =
+ is_intrabc ? &cm->sf_identity : &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = is_intrabc ? dst_buf : &pd->pre[ref];  // intrabc reads from the destination buffer
+ const MV mv = mi->mv[ref].as_mv;
+
+ uint8_t *pre;
+ SubpelParams subpel_params;
+ calc_subpel_params(xd, sf, mv, plane, pre_x, pre_y, 0, 0, pre_buf, &pre,
+ &subpel_params, bw, bh);
+
+ WarpTypesAllowed warp_types;
+ warp_types.global_warp_allowed = is_global[ref];
+ warp_types.local_warp_allowed = mi->motion_mode == WARPED_CAUSAL;
+
+ if (ref && is_masked_compound_type(mi->interinter_comp.type)) {
+ // masked compound type has its own average mechanism
+ conv_params.do_average = 0;
+ av1_make_masked_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, plane, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, ref, xd,
+ cm->allow_warped_motion);
+ } else {
+ conv_params.do_average = ref;  // the second reference is averaged with the first
+ av1_make_inter_predictor(
+ pre, pre_buf->stride, dst, dst_buf->stride, &subpel_params, sf, bw,
+ bh, &conv_params, mi->interp_filters, &warp_types,
+ mi_x >> pd->subsampling_x, mi_y >> pd->subsampling_y, plane, ref,
+ mi, build_for_obmc, xd, cm->allow_warped_motion);
+ }
+ }
+ }
+}
+
+// Builds the inter prediction for planes plane_from..plane_to (inclusive) of
+// the current block, skipping planes for which this block is not the chroma
+// reference.
+static void build_inter_predictors_for_planes(const AV1_COMMON *cm,
+                                              MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int mi_row, int mi_col,
+                                              int plane_from, int plane_to) {
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+
+    if (is_chroma_reference(mi_row, mi_col, bsize, pd->subsampling_x,
+                            pd->subsampling_y)) {
+      build_inter_predictors(cm, xd, p, xd->mi[0], 0, pd->width, pd->height,
+                             mi_x, mi_y);
+    }
+  }
+}
+
+// Builds the inter prediction for plane 0 only.
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize) {
+  const int first_plane = 0;
+  av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize,
+                                 first_plane);
+}
+
+// Builds the inter prediction for every plane after plane 0
+// (planes 1 .. MAX_MB_PLANE - 1).
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                     int mi_row, int mi_col, BUFFER_SET *ctx,
+                                     BLOCK_SIZE bsize) {
+  int plane = 1;
+  while (plane < MAX_MB_PLANE) {
+    av1_build_inter_predictors_sbp(cm, xd, mi_row, mi_col, ctx, bsize, plane);
+    ++plane;
+  }
+}
+
+// Builds the inter prediction for a single plane and, for inter-intra
+// blocks, combines it with the intra prediction. When 'ctx' is NULL a
+// temporary BUFFER_SET pointing at the plane's own destination buffer is
+// used instead.
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                    int mi_row, int mi_col, BUFFER_SET *ctx,
+                                    BLOCK_SIZE bsize, int plane_idx) {
+  build_inter_predictors_for_planes(cm, xd, bsize, mi_row, mi_col, plane_idx,
+                                    plane_idx);
+
+  if (!is_interintra_pred(xd->mi[0])) return;
+
+  struct buf_2d *const dst = &xd->plane[plane_idx].dst;
+  BUFFER_SET fallback_ctx = { { NULL, NULL, NULL }, { 0, 0, 0 } };
+  if (ctx == NULL) {
+    fallback_ctx.plane[plane_idx] = dst->buf;
+    fallback_ctx.stride[plane_idx] = dst->stride;
+    ctx = &fallback_ctx;
+  }
+  av1_build_interintra_predictors_sbp(cm, xd, dst->buf, dst->stride, ctx,
+                                      plane_idx, bsize);
+}
+
+// Builds the inter prediction for plane 0 and, when the stream has more than
+// one plane, for the remaining planes as well.
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                   int mi_row, int mi_col, BUFFER_SET *ctx,
+                                   BLOCK_SIZE bsize) {
+  const int has_more_planes = av1_num_planes(cm) > 1;
+
+  av1_build_inter_predictors_sby(cm, xd, mi_row, mi_col, ctx, bsize);
+  if (has_more_planes) {
+    av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize);
+  }
+}
+
+// TODO(sarahparker):
+// av1_build_inter_predictor should be combined with
+// av1_make_inter_predictor
+// Builds a w x h prediction at 'dst' from 'src' displaced by 'src_mv'.
+// The motion vector is doubled unless 'precision' is MV_PRECISION_Q4, scaled
+// with av1_scale_mv() for position (x, y), split into an integer sample
+// offset and a fractional part, and handed to av1_make_inter_predictor().
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, const MV *src_mv,
+                               const struct scale_factors *sf, int w, int h,
+                               ConvolveParams *conv_params,
+                               InterpFilters interp_filters,
+                               const WarpTypesAllowed *warp_types, int p_col,
+                               int p_row, int plane, int ref,
+                               enum mv_precision precision, int x, int y,
+                               const MACROBLOCKD *xd, int can_use_previous) {
+  const int q4_mult = (precision == MV_PRECISION_Q4) ? 1 : 2;
+  const MV mv_q4 = { src_mv->row * q4_mult, src_mv->col * q4_mult };
+
+  MV32 scaled_mv = av1_scale_mv(&mv_q4, x, y, sf);
+  scaled_mv.col += SCALE_EXTRA_OFF;
+  scaled_mv.row += SCALE_EXTRA_OFF;
+
+  // Fractional part goes to the filter, integer part moves the source.
+  const SubpelParams subpel_params = { sf->x_step_q4, sf->y_step_q4,
+                                       scaled_mv.col & SCALE_SUBPEL_MASK,
+                                       scaled_mv.row & SCALE_SUBPEL_MASK };
+  const uint8_t *const ref_ptr =
+      src + ((scaled_mv.row >> SCALE_SUBPEL_BITS) * src_stride +
+             (scaled_mv.col >> SCALE_SUBPEL_BITS));
+
+  av1_make_inter_predictor(ref_ptr, src_stride, dst, dst_stride,
+                           &subpel_params, sf, w, h, conv_params,
+                           interp_filters, warp_types, p_col, p_row, plane,
+                           ref, xd->mi[0], 0, xd, can_use_previous);
+}
+
+// Per-neighbour callback: predicts the top strip of the current block with
+// the motion of one above neighbour. The neighbour's mode info is modified
+// by the setup call, so it is saved first and restored before returning.
+static INLINE void build_prediction_by_above_pred(
+    MACROBLOCKD *xd, int rel_mi_col, uint8_t above_mi_width,
+    MB_MODE_INFO *above_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *const bctx =
+      (struct build_prediction_ctxt *)fun_ctxt;
+  const int above_mi_col = bctx->mi_col + rel_mi_col;
+  const MB_MODE_INFO saved_mbmi = *above_mbmi;
+
+  av1_setup_build_prediction_by_above_pred(xd, rel_mi_col, above_mi_width,
+                                           above_mbmi, bctx, num_planes);
+
+  const int mi_x = above_mi_col << MI_SIZE_LOG2;
+  const int mi_y = bctx->mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 0)) continue;
+
+    // Width follows the neighbour; height is half the block, clamped
+    // between 4 and half of a 64-high block, in this plane's samples.
+    const int half_shift = pd->subsampling_y + 1;
+    const int pred_w = (above_mi_width * MI_SIZE) >> pd->subsampling_x;
+    const int pred_h = clamp(block_size_high[bsize] >> half_shift, 4,
+                             block_size_high[BLOCK_64X64] >> half_shift);
+
+    build_inter_predictors(bctx->cm, xd, plane, above_mbmi, 1, pred_w, pred_h,
+                           mi_x, mi_y);
+  }
+
+  *above_mbmi = saved_mbmi;
+}
+
+// Builds, into 'tmp_buf', the predictions of the current block obtained
+// from each overlappable above neighbour. Does nothing when no above
+// neighbour is available. The block edge distances in 'xd' are adjusted for
+// the duration of the call and reset afterwards.
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_width[MAX_MB_PLANE],
+                                         int tmp_height[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->up_available) return;
+
+  // The OBMC strip is half the block height, capped at 32 rows; move the
+  // bottom edge up by the difference (edge distances are in 1/8 samples).
+  const int blk_h = xd->n4_h * MI_SIZE;
+  const int strip_h = AOMMIN(blk_h / 2, 32);
+  const int edge_shift = (blk_h - strip_h) * 8;
+  xd->mb_to_bottom_edge += edge_shift;
+
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_right_edge };
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_above(cm, xd, mi_col,
+                                max_neighbor_obmc[mi_size_wide_log2[bsize]],
+                                build_prediction_by_above_pred, &ctxt);
+
+  // Reset the horizontal edges and undo the bottom-edge adjustment.
+  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
+  xd->mb_to_right_edge = ctxt.mb_to_far_edge;
+  xd->mb_to_bottom_edge -= edge_shift;
+}
+
+// Per-neighbour callback: predicts the left strip of the current block with
+// the motion of one left neighbour. The neighbour's mode info is modified by
+// the setup call, so it is saved first and restored before returning.
+static INLINE void build_prediction_by_left_pred(
+    MACROBLOCKD *xd, int rel_mi_row, uint8_t left_mi_height,
+    MB_MODE_INFO *left_mbmi, void *fun_ctxt, const int num_planes) {
+  struct build_prediction_ctxt *const bctx =
+      (struct build_prediction_ctxt *)fun_ctxt;
+  const int left_mi_row = bctx->mi_row + rel_mi_row;
+  const MB_MODE_INFO saved_mbmi = *left_mbmi;
+
+  av1_setup_build_prediction_by_left_pred(xd, rel_mi_row, left_mi_height,
+                                          left_mbmi, bctx, num_planes);
+
+  const int mi_x = bctx->mi_col << MI_SIZE_LOG2;
+  const int mi_y = left_mi_row << MI_SIZE_LOG2;
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    if (av1_skip_u4x4_pred_in_obmc(bsize, pd, 1)) continue;
+
+    // Height follows the neighbour; width is half the block, clamped
+    // between 4 and half of a 64-wide block, in this plane's samples.
+    const int half_shift = pd->subsampling_x + 1;
+    const int pred_w = clamp(block_size_wide[bsize] >> half_shift, 4,
+                             block_size_wide[BLOCK_64X64] >> half_shift);
+    const int pred_h = (left_mi_height << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+    build_inter_predictors(bctx->cm, xd, plane, left_mbmi, 1, pred_w, pred_h,
+                           mi_x, mi_y);
+  }
+
+  *left_mbmi = saved_mbmi;
+}
+
+// Builds, into 'tmp_buf', the predictions of the current block obtained
+// from each overlappable left neighbour. Does nothing when no left neighbour
+// is available. The block edge distances in 'xd' are adjusted for the
+// duration of the call and reset afterwards.
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col,
+                                        uint8_t *tmp_buf[MAX_MB_PLANE],
+                                        int tmp_width[MAX_MB_PLANE],
+                                        int tmp_height[MAX_MB_PLANE],
+                                        int tmp_stride[MAX_MB_PLANE]) {
+  if (!xd->left_available) return;
+
+  // The OBMC strip is half the block width, capped at 32 columns; move the
+  // right edge in by the difference (edge distances are in 1/8 samples).
+  const int blk_w = xd->n4_w * MI_SIZE;
+  const int strip_w = AOMMIN(blk_w / 2, 32);
+  const int edge_shift = (blk_w - strip_w) * 8;
+  xd->mb_to_right_edge += edge_shift;
+
+  struct build_prediction_ctxt ctxt = { cm,         mi_row,
+                                        mi_col,     tmp_buf,
+                                        tmp_width,  tmp_height,
+                                        tmp_stride, xd->mb_to_bottom_edge };
+  const BLOCK_SIZE bsize = xd->mi[0]->sb_type;
+  foreach_overlappable_nb_left(cm, xd, mi_row,
+                               max_neighbor_obmc[mi_size_high_log2[bsize]],
+                               build_prediction_by_left_pred, &ctxt);
+
+  // Reset the vertical edges and undo the right-edge adjustment.
+  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
+  xd->mb_to_right_edge -= edge_shift;
+  xd->mb_to_bottom_edge = ctxt.mb_to_far_edge;
+}
+
+// Builds the OBMC prediction of the current block: predicts from the above
+// neighbours into xd->tmp_obmc_bufs[0] and from the left neighbours into
+// xd->tmp_obmc_bufs[1] (one MAX_SB_SQUARE-sample region per plane), points
+// the destination planes at the new frame buffer, then blends.
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                                        int mi_row, int mi_col) {
+  const int num_planes = av1_num_planes(cm);
+  uint8_t *above_buf[MAX_MB_PLANE], *left_buf[MAX_MB_PLANE];
+  int above_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_stride[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int above_width[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_width[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int above_height[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+  int left_height[MAX_MB_PLANE] = { MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE };
+
+  // High-bitdepth samples take sizeof(uint16_t) bytes each and are passed
+  // around as converted byte pointers.
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  const int sample_bytes = is_hbd ? (int)sizeof(uint16_t) : 1;
+  for (int p = 0; p < MAX_MB_PLANE; ++p) {
+    const int plane_off = MAX_SB_SQUARE * p * sample_bytes;
+    uint8_t *const above_raw = xd->tmp_obmc_bufs[0] + plane_off;
+    uint8_t *const left_raw = xd->tmp_obmc_bufs[1] + plane_off;
+    above_buf[p] = is_hbd ? CONVERT_TO_BYTEPTR(above_raw) : above_raw;
+    left_buf[p] = is_hbd ? CONVERT_TO_BYTEPTR(left_raw) : left_raw;
+  }
+
+  av1_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, above_buf,
+                                      above_width, above_height, above_stride);
+  av1_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, left_buf,
+                                     left_width, left_height, left_stride);
+  av1_setup_dst_planes(xd->plane, xd->mi[0]->sb_type, get_frame_new_buffer(cm),
+                       mi_row, mi_col, 0, num_planes);
+  av1_build_obmc_inter_prediction(cm, xd, mi_row, mi_col, above_buf,
+                                  above_stride, left_buf, left_stride);
+}
+
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+                                              int bw, int bh, int x, int y,
+                                              int w, int h, int mi_x, int mi_y,
+                                              int ref, uint8_t *const ext_dst,
+                                              int ext_dst_stride,
+                                              int can_use_previous) {
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct buf_2d *const ref_buf = &pd->pre[ref];
+  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+  const MV mv = mbmi->mv[ref].as_mv;
+
+  const int plane_x = mi_x >> pd->subsampling_x;
+  const int plane_y = mi_y >> pd->subsampling_y;
+  // The prediction goes to the caller-supplied buffer at offset (x, y).
+  uint8_t *const out = get_buf_by_bd(xd, ext_dst) + ext_dst_stride * y + x;
+
+  const WarpedMotionParams *const gm = &xd->global_motion[mbmi->ref_frame[ref]];
+  WarpTypesAllowed warp_types;
+  warp_types.global_warp_allowed = is_global_mv_block(mbmi, gm->wmtype);
+  warp_types.local_warp_allowed = mbmi->motion_mode == WARPED_CAUSAL;
+  ConvolveParams conv_params = get_conv_params(0, plane, xd->bd);
+  SubpelParams subpel_params;
+  uint8_t *src;
+  calc_subpel_params(xd, sf, mv, plane, plane_x, plane_y, x, y, ref_buf, &src,
+                     &subpel_params, bw, bh);
+  av1_make_inter_predictor(src, ref_buf->stride, out, ext_dst_stride,
+                           &subpel_params, sf, w, h, &conv_params,
+                           mbmi->interp_filters, &warp_types, plane_x + x,
+                           plane_y + y, plane, ref, mbmi, 0, xd, can_use_previous);
+}
+
+void av1_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+    int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+    int can_use_previous) {
+  // Block position in mi units scaled by MI_SIZE; plane_to is inclusive.
+  const int mi_x = mi_col * MI_SIZE, mi_y = mi_row * MI_SIZE;
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const struct macroblockd_plane *const pd = &xd->plane[p];
+    const BLOCK_SIZE plane_bsize =
+        get_plane_block_size(bsize, pd->subsampling_x, pd->subsampling_y);
+    const int plane_w = block_size_wide[plane_bsize];
+    const int plane_h = block_size_high[plane_bsize];
+    build_inter_predictors_single_buf(xd, p, plane_w, plane_h, 0, 0, plane_w,
+                                      plane_h, mi_x, mi_y, ref, ext_dst[p],
+                                      ext_dst_stride[p], can_use_previous);
+  }
+}
+
+static void build_masked_compound(
+    uint8_t *dst, int dst_stride, const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w) {
+  // Subsampling is inferred per dimension by comparing w/h with sb_type.
+  const uint8_t *const mask = av1_get_compound_type_mask(comp_data, sb_type);
+  const int mask_stride = block_size_wide[sb_type];
+  const int is_sub_w = w == (2 << mi_size_wide_log2[sb_type]);
+  const int is_sub_h = h == (2 << mi_size_high_log2[sb_type]);
+  aom_blend_a64_mask(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+                     mask, mask_stride, w, h, is_sub_w, is_sub_h);
+}
+
+static void build_masked_compound_highbd(
+    uint8_t *dst_8, int dst_stride, const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    const INTERINTER_COMPOUND_DATA *const comp_data, BLOCK_SIZE sb_type, int h,
+    int w, int bd) {
+  // Subsampling is not passed in; it is inferred per dimension by comparing
+  // w/h against 2 << mi_size_*_log2[sb_type].
+  const uint8_t *const mask = av1_get_compound_type_mask(comp_data, sb_type);
+  const int mask_stride = block_size_wide[sb_type];
+  const int is_sub_w = w == (2 << mi_size_wide_log2[sb_type]);
+  const int is_sub_h = h == (2 << mi_size_high_log2[sb_type]);
+
+  aom_highbd_blend_a64_mask(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+                            src1_stride, mask, mask_stride, w, h, is_sub_w,
+                            is_sub_h, bd);
+}
+
+static void build_wedge_inter_predictor_from_buf(
+    MACROBLOCKD *xd, int plane, int x, int y, int w, int h, uint8_t *ext_dst0,
+    int ext_dst_stride0, uint8_t *ext_dst1, int ext_dst_stride1) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  struct buf_2d *const out_buf = &xd->plane[plane].dst;
+  uint8_t *const out = out_buf->buf + out_buf->stride * y + x;
+  const int is_highbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+  mbmi->interinter_comp.seg_mask = xd->seg_mask;
+  const INTERINTER_COMPOUND_DATA *const comp_data = &mbmi->interinter_comp;
+
+  if (!has_second_ref(mbmi) || !is_masked_compound_type(comp_data->type)) {
+    if (is_highbd)
+      aom_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+                               out, out_buf->stride, NULL, 0, NULL, 0, w, h,
+                               xd->bd);
+    else
+      aom_convolve_copy(ext_dst0, ext_dst_stride0, out, out_buf->stride, NULL,
+                        0, NULL, 0, w, h);
+    return;  // no masked blend: the first prediction was copied as-is
+  }
+
+  // For COMPOUND_DIFFWTD the mask is rebuilt, but only while on plane 0.
+  if (plane == 0 && comp_data->type == COMPOUND_DIFFWTD) {
+    if (is_highbd)
+      av1_build_compound_diffwtd_mask_highbd(
+          comp_data->seg_mask, comp_data->mask_type,
+          CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, h, w, xd->bd);
+    else
+      av1_build_compound_diffwtd_mask(
+          comp_data->seg_mask, comp_data->mask_type, ext_dst0,
+          ext_dst_stride0, ext_dst1, ext_dst_stride1, h, w);
+  }
+  if (is_highbd)
+    build_masked_compound_highbd(
+        out, out_buf->stride, CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+        CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1, comp_data,
+        mbmi->sb_type, h, w, xd->bd);
+  else
+    build_masked_compound(out, out_buf->stride, ext_dst0, ext_dst_stride0,
+                          ext_dst1, ext_dst_stride1, comp_data, mbmi->sb_type,
+                          h, w);
+}
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+                                              int plane_from, int plane_to,
+                                              uint8_t *ext_dst0[3],
+                                              int ext_dst_stride0[3],
+                                              uint8_t *ext_dst1[3],
+                                              int ext_dst_stride1[3]) {
+  // plane_to is inclusive; each plane is handled as one whole block at (0, 0).
+  for (int p = plane_from; p <= plane_to; ++p) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        bsize, xd->plane[p].subsampling_x, xd->plane[p].subsampling_y);
+    const int plane_w = block_size_wide[plane_bsize];
+    const int plane_h = block_size_high[plane_bsize];
+    build_wedge_inter_predictor_from_buf(
+        xd, p, 0, 0, plane_w, plane_h, ext_dst0[p], ext_dst_stride0[p],
+        ext_dst1[p], ext_dst_stride1[p]);
+  }
+}
diff --git a/third_party/aom/av1/encoder/reconinter_enc.h b/third_party/aom/av1/encoder/reconinter_enc.h
new file mode 100644
index 0000000000..10d5e8c284
--- /dev/null
+++ b/third_party/aom/av1/encoder/reconinter_enc.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_RECONINTER_ENC_H_
+#define AOM_AV1_ENCODER_RECONINTER_ENC_H_
+
+#include "aom/aom_integer.h"
+#include "av1/common/filter.h"
+#include "av1/common/blockd.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/convolve.h"
+#include "av1/common/warped_motion.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_build_inter_predictors_sby(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbuv(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictors_sbp(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize, int plane_idx);
+
+void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BUFFER_SET *ctx,
+ BLOCK_SIZE bsize);
+
+void av1_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, const MV *src_mv,
+ const struct scale_factors *sf, int w, int h,
+ ConvolveParams *conv_params,
+ InterpFilters interp_filters,
+ const WarpTypesAllowed *warp_types, int p_col,
+ int p_row, int plane, int ref,
+ enum mv_precision precision, int x, int y,
+ const MACROBLOCKD *xd, int can_use_previous);
+
+// Detect if the block has sub-pixel level motion vectors
+// per component.
+// `dir` selects what is tested: the reference is dir >> 1 and bit 0 picks the
+// component (1: column, 0: row). With CHECK_SUBPEL at 0 this always returns 1.
+#define CHECK_SUBPEL 0
+static INLINE int has_subpel_mv_component(const MB_MODE_INFO *const mbmi,
+                                          const MACROBLOCKD *const xd,
+                                          int dir) {
+  (void)xd;  // Not consulted on either path.
+#if CHECK_SUBPEL
+  const int ref = dir >> 1;
+
+  if (dir & 0x01) {
+    if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK) return 1;
+  } else {
+    if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK) return 1;
+  }
+
+  return 0;
+#else
+  (void)mbmi;
+  (void)dir;
+  return 1;
+#endif
+}
+
+// Returns 1 as soon as has_subpel_mv_component() reports a sub-pixel motion
+// vector component for the current block, and 0 if it never does. Both
+// components are tested for one reference, or for two when has_second_ref().
+static INLINE int av1_is_interp_search_needed(const MACROBLOCKD *const xd) {
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const int num_refs = 1 + has_second_ref(mbmi);
+
+  // dir is (ref << 1) + component, so counting upwards visits both
+  // components of reference 0 before those of reference 1.
+  for (int dir = 0; dir < 2 * num_refs; ++dir) {
+    if (has_subpel_mv_component(mbmi, xd, dir)) return 1;
+  }
+
+  return 0;
+}
+
+void av1_build_prediction_by_above_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_prediction_by_left_preds(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col,
+ uint8_t *tmp_buf[MAX_MB_PLANE],
+ int tmp_width[MAX_MB_PLANE],
+ int tmp_height[MAX_MB_PLANE],
+ int tmp_stride[MAX_MB_PLANE]);
+
+void av1_build_obmc_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col);
+
+void av1_build_inter_predictors_for_planes_single_buf(
+ MACROBLOCKD *xd, BLOCK_SIZE bsize, int plane_from, int plane_to, int mi_row,
+ int mi_col, int ref, uint8_t *ext_dst[3], int ext_dst_stride[3],
+ int can_use_previous);
+
+void av1_build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, BLOCK_SIZE bsize,
+ int plane_from, int plane_to,
+ uint8_t *ext_dst0[3],
+ int ext_dst_stride0[3],
+ uint8_t *ext_dst1[3],
+ int ext_dst_stride1[3]);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_RECONINTER_ENC_H_
diff --git a/third_party/aom/av1/encoder/segmentation.c b/third_party/aom/av1/encoder/segmentation.c
new file mode 100644
index 0000000000..2e9102745c
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/pred_common.h"
+#include "av1/common/tile_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/segmentation.h"
+
+void av1_enable_segmentation(struct segmentation *seg) {
+  seg->temporal_update = 0;  // the only flag left cleared when enabling
+  seg->update_data = 1;
+  seg->update_map = 1;
+  seg->enabled = 1;
+}
+
+void av1_disable_segmentation(struct segmentation *seg) {
+  // Clear all four control flags; feature masks and data are left untouched.
+  seg->temporal_update = 0;
+  seg->update_data = 0;
+  seg->update_map = seg->enabled = 0;
+}
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_mask[segment_id] &= ~(1 << feature_id);  // clear only this feature's bit for the segment
+}
+
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id) {
+ seg->feature_data[segment_id][feature_id] = 0;  // zero the stored value; feature_mask is not changed here
+}
+
+static void count_segs(const AV1_COMMON *cm, MACROBLOCKD *xd,
+                       const TileInfo *tile, MB_MODE_INFO **mi,
+                       unsigned *no_pred_segcounts,
+                       unsigned (*temporal_predictor_count)[2],
+                       unsigned *t_unpred_seg_counts, int bw, int bh,
+                       int mi_row, int mi_col) {
+  // Blocks starting outside the frame are ignored.
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  xd->mi = mi;
+  const int seg_id = xd->mi[0]->segment_id;
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  // Count the number of hits on each segment with no prediction
+  no_pred_segcounts[seg_id]++;
+
+  // Temporal prediction not allowed on key frames
+  if (cm->frame_type == KEY_FRAME) return;
+
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  // The predicted id is read from the last frame's segment map; without a
+  // map it defaults to 0.
+  int predicted_id = 0;
+  if (cm->last_frame_seg_map) {
+    predicted_id = get_segment_id(cm, cm->last_frame_seg_map, mbmi->sb_type,
+                                  mi_row, mi_col);
+  }
+  const int is_predicted = (predicted_id == seg_id);
+  const int ctx = av1_get_pred_context_seg_id(xd);
+
+  // Store the outcome on the block and tally it under its context.
+  mbmi->seg_id_predicted = is_predicted;
+  temporal_predictor_count[ctx][is_predicted]++;
+
+  // Segment ids that were not predicted are tallied separately.
+  if (!is_predicted) t_unpred_seg_counts[seg_id]++;
+}
+
+static void count_segs_sb(const AV1_COMMON *cm, MACROBLOCKD *xd,
+ const TileInfo *tile, MB_MODE_INFO **mi,
+ unsigned *no_pred_segcounts,
+ unsigned (*temporal_predictor_count)[2],
+ unsigned *t_unpred_seg_counts, int mi_row, int mi_col,
+ BLOCK_SIZE bsize) {  // calls count_segs() for each block of the partition found at (mi_row, mi_col)
+ const int mis = cm->mi_stride;  // used as the row pitch when offsetting the mi pointer
+ const int bs = mi_size_wide[bsize], hbs = bs / 2;  // the width is used for both dimensions below
+ PARTITION_TYPE partition;
+ const int qbs = bs / 4;  // quarter size, used by the two 4-way partitions
+
+ if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+ // CSEGS counts one sub-block of size (cs_bw, cs_bh) at the given row/column offset.
+#define CSEGS(cs_bw, cs_bh, cs_rowoff, cs_coloff) \
+ count_segs(cm, xd, tile, mi + mis * (cs_rowoff) + (cs_coloff), \
+ no_pred_segcounts, temporal_predictor_count, t_unpred_seg_counts, \
+ (cs_bw), (cs_bh), mi_row + (cs_rowoff), mi_col + (cs_coloff));
+
+ if (bsize == BLOCK_8X8)  // 8x8 is always counted as a single block here
+ partition = PARTITION_NONE;
+ else
+ partition = get_partition(cm, mi_row, mi_col, bsize);
+ switch (partition) {
+ case PARTITION_NONE: CSEGS(bs, bs, 0, 0); break;
+ case PARTITION_HORZ:
+ CSEGS(bs, hbs, 0, 0);
+ CSEGS(bs, hbs, hbs, 0);
+ break;
+ case PARTITION_VERT:
+ CSEGS(hbs, bs, 0, 0);
+ CSEGS(hbs, bs, 0, hbs);
+ break;
+ case PARTITION_HORZ_A:
+ CSEGS(hbs, hbs, 0, 0);
+ CSEGS(hbs, hbs, 0, hbs);
+ CSEGS(bs, hbs, hbs, 0);
+ break;
+ case PARTITION_HORZ_B:
+ CSEGS(bs, hbs, 0, 0);
+ CSEGS(hbs, hbs, hbs, 0);
+ CSEGS(hbs, hbs, hbs, hbs);
+ break;
+ case PARTITION_VERT_A:
+ CSEGS(hbs, hbs, 0, 0);
+ CSEGS(hbs, hbs, hbs, 0);
+ CSEGS(hbs, bs, 0, hbs);
+ break;
+ case PARTITION_VERT_B:
+ CSEGS(hbs, bs, 0, 0);
+ CSEGS(hbs, hbs, 0, hbs);
+ CSEGS(hbs, hbs, hbs, hbs);
+ break;
+ case PARTITION_HORZ_4:
+ CSEGS(bs, qbs, 0, 0);
+ CSEGS(bs, qbs, qbs, 0);
+ CSEGS(bs, qbs, 2 * qbs, 0);
+ if (mi_row + 3 * qbs < cm->mi_rows) CSEGS(bs, qbs, 3 * qbs, 0);  // fourth strip only if it starts inside the frame
+ break;
+
+ case PARTITION_VERT_4:
+ CSEGS(qbs, bs, 0, 0);
+ CSEGS(qbs, bs, 0, qbs);
+ CSEGS(qbs, bs, 0, 2 * qbs);
+ if (mi_col + 3 * qbs < cm->mi_cols) CSEGS(qbs, bs, 0, 3 * qbs);  // fourth strip only if it starts inside the frame
+ break;
+
+ case PARTITION_SPLIT: {
+ const BLOCK_SIZE subsize = get_partition_subsize(bsize, PARTITION_SPLIT);
+ int n;
+
+ for (n = 0; n < 4; n++) {  // recurse into the four quadrants
+ const int mi_dc = hbs * (n & 1);  // column offset: 0 for n = 0, 2; hbs for n = 1, 3
+ const int mi_dr = hbs * (n >> 1);  // row offset: 0 for n = 0, 1; hbs for n = 2, 3
+
+ count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc], no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts,
+ mi_row + mi_dr, mi_col + mi_dc, subsize);
+ }
+ } break;
+ default: assert(0);  // any other partition value is treated as a programming error
+ }
+
+#undef CSEGS
+}
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd) {
+ struct segmentation *seg = &cm->seg;
+ struct segmentation_probs *segp = &cm->fc->seg;
+ int no_pred_cost;
+ int t_pred_cost = INT_MAX;  // stays INT_MAX, and so is never selected, unless computed below
+ int tile_col, tile_row, mi_row, mi_col;
+ unsigned temporal_predictor_count[SEG_TEMPORAL_PRED_CTXS][2] = { { 0 } };
+ unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
+ unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
+ (void)xd;  // NOTE(review): xd is passed to count_segs_sb() below, so this cast looks redundant
+
+ // First of all generate stats regarding how well the last segment map
+ // predicts this one
+ for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+ TileInfo tile_info;
+ av1_tile_set_row(&tile_info, cm, tile_row);
+ for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+ MB_MODE_INFO **mi_ptr;
+ av1_tile_set_col(&tile_info, cm, tile_col);
+ mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+ tile_info.mi_col_start;
+ for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+ mi_row += cm->seq_params.mib_size,
+ mi_ptr += cm->seq_params.mib_size * cm->mi_stride) {
+ MB_MODE_INFO **mi = mi_ptr;
+ for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+ mi_col += cm->seq_params.mib_size, mi += cm->seq_params.mib_size) {
+ count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+ temporal_predictor_count, t_unpred_seg_counts, mi_row,
+ mi_col, cm->seq_params.sb_size);
+ }
+ }
+ }
+ }
+
+ int seg_id_cost[MAX_SEGMENTS];
+ av1_cost_tokens_from_cdf(seg_id_cost, segp->tree_cdf, NULL);
+ no_pred_cost = 0;  // total cost of coding every counted segment id directly
+ for (int i = 0; i < MAX_SEGMENTS; ++i)
+ no_pred_cost += no_pred_segcounts[i] * seg_id_cost[i];
+
+ // Frames without past dependency cannot use temporal prediction
+ if (cm->primary_ref_frame != PRIMARY_REF_NONE) {
+ int pred_flag_cost[SEG_TEMPORAL_PRED_CTXS][2];
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i)
+ av1_cost_tokens_from_cdf(pred_flag_cost[i], segp->pred_cdf[i], NULL);
+ t_pred_cost = 0;
+ // Cost for signaling the prediction flag.
+ for (int i = 0; i < SEG_TEMPORAL_PRED_CTXS; ++i) {
+ for (int j = 0; j < 2; ++j)
+ t_pred_cost += temporal_predictor_count[i][j] * pred_flag_cost[i][j];
+ }
+ // Cost for signaling the unpredicted segment id.
+ for (int i = 0; i < MAX_SEGMENTS; ++i)
+ t_pred_cost += t_unpred_seg_counts[i] * seg_id_cost[i];
+ }
+
+ // Now choose which coding method to use.
+ if (t_pred_cost < no_pred_cost) {  // strict: a tie keeps temporal_update at 0
+ assert(!cm->error_resilient_mode);
+ seg->temporal_update = 1;
+ } else {
+ seg->temporal_update = 0;
+ }
+}
+
+void av1_reset_segment_features(AV1_COMMON *cm) {
+  struct segmentation *const seg_params = &cm->seg;
+
+  // Clear the enable and update flags, then clear every segment feature.
+  seg_params->enabled = 0;
+  seg_params->update_map = 0;
+  seg_params->update_data = 0;
+  av1_clearall_segfeatures(seg_params);
+}
diff --git a/third_party/aom/av1/encoder/segmentation.h b/third_party/aom/av1/encoder/segmentation.h
new file mode 100644
index 0000000000..1ad13d66a9
--- /dev/null
+++ b/third_party/aom/av1/encoder/segmentation.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SEGMENTATION_H_
+#define AOM_AV1_ENCODER_SEGMENTATION_H_
+
+#include "av1/common/blockd.h"
+#include "av1/encoder/encoder.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_enable_segmentation(struct segmentation *seg);
+void av1_disable_segmentation(struct segmentation *seg);
+
+void av1_disable_segfeature(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+void av1_clear_segdata(struct segmentation *seg, int segment_id,
+ SEG_LVL_FEATURES feature_id);
+
+void av1_choose_segmap_coding_method(AV1_COMMON *cm, MACROBLOCKD *xd);
+
+void av1_reset_segment_features(AV1_COMMON *cm);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SEGMENTATION_H_
diff --git a/third_party/aom/av1/encoder/speed_features.c b/third_party/aom/av1/encoder/speed_features.c
new file mode 100644
index 0000000000..4c35baae01
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.c
@@ -0,0 +1,564 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <limits.h>
+
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/speed_features.h"
+#include "av1/encoder/rdopt.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Setting this to 1 will disable trellis optimization completely.
+// Setting this to 2 will disable trellis optimization within the
+// transform search. Trellis optimization will still be applied
+// in the final encode.
+#define DISABLE_TRELLISQ_SEARCH 0
+
+#define MAX_MESH_SPEED 5 // Max speed setting for mesh motion method
+static MESH_PATTERN
+ good_quality_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },  // row 0
+ { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },  // row 1 (same as row 0)
+ { { 64, 8 }, { 14, 2 }, { 7, 1 }, { 7, 1 } },  // row 2
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },  // rows 3-5 are identical
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ { { 64, 16 }, { 24, 8 }, { 12, 4 }, { 7, 1 } },
+ };  // NOTE(review): rows are presumably indexed by speed; pair meaning is defined by MESH_PATTERN - confirm
+static unsigned char good_quality_max_mesh_pct[MAX_MESH_SPEED + 1] = {
+ 50, 50, 25, 15, 5, 1  // one entry per row of the pattern table above
+};
+
+// TODO(huisu@google.com): These settings are pretty relaxed, tune them for
+// each speed setting
+static MESH_PATTERN intrabc_mesh_patterns[MAX_MESH_SPEED + 1][MAX_MESH_STEP] = {
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },  // rows 0-1 are identical
+ { { 256, 1 }, { 256, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },  // rows 2-3 are identical
+ { { 64, 1 }, { 64, 1 }, { 0, 0 }, { 0, 0 } },
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },  // rows 4-5 are identical
+ { { 64, 4 }, { 16, 1 }, { 0, 0 }, { 0, 0 } },
+};  // only the first two entries of each row are non-zero
+static uint8_t intrabc_max_mesh_pct[MAX_MESH_SPEED + 1] = { 100, 100, 100,
+ 25, 25, 10 };  // one entry per row of the pattern table above
+
+// Intra only frames, golden frames (except alt ref overlays) and
+// alt ref frames tend to be coded at a higher than ambient quality.
+static int frame_is_boosted(const AV1_COMP *cpi) {
+ return frame_is_kf_gf_arf(cpi);  // thin alias: the classification is done entirely by this helper
+}
+
+// Sets a partition size down to which the auto partition code will always
+// search (can go lower), based on the image dimensions. The logic here
+// is that the extent to which ringing artefacts are offensive depends
+// partly on the screen area over which they propagate. Propagation is
+// limited by transform block size but the screen area taken up by a given
+// block size will be larger for a small image format stretched to full screen.
+static BLOCK_SIZE set_partition_min_limit(AV1_COMMON *const cm) {
+  // Multiply in 64 bits: width * height evaluated as int could overflow.
+  const uint64_t screen_area = (uint64_t)cm->width * (uint64_t)cm->height;
+  // Select block size based on image format size.
+  if (screen_area < 1280 * 720) {
+    // Formats smaller in area than 720P
+    return BLOCK_4X4;
+  } else if (screen_area < 1920 * 1080) {
+    // Format >= 720P and < 1080P
+    return BLOCK_8X8;
+  } else {
+    // Formats 1080P and up
+    return BLOCK_16X16;
+  }
+}
+
+// Do we have an internal image edge (e.g. formatting bars).
+static int has_internal_image_edge(const AV1_COMP *cpi) {
+  if (cpi->oxcf.pass != 2) return 0;  // only evaluated when pass is 2
+  return (cpi->twopass.this_frame_stats.inactive_zone_rows > 0) ||
+         (cpi->twopass.this_frame_stats.inactive_zone_cols > 0);
+}
+
+static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {  // later speed tiers overwrite fields set by earlier ones, so statement order matters
+ AV1_COMMON *const cm = &cpi->common;
+ const int is_720p_or_larger = AOMMIN(cm->width, cm->height) >= 720;  // judged by the smaller dimension
+ const int is_480p_or_larger = AOMMIN(cm->width, cm->height) >= 480;  // judged by the smaller dimension
+
+ if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ }
+
+ // TODO(huisu@google.com): train models for 720P and above.
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 500; // BLOCK_64X64
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+
+ if (speed >= 1) {
+ if (is_720p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ } else if (is_480p_or_larger) {
+ sf->use_square_partition_only_threshold = BLOCK_64X64;
+ } else {
+ sf->use_square_partition_only_threshold = BLOCK_32X32;
+ }
+
+ if (!is_720p_or_larger) {
+ sf->ml_partition_search_breakout_thresh[0] = 200; // BLOCK_8X8
+ sf->ml_partition_search_breakout_thresh[1] = 250; // BLOCK_16X16
+ sf->ml_partition_search_breakout_thresh[2] = 300; // BLOCK_32X32
+ sf->ml_partition_search_breakout_thresh[3] = 300; // BLOCK_64X64 (the only value that differs from speed 0)
+ sf->ml_partition_search_breakout_thresh[4] = -1; // BLOCK_128X128
+ }
+ }
+
+ if (speed >= 2) {
+ if (is_720p_or_larger) {
+ sf->disable_split_mask =
+ cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ sf->partition_search_breakout_rate_thr = 120;
+ } else {
+ sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
+ sf->partition_search_breakout_dist_thr = (1 << 22);
+ sf->partition_search_breakout_rate_thr = 100;
+ }
+ sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
+ }
+
+ if (speed >= 3) {
+ if (is_720p_or_larger) {
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 25);
+ sf->partition_search_breakout_rate_thr = 200;
+ } else {
+ sf->max_intra_bsize = BLOCK_32X32;
+ sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ sf->partition_search_breakout_dist_thr = (1 << 23);
+ sf->partition_search_breakout_rate_thr = 120;
+ }
+ }
+
+ // If this is a two pass clip that fits the criteria for animated or
+ // graphics content then reset disable_split_mask for speeds 2+.
+ // Also if the image edge is internal to the coded area.
+ if ((speed >= 2) && (cpi->oxcf.pass == 2) &&
+ ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
+ (has_internal_image_edge(cpi)))) {
+ sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
+ }
+
+ if (speed >= 4) {
+ if (is_720p_or_larger) {
+ sf->partition_search_breakout_dist_thr = (1 << 26);
+ } else {
+ sf->partition_search_breakout_dist_thr = (1 << 24);
+ }
+ sf->disable_split_mask = DISABLE_ALL_SPLIT;  // overwrites every earlier value, including the graphics/edge reset above
+ }
+}
+
+static void set_good_speed_features_framesize_independent(AV1_COMP *cpi,
+ SPEED_FEATURES *sf,
+ int speed) {
+ AV1_COMMON *const cm = &cpi->common;
+ const int boosted = frame_is_boosted(cpi);
+
+ // Speed 0 for all speed features that give neutral coding performance change.
+ sf->reduce_inter_modes = 1;
+ sf->prune_ext_partition_types_search_level = 1;
+ sf->ml_prune_rect_partition = 1;
+ sf->ml_prune_ab_partition = 1;
+ sf->ml_prune_4_partition = 1;
+ sf->adaptive_txb_search_level = 1;
+ sf->jnt_comp_skip_mv_search = 1;
+ sf->model_based_prune_tx_search_level = 1;
+ sf->model_based_post_interp_filter_breakout = 1;
+ sf->inter_mode_rd_model_estimation = 1;
+ sf->prune_ref_frame_for_rect_partitions =
+ !(boosted || cpi->refresh_bwd_ref_frame || cpi->refresh_alt2_ref_frame);
+ sf->less_rectangular_check_level = 1;
+ sf->gm_search_type = GM_REDUCED_REF_SEARCH;
+ sf->gm_disable_recode = 1;
+
+ if (speed >= 1) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_1;
+ sf->selective_ref_frame = 1;
+ sf->inter_tx_size_search_init_depth_rect = 1;
+ sf->inter_tx_size_search_init_depth_sqr = 1;
+ sf->intra_tx_size_search_init_depth_rect = 1;
+ sf->intra_tx_size_search_init_depth_sqr = 1;
+ sf->tx_size_search_lgr_block = 1;
+ if (speed >= CONFIG_2PASS_PARTITION_SEARCH_LVL) {
+ sf->two_pass_partition_search = 1;
+ sf->mode_pruning_based_on_two_pass_partition_search = 1;
+ }
+ sf->prune_ext_partition_types_search_level = 2;
+ sf->use_fast_interpolation_filter_search = 1;
+ sf->skip_repeat_interpolation_filter_search = 1;
+ sf->tx_type_search.skip_tx_search = 1;
+ sf->tx_type_search.ml_tx_split_thresh = 40;
+ sf->model_based_prune_tx_search_level = 0;
+ sf->model_based_post_interp_filter_breakout = 0;
+ // TODO(angiebird): Re-evaluate the impact of inter_mode_rd_model_estimation
+ // on speed 1
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->adaptive_txb_search_level = 2;
+ sf->use_intra_txb_hash = 1;
+ sf->optimize_b_precheck = 1;
+ sf->dual_sgr_penalty_level = 1;
+ sf->use_accurate_subpel_search = 1;
+ sf->reuse_inter_intra_mode = 1;
+ sf->prune_comp_search_by_single_result = 1;
+ sf->skip_repeated_newmv = 1;
+ sf->obmc_full_pixel_search_level = 1;
+ }
+
+ if (speed >= 2) {
+ sf->gm_erroradv_type = GM_ERRORADV_TR_2;
+
+ sf->selective_ref_frame = 2;
+ sf->fast_cdef_search = 1;
+
+ sf->adaptive_rd_thresh = 1;
+ sf->mv.auto_mv_step_size = 1;
+ sf->mv.subpel_iters_per_step = 1;
+ sf->disable_filter_search_var_thresh = 100;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZES_ALL;
+
+ sf->partition_search_breakout_rate_thr = 80;
+ // Note: This speed feature is disabled as it seems to be worse in
+ // compression/quality and is also slower.
+ // sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
+ sf->allow_partition_search_skip = 1;
+ sf->disable_wedge_search_var_thresh = 100;
+ sf->fast_wedge_sign_estimate = 1;
+ }
+
+ if (speed >= 3) {
+ sf->tx_size_search_method = boosted ? USE_FULL_RD : USE_LARGESTALL;
+ sf->less_rectangular_check_level = 2;
+ sf->adaptive_pred_interp_filter = 1;
+ // adaptive_motion_search breaks encoder multi-thread tests.
+ // The values in x->pred_mv[] differ for single and multi-thread cases.
+ // See aomedia:1778.
+ // sf->adaptive_motion_search = 1;
+ sf->recode_loop = ALLOW_RECODE_KFARFGF;
+ sf->use_transform_domain_distortion = 1;
+ sf->use_accurate_subpel_search = 0;
+ sf->adaptive_rd_thresh = 2;
+ sf->tx_type_search.prune_mode = PRUNE_2D_FAST;
+ sf->gm_search_type = GM_DISABLE_SEARCH;
+ sf->prune_comp_search_by_single_result = 2;
+ }
+
+ if (speed >= 4) {
+ sf->tx_type_search.fast_intra_tx_type_search = 1;
+ sf->tx_type_search.fast_inter_tx_type_search = 1;
+ sf->use_square_partition_only_threshold =
+ boosted ? BLOCK_128X128 : BLOCK_4X4;
+ sf->tx_size_search_method =
+ frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 1;
+ sf->cb_partition_search = !boosted;
+ sf->alt_ref_search_fp = 1;
+ }
+
+ if (speed >= 5) {
+ sf->recode_loop = ALLOW_RECODE_KFMAXBW;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_64X64] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_32X32] = UV_INTRA_DC_H_V_CFL;
+ sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V;
+ sf->intra_uv_mode_mask[TX_16X16] = UV_INTRA_DC_H_V_CFL;
+ sf->use_square_partition_only_threshold = BLOCK_4X4;
+ sf->tx_size_search_method = USE_LARGESTALL;
+ sf->mv.search_method = BIGDIA;
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_MORE;
+ sf->adaptive_rd_thresh = 4;
+ sf->mode_search_skip_flags =
+ (cm->frame_type == KEY_FRAME)
+ ? 0
+ : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER |
+ FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR |
+ FLAG_EARLY_TERMINATE;
+ sf->disable_filter_search_var_thresh = 200;
+ sf->use_fast_coef_costing = 1;
+ sf->partition_search_breakout_rate_thr = 300;
+ sf->use_transform_domain_distortion = 2;
+ }
+
+ if (speed >= 6) {
+ int i;
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ sf->mv.search_method = HEX;
+ sf->disable_filter_search_var_thresh = 500;
+ for (i = 0; i < TX_SIZES; ++i) {
+ sf->intra_y_mode_mask[i] = INTRA_DC;
+ sf->intra_uv_mode_mask[i] = UV_INTRA_DC_CFL;
+ }
+ sf->partition_search_breakout_rate_thr = 500;
+ sf->mv.reduce_first_step_size = 1;
+ sf->simple_model_rd_from_var = 1;
+ }
+ if (speed >= 7) {
+ sf->default_max_partition_size = BLOCK_32X32;
+ sf->default_min_partition_size = BLOCK_8X8;
+ sf->intra_y_mode_mask[TX_64X64] = INTRA_DC;
+ sf->intra_y_mode_mask[TX_32X32] = INTRA_DC;
+ sf->frame_parameter_update = 0;
+ sf->mv.search_method = FAST_HEX;
+ sf->partition_search_type = REFERENCE_PARTITION;
+ sf->mode_search_skip_flags |= FLAG_SKIP_INTRA_DIRMISMATCH;
+ }
+ if (speed >= 8) {
+ sf->mv.search_method = FAST_DIAMOND;
+ sf->mv.subpel_force_stop = 2;
+ sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
+ }
+}
+// Applies the frame-size-dependent part of cpi->sf: GOOD-mode helper, split-mask thresholds, MV unit-test hook.
+void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
+ SPEED_FEATURES *const sf = &cpi->sf;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ RD_OPT *const rd = &cpi->rd;
+ int i;
+
+ if (oxcf->mode == GOOD) {
+ set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
+ }
+
+ if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
+ sf->adaptive_pred_interp_filter = 0;
+ }
+
+ // Check for masked out split cases: a set bit i pins thresh_mult_sub8x8[i] to INT_MAX.
+ for (i = 0; i < MAX_REFS; ++i) {
+ if (sf->disable_split_mask & (1 << i)) {
+ rd->thresh_mult_sub8x8[i] = INT_MAX;
+ }
+ }
+
+ // This is only used in motion vector unit test (1: max sub-pixel mv, 2: min sub-pixel mv).
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+}
+// Resets cpi->sf to best-quality defaults, applies the GOOD-mode helper, then derives mesh-search, recode/trellis and sub-pixel search settings.
+void av1_set_speed_features_framesize_independent(AV1_COMP *cpi) {
+ AV1_COMMON *const cm = &cpi->common;
+ SPEED_FEATURES *const sf = &cpi->sf;
+ MACROBLOCK *const x = &cpi->td.mb;
+ const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+ int i;
+
+ // Best quality defaults; the GOOD-mode helper called below may override them.
+ sf->frame_parameter_update = 1;
+ sf->mv.search_method = NSTEP;
+ sf->recode_loop = ALLOW_RECODE;
+ sf->mv.subpel_search_method = SUBPEL_TREE;
+ sf->mv.subpel_iters_per_step = 2;
+ sf->mv.subpel_force_stop = 0;
+#if DISABLE_TRELLISQ_SEARCH == 2
+ sf->optimize_coefficients = !is_lossless_requested(&cpi->oxcf)
+ ? FINAL_PASS_TRELLIS_OPT
+ : NO_TRELLIS_OPT;
+#elif DISABLE_TRELLISQ_SEARCH == 1
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+#else
+ if (is_lossless_requested(&cpi->oxcf))
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ else
+ sf->optimize_coefficients = FULL_TRELLIS_OPT;
+#endif // DISABLE_TRELLISQ_SEARCH
+ sf->gm_erroradv_type = GM_ERRORADV_TR_0;
+ sf->mv.reduce_first_step_size = 0;
+ sf->mv.auto_mv_step_size = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_4X4;
+ sf->adaptive_rd_thresh = 0;
+ sf->tx_size_search_method = USE_FULL_RD;
+ sf->inter_tx_size_search_init_depth_sqr = 0;
+ sf->inter_tx_size_search_init_depth_rect = 0;
+ sf->intra_tx_size_search_init_depth_rect = 0;
+ sf->intra_tx_size_search_init_depth_sqr = 0;
+ sf->tx_size_search_lgr_block = 0;
+ sf->model_based_prune_tx_search_level = 0;
+ sf->model_based_post_interp_filter_breakout = 0;
+ sf->reduce_inter_modes = 0;
+ sf->selective_ref_gm = 1;
+ sf->adaptive_motion_search = 0;
+ sf->adaptive_pred_interp_filter = 0;
+ sf->adaptive_mode_search = 0;
+ sf->cb_partition_search = 0;
+ sf->alt_ref_search_fp = 0;
+ sf->partition_search_type = SEARCH_PARTITION;
+ sf->tx_type_search.prune_mode = PRUNE_2D_ACCURATE;
+ sf->tx_type_search.ml_tx_split_thresh = 30;
+ sf->tx_type_search.use_skip_flag_prediction = 1;
+ sf->tx_type_search.fast_intra_tx_type_search = 0;
+ sf->tx_type_search.fast_inter_tx_type_search = 0;
+ sf->tx_type_search.skip_tx_search = 0;
+ sf->selective_ref_frame = 0;
+ sf->less_rectangular_check_level = 0;
+ sf->use_square_partition_only_threshold = BLOCK_128X128;
+ sf->prune_ref_frame_for_rect_partitions = 0;
+ sf->auto_min_max_partition_size = NOT_IN_USE;
+ sf->rd_auto_partition_min_limit = BLOCK_4X4;
+ sf->default_max_partition_size = BLOCK_LARGEST;
+ sf->default_min_partition_size = BLOCK_4X4;
+ sf->adjust_partitioning_from_last_frame = 0;
+ sf->disable_split_mask = 0;
+ sf->mode_search_skip_flags = 0;
+ sf->disable_filter_search_var_thresh = 0;
+ sf->allow_partition_search_skip = 0;
+ sf->use_accurate_subpel_search = 2;
+ sf->disable_wedge_search_var_thresh = 0;
+ sf->fast_wedge_sign_estimate = 0;
+ sf->drop_ref = 0;
+ sf->skip_intra_in_interframe = 1;
+ sf->txb_split_cap = 1;
+ sf->adaptive_txb_search_level = 0;
+ sf->two_pass_partition_search = 0;
+ sf->mode_pruning_based_on_two_pass_partition_search = 0;
+ sf->use_intra_txb_hash = 0;
+ sf->use_inter_txb_hash = 1;
+ sf->use_mb_rd_hash = 1;
+ sf->optimize_b_precheck = 0;
+ sf->jnt_comp_fast_tx_search = 0;
+ sf->jnt_comp_skip_mv_search = 0;
+ sf->reuse_inter_intra_mode = 0;
+
+ for (i = 0; i < TX_SIZES; i++) {
+ sf->intra_y_mode_mask[i] = INTRA_ALL;
+ sf->intra_uv_mode_mask[i] = UV_INTRA_ALL;
+ }
+ sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
+ sf->use_fast_coef_costing = 0;
+ sf->max_intra_bsize = BLOCK_LARGEST;
+ // This setting only takes effect when partition_search_type is set
+ // to FIXED_PARTITION.
+ sf->always_this_block_size = BLOCK_16X16;
+ // Recode loop tolerance %.
+ sf->recode_tolerance = 25;
+ sf->partition_search_breakout_dist_thr = 0;
+ sf->partition_search_breakout_rate_thr = 0;
+ sf->simple_model_rd_from_var = 0;
+ sf->prune_ext_partition_types_search_level = 0;
+ sf->ml_prune_rect_partition = 0;
+ sf->ml_prune_ab_partition = 0;
+ sf->ml_prune_4_partition = 0;
+ sf->fast_cdef_search = 0;
+ for (i = 0; i < PARTITION_BLOCK_SIZES; ++i)
+ sf->ml_partition_search_breakout_thresh[i] = -1; // -1 means not enabled.
+
+ // Set this at the appropriate speed levels
+ sf->use_transform_domain_distortion = 0;
+ sf->gm_search_type = GM_FULL_SEARCH;
+ sf->gm_disable_recode = 0;
+ sf->use_fast_interpolation_filter_search = 0;
+ sf->skip_repeat_interpolation_filter_search = 0;
+ sf->use_hash_based_trellis = 0;
+ sf->prune_comp_search_by_single_result = 0;
+ sf->skip_repeated_newmv = 0;
+
+ // Set decoder side speed feature to use less dual sgr modes
+ sf->dual_sgr_penalty_level = 0;
+
+ sf->inter_mode_rd_model_estimation = 0;
+ sf->obmc_full_pixel_search_level = 0;
+
+ if (oxcf->mode == GOOD)
+ set_good_speed_features_framesize_independent(cpi, sf, oxcf->speed);
+
+ // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+ // blocks. Normalise this if the blocks are bigger.
+ if (MAX_SB_SIZE_LOG2 > 6) {
+ sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+ }
+
+ cpi->diamond_search_sad = av1_diamond_search_sad;
+ // Exhaustive (mesh) search setup; tables are indexed by oxcf->speed capped at MAX_MESH_SPEED.
+ sf->allow_exhaustive_searches = 1;
+ int speed = (oxcf->speed > MAX_MESH_SPEED) ? MAX_MESH_SPEED : oxcf->speed;
+ if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION)
+ sf->exhaustive_searches_thresh = (1 << 24);
+ else
+ sf->exhaustive_searches_thresh = (1 << 25);
+ sf->max_exaustive_pct = good_quality_max_mesh_pct[speed];
+ if (speed > 0)
+ sf->exhaustive_searches_thresh = sf->exhaustive_searches_thresh << 1;
+
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = good_quality_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval =
+ good_quality_mesh_patterns[speed][i].interval;
+ }
+ if ((frame_is_intra_only(cm) && cm->allow_screen_content_tools) &&
+ (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION ||
+ cpi->oxcf.content == AOM_CONTENT_SCREEN)) {
+ for (i = 0; i < MAX_MESH_STEP; ++i) {
+ sf->mesh_patterns[i].range = intrabc_mesh_patterns[speed][i].range;
+ sf->mesh_patterns[i].interval = intrabc_mesh_patterns[speed][i].interval;
+ }
+ sf->max_exaustive_pct = intrabc_max_mesh_pct[speed];
+ }
+
+ // Slow quant, dct and trellis not worthwhile for first pass
+ // so make sure they are always turned off.
+ if (oxcf->pass == 1) sf->optimize_coefficients = NO_TRELLIS_OPT;
+
+ // No recode and no trellis optimization for 1 pass.
+ if (oxcf->pass == 0) {
+ sf->recode_loop = DISALLOW_RECODE;
+ sf->optimize_coefficients = NO_TRELLIS_OPT;
+ }
+ // Pick the sub-pixel search function matching sf->mv.subpel_search_method.
+ if (sf->mv.subpel_search_method == SUBPEL_TREE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_MORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_more;
+ } else if (sf->mv.subpel_search_method == SUBPEL_TREE_PRUNED_EVENMORE) {
+ cpi->find_fractional_mv_step = av1_find_best_sub_pixel_tree_pruned_evenmore;
+ }
+ // Effective trellis mode: forced off for pass 1, quantisation matrices, or disable_trellis_quant.
+ cpi->optimize_speed_feature =
+ oxcf->pass != 1 ? sf->optimize_coefficients : NO_TRELLIS_OPT;
+ // FIXME: trellis not very efficient for quantisation matrices
+ if (cm->using_qmatrix) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+ if (oxcf->disable_trellis_quant) cpi->optimize_speed_feature = NO_TRELLIS_OPT;
+
+ x->min_partition_size = sf->default_min_partition_size;
+ x->max_partition_size = sf->default_max_partition_size;
+
+ // This is only used in motion vector unit test.
+ if (cpi->oxcf.motion_vector_unit_test == 1)
+ cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
+ else if (cpi->oxcf.motion_vector_unit_test == 2)
+ cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
+ // dist_8x8 is cleared when transform-domain distortion is enabled; if still set, it raises the min partition size to 8x8.
+#if CONFIG_DIST_8X8
+ if (sf->use_transform_domain_distortion > 0) cpi->oxcf.using_dist_8x8 = 0;
+
+ if (cpi->oxcf.using_dist_8x8) x->min_partition_size = BLOCK_8X8;
+#endif // CONFIG_DIST_8X8
+}
diff --git a/third_party/aom/av1/encoder/speed_features.h b/third_party/aom/av1/encoder/speed_features.h
new file mode 100644
index 0000000000..41013b2e79
--- /dev/null
+++ b/third_party/aom/av1/encoder/speed_features.h
@@ -0,0 +1,568 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_SPEED_FEATURES_H_
+#define AOM_AV1_ENCODER_SPEED_FEATURES_H_
+
+#include "av1/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum { // Intra mode bit masks (1 << mode) for intra_y_mode_mask[] and intra_uv_mode_mask[].
+ INTRA_ALL = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED) | (1 << D45_PRED) |
+ (1 << D135_PRED) | (1 << D113_PRED) | (1 << D157_PRED) |
+ (1 << D203_PRED) | (1 << D67_PRED) | (1 << SMOOTH_PRED) |
+ (1 << SMOOTH_V_PRED) | (1 << SMOOTH_H_PRED) | (1 << PAETH_PRED),
+ UV_INTRA_ALL =
+ (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_D45_PRED) | (1 << UV_D135_PRED) | (1 << UV_D113_PRED) |
+ (1 << UV_D157_PRED) | (1 << UV_D203_PRED) | (1 << UV_D67_PRED) |
+ (1 << UV_SMOOTH_PRED) | (1 << UV_SMOOTH_V_PRED) |
+ (1 << UV_SMOOTH_H_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC = (1 << UV_DC_PRED),
+ UV_INTRA_DC_CFL = (1 << UV_DC_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_TM = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED), // "TM" here selects UV_PAETH_PRED
+ UV_INTRA_DC_PAETH_CFL =
+ (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_H_V = (1 << UV_DC_PRED) | (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_V_PRED) |
+ (1 << UV_H_PRED) | (1 << UV_CFL_PRED),
+ UV_INTRA_DC_PAETH_H_V = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED),
+ UV_INTRA_DC_PAETH_H_V_CFL = (1 << UV_DC_PRED) | (1 << UV_PAETH_PRED) |
+ (1 << UV_V_PRED) | (1 << UV_H_PRED) |
+ (1 << UV_CFL_PRED),
+ INTRA_DC = (1 << DC_PRED),
+ INTRA_DC_TM = (1 << DC_PRED) | (1 << PAETH_PRED), // "TM" here selects PAETH_PRED
+ INTRA_DC_H_V = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+ INTRA_DC_PAETH_H_V =
+ (1 << DC_PRED) | (1 << PAETH_PRED) | (1 << V_PRED) | (1 << H_PRED)
+};
+
+enum { // Inter prediction mode bit masks (1 << mode).
+ INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEWMV) | (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) |
+ (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) | (1 << NEAR_NEWMV) |
+ (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) | (1 << GLOBAL_GLOBALMV),
+ INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << GLOBALMV) |
+ (1 << NEAREST_NEARESTMV) | (1 << GLOBAL_GLOBALMV) |
+ (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+ (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) |
+ (1 << NEAR_NEARMV), // i.e. INTER_ALL without NEWMV and NEW_NEWMV
+};
+
+enum { // Bit masks (1 << THR_*) for SPEED_FEATURES.disable_split_mask.
+ DISABLE_ALL_INTER_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) | (1 << THR_LAST),
+
+ DISABLE_ALL_SPLIT = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+ DISABLE_COMPOUND_SPLIT = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+ LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) | (1 << THR_COMP_LA) |
+ (1 << THR_ALTR) | (1 << THR_GOLD) // every inter bit except THR_LAST; THR_INTRA not set
+};
+
+typedef enum { // Each enumerator is a distinct single bit (1, 2, 4, ... 128).
+ TXFM_CODING_SF = 1,
+ INTER_PRED_SF = 2,
+ INTRA_PRED_SF = 4,
+ PARTITION_SF = 8,
+ LOOP_FILTER_SF = 16,
+ RD_SKIP_SF = 32,
+ RESERVE_2_SF = 64,
+ RESERVE_3_SF = 128,
+} DEV_SPEED_FEATURES;
+
+typedef enum { // Motion search methods; type of MV_SPEED_FEATURES.search_method.
+ DIAMOND = 0,
+ NSTEP = 1,
+ HEX = 2,
+ BIGDIA = 3,
+ SQUARE = 4,
+ FAST_HEX = 5,
+ FAST_DIAMOND = 6
+} SEARCH_METHODS;
+
+typedef enum { // Type of SPEED_FEATURES.recode_loop.
+ // No recode.
+ DISALLOW_RECODE = 0,
+ // Allow recode for KF and exceeding maximum frame bandwidth.
+ ALLOW_RECODE_KFMAXBW = 1,
+ // Allow recode only for KF/ARF/GF frames.
+ ALLOW_RECODE_KFARFGF = 2,
+ // Allow recode for all frames based on bitrate constraints.
+ ALLOW_RECODE = 3,
+} RECODE_LOOP_TYPE;
+
+typedef enum { // Type of MV_SPEED_FEATURES.subpel_search_method.
+ SUBPEL_TREE = 0,
+ SUBPEL_TREE_PRUNED = 1, // Prunes 1/2-pel searches
+ SUBPEL_TREE_PRUNED_MORE = 2, // Prunes 1/2-pel searches more aggressively
+ SUBPEL_TREE_PRUNED_EVENMORE = 3, // Prunes 1/2- and 1/4-pel searches
+ // Other methods to come
+} SUBPEL_SEARCH_METHODS;
+
+typedef enum { // Type of SPEED_FEATURES.tx_size_search_method.
+ USE_FULL_RD = 0,
+ USE_FAST_RD,
+ USE_LARGESTALL,
+} TX_SIZE_SEARCH_METHOD;
+
+typedef enum { // Type of SPEED_FEATURES.auto_min_max_partition_size.
+ NOT_IN_USE = 0,
+ RELAXED_NEIGHBORING_MIN_MAX = 1
+} AUTO_MIN_MAX_MODE;
+
+typedef enum { // Loop filter level selection methods; type of SPEED_FEATURES.lpf_pick.
+ // Try the full image with different values.
+ LPF_PICK_FROM_FULL_IMAGE,
+ // Try a small portion of the image with different values.
+ LPF_PICK_FROM_SUBIMAGE,
+ // Estimate the level based on quantizer and frame type.
+ LPF_PICK_FROM_Q,
+ // Pick 0 to disable LPF if LPF was enabled last frame.
+ LPF_PICK_MINIMAL_LPF
+} LPF_PICK_METHOD;
+
+typedef enum { // Bit flags OR-ed into SPEED_FEATURES.mode_search_skip_flags.
+ // Terminate search early based on distortion so far compared to
+ // qp step, distortion in the neighborhood of the frame, etc.
+ FLAG_EARLY_TERMINATE = 1 << 0,
+
+ // Skips comp inter modes if the best so far is an intra mode.
+ FLAG_SKIP_COMP_BESTINTRA = 1 << 1,
+
+ // Skips oblique intra modes if the best so far is an inter mode.
+ FLAG_SKIP_INTRA_BESTINTER = 1 << 3, // note: 1 << 2 is not assigned in this enum
+
+ // Skips oblique intra modes at angles 27, 63, 117, 153 if the best
+ // intra so far is not one of the neighboring directions.
+ FLAG_SKIP_INTRA_DIRMISMATCH = 1 << 4,
+
+ // Skips intra modes other than DC_PRED if the source variance is small.
+ FLAG_SKIP_INTRA_LOWVAR = 1 << 5,
+} MODE_SEARCH_SKIP_LOGIC;
+
+typedef enum { // Type of TX_TYPE_SEARCH.prune_mode.
+ NO_PRUNE = 0,
+ // eliminates one tx type in vertical and horizontal direction
+ PRUNE_ONE = 1,
+ // eliminates two tx types in each direction
+ PRUNE_TWO = 2,
+ // adaptively prunes the least promising tx types out of all 16
+ // (tuned to provide negligible quality loss)
+ PRUNE_2D_ACCURATE = 3,
+ // similar, but applies much more aggressive pruning to get better speed-up
+ PRUNE_2D_FAST = 4,
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct { // Transform type search controls; embedded as SPEED_FEATURES.tx_type_search.
+ TX_TYPE_PRUNE_MODE prune_mode;
+ int fast_intra_tx_type_search;
+ int fast_inter_tx_type_search;
+
+ // Use a skip flag prediction model to detect blocks with skip = 1 early
+ // and avoid doing full TX type search for such blocks.
+ int use_skip_flag_prediction;
+
+ // Threshold used by the ML based method to predict TX block split decisions.
+ int ml_tx_split_thresh;
+
+ // Skip the remaining transform type search when the rdcost of skip is found
+ // to be better than applying a transform.
+ int skip_tx_search;
+} TX_TYPE_SEARCH;
+
+typedef enum { // Type of SPEED_FEATURES.partition_search_type.
+ // Search partitions using RD criterion
+ SEARCH_PARTITION,
+
+ // Always use a fixed size partition (see SPEED_FEATURES.always_this_block_size)
+ FIXED_PARTITION,
+
+ REFERENCE_PARTITION
+} PARTITION_SEARCH_TYPE;
+
+typedef struct MV_SPEED_FEATURES { // Motion vector search controls; embedded as SPEED_FEATURES.mv.
+ // Motion search method (Diamond, NSTEP, Hex, Big Diamond, Square, etc).
+ SEARCH_METHODS search_method;
+
+ // This parameter controls which step in the n-step process we start at.
+ // It's changed adaptively based on circumstances.
+ int reduce_first_step_size;
+
+ // If this is set to 1, we limit the motion search range to 2 times the
+ // largest motion vector found in the last frame.
+ int auto_mv_step_size;
+
+ // Sub-pixel search method (see SUBPEL_SEARCH_METHODS). SUBPEL_TREE does a
+ // logarithmic search that keeps stepping at 1/2 pixel units until
+ // you stop getting a gain, and then goes on to 1/4 and repeats
+ // the same process. Along the way it skips many diagonals.
+ SUBPEL_SEARCH_METHODS subpel_search_method;
+
+ // Maximum number of steps in logarithmic subpel search before giving up.
+ int subpel_iters_per_step;
+
+ // Control when to stop subpel search
+ int subpel_force_stop;
+} MV_SPEED_FEATURES;
+
+#define MAX_MESH_STEP 4
+
+typedef struct MESH_PATTERN { // One entry of SPEED_FEATURES.mesh_patterns[] (exhaustive mesh search).
+ int range;
+ int interval;
+} MESH_PATTERN;
+
+typedef enum { // Type of SPEED_FEATURES.gm_search_type.
+ GM_FULL_SEARCH,
+ GM_REDUCED_REF_SEARCH,
+ GM_DISABLE_SEARCH
+} GM_SEARCH_TYPE;
+
+typedef enum { // Type of SPEED_FEATURES.gm_erroradv_type (global motion warp error threshold).
+ GM_ERRORADV_TR_0,
+ GM_ERRORADV_TR_1,
+ GM_ERRORADV_TR_2,
+ GM_ERRORADV_TR_TYPES, // last enumerator; its value equals the number of entries above
+} GM_ERRORADV_TYPE;
+
+typedef enum { // Type of SPEED_FEATURES.optimize_coefficients.
+ NO_TRELLIS_OPT, // No trellis optimization
+ FULL_TRELLIS_OPT, // Trellis optimization in all stages
+ FINAL_PASS_TRELLIS_OPT // Trellis optimization in only the final encode pass
+} TRELLIS_OPT_TYPE;
+
+typedef enum { // NOTE(review): no field of SPEED_FEATURES in this header uses this type - confirm where it is consumed.
+ FULL_TXFM_RD,
+ LOW_TXFM_RD,
+} TXFM_RD_MODEL;
+
+typedef struct SPEED_FEATURES { // Encoder speed/quality trade-off switches, stored as cpi->sf.
+ MV_SPEED_FEATURES mv;
+
+ // Frame level coding parameter update
+ int frame_parameter_update;
+
+ RECODE_LOOP_TYPE recode_loop;
+
+ // Trellis (dynamic programming) optimization of quantized values
+ TRELLIS_OPT_TYPE optimize_coefficients;
+
+ // Global motion warp error threshold
+ GM_ERRORADV_TYPE gm_erroradv_type;
+
+ // Always set to 0. If on it enables 0 cost background transmission
+ // (except for the initial transmission of the segmentation). The feature is
+ // disabled because the addition of very large block sizes makes the
+ // backgrounds very cheap to encode, and the segmentation we have
+ // adds overhead.
+ int static_segmentation;
+
+ // Limit the inter mode tested in the RD loop
+ int reduce_inter_modes;
+
+ // Do not compute the global motion parameters for a LAST2_FRAME or
+ // LAST3_FRAME if the GOLDEN_FRAME is closer and it has a non identity
+ // global model.
+ int selective_ref_gm;
+
+ // If 1 we iterate finding a best reference for 2 ref frames together - via
+ // a log search that iterates 4 times (check around mv for last for best
+ // error of combined predictor then check around mv for alt). If 0 we
+ // just use the best motion vector found for each frame by itself.
+ BLOCK_SIZE comp_inter_joint_search_thresh; // NOTE(review): a BLOCK_SIZE threshold, not a 0/1 flag - confirm how it is compared
+
+ // This variable is used to cap the maximum number of times we skip testing a
+ // mode to be evaluated. A high value means we will be faster.
+ int adaptive_rd_thresh;
+
+ // Determine which method we use to determine transform size. We can choose
+ // between options like full rd, largest for prediction size, largest
+ // for intra and model coefs for the rest.
+ TX_SIZE_SEARCH_METHOD tx_size_search_method;
+
+ // Init search depth for square and rectangular transform partitions.
+ // Values:
+ // 0 - search full tree, 1: search 1 level, 2: search the highest level only
+ int inter_tx_size_search_init_depth_sqr;
+ int inter_tx_size_search_init_depth_rect;
+ int intra_tx_size_search_init_depth_sqr;
+ int intra_tx_size_search_init_depth_rect;
+ // If any dimension of a coding block size is above 64, always search the
+ // largest transform only, since the largest transform block size is 64x64.
+ int tx_size_search_lgr_block;
+
+ PARTITION_SEARCH_TYPE partition_search_type;
+
+ TX_TYPE_SEARCH tx_type_search;
+
+ // Skip split transform block partition when the collocated bigger block
+ // is selected as all zero coefficients.
+ int txb_split_cap;
+
+ // Shortcut the transform block partition and type search when the target
+ // rdcost is relatively lower.
+ // Values are 0 (not used), or 1 - 2 with progressively increasing
+ // aggressiveness
+ int adaptive_txb_search_level;
+
+ // Prune level for tx_size_type search for inter based on rd model
+ // 0: no pruning
+ // 1-2: progressively increasing aggressiveness of pruning
+ int model_based_prune_tx_search_level;
+
+ // Model based breakout after interpolation filter search
+ // 0: no breakout
+ // 1: use model based rd breakout
+ int model_based_post_interp_filter_breakout;
+
+ // Used if partition_search_type = FIXED_PARTITION
+ BLOCK_SIZE always_this_block_size;
+
+ // Drop less likely to be picked reference frames in the RD search.
+ // Has three levels for now: 0, 1 and 2, where higher levels prune more
+ // aggressively than lower ones. (0 means no pruning).
+ int selective_ref_frame;
+
+ // Prune extended partition types search
+ // Can take values 0 - 2, 0 referring to no pruning, and 1 - 2 increasing
+ // aggressiveness of pruning in order.
+ int prune_ext_partition_types_search_level;
+
+ // Use a ML model to prune horz and vert partitions
+ int ml_prune_rect_partition;
+
+ // Use a ML model to prune horz_a, horz_b, vert_a and vert_b partitions.
+ int ml_prune_ab_partition;
+
+ // Use a ML model to prune horz4 and vert4 partitions.
+ int ml_prune_4_partition;
+
+ int fast_cdef_search;
+
+ // 2-pass coding block partition search
+ int two_pass_partition_search;
+
+ // Use the mode decisions made in the initial partition search to prune mode
+ // candidates, e.g. ref frames.
+ int mode_pruning_based_on_two_pass_partition_search;
+
+ // Skip rectangular partition test when partition type none gives better
+ // rd than partition type split. Can take values 0 - 2, 0 referring to no
+ // skipping, and 1 - 2 increasing aggressiveness of skipping in order.
+ int less_rectangular_check_level;
+
+ // Use square partition only beyond this block size.
+ BLOCK_SIZE use_square_partition_only_threshold;
+
+ // Prune reference frames for rectangular partitions.
+ int prune_ref_frame_for_rect_partitions;
+
+ // Sets min and max partition sizes for this superblock based on the
+ // same superblock in last encoded frame, and the left and above neighbor.
+ AUTO_MIN_MAX_MODE auto_min_max_partition_size;
+ // Ensures the rd based auto partition search will always
+ // go down at least to the specified level.
+ BLOCK_SIZE rd_auto_partition_min_limit;
+
+ // Min and max partition size we enable (block_size) as per auto
+ // min max, but also used by adjust partitioning, and pick_partitioning.
+ BLOCK_SIZE default_min_partition_size;
+ BLOCK_SIZE default_max_partition_size;
+
+ // Whether or not we allow partitions one smaller or one greater than the last
+ // frame's partitioning. Only used if use_lastframe_partitioning is set.
+ int adjust_partitioning_from_last_frame; // NOTE(review): no use_lastframe_partitioning field exists in this struct - comment may be stale
+
+ // Disables sub 8x8 blocksizes in different scenarios: Choices are to disable
+ // it always, to allow it for only Last frame and Intra, disable it for all
+ // inter modes or to enable it always.
+ int disable_split_mask;
+
+ // TODO(jingning): combine the related motion search speed features
+ // This allows us to use motion search at other sizes as a starting
+ // point for this motion search and limits the search range around it.
+ int adaptive_motion_search;
+
+ // Flag for allowing some use of exhaustive searches.
+ int allow_exhaustive_searches;
+
+ // Threshold for allowing exhaustive motion search.
+ int exhaustive_searches_thresh;
+
+ // Maximum number of exhaustive searches for a frame.
+ int max_exaustive_pct;
+
+ // Pattern to be used for any exhaustive mesh searches.
+ MESH_PATTERN mesh_patterns[MAX_MESH_STEP];
+
+ // Allows sub 8x8 modes to use the prediction filter that was determined
+ // best for 8x8 mode. If set to 0 we always re check all the filters for
+ // sizes less than 8x8, 1 means we check all filter modes if no 8x8 filter
+ // was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
+ int adaptive_pred_interp_filter;
+
+ // Adaptive prediction mode search
+ int adaptive_mode_search;
+
+ int cb_partition_search;
+
+ int alt_ref_search_fp;
+
+ // Implements various heuristics to skip searching modes
+ // The heuristics selected are based on flags
+ // defined in the MODE_SEARCH_SKIP_LOGIC enum
+ unsigned int mode_search_skip_flags;
+
+ // A source variance threshold below which filter search is disabled
+ // Choose a very large value (UINT_MAX) to use 8-tap always
+ unsigned int disable_filter_search_var_thresh;
+
+ // A source variance threshold below which wedge search is disabled
+ unsigned int disable_wedge_search_var_thresh;
+
+ // Whether fast wedge sign estimate is used
+ int fast_wedge_sign_estimate;
+
+ // These bit masks allow you to enable or disable intra modes for each
+ // transform size separately.
+ int intra_y_mode_mask[TX_SIZES];
+ int intra_uv_mode_mask[TX_SIZES];
+
+ // This feature controls how the loop filter level is determined.
+ LPF_PICK_METHOD lpf_pick;
+
+ // This feature controls whether we do the expensive context update and
+ // calculation in the rd coefficient costing loop.
+ int use_fast_coef_costing;
+
+ // This feature controls the tolerance vs target used in deciding whether to
+ // recode a frame. It has no meaning if recode is disabled.
+ int recode_tolerance;
+
+ // This variable controls the maximum block size where intra blocks can be
+ // used in inter frames.
+ // TODO(aconverse): Fold this into one of the other many mode skips
+ BLOCK_SIZE max_intra_bsize;
+
+ // Partition search early breakout thresholds.
+ int64_t partition_search_breakout_dist_thr;
+ int partition_search_breakout_rate_thr;
+
+ // Thresholds for ML based partition search breakout.
+ int ml_partition_search_breakout_thresh[PARTITION_BLOCK_SIZES];
+
+ // Allow skipping partition search for still image frame
+ int allow_partition_search_skip;
+
+ // Fast approximation of av1_model_rd_from_var_lapndz
+ int simple_model_rd_from_var;
+
+ // If true, sub-pixel search uses the exact convolve function used for final
+ // encoding and decoding; otherwise, it uses bilinear interpolation.
+ int use_accurate_subpel_search; // NOTE(review): assigned 0, 1 and 2 in speed_features.c - meaning of 1 vs 2 to be confirmed
+
+ // Whether to compute distortion in the image domain (slower but
+ // more accurate), or in the transform domain (faster but less accurate).
+ // 0: use image domain
+ // 1: use transform domain in tx_type search, and use image domain for
+ // RD_STATS
+ // 2: use transform domain
+ int use_transform_domain_distortion;
+
+ GM_SEARCH_TYPE gm_search_type;
+
+ // whether to disable the global motion recode loop
+ int gm_disable_recode;
+
+ // Do limited interpolation filter search for dual filters, since best choice
+ // usually includes EIGHTTAP_REGULAR.
+ int use_fast_interpolation_filter_search;
+
+ // Save results of interpolation_filter_search for a block
+ // Check mv and ref_frames before search, if they are same with previous
+ // saved results, it can be skipped.
+ int skip_repeat_interpolation_filter_search;
+
+ // Use a hash table to store previously computed optimized qcoeffs from
+ // expensive calls to optimize_txb.
+ int use_hash_based_trellis;
+
+ // flag to drop some ref frames in compound motion search
+ int drop_ref;
+
+ // flag to allow skipping intra mode for inter frame prediction
+ int skip_intra_in_interframe;
+
+ // Use hash table to store intra(keyframe only) txb transform search results
+ // to avoid repeated search on the same residue signal.
+ int use_intra_txb_hash;
+
+ // Use hash table to store inter txb transform search results
+ // to avoid repeated search on the same residue signal.
+ int use_inter_txb_hash;
+
+ // Use hash table to store macroblock RD search results
+ // to avoid repeated search on the same residue signal.
+ int use_mb_rd_hash;
+
+ // Calculate RD cost before doing optimize_b, and skip if the cost is large.
+ int optimize_b_precheck;
+
+ // Use model rd instead of transform search in jnt_comp
+ int jnt_comp_fast_tx_search;
+
+ // Skip mv search in jnt_comp
+ int jnt_comp_skip_mv_search;
+
+ // Decoder side speed feature to add penalty for use of dual-sgr filters.
+ // Takes values 0 - 10, 0 indicating no penalty and each additional level
+ // adding a penalty of 1%
+ int dual_sgr_penalty_level;
+
+ // Dynamically estimate final rd from prediction error and mode cost
+ int inter_mode_rd_model_estimation;
+
+ // Skip some ref frames in compound motion search by single motion search
+ // result. Has three levels for now: 0 referring to no skipping, and 1 - 3
+ // increasing aggressiveness of skipping in order.
+ // Note: The search order might affect the result. It is better to search same
+ // single inter mode as a group.
+ int prune_comp_search_by_single_result; // NOTE(review): comment says "three levels" yet lists 0 and 1 - 3; speed_features.c assigns 0, 1, 2 - confirm range
+
+ // Reuse the inter_intra_mode search result from NEARESTMV mode to other
+ // single ref modes
+ int reuse_inter_intra_mode;
+
+ // Set the full pixel search level of obmc
+ // 0: obmc_full_pixel_diamond
+ // 1: obmc_refining_search_sad (faster)
+ int obmc_full_pixel_search_level;
+
+ // flag to skip NEWMV mode in drl if the motion search result is the same
+ int skip_repeated_newmv;
+} SPEED_FEATURES;
+
+struct AV1_COMP;
+
+void av1_set_speed_features_framesize_independent(struct AV1_COMP *cpi);
+void av1_set_speed_features_framesize_dependent(struct AV1_COMP *cpi);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_SPEED_FEATURES_H_
diff --git a/third_party/aom/av1/encoder/temporal_filter.c b/third_party/aom/av1/encoder/temporal_filter.c
new file mode 100644
index 0000000000..75fdf02a52
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.c
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "av1/common/alloccommon.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/quant_common.h"
+#include "av1/common/reconinter.h"
+#include "av1/common/odintrin.h"
+#include "av1/encoder/av1_quantize.h"
+#include "av1/encoder/extend.h"
+#include "av1/encoder/firstpass.h"
+#include "av1/encoder/mcomp.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/ratectrl.h"
+#include "av1/encoder/reconinter_enc.h"
+#include "av1/encoder/segmentation.h"
+#include "av1/encoder/temporal_filter.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/aom_timer.h"
+#include "aom_scale/aom_scale.h"
+
+// Builds the motion-compensated predictors for one 16x16 macroblock.
+//
+// The luma predictor is written to pred[0..255] with a row pitch of 16. When
+// num_planes > 1 the U predictor is written at pred[256] and the V predictor
+// at pred[512], each with a row pitch of uv_block_width.
+//
+// A chroma block width of 8 selects a halved chroma stride and Q4 motion
+// vector precision; any other width reuses the luma stride with Q3 precision.
+static void temporal_filter_predictors_mb_c(
+    MACROBLOCKD *xd, uint8_t *y_mb_ptr, uint8_t *u_mb_ptr, uint8_t *v_mb_ptr,
+    int stride, int uv_block_width, int uv_block_height, int mv_row, int mv_col,
+    uint8_t *pred, struct scale_factors *scale, int x, int y,
+    int can_use_previous, int num_planes) {
+  const MV mv = { mv_row, mv_col };
+  const int chroma_is_8_wide = (uv_block_width == 8);
+  const int uv_stride = chroma_is_8_wide ? (stride + 1) >> 1 : stride;
+  const enum mv_precision mv_precision_uv =
+      chroma_is_8_wide ? MV_PRECISION_Q4 : MV_PRECISION_Q3;
+  // TODO(angiebird): change plane setting accordingly
+  ConvolveParams conv_params = get_conv_params(0, 0, xd->bd);
+  const InterpFilters interp_filters = xd->mi[0]->interp_filters;
+  WarpTypesAllowed warp_types;
+  memset(&warp_types, 0, sizeof(WarpTypesAllowed));
+
+  // Luma.
+  av1_build_inter_predictor(y_mb_ptr, stride, &pred[0], 16, &mv, scale, 16, 16,
+                            &conv_params, interp_filters, &warp_types, x, y, 0,
+                            0, MV_PRECISION_Q3, x, y, xd, can_use_previous);
+
+  if (num_planes <= 1) return;
+
+  // Chroma: plane 1 (U) lands at pred[256], plane 2 (V) at pred[512].
+  for (int plane = 1; plane <= 2; ++plane) {
+    uint8_t *const chroma_src = (plane == 1) ? u_mb_ptr : v_mb_ptr;
+    av1_build_inter_predictor(
+        chroma_src, uv_stride, &pred[256 * plane], uv_block_width, &mv, scale,
+        uv_block_width, uv_block_height, &conv_params, interp_filters,
+        &warp_types, x, y, plane, 0, mv_precision_uv, x, y, xd,
+        can_use_previous);
+  }
+}
+
+// Accumulates one frame's weighted contribution to the temporal filter for a
+// block of 8-bit samples.
+//
+// frame1:       source block, rows `stride` samples apart.
+// frame2:       predictor block, stored contiguously (row pitch block_width).
+// strength:     right shift applied to the error measure; a larger value
+//               makes the per-pixel weight fall off more slowly.
+// filter_weight: per-frame multiplier applied to the per-pixel weight.
+// accumulator:  per-pixel sum of weight * predictor value (updated in place).
+// count:        per-pixel sum of weights (updated in place).
+//
+// For each pixel the squared differences between frame1 and frame2 over the
+// part of the 3x3 neighbourhood that lies inside the block are averaged
+// (scaled by 3), rounded, shifted by `strength` and clamped to 16. The weight
+// is (16 - clamped error) * filter_weight.
+void av1_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride,
+                                 uint8_t *frame2, unsigned int block_width,
+                                 unsigned int block_height, int strength,
+                                 int filter_weight, unsigned int *accumulator,
+                                 uint16_t *count) {
+  const int width = (int)block_width;
+  const int height = (int)block_height;
+  const int src_stride = (int)stride;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+  unsigned int k = 0;
+
+  for (int row = 0; row < height; ++row) {
+    const uint8_t *const src_row = frame1 + row * src_stride;
+    const uint8_t *const pred_row = frame2 + row * width;
+    // Vertical extent of the window, clipped to the block.
+    const int dy_first = row > 0 ? -1 : 0;
+    const int dy_last = row + 1 < height ? 1 : 0;
+
+    for (int col = 0; col < width; ++col, ++k) {
+      // Horizontal extent of the window, clipped to the block.
+      const int dx_first = col > 0 ? -1 : 0;
+      const int dx_last = col + 1 < width ? 1 : 0;
+      int sse_sum = 0;
+      int num_samples = 0;
+
+      // non-local mean approach
+      for (int dy = dy_first; dy <= dy_last; ++dy) {
+        for (int dx = dx_first; dx <= dx_last; ++dx) {
+          const int diff = src_row[dy * src_stride + col + dx] -
+                           pred_row[dy * width + col + dx];
+          sse_sum += diff * diff;
+          ++num_samples;
+        }
+      }
+
+      assert(num_samples > 0);
+
+      int modifier = (sse_sum * 3) / num_samples;
+      modifier = (modifier + rounding) >> strength;
+      if (modifier > 16) modifier = 16;
+      modifier = (16 - modifier) * filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pred_row[col];
+    }
+  }
+}
+
+// High-bitdepth counterpart of av1_temporal_filter_apply_c().
+//
+// frame1_8 and frame2_8 are converted with CONVERT_TO_SHORTPTR and then read
+// as 16-bit samples; all other parameters and the weighting rule are the same
+// as in the 8-bit version: the per-pixel weight is
+// (16 - min(16, scaled local squared error)) * filter_weight, and it is added
+// to count[] while weight * predictor value is added to accumulator[].
+void av1_highbd_temporal_filter_apply_c(
+    uint8_t *frame1_8, unsigned int stride, uint8_t *frame2_8,
+    unsigned int block_width, unsigned int block_height, int strength,
+    int filter_weight, unsigned int *accumulator, uint16_t *count) {
+  const uint16_t *const frame1 = CONVERT_TO_SHORTPTR(frame1_8);
+  const uint16_t *const frame2 = CONVERT_TO_SHORTPTR(frame2_8);
+  const int width = (int)block_width;
+  const int height = (int)block_height;
+  const int src_stride = (int)stride;
+  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
+  unsigned int k = 0;
+
+  for (int row = 0; row < height; ++row) {
+    const uint16_t *const src_row = frame1 + row * src_stride;
+    const uint16_t *const pred_row = frame2 + row * width;
+    // Vertical extent of the window, clipped to the block.
+    const int dy_first = row > 0 ? -1 : 0;
+    const int dy_last = row + 1 < height ? 1 : 0;
+
+    for (int col = 0; col < width; ++col, ++k) {
+      // Horizontal extent of the window, clipped to the block.
+      const int dx_first = col > 0 ? -1 : 0;
+      const int dx_last = col + 1 < width ? 1 : 0;
+      int sse_sum = 0;
+      int num_samples = 0;
+
+      // non-local mean approach
+      for (int dy = dy_first; dy <= dy_last; ++dy) {
+        for (int dx = dx_first; dx <= dx_last; ++dx) {
+          const int diff = src_row[dy * src_stride + col + dx] -
+                           pred_row[dy * width + col + dx];
+          sse_sum += diff * diff;
+          ++num_samples;
+        }
+      }
+
+      assert(num_samples > 0);
+
+      int modifier = (sse_sum * 3) / num_samples;
+      modifier = (modifier + rounding) >> strength;
+      if (modifier > 16) modifier = 16;
+      modifier = (16 - modifier) * filter_weight;
+
+      count[k] += modifier;
+      accumulator[k] += modifier * pred_row[col];
+    }
+  }
+}
+
+// Motion-searches one 16x16 block of the ARF source against another frame.
+//
+// arf_frame_buf: block in the ARF source frame (installed as the search
+//                source).
+// frame_ptr_buf: co-located block in the frame being matched (installed as
+//                the prediction reference).
+// stride:        row stride used for both buffers.
+// x_pos, y_pos:  block position, forwarded to av1_full_pixel_search().
+//
+// Returns the error reported by the final search stage (the 16x16 variance
+// function when integer MVs are forced, otherwise the sub-pixel search).
+//
+// Side effects visible in this function: the winning vector (x->best_mv) is
+// stored in xd->mi[0]->mv[0]; x->mvcost and x->nmvjointcost are repointed and
+// not restored; plane-0 src/pre buffers and x->mv_limits are restored.
+static int temporal_filter_find_matching_mb_c(AV1_COMP *cpi,
+ uint8_t *arf_frame_buf,
+ uint8_t *frame_ptr_buf,
+ int stride, int x_pos,
+ int y_pos) {
+ MACROBLOCK *const x = &cpi->td.mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
+ int step_param;
+ int sadpb = x->sadperbit16;
+ int bestsme = INT_MAX;
+ int distortion;
+ unsigned int sse;
+ int cost_list[5];
+ // Caller-provided limits; av1_set_mv_search_range() below narrows them.
+ MvLimits tmp_mv_limits = x->mv_limits;
+
+ // The search is always centred on the zero vector.
+ MV best_ref_mv1 = kZeroMv;
+ MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
+
+ // Save input state
+ struct buf_2d src = x->plane[0].src;
+ struct buf_2d pre = xd->plane[0].pre[0];
+
+ // Convert from 1/8-pel units to full-pel units.
+ best_ref_mv1_full.col = best_ref_mv1.col >> 3;
+ best_ref_mv1_full.row = best_ref_mv1.row >> 3;
+
+ // Setup frame pointers
+ x->plane[0].src.buf = arf_frame_buf;
+ x->plane[0].src.stride = stride;
+ xd->plane[0].pre[0].buf = frame_ptr_buf;
+ xd->plane[0].pre[0].stride = stride;
+
+ step_param = mv_sf->reduce_first_step_size;
+ step_param = AOMMIN(step_param, MAX_MVSEARCH_STEPS - 2);
+
+ av1_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
+
+ x->mvcost = x->mv_cost_stack;
+ x->nmvjointcost = x->nmv_vec_cost;
+
+ // Full-pixel stage; the result is read back from x->best_mv below.
+ av1_full_pixel_search(cpi, x, BLOCK_16X16, &best_ref_mv1_full, step_param,
+ NSTEP, 1, sadpb, cond_cost_list(cpi, cost_list),
+ &best_ref_mv1, 0, 0, x_pos, y_pos, 0);
+ x->mv_limits = tmp_mv_limits;
+
+ // Ignore mv costing by sending NULL pointer instead of cost array
+ if (cpi->common.cur_frame_force_integer_mv == 1) {
+ // Integer-MV frames: skip sub-pixel refinement and score the full-pel
+ // match directly with the 16x16 variance function.
+ const uint8_t *const src_address = x->plane[0].src.buf;
+ const int src_stride = x->plane[0].src.stride;
+ const uint8_t *const y = xd->plane[0].pre[0].buf;
+ const int y_stride = xd->plane[0].pre[0].stride;
+ // Offset is computed while best_mv is still in full-pel units.
+ const int offset = x->best_mv.as_mv.row * y_stride + x->best_mv.as_mv.col;
+
+ // Convert the stored vector to 1/8-pel units.
+ x->best_mv.as_mv.row *= 8;
+ x->best_mv.as_mv.col *= 8;
+
+ bestsme = cpi->fn_ptr[BLOCK_16X16].vf(y + offset, y_stride, src_address,
+ src_stride, &sse);
+ } else {
+ bestsme = cpi->find_fractional_mv_step(
+ x, &cpi->common, 0, 0, &best_ref_mv1,
+ cpi->common.allow_high_precision_mv, x->errorperbit,
+ &cpi->fn_ptr[BLOCK_16X16], 0, mv_sf->subpel_iters_per_step,
+ cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL,
+ NULL, 0, 0, 0, 0, 0);
+ }
+
+ // Publish the result where the predictor builder reads it.
+ x->e_mbd.mi[0]->mv[0] = x->best_mv;
+
+ // Restore input state
+ x->plane[0].src = src;
+ xd->plane[0].pre[0] = pre;
+
+ return bestsme;
+}
+
+// Runs the temporal filter over every 16x16 macroblock of the ARF source and
+// writes the filtered result into cpi->alt_ref_buffer.
+//
+// frames:        candidate frames; NULL entries are skipped.
+// frame_count:   number of entries in frames.
+// alt_ref_index: index in frames of the ARF source itself; it always gets
+//                filter weight 2 and is not motion searched.
+// strength:      filter strength (raised by 2 * (bd - 8) on the high-bitdepth
+//                path).
+// scale:         scale factors forwarded to the predictor builder.
+//
+// Per macroblock, accumulator[] and count[] are laid out as Y at [0, 256),
+// U starting at 256 and V starting at 512, matching the predictor layout.
+//
+// NOTE(review): the same mb_y_offset / mb_uv_offset (advanced with the ARF
+// source strides) index every input frame and cpi->alt_ref_buffer, so this
+// appears to assume all of them share the same strides - confirm.
+static void temporal_filter_iterate_c(AV1_COMP *cpi,
+ YV12_BUFFER_CONFIG **frames,
+ int frame_count, int alt_ref_index,
+ int strength,
+ struct scale_factors *scale) {
+ const AV1_COMMON *cm = &cpi->common;
+ const int num_planes = av1_num_planes(cm);
+ int byte;
+ int frame;
+ int mb_col, mb_row;
+ unsigned int filter_weight;
+ // Frame size in 16x16 macroblocks, rounded up.
+ int mb_cols = (frames[alt_ref_index]->y_crop_width + 15) >> 4;
+ int mb_rows = (frames[alt_ref_index]->y_crop_height + 15) >> 4;
+ int mb_y_offset = 0;
+ int mb_uv_offset = 0;
+ DECLARE_ALIGNED(16, unsigned int, accumulator[16 * 16 * 3]);
+ DECLARE_ALIGNED(16, uint16_t, count[16 * 16 * 3]);
+ MACROBLOCKD *mbd = &cpi->td.mb.e_mbd;
+ YV12_BUFFER_CONFIG *f = frames[alt_ref_index];
+ uint8_t *dst1, *dst2;
+ DECLARE_ALIGNED(32, uint16_t, predictor16[16 * 16 * 3]);
+ DECLARE_ALIGNED(32, uint8_t, predictor8[16 * 16 * 3]);
+ uint8_t *predictor;
+ const int mb_uv_height = 16 >> mbd->plane[1].subsampling_y;
+ const int mb_uv_width = 16 >> mbd->plane[1].subsampling_x;
+
+ // Save input state
+ uint8_t *input_buffer[MAX_MB_PLANE];
+ int i;
+ // Pick the predictor scratch buffer matching the sample size.
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ predictor = CONVERT_TO_BYTEPTR(predictor16);
+ } else {
+ predictor = predictor8;
+ }
+
+ for (i = 0; i < num_planes; i++) input_buffer[i] = mbd->plane[i].pre[0].buf;
+
+ for (mb_row = 0; mb_row < mb_rows; mb_row++) {
+ // Source frames are extended to 16 pixels. This is different than
+ // L/A/G reference frames that have a border of 32 (AV1ENCBORDERINPIXELS)
+ // A 6/8 tap filter is used for motion search. This requires 2 pixels
+ // before and 3 pixels after. So the largest Y mv on a border would
+ // then be 16 - AOM_INTERP_EXTEND. The UV blocks are half the size of the
+ // Y and therefore only extended by 8. The largest mv that a UV block
+ // can support is 8 - AOM_INTERP_EXTEND. A UV mv is half of a Y mv.
+ // (16 - AOM_INTERP_EXTEND) >> 1 which is greater than
+ // 8 - AOM_INTERP_EXTEND.
+ // To keep the mv in play for both Y and UV planes the max that it
+ // can be on a border is therefore 16 - (2*AOM_INTERP_EXTEND+1).
+ cpi->td.mb.mv_limits.row_min =
+ -((mb_row * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.row_max =
+ ((mb_rows - 1 - mb_row) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (mb_col = 0; mb_col < mb_cols; mb_col++) {
+ int j, k;
+ int stride;
+
+ // Start each macroblock with empty sums.
+ memset(accumulator, 0, 16 * 16 * 3 * sizeof(accumulator[0]));
+ memset(count, 0, 16 * 16 * 3 * sizeof(count[0]));
+
+ cpi->td.mb.mv_limits.col_min =
+ -((mb_col * 16) + (17 - 2 * AOM_INTERP_EXTEND));
+ cpi->td.mb.mv_limits.col_max =
+ ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * AOM_INTERP_EXTEND);
+
+ for (frame = 0; frame < frame_count; frame++) {
+ // Match-error thresholds that map to filter weights 2, 1 and 0.
+ const int thresh_low = 10000;
+ const int thresh_high = 20000;
+
+ if (frames[frame] == NULL) continue;
+
+ // Default to a zero vector; the motion search below overwrites it for
+ // non-ARF frames.
+ mbd->mi[0]->mv[0].as_mv.row = 0;
+ mbd->mi[0]->mv[0].as_mv.col = 0;
+ mbd->mi[0]->motion_mode = SIMPLE_TRANSLATION;
+
+ if (frame == alt_ref_index) {
+ filter_weight = 2;
+ } else {
+ // Find best match in this frame by MC
+ int err = temporal_filter_find_matching_mb_c(
+ cpi, frames[alt_ref_index]->y_buffer + mb_y_offset,
+ frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride,
+ mb_col * 16, mb_row * 16);
+
+ // Assign higher weight to matching MB if its error
+ // score is lower. If not applying MC default behavior
+ // is to weight all MBs equal.
+ filter_weight = err < thresh_low ? 2 : err < thresh_high ? 1 : 0;
+ }
+
+ if (filter_weight != 0) {
+ // Construct the predictors
+ temporal_filter_predictors_mb_c(
+ mbd, frames[frame]->y_buffer + mb_y_offset,
+ frames[frame]->u_buffer + mb_uv_offset,
+ frames[frame]->v_buffer + mb_uv_offset, frames[frame]->y_stride,
+ mb_uv_width, mb_uv_height, mbd->mi[0]->mv[0].as_mv.row,
+ mbd->mi[0]->mv[0].as_mv.col, predictor, scale, mb_col * 16,
+ mb_row * 16, cm->allow_warped_motion, num_planes);
+
+ // Apply the filter (YUV)
+ // NOTE(review): the high-bitdepth path calls the un-suffixed
+ // av1_highbd_temporal_filter_apply while the 8-bit path calls the _c
+ // function directly - confirm this asymmetry is intended.
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ int adj_strength = strength + 2 * (mbd->bd - 8);
+ av1_highbd_temporal_filter_apply(
+ f->y_buffer + mb_y_offset, f->y_stride, predictor, 16, 16,
+ adj_strength, filter_weight, accumulator, count);
+ if (num_planes > 1) {
+ av1_highbd_temporal_filter_apply(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_highbd_temporal_filter_apply(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, adj_strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+ } else {
+ av1_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+ predictor, 16, 16, strength,
+ filter_weight, accumulator, count);
+ if (num_planes > 1) {
+ av1_temporal_filter_apply_c(
+ f->u_buffer + mb_uv_offset, f->uv_stride, predictor + 256,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 256, count + 256);
+ av1_temporal_filter_apply_c(
+ f->v_buffer + mb_uv_offset, f->uv_stride, predictor + 512,
+ mb_uv_width, mb_uv_height, strength, filter_weight,
+ accumulator + 512, count + 512);
+ }
+ }
+ }
+ }
+
+ // Normalize filter output to produce AltRef frame
+ // Each output sample is accumulator / count, rounded to nearest.
+ if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ uint16_t *dst1_16;
+ uint16_t *dst2_16;
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+
+ byte += stride - 16;
+ }
+ if (num_planes > 1) {
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ dst1_16 = CONVERT_TO_SHORTPTR(dst1);
+ dst2_16 = CONVERT_TO_SHORTPTR(dst2);
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ // k walks the U sums (from 256); m = k + 256 is the matching V sum.
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+ // U
+ dst1_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+ // V
+ dst2_16[byte] =
+ (uint16_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+ } else {
+ dst1 = cpi->alt_ref_buffer.y_buffer;
+ stride = cpi->alt_ref_buffer.y_stride;
+ byte = mb_y_offset;
+ for (i = 0, k = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++, k++) {
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - 16;
+ }
+ if (num_planes > 1) {
+ dst1 = cpi->alt_ref_buffer.u_buffer;
+ dst2 = cpi->alt_ref_buffer.v_buffer;
+ stride = cpi->alt_ref_buffer.uv_stride;
+ byte = mb_uv_offset;
+ // k walks the U sums (from 256); m = k + 256 is the matching V sum.
+ for (i = 0, k = 256; i < mb_uv_height; i++) {
+ for (j = 0; j < mb_uv_width; j++, k++) {
+ int m = k + 256;
+ // U
+ dst1[byte] =
+ (uint8_t)OD_DIVU(accumulator[k] + (count[k] >> 1), count[k]);
+ // V
+ dst2[byte] =
+ (uint8_t)OD_DIVU(accumulator[m] + (count[m] >> 1), count[m]);
+ // move to next pixel
+ byte++;
+ }
+ byte += stride - mb_uv_width;
+ }
+ }
+ }
+ // Step to the next macroblock in this row.
+ mb_y_offset += 16;
+ mb_uv_offset += mb_uv_width;
+ }
+ // Rewind the mb_cols column steps and move down one macroblock row.
+ mb_y_offset += 16 * (f->y_stride - mb_cols);
+ mb_uv_offset += mb_uv_height * f->uv_stride - mb_uv_width * mb_cols;
+ }
+
+ // Restore input state
+ for (i = 0; i < num_planes; i++) mbd->plane[i].pre[0].buf = input_buffer[i];
+}
+
+// Derives the ARNR filter length and strength for the ARF that lies
+// `distance` frames ahead.
+//
+// The configured maximum length is clipped to the frames available on either
+// side of the ARF in the lookahead, the configured strength is reduced at low
+// average q, and both are finally capped by the group boost.
+//
+// arnr_frames:   receives the number of frames to filter.
+// arnr_strength: receives the filter strength.
+static void adjust_arnr_filter(AV1_COMP *cpi, int distance, int group_boost,
+                               int *arnr_frames, int *arnr_strength) {
+  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
+  const int frames_after_arf =
+      av1_lookahead_depth(cpi->lookahead) - distance - 1;
+
+  // Define the forward and backwards filter limits for this arnr group.
+  int fwd = (oxcf->arnr_max_frames - 1) >> 1;
+  if (fwd > frames_after_arf) fwd = frames_after_arf;
+  if (fwd > distance) fwd = distance;
+
+  // For even length filter there is one more frame backward
+  // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff.
+  int bwd = fwd;
+  if (bwd < distance) bwd += (oxcf->arnr_max_frames + 1) & 0x1;
+
+  // Set the baseline active filter size.
+  int num_frames = bwd + 1 + fwd;
+
+  // Adjust the strength based on active max q. The key-frame average is used
+  // until more than one frame has been coded.
+  const int qindex =
+      cpi->rc.avg_frame_qindex[cpi->common.current_video_frame > 1
+                                   ? INTER_FRAME
+                                   : KEY_FRAME];
+  const int q =
+      (int)av1_convert_qindex_to_q(qindex, cpi->common.seq_params.bit_depth);
+
+  int filt_strength = oxcf->arnr_strength;
+  if (q <= 16) {
+    filt_strength -= (16 - q) / 2;
+    if (filt_strength < 0) filt_strength = 0;
+  }
+
+  // Adjust number of frames in filter and strength based on gf boost level.
+  const int boost_frames = group_boost / 150;
+  if (num_frames > boost_frames) {
+    // Keep the capped length odd.
+    num_frames = boost_frames + !(boost_frames & 1);
+  }
+
+  const int boost_strength = group_boost / 300;
+  if (filt_strength > boost_strength) filt_strength = boost_strength;
+
+  *arnr_frames = num_frames;
+  *arnr_strength = filt_strength;
+}
+
+// Produces the temporally filtered alt-ref frame for the ARF that lies
+// `distance` frames ahead in the lookahead queue and stores it in
+// cpi->alt_ref_buffer.
+//
+// Also records, for the ARF slot of the current GF-group entry, whether
+// filtering is effectively off (strength 0 over a single frame) in
+// cpi->is_arf_filter_off[], and mirrors that flag into
+// cpi->common.showable_frame.
+//
+// If the filter length comes out non-positive, or the ARF source frame is
+// not available in the lookahead, the function returns after updating the
+// flags without touching cpi->alt_ref_buffer.
+void av1_temporal_filter(AV1_COMP *cpi, int distance) {
+  RATE_CONTROL *const rc = &cpi->rc;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+  YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
+  struct scale_factors sf;
+  int frames_to_blur;
+  int strength;
+
+  // Apply context specific adjustments to the arnr filter parameters.
+  adjust_arnr_filter(cpi, distance, rc->gfu_boost, &frames_to_blur, &strength);
+  // TODO(weitinglin): Currently, we enforce the filtering strength on
+  //                   extra ARFs' to be zeros. We should investigate in which
+  //                   case it is more beneficial to use non-zero strength
+  //                   filtering.
+  if (gf_group->update_type[gf_group->index] == INTNL_ARF_UPDATE) {
+    strength = 0;
+    frames_to_blur = 1;
+  }
+
+  const int which_arf = gf_group->arf_update_idx[gf_group->index];
+
+  // Set the temporal filtering status for the corresponding OVERLAY frame
+  if (strength == 0 && frames_to_blur == 1)
+    cpi->is_arf_filter_off[which_arf] = 1;
+  else
+    cpi->is_arf_filter_off[which_arf] = 0;
+  cpi->common.showable_frame = cpi->is_arf_filter_off[which_arf];
+
+  // With no frames selected there is no ARF entry in frames[] and no scale
+  // factors to set up; running the filter would dereference a NULL frame and
+  // pass an uninitialized scale_factors struct.
+  if (frames_to_blur <= 0) return;
+
+  const int frames_to_blur_backward = frames_to_blur / 2;
+  const int frames_to_blur_forward = (frames_to_blur - 1) / 2;
+  const int start_frame = distance + frames_to_blur_forward;
+
+  // Setup frame pointers, NULL indicates frame not included in filter.
+  for (int frame = 0; frame < frames_to_blur; ++frame) {
+    const int which_buffer = start_frame - frame;
+    struct lookahead_entry *const buf =
+        av1_lookahead_peek(cpi->lookahead, which_buffer);
+    // Map a missing lookahead entry to NULL explicitly instead of forming a
+    // member address from a NULL pointer.
+    frames[frames_to_blur - 1 - frame] = (buf != NULL) ? &buf->img : NULL;
+  }
+
+  // The filter reads its geometry and source pixels from the ARF entry, so it
+  // cannot run without it.
+  YV12_BUFFER_CONFIG *const arf = frames[frames_to_blur_backward];
+  if (arf == NULL) return;
+
+  // Setup scaling factors. Scaling on each of the arnr frames is not
+  // supported.
+  // ARF is produced at the native frame size and resized when coded.
+  // frames[0] is used as before when present; the ARF stands in otherwise.
+  const YV12_BUFFER_CONFIG *const size_ref =
+      (frames[0] != NULL) ? frames[0] : arf;
+  av1_setup_scale_factors_for_frame(&sf, size_ref->y_crop_width,
+                                    size_ref->y_crop_height,
+                                    size_ref->y_crop_width,
+                                    size_ref->y_crop_height);
+
+  temporal_filter_iterate_c(cpi, frames, frames_to_blur,
+                            frames_to_blur_backward, strength, &sf);
+}
diff --git a/third_party/aom/av1/encoder/temporal_filter.h b/third_party/aom/av1/encoder/temporal_filter.h
new file mode 100644
index 0000000000..2ddc68b2c9
--- /dev/null
+++ b/third_party/aom/av1/encoder/temporal_filter.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+#define AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Forward declaration so this header can be included on its own.
+struct AV1_COMP;
+
+// Builds the temporally filtered alt-ref frame for the ARF located `distance`
+// frames ahead in the lookahead queue; the result is written to
+// cpi->alt_ref_buffer.
+void av1_temporal_filter(struct AV1_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TEMPORAL_FILTER_H_
diff --git a/third_party/aom/av1/encoder/tokenize.c b/third_party/aom/av1/encoder/tokenize.c
new file mode 100644
index 0000000000..16a6a9a35a
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "aom_mem/aom_mem.h"
+
+#include "av1/common/entropy.h"
+#include "av1/common/pred_common.h"
+#include "av1/common/scan.h"
+#include "av1/common/seg_common.h"
+
+#include "av1/encoder/cost.h"
+#include "av1/encoder/encoder.h"
+#include "av1/encoder/encodetxb.h"
+#include "av1/encoder/rdopt.h"
+#include "av1/encoder/tokenize.h"
+
+// Walks a palette color-index map in anti-diagonal (wavefront) order, skipping
+// the first entry, and either costs or tokenizes each index.
+//
+// calc_rate != 0: returns the summed cost of all visited indices; *t is not
+//                 touched.
+// calc_rate == 0: appends one TOKENEXTRA per visited index to *t (advancing
+//                 it), optionally adapts the CDFs when allow_update_cdf is
+//                 set, and returns 0.
+//
+// plane and counts are only used when CONFIG_ENTROPY_STATS is enabled.
+static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
+                                 int plane, int calc_rate, int allow_update_cdf,
+                                 FRAME_COUNTS *counts) {
+  const uint8_t *const map = param->color_map;
+  MapCdf cdf_table = param->map_cdf;
+  ColorCost cost_table = param->color_cost;
+  const int map_stride = param->plane_width;
+  const int num_rows = param->rows;
+  const int num_cols = param->cols;
+  const int num_colors = param->n_colors;
+  const int size_idx = num_colors - PALETTE_MIN_SIZE;
+  uint8_t color_order[PALETTE_MAX_SIZE];
+  int rate = 0;
+
+  (void)plane;
+  (void)counts;
+
+  // diag = row + col; diagonal 0 holds only the first entry, which the caller
+  // handles separately.
+  for (int diag = 1; diag < num_rows + num_cols - 1; ++diag) {
+    const int col_hi = AOMMIN(diag, num_cols - 1);
+    const int col_lo = AOMMAX(0, diag - num_rows + 1);
+
+    for (int c = col_hi; c >= col_lo; --c) {
+      const int r = diag - c;
+      int new_idx;
+      const int ctx = av1_get_palette_color_index_context(
+          map, map_stride, r, c, num_colors, color_order, &new_idx);
+      assert(new_idx >= 0 && new_idx < num_colors);
+
+      if (calc_rate) {
+        rate += (*cost_table)[size_idx][ctx][new_idx];
+        continue;
+      }
+
+      (*t)->token = new_idx;
+      (*t)->color_map_cdf = cdf_table[size_idx][ctx];
+      ++(*t);
+      if (allow_update_cdf)
+        update_cdf(cdf_table[size_idx][ctx], new_idx, num_colors);
+#if CONFIG_ENTROPY_STATS
+      if (plane) {
+        ++counts->palette_uv_color_index[size_idx][ctx][new_idx];
+      } else {
+        ++counts->palette_y_color_index[size_idx][ctx][new_idx];
+      }
+#endif
+    }
+  }
+
+  return calc_rate ? rate : 0;
+}
+
+// Fills `params` with the palette color-map description for `plane` of the
+// current block: the index map, the CDF and cost tables for luma (plane 0) or
+// chroma (any other plane), the palette size, and the map dimensions.
+static void get_palette_params(const MACROBLOCK *const x, int plane,
+                               BLOCK_SIZE bsize, Av1ColorMapParam *params) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const PALETTE_MODE_INFO *const palette_info = &xd->mi[0]->palette_mode_info;
+
+  params->color_map = xd->plane[plane].color_index_map;
+  if (plane) {
+    params->map_cdf = xd->tile_ctx->palette_uv_color_index_cdf;
+    params->color_cost = &x->palette_uv_color_cost;
+  } else {
+    params->map_cdf = xd->tile_ctx->palette_y_color_index_cdf;
+    params->color_cost = &x->palette_y_color_cost;
+  }
+  params->n_colors = palette_info->palette_size[plane];
+  av1_get_block_dimensions(bsize, plane, xd, &params->plane_width, NULL,
+                           &params->rows, &params->cols);
+}
+
+// Zeroes `params` and fills it for the requested color-map type.
+// Only PALETTE_MAP is handled; any other type trips an assert and leaves
+// `params` zeroed. tx_size is unused.
+static void get_color_map_params(const MACROBLOCK *const x, int plane,
+                                 BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                 COLOR_MAP_TYPE type,
+                                 Av1ColorMapParam *params) {
+  (void)tx_size;
+  memset(params, 0, sizeof(*params));
+
+  if (type == PALETTE_MAP) {
+    get_palette_params(x, plane, bsize, params);
+  } else {
+    assert(0 && "Invalid color map type");
+  }
+}
+
+// Returns the cost of coding the color-index map of `plane` (0 or 1) for the
+// current block. No tokens are emitted and no CDFs are updated.
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+                       TX_SIZE tx_size, COLOR_MAP_TYPE type) {
+  assert(plane == 0 || plane == 1);
+
+  Av1ColorMapParam map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &map_params);
+
+  // Rate-only pass: no token output, no CDF adaptation, no stats.
+  const int rate = cost_and_tokenize_map(&map_params, NULL, plane,
+                                         /*calc_rate=*/1,
+                                         /*allow_update_cdf=*/0, NULL);
+  return rate;
+}
+
+// Emits the tokens for the color-index map of `plane` (0 or 1) into *t,
+// advancing *t past the last token written.
+//
+// The first index is written as a token with a NULL CDF; the remaining
+// indices are produced by cost_and_tokenize_map() in tokenize mode.
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+                            TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+                            COLOR_MAP_TYPE type, int allow_update_cdf,
+                            FRAME_COUNTS *counts) {
+  assert(plane == 0 || plane == 1);
+
+  Av1ColorMapParam map_params;
+  get_color_map_params(x, plane, bsize, tx_size, type, &map_params);
+
+  // The first color index does not use context or entropy.
+  TOKENEXTRA *const first = *t;
+  first->token = map_params.color_map[0];
+  first->color_map_cdf = NULL;
+  *t = first + 1;
+
+  cost_and_tokenize_map(&map_params, t, plane, /*calc_rate=*/0,
+                        allow_update_cdf, counts);
+}
+
+// Recursively visits the transform blocks of one plane, starting from a block
+// of size tx_size at (blk_row, blk_col), and updates the txb context for each
+// leaf.
+//
+// A luma block is a leaf when tx_size equals the size recorded in
+// mbmi->inter_tx_size for its position; chroma blocks are always leaves.
+// Otherwise the block is split using sub_tx_size_map and each sub-block that
+// lies inside the visible area is visited in raster order.
+//
+// dry_run selects the leaf action: OUTPUT_ENABLED records the context,
+// DRY_RUN_NORMAL only updates it, anything else is unsupported.
+void tokenize_vartx(ThreadData *td, TOKENEXTRA **t, RUN_TYPE dry_run,
+                    TX_SIZE tx_size, BLOCK_SIZE plane_bsize, int blk_row,
+                    int blk_col, int block, int plane, void *arg) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = xd->mi[0];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int rows_limit = max_block_high(xd, plane_bsize, plane);
+  const int cols_limit = max_block_wide(xd, plane_bsize, plane);
+
+  if (blk_row >= rows_limit || blk_col >= cols_limit) return;
+
+  const TX_SIZE plane_tx_size =
+      plane ? av1_get_max_uv_txsize(mbmi->sb_type, pd->subsampling_x,
+                                    pd->subsampling_y)
+            : mbmi->inter_tx_size[av1_get_txb_size_index(plane_bsize, blk_row,
+                                                         blk_col)];
+
+  if (tx_size != plane_tx_size && !plane) {
+    // Half the block size in transform block unit.
+    const TX_SIZE sub_txs = sub_tx_size_map[tx_size];
+    const int sub_w = tx_size_wide_unit[sub_txs];
+    const int sub_h = tx_size_high_unit[sub_txs];
+    const int blocks_per_sub = sub_w * sub_h;
+
+    assert(sub_w > 0 && sub_h > 0);
+
+    for (int r = 0; r < tx_size_high_unit[tx_size]; r += sub_h) {
+      const int sub_row = blk_row + r;
+      for (int c = 0; c < tx_size_wide_unit[tx_size]; c += sub_w) {
+        const int sub_col = blk_col + c;
+
+        // Sub-blocks outside the visible area are skipped without advancing
+        // the block index.
+        if (sub_row >= rows_limit || sub_col >= cols_limit) continue;
+
+        tokenize_vartx(td, t, dry_run, sub_txs, plane_bsize, sub_row, sub_col,
+                       block, plane, arg);
+        block += blocks_per_sub;
+      }
+    }
+    return;
+  }
+
+  // Leaf transform block.
+  plane_bsize = get_plane_block_size(mbmi->sb_type, pd->subsampling_x,
+                                     pd->subsampling_y);
+  switch (dry_run) {
+    case OUTPUT_ENABLED:
+      av1_update_and_record_txb_context(plane, block, blk_row, blk_col,
+                                        plane_bsize, tx_size, arg);
+      break;
+    case DRY_RUN_NORMAL:
+      av1_update_txb_context_b(plane, block, blk_row, blk_col, plane_bsize,
+                               tx_size, arg);
+      break;
+    default:
+      printf("DRY_RUN_COSTCOEFFS is not supported yet\n");
+      assert(0);
+      break;
+  }
+}
+
+// Tokenizes all transform blocks of one inter block at (mi_row, mi_col).
+//
+// Blocks outside the frame are ignored. For skipped blocks only the skip
+// context is reset. Otherwise every plane that is a chroma reference for this
+// position is traversed in units of at most 64x64 luma pixels, and within
+// each unit in steps of the plane's maximum var-tx transform size.
+//
+// If `rate` is non-NULL, the rate accumulated in the tokenize_b_args passed
+// to the per-block callbacks is added to *rate.
+void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                           RUN_TYPE dry_run, int mi_row, int mi_col,
+                           BLOCK_SIZE bsize, int *rate,
+                           uint8_t allow_update_cdf) {
+  const AV1_COMMON *const cm = &cpi->common;
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
+
+  const int num_planes = av1_num_planes(cm);
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  const MB_MODE_INFO *const mbmi = xd->mi[0];
+
+  if (mbmi->skip) {
+    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
+    return;
+  }
+
+  struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf };
+
+  for (int plane = 0; plane < num_planes; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const int ss_x = pd->subsampling_x;
+    const int ss_y = pd->subsampling_y;
+
+    if (!is_chroma_reference(mi_row, mi_col, bsize, ss_x, ss_y)) continue;
+
+    const BLOCK_SIZE bsizec = scale_chroma_bsize(bsize, ss_x, ss_y);
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsizec, ss_x, ss_y);
+    const int plane_w = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
+    const int plane_h = block_size_high[plane_bsize] >> tx_size_high_log2[0];
+
+    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int txb_w = block_size_wide[txb_size] >> tx_size_wide_log2[0];
+    const int txb_h = block_size_high[txb_size] >> tx_size_high_log2[0];
+    const int blocks_per_txb =
+        tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];
+
+    // Processing unit: a 64x64 luma area mapped to this plane, clipped to the
+    // plane block.
+    const BLOCK_SIZE max_unit_bsize =
+        get_plane_block_size(BLOCK_64X64, ss_x, ss_y);
+    const int unit_w =
+        AOMMIN(plane_w, block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0]);
+    const int unit_h =
+        AOMMIN(plane_h, block_size_high[max_unit_bsize] >> tx_size_high_log2[0]);
+
+    int block = 0;
+    for (int unit_row = 0; unit_row < plane_h; unit_row += unit_h) {
+      const int row_end = AOMMIN(unit_h + unit_row, plane_h);
+      for (int unit_col = 0; unit_col < plane_w; unit_col += unit_w) {
+        const int col_end = AOMMIN(unit_w + unit_col, plane_w);
+        for (int blk_row = unit_row; blk_row < row_end; blk_row += txb_h) {
+          for (int blk_col = unit_col; blk_col < col_end; blk_col += txb_w) {
+            tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, blk_row,
+                           blk_col, block, plane, &arg);
+            block += blocks_per_txb;
+          }
+        }
+      }
+    }
+  }
+
+  if (rate) *rate += arg.this_rate;
+}
diff --git a/third_party/aom/av1/encoder/tokenize.h b/third_party/aom/av1/encoder/tokenize.h
new file mode 100644
index 0000000000..63b505f36f
--- /dev/null
+++ b/third_party/aom/av1/encoder/tokenize.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TOKENIZE_H_
+#define AOM_AV1_ENCODER_TOKENIZE_H_
+
+#include "av1/common/entropy.h"
+#include "av1/encoder/block.h"
+#include "aom_dsp/bitwriter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// One color-map token produced by the tokenizer.
+typedef struct {
+ // CDF associated with this token. av1_tokenize_color_map() stores NULL here
+ // for the first index of a map, which is coded without context.
+ aom_cdf_prob *color_map_cdf;
+ // TODO(yaowu): use packed enum type if appropriate
+ // Color index value.
+ uint8_t token;
+} TOKENEXTRA;
+
+struct AV1_COMP;
+struct ThreadData;
+struct FRAME_COUNTS;
+
+// Argument bundle handed (as void *) to the per-transform-block tokenize
+// callbacks.
+struct tokenize_b_args {
+ // Encoder instance.
+ const struct AV1_COMP *cpi;
+ // Per-thread encoder state.
+ struct ThreadData *td;
+ // Token output cursor.
+ TOKENEXTRA **tp;
+ // Rate accumulator; starts at 0 in av1_tokenize_sb_vartx() and is added to
+ // the caller's rate at the end of that function.
+ int this_rate;
+ // Copy of the caller's allow_update_cdf flag.
+ uint8_t allow_update_cdf;
+};
+
+// Selects what the tokenize pass does for each transform block.
+typedef enum {
+ // Real pass: tokenize_vartx() updates and records the txb context.
+ OUTPUT_ENABLED = 0,
+ // Dry run: tokenize_vartx() only updates the txb context.
+ DRY_RUN_NORMAL,
+ // Dry run with coefficient costing; tokenize_vartx() does not support it
+ // (it prints a message and asserts).
+ DRY_RUN_COSTCOEFFS,
+} RUN_TYPE;
+
+// Note: in all the tokenize functions, rate (if non-NULL) is incremented with
+// the coefficient token cost only if dry_run == DRY_RUN_COSTCOEFFS;
+// otherwise rate is not incremented.
+void av1_tokenize_sb_vartx(const struct AV1_COMP *cpi, struct ThreadData *td,
+ TOKENEXTRA **t, RUN_TYPE dry_run, int mi_row,
+ int mi_col, BLOCK_SIZE bsize, int *rate,
+ uint8_t allow_update_cdf);
+
+int av1_cost_color_map(const MACROBLOCK *const x, int plane, BLOCK_SIZE bsize,
+ TX_SIZE tx_size, COLOR_MAP_TYPE type);
+
+void av1_tokenize_color_map(const MACROBLOCK *const x, int plane,
+ TOKENEXTRA **t, BLOCK_SIZE bsize, TX_SIZE tx_size,
+ COLOR_MAP_TYPE type, int allow_update_cdf,
+ struct FRAME_COUNTS *counts);
+
+// Returns the maximum end-of-block position for tx_size, or 0 when the
+// SEG_LVL_SKIP feature is active for segment_id.
+static INLINE int av1_get_tx_eob(const struct segmentation *seg, int segment_id,
+                                 TX_SIZE tx_size) {
+  const int max_eob = av1_get_max_eob(tx_size);
+  if (segfeature_active(seg, segment_id, SEG_LVL_SKIP)) return 0;
+  return max_eob;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TOKENIZE_H_
diff --git a/third_party/aom/av1/encoder/tx_prune_model_weights.h b/third_party/aom/av1/encoder/tx_prune_model_weights.h
new file mode 100644
index 0000000000..405bc9e6eb
--- /dev/null
+++ b/third_party/aom/av1/encoder/tx_prune_model_weights.h
@@ -0,0 +1,1944 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+#define AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "av1/encoder/ml.h"
+
+// Tx type model for 4x4 block.
+static const float av1_tx_type_nn_weights_4x4_hor_layer0[32] = {
+ -1.64947f, -1.54497f, -1.62832f, -0.17774f, -2.89498f, -0.72498f, 0.72036f,
+ 0.17996f, 1.20000f, -0.27654f, 0.77396f, 1.21684f, -1.75909f, -0.51272f,
+ -1.25923f, 0.35005f, -0.04257f, -0.23389f, -0.41841f, -0.08229f, 0.09503f,
+ 2.73144f, -0.16875f, -0.23482f, 0.02194f, -0.26427f, 0.28049f, 0.21260f,
+ 1.35792f, 0.27733f, 0.88660f, -0.68304f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer0[8] = {
+ 1.38742f, 0.59540f, -1.37622f, 1.92114f,
+ 0.00000f, -0.38998f, -0.32726f, -0.15650f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_hor_layer1[32] = {
+ 1.65254f, 1.00915f, -0.89318f, -2.05142f, -0.23235f, 0.96781f, -0.37145f,
+ -0.21056f, 1.13891f, 0.38675f, 0.87739f, -1.42697f, 0.48015f, 0.61883f,
+ -0.03979f, 0.11487f, 0.48042f, 0.45200f, -0.23242f, 0.75166f, 0.55458f,
+ 0.39452f, -0.35285f, 1.59120f, -1.49221f, -0.48349f, -0.64692f, 1.49297f,
+ -0.26782f, -0.65416f, -0.10648f, 0.05568f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_hor_layer1[4] = {
+ 4.07177f,
+ 3.26961f,
+ 0.58083f,
+ 1.21199f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_hor_layer0, // [32] = 4 inputs * 8 hidden
+ av1_tx_type_nn_weights_4x4_hor_layer1 }, // [32] = 8 hidden * 4 outputs
+ { av1_tx_type_nn_bias_4x4_hor_layer0, av1_tx_type_nn_bias_4x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer0[32] = {
+ -0.02032f, 2.61610f, 0.02098f, -0.30217f, 0.12637f, 0.11017f, -3.01996f,
+ 0.35144f, 1.93776f, -0.20463f, 1.64102f, -1.41986f, -3.66717f, -0.51655f,
+ 0.43910f, 0.37778f, -1.02634f, 0.85337f, -0.69753f, 1.00206f, 2.11784f,
+ 1.89427f, 1.92919f, 0.43201f, -1.67358f, -1.67035f, -1.54623f, 0.16714f,
+ -0.06589f, -0.28142f, -0.33118f, 1.72227f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer0[8] = {
+ -0.33685f, 0.22025f, 0.28140f, 0.56138f,
+ 0.93489f, -1.77048f, 1.34989f, -0.93747f,
+};
+
+static const float av1_tx_type_nn_weights_4x4_ver_layer1[32] = {
+ -1.39506f, -1.06271f, -1.10886f, -1.69719f, 0.19699f, -2.39850f, -1.26457f,
+ 0.75328f, -1.26005f, -0.82738f, -0.12015f, -1.02702f, 1.40828f, -2.37739f,
+ -0.65639f, -0.71992f, -0.90453f, -1.12510f, -2.41362f, -1.16061f, -1.85577f,
+ -0.99165f, -1.91366f, 0.16785f, 0.34776f, 0.58154f, -0.18217f, -0.29257f,
+ -0.86315f, -0.53336f, 0.30320f, -1.32331f,
+};
+
+static const float av1_tx_type_nn_bias_4x4_ver_layer1[4] = {
+ -1.31519f,
+ -3.26321f,
+ 1.71794f,
+ -1.90778f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x4_ver_layer0, // [32] = 4 inputs * 8 hidden
+ av1_tx_type_nn_weights_4x4_ver_layer1 }, // [32] = 8 hidden * 4 outputs
+ { av1_tx_type_nn_bias_4x4_ver_layer0, av1_tx_type_nn_bias_4x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 4x8 block.
+static const float av1_tx_type_nn_weights_4x8_hor_layer0[32] = {
+ 0.00218f, -0.41880f, -0.61215f, -0.92588f, 0.54291f, -0.10898f, 0.70691f,
+ 0.46819f, -1.61598f, -0.08834f, -0.96839f, 1.18489f, -0.45171f, -0.65445f,
+ -0.32179f, -0.10399f, 1.04379f, 0.91895f, 0.85589f, 0.08267f, 1.35388f,
+ -2.03096f, 0.08168f, -0.06372f, -0.26732f, -0.48262f, -0.08682f, 2.44071f,
+ -1.35896f, -1.17121f, 1.68866f, 0.10357f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer0[8] = {
+ 2.93391f, 0.66831f, -0.21419f, 0.00000f,
+ -0.72878f, 0.15127f, -1.46755f, 0.16658f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_hor_layer1[32] = {
+ -1.52077f, -1.06243f, 0.35319f, -0.49207f, 0.54524f, 0.44271f, 1.37117f,
+ -0.38957f, -1.28889f, -0.57133f, 0.04658f, 0.62278f, 0.37984f, 0.33247f,
+ 1.65547f, -0.56806f, -1.38645f, -0.76258f, 0.67926f, 0.08783f, -0.01443f,
+ 0.34950f, 1.45812f, -0.51332f, -1.41331f, -0.16453f, 0.05755f, 0.31405f,
+ -0.50191f, 0.18219f, 1.83664f, -0.75276f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_hor_layer1[4] = {
+ -1.17455f,
+ -2.26089f,
+ -1.79863f,
+ -2.26333f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_hor_layer0, // [32] = 4 inputs * 8 hidden
+ av1_tx_type_nn_weights_4x8_hor_layer1 }, // [32] = 8 hidden * 4 outputs
+ { av1_tx_type_nn_bias_4x8_hor_layer0, av1_tx_type_nn_bias_4x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer0[128] = {
+ -0.00952f, -0.98858f, -0.93181f, 1.39594f, 0.96559f, 0.18162f, -0.76064f,
+ -0.06066f, 0.07907f, -0.09365f, -0.21313f, -0.02187f, -2.61707f, -2.68702f,
+ -0.10982f, 0.18559f, 1.17049f, 1.11387f, 1.12697f, 1.05804f, 1.12764f,
+ 1.06318f, 1.12052f, 0.17406f, 1.83157f, 0.19362f, 0.46910f, 0.39608f,
+ 0.33342f, 0.40083f, 0.27645f, 1.06864f, -4.06645f, -0.38775f, -0.11070f,
+ 0.03781f, -0.09141f, 0.06185f, -0.04852f, 0.20163f, 0.16784f, 0.16641f,
+ -0.50941f, -0.61087f, 2.07008f, -0.82381f, -0.85558f, 0.05528f, -0.10535f,
+ -2.81150f, 0.67038f, 0.43643f, 0.49062f, -0.04465f, 0.90438f, 0.00977f,
+ 0.46272f, 1.59751f, 0.95234f, 0.35086f, 0.85624f, 0.73149f, 1.67779f,
+ -2.21511f, -1.24746f, -1.09014f, -0.92441f, -1.22591f, -1.06961f, -0.95897f,
+ -1.24956f, 0.73797f, 1.23275f, -0.60064f, -0.07851f, 0.14397f, 0.22110f,
+ -0.04422f, 0.14350f, 0.75926f, 0.35032f, 0.48104f, 2.81408f, 0.34662f,
+ 0.42090f, 0.35521f, -1.36804f, -0.14974f, -0.47696f, -0.07892f, 0.36910f,
+ 0.32299f, 0.23916f, 0.06032f, -0.17844f, -0.17558f, -1.42746f, -0.55828f,
+ -1.00418f, -0.64823f, -0.73654f, -0.85197f, -1.50989f, 1.69385f, -0.04973f,
+ -0.09273f, 1.04249f, 0.79235f, 1.13229f, 0.99617f, 0.03851f, 0.56334f,
+ 0.90795f, 1.08296f, 0.58519f, 1.74765f, 0.63971f, 1.35951f, 0.07803f,
+ -0.05127f, 0.26514f, -0.84629f, -0.66343f, -2.10630f, 0.11017f, 2.18528f,
+ -0.21958f, 0.05970f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer0[16] = {
+ 0.04205f, 0.22260f, -1.03870f, -1.19568f, 0.44283f, 0.01143f,
+ 0.00235f, 4.26772f, 0.44364f, -0.33199f, -0.39076f, -0.35129f,
+ 0.08288f, 0.18195f, -0.79890f, 0.10047f,
+};
+
+static const float av1_tx_type_nn_weights_4x8_ver_layer1[64] = {
+ -0.38193f, -0.12095f, 1.57802f, 0.34932f, -0.47333f, -0.12304f, -0.01736f,
+ -2.52445f, 0.18983f, -0.64707f, -0.60889f, -0.53750f, 0.91666f, -0.62823f,
+ -0.13377f, -0.43594f, -0.38618f, -0.01328f, 0.97457f, 1.48589f, -1.03238f,
+ -0.33459f, -0.35108f, -2.42417f, 0.60229f, 0.06824f, -0.75495f, 0.26902f,
+ 0.65311f, -0.23887f, -0.44604f, -0.55800f, -0.33842f, 0.04259f, -0.59589f,
+ 0.49738f, -0.62301f, -0.30896f, -0.29602f, -2.57052f, 2.00943f, -0.66490f,
+ -0.76312f, 0.28256f, 1.06311f, -0.38364f, -0.63508f, -0.57609f, -0.88765f,
+ -1.04403f, -0.46531f, 0.34084f, -1.20498f, -0.68352f, -0.72251f, -2.63242f,
+ -0.68736f, -0.37904f, -1.32371f, 0.47288f, 1.51904f, 0.78372f, -1.01830f,
+ -1.01848f,
+};
+
+static const float av1_tx_type_nn_bias_4x8_ver_layer1[4] = {
+ -1.45955f,
+ -2.08949f,
+ -1.24813f,
+ -1.55368f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_4x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_4x8_ver_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_4x8_ver_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_4x8_ver_layer0, av1_tx_type_nn_bias_4x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x4 block.
+static const float av1_tx_type_nn_weights_8x4_hor_layer0[128] = {
+ -0.22492f, 0.13341f, -4.03243f, -0.64015f, 0.02783f, 0.60466f, -0.13335f,
+ 0.16828f, 0.12336f, 0.52904f, 1.18455f, -0.32425f, 0.13052f, 0.93810f,
+ -3.71165f, 0.02990f, -4.63558f, 0.05666f, 0.03524f, -0.07449f, -0.44006f,
+ -0.33215f, -0.33713f, 0.08097f, 0.60873f, 0.29582f, 0.21696f, -0.78729f,
+ -0.16757f, -0.26567f, -0.00720f, -1.11226f, 1.58189f, 1.58463f, 1.48536f,
+ 1.54374f, 1.60069f, 1.46125f, 1.53932f, 0.05974f, -1.82192f, 0.47043f,
+ 0.38090f, 0.20833f, -0.05637f, 0.05183f, 0.01323f, -0.25662f, 0.78634f,
+ -0.55069f, -0.02975f, -1.29294f, -0.77192f, -2.34299f, -1.28074f, 0.77894f,
+ -1.69740f, -1.66032f, -1.44323f, -1.55063f, -1.50845f, -1.23690f, -1.80663f,
+ 0.75079f, 2.32551f, 0.05878f, 0.80438f, 0.88584f, 0.69153f, 0.89060f,
+ 0.73660f, 0.87259f, -0.00745f, -1.30044f, -0.59430f, 2.07270f, 1.03307f,
+ -0.84697f, -1.19393f, 0.17549f, -0.24978f, -3.67234f, 0.20781f, -0.53946f,
+ -0.05068f, 0.88274f, 1.30371f, 0.10288f, 0.07585f, 0.12259f, -0.30815f,
+ 0.25437f, -2.82096f, -2.69482f, 0.02370f, 0.12500f, -0.21019f, -0.49220f,
+ 0.03638f, -0.29795f, 0.28645f, -0.48432f, -0.38584f, -0.32148f, -0.47197f,
+ 0.32437f, 0.32528f, -0.19437f, 0.30383f, -0.31879f, 0.26359f, -0.12164f,
+ -0.43647f, -0.08288f, -0.33438f, -0.63608f, -0.46647f, -0.46574f, 0.47806f,
+ -0.49012f, -1.51234f, -1.13502f, -1.20470f, -1.02913f, -1.09182f, -0.93921f,
+ -1.85523f, 0.92532f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer0[16] = {
+ 0.36631f, 0.02901f, 0.64305f, 1.53074f, -1.40229f, 0.03852f,
+ -0.05043f, 0.89632f, -1.23312f, 0.07036f, 0.17070f, 0.56250f,
+ -0.28958f, -0.32869f, -0.01704f, 0.68171f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_hor_layer1[64] = {
+ -0.49441f, -0.31960f, -0.84946f, -0.85800f, -2.37767f, 0.81373f, -0.73172f,
+ -0.69337f, 0.88807f, -0.49242f, -0.44717f, -0.11436f, 0.09978f, 0.15393f,
+ 0.17083f, 1.44850f, -0.20582f, -0.04906f, 0.42990f, -0.61939f, -1.09692f,
+ -1.14885f, -1.36879f, -1.30828f, -0.59558f, -0.30903f, -0.08906f, 0.06953f,
+ 0.15383f, -0.04193f, -0.54858f, 1.82676f, -0.22411f, 0.05264f, -0.45848f,
+ -0.72985f, 0.87553f, 0.04116f, -1.29774f, -2.63018f, 1.09089f, -0.36048f,
+ -0.16725f, 0.11627f, 0.49918f, 0.07539f, 0.00763f, 0.73706f, 0.87800f,
+ 0.57049f, 0.60969f, 1.02779f, 1.53339f, -0.35915f, 0.06410f, 1.44582f,
+ 0.09698f, 0.71888f, 0.60594f, 0.84103f, -0.50440f, -0.38825f, 0.15626f,
+ -1.10654f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_hor_layer1[4] = {
+ -0.92861f,
+ -1.45151f,
+ -1.33588f,
+ -4.33853f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_hor_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_8x4_hor_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x4_hor_layer0, av1_tx_type_nn_bias_8x4_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer0[32] = {
+ -1.10946f, 1.86574f, -1.59343f, 0.27018f, -1.70676f, -0.73982f, -0.19021f,
+ -1.94208f, -2.29759f, -1.44402f, 0.28700f, -1.18340f, -1.50158f, -0.44175f,
+ -1.36831f, 1.00374f, 2.59312f, 0.50291f, -0.71042f, -0.12238f, -0.15901f,
+ -0.22807f, -0.67376f, -0.30215f, 0.54407f, -0.45538f, 1.18262f, 2.28687f,
+ 1.66212f, 1.70826f, 1.55182f, 0.12230f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer0[8] = {
+ 0.10943f, 2.09789f, 2.16578f, 0.15766f,
+ -0.42461f, 0.00000f, 1.22090f, -1.28717f,
+};
+
+static const float av1_tx_type_nn_weights_8x4_ver_layer1[32] = {
+ 1.20426f, -1.23237f, 2.41053f, -0.72488f, 1.25249f, 0.18018f, -0.09586f,
+ 2.17901f, 0.15364f, 1.21535f, -0.38263f, -0.74309f, 0.50551f, -0.54208f,
+ 0.59139f, 1.16095f, 0.55919f, -0.60183f, 1.18949f, 1.60787f, 0.54002f,
+ -0.10712f, -0.16153f, 0.16207f, -0.32338f, 2.68712f, -2.83483f, -0.27086f,
+ -1.15005f, -0.39311f, 1.51236f, -1.68973f,
+};
+
+static const float av1_tx_type_nn_bias_8x4_ver_layer1[4] = {
+ 1.81013f,
+ 1.10517f,
+ 2.90059f,
+ 0.95391f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 8,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x4_ver_layer0, // [32] = 4 inputs * 8 hidden
+ av1_tx_type_nn_weights_8x4_ver_layer1 }, // [32] = 8 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x4_ver_layer0, av1_tx_type_nn_bias_8x4_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x8 block.
+static const float av1_tx_type_nn_weights_8x8_hor_layer0[128] = {
+ -0.85529f, 0.37619f, 0.12754f, 0.08622f, 0.45278f, 0.54929f, 1.60651f,
+ -0.62654f, -0.54929f, -0.10131f, -0.17569f, 0.13948f, 0.31695f, -0.05616f,
+ 0.20483f, -0.36448f, 2.27203f, -0.33087f, 0.47679f, 0.86888f, 0.39370f,
+ 0.46239f, 0.01113f, 1.50327f, -1.48226f, -1.69621f, -1.49777f, -1.38885f,
+ -1.37753f, -1.22681f, -1.70576f, 0.51329f, -1.65662f, 1.74197f, -0.13579f,
+ -0.13133f, -0.58396f, -0.55510f, -1.10709f, -2.34975f, 0.22445f, -0.56491f,
+ -0.83432f, 0.13492f, 1.32147f, 2.85285f, 0.13819f, 0.03792f, -1.30792f,
+ 0.04155f, -0.70644f, -0.43430f, -0.16212f, -0.86945f, -1.16976f, 1.68339f,
+ 0.29540f, 0.01137f, -0.25335f, -0.16856f, 0.12028f, 0.05207f, 0.39357f,
+ -0.01545f, -0.21980f, -1.94091f, -1.01315f, -0.68270f, -0.40590f, -0.67111f,
+ 2.08283f, 0.19291f, -4.81426f, -0.65044f, -0.24598f, 0.06371f, -0.10272f,
+ -0.14502f, -0.06821f, 0.45202f, 0.21091f, -0.80864f, 0.39255f, 1.79189f,
+ 1.80453f, 1.10484f, 1.17608f, 0.96901f, -0.35871f, -0.94311f, 0.63147f,
+ 2.95157f, 0.45917f, -0.42849f, -0.55643f, -0.06097f, 3.49299f, -0.50972f,
+ 0.11075f, -0.08405f, -0.09274f, -0.22694f, -0.42426f, 0.48632f, -1.61074f,
+ 1.82998f, 0.37623f, -1.20330f, -0.01142f, -1.33307f, -0.27492f, -2.23621f,
+ 1.38846f, 1.42085f, 1.42568f, 1.36152f, 1.46910f, 1.27473f, 1.34752f,
+ 0.12753f, -1.08197f, -1.08280f, -0.79489f, -1.12338f, -1.06795f, -0.87857f,
+ -0.99892f, 1.09823f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer0[16] = {
+ -0.49232f, -0.29685f, -1.44020f, 1.10940f, 1.16452f, -0.34862f,
+ -0.38761f, -0.36243f, 0.21776f, 0.28234f, 2.34269f, -0.04104f,
+ -0.26319f, 2.65579f, -1.30137f, -0.01487f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_hor_layer1[64] = {
+ -0.38058f, -0.41295f, -1.26884f, -0.75560f, -1.57450f, 0.56072f, -1.42322f,
+ -0.29106f, 0.07228f, 0.04391f, 1.61388f, -0.03055f, 0.81637f, 2.06045f,
+ 0.27119f, -0.48328f, -0.45528f, -0.60534f, -1.61209f, -0.78157f, -1.65034f,
+ 0.60958f, -1.30523f, 0.25143f, 0.11398f, 0.37860f, 1.54829f, 0.02309f,
+ 0.67288f, 2.11447f, 0.44845f, -0.70406f, -0.67897f, -0.38759f, -1.30383f,
+ -1.22646f, -1.54571f, 0.60552f, -1.52565f, 0.11469f, 0.17344f, 0.08622f,
+ 1.57906f, -0.00909f, 0.81634f, 2.04909f, 1.26466f, -1.45741f, -0.75229f,
+ 0.06200f, -1.05835f, -0.66257f, -1.73766f, 0.99923f, -1.87082f, 0.14580f,
+ 0.49525f, 0.46839f, 1.32203f, 0.33923f, 0.97001f, 2.38584f, 1.58811f,
+ 0.06161f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_hor_layer1[4] = {
+ 1.70385f,
+ 1.82373f,
+ 1.78496f,
+ 1.80826f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_hor_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_8x8_hor_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x8_hor_layer0, av1_tx_type_nn_bias_8x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer0[128] = {
+ -0.67016f, -1.72366f, -1.86576f, -1.50962f, -1.70419f, -1.73964f, -1.84615f,
+ 2.09681f, -0.05081f, -0.61030f, 2.02541f, 0.60222f, 0.99936f, 2.02114f,
+ -0.53893f, -0.23757f, 0.73566f, 0.25443f, 0.00132f, -0.74036f, -0.75351f,
+ -0.76964f, -1.71007f, -0.15770f, 1.60982f, 2.17638f, 0.90681f, 0.64973f,
+ 0.85914f, 0.58786f, -1.46228f, 0.05187f, 1.18804f, 0.30850f, 0.29512f,
+ 0.40526f, 0.37635f, 0.32311f, 0.37471f, 1.12346f, 3.41856f, -0.36653f,
+ 0.42537f, -0.19240f, 0.00155f, 0.30826f, -0.02116f, -0.53435f, -0.34829f,
+ -0.52466f, -0.11521f, -0.29163f, -2.05689f, -2.87372f, -0.62626f, 0.09585f,
+ -0.75257f, 0.10057f, 1.43474f, 0.89450f, 0.75900f, 1.11147f, 1.00558f,
+ 0.25886f, 2.22095f, -0.17926f, 0.57161f, 0.39546f, 0.47846f, 0.40452f,
+ 0.54298f, 0.45814f, -3.62788f, -3.02374f, 0.03716f, -0.13937f, -0.09415f,
+ -0.12463f, 0.05682f, 0.03672f, 1.20746f, 1.25003f, 1.27071f, 1.31883f,
+ 1.27473f, 1.34943f, 1.23158f, 0.09039f, 0.19388f, 0.63420f, 2.79612f,
+ 0.93803f, -0.11323f, -0.02027f, 0.41286f, -0.05979f, -3.80705f, -0.52451f,
+ -0.77098f, -0.68132f, -0.65559f, -0.60975f, -1.26165f, 0.25582f, 0.05346f,
+ 0.61403f, 0.32140f, -2.39831f, -1.42355f, 1.30541f, 1.02361f, 0.12930f,
+ -1.61469f, -0.77036f, -0.59144f, 1.27769f, 1.52068f, 0.82137f, 1.83159f,
+ -0.66626f, -0.69806f, -1.00564f, -0.85995f, -0.90889f, -0.84412f, -0.85712f,
+ -1.29848f, 0.39308f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer0[16] = {
+ -0.14868f, -0.48343f, 3.94416f, -0.78037f, -1.33789f, -0.60611f,
+ 0.51793f, 0.44030f, -0.71563f, 0.22561f, -1.19083f, -0.46149f,
+ 0.83015f, 0.06024f, 1.17180f, 0.65122f,
+};
+
+static const float av1_tx_type_nn_weights_8x8_ver_layer1[64] = {
+ -1.42711f, -0.21683f, 2.12061f, 0.20489f, -0.50228f, -0.24770f, 0.23391f,
+ 1.03470f, -0.44847f, -0.63225f, -0.21583f, -0.06467f, -0.21892f, -0.07786f,
+ 1.43322f, 0.00280f, -1.53057f, -0.18912f, 1.95333f, 0.31151f, -2.07601f,
+ 0.06776f, 0.25529f, 0.94800f, -1.11453f, -0.20594f, -0.13281f, 0.01485f,
+ 0.17650f, -0.07955f, 1.43734f, -0.23193f, -2.06463f, -0.21238f, 2.13707f,
+ 0.30351f, 0.27594f, -0.36245f, 0.19539f, 0.91045f, -0.24068f, -0.37616f,
+ 0.88792f, 0.02947f, -0.16903f, -0.04932f, 1.51293f, -0.95967f, -1.62903f,
+ 0.05326f, 2.30703f, 0.64445f, -1.09464f, -0.16623f, 1.00240f, 0.07548f,
+ -0.50406f, 0.63854f, 1.02340f, 0.49833f, 0.13671f, 0.26722f, 2.09516f,
+ -0.41305f,
+};
+
+static const float av1_tx_type_nn_bias_8x8_ver_layer1[4] = {
+ 2.14067f,
+ 2.76699f,
+ 2.04233f,
+ 1.34803f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x8_ver_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_8x8_ver_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x8_ver_layer0, av1_tx_type_nn_bias_8x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 8x16 block.
+static const float av1_tx_type_nn_weights_8x16_hor_layer0[128] = {
+ -1.61872f, -1.58520f, -1.41236f, -1.53255f, -1.59794f, -1.25769f, -1.90043f,
+ 0.73431f, 1.10135f, 0.47054f, 0.43230f, -0.43009f, -0.09135f, -0.07289f,
+ -0.38785f, 1.23775f, -0.35312f, 0.73789f, 0.88864f, 0.75957f, 0.62579f,
+ 0.46974f, 0.21851f, 1.63821f, -2.27289f, -0.68522f, -0.69814f, -0.84368f,
+ -0.91320f, -0.63055f, -1.03296f, 0.55778f, -0.00071f, 1.27539f, 1.60068f,
+ 1.40975f, 0.97372f, 0.92843f, 1.90853f, 0.12626f, 1.71953f, 1.41978f,
+ -0.12234f, -1.27058f, 0.76207f, 0.02495f, -0.67038f, -0.05255f, 1.72923f,
+ 1.47630f, 1.47058f, 1.47614f, 1.49354f, 1.66131f, 1.50801f, 0.17145f,
+ -2.30947f, -2.10850f, -1.25636f, -0.24900f, 0.72602f, 1.26572f, 0.97865f,
+ -0.65466f, 1.31129f, 0.26916f, 0.12139f, -0.12761f, -0.39143f, -0.28134f,
+ 0.06584f, 2.24418f, 0.22516f, 0.05011f, -0.01671f, -0.29476f, -0.40326f,
+ 0.21138f, -0.11573f, -0.31154f, -0.36828f, 0.03694f, -0.07172f, -0.63419f,
+ -3.14351f, -1.23125f, 0.65311f, -0.11406f, 1.97287f, -0.10422f, 0.83896f,
+ 0.85033f, 0.49724f, 0.80482f, 0.51454f, 1.06447f, 0.76693f, 0.72599f,
+ -0.78573f, -0.53950f, 0.40894f, 0.00086f, 0.10784f, -0.70498f, 1.16395f,
+ 1.14597f, 1.13496f, 1.12177f, 1.02100f, -1.37574f, -2.97144f, 0.33899f,
+ 0.42013f, 0.86327f, 2.31983f, 2.04008f, 0.95503f, 0.15081f, 0.11530f,
+ -0.02574f, -4.77119f, 0.13257f, -0.01704f, -0.23087f, -0.00825f, 0.07029f,
+ -0.28136f, 0.42556f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer0[16] = {
+ 0.93617f, -0.24000f, -1.26821f, 0.78780f, 0.13690f, -0.21948f,
+ -1.45162f, 0.44584f, -1.92582f, -0.23169f, 0.56004f, -1.19937f,
+ 1.81560f, -1.02643f, -0.81690f, 0.08302f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_hor_layer1[64] = {
+ 0.06696f, -0.11538f, -1.42029f, 0.32965f, 0.81046f, 0.01146f, 1.20945f,
+ -0.16899f, 0.53224f, -0.40232f, 0.01786f, -0.73242f, 1.29750f, 1.95185f,
+ 0.70143f, 1.43287f, 0.76220f, 0.79937f, -1.79011f, -1.15178f, 0.42526f,
+ -0.67519f, 0.77267f, -0.30697f, 2.46004f, -0.49828f, 0.02875f, 1.09972f,
+ 1.47662f, 0.61719f, 0.61417f, -0.12363f, 2.53048f, 0.00418f, -1.38964f,
+ 0.88117f, 0.39239f, -0.19347f, -2.58600f, -0.33715f, 1.09323f, -0.32127f,
+ 0.02456f, -0.19125f, 1.12728f, 0.66502f, 0.34296f, 1.14897f, 0.29967f,
+ 1.19209f, 0.22108f, -0.11975f, 1.49776f, -1.34624f, -2.58478f, -1.34632f,
+ 1.53207f, 0.45634f, -1.48476f, 0.17489f, 0.71790f, -2.12086f, -1.21778f,
+ -1.31243f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_hor_layer1[4] = {
+ 0.83359f,
+ 1.06875f,
+ 1.77645f,
+ 1.49570f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_hor_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_8x16_hor_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x16_hor_layer0, av1_tx_type_nn_bias_8x16_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer0[128] = {
+ 0.32858f, -1.28887f, 0.25632f, -0.05262f, 2.69203f, -0.07004f, 1.37337f,
+ -0.05725f, -0.05659f, 0.05592f, 0.01039f, -0.29343f, 1.58628f, -0.30003f,
+ -3.43118f, 0.00272f, 1.70928f, -0.76348f, 0.05889f, -0.03263f, -0.07724f,
+ 0.03523f, -0.19890f, 1.18005f, -0.03605f, -0.20530f, -4.00733f, 0.10210f,
+ -0.05368f, -0.17650f, -0.15317f, 0.06499f, 0.56705f, 1.04341f, 0.62890f,
+ 0.73451f, -0.22199f, 0.86659f, 0.78443f, -0.61664f, -0.50606f, 0.30247f,
+ 0.14455f, 0.39276f, 0.49203f, 0.65019f, 0.12269f, 1.64080f, 1.68289f,
+ 1.42694f, 1.60825f, 1.58501f, 1.47252f, 1.62589f, 1.48218f, 0.17726f,
+ -0.04884f, 0.35376f, -0.04796f, 0.32589f, 0.35087f, 0.35258f, -0.46103f,
+ -0.31176f, -0.05203f, 0.07247f, -0.26756f, 0.22019f, 0.03412f, 0.33773f,
+ 0.29811f, -0.11140f, 0.12831f, -0.44673f, -0.09858f, 0.07889f, 0.15137f,
+ 0.00347f, -0.23394f, 0.08886f, -0.31201f, -0.79912f, -0.51092f, 0.14123f,
+ -1.09599f, -4.26020f, -0.68675f, -0.02842f, -1.54538f, -1.28977f, -1.30558f,
+ -1.21074f, -1.37142f, -1.14743f, -1.85397f, 0.82985f, -0.30681f, 0.04494f,
+ -0.24023f, -4.18053f, -0.16096f, -0.55492f, -0.27882f, 0.05829f, -0.41224f,
+ -2.52088f, -0.56162f, -1.04547f, -1.70685f, -0.28842f, -1.43673f, -0.01468f,
+ -3.20585f, -0.69120f, -0.43931f, -0.46270f, -0.65885f, -0.55884f, -0.75138f,
+ 0.36381f, -5.70858f, -0.14548f, -0.15745f, -0.11812f, -0.07605f, -0.07693f,
+ -0.12236f, 0.16075f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer0[16] = {
+ -0.35385f, 0.30491f, -0.90011f, 0.42941f, 1.20928f, -0.88331f,
+ -1.48818f, -0.34785f, -0.32668f, -0.22695f, 0.89188f, 0.65521f,
+ 0.57598f, 0.99819f, 0.75175f, 0.17044f,
+};
+
+static const float av1_tx_type_nn_weights_8x16_ver_layer1[64] = {
+ -0.62913f, -0.34304f, 0.42963f, -0.17440f, -1.44092f, 0.69142f, -1.36067f,
+ 0.52211f, 0.44658f, -0.26501f, -0.41657f, 0.34428f, -0.34390f, -0.58567f,
+ -0.84097f, -1.96311f, -0.37215f, -0.22250f, -1.23811f, -0.07247f, -0.81731f,
+ 0.58755f, -1.30559f, 0.39551f, 0.41743f, -0.09940f, -0.33230f, 0.14458f,
+ -0.25139f, -0.54517f, 0.13469f, -0.38157f, -0.39109f, -0.18205f, 0.06834f,
+ -0.08395f, -0.92187f, 0.56724f, 1.44381f, 0.53226f, -0.22356f, 0.12285f,
+ -0.29418f, -1.86749f, -0.22372f, -0.60204f, -0.87746f, -1.16936f, 0.56884f,
+ 0.62641f, -0.11823f, 1.00395f, 1.64794f, -0.64535f, 2.29322f, -0.23397f,
+ 0.17251f, -0.35927f, 0.65631f, -0.26812f, 0.80128f, 0.85748f, 0.47404f,
+ 2.20547f,
+};
+
+static const float av1_tx_type_nn_bias_8x16_ver_layer1[4] = {
+ -0.44080f,
+ -1.67455f,
+ -1.46332f,
+ -6.13206f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_8x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_8x16_ver_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_8x16_ver_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_8x16_ver_layer0, av1_tx_type_nn_bias_8x16_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x8 block.
+static const float av1_tx_type_nn_weights_16x8_hor_layer0[128] = {
+ 0.02600f, 0.09786f, -1.05107f, -0.35594f, -0.15658f, 2.99828f, -0.07106f,
+ -0.10101f, -0.14412f, -0.83790f, -0.19434f, 2.28368f, 1.91727f, -0.00956f,
+ -0.90640f, 0.09174f, 1.58895f, 1.38945f, 1.49431f, 1.51381f, 1.44803f,
+ 1.53544f, 1.44694f, 0.17753f, 1.69735f, -0.78652f, 0.31092f, -0.23736f,
+ 0.02231f, -0.09884f, -0.00493f, 1.21189f, -1.94382f, -0.34629f, -0.58309f,
+ 0.72291f, -0.30056f, 0.90660f, -0.57495f, 3.07809f, 0.73644f, 1.43050f,
+ 1.34356f, -0.66554f, 0.50102f, -0.64305f, 0.42044f, -1.66165f, -0.05733f,
+ -2.51402f, -1.01067f, -0.33390f, -0.32986f, -0.92431f, 1.86281f, -0.07290f,
+ -0.26290f, -0.68941f, 1.81156f, 0.66125f, -2.09974f, 0.17032f, -0.67461f,
+ -0.00876f, -1.50154f, 1.17153f, 1.00377f, 0.33022f, 0.74689f, 0.42878f,
+ 0.61725f, -0.83967f, 0.09467f, -0.39892f, 0.33863f, 0.10656f, -0.09249f,
+ -0.39757f, 0.48481f, -0.35162f, 1.47014f, 1.67827f, -1.84051f, 0.16291f,
+ -0.50135f, -2.29911f, -0.42217f, -0.13358f, 1.45899f, -0.14743f, -0.02763f,
+ -0.28003f, -0.01364f, 0.21014f, -0.29026f, -0.20198f, 1.38782f, 0.56731f,
+ 0.27489f, 0.43227f, 0.41326f, 0.42721f, 0.87720f, -1.90067f, -5.04951f,
+ -0.17638f, -0.58119f, -0.08954f, -0.13692f, -0.12325f, -0.38548f, 0.66462f,
+ -1.42377f, -1.21917f, -1.38193f, -1.36539f, -1.39378f, -1.19629f, -1.59812f,
+ 0.28689f, 0.32394f, 0.52128f, 0.01013f, -0.28948f, -0.26293f, -0.44331f,
+ -0.36570f, -0.50757f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer0[16] = {
+ -0.08696f, -0.22110f, -1.43604f, -1.00451f, -1.51029f, 0.63736f,
+ 0.45260f, 0.16229f, 4.01393f, -0.21748f, 0.36411f, -0.08764f,
+ -0.12329f, 0.08986f, 1.08117f, -0.00220f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_hor_layer1[64] = {
+ 0.55824f, -0.14648f, 0.81947f, -0.45867f, -1.86078f, -0.17291f, 0.34849f,
+ 0.15153f, 1.75625f, -0.25760f, 0.72015f, -0.30059f, -0.57975f, 0.07609f,
+ -0.02036f, 0.07912f, 0.57080f, -0.13792f, 0.74184f, -0.87669f, -1.87572f,
+ -0.27270f, 0.39751f, 0.19652f, 2.03514f, -0.32944f, 0.76251f, 0.04399f,
+ -0.63175f, 0.37420f, 0.08309f, 0.04466f, 0.60255f, -0.12820f, 1.66065f,
+ -0.59496f, -1.94794f, -0.14847f, 0.39424f, 0.16273f, 1.80587f, 0.41197f,
+ 0.74691f, -0.21217f, -0.63173f, 0.09510f, -0.35538f, -0.04407f, 0.92847f,
+ 0.20141f, 1.68680f, -0.56528f, -2.26960f, 0.12978f, 0.73748f, 0.42438f,
+ 2.00673f, -0.40189f, 0.95423f, 0.23234f, -0.80953f, 0.65814f, 0.49444f,
+ -0.23347f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_hor_layer1[4] = {
+ 3.57175f,
+ 2.42612f,
+ 3.31259f,
+ 2.08287f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_hor_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_16x8_hor_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_16x8_hor_layer0, av1_tx_type_nn_bias_16x8_hor_layer1 }
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer0[128] = {
+ 0.46633f, 1.55328f, -0.11230f, -0.29571f, 0.18814f, -1.52430f, -2.34660f,
+ 0.08644f, -1.97718f, -1.29140f, -1.12262f, -1.12985f, -1.25911f, -0.96506f,
+ -1.57129f, 0.96021f, 1.34192f, 1.28623f, 1.21655f, 1.28758f, 1.25482f,
+ 1.30195f, 1.19190f, 0.09310f, 0.52072f, 0.91487f, 1.24100f, 1.61236f,
+ 1.72166f, 2.20750f, 1.62379f, -1.43936f, 0.50665f, 0.40213f, 0.66502f,
+ -1.66699f, -3.07618f, 0.05877f, 0.60987f, -0.09995f, -0.10916f, 0.48049f,
+ 0.23812f, 0.39847f, -0.21682f, -0.63455f, 0.33453f, -0.67939f, -4.14355f,
+ -0.62756f, -0.22502f, -0.17215f, 0.01062f, 0.27049f, -0.10748f, 0.30945f,
+ 2.72445f, -0.89181f, -0.06800f, 0.20595f, -0.73385f, 0.04071f, -1.30294f,
+ 1.83507f, 0.92570f, 0.69609f, 0.76285f, 0.69892f, 0.76409f, 0.63104f,
+ 0.73397f, 1.09575f, -0.20129f, -0.24022f, -0.24599f, -0.59107f, -0.88755f,
+ -0.68987f, -0.75495f, -1.31002f, -1.30237f, -0.94093f, -2.15678f, -1.49303f,
+ -1.17498f, -1.39952f, -0.91270f, -0.05587f, 1.02381f, -0.75580f, -0.65263f,
+ -0.78996f, -0.71075f, -0.71018f, -0.70350f, -1.26196f, 2.34208f, -0.53611f,
+ 0.19752f, -0.16842f, -0.24828f, 0.21857f, 0.08222f, -2.55894f, -1.75702f,
+ 0.11394f, 1.03083f, 0.79972f, -1.54112f, -1.82341f, -0.57597f, -0.02077f,
+ -0.39616f, -0.00995f, -0.12809f, 0.01188f, -0.25117f, 0.09202f, 0.09336f,
+ -0.05614f, -0.30039f, 0.25834f, 1.19944f, 1.22533f, 0.92330f, 0.75967f,
+ -0.81945f, -0.41647f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer0[16] = {
+ 0.17841f, 0.67315f, -1.24450f, 3.13859f, 0.16203f, -0.14992f,
+ 0.29553f, -1.15567f, -0.71421f, 1.15977f, 1.14585f, 3.02460f,
+ -0.04510f, 0.48000f, -0.09354f, -0.42422f,
+};
+
+static const float av1_tx_type_nn_weights_16x8_ver_layer1[64] = {
+ 0.29912f, -0.10009f, -1.11478f, 1.76812f, -0.27719f, 0.52148f, 0.17622f,
+ -1.17116f, 0.73397f, -0.69279f, -0.11080f, 1.53751f, -1.42003f, 0.14731f,
+ 0.13592f, -0.04883f, 0.39186f, -0.13655f, -0.43994f, 1.82759f, -0.25601f,
+ -0.15018f, 0.51920f, -1.56070f, 0.31683f, -0.79367f, -0.02904f, 1.28637f,
+ -1.15203f, 0.26627f, 0.42828f, -0.24258f, 0.38647f, -0.83352f, 0.32553f,
+ 2.09522f, -0.26822f, -0.42191f, 0.32825f, -1.30748f, 1.50551f, -0.52669f,
+ 0.20045f, 1.69318f, -1.47839f, 0.30802f, -0.07290f, -0.28106f, 0.68192f,
+ -0.15522f, 1.12579f, 2.21921f, 0.09720f, -0.50265f, 0.83165f, -1.31721f,
+ 0.72422f, -1.24952f, 0.61653f, 2.04117f, -1.42406f, 0.52568f, -0.46180f,
+ -0.00873f,
+};
+
+static const float av1_tx_type_nn_bias_16x8_ver_layer1[4] = {
+ 3.34981f,
+ 3.74710f,
+ 1.38339f,
+ 0.45176f,
+};
+
+static const NN_CONFIG av1_tx_type_nnconfig_16x8_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ { av1_tx_type_nn_weights_16x8_ver_layer0, // [128] = 8 inputs * 16 hidden
+ av1_tx_type_nn_weights_16x8_ver_layer1 }, // [64] = 16 hidden * 4 outputs
+ { av1_tx_type_nn_bias_16x8_ver_layer0, av1_tx_type_nn_bias_16x8_ver_layer1 }
+};
+/******************************************************************************/
+
+// Tx type model for 16x16 block.
+static const float av1_tx_type_nn_weights_16x16_layer0[128] = {
+ 1.26592f, 1.36313f, 1.30956f, 1.29926f, 1.48816f, 1.68851f, 1.32000f,
+ 0.13321f, -0.22477f, -0.88906f, -0.19622f, 1.69605f, 1.22180f, -1.57771f,
+ -1.15765f, 0.05710f, -1.13355f, -0.85486f, -0.99971f, -0.91571f, -1.06031f,
+ -0.77952f, -1.15723f, 1.17809f, 1.35602f, -0.05243f, -0.37596f, 0.26108f,
+ 0.17611f, -0.10323f, 0.77279f, -0.48911f, -0.79308f, 0.55112f, 0.43918f,
+ 0.27872f, 0.28714f, 0.45830f, 1.05689f, 0.03705f, -2.49975f, -0.01940f,
+ 0.05709f, 0.07942f, -0.13290f, -0.10359f, 0.00143f, 0.37303f, 0.96470f,
+ 0.53293f, 1.14459f, 0.89185f, 0.43378f, 0.47764f, 0.90924f, 0.15279f,
+ -0.15361f, 0.02949f, 0.42240f, 0.68143f, 0.89588f, 0.73754f, 0.10974f,
+ 1.57755f, -0.39870f, -0.32914f, 0.35638f, 0.34991f, -0.00003f, -0.23373f,
+ 0.29630f, -0.76699f, -0.01356f, 0.04234f, 0.84253f, 1.92078f, 0.93160f,
+ 0.71993f, 0.71604f, 0.76455f, -1.59782f, 0.32332f, 1.11628f, 0.33062f,
+ -0.03728f, -0.05710f, 0.80447f, -0.14719f, 1.34658f, -0.05718f, 0.64015f,
+ 0.21926f, 0.41653f, 0.12720f, 0.54092f, 1.39411f, 1.81819f, -0.24513f,
+ 0.00955f, 0.38011f, -0.57787f, -0.41759f, 0.68834f, -0.31783f, -0.40607f,
+ -0.10107f, -0.79374f, 0.75599f, -0.16282f, -0.14490f, -0.20783f, -0.55019f,
+ -0.13793f, -0.22293f, 0.18305f, 0.12445f, 0.56830f, 0.24567f, 0.09278f,
+ 0.70803f, 0.35803f, -1.52676f, -0.89624f, 0.77665f, 0.19877f, 0.77175f,
+ 0.50355f, 0.08592f,
+};
+
+// Layer-0 bias table referenced by av1_tx_type_nnconfig_16x16.
+// 16 values, matching the 16 hidden nodes declared in that config.
+static const float av1_tx_type_nn_bias_16x16_layer0[16] = {
+ -1.31834f, 0.14346f, -0.10062f, 0.84489f, 0.95617f, -0.06720f,
+ -0.68502f, -0.91442f, -0.31932f, 0.25276f, -0.15138f, -1.57661f,
+ -0.14062f, -0.42120f, 0.94573f, -0.09287f,
+};
+
+// Layer-1 weight table referenced by av1_tx_type_nnconfig_16x16.
+// 64 values = 16 hidden nodes x 4 outputs as declared in that config.
+static const float av1_tx_type_nn_weights_16x16_layer1[64] = {
+ -1.80333f, -1.06353f, 0.55139f, 0.74644f, 0.13747f, -0.93018f, -0.10286f,
+ 0.67133f, 0.24460f, 1.44583f, 0.02173f, 0.26037f, -0.73687f, 0.19566f,
+ 0.61846f, -0.58601f, -1.03196f, -0.74415f, 0.30041f, -0.41967f, 1.08740f,
+ 0.96224f, -0.59139f, 0.03813f, 0.05403f, 1.33427f, -0.54375f, -1.92181f,
+ 0.54704f, 0.13608f, 0.22151f, -0.38076f, 1.18390f, -0.77508f, -1.84283f,
+ 1.00894f, 0.62318f, -0.15296f, 1.27600f, 0.22822f, 0.12751f, 0.93910f,
+ -0.28502f, 0.53912f, -0.96889f, 0.10182f, 0.81508f, -0.43028f, 2.67386f,
+ 0.52204f, 0.49820f, -0.41711f, 1.05038f, 1.12192f, 0.74349f, -0.75417f,
+ -0.03718f, -0.35769f, 0.89651f, 0.63236f, 0.54215f, -0.07894f, 0.48274f,
+ 1.08829f,
+};
+
+// Layer-1 bias table referenced by av1_tx_type_nnconfig_16x16.
+// 4 values, matching the 4 outputs declared in that config.
+static const float av1_tx_type_nn_bias_16x16_layer1[4] = {
+ 0.81986f,
+ 1.26865f,
+ 0.11118f,
+ 2.48404f,
+};
+
+// Network description for the 16x16 tx-type model: 8 inputs, a single hidden
+// layer of 16 nodes, and 4 outputs. Both tx-type lookup maps (hor and ver)
+// point at this one config for the 16x16 slot.
+static const NN_CONFIG av1_tx_type_nnconfig_16x16 = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ { 16 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_type_nn_weights_16x16_layer0,
+ av1_tx_type_nn_weights_16x16_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_type_nn_bias_16x16_layer0,
+ av1_tx_type_nn_bias_16x16_layer1 },
+};
+/******************************************************************************/
+
+// Tx type model for 4x16 block.
+// Layer-0 weight table referenced by av1_tx_type_nnconfig_4x16_hor.
+// 32 values = 4 inputs x 8 hidden nodes as declared in that config.
+static const float av1_tx_type_nn_weights_4x16_hor_layer0[32] = {
+ 0.36539f, 0.25667f, 0.01491f, -0.21959f, 2.55105f, 0.17615f, 1.79884f,
+ 1.65936f, -0.44363f, 0.00706f, -0.68004f, -0.64360f, 1.75760f, 1.91906f,
+ 1.47682f, 0.09650f, -3.59244f, -0.35004f, 0.93295f, 0.25806f, -0.08154f,
+ 0.79332f, 0.79535f, 1.09467f, 1.57855f, -0.51359f, 0.90553f, -1.67744f,
+ -1.74563f, -0.88830f, -1.77603f, 2.15935f,
+};
+
+// Layer-0 bias table referenced by av1_tx_type_nnconfig_4x16_hor.
+// 8 values, matching the 8 hidden nodes declared in that config.
+static const float av1_tx_type_nn_bias_4x16_hor_layer0[8] = {
+ -0.36435f, -2.22731f, -0.00837f, -1.34546f,
+ 0.62806f, -0.20675f, 4.91940f, -0.56079f,
+};
+
+// Layer-1 weight table referenced by av1_tx_type_nnconfig_4x16_hor.
+// 32 values = 8 hidden nodes x 4 outputs as declared in that config.
+static const float av1_tx_type_nn_weights_4x16_hor_layer1[32] = {
+ -0.57191f, -1.46418f, 0.67331f, -1.15027f, 0.46288f, 0.81251f, 2.51768f,
+ -0.27147f, 0.00761f, -2.15214f, -0.69650f, -0.50808f, 0.92832f, 0.45668f,
+ 2.34201f, -0.52941f, 0.51008f, -1.55496f, -0.01371f, -0.12356f, 0.66624f,
+ 0.88043f, 2.64862f, -1.28024f, -0.17578f, -1.80034f, -0.32217f, 0.89519f,
+ 1.28413f, -0.30326f, 2.45329f, -0.83335f,
+};
+
+// Layer-1 bias table referenced by av1_tx_type_nnconfig_4x16_hor.
+// 4 values, matching the 4 outputs declared in that config.
+static const float av1_tx_type_nn_bias_4x16_hor_layer1[4] = {
+ 2.33198f,
+ 3.36245f,
+ 1.62603f,
+ 2.91056f,
+};
+
+// Network description for the 4x16 "hor" tx-type model: 4 inputs, a single
+// hidden layer of 8 nodes, and 4 outputs.
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_hor = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ { 8 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_type_nn_weights_4x16_hor_layer0,
+ av1_tx_type_nn_weights_4x16_hor_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_type_nn_bias_4x16_hor_layer0,
+ av1_tx_type_nn_bias_4x16_hor_layer1 },
+};
+
+// Layer-0 weight table referenced by av1_tx_type_nnconfig_4x16_ver.
+// 128 values = 8 inputs x 16 hidden nodes as declared in that config.
+static const float av1_tx_type_nn_weights_4x16_ver_layer0[128] = {
+ 1.61392f, 1.41239f, 1.47646f, 1.47325f, 1.46110f, 1.49208f, 1.49414f,
+ 0.12835f, -0.76986f, 0.07087f, -0.24572f, -0.93168f, 3.07935f, -0.18183f,
+ -0.09831f, -0.07703f, -0.03222f, -0.25473f, -0.06090f, 2.93713f, -0.38711f,
+ -0.12884f, -0.18329f, -0.06262f, -0.00327f, -0.02930f, -0.01641f, -0.00622f,
+ -0.03305f, -4.07069f, -2.76643f, 0.04413f, -1.03176f, -0.19217f, -0.44980f,
+ -2.48615f, -2.58112f, -0.87695f, 0.16187f, -0.04891f, -0.06854f, 1.08104f,
+ 0.75245f, 1.49302f, 0.63363f, 1.45715f, 0.92574f, 1.72029f, 0.33326f,
+ 3.86646f, 0.04422f, 0.41019f, 0.36212f, 0.56600f, -1.01552f, 0.05128f,
+ 0.40454f, -1.05100f, -0.47461f, -1.33168f, -0.46145f, -1.36870f, -0.88838f,
+ -1.05358f, -0.18537f, -0.34357f, -0.03698f, 0.68905f, 0.41010f, 0.31223f,
+ -0.43382f, -0.74715f, 2.03366f, -0.30419f, 0.45747f, 0.09526f, 0.31678f,
+ 0.22915f, 0.21832f, 1.26385f, -0.06814f, -0.71417f, -1.18947f, 0.03762f,
+ 0.10936f, 2.97396f, -0.42638f, -0.03123f, -5.49756f, -0.17029f, -0.11323f,
+ 0.05173f, -0.44274f, -0.15738f, 0.11311f, 0.43872f, 0.16837f, -0.52849f,
+ 2.90050f, -0.54735f, -0.29591f, 1.24030f, 0.21696f, -0.04443f, -1.60877f,
+ -1.36365f, -1.27432f, -1.52060f, -1.34397f, -1.13371f, -1.87554f, 0.80123f,
+ 0.42820f, -0.14157f, -2.73963f, -0.68040f, -0.35236f, 0.14490f, 2.23477f,
+ 0.01370f, -0.20426f, -1.51411f, -0.72293f, 0.64516f, 0.97638f, 0.32616f,
+ -0.27975f, -0.01149f,
+};
+
+// Layer-0 bias table referenced by av1_tx_type_nnconfig_4x16_ver.
+// 16 values, matching the 16 hidden nodes declared in that config.
+static const float av1_tx_type_nn_bias_4x16_ver_layer0[16] = {
+ -1.37863f, -0.05763f, -0.07041f, 0.15306f, 0.96026f, -1.42105f,
+ -0.55822f, 1.04845f, -0.17662f, -1.25345f, -0.11927f, 0.49845f,
+ -0.32530f, 0.73483f, 0.08322f, -0.23890f,
+};
+
+// Layer-1 weight table referenced by av1_tx_type_nnconfig_4x16_ver.
+// 64 values = 16 hidden nodes x 4 outputs as declared in that config.
+static const float av1_tx_type_nn_weights_4x16_ver_layer1[64] = {
+ 0.27194f, 0.50607f, 0.49229f, -0.48192f, 0.15667f, -1.38891f, 0.38102f,
+ -0.58825f, -0.07337f, -0.52909f, 0.36975f, 0.28710f, 0.34992f, -0.73630f,
+ 0.30386f, -0.58822f, 0.36127f, 0.57950f, 0.55878f, -0.42796f, 0.19967f,
+ -1.45517f, 0.42529f, -0.54630f, -0.38169f, -0.84899f, 0.41622f, 0.46935f,
+ 0.39077f, -0.75448f, 0.31698f, -0.76187f, 0.97765f, 0.57052f, 0.55825f,
+ -0.54273f, 0.20466f, -1.46347f, 0.41813f, -0.55019f, -0.19948f, -0.57982f,
+ 0.41206f, 0.32373f, 0.38537f, -1.11657f, 0.32887f, -0.76911f, 1.12259f,
+ 0.72163f, 0.82603f, 0.37786f, 0.34976f, -1.86642f, 0.59961f, -0.16329f,
+ -0.36631f, -0.56814f, 0.60410f, 0.53158f, 0.56389f, -0.70508f, 0.51009f,
+ -0.56513f,
+};
+
+// Layer-1 bias table referenced by av1_tx_type_nnconfig_4x16_ver.
+// 4 values, matching the 4 outputs declared in that config.
+static const float av1_tx_type_nn_bias_4x16_ver_layer1[4] = {
+ 4.60896f,
+ 4.53551f,
+ 4.53124f,
+ 4.27435f,
+};
+
+// Network description for the 4x16 "ver" tx-type model: 8 inputs, a single
+// hidden layer of 16 nodes, and 4 outputs.
+static const NN_CONFIG av1_tx_type_nnconfig_4x16_ver = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ { 16 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_type_nn_weights_4x16_ver_layer0,
+ av1_tx_type_nn_weights_4x16_ver_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_type_nn_bias_4x16_ver_layer0,
+ av1_tx_type_nn_bias_4x16_ver_layer1 },
+};
+/******************************************************************************/
+
+// Tx type model for 16x4 block.
+// Layer-0 weight table referenced by av1_tx_type_nnconfig_16x4_hor.
+// 128 values = 8 inputs x 16 hidden nodes as declared in that config.
+static const float av1_tx_type_nn_weights_16x4_hor_layer0[128] = {
+ 1.45347f, -0.15743f, 0.44236f, 0.25808f, 0.33944f, 0.38678f, 0.24428f,
+ 1.67287f, 0.09539f, -0.42940f, -0.31507f, -0.00154f, -2.98755f, -2.27744f,
+ -0.49183f, 0.09333f, -0.99026f, -0.22157f, 0.53701f, 0.60447f, 0.15686f,
+ -0.04646f, 0.26341f, 2.12361f, 0.27090f, -1.14716f, -0.64146f, -0.91604f,
+ -0.75335f, -0.60056f, -1.25084f, 1.68473f, -3.24075f, -4.03867f, -2.07877f,
+ -0.02347f, 0.00333f, -0.01259f, -0.00465f, 0.02526f, 0.36286f, -0.10324f,
+ 2.12780f, -0.74584f, -1.05052f, 1.78467f, -0.55065f, -0.03326f, 2.46781f,
+ 1.18349f, 0.96015f, 1.01696f, 1.10584f, 1.07263f, 1.11531f, -1.06413f,
+ 0.32389f, -1.87360f, -0.14435f, 1.77926f, 1.09966f, -0.12680f, -0.61386f,
+ -0.09724f, -0.33095f, 1.12122f, 1.00791f, 1.52416f, 1.35004f, 1.32657f,
+ 0.60950f, -1.13538f, -0.38654f, 0.06473f, 2.10669f, 0.27734f, -0.38359f,
+ -1.91455f, -1.22676f, 0.05786f, 0.97432f, 2.19967f, 0.50457f, 0.78976f,
+ 0.95183f, -0.32414f, 0.49437f, -0.04506f, 0.18993f, -0.07971f, 0.23889f,
+ -0.09872f, -0.66036f, 0.05377f, 2.69638f, -0.08259f, -0.69210f, -1.08296f,
+ -1.96504f, -2.31947f, -0.80161f, -0.80456f, -1.35556f, -0.05323f, -4.42658f,
+ -0.30732f, -0.12043f, 0.11126f, 0.10771f, -0.14956f, -0.02218f, 0.41016f,
+ 1.16599f, 1.14629f, 1.12881f, 1.18676f, 1.24677f, 1.28695f, 1.11270f,
+ 0.08233f, 1.75440f, 0.49228f, -0.34858f, -0.17032f, 0.29288f, 0.47175f,
+ 0.19055f, -1.56413f,
+};
+
+// Layer-0 bias table referenced by av1_tx_type_nnconfig_16x4_hor.
+// 16 values, matching the 16 hidden nodes declared in that config.
+static const float av1_tx_type_nn_bias_16x4_hor_layer0[16] = {
+ -1.71227f, 0.47291f, -0.97536f, -0.66216f, 0.11729f, -0.21451f,
+ 2.75281f, 0.04318f, 2.03965f, 0.14618f, -0.70483f, -0.24517f,
+ 1.14048f, 0.33308f, -1.10886f, 0.41184f,
+};
+
+// Layer-1 weight table referenced by av1_tx_type_nnconfig_16x4_hor.
+// 64 values = 16 hidden nodes x 4 outputs as declared in that config.
+static const float av1_tx_type_nn_weights_16x4_hor_layer1[64] = {
+ -1.17079f, 0.19096f, -1.05753f, -0.30803f, -1.21680f, -0.67255f, 1.60115f,
+ 0.05972f, 1.44759f, -0.04068f, -0.26331f, 0.31400f, 0.96923f, 0.33443f,
+ -0.77215f, -0.91316f, -1.78928f, 0.21483f, -1.24008f, -0.46190f, -0.12127f,
+ -0.62144f, 1.37593f, 0.08373f, 1.56215f, 0.00279f, -0.14556f, 0.38710f,
+ 0.96228f, 0.66433f, -0.51798f, -0.80738f, -0.18539f, 0.19377f, -1.03090f,
+ -1.51044f, -0.59485f, -0.62589f, 1.90742f, 0.09078f, 1.49113f, 0.00205f,
+ -0.15918f, 0.40827f, 1.08553f, 0.43431f, 0.33519f, -1.12669f, -1.10274f,
+ 0.80004f, -1.83599f, -0.53134f, 2.00515f, -0.32670f, 1.37124f, 0.51136f,
+ 1.62563f, 0.24787f, 0.31757f, 0.81751f, 1.57262f, 0.83214f, 1.04661f,
+ -0.43819f,
+};
+
+// Layer-1 bias table referenced by av1_tx_type_nnconfig_16x4_hor.
+// 4 values, matching the 4 outputs declared in that config.
+static const float av1_tx_type_nn_bias_16x4_hor_layer1[4] = {
+ 2.32575f,
+ 2.75703f,
+ 1.12304f,
+ 2.15567f,
+};
+
+// Network description for the 16x4 "hor" tx-type model: 8 inputs, a single
+// hidden layer of 16 nodes, and 4 outputs.
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_hor = {
+ 8, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ { 16 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_type_nn_weights_16x4_hor_layer0,
+ av1_tx_type_nn_weights_16x4_hor_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_type_nn_bias_16x4_hor_layer0,
+ av1_tx_type_nn_bias_16x4_hor_layer1 },
+};
+
+// Layer-0 weight table referenced by av1_tx_type_nnconfig_16x4_ver.
+// 32 values = 4 inputs x 8 hidden nodes as declared in that config.
+static const float av1_tx_type_nn_weights_16x4_ver_layer0[32] = {
+ 0.26047f, 0.99930f, 1.16484f, -0.28196f, -2.67483f, -0.21456f, -0.16854f,
+ 0.46375f, 1.47951f, 1.13735f, 1.12356f, 0.27385f, 0.50978f, 2.09967f,
+ -1.47386f, 0.01950f, -0.06362f, 0.26014f, 1.04544f, -0.03099f, 0.07478f,
+ -0.39701f, 0.05545f, 2.73633f, -0.56305f, -0.02208f, -0.44517f, -0.00897f,
+ -0.17967f, -0.96622f, 0.42635f, -1.04784f,
+};
+
+// Layer-0 bias table referenced by av1_tx_type_nnconfig_16x4_ver.
+// 8 values, matching the 8 hidden nodes declared in that config.
+static const float av1_tx_type_nn_bias_16x4_ver_layer0[8] = {
+ -0.52088f, 0.52844f, -1.03655f, -0.30974f,
+ 2.59952f, -1.93604f, 0.00000f, 2.51787f,
+};
+
+// Layer-1 weight table referenced by av1_tx_type_nnconfig_16x4_ver.
+// 32 values = 8 hidden nodes x 4 outputs as declared in that config.
+static const float av1_tx_type_nn_weights_16x4_ver_layer1[32] = {
+ 0.10916f, -0.21219f, -0.51340f, 0.69161f, 1.45988f, -1.36942f, -0.40899f,
+ 1.05136f, -0.08486f, 0.10008f, -0.55304f, 0.88012f, 1.61177f, -1.64507f,
+ 0.63428f, 1.15130f, -0.17287f, -0.18592f, -0.01143f, 0.88293f, 1.73326f,
+ -1.63624f, 0.09359f, 1.18393f, 0.26531f, 0.22378f, 0.15170f, 1.06965f,
+ 1.26814f, -1.93873f, -0.00768f, 1.58309f,
+};
+
+// Layer-1 bias table referenced by av1_tx_type_nnconfig_16x4_ver.
+// 4 values, matching the 4 outputs declared in that config.
+static const float av1_tx_type_nn_bias_16x4_ver_layer1[4] = {
+ 2.34713f,
+ 1.68667f,
+ 1.25488f,
+ 1.69812f,
+};
+
+// Network description for the 16x4 "ver" tx-type model: 4 inputs, a single
+// hidden layer of 8 nodes, and 4 outputs.
+static const NN_CONFIG av1_tx_type_nnconfig_16x4_ver = {
+ 4, // num_inputs
+ 4, // num_outputs
+ 1, // num_hidden_layers
+ { 8 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_type_nn_weights_16x4_ver_layer0,
+ av1_tx_type_nn_weights_16x4_ver_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_type_nn_bias_16x4_ver_layer0,
+ av1_tx_type_nn_bias_16x4_ver_layer1 },
+};
+/******************************************************************************/
+
+// Map tx_size to its corresponding neural net model for tx type prediction.
+// Table of "hor" tx-type models with 19 slots, one per transform size in the
+// order given by the per-entry comments. A NULL slot means this table has no
+// model for that size, so the entry must be checked before it is
+// dereferenced. The pointers themselves are const: this is a fixed lookup
+// table and must not be repointed at run time.
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_hor[] = {
+ &av1_tx_type_nnconfig_4x4_hor, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_hor, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_hor, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_hor, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_hor, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_hor, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_hor, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_hor, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+// Table of "ver" tx-type models with 19 slots, one per transform size in the
+// order given by the per-entry comments. A NULL slot means this table has no
+// model for that size, so the entry must be checked before it is
+// dereferenced. The pointers themselves are const: this is a fixed lookup
+// table and must not be repointed at run time.
+static const NN_CONFIG *const av1_tx_type_nnconfig_map_ver[] = {
+ &av1_tx_type_nnconfig_4x4_ver, // 4x4 transform
+ &av1_tx_type_nnconfig_8x8_ver, // 8x8 transform
+ &av1_tx_type_nnconfig_16x16, // 16x16 transform
+ NULL, // 32x32 transform
+ NULL, // 64x64 transform
+ &av1_tx_type_nnconfig_4x8_ver, // 4x8 transform
+ &av1_tx_type_nnconfig_8x4_ver, // 8x4 transform
+ &av1_tx_type_nnconfig_8x16_ver, // 8x16 transform
+ &av1_tx_type_nnconfig_16x8_ver, // 16x8 transform
+ NULL, // 16x32 transform
+ NULL, // 32x16 transform
+ NULL, // 32x64 transform
+ NULL, // 64x32 transform
+ &av1_tx_type_nnconfig_4x16_ver, // 4x16 transform
+ &av1_tx_type_nnconfig_16x4_ver, // 16x4 transform
+ NULL, // 8x32 transform
+ NULL, // 32x8 transform
+ NULL, // 16x64 transform
+ NULL, // 64x16 transform
+};
+
+// Tx split model for 4x8 block.
+// Layer-0 weight table referenced by av1_tx_split_nnconfig_4x8.
+// 8 * 16 = 128 values: 8 inputs x 16 hidden nodes as declared in that config.
+static const float av1_tx_split_nn_weights_4x8_layer0[8 * 16] = {
+ 0.068650f, -0.732073f, -0.040361f, 0.322550f, -0.021123f, 0.212518f,
+ -0.350546f, 0.435987f, -0.111756f, -0.401568f, 0.069548f, -0.313000f,
+ 0.073918f, -0.373805f, -0.775810f, -0.124753f, 0.181094f, -0.602641f,
+ -0.026219f, -0.350112f, 0.020599f, -0.311752f, -0.476482f, -0.669465f,
+ -0.310921f, 0.348869f, -0.115984f, 0.154250f, 0.200485f, -0.016689f,
+ 0.020392f, 0.413810f, 0.634064f, -0.627530f, 0.399178f, -0.012284f,
+ 0.472030f, 0.091087f, -0.706100f, -0.447944f, -0.274226f, 0.445656f,
+ 0.309339f, 0.505522f, 0.038496f, -0.152809f, 0.408684f, -0.068151f,
+ 0.271612f, 0.353233f, -0.150365f, 0.075212f, -0.035096f, 0.346615f,
+ 0.124382f, 0.477072f, 0.216288f, 0.070548f, -0.106362f, 0.681613f,
+ -0.145502f, -0.218631f, -0.099248f, -0.001983f, -0.196819f, -0.969045f,
+ 0.063009f, -0.123053f, 0.104875f, -0.137581f, -0.282933f, -0.003624f,
+ -0.315659f, -0.333523f, -0.503000f, -0.100063f, -0.536711f, -0.059978f,
+ -0.670248f, -0.353762f, 0.181109f, 0.289715f, -0.071206f, 0.261141f,
+ 0.052796f, -0.114554f, -0.139214f, -0.261380f, 0.075984f, -0.647925f,
+ -0.099528f, -0.677814f, 0.015712f, -0.389385f, -0.095622f, -0.165117f,
+ -0.109454f, -0.175240f, -0.393914f, 0.212330f, 0.037822f, 0.248280f,
+ 0.180197f, 0.110493f, -0.525727f, -0.092329f, -0.524029f, -0.407364f,
+ -0.542373f, -0.435626f, -0.912194f, 0.062794f, 0.160433f, 0.741485f,
+ -0.103659f, -0.119327f, -0.055275f, 0.334358f, 0.014713f, 0.046327f,
+ 0.831114f, -0.576682f, 0.354369f, -0.082088f, 0.452331f, 0.039730f,
+ -0.792429f, -0.385862f,
+};
+
+// Layer-0 bias table referenced by av1_tx_split_nnconfig_4x8.
+// 16 values, matching the 16 hidden nodes declared in that config.
+static const float av1_tx_split_nn_bias_4x8_layer0[16] = {
+ 0.238621f, 2.186830f, 1.383035f, -0.867139f, 1.257119f, -0.351571f,
+ -0.240650f, -0.971692f, 2.744843f, 1.116991f, 0.139062f, -0.165332f,
+ 0.262171f, -1.598153f, -1.427340f, -1.602306f,
+};
+
+// Layer-1 weight table referenced by av1_tx_split_nnconfig_4x8.
+// 16 values = 16 hidden nodes x 1 output as declared in that config.
+static const float av1_tx_split_nn_weights_4x8_layer1[16] = {
+ -0.367134f, 1.373058f, -0.897039f, -0.326819f, -0.734030f, -0.290413f,
+ -0.501249f, 0.505321f, -0.537692f, -0.767893f, 0.268697f, 0.278987f,
+ 0.085082f, 0.614986f, 0.847904f, 0.637578f,
+};
+
+// Layer-1 bias table referenced by av1_tx_split_nnconfig_4x8.
+// A single value, matching the single output declared in that config.
+static const float av1_tx_split_nn_bias_4x8_layer1[1] = {
+ 0.20586078f,
+};
+
+// Network description for the 4x8 tx-split model: 8 inputs, a single hidden
+// layer of 16 nodes, and 1 output.
+static const NN_CONFIG av1_tx_split_nnconfig_4x8 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ { 16 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_split_nn_weights_4x8_layer0,
+ av1_tx_split_nn_weights_4x8_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_split_nn_bias_4x8_layer0,
+ av1_tx_split_nn_bias_4x8_layer1 },
+};
+/******************************************************************************/
+
+// Tx split model for 8x8 block.
+// Layer-0 weight table referenced by av1_tx_split_nnconfig_8x8.
+// 144 values = 12 inputs x 12 hidden nodes as declared in that config.
+static const float av1_tx_split_nn_weights_8x8_layer0[144] = {
+ 0.177983f, -0.938386f, -0.074460f, -0.221843f, -0.073182f, -0.295155f,
+ -0.098202f, -0.279510f, 0.001054f, -0.119319f, -1.835282f, -0.581507f,
+ -1.222222f, -1.049006f, -0.807508f, -0.454252f, -0.774879f, -0.180607f,
+ -0.886976f, -0.231971f, -0.824677f, -0.351872f, -1.323819f, 0.235378f,
+ 0.015331f, -0.341818f, 0.145549f, -0.348362f, 0.147647f, -0.323400f,
+ 0.047558f, -0.553025f, -0.295485f, -0.330368f, -0.530605f, -0.407516f,
+ 0.447740f, 0.782381f, -0.179164f, -0.584675f, -0.052645f, 0.038656f,
+ -0.096783f, 0.038342f, -0.170762f, -0.405844f, -0.552665f, -0.509866f,
+ 0.757204f, -1.296465f, 0.631015f, 0.009265f, 0.646192f, 0.044523f,
+ 0.653161f, 0.033820f, 0.849639f, -0.068555f, -1.036085f, -0.511652f,
+ 0.104693f, -1.458690f, 0.286051f, -0.089800f, 0.381564f, -0.302640f,
+ 0.304465f, -0.268706f, 0.432603f, -0.117914f, -2.070031f, -0.565696f,
+ -0.073027f, -1.783570f, -0.318144f, -0.320990f, -0.343966f, -0.140996f,
+ -0.322977f, -0.232147f, -0.373210f, -0.158266f, -1.922305f, -0.634373f,
+ 0.101894f, -0.221847f, 0.018412f, -0.423887f, -0.266684f, -0.444930f,
+ -0.196237f, 0.106638f, -0.065834f, -0.538401f, -0.280772f, -0.620348f,
+ 1.089957f, -0.799928f, 0.504112f, -0.165763f, 0.578741f, -0.172653f,
+ 0.547316f, -0.143484f, 0.717220f, -0.297190f, -1.237854f, -0.074819f,
+ -0.977304f, -0.484092f, -0.646427f, -0.451443f, -0.612126f, -0.224475f,
+ -0.731608f, -0.257077f, -0.665857f, -0.346742f, -1.216372f, 0.227267f,
+ 0.231249f, -1.693073f, -0.035899f, 0.380845f, -0.058476f, 0.409405f,
+ -0.066679f, 0.406731f, -0.068501f, 0.396748f, 0.639462f, 0.150834f,
+ -0.418659f, -1.421931f, 0.101889f, 0.083573f, 0.129746f, 0.134460f,
+ 0.081185f, 0.127420f, 0.083664f, 0.051096f, 1.361688f, 0.386093f,
+};
+
+// Layer-0 bias table referenced by av1_tx_split_nnconfig_8x8.
+// 12 values, matching the 12 hidden nodes declared in that config.
+static const float av1_tx_split_nn_bias_8x8_layer0[12] = {
+ 4.280443f, 2.218902f, -0.256953f, 3.161431f, 2.082548f, 2.506052f,
+ 2.563224f, 1.421976f, -1.627813f, -1.436085f, 2.297265f, 1.500469f,
+};
+
+// Layer-1 weight table referenced by av1_tx_split_nnconfig_8x8.
+// 12 values = 12 hidden nodes x 1 output as declared in that config.
+static const float av1_tx_split_nn_weights_8x8_layer1[12] = {
+ 1.178833f, -0.428527f, -0.078737f, 0.381434f, -0.466895f, -0.901745f,
+ -0.766968f, -0.356663f, 0.450146f, 0.509370f, -0.356604f, -0.443506f,
+};
+
+// Layer-1 bias table referenced by av1_tx_split_nnconfig_8x8.
+// A single value, matching the single output declared in that config.
+static const float av1_tx_split_nn_bias_8x8_layer1[1] = {
+ -0.156294f,
+};
+
+// Network description for the 8x8 tx-split model: 12 inputs, a single hidden
+// layer of 12 nodes, and 1 output.
+static const NN_CONFIG av1_tx_split_nnconfig_8x8 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ { 12 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_split_nn_weights_8x8_layer0,
+ av1_tx_split_nn_weights_8x8_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_split_nn_bias_8x8_layer0,
+ av1_tx_split_nn_bias_8x8_layer1 },
+};
+/******************************************************************************/
+
+// Tx split model for 8x16 block.
+// Layer-0 weight table referenced by av1_tx_split_nnconfig_8x16.
+// 8 * 64 = 512 values: 8 inputs x 64 hidden nodes as declared in that config.
+static const float av1_tx_split_nn_weights_8x16_layer0[8 * 64] = {
+ 0.374660f, 0.218905f, -0.139779f, 0.212141f, 0.056517f, 0.051114f,
+ 0.042860f, -0.273258f, -0.340809f, 0.138983f, -0.216996f, -0.241519f,
+ -0.123244f, 0.078577f, -0.472273f, -0.194201f, 0.125056f, 0.239761f,
+ -0.332782f, 0.174782f, -0.211400f, -0.129795f, 0.062195f, 0.113176f,
+ -0.008869f, 0.140764f, 0.059833f, 0.163826f, 0.359293f, -0.109797f,
+ -0.022091f, -0.059536f, -0.188226f, 0.179709f, 0.031386f, 0.164790f,
+ 0.214364f, 0.198555f, 0.152262f, -0.242980f, 0.319367f, -0.136902f,
+ 0.046524f, -0.043591f, 0.342178f, -0.011757f, -0.014286f, 0.072871f,
+ -0.278314f, -0.345303f, -0.252103f, -0.107154f, -0.235101f, -0.106739f,
+ -0.120865f, -0.160042f, 0.240028f, 0.112902f, -0.141587f, -0.703012f,
+ -0.136591f, 0.318993f, -0.154417f, -0.054668f, 0.192870f, 0.176166f,
+ -0.029965f, 0.266942f, -0.178384f, 0.038680f, 0.134403f, -0.002426f,
+ 0.534825f, -0.070923f, 0.413281f, 0.418148f, 0.093729f, 0.016454f,
+ 0.305358f, -0.040512f, 0.069904f, -0.227588f, -0.362220f, -0.031604f,
+ -0.394901f, 0.071506f, -0.342833f, -0.142550f, -0.164005f, 0.182600f,
+ 0.213062f, 0.076805f, 0.278758f, 0.125613f, -0.035552f, 0.040971f,
+ 0.182785f, -0.227961f, -0.105413f, -0.074949f, -0.084629f, -0.254767f,
+ 0.114657f, 0.047121f, 0.195902f, 0.264759f, 0.017799f, 0.210230f,
+ 0.150749f, -0.142142f, 0.182494f, -0.142415f, -0.259782f, -0.114830f,
+ -0.198826f, 0.000061f, -0.375668f, -0.276656f, -0.373202f, 0.210298f,
+ 0.422680f, 0.066960f, 0.351106f, -0.209034f, 0.367195f, -0.110274f,
+ 0.115573f, -0.066642f, -0.389673f, -0.260447f, 0.056949f, -0.180425f,
+ 0.069922f, -0.153506f, -0.097053f, -0.111757f, 0.094069f, 0.144837f,
+ -0.052984f, -0.506681f, -0.034474f, 0.279057f, -0.105025f, 0.006656f,
+ -0.125017f, -0.114096f, 0.103153f, -0.117402f, -0.359472f, 0.072534f,
+ 0.110291f, 0.003088f, -0.456897f, 0.038331f, -0.322298f, 0.113942f,
+ -0.119916f, -0.194392f, 0.093167f, 0.193459f, 0.074671f, 0.033602f,
+ 0.004440f, -0.179578f, -0.036637f, -0.216172f, -0.296530f, -0.318992f,
+ 0.319160f, -0.066218f, 0.291246f, 0.181292f, 0.089914f, 0.025273f,
+ 0.303128f, 0.019063f, 0.078545f, -0.396919f, 0.014065f, -0.122121f,
+ 0.037107f, -0.151886f, -0.299392f, -0.172207f, -0.124571f, -0.232553f,
+ 0.102970f, -0.225040f, 0.061059f, -0.258188f, -0.469871f, -0.099607f,
+ -0.061524f, -0.213700f, 0.070237f, -0.289134f, -0.238225f, 0.256403f,
+ -0.119344f, 0.067782f, -0.398983f, -0.123975f, -0.200205f, -0.047038f,
+ 0.026569f, 0.031037f, 0.094302f, -0.101239f, 0.433307f, -0.303612f,
+ 0.088537f, -0.164436f, 0.202471f, -0.048592f, -0.251904f, 0.122577f,
+ -0.309874f, -0.263405f, -0.292503f, 0.216589f, 0.035378f, 0.136599f,
+ -0.145844f, -0.018211f, 0.174084f, -0.449941f, -0.001428f, 0.064134f,
+ 0.039652f, 0.111083f, -0.246076f, -0.204733f, 0.056559f, -0.000123f,
+ 0.104049f, 0.138512f, -0.128309f, 0.087855f, 0.232784f, 0.247138f,
+ 0.162766f, 0.154829f, 0.313605f, -0.164115f, -0.050844f, 0.156549f,
+ 0.185279f, -0.238962f, -0.308281f, -0.179592f, -0.193262f, 0.201670f,
+ -0.203399f, -0.096831f, -0.127867f, 0.310674f, -0.008181f, 0.004078f,
+ -0.211038f, -0.193480f, -0.185639f, -0.150202f, -0.204858f, -0.240758f,
+ 0.114268f, -0.032535f, -0.052403f, -0.234333f, -0.064072f, -0.208444f,
+ -0.352853f, -0.224001f, -0.156330f, 0.215436f, 0.171846f, 0.291849f,
+ 0.108832f, 0.046991f, -0.127801f, 0.032485f, 0.141493f, 0.123319f,
+ -0.057250f, 0.315346f, -0.061317f, -0.465086f, -0.130179f, -0.217841f,
+ -0.239089f, -0.073251f, -0.327718f, 0.054905f, -0.283169f, -0.028900f,
+ 0.071450f, 0.270072f, 0.248891f, 0.088052f, 0.253319f, 0.122808f,
+ 0.175490f, -0.147805f, 0.089169f, -0.045457f, -0.330788f, 0.099791f,
+ -0.137376f, -0.195977f, -0.350942f, -0.284930f, -0.559037f, 0.030504f,
+ 0.162554f, -0.199100f, -0.050453f, -0.131320f, -0.077863f, -0.066253f,
+ -0.379723f, -0.424047f, -0.081182f, -0.252261f, -0.102815f, 0.058240f,
+ -0.182036f, 0.176772f, -0.070823f, 0.216054f, -0.211533f, -0.232992f,
+ 0.279346f, 0.117984f, 0.236674f, 0.126625f, -0.046220f, 0.044919f,
+ 0.278492f, 0.083944f, 0.180512f, 0.217994f, 0.401170f, -0.064417f,
+ 0.011636f, -0.139597f, -0.050020f, -0.268438f, -0.032803f, 0.024908f,
+ -0.085713f, -0.012984f, -0.055192f, -0.338657f, 0.045826f, -0.312849f,
+ -0.023393f, -0.168800f, -0.030886f, -0.131816f, -0.253542f, -0.104812f,
+ -0.354389f, 0.169464f, 0.094151f, -0.217122f, -0.456397f, 0.211478f,
+ 0.219232f, -0.155519f, -0.353700f, -0.264759f, -0.034709f, 0.034409f,
+ -0.148639f, -0.132850f, -0.216791f, -0.118492f, 0.173721f, -0.144181f,
+ 0.335028f, 0.176439f, 0.105980f, 0.169390f, 0.155615f, -0.040618f,
+ -0.176029f, 0.155569f, -0.184833f, -0.171099f, -0.178663f, -0.032051f,
+ -0.434334f, 0.092238f, -0.263103f, 0.061804f, -0.172957f, 0.005962f,
+ -0.100176f, 0.125898f, 0.048092f, -0.088141f, 0.247196f, -0.221601f,
+ -0.114474f, -0.124410f, -0.156393f, -0.181782f, -0.083562f, 0.034937f,
+ 0.403401f, -0.046200f, 0.322259f, 0.219678f, 0.109850f, 0.051837f,
+ 0.196861f, -0.019118f, 0.248818f, -0.137567f, 0.127862f, 0.052293f,
+ 0.298726f, 0.275788f, 0.015344f, 0.058714f, 0.283691f, -0.053794f,
+ -0.123270f, -0.227761f, -0.141744f, -0.268515f, -0.007189f, -0.242117f,
+ -0.252396f, -0.069017f, 0.034803f, -0.003388f, -0.262577f, 0.062115f,
+ -0.298393f, 0.215415f, -0.153615f, 0.289902f, 0.085886f, -0.504290f,
+ 0.077178f, 0.150861f, -0.228848f, -0.261020f, 0.198204f, 0.162113f,
+ 0.346418f, -0.286950f, 0.354756f, -0.226419f, 0.024720f, 0.208037f,
+ 0.107286f, -0.110849f, 0.104415f, -0.207725f, 0.063932f, -0.037748f,
+ -0.167037f, -0.068282f, 0.320815f, -0.051884f, 0.099989f, -0.078388f,
+ 0.127071f, 0.046675f, -0.336571f, -0.273080f, 0.264694f, -0.007352f,
+ -0.093828f, 0.094773f, -0.144434f, 0.091795f, -0.031615f, 0.056914f,
+ 0.064673f, -0.136669f, 0.344734f, 0.225926f, 0.283451f, -0.068354f,
+ 0.030572f, 0.180784f, -0.378047f, -0.092962f, -0.083291f, 0.038970f,
+ 0.052094f, -0.017932f, 0.216302f, -0.184396f, 0.079888f, 0.210406f,
+ -0.020627f, 0.244744f, 0.336972f, -0.182914f, -0.220976f, -0.304225f,
+ -0.330974f, -0.370868f, -0.084935f, -0.136489f, -0.210082f, -0.188088f,
+ -0.408768f, 0.184693f,
+};
+
+// Layer-0 bias table referenced by av1_tx_split_nnconfig_8x16.
+// 64 values, matching the 64 hidden nodes declared in that config.
+static const float av1_tx_split_nn_bias_8x16_layer0[64] = {
+ -0.274107f, 0.445751f, 0.234359f, 0.291593f, 0.163298f, 0.183707f,
+ -0.548839f, -0.190779f, -0.163346f, -0.669028f, 0.399209f, -0.354974f,
+ 0.000000f, -0.254630f, 0.220149f, 0.371104f, 0.789759f, 0.270300f,
+ 0.195126f, -0.206958f, 0.917708f, -0.256232f, 1.131933f, 1.178944f,
+ 0.461270f, 0.246169f, -0.818614f, -0.111986f, 0.759355f, 0.154889f,
+ 0.470299f, -1.025250f, 0.678678f, 0.959346f, -0.164105f, 0.544079f,
+ -0.448733f, 0.649221f, -0.536672f, 0.962758f, -0.256427f, 0.808664f,
+ -0.118694f, 0.684873f, -0.015635f, -0.046469f, 0.075481f, 0.412647f,
+ 0.454456f, -0.107169f, 0.775235f, -0.261629f, -1.194849f, 0.010093f,
+ -0.231289f, 0.658286f, -0.769320f, 0.564545f, 0.482962f, -0.131378f,
+ -0.255844f, -0.078400f, 0.476752f, 0.643001f,
+};
+
+// Layer-1 weight table referenced by av1_tx_split_nnconfig_8x16.
+// 64 values = 64 hidden nodes x 1 output as declared in that config.
+static const float av1_tx_split_nn_weights_8x16_layer1[64] = {
+ -0.145065f, -0.145101f, 0.174786f, 0.196692f, 0.102025f, -0.087735f,
+ 0.386353f, -0.660539f, -0.183940f, 0.490045f, -0.276404f, -0.145669f,
+ 0.209846f, -0.085574f, -0.156821f, -0.377450f, -0.950010f, 0.450709f,
+ -0.108545f, -0.261181f, 1.435606f, -0.176621f, -1.158548f, 2.035680f,
+ 0.218069f, -0.138629f, 0.305958f, -0.277194f, -0.602468f, 0.203873f,
+ 0.120720f, 0.216095f, -0.434502f, -0.579746f, -0.239450f, 0.755529f,
+ 0.545643f, 0.232091f, 0.330169f, 0.988136f, -0.070465f, -0.345584f,
+ -0.162455f, -0.617064f, 0.123881f, -0.201098f, 0.222756f, 0.112932f,
+ 0.048647f, -0.147890f, 0.394584f, -0.262148f, 0.280564f, -0.195432f,
+ -0.047515f, 1.133410f, 0.255415f, -0.299032f, -0.397807f, -0.153246f,
+ -0.256734f, 0.177370f, 0.213522f, -0.530158f,
+};
+
+// Layer-1 bias table referenced by av1_tx_split_nnconfig_8x16.
+// A single value, matching the single output declared in that config.
+static const float av1_tx_split_nn_bias_8x16_layer1[1] = {
+ 0.14910713f,
+};
+
+// Network description for the 8x16 tx-split model: 8 inputs, a single hidden
+// layer of 64 nodes, and 1 output.
+static const NN_CONFIG av1_tx_split_nnconfig_8x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ { 64 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_split_nn_weights_8x16_layer0,
+ av1_tx_split_nn_weights_8x16_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_split_nn_bias_8x16_layer0,
+ av1_tx_split_nn_bias_8x16_layer1 },
+};
+/******************************************************************************/
+
+// Tx split model for 16x16 block.
+// Layer-0 weight table referenced by av1_tx_split_nnconfig_16x16.
+// 12 * 24 = 288 values: 12 inputs x 24 hidden nodes as declared in that config.
+static const float av1_tx_split_nn_weights_16x16_layer0[12 * 24] = {
+ -0.177215f, -0.297166f, 0.299924f, 0.207878f, 0.216871f, 0.173264f,
+ 0.295464f, 0.048395f, 0.154731f, 0.305880f, 0.056787f, -0.166617f,
+ 0.115653f, -0.529477f, -0.073995f, -0.211746f, -0.018169f, 0.000788f,
+ -0.024940f, -0.007055f, 0.001392f, 0.021678f, -1.594600f, -0.099593f,
+ 0.332930f, 0.103574f, 0.158249f, 0.182601f, 0.332665f, 0.226207f,
+ -0.139566f, 0.185531f, 0.099074f, -0.185654f, -0.203121f, -0.285678f,
+ -0.313453f, -0.294452f, -0.143707f, -0.031265f, -0.453030f, -0.061874f,
+ -0.066150f, -0.099058f, -0.458879f, 0.127544f, 0.338314f, -0.161350f,
+ 0.030091f, -0.075528f, 0.004320f, 0.353690f, -0.013480f, -0.420402f,
+ -0.004659f, -0.329401f, -0.001745f, 0.227384f, -0.055183f, 0.121405f,
+ 0.160340f, 0.143603f, -0.221813f, 0.079107f, -0.657639f, -0.084348f,
+ -0.303414f, 0.046774f, -0.367679f, 0.060005f, 0.168645f, 0.084421f,
+ -0.133625f, 0.301375f, 0.079412f, -0.419303f, 0.017235f, 0.068637f,
+ 0.018384f, -0.428325f, -0.019753f, 0.149444f, -0.474836f, -0.287162f,
+ 0.198083f, 0.028292f, -0.299092f, -0.005849f, -0.256245f, 0.233277f,
+ -0.217561f, -0.264003f, 0.269411f, 0.207032f, -0.339411f, -0.198431f,
+ -0.028521f, 0.158076f, 0.177116f, 0.345702f, -0.145132f, 0.064623f,
+ -0.090867f, 0.288816f, -0.263198f, -0.071028f, -0.044546f, 0.380017f,
+ -0.014100f, -0.271192f, -0.318559f, 0.129015f, -0.050314f, -0.093355f,
+ -0.578498f, 0.099090f, -0.133080f, -0.029975f, -0.059828f, -0.157765f,
+ -0.321153f, -0.343671f, -0.242959f, 0.128304f, 0.017170f, 0.072787f,
+ -0.475838f, -0.003806f, -0.068615f, 0.150556f, -0.159903f, -0.416513f,
+ 0.218794f, -0.290456f, -0.084569f, -0.170014f, -0.044414f, -0.153069f,
+ -0.077329f, -0.089747f, -0.096526f, 0.537952f, 0.134725f, -0.006469f,
+ -0.323335f, -0.168183f, -0.107163f, -0.139954f, 0.011286f, -0.021712f,
+ -0.513992f, 0.259135f, -0.319808f, 0.077811f, 0.104613f, 0.370571f,
+ 0.185244f, 0.065530f, -0.091098f, -0.573741f, 0.111934f, 0.437417f,
+ -0.123691f, 0.220641f, -0.024783f, -0.149460f, -0.354185f, -0.134127f,
+ 0.038015f, -0.380596f, 0.250980f, 0.142208f, 0.135170f, -0.131129f,
+ -0.357556f, -0.530945f, 0.159672f, -0.147025f, -0.377829f, -0.504508f,
+ -0.492870f, 0.020753f, 0.142818f, 0.025172f, 0.086140f, 0.091283f,
+ 0.087491f, -0.186415f, 0.177785f, -0.195121f, -1.191148f, -0.477102f,
+ 0.023371f, 0.227004f, -0.023502f, -0.242913f, -0.074398f, -0.153480f,
+ 0.162900f, 0.415509f, -0.162565f, -0.131709f, -0.258852f, -0.252027f,
+ -0.080845f, -0.330274f, 0.021874f, 0.232398f, 0.069277f, 0.220567f,
+ -0.024237f, -0.366771f, 0.081673f, -0.429906f, -0.302170f, 0.061045f,
+ 0.352777f, -0.230376f, 0.408153f, 0.064758f, 0.142051f, 0.007219f,
+ 0.622878f, 0.212577f, 0.036489f, 0.081150f, -0.284767f, 0.107763f,
+ -0.529786f, -0.072190f, -0.300421f, -0.287959f, -0.568900f, 0.011547f,
+ -0.131696f, -0.356854f, -0.587962f, -0.026598f, 0.405829f, 0.057565f,
+ 0.414265f, -0.159155f, 0.221456f, 0.146314f, 0.265776f, -0.006516f,
+ 0.473978f, -0.186431f, 0.288672f, -0.060437f, 0.083380f, -0.205641f,
+ 0.360016f, 0.222041f, 0.420011f, 0.024579f, 0.377546f, 0.250380f,
+ -0.069900f, 0.296743f, 0.073532f, -0.243225f, -0.374987f, -0.387288f,
+ -0.237255f, -0.287013f, 0.417831f, -0.252988f, -0.257652f, -0.066775f,
+ -0.253926f, 0.057841f, 0.346133f, -0.157797f, -0.406028f, -0.286893f,
+ 0.274507f, -0.452561f, 0.143381f, -0.097755f, 0.021242f, 0.034561f,
+ 0.044115f, 0.004065f, 0.066729f, 0.043558f, 0.102991f, -0.477574f,
+};
+
+// Layer-0 bias table referenced by av1_tx_split_nnconfig_16x16.
+// 24 values, matching the 24 hidden nodes declared in that config.
+static const float av1_tx_split_nn_bias_16x16_layer0[24] = {
+ -0.479033f, 1.467402f, -0.366291f, 0.372511f, 0.715322f, -0.605500f,
+ 0.176848f, 0.032318f, 0.237429f, -0.046047f, 0.452082f, 0.451805f,
+ -0.822845f, 0.636762f, -0.057350f, 1.163978f, 0.728287f, 0.603654f,
+ -0.245519f, -0.893569f, -1.428185f, 0.808870f, -0.076159f, 1.231976f,
+};
+
+// Layer-1 weight table referenced by av1_tx_split_nnconfig_16x16.
+// 24 values = 24 hidden nodes x 1 output as declared in that config.
+static const float av1_tx_split_nn_weights_16x16_layer1[24] = {
+ -0.176161f, 1.670188f, -0.180755f, -0.321326f, 0.249728f, -0.170504f,
+ -0.538432f, 0.033893f, 0.149842f, 0.404140f, -0.377812f, 0.338838f,
+ -0.176091f, 0.249844f, -0.362533f, 1.412460f, 0.196862f, 0.278194f,
+ -0.140444f, 0.297746f, 0.172533f, 0.116470f, -0.151656f, -0.603250f,
+};
+
+// Layer-1 bias table referenced by av1_tx_split_nnconfig_16x16.
+// A single value, matching the single output declared in that config.
+static const float av1_tx_split_nn_bias_16x16_layer1[1] = {
+ 0.184803f,
+};
+
+// Network description for the 16x16 tx-split model: 12 inputs, a single
+// hidden layer of 24 nodes, and 1 output.
+static const NN_CONFIG av1_tx_split_nnconfig_16x16 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ { 24 }, // num_hidden_nodes
+ // Weight tables, layer 0 first.
+ { av1_tx_split_nn_weights_16x16_layer0,
+ av1_tx_split_nn_weights_16x16_layer1 },
+ // Bias tables, layer 0 first.
+ { av1_tx_split_nn_bias_16x16_layer0,
+ av1_tx_split_nn_bias_16x16_layer1 },
+};
+/******************************************************************************/
+
+// Tx split model for 32x32 block.
+static const float av1_tx_split_nn_weights_32x32_layer0[12 * 32] = {
+ -0.439303f, 0.004813f, -0.365052f, -0.116868f, -0.356716f, -0.196537f,
+ -0.196770f, -0.076096f, 0.357004f, -0.044909f, -0.112910f, -0.129081f,
+ 0.156725f, -0.386346f, 0.038971f, 0.160696f, 0.204923f, -0.384333f,
+ -0.319546f, 0.028179f, -0.250524f, -0.289669f, -0.284138f, -0.258963f,
+ -0.180854f, -0.000807f, -0.029620f, -0.353134f, 0.212408f, 0.141414f,
+ 0.303016f, 0.098066f, 0.482455f, 0.036069f, -0.166279f, 0.210119f,
+ -0.086337f, -0.023550f, -0.250796f, -0.183945f, -0.393856f, 0.170608f,
+ -0.306403f, 0.026318f, -0.277296f, 0.092684f, -0.033584f, -0.018371f,
+ -0.025043f, -0.257659f, -0.139163f, -0.206949f, -0.190105f, 0.028053f,
+ 0.361851f, -0.364726f, -0.096771f, -0.184166f, -0.433228f, -0.182191f,
+ -0.097051f, 0.259172f, 0.016432f, 0.259358f, 0.145059f, 0.037196f,
+ 0.091581f, -0.219644f, 0.140384f, -0.446837f, -0.234531f, 0.149508f,
+ -0.083429f, 0.186189f, -0.099890f, -0.111277f, 0.495214f, 0.085053f,
+ -0.266613f, -0.051366f, 0.148593f, 0.111875f, 0.077787f, -0.371653f,
+ -0.146157f, -0.229235f, 0.076203f, 0.488975f, 0.096771f, -0.009483f,
+ 0.192985f, 0.246273f, -0.192671f, -0.557890f, -0.292650f, -0.088907f,
+ -0.106892f, -0.329659f, 0.012105f, -0.359326f, 0.170723f, -0.004357f,
+ 0.171593f, -0.478768f, -0.236016f, -0.035077f, 0.133731f, 0.137962f,
+ -0.397926f, -0.155164f, -0.276709f, -0.186602f, -0.258301f, 0.036965f,
+ -0.649359f, 0.127605f, 0.097930f, 0.182775f, -0.313324f, 0.053349f,
+ 0.204203f, -0.222948f, -0.059008f, -0.049759f, -0.056848f, 0.087497f,
+ -0.039987f, -0.055042f, -0.041623f, -0.078424f, -0.317291f, -0.191398f,
+ 0.632147f, 0.221825f, 0.268394f, -0.096357f, 0.442545f, -0.007117f,
+ -0.036125f, 0.000525f, 0.088092f, -0.203653f, 0.086925f, 0.439141f,
+ 0.329889f, -0.370050f, -0.194306f, -0.207430f, 0.132779f, -0.217614f,
+ -0.039444f, -0.053019f, -0.260725f, -0.116563f, -0.271048f, 0.283737f,
+ -0.007300f, 0.062257f, -0.347865f, -0.296767f, -0.359123f, 0.230459f,
+ -0.189117f, -0.087622f, -0.561091f, 0.184182f, -0.044980f, 0.012643f,
+ 0.241672f, 0.050272f, -0.204851f, -0.159285f, -0.064081f, -0.118666f,
+ -0.269471f, 0.231668f, 0.135749f, -0.131162f, 0.062760f, 0.100949f,
+ 0.074967f, -0.056918f, 0.251707f, 0.034098f, 0.341290f, -0.105027f,
+ 0.313246f, -0.092679f, -0.014632f, -0.390967f, 0.136881f, -0.241554f,
+ 0.097674f, 0.110832f, -0.390245f, 0.017654f, -0.506222f, 0.065252f,
+ 0.244834f, -0.171352f, -0.331702f, 0.111043f, 0.125217f, -0.058116f,
+ -0.382595f, -0.052545f, 0.114261f, -0.493617f, 0.243984f, -0.171053f,
+ 0.165009f, -0.063020f, 0.096502f, 0.341339f, -0.013443f, 0.056372f,
+ 0.339284f, 0.398376f, 0.389409f, 0.257252f, 0.517368f, 0.078856f,
+ 0.087716f, -0.171092f, 0.227461f, 0.125307f, -0.054423f, -0.143161f,
+ 0.224041f, -0.086477f, -0.092548f, 0.072392f, -0.061608f, 0.258347f,
+ 0.147033f, -0.478244f, -0.204869f, 0.038552f, -0.144563f, 0.224087f,
+ -0.296705f, 0.153889f, -0.064624f, 0.085265f, -0.103826f, 0.127971f,
+ 0.019965f, 0.111937f, -0.074187f, -0.029518f, -0.127305f, -0.012210f,
+ 0.042714f, 0.070052f, -0.202360f, 0.348144f, -0.132097f, -0.209585f,
+ -0.248286f, -0.065774f, -0.089482f, -0.133226f, 0.325430f, -0.013468f,
+ -0.406090f, -0.144936f, 0.208620f, 0.343445f, -0.059639f, 0.114857f,
+ -0.069431f, -0.218725f, 0.190575f, -0.368101f, 0.030030f, 0.062815f,
+ -0.239369f, -0.537852f, 0.022487f, 0.023038f, 0.190788f, 0.040123f,
+ -0.004304f, 0.060749f, -0.108929f, 0.136796f, -0.542875f, -0.227074f,
+ -0.182244f, 0.082559f, 0.019149f, 0.178854f, 0.120284f, 0.009070f,
+ 0.068268f, -0.544822f, 0.120536f, 0.354028f, -0.119890f, -0.122055f,
+ -0.405335f, 0.122341f, -0.304412f, 0.062405f, -0.302568f, -0.276505f,
+ -0.120915f, -0.221841f, 0.282007f, -0.253971f, 0.059517f, -0.144976f,
+ 0.149391f, -0.047355f, -0.167742f, -0.392333f, -0.041132f, 0.342135f,
+ 0.017485f, 0.021038f, -0.023728f, -0.192181f, -0.103996f, 0.092873f,
+ -0.114365f, -0.397732f, -0.065421f, 0.053084f, 0.035201f, 0.053019f,
+ -0.105377f, -0.039500f, 0.131904f, -0.123911f, -0.390328f, -0.125198f,
+ -0.000126f, 0.014864f, -0.220187f, 0.084056f, -0.492155f, -0.164979f,
+ 0.133592f, 0.121519f, -0.240813f, 0.186680f, 0.118673f, 0.235006f,
+ -0.239894f, -0.185759f, -0.336992f, 0.209620f, -0.298845f, 0.127803f,
+ -0.083992f, 0.194340f, -0.245378f, 0.212308f, 0.142512f, -0.163324f,
+ 0.383495f, 0.291065f, 0.286620f, -0.239957f, 0.225127f, -0.174424f,
+ 0.297231f, -0.045434f, 0.156444f, -0.184273f, -0.204567f, 0.202551f,
+ 0.370019f, -0.073910f, 0.344897f, 0.063100f, 0.338547f, -0.099145f,
+ 0.391863f, -0.214244f, -0.241734f, -0.281851f, -0.035133f, -0.153157f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer0[32] = {
+ 0.143343f, -0.021982f, -0.314939f, 0.170867f, -0.081248f, 0.125758f,
+ -0.355762f, 0.279798f, 1.027712f, -0.434660f, 1.072005f, 0.668893f,
+ -0.031216f, -0.528650f, 0.328349f, 0.543645f, -0.188810f, 0.221110f,
+ -1.638637f, 0.058045f, -1.731105f, -0.444284f, 0.513693f, 0.890025f,
+ 0.160288f, 0.393312f, 0.332856f, -0.080767f, 0.299822f, 0.235876f,
+ 0.254942f, -0.017796f,
+};
+
+static const float av1_tx_split_nn_weights_32x32_layer1[32] = {
+ -0.090326f, -0.267553f, -0.026071f, 0.100912f, 0.279137f, 0.079064f,
+ -0.074885f, 0.053804f, 0.736810f, -0.031693f, -0.970514f, 0.174069f,
+ 0.095940f, -0.065047f, 0.052911f, 0.176728f, -0.058274f, 0.148364f,
+ -0.162210f, 0.093875f, -0.367663f, 0.020876f, 0.137280f, -1.099116f,
+ 0.146854f, 0.075590f, 0.228534f, 0.141993f, 0.072143f, 0.101421f,
+ -0.068547f, -0.154148f,
+};
+
+static const float av1_tx_split_nn_bias_32x32_layer1[1] = {
+ 0.316622f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x32 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x32_layer0,
+ av1_tx_split_nn_weights_32x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x32_layer0,
+ av1_tx_split_nn_bias_32x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 64x64 block.
+static const float av1_tx_split_nn_weights_64x64_layer0[12 * 32] = {
+ -0.006828f, 0.149944f, -0.017614f, -0.044599f, -0.024517f, 0.507698f,
+ 0.001039f, 0.037164f, 0.015091f, -0.306620f, -0.162047f, -0.369440f,
+ 0.396310f, 0.087121f, 0.208609f, -0.083068f, 0.493774f, 0.217682f,
+ 0.377393f, 0.172879f, 0.397422f, 0.078919f, 0.741350f, 0.064169f,
+ -0.099989f, -0.192983f, -0.278230f, -0.310048f, -0.439965f, -0.226698f,
+ -0.436596f, -0.007551f, -0.396721f, 0.153570f, -0.190838f, -0.071869f,
+ 0.048799f, -0.301301f, -0.005015f, 0.500480f, -0.030622f, -0.559095f,
+ -0.032634f, -0.054160f, -0.056979f, -0.456545f, 0.306536f, -0.411323f,
+ -0.005366f, -0.069496f, 0.019990f, 0.327931f, -0.002516f, 0.393190f,
+ 0.001759f, 0.035093f, -0.030302f, -0.528984f, 0.174781f, 0.241462f,
+ -0.415427f, -0.164502f, 0.143065f, -0.122595f, 0.082049f, -0.143346f,
+ 0.055642f, -0.124701f, 0.004050f, -0.216235f, -2.681730f, 0.101658f,
+ 0.381239f, 0.465936f, 0.331154f, 0.301708f, -0.360171f, 0.054886f,
+ -0.118658f, 0.287921f, 0.277859f, 0.203784f, 0.247809f, 0.656924f,
+ -0.354628f, 0.315081f, 0.105108f, -0.510179f, 0.059267f, 0.061386f,
+ 0.076423f, 0.347119f, 0.100134f, 0.028402f, -0.118621f, -0.238689f,
+ 0.080141f, -0.138863f, 0.009009f, -0.100526f, -0.138875f, 0.066992f,
+ 0.005949f, 0.564336f, 0.046994f, 0.004655f, 0.366047f, 0.014695f,
+ -0.146928f, -0.024665f, -0.440357f, -0.109395f, 0.527231f, -0.020925f,
+ -0.227236f, -0.068141f, 0.282009f, 0.040192f, -0.267100f, 0.229228f,
+ 0.133861f, 0.338706f, -0.030178f, -0.040919f, -0.026343f, -0.330338f,
+ -0.066931f, -0.110580f, -0.072056f, 0.599457f, -0.020738f, 0.169200f,
+ 0.836240f, -0.157548f, 0.386273f, 0.002404f, 0.329410f, -0.007020f,
+ 0.351705f, -0.041259f, 0.388861f, 0.003899f, 0.582627f, 0.023572f,
+ 0.409912f, -0.158472f, 0.536383f, 0.525093f, 0.604247f, 0.439159f,
+ 0.692832f, 0.046272f, 0.590367f, -0.082166f, 0.262357f, 0.478671f,
+ 0.031935f, 0.042675f, 0.120002f, 0.398616f, -0.078967f, 0.227986f,
+ -0.044679f, 0.151061f, -0.085564f, 0.220205f, -0.265606f, -0.203623f,
+ 0.204719f, -0.125922f, 0.038544f, -0.269379f, 0.025866f, 0.109967f,
+ 0.019064f, -0.237297f, -0.309746f, -0.329118f, -0.278368f, -0.063859f,
+ 0.278496f, 0.018620f, 0.209971f, 0.296250f, 0.142850f, 0.288689f,
+ 0.137084f, 0.130517f, 0.128171f, -0.155396f, -0.008449f, -0.099845f,
+ 0.173455f, -0.059909f, -0.147318f, 0.102851f, -0.251389f, -0.001448f,
+ 0.103907f, 0.297273f, -0.027846f, 0.028260f, -0.382601f, 0.346695f,
+ -0.601641f, 0.162366f, -0.477495f, -0.042731f, -0.387871f, -0.051791f,
+ -0.401498f, -0.048446f, -0.456270f, -0.062287f, 0.493919f, 0.003008f,
+ 0.099917f, -0.358525f, -0.094903f, -0.022811f, -0.062259f, 0.019455f,
+ -0.050644f, 0.020041f, -0.132912f, -0.061578f, -3.083691f, -0.014961f,
+ -0.129115f, -0.710559f, 0.157213f, -0.844037f, -0.121991f, -0.943386f,
+ -0.231269f, -0.003462f, 0.331478f, -0.132703f, -1.285993f, -0.120957f,
+ -0.373755f, -0.322609f, 0.309059f, -0.131523f, -0.118334f, -0.063805f,
+ -0.104251f, 0.012166f, -0.094699f, -0.283753f, 0.128168f, -0.526929f,
+ -0.050331f, 0.186153f, 0.005913f, -0.221236f, 0.036363f, 0.160909f,
+ -0.001342f, -0.382749f, 0.037820f, 0.281689f, -0.024275f, 0.028854f,
+ 0.318291f, 0.318526f, 0.035778f, 0.034031f, 0.189663f, -0.293367f,
+ 0.082022f, 0.127923f, 0.078866f, -0.081361f, -0.268117f, 0.246675f,
+ 0.248605f, -0.215479f, -0.073084f, 0.496140f, -0.067327f, 0.396237f,
+ -0.120739f, 0.033752f, -0.044120f, -0.218941f, -0.028078f, 0.195132f,
+ -0.040400f, 0.281604f, -0.100471f, 0.415207f, -0.258503f, -0.429749f,
+ 0.150569f, -0.010859f, 0.136448f, 0.026589f, 0.148466f, 0.110764f,
+ 0.380967f, 0.009177f, 0.103075f, 0.116417f, 0.226273f, -0.327746f,
+ 0.169346f, 0.284553f, -0.094986f, 0.312745f, -0.147840f, 0.025062f,
+ -0.494482f, 0.112388f, -0.213962f, 0.107050f, -0.433371f, -0.096276f,
+ -0.244835f, -0.003518f, -0.459148f, -0.145080f, 0.017150f, 0.042846f,
+ -0.237479f, 0.104746f, 0.158677f, 0.358937f, 0.099921f, 0.277109f,
+ 0.012410f, -0.062897f, 0.116130f, 0.255309f, 0.341628f, 0.145002f,
+ -0.429344f, -0.016433f, -0.068985f, 0.285194f, -0.286719f, -0.018298f,
+ -0.179369f, -0.194655f, -0.165380f, 0.026071f, -0.428268f, -0.379929f,
+ -0.727543f, 0.179610f, -0.963979f, -0.042026f, -0.616202f, 0.133401f,
+ -0.784966f, 0.061205f, -0.713357f, 0.129795f, 0.120512f, -0.339545f,
+ 0.353557f, 0.114906f, -0.329813f, -0.209987f, 0.085410f, 0.214313f,
+ -0.122082f, 0.335770f, -0.020937f, 0.202456f, 0.289023f, -0.421186f,
+ 0.337905f, 0.407663f, 0.132771f, 0.071734f, 0.213914f, 0.128595f,
+ 0.302659f, -0.209501f, 0.217756f, 0.253079f, -0.089505f, -0.205614f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer0[32] = {
+ 0.296914f, -1.826816f, 0.346130f, 0.969520f, -0.528154f, 1.175862f,
+ -0.075985f, -0.097323f, -0.233059f, 0.004846f, 0.401279f, -2.272435f,
+ 0.086257f, 0.414162f, -0.194786f, -0.233887f, -0.113215f, -2.453546f,
+ 0.861214f, 0.298361f, 0.267397f, -0.158557f, -0.119911f, -0.098134f,
+ -0.339263f, 0.385871f, -0.678123f, 0.263218f, 0.251611f, -1.155773f,
+ -0.365437f, 0.229255f,
+};
+
+static const float av1_tx_split_nn_weights_64x64_layer1[32] = {
+ 0.502104f, -0.708023f, 0.419648f, 1.583418f, 0.419355f, -1.462981f,
+ -0.439623f, 0.405691f, 0.823257f, 0.061654f, 0.750875f, 0.775031f,
+ -0.387909f, 0.447385f, 0.284690f, 0.353262f, -0.224347f, 0.832864f,
+ -1.708491f, -1.042447f, -0.272829f, 0.540640f, 0.310509f, 0.723745f,
+ 0.245592f, -0.218417f, -0.597987f, -0.362301f, 0.702217f, -0.692614f,
+ 0.207812f, 0.513560f,
+};
+
+static const float av1_tx_split_nn_bias_64x64_layer1[1] = { -0.2307045f };
+
+static const NN_CONFIG av1_tx_split_nnconfig_64x64 = {
+ 12, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_64x64_layer0,
+ av1_tx_split_nn_weights_64x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_64x64_layer0,
+ av1_tx_split_nn_bias_64x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 4x16 block.
+static const float av1_tx_split_nn_weights_4x16_layer0[8 * 16] = {
+ -1.344184f, -1.454625f, -0.703110f, -0.140570f, -0.841536f, -0.068131f,
+ -2.128968f, -0.655518f, 0.432180f, 0.879752f, -0.222211f, 0.061615f,
+ -0.230969f, 0.569496f, 1.424188f, 0.598063f, -0.436005f, -0.737606f,
+ -0.137875f, -0.085730f, -0.076512f, -0.583101f, -0.937377f, -0.203556f,
+ -0.215797f, -0.015361f, -0.124098f, -0.411917f, 0.340441f, -0.331752f,
+ -0.472607f, -0.097714f, -0.930572f, -1.354713f, -0.550724f, 0.176212f,
+ -0.636060f, 0.183271f, -0.610212f, 0.345895f, -1.100906f, -1.605713f,
+ 0.111888f, -0.140937f, 0.063013f, -0.013315f, -0.273472f, -0.255870f,
+ 1.200328f, 0.274002f, 1.005776f, 0.322392f, 1.222373f, 0.158227f,
+ 0.408810f, 0.145022f, 0.139842f, -1.249412f, 0.286672f, -0.635699f,
+ 0.312562f, -0.495606f, -1.117034f, -0.085107f, -0.097484f, -0.341521f,
+ -0.132199f, -0.863055f, 0.217579f, -1.161425f, -0.302087f, -1.357271f,
+ -0.520724f, -1.211069f, -1.048729f, -0.333087f, -1.171527f, -0.280824f,
+ -2.057684f, -0.228755f, 0.606278f, 0.101198f, -0.314847f, -1.303255f,
+ -0.294964f, 1.301923f, 0.041712f, 0.077593f, -1.152746f, 0.495315f,
+ -0.751566f, 0.230249f, -0.840661f, 0.100731f, 1.346269f, 0.649898f,
+ -1.432258f, -0.456710f, -1.018123f, -0.348559f, -1.225226f, -0.170717f,
+ -0.354072f, 0.068292f, -0.234168f, 0.277503f, 0.179134f, 0.907420f,
+ 0.354626f, -0.627210f, 0.905779f, 0.512612f, 0.161190f, -0.843177f,
+ 0.014953f, -0.354983f, 0.011116f, -0.429598f, -1.017138f, -0.211432f,
+ 0.941840f, -0.281747f, 0.957776f, -0.541914f, 1.041880f, -0.433580f,
+ -1.416451f, -0.166467f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer0[16] = {
+ 3.086118f, -3.235095f, 4.830956f, -0.165706f, 0.955031f, 4.055783f,
+ -0.311489f, 4.660205f, -0.576277f, -0.248111f, -0.790519f, -1.686412f,
+ -1.191704f, -3.800073f, 4.121552f, -1.399397f,
+};
+
+static const float av1_tx_split_nn_weights_4x16_layer1[16] = {
+ -0.758677f, 0.388776f, 0.439906f, 0.011390f, -0.084319f, -0.667969f,
+ -0.467316f, -0.875491f, -0.160668f, 0.805292f, 0.114393f, -0.549682f,
+ 0.462109f, 0.343315f, 1.092593f, 0.483152f,
+};
+
+static const float av1_tx_split_nn_bias_4x16_layer1[1] = {
+ 0.8205083f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_4x16 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_4x16_layer0,
+ av1_tx_split_nn_weights_4x16_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_4x16_layer0,
+ av1_tx_split_nn_bias_4x16_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x32 block.
+static const float av1_tx_split_nn_weights_16x32_layer0[8 * 32] = {
+ 0.180713f, 0.033211f, 0.607561f, 0.138642f, 0.637204f, -0.000940f,
+ 0.012630f, 0.358109f, 0.022238f, 0.190418f, 0.079088f, 0.065925f,
+ 0.038242f, 0.162380f, -0.122728f, 0.379382f, -0.303283f, -0.327550f,
+ 0.029120f, -0.284553f, 0.269588f, -0.309805f, -0.241036f, -0.161103f,
+ -0.304887f, 0.239843f, -0.149146f, 0.311234f, -0.073640f, -0.132718f,
+ 0.178901f, 0.474712f, 0.020280f, 0.063685f, -0.609170f, -0.013658f,
+ -0.338074f, 0.250429f, 0.082978f, -0.186315f, -0.788959f, 0.039859f,
+ -0.426461f, -0.001524f, -0.447211f, 0.378102f, 0.315617f, 0.017428f,
+ 0.745494f, -0.219024f, 0.512836f, 0.200522f, 0.680449f, 0.313686f,
+ -0.412569f, -0.132927f, 0.631120f, 0.042735f, 0.336153f, 0.044772f,
+ 0.432606f, 0.175681f, -0.634411f, -0.073509f, -0.040643f, -0.559260f,
+ -0.104034f, -0.570495f, -0.247365f, 0.063256f, -0.582021f, -0.492585f,
+ -0.194955f, -0.207934f, -0.506627f, 0.021743f, -0.416518f, 0.320876f,
+ 0.115889f, 0.149399f, -0.229376f, 0.095505f, 0.115191f, -0.471921f,
+ 0.113068f, 0.343684f, -0.036831f, 0.021240f, 0.295112f, 0.031166f,
+ 0.448201f, -0.132241f, 0.164032f, 0.355572f, 0.072154f, 0.017335f,
+ -0.046113f, 0.178719f, -0.026881f, -0.242590f, 0.055073f, -0.012958f,
+ 0.077904f, 0.351356f, 0.107655f, 0.260568f, -0.080052f, -0.197553f,
+ 0.085763f, 0.263416f, -0.327741f, 0.158855f, 0.056899f, -0.162121f,
+ 0.339518f, -0.571204f, 0.264966f, -0.252214f, -0.202560f, -0.134213f,
+ -0.330188f, 0.009470f, -0.468376f, -0.065240f, -0.307957f, 0.116479f,
+ -0.222238f, -0.458716f, 0.186493f, -0.391415f, 0.118649f, -0.104653f,
+ -0.259958f, -0.332081f, -0.403785f, -0.050147f, -0.573511f, 0.177117f,
+ -0.598358f, 0.164947f, -0.119694f, -0.058520f, 0.203829f, -0.267404f,
+ -0.048202f, -0.600006f, 0.181594f, -0.731805f, 0.146417f, -0.687148f,
+ -1.210525f, -0.450101f, -0.620635f, 0.208825f, -0.611357f, 0.112202f,
+ -0.309468f, -0.323545f, 0.357770f, 0.308061f, 0.553199f, 0.049012f,
+ 0.530093f, -0.208597f, 0.607882f, -0.058120f, -0.527634f, 0.018136f,
+ 0.060753f, 0.118894f, 0.175649f, 0.014731f, 0.428318f, -0.106465f,
+ -0.119077f, 0.080179f, 0.524997f, 0.368286f, 0.528286f, 0.213659f,
+ 0.639286f, 0.195079f, -0.049815f, -0.092008f, -0.302958f, 0.298149f,
+ -0.173870f, -0.145205f, -0.233589f, -0.303368f, 0.141275f, 0.325622f,
+ -0.115293f, 0.155188f, 0.047225f, 0.231050f, -0.167447f, 0.349754f,
+ 0.295544f, -0.319466f, 0.095144f, 0.174612f, -0.194652f, 0.305915f,
+ -0.239008f, -0.037453f, 0.280696f, 0.125850f, 0.749196f, -0.101919f,
+ 0.791808f, -0.236811f, 0.064157f, 0.032865f, -0.225911f, 0.350384f,
+ 0.723183f, -0.103992f, 0.483085f, -0.123992f, 0.602138f, 0.023895f,
+ -0.692601f, -0.118387f, 0.162527f, 0.145178f, -0.184702f, -0.017753f,
+ -0.159436f, 0.124105f, -0.131067f, 0.310275f, 0.151499f, 0.138924f,
+ 0.537459f, 0.263212f, 0.615896f, 0.281255f, 0.021293f, -0.473459f,
+ 0.210145f, -0.056682f, 0.063658f, 0.377254f, -0.314410f, -0.183487f,
+ 0.300384f, 0.328471f, 0.164694f, -0.159272f, -0.160942f, -0.502861f,
+ -0.129147f, 0.045916f, -0.606865f, -0.101378f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer0[32] = {
+ 0.051664f, -0.212487f, -0.077596f, -0.818467f, 0.638475f, -0.759937f,
+ 0.157198f, 0.989640f, 1.586035f, 0.431144f, 0.041605f, 0.543085f,
+ 0.498379f, 0.320504f, 0.134233f, 0.670979f, -0.105562f, -1.574879f,
+ 1.261812f, -0.287530f, -1.610592f, 0.730899f, -0.894240f, -0.657790f,
+ 0.270806f, -0.181708f, 0.298578f, 0.817240f, -0.221508f, -0.201771f,
+ -0.294389f, 1.456413f,
+};
+
+static const float av1_tx_split_nn_weights_16x32_layer1[32] = {
+ 1.208914f, 0.324728f, 0.383352f, -0.874321f, 0.172565f, -0.580927f,
+ -0.432927f, 0.433698f, -0.801935f, 0.672028f, 0.563493f, 0.260077f,
+ -0.200557f, -0.121638f, 0.530735f, -0.525196f, 0.281799f, 0.624204f,
+ -0.662775f, -0.230887f, 0.980989f, 0.223437f, -0.790591f, 0.600724f,
+ -0.273445f, 0.427635f, -0.501641f, -0.878390f, 0.234731f, -0.172550f,
+ 0.418904f, 1.792187f,
+};
+
+static const float av1_tx_split_nn_bias_16x32_layer1[1] = {
+ -0.29233751f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x32_layer0,
+ av1_tx_split_nn_weights_16x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x32_layer0,
+ av1_tx_split_nn_bias_16x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 32x64 block.
+static const float av1_tx_split_nn_weights_32x64_layer0[8 * 32] = {
+ 0.031614f, -0.110926f, 0.052418f, -0.702506f, 0.045708f, 0.238329f,
+ -0.021806f, -0.208128f, 0.509745f, -0.293891f, 0.277788f, 0.113937f,
+ 0.741576f, 0.062848f, 0.351878f, 0.212532f, 0.385842f, 0.081517f,
+ 0.398502f, -0.015156f, 0.242616f, 0.214619f, -0.182678f, -0.170546f,
+ 0.110605f, -0.236749f, -0.023831f, -0.285243f, 0.147156f, -0.257639f,
+ 0.341355f, -0.571641f, -0.721797f, 0.139588f, -0.518494f, -0.206526f,
+ -0.570560f, -0.184295f, 0.110271f, 0.210292f, -0.109132f, -0.001080f,
+ 0.129251f, -0.204230f, -0.396312f, -0.183024f, 0.421243f, -0.013154f,
+ 0.222627f, 0.169826f, 0.226037f, 0.218153f, -0.343528f, 0.274906f,
+ -0.156632f, 0.250261f, -0.484020f, 0.019909f, -0.349575f, -0.286643f,
+ -0.507396f, 0.202446f, -0.154110f, -0.292644f, 0.122666f, 0.306963f,
+ 0.424895f, 0.005579f, 0.494094f, -0.079551f, 0.473740f, 0.352414f,
+ -0.356917f, 0.264331f, -0.554487f, 0.119978f, 0.012291f, -0.141641f,
+ -0.254714f, -0.213723f, -0.116701f, -0.011267f, 0.190025f, -0.118501f,
+ 0.305151f, -0.316782f, -0.220801f, -0.308420f, -0.324285f, 0.421329f,
+ -0.177066f, -0.055114f, 0.229698f, -0.199523f, 0.054278f, 0.365020f,
+ -0.060586f, -0.300618f, 0.157563f, -0.064338f, -0.005711f, -0.176991f,
+ -0.424502f, -0.111914f, 0.092608f, 0.126621f, 0.078547f, 0.148008f,
+ 0.024221f, 0.124599f, 0.001343f, 0.059402f, 0.453753f, 0.047102f,
+ 0.242544f, 0.055735f, -0.067451f, -0.170061f, -0.170469f, -0.232173f,
+ 0.214908f, 0.248889f, 0.544348f, -0.084566f, 0.402478f, 0.298031f,
+ 0.099038f, -0.238019f, -0.475085f, -0.070042f, -0.754955f, -0.049095f,
+ -0.783801f, -0.099857f, -0.582008f, -0.055194f, -0.103655f, 0.143689f,
+ 0.100219f, 0.293934f, 0.099271f, -0.036320f, 0.356626f, -0.261445f,
+ 0.879544f, 0.000878f, 0.532920f, -0.093918f, 0.508867f, -0.040215f,
+ -0.789042f, -0.145380f, -0.090040f, -0.066636f, 0.015212f, 0.352989f,
+ -0.058831f, -0.164588f, 0.039890f, 0.122861f, 0.222508f, 0.061217f,
+ 0.466487f, 0.022666f, 0.423777f, -0.002200f, -0.656835f, -0.099760f,
+ -0.520606f, 0.303204f, -0.563620f, -0.160922f, -0.243203f, 0.313354f,
+ -0.336516f, -0.206764f, -0.236040f, 0.325899f, -0.418748f, 0.163205f,
+ -0.476242f, -0.121928f, 0.139178f, -0.157193f, -0.531766f, -0.180202f,
+ -0.485254f, 0.187703f, -0.440072f, 0.137854f, 0.029139f, 0.109530f,
+ -0.078475f, -0.360618f, -0.334672f, -0.350890f, -0.403976f, 0.180336f,
+ -0.304542f, 0.005123f, 0.413995f, 0.314639f, 0.342648f, -0.293264f,
+ 0.358135f, -0.180425f, -0.369530f, -0.048413f, 0.498366f, 0.121875f,
+ 0.270948f, -0.187966f, 0.342503f, 0.174420f, -0.352105f, 0.088080f,
+ 0.008277f, 0.020275f, -0.002381f, 0.504389f, -0.018832f, -0.366047f,
+ -0.090947f, -0.168150f, 0.016184f, -0.328914f, 0.089579f, -0.017349f,
+ 0.005844f, -0.005010f, -1.857514f, -0.282426f, 0.010177f, -0.214727f,
+ -0.182529f, 0.156943f, -0.162032f, -0.472654f, 0.069432f, 0.016901f,
+ -0.767905f, 0.137129f, -0.411463f, 0.049056f, -0.431657f, -0.037641f,
+ 0.785500f, 0.046225f, 0.195831f, 0.245204f, 0.368614f, 0.212261f,
+ 0.440626f, -0.158048f, -0.461031f, -0.146280f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer0[32] = {
+ 0.490777f, -1.894238f, 0.621333f, -0.076756f, 0.286298f, 0.286375f,
+ -0.126431f, -0.350034f, -1.017572f, 0.620125f, 0.408128f, 0.238756f,
+ -0.060728f, 0.210912f, 0.043124f, 0.445649f, 0.907025f, 0.360272f,
+ 1.083101f, -0.068952f, 1.062348f, 0.396354f, 0.280075f, 0.501732f,
+ 0.328422f, 0.066241f, 0.474697f, 0.126313f, 0.741206f, 0.314796f,
+ 0.552712f, 0.299410f,
+};
+
+static const float av1_tx_split_nn_weights_32x64_layer1[32] = {
+ 1.033823f, 0.603439f, 0.304591f, -0.279940f, -0.780909f, -0.132801f,
+ 0.154059f, 0.662014f, -0.718368f, 0.198733f, 0.039766f, -0.208516f,
+ -0.104909f, -0.394209f, 0.081617f, 0.365041f, -0.874960f, -0.063315f,
+ -1.189897f, 0.337225f, 0.410893f, 0.307519f, 0.221323f, 0.233895f,
+ 0.469536f, 0.438557f, 0.280144f, 0.422423f, -1.394513f, 0.781900f,
+ 0.352981f, 0.111265f,
+};
+
+static const float av1_tx_split_nn_bias_32x64_layer1[1] = {
+ -0.18160765f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_32x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 32,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_32x64_layer0,
+ av1_tx_split_nn_weights_32x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_32x64_layer0,
+ av1_tx_split_nn_bias_32x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 8x32 block.
+static const float av1_tx_split_nn_weights_8x32_layer0[8 * 24] = {
+ -0.687846f, 0.121404f, -0.372905f, 0.126770f, -0.103298f, -0.101650f,
+ -0.148490f, -0.271740f, 0.682915f, -0.079765f, 0.634347f, -0.151503f,
+ 0.287692f, -0.079072f, -0.236948f, 0.065064f, 0.713383f, 0.397123f,
+ 0.553621f, 0.368529f, 0.767663f, -0.046601f, -0.392402f, -0.294822f,
+ -0.292325f, -0.010573f, -0.837945f, 0.050113f, -0.811360f, 0.199162f,
+ 0.150832f, 0.011602f, 0.369694f, -0.225876f, 0.234113f, -0.269808f,
+ 0.303805f, -0.190281f, -0.451136f, 0.209755f, -0.308894f, 0.326956f,
+ 0.313591f, 0.089923f, -0.095754f, 0.390981f, 0.467366f, 0.169670f,
+ 0.853322f, 0.054055f, 0.830319f, -0.121918f, 0.262019f, -0.093526f,
+ 0.385558f, 0.419174f, 0.040198f, -0.347030f, -0.450492f, -0.106764f,
+ 0.487502f, -0.204188f, 0.430374f, -0.116388f, 0.236407f, -0.157376f,
+ 0.732294f, -0.651387f, 0.347446f, 0.342575f, 0.048406f, 0.187657f,
+ 0.434899f, -0.447782f, 0.032728f, -0.071168f, -0.255327f, 0.104174f,
+ 0.095689f, -0.431743f, 0.725694f, 0.031797f, 0.523171f, 0.061801f,
+ 0.469804f, -0.071068f, -0.059024f, -0.211937f, 0.392134f, -0.321490f,
+ 0.366060f, -0.427798f, 0.166771f, 0.299652f, 0.044660f, 0.205142f,
+ 0.039133f, -0.051835f, -0.465475f, 0.216976f, -0.341156f, 0.095358f,
+ 0.230807f, 0.201674f, 0.279266f, -0.713534f, -0.091690f, -0.569708f,
+ -0.119001f, 0.252160f, -1.544578f, -0.284477f, 0.555348f, 0.226471f,
+ 0.347690f, 0.034365f, 0.770835f, -0.241859f, -0.130241f, 0.292936f,
+ 0.396622f, -0.417916f, 0.492224f, 0.125517f, 0.344824f, 0.232172f,
+ -0.432106f, -0.278745f, 0.035069f, -0.307247f, -0.120760f, 0.170950f,
+ 0.433601f, 0.044286f, 0.141463f, -0.041382f, 0.529346f, 0.010868f,
+ -0.323674f, 0.185205f, 0.623459f, 0.232842f, -0.406693f, -0.142944f,
+ 0.222988f, 0.343634f, 0.065401f, 0.002621f, 0.805335f, -0.426926f,
+ 0.279181f, 0.131364f, 0.192339f, -0.402391f, 0.544120f, -0.060618f,
+ 0.467780f, 0.165224f, -0.373131f, 0.002427f, 0.688064f, 0.322317f,
+ 0.259713f, 0.130583f, 0.185032f, -0.189111f, -0.067821f, 0.010875f,
+ 0.644724f, -0.179291f, 0.463222f, 0.155230f, 0.721384f, -0.046019f,
+ 0.438501f, 0.440027f, -0.462090f, -0.002039f, -0.468026f, -0.008890f,
+ -0.328530f, 0.370102f, 0.482531f, 0.043471f, -0.469732f, -0.532663f,
+ 0.122081f, -0.379659f, 0.037219f, -0.519913f, -0.128975f, -0.404365f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer0[24] = {
+ -1.198965f, 0.395204f, -0.408627f, -0.021654f, -0.658355f, 0.154525f,
+ -0.288354f, 1.207574f, 0.411608f, 0.964678f, -1.176893f, 1.059006f,
+ -0.472969f, 2.087975f, 1.065536f, 0.595569f, 0.197907f, -0.349938f,
+ 1.013651f, -0.931093f, -0.973595f, -0.459094f, -1.253062f, 1.624782f,
+};
+
+static const float av1_tx_split_nn_weights_8x32_layer1[24] = {
+ 0.815787f, -0.393465f, -0.483427f, -0.565592f, 0.493494f, 0.430229f,
+ -0.507073f, -0.251379f, -0.353418f, -0.495445f, 0.820029f, 0.649146f,
+ -0.487383f, 1.844503f, 0.480324f, -0.982705f, -0.501446f, -0.220584f,
+ 0.334299f, 0.802238f, 0.805838f, -0.487848f, 0.300772f, -1.232857f,
+};
+
+static const float av1_tx_split_nn_bias_8x32_layer1[1] = {
+ 0.13435879f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_8x32 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 24,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_8x32_layer0,
+ av1_tx_split_nn_weights_8x32_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_8x32_layer0,
+ av1_tx_split_nn_bias_8x32_layer1,
+ },
+};
+/******************************************************************************/
+
+// Tx split model for 16x64 block.
+static const float av1_tx_split_nn_weights_16x64_layer0[8 * 16] = {
+ -0.378223f, -0.124216f, -0.514089f, -0.110117f, -0.585801f, -0.094838f,
+ -0.455385f, -0.220254f, -0.504568f, -0.082351f, -0.476420f, -0.253993f,
+ -0.454709f, -0.059461f, 0.210313f, -0.155683f, 0.192968f, -0.127804f,
+ 0.471996f, 0.253377f, 0.472625f, 0.485322f, 0.150560f, 0.164868f,
+ -0.475587f, 0.447559f, -0.455759f, -0.306665f, -0.194866f, -0.283716f,
+ -0.243897f, 0.293020f, -0.308298f, -0.191904f, -0.468568f, 0.014053f,
+ -0.618848f, 0.096273f, -0.444586f, 0.347750f, -0.280643f, -0.062872f,
+ 0.118661f, 0.540099f, 0.104141f, -0.279300f, -0.098721f, -0.173427f,
+ -0.984558f, -0.424559f, -0.411928f, -0.120875f, -0.488999f, -0.050716f,
+ -0.523103f, 0.093620f, -0.930396f, -0.431997f, -1.163297f, 0.190384f,
+ -0.422581f, -0.005354f, 0.450552f, 0.369210f, 0.562484f, 0.679922f,
+ 0.282099f, -0.039075f, 0.404196f, 0.006371f, 0.069679f, -0.196160f,
+ -0.213675f, 0.275187f, -0.104235f, -0.193090f, 0.003116f, -0.252454f,
+ -0.094591f, 0.210439f, -0.137070f, 0.145043f, 0.024558f, 0.121718f,
+ 0.010138f, 0.301651f, -0.377990f, 0.444414f, 0.001845f, -0.095334f,
+ 0.550259f, 0.087603f, 0.792492f, -0.044584f, 0.641706f, -0.328458f,
+ -0.447791f, 0.135376f, 0.356385f, 0.135748f, 0.310370f, 0.293757f,
+ -0.062000f, -0.056368f, 0.343930f, 0.312039f, 0.370763f, 0.452381f,
+ -0.023630f, -0.185909f, 0.422277f, -0.006306f, 0.045166f, 0.423359f,
+ -0.157735f, -0.084901f, 0.219527f, -0.209510f, 0.575057f, 0.249276f,
+ 0.069267f, 0.233898f, -0.229392f, 0.117197f, -0.038551f, 0.293976f,
+ 0.101996f, 0.120878f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer0[16] = {
+ 1.036995f, 0.160249f, 0.100264f, 0.694881f, 0.694677f, 0.128379f,
+ -0.843405f, -0.405515f, 0.104139f, 0.182980f, -0.025472f, 0.901067f,
+ -0.299866f, -0.103079f, -0.190352f, -0.048121f,
+};
+
+static const float av1_tx_split_nn_weights_16x64_layer1[16] = {
+ -1.778868f, 0.174690f, 0.211991f, 0.712138f, 0.589352f, 0.466652f,
+ 1.029146f, -0.490044f, 0.483015f, 0.600215f, -0.577776f, -0.755546f,
+ 0.348337f, -0.205082f, 0.347129f, -0.322277f,
+};
+
+static const float av1_tx_split_nn_bias_16x64_layer1[1] = {
+ 0.04230947f,
+};
+
+static const NN_CONFIG av1_tx_split_nnconfig_16x64 = {
+ 8, // num_inputs
+ 1, // num_outputs
+ 1, // num_hidden_layers
+ {
+ 16,
+ }, // num_hidden_nodes
+ {
+ av1_tx_split_nn_weights_16x64_layer0,
+ av1_tx_split_nn_weights_16x64_layer1,
+ },
+ {
+ av1_tx_split_nn_bias_16x64_layer0,
+ av1_tx_split_nn_bias_16x64_layer1,
+ },
+};
+/******************************************************************************/
+
+// Map block size to its corresponding neural net model for tx split prediction.
+static const NN_CONFIG *av1_tx_split_nnconfig_map[TX_SIZES_ALL] = {
+ NULL, // TX_4X4,
+ &av1_tx_split_nnconfig_8x8, // TX_8X8,
+ &av1_tx_split_nnconfig_16x16, // TX_16X16,
+ &av1_tx_split_nnconfig_32x32, // TX_32X32,
+ &av1_tx_split_nnconfig_64x64, // TX_64X64,
+ &av1_tx_split_nnconfig_4x8, // TX_4X8,
+ &av1_tx_split_nnconfig_4x8, // TX_8X4,
+ &av1_tx_split_nnconfig_8x16, // TX_8X16,
+ &av1_tx_split_nnconfig_8x16, // TX_16X8,
+ &av1_tx_split_nnconfig_16x32, // TX_16X32,
+ &av1_tx_split_nnconfig_16x32, // TX_32X16,
+ &av1_tx_split_nnconfig_32x64, // TX_32X64,
+ &av1_tx_split_nnconfig_32x64, // TX_64X32,
+ &av1_tx_split_nnconfig_4x16, // TX_4X16,
+ &av1_tx_split_nnconfig_4x16, // TX_16X4,
+ &av1_tx_split_nnconfig_8x32, // TX_8X32,
+ &av1_tx_split_nnconfig_8x32, // TX_32X8,
+ &av1_tx_split_nnconfig_16x64, // TX_16X64,
+ &av1_tx_split_nnconfig_16x64, // TX_64X16,
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AV1_ENCODER_TX_PRUNE_MODEL_WEIGHTS_H_
diff --git a/third_party/aom/av1/encoder/wedge_utils.c b/third_party/aom/av1/encoder/wedge_utils.c
new file mode 100644
index 0000000000..e6edbb6af0
--- /dev/null
+++ b/third_party/aom/av1/encoder/wedge_utils.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1: Residuals of p1.
+ * (source - p1)
+ * d: Difference of p1 and p0.
+ * (p1 - p0)
+ * m: The blending mask
+ * N: Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ * where r0 is (source - p0), and r1 is (source - p1), which in turn
+ * is equivalent to:
+ * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ * which is the SSE of the residuals of the compound predictor scaled up by
+ * MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate equivalent SIMD implementation. It should have no effect if
+ * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
+ * holds for 8 bit input, and on real input, it should hold practically always,
+ * as residuals are expected to be small.
+ */
+uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ uint64_t csse = 0;
+ int i;
+
+ for (i = 0; i < N; i++) {
+ int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+ t = clamp(t, INT16_MIN, INT16_MAX);
+ csse += t * t;  // |t| <= 2^15 after the clamp, so t * t fits in int32_t
+ }
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds: Difference of the squares of the residuals.
+ * r0**2 - r1**2
+ * m: The blending mask
+ * N: Number of pixels
+ * limit: Pre-computed threshold value.
+ * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ * >
+ * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ * which can be simplified to:
+ *
+ * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * The right hand side does not depend on the mask, and needs to be passed as
+ * the 'limit' parameter.
+ *
+ * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ * hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ * Note that for efficiency, ds is stored on 16 bits. Real input residuals
+ * being small, this should not cause a noticeable issue.
+ */
+int av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, int N,
+ int64_t limit) {
+ int64_t acc = 0;
+
+ do {  // the body runs before N is tested, so N must be >= 1 on entry
+ acc += *ds++ * *m++;
+ } while (--N);
+
+ return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ int i;
+
+ for (i = 0; i < N; i++)  // squares are formed in int, then saturated
+ d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000000..07615543c6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm1d_sse4.c
@@ -0,0 +1,1217 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ __m128i buf0[32];  // buf0/buf1 ping-pong: each stage reads one and writes the other
+ __m128i buf1[32];
+ const int32_t *cospi;  // weight table obtained from cospi_arr(cos_bit)
+ // Nine-stage butterfly network over 32 vectors of four 32-bit lanes (forward DCT-32, going by the name). stage 0: no-op.
+ // stage 1: fold the input: buf1[i] = in[i] + in[31 - i], buf1[31 - i] = in[i] - in[31 - i], i = 0..15
+ buf1[0] = _mm_add_epi32(input[0], input[31]);
+ buf1[31] = _mm_sub_epi32(input[0], input[31]);
+ buf1[1] = _mm_add_epi32(input[1], input[30]);
+ buf1[30] = _mm_sub_epi32(input[1], input[30]);
+ buf1[2] = _mm_add_epi32(input[2], input[29]);
+ buf1[29] = _mm_sub_epi32(input[2], input[29]);
+ buf1[3] = _mm_add_epi32(input[3], input[28]);
+ buf1[28] = _mm_sub_epi32(input[3], input[28]);
+ buf1[4] = _mm_add_epi32(input[4], input[27]);
+ buf1[27] = _mm_sub_epi32(input[4], input[27]);
+ buf1[5] = _mm_add_epi32(input[5], input[26]);
+ buf1[26] = _mm_sub_epi32(input[5], input[26]);
+ buf1[6] = _mm_add_epi32(input[6], input[25]);
+ buf1[25] = _mm_sub_epi32(input[6], input[25]);
+ buf1[7] = _mm_add_epi32(input[7], input[24]);
+ buf1[24] = _mm_sub_epi32(input[7], input[24]);
+ buf1[8] = _mm_add_epi32(input[8], input[23]);
+ buf1[23] = _mm_sub_epi32(input[8], input[23]);
+ buf1[9] = _mm_add_epi32(input[9], input[22]);
+ buf1[22] = _mm_sub_epi32(input[9], input[22]);
+ buf1[10] = _mm_add_epi32(input[10], input[21]);
+ buf1[21] = _mm_sub_epi32(input[10], input[21]);
+ buf1[11] = _mm_add_epi32(input[11], input[20]);
+ buf1[20] = _mm_sub_epi32(input[11], input[20]);
+ buf1[12] = _mm_add_epi32(input[12], input[19]);
+ buf1[19] = _mm_sub_epi32(input[12], input[19]);
+ buf1[13] = _mm_add_epi32(input[13], input[18]);
+ buf1[18] = _mm_sub_epi32(input[13], input[18]);
+ buf1[14] = _mm_add_epi32(input[14], input[17]);
+ buf1[17] = _mm_sub_epi32(input[14], input[17]);
+ buf1[15] = _mm_add_epi32(input[15], input[16]);
+ buf1[16] = _mm_sub_epi32(input[15], input[16]);
+
+ // stage 2: fold 0..15 (pairs i, 15 - i); weighted butterflies on mirrored pairs of 20..27
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+ buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+ buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+ buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+ buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+ buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+ buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+ buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+ buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+ buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+ buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+ buf0[16] = buf1[16];  // 16..19 pass through unchanged
+ buf0[17] = buf1[17];
+ buf0[18] = buf1[18];
+ buf0[19] = buf1[19];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);  // butterfly helper is defined outside this file
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+ buf0[28] = buf1[28];  // 28..31 pass through unchanged
+ buf0[29] = buf1[29];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 3: fold 0..7; butterflies on (10, 13) and (11, 12); fold 16..23 and 24..31 onto themselves
+ cospi = cospi_arr(cos_bit);  // cos_bit is unchanged, so presumably the same table as in stage 2
+ buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+ buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+ buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+ buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+ buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+ buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+ buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+ buf1[8] = buf0[8];  // 8 and 9 pass through unchanged
+ buf1[9] = buf0[9];
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[14] = buf0[14];  // 14 and 15 pass through unchanged
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+ buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+ buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+ buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+ buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+ buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);  // from here the difference is upper minus lower
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+ buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+ buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+ buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+ // stage 4: fold 0..3; butterfly on (5, 6); fold 8..11 and 12..15; butterflies on (18, 29), (19, 28), (20, 27), (21, 26)
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+ buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+ buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+ buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+ buf0[4] = buf1[4];  // 4 and 7 pass through unchanged
+ btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[7] = buf1[7];
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+ buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+ buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+ buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+ buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+ buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+ buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+ buf0[16] = buf1[16];  // 16, 17, 22..25, 30 and 31 pass through unchanged
+ buf0[17] = buf1[17];
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ buf0[22] = buf1[22];
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[25] = buf1[25];
+ buf0[30] = buf1[30];
+ buf0[31] = buf1[31];
+
+ // stage 5: butterflies on (0, 1) and (2, 3); add/sub 4..7; butterflies on (9, 14), (10, 13); add/sub 16..31 in groups of four
+ cospi = cospi_arr(cos_bit);
+ btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3],
+ cos_bit);  // first use of the type1 helper (also defined outside this file)
+ buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+ buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+ buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+ buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+ buf1[8] = buf0[8];  // 8, 11, 12 and 15 pass through unchanged
+ btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ buf1[11] = buf0[11];
+ buf1[12] = buf0[12];
+ buf1[15] = buf0[15];
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+ buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+ buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+ buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+ buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+ buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+ buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+ buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+ buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+ buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+ buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+ buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+ buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+ // stage 6: butterflies on (4, 7), (5, 6); add/sub adjacent pairs of 8..15; butterflies on (17, 30), (18, 29), (21, 26), (22, 25)
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];  // 0..3 pass through unchanged
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6],
+ cos_bit);
+ buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+ buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+ buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+ buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+ buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+ buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+ buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+ buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+ buf0[16] = buf1[16];  // 16, 19, 20, 23, 24, 27, 28 and 31 pass through unchanged
+ btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ buf0[19] = buf1[19];
+ buf0[20] = buf1[20];
+ btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ buf0[23] = buf1[23];
+ buf0[24] = buf1[24];
+ buf0[27] = buf1[27];
+ buf0[28] = buf1[28];
+ buf0[31] = buf1[31];
+
+ // stage 7: butterflies on mirrored pairs of 8..15; add/sub adjacent pairs of 16..31
+ cospi = cospi_arr(cos_bit);
+ buf1[0] = buf0[0];  // 0..7 pass through unchanged
+ buf1[1] = buf0[1];
+ buf1[2] = buf0[2];
+ buf1[3] = buf0[3];
+ buf1[4] = buf0[4];
+ buf1[5] = buf0[5];
+ buf1[6] = buf0[6];
+ buf1[7] = buf0[7];
+ btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15],
+ cos_bit);
+ btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+ buf1[14], cos_bit);
+ btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+ buf1[13], cos_bit);
+ btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+ buf1[12], cos_bit);
+ buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+ buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+ buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+ buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+ buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+ buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+ buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+ buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+ buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+ buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+ buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+ buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+ buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+ buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+ buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+ buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+ // stage 8: butterflies on the mirrored pairs (16 + k, 31 - k), k = 0..7
+ cospi = cospi_arr(cos_bit);
+ buf0[0] = buf1[0];  // 0..15 pass through unchanged
+ buf0[1] = buf1[1];
+ buf0[2] = buf1[2];
+ buf0[3] = buf1[3];
+ buf0[4] = buf1[4];
+ buf0[5] = buf1[5];
+ buf0[6] = buf1[6];
+ buf0[7] = buf1[7];
+ buf0[8] = buf1[8];
+ buf0[9] = buf1[9];
+ buf0[10] = buf1[10];
+ buf0[11] = buf1[11];
+ buf0[12] = buf1[12];
+ buf0[13] = buf1[13];
+ buf0[14] = buf1[14];
+ buf0[15] = buf1[15];
+ btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+ buf0[31], cos_bit);
+ btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+ buf0[30], cos_bit);
+ btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+ buf0[29], cos_bit);
+ btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+ buf0[28], cos_bit);
+ btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+ buf0[27], cos_bit);
+ btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+ buf0[26], cos_bit);
+ btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+ buf0[25], cos_bit);
+ btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+ buf0[24], cos_bit);
+
+ // stage 9: reorder for output: output[i] = buf0[j], where j is i with its 5 bits reversed
+ output[0] = buf0[0];
+ output[1] = buf0[16];
+ output[2] = buf0[8];
+ output[3] = buf0[24];
+ output[4] = buf0[4];
+ output[5] = buf0[20];
+ output[6] = buf0[12];
+ output[7] = buf0[28];
+ output[8] = buf0[2];
+ output[9] = buf0[18];
+ output[10] = buf0[10];
+ output[11] = buf0[26];
+ output[12] = buf0[6];
+ output[13] = buf0[22];
+ output[14] = buf0[14];
+ output[15] = buf0[30];
+ output[16] = buf0[1];
+ output[17] = buf0[17];
+ output[18] = buf0[9];
+ output[19] = buf0[25];
+ output[20] = buf0[5];
+ output[21] = buf0[21];
+ output[22] = buf0[13];
+ output[23] = buf0[29];
+ output[24] = buf0[3];
+ output[25] = buf0[19];
+ output[26] = buf0[11];
+ output[27] = buf0[27];
+ output[28] = buf0[7];
+ output[29] = buf0[23];
+ output[30] = buf0[15];
+ output[31] = buf0[31];
+}
+
+/*
+ * Five-stage butterfly network over four vectors of packed 32-bit lanes; the
+ * name marks it as the forward 4-point ADST.
+ *
+ * input:       four source vectors, read as input[j * col_num + col].
+ * output:      four result vectors, written with the same indexing.
+ * cos_bit:     passed to cospi_arr() to pick the weight table and forwarded to
+ *              every btf_32_sse4_1_type0() butterfly.
+ * stage_range: unused; kept so existing callers keep compiling.
+ */
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+                           const int8_t cos_bit, const int8_t *stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  // Fetched once: cos_bit is const, so the table cannot change per stage.
+  const int32_t *const cospi = cospi_arr(cos_bit);
+  __m128i buf0[4];
+  __m128i buf1[4];
+  const int col_num = txfm_size / num_per_128;  // 4 / 4: a single column
+  (void)stage_range;
+  for (int col = 0; col < col_num; col++) {
+    // stage 0: load this column's four vectors
+    for (int j = 0; j < 4; ++j) {
+      buf0[j] = input[j * col_num + col];
+    }
+
+    // stage 1: reorder the inputs as (3, 0, 1, 2)
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2: weighted butterflies on pairs (0, 1) and (2, 3)
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                        cos_bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+                        buf0[3], cos_bit);
+
+    // stage 3: add/sub across pairs (0, 2) and (1, 3)
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+    // stage 4: 0 and 1 pass through; (2, 3) get the cospi[32] butterfly
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], cos_bit);
+
+    // stage 5: final ordering (0, 2, 3, 1) with outputs 1 and 3 negated
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_setzero_si128(), buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = _mm_sub_epi32(_mm_setzero_si128(), buf0[1]);
+
+    // write this column's four results back
+    for (int j = 0; j < 4; ++j) {
+      output[j * col_num + col] = buf1[j];
+    }
+  }
+}
+
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit, const int instride,
+ const int outstride) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32 = _mm_set1_epi32(-cospi[32]);
+ __m128i cospi_p32 = _mm_set1_epi32(cospi[32]);
+ __m128i cospi_m16 = _mm_set1_epi32(-cospi[16]);
+ __m128i cospi_p48 = _mm_set1_epi32(cospi[48]);
+ __m128i cospi_m48 = _mm_set1_epi32(-cospi[48]);
+ __m128i cospi_p16 = _mm_set1_epi32(cospi[16]);
+ __m128i cospi_m08 = _mm_set1_epi32(-cospi[8]);
+ __m128i cospi_p56 = _mm_set1_epi32(cospi[56]);
+ __m128i cospi_m56 = _mm_set1_epi32(-cospi[56]);
+ __m128i cospi_m40 = _mm_set1_epi32(-cospi[40]);
+ __m128i cospi_p24 = _mm_set1_epi32(cospi[24]);
+ __m128i cospi_m24 = _mm_set1_epi32(-cospi[24]);
+ __m128i cospi_p08 = _mm_set1_epi32(cospi[8]);
+ __m128i cospi_p40 = _mm_set1_epi32(cospi[40]);
+ __m128i cospi_p60 = _mm_set1_epi32(cospi[60]);
+ __m128i cospi_p04 = _mm_set1_epi32(cospi[4]);
+ __m128i cospi_p28 = _mm_set1_epi32(cospi[28]);
+ __m128i cospi_p36 = _mm_set1_epi32(cospi[36]);
+ __m128i cospi_p44 = _mm_set1_epi32(cospi[44]);
+ __m128i cospi_p20 = _mm_set1_epi32(cospi[20]);
+ __m128i cospi_p12 = _mm_set1_epi32(cospi[12]);
+ __m128i cospi_p52 = _mm_set1_epi32(cospi[52]);
+ __m128i cospi_m04 = _mm_set1_epi32(-cospi[4]);
+ __m128i cospi_m60 = _mm_set1_epi32(-cospi[60]);
+ __m128i cospi_m36 = _mm_set1_epi32(-cospi[36]);
+ __m128i cospi_m28 = _mm_set1_epi32(-cospi[28]);
+ __m128i cospi_m20 = _mm_set1_epi32(-cospi[20]);
+ __m128i cospi_m44 = _mm_set1_epi32(-cospi[44]);
+ __m128i cospi_m52 = _mm_set1_epi32(-cospi[52]);
+ __m128i cospi_m12 = _mm_set1_epi32(-cospi[12]);
+ __m128i cospi_p62 = _mm_set1_epi32(cospi[62]);
+ __m128i cospi_p02 = _mm_set1_epi32(cospi[2]);
+ __m128i cospi_p30 = _mm_set1_epi32(cospi[30]);
+ __m128i cospi_p34 = _mm_set1_epi32(cospi[34]);
+ __m128i cospi_p46 = _mm_set1_epi32(cospi[46]);
+ __m128i cospi_p18 = _mm_set1_epi32(cospi[18]);
+ __m128i cospi_p14 = _mm_set1_epi32(cospi[14]);
+ __m128i cospi_p50 = _mm_set1_epi32(cospi[50]);
+ __m128i cospi_p54 = _mm_set1_epi32(cospi[54]);
+ __m128i cospi_p10 = _mm_set1_epi32(cospi[10]);
+ __m128i cospi_p22 = _mm_set1_epi32(cospi[22]);
+ __m128i cospi_p42 = _mm_set1_epi32(cospi[42]);
+ __m128i cospi_p38 = _mm_set1_epi32(cospi[38]);
+ __m128i cospi_p26 = _mm_set1_epi32(cospi[26]);
+ __m128i cospi_p06 = _mm_set1_epi32(cospi[6]);
+ __m128i cospi_p58 = _mm_set1_epi32(cospi[58]);
+ __m128i cospi_p63 = _mm_set1_epi32(cospi[63]);
+ __m128i cospi_p01 = _mm_set1_epi32(cospi[1]);
+ __m128i cospi_p31 = _mm_set1_epi32(cospi[31]);
+ __m128i cospi_p33 = _mm_set1_epi32(cospi[33]);
+ __m128i cospi_p47 = _mm_set1_epi32(cospi[47]);
+ __m128i cospi_p17 = _mm_set1_epi32(cospi[17]);
+ __m128i cospi_p15 = _mm_set1_epi32(cospi[15]);
+ __m128i cospi_p49 = _mm_set1_epi32(cospi[49]);
+ __m128i cospi_p55 = _mm_set1_epi32(cospi[55]);
+ __m128i cospi_p09 = _mm_set1_epi32(cospi[9]);
+ __m128i cospi_p23 = _mm_set1_epi32(cospi[23]);
+ __m128i cospi_p41 = _mm_set1_epi32(cospi[41]);
+ __m128i cospi_p39 = _mm_set1_epi32(cospi[39]);
+ __m128i cospi_p25 = _mm_set1_epi32(cospi[25]);
+ __m128i cospi_p07 = _mm_set1_epi32(cospi[7]);
+ __m128i cospi_p57 = _mm_set1_epi32(cospi[57]);
+ __m128i cospi_p59 = _mm_set1_epi32(cospi[59]);
+ __m128i cospi_p05 = _mm_set1_epi32(cospi[5]);
+ __m128i cospi_p27 = _mm_set1_epi32(cospi[27]);
+ __m128i cospi_p37 = _mm_set1_epi32(cospi[37]);
+ __m128i cospi_p43 = _mm_set1_epi32(cospi[43]);
+ __m128i cospi_p21 = _mm_set1_epi32(cospi[21]);
+ __m128i cospi_p11 = _mm_set1_epi32(cospi[11]);
+ __m128i cospi_p53 = _mm_set1_epi32(cospi[53]);
+ __m128i cospi_p51 = _mm_set1_epi32(cospi[51]);
+ __m128i cospi_p13 = _mm_set1_epi32(cospi[13]);
+ __m128i cospi_p19 = _mm_set1_epi32(cospi[19]);
+ __m128i cospi_p45 = _mm_set1_epi32(cospi[45]);
+ __m128i cospi_p35 = _mm_set1_epi32(cospi[35]);
+ __m128i cospi_p29 = _mm_set1_epi32(cospi[29]);
+ __m128i cospi_p03 = _mm_set1_epi32(cospi[3]);
+ __m128i cospi_p61 = _mm_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_add_epi32(input[0 * instride], input[63 * instride]);
+ x1[63] = _mm_sub_epi32(input[0 * instride], input[63 * instride]);
+ x1[1] = _mm_add_epi32(input[1 * instride], input[62 * instride]);
+ x1[62] = _mm_sub_epi32(input[1 * instride], input[62 * instride]);
+ x1[2] = _mm_add_epi32(input[2 * instride], input[61 * instride]);
+ x1[61] = _mm_sub_epi32(input[2 * instride], input[61 * instride]);
+ x1[3] = _mm_add_epi32(input[3 * instride], input[60 * instride]);
+ x1[60] = _mm_sub_epi32(input[3 * instride], input[60 * instride]);
+ x1[4] = _mm_add_epi32(input[4 * instride], input[59 * instride]);
+ x1[59] = _mm_sub_epi32(input[4 * instride], input[59 * instride]);
+ x1[5] = _mm_add_epi32(input[5 * instride], input[58 * instride]);
+ x1[58] = _mm_sub_epi32(input[5 * instride], input[58 * instride]);
+ x1[6] = _mm_add_epi32(input[6 * instride], input[57 * instride]);
+ x1[57] = _mm_sub_epi32(input[6 * instride], input[57 * instride]);
+ x1[7] = _mm_add_epi32(input[7 * instride], input[56 * instride]);
+ x1[56] = _mm_sub_epi32(input[7 * instride], input[56 * instride]);
+ x1[8] = _mm_add_epi32(input[8 * instride], input[55 * instride]);
+ x1[55] = _mm_sub_epi32(input[8 * instride], input[55 * instride]);
+ x1[9] = _mm_add_epi32(input[9 * instride], input[54 * instride]);
+ x1[54] = _mm_sub_epi32(input[9 * instride], input[54 * instride]);
+ x1[10] = _mm_add_epi32(input[10 * instride], input[53 * instride]);
+ x1[53] = _mm_sub_epi32(input[10 * instride], input[53 * instride]);
+ x1[11] = _mm_add_epi32(input[11 * instride], input[52 * instride]);
+ x1[52] = _mm_sub_epi32(input[11 * instride], input[52 * instride]);
+ x1[12] = _mm_add_epi32(input[12 * instride], input[51 * instride]);
+ x1[51] = _mm_sub_epi32(input[12 * instride], input[51 * instride]);
+ x1[13] = _mm_add_epi32(input[13 * instride], input[50 * instride]);
+ x1[50] = _mm_sub_epi32(input[13 * instride], input[50 * instride]);
+ x1[14] = _mm_add_epi32(input[14 * instride], input[49 * instride]);
+ x1[49] = _mm_sub_epi32(input[14 * instride], input[49 * instride]);
+ x1[15] = _mm_add_epi32(input[15 * instride], input[48 * instride]);
+ x1[48] = _mm_sub_epi32(input[15 * instride], input[48 * instride]);
+ x1[16] = _mm_add_epi32(input[16 * instride], input[47 * instride]);
+ x1[47] = _mm_sub_epi32(input[16 * instride], input[47 * instride]);
+ x1[17] = _mm_add_epi32(input[17 * instride], input[46 * instride]);
+ x1[46] = _mm_sub_epi32(input[17 * instride], input[46 * instride]);
+ x1[18] = _mm_add_epi32(input[18 * instride], input[45 * instride]);
+ x1[45] = _mm_sub_epi32(input[18 * instride], input[45 * instride]);
+ x1[19] = _mm_add_epi32(input[19 * instride], input[44 * instride]);
+ x1[44] = _mm_sub_epi32(input[19 * instride], input[44 * instride]);
+ x1[20] = _mm_add_epi32(input[20 * instride], input[43 * instride]);
+ x1[43] = _mm_sub_epi32(input[20 * instride], input[43 * instride]);
+ x1[21] = _mm_add_epi32(input[21 * instride], input[42 * instride]);
+ x1[42] = _mm_sub_epi32(input[21 * instride], input[42 * instride]);
+ x1[22] = _mm_add_epi32(input[22 * instride], input[41 * instride]);
+ x1[41] = _mm_sub_epi32(input[22 * instride], input[41 * instride]);
+ x1[23] = _mm_add_epi32(input[23 * instride], input[40 * instride]);
+ x1[40] = _mm_sub_epi32(input[23 * instride], input[40 * instride]);
+ x1[24] = _mm_add_epi32(input[24 * instride], input[39 * instride]);
+ x1[39] = _mm_sub_epi32(input[24 * instride], input[39 * instride]);
+ x1[25] = _mm_add_epi32(input[25 * instride], input[38 * instride]);
+ x1[38] = _mm_sub_epi32(input[25 * instride], input[38 * instride]);
+ x1[26] = _mm_add_epi32(input[26 * instride], input[37 * instride]);
+ x1[37] = _mm_sub_epi32(input[26 * instride], input[37 * instride]);
+ x1[27] = _mm_add_epi32(input[27 * instride], input[36 * instride]);
+ x1[36] = _mm_sub_epi32(input[27 * instride], input[36 * instride]);
+ x1[28] = _mm_add_epi32(input[28 * instride], input[35 * instride]);
+ x1[35] = _mm_sub_epi32(input[28 * instride], input[35 * instride]);
+ x1[29] = _mm_add_epi32(input[29 * instride], input[34 * instride]);
+ x1[34] = _mm_sub_epi32(input[29 * instride], input[34 * instride]);
+ x1[30] = _mm_add_epi32(input[30 * instride], input[33 * instride]);
+ x1[33] = _mm_sub_epi32(input[30 * instride], input[33 * instride]);
+ x1[31] = _mm_add_epi32(input[31 * instride], input[32 * instride]);
+ x1[32] = _mm_sub_epi32(input[31 * instride], input[32 * instride]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_add_epi32(x1[0], x1[31]);
+ x2[31] = _mm_sub_epi32(x1[0], x1[31]);
+ x2[1] = _mm_add_epi32(x1[1], x1[30]);
+ x2[30] = _mm_sub_epi32(x1[1], x1[30]);
+ x2[2] = _mm_add_epi32(x1[2], x1[29]);
+ x2[29] = _mm_sub_epi32(x1[2], x1[29]);
+ x2[3] = _mm_add_epi32(x1[3], x1[28]);
+ x2[28] = _mm_sub_epi32(x1[3], x1[28]);
+ x2[4] = _mm_add_epi32(x1[4], x1[27]);
+ x2[27] = _mm_sub_epi32(x1[4], x1[27]);
+ x2[5] = _mm_add_epi32(x1[5], x1[26]);
+ x2[26] = _mm_sub_epi32(x1[5], x1[26]);
+ x2[6] = _mm_add_epi32(x1[6], x1[25]);
+ x2[25] = _mm_sub_epi32(x1[6], x1[25]);
+ x2[7] = _mm_add_epi32(x1[7], x1[24]);
+ x2[24] = _mm_sub_epi32(x1[7], x1[24]);
+ x2[8] = _mm_add_epi32(x1[8], x1[23]);
+ x2[23] = _mm_sub_epi32(x1[8], x1[23]);
+ x2[9] = _mm_add_epi32(x1[9], x1[22]);
+ x2[22] = _mm_sub_epi32(x1[9], x1[22]);
+ x2[10] = _mm_add_epi32(x1[10], x1[21]);
+ x2[21] = _mm_sub_epi32(x1[10], x1[21]);
+ x2[11] = _mm_add_epi32(x1[11], x1[20]);
+ x2[20] = _mm_sub_epi32(x1[11], x1[20]);
+ x2[12] = _mm_add_epi32(x1[12], x1[19]);
+ x2[19] = _mm_sub_epi32(x1[12], x1[19]);
+ x2[13] = _mm_add_epi32(x1[13], x1[18]);
+ x2[18] = _mm_sub_epi32(x1[13], x1[18]);
+ x2[14] = _mm_add_epi32(x1[14], x1[17]);
+ x2[17] = _mm_sub_epi32(x1[14], x1[17]);
+ x2[15] = _mm_add_epi32(x1[15], x1[16]);
+ x2[16] = _mm_sub_epi32(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[40], x1[55], x2[40], x2[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[41], x1[54], x2[41], x2[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[42], x1[53], x2[42], x2[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[43], x1[52], x2[43], x2[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[44], x1[51], x2[44], x2[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[45], x1[50], x2[45], x2[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[46], x1[49], x2[46], x2[49],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x1[47], x1[48], x2[47], x2[48],
+ __rounding, cos_bit);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_add_epi32(x2[0], x2[15]);
+ x3[15] = _mm_sub_epi32(x2[0], x2[15]);
+ x3[1] = _mm_add_epi32(x2[1], x2[14]);
+ x3[14] = _mm_sub_epi32(x2[1], x2[14]);
+ x3[2] = _mm_add_epi32(x2[2], x2[13]);
+ x3[13] = _mm_sub_epi32(x2[2], x2[13]);
+ x3[3] = _mm_add_epi32(x2[3], x2[12]);
+ x3[12] = _mm_sub_epi32(x2[3], x2[12]);
+ x3[4] = _mm_add_epi32(x2[4], x2[11]);
+ x3[11] = _mm_sub_epi32(x2[4], x2[11]);
+ x3[5] = _mm_add_epi32(x2[5], x2[10]);
+ x3[10] = _mm_sub_epi32(x2[5], x2[10]);
+ x3[6] = _mm_add_epi32(x2[6], x2[9]);
+ x3[9] = _mm_sub_epi32(x2[6], x2[9]);
+ x3[7] = _mm_add_epi32(x2[7], x2[8]);
+ x3[8] = _mm_sub_epi32(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[20], x2[27], x3[20], x3[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[21], x2[26], x3[21], x3[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[22], x2[25], x3[22], x3[25],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x2[23], x2[24], x3[23], x3[24],
+ __rounding, cos_bit);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_add_epi32(x2[32], x2[47]);
+ x3[47] = _mm_sub_epi32(x2[32], x2[47]);
+ x3[33] = _mm_add_epi32(x2[33], x2[46]);
+ x3[46] = _mm_sub_epi32(x2[33], x2[46]);
+ x3[34] = _mm_add_epi32(x2[34], x2[45]);
+ x3[45] = _mm_sub_epi32(x2[34], x2[45]);
+ x3[35] = _mm_add_epi32(x2[35], x2[44]);
+ x3[44] = _mm_sub_epi32(x2[35], x2[44]);
+ x3[36] = _mm_add_epi32(x2[36], x2[43]);
+ x3[43] = _mm_sub_epi32(x2[36], x2[43]);
+ x3[37] = _mm_add_epi32(x2[37], x2[42]);
+ x3[42] = _mm_sub_epi32(x2[37], x2[42]);
+ x3[38] = _mm_add_epi32(x2[38], x2[41]);
+ x3[41] = _mm_sub_epi32(x2[38], x2[41]);
+ x3[39] = _mm_add_epi32(x2[39], x2[40]);
+ x3[40] = _mm_sub_epi32(x2[39], x2[40]);
+ x3[48] = _mm_sub_epi32(x2[63], x2[48]);
+ x3[63] = _mm_add_epi32(x2[63], x2[48]);
+ x3[49] = _mm_sub_epi32(x2[62], x2[49]);
+ x3[62] = _mm_add_epi32(x2[62], x2[49]);
+ x3[50] = _mm_sub_epi32(x2[61], x2[50]);
+ x3[61] = _mm_add_epi32(x2[61], x2[50]);
+ x3[51] = _mm_sub_epi32(x2[60], x2[51]);
+ x3[60] = _mm_add_epi32(x2[60], x2[51]);
+ x3[52] = _mm_sub_epi32(x2[59], x2[52]);
+ x3[59] = _mm_add_epi32(x2[59], x2[52]);
+ x3[53] = _mm_sub_epi32(x2[58], x2[53]);
+ x3[58] = _mm_add_epi32(x2[58], x2[53]);
+ x3[54] = _mm_sub_epi32(x2[57], x2[54]);
+ x3[57] = _mm_add_epi32(x2[57], x2[54]);
+ x3[55] = _mm_sub_epi32(x2[56], x2[55]);
+ x3[56] = _mm_add_epi32(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_add_epi32(x3[0], x3[7]);
+ x4[7] = _mm_sub_epi32(x3[0], x3[7]);
+ x4[1] = _mm_add_epi32(x3[1], x3[6]);
+ x4[6] = _mm_sub_epi32(x3[1], x3[6]);
+ x4[2] = _mm_add_epi32(x3[2], x3[5]);
+ x4[5] = _mm_sub_epi32(x3[2], x3[5]);
+ x4[3] = _mm_add_epi32(x3[3], x3[4]);
+ x4[4] = _mm_sub_epi32(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[10], x3[13], x4[10], x4[13],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x3[11], x3[12], x4[11], x4[12],
+ __rounding, cos_bit);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_add_epi32(x3[16], x3[23]);
+ x4[23] = _mm_sub_epi32(x3[16], x3[23]);
+ x4[17] = _mm_add_epi32(x3[17], x3[22]);
+ x4[22] = _mm_sub_epi32(x3[17], x3[22]);
+ x4[18] = _mm_add_epi32(x3[18], x3[21]);
+ x4[21] = _mm_sub_epi32(x3[18], x3[21]);
+ x4[19] = _mm_add_epi32(x3[19], x3[20]);
+ x4[20] = _mm_sub_epi32(x3[19], x3[20]);
+ x4[24] = _mm_sub_epi32(x3[31], x3[24]);
+ x4[31] = _mm_add_epi32(x3[31], x3[24]);
+ x4[25] = _mm_sub_epi32(x3[30], x3[25]);
+ x4[30] = _mm_add_epi32(x3[30], x3[25]);
+ x4[26] = _mm_sub_epi32(x3[29], x3[26]);
+ x4[29] = _mm_add_epi32(x3[29], x3[26]);
+ x4[27] = _mm_sub_epi32(x3[28], x3[27]);
+ x4[28] = _mm_add_epi32(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[36], x3[59], x4[36], x4[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[37], x3[58], x4[37], x4[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[38], x3[57], x4[38], x4[57],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x3[39], x3[56], x4[39], x4[56],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[40], x3[55], x4[40], x4[55],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[41], x3[54], x4[41], x4[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[42], x3[53], x4[42], x4[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x3[43], x3[52], x4[43], x4[52],
+ __rounding, cos_bit);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_add_epi32(x4[0], x4[3]);
+ x5[3] = _mm_sub_epi32(x4[0], x4[3]);
+ x5[1] = _mm_add_epi32(x4[1], x4[2]);
+ x5[2] = _mm_sub_epi32(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_32_type0_sse4_1_new(cospi_m32, cospi_p32, x4[5], x4[6], x5[5], x5[6],
+ __rounding, cos_bit);
+ x5[7] = x4[7];
+ x5[8] = _mm_add_epi32(x4[8], x4[11]);
+ x5[11] = _mm_sub_epi32(x4[8], x4[11]);
+ x5[9] = _mm_add_epi32(x4[9], x4[10]);
+ x5[10] = _mm_sub_epi32(x4[9], x4[10]);
+ x5[12] = _mm_sub_epi32(x4[15], x4[12]);
+ x5[15] = _mm_add_epi32(x4[15], x4[12]);
+ x5[13] = _mm_sub_epi32(x4[14], x4[13]);
+ x5[14] = _mm_add_epi32(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[18], x4[29], x5[18], x5[29],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x4[19], x4[28], x5[19], x5[28],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[20], x4[27], x5[20], x5[27],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x4[21], x4[26], x5[21], x5[26],
+ __rounding, cos_bit);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_add_epi32(x4[32], x4[39]);
+ x5[39] = _mm_sub_epi32(x4[32], x4[39]);
+ x5[33] = _mm_add_epi32(x4[33], x4[38]);
+ x5[38] = _mm_sub_epi32(x4[33], x4[38]);
+ x5[34] = _mm_add_epi32(x4[34], x4[37]);
+ x5[37] = _mm_sub_epi32(x4[34], x4[37]);
+ x5[35] = _mm_add_epi32(x4[35], x4[36]);
+ x5[36] = _mm_sub_epi32(x4[35], x4[36]);
+ x5[40] = _mm_sub_epi32(x4[47], x4[40]);
+ x5[47] = _mm_add_epi32(x4[47], x4[40]);
+ x5[41] = _mm_sub_epi32(x4[46], x4[41]);
+ x5[46] = _mm_add_epi32(x4[46], x4[41]);
+ x5[42] = _mm_sub_epi32(x4[45], x4[42]);
+ x5[45] = _mm_add_epi32(x4[45], x4[42]);
+ x5[43] = _mm_sub_epi32(x4[44], x4[43]);
+ x5[44] = _mm_add_epi32(x4[44], x4[43]);
+ x5[48] = _mm_add_epi32(x4[48], x4[55]);
+ x5[55] = _mm_sub_epi32(x4[48], x4[55]);
+ x5[49] = _mm_add_epi32(x4[49], x4[54]);
+ x5[54] = _mm_sub_epi32(x4[49], x4[54]);
+ x5[50] = _mm_add_epi32(x4[50], x4[53]);
+ x5[53] = _mm_sub_epi32(x4[50], x4[53]);
+ x5[51] = _mm_add_epi32(x4[51], x4[52]);
+ x5[52] = _mm_sub_epi32(x4[51], x4[52]);
+ x5[56] = _mm_sub_epi32(x4[63], x4[56]);
+ x5[63] = _mm_add_epi32(x4[63], x4[56]);
+ x5[57] = _mm_sub_epi32(x4[62], x4[57]);
+ x5[62] = _mm_add_epi32(x4[62], x4[57]);
+ x5[58] = _mm_sub_epi32(x4[61], x4[58]);
+ x5[61] = _mm_add_epi32(x4[61], x4[58]);
+ x5[59] = _mm_sub_epi32(x4[60], x4[59]);
+ x5[60] = _mm_add_epi32(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_32_type0_sse4_1_new(cospi_p32, cospi_p32, x5[0], x5[1], x6[0], x6[1],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p48, cospi_p16, x5[2], x5[3], x6[2], x6[3],
+ __rounding, cos_bit);
+ x6[4] = _mm_add_epi32(x5[4], x5[5]);
+ x6[5] = _mm_sub_epi32(x5[4], x5[5]);
+ x6[6] = _mm_sub_epi32(x5[7], x5[6]);
+ x6[7] = _mm_add_epi32(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_32_type0_sse4_1_new(cospi_m16, cospi_p48, x5[9], x5[14], x6[9], x6[14],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m48, cospi_m16, x5[10], x5[13], x6[10], x6[13],
+ __rounding, cos_bit);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_add_epi32(x5[16], x5[19]);
+ x6[19] = _mm_sub_epi32(x5[16], x5[19]);
+ x6[17] = _mm_add_epi32(x5[17], x5[18]);
+ x6[18] = _mm_sub_epi32(x5[17], x5[18]);
+ x6[20] = _mm_sub_epi32(x5[23], x5[20]);
+ x6[23] = _mm_add_epi32(x5[23], x5[20]);
+ x6[21] = _mm_sub_epi32(x5[22], x5[21]);
+ x6[22] = _mm_add_epi32(x5[22], x5[21]);
+ x6[24] = _mm_add_epi32(x5[24], x5[27]);
+ x6[27] = _mm_sub_epi32(x5[24], x5[27]);
+ x6[25] = _mm_add_epi32(x5[25], x5[26]);
+ x6[26] = _mm_sub_epi32(x5[25], x5[26]);
+ x6[28] = _mm_sub_epi32(x5[31], x5[28]);
+ x6[31] = _mm_add_epi32(x5[31], x5[28]);
+ x6[29] = _mm_sub_epi32(x5[30], x5[29]);
+ x6[30] = _mm_add_epi32(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[34], x5[61], x6[34], x6[61],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x5[35], x5[60], x6[35], x6[60],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[36], x5[59], x6[36], x6[59],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x5[37], x5[58], x6[37], x6[58],
+ __rounding, cos_bit);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[42], x5[53], x6[42], x6[53],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x5[43], x5[52], x6[43], x6[52],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[44], x5[51], x6[44], x6[51],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x5[45], x5[50], x6[45], x6[50],
+ __rounding, cos_bit);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_32_type1_sse4_1_new(cospi_p56, cospi_p08, x6[4], x6[7], x7[4], x7[7],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p24, cospi_p40, x6[5], x6[6], x7[5], x7[6],
+ __rounding, cos_bit);
+ x7[8] = _mm_add_epi32(x6[8], x6[9]);
+ x7[9] = _mm_sub_epi32(x6[8], x6[9]);
+ x7[10] = _mm_sub_epi32(x6[11], x6[10]);
+ x7[11] = _mm_add_epi32(x6[11], x6[10]);
+ x7[12] = _mm_add_epi32(x6[12], x6[13]);
+ x7[13] = _mm_sub_epi32(x6[12], x6[13]);
+ x7[14] = _mm_sub_epi32(x6[15], x6[14]);
+ x7[15] = _mm_add_epi32(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_32_type0_sse4_1_new(cospi_m08, cospi_p56, x6[17], x6[30], x7[17], x7[30],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m56, cospi_m08, x6[18], x6[29], x7[18], x7[29],
+ __rounding, cos_bit);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_32_type0_sse4_1_new(cospi_m40, cospi_p24, x6[21], x6[26], x7[21], x7[26],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m24, cospi_m40, x6[22], x6[25], x7[22], x7[25],
+ __rounding, cos_bit);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_add_epi32(x6[32], x6[35]);
+ x7[35] = _mm_sub_epi32(x6[32], x6[35]);
+ x7[33] = _mm_add_epi32(x6[33], x6[34]);
+ x7[34] = _mm_sub_epi32(x6[33], x6[34]);
+ x7[36] = _mm_sub_epi32(x6[39], x6[36]);
+ x7[39] = _mm_add_epi32(x6[39], x6[36]);
+ x7[37] = _mm_sub_epi32(x6[38], x6[37]);
+ x7[38] = _mm_add_epi32(x6[38], x6[37]);
+ x7[40] = _mm_add_epi32(x6[40], x6[43]);
+ x7[43] = _mm_sub_epi32(x6[40], x6[43]);
+ x7[41] = _mm_add_epi32(x6[41], x6[42]);
+ x7[42] = _mm_sub_epi32(x6[41], x6[42]);
+ x7[44] = _mm_sub_epi32(x6[47], x6[44]);
+ x7[47] = _mm_add_epi32(x6[47], x6[44]);
+ x7[45] = _mm_sub_epi32(x6[46], x6[45]);
+ x7[46] = _mm_add_epi32(x6[46], x6[45]);
+ x7[48] = _mm_add_epi32(x6[48], x6[51]);
+ x7[51] = _mm_sub_epi32(x6[48], x6[51]);
+ x7[49] = _mm_add_epi32(x6[49], x6[50]);
+ x7[50] = _mm_sub_epi32(x6[49], x6[50]);
+ x7[52] = _mm_sub_epi32(x6[55], x6[52]);
+ x7[55] = _mm_add_epi32(x6[55], x6[52]);
+ x7[53] = _mm_sub_epi32(x6[54], x6[53]);
+ x7[54] = _mm_add_epi32(x6[54], x6[53]);
+ x7[56] = _mm_add_epi32(x6[56], x6[59]);
+ x7[59] = _mm_sub_epi32(x6[56], x6[59]);
+ x7[57] = _mm_add_epi32(x6[57], x6[58]);
+ x7[58] = _mm_sub_epi32(x6[57], x6[58]);
+ x7[60] = _mm_sub_epi32(x6[63], x6[60]);
+ x7[63] = _mm_add_epi32(x6[63], x6[60]);
+ x7[61] = _mm_sub_epi32(x6[62], x6[61]);
+ x7[62] = _mm_add_epi32(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_32_type1_sse4_1_new(cospi_p60, cospi_p04, x7[8], x7[15], x8[8], x8[15],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p28, cospi_p36, x7[9], x7[14], x8[9], x8[14],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p44, cospi_p20, x7[10], x7[13], x8[10], x8[13],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p12, cospi_p52, x7[11], x7[12], x8[11], x8[12],
+ __rounding, cos_bit);
+ x8[16] = _mm_add_epi32(x7[16], x7[17]);
+ x8[17] = _mm_sub_epi32(x7[16], x7[17]);
+ x8[18] = _mm_sub_epi32(x7[19], x7[18]);
+ x8[19] = _mm_add_epi32(x7[19], x7[18]);
+ x8[20] = _mm_add_epi32(x7[20], x7[21]);
+ x8[21] = _mm_sub_epi32(x7[20], x7[21]);
+ x8[22] = _mm_sub_epi32(x7[23], x7[22]);
+ x8[23] = _mm_add_epi32(x7[23], x7[22]);
+ x8[24] = _mm_add_epi32(x7[24], x7[25]);
+ x8[25] = _mm_sub_epi32(x7[24], x7[25]);
+ x8[26] = _mm_sub_epi32(x7[27], x7[26]);
+ x8[27] = _mm_add_epi32(x7[27], x7[26]);
+ x8[28] = _mm_add_epi32(x7[28], x7[29]);
+ x8[29] = _mm_sub_epi32(x7[28], x7[29]);
+ x8[30] = _mm_sub_epi32(x7[31], x7[30]);
+ x8[31] = _mm_add_epi32(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_32_type0_sse4_1_new(cospi_m04, cospi_p60, x7[33], x7[62], x8[33], x8[62],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m60, cospi_m04, x7[34], x7[61], x8[34], x8[61],
+ __rounding, cos_bit);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_32_type0_sse4_1_new(cospi_m36, cospi_p28, x7[37], x7[58], x8[37], x8[58],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m28, cospi_m36, x7[38], x7[57], x8[38], x8[57],
+ __rounding, cos_bit);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_32_type0_sse4_1_new(cospi_m20, cospi_p44, x7[41], x7[54], x8[41], x8[54],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m44, cospi_m20, x7[42], x7[53], x8[42], x8[53],
+ __rounding, cos_bit);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_32_type0_sse4_1_new(cospi_m52, cospi_p12, x7[45], x7[50], x8[45], x8[50],
+ __rounding, cos_bit);
+ btf_32_type0_sse4_1_new(cospi_m12, cospi_m52, x7[46], x7[49], x8[46], x8[49],
+ __rounding, cos_bit);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_32_type1_sse4_1_new(cospi_p62, cospi_p02, x8[16], x8[31], x9[16], x9[31],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p30, cospi_p34, x8[17], x8[30], x9[17], x9[30],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p46, cospi_p18, x8[18], x8[29], x9[18], x9[29],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p14, cospi_p50, x8[19], x8[28], x9[19], x9[28],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p54, cospi_p10, x8[20], x8[27], x9[20], x9[27],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p22, cospi_p42, x8[21], x8[26], x9[21], x9[26],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p38, cospi_p26, x8[22], x8[25], x9[22], x9[25],
+ __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p06, cospi_p58, x8[23], x8[24], x9[23], x9[24],
+ __rounding, cos_bit);
+ x9[32] = _mm_add_epi32(x8[32], x8[33]);
+ x9[33] = _mm_sub_epi32(x8[32], x8[33]);
+ x9[34] = _mm_sub_epi32(x8[35], x8[34]);
+ x9[35] = _mm_add_epi32(x8[35], x8[34]);
+ x9[36] = _mm_add_epi32(x8[36], x8[37]);
+ x9[37] = _mm_sub_epi32(x8[36], x8[37]);
+ x9[38] = _mm_sub_epi32(x8[39], x8[38]);
+ x9[39] = _mm_add_epi32(x8[39], x8[38]);
+ x9[40] = _mm_add_epi32(x8[40], x8[41]);
+ x9[41] = _mm_sub_epi32(x8[40], x8[41]);
+ x9[42] = _mm_sub_epi32(x8[43], x8[42]);
+ x9[43] = _mm_add_epi32(x8[43], x8[42]);
+ x9[44] = _mm_add_epi32(x8[44], x8[45]);
+ x9[45] = _mm_sub_epi32(x8[44], x8[45]);
+ x9[46] = _mm_sub_epi32(x8[47], x8[46]);
+ x9[47] = _mm_add_epi32(x8[47], x8[46]);
+ x9[48] = _mm_add_epi32(x8[48], x8[49]);
+ x9[49] = _mm_sub_epi32(x8[48], x8[49]);
+ x9[50] = _mm_sub_epi32(x8[51], x8[50]);
+ x9[51] = _mm_add_epi32(x8[51], x8[50]);
+ x9[52] = _mm_add_epi32(x8[52], x8[53]);
+ x9[53] = _mm_sub_epi32(x8[52], x8[53]);
+ x9[54] = _mm_sub_epi32(x8[55], x8[54]);
+ x9[55] = _mm_add_epi32(x8[55], x8[54]);
+ x9[56] = _mm_add_epi32(x8[56], x8[57]);
+ x9[57] = _mm_sub_epi32(x8[56], x8[57]);
+ x9[58] = _mm_sub_epi32(x8[59], x8[58]);
+ x9[59] = _mm_add_epi32(x8[59], x8[58]);
+ x9[60] = _mm_add_epi32(x8[60], x8[61]);
+ x9[61] = _mm_sub_epi32(x8[60], x8[61]);
+ x9[62] = _mm_sub_epi32(x8[63], x8[62]);
+ x9[63] = _mm_add_epi32(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_32_type1_sse4_1_new(cospi_p63, cospi_p01, x9[32], x9[63], x10[32],
+ x10[63], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p31, cospi_p33, x9[33], x9[62], x10[33],
+ x10[62], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p47, cospi_p17, x9[34], x9[61], x10[34],
+ x10[61], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p15, cospi_p49, x9[35], x9[60], x10[35],
+ x10[60], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p55, cospi_p09, x9[36], x9[59], x10[36],
+ x10[59], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p23, cospi_p41, x9[37], x9[58], x10[37],
+ x10[58], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p39, cospi_p25, x9[38], x9[57], x10[38],
+ x10[57], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p07, cospi_p57, x9[39], x9[56], x10[39],
+ x10[56], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p59, cospi_p05, x9[40], x9[55], x10[40],
+ x10[55], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p27, cospi_p37, x9[41], x9[54], x10[41],
+ x10[54], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p43, cospi_p21, x9[42], x9[53], x10[42],
+ x10[53], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p11, cospi_p53, x9[43], x9[52], x10[43],
+ x10[52], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p51, cospi_p13, x9[44], x9[51], x10[44],
+ x10[51], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p19, cospi_p45, x9[45], x9[50], x10[45],
+ x10[50], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p35, cospi_p29, x9[46], x9[49], x10[46],
+ x10[49], __rounding, cos_bit);
+ btf_32_type1_sse4_1_new(cospi_p03, cospi_p61, x9[47], x9[48], x10[47],
+ x10[48], __rounding, cos_bit);
+
+ // stage 11
+ output[0 * outstride] = x10[0];
+ output[1 * outstride] = x10[32];
+ output[2 * outstride] = x10[16];
+ output[3 * outstride] = x10[48];
+ output[4 * outstride] = x10[8];
+ output[5 * outstride] = x10[40];
+ output[6 * outstride] = x10[24];
+ output[7 * outstride] = x10[56];
+ output[8 * outstride] = x10[4];
+ output[9 * outstride] = x10[36];
+ output[10 * outstride] = x10[20];
+ output[11 * outstride] = x10[52];
+ output[12 * outstride] = x10[12];
+ output[13 * outstride] = x10[44];
+ output[14 * outstride] = x10[28];
+ output[15 * outstride] = x10[60];
+ output[16 * outstride] = x10[2];
+ output[17 * outstride] = x10[34];
+ output[18 * outstride] = x10[18];
+ output[19 * outstride] = x10[50];
+ output[20 * outstride] = x10[10];
+ output[21 * outstride] = x10[42];
+ output[22 * outstride] = x10[26];
+ output[23 * outstride] = x10[58];
+ output[24 * outstride] = x10[6];
+ output[25 * outstride] = x10[38];
+ output[26 * outstride] = x10[22];
+ output[27 * outstride] = x10[54];
+ output[28 * outstride] = x10[14];
+ output[29 * outstride] = x10[46];
+ output[30 * outstride] = x10[30];
+ output[31 * outstride] = x10[62];
+ output[32 * outstride] = x10[1];
+ output[33 * outstride] = x10[33];
+ output[34 * outstride] = x10[17];
+ output[35 * outstride] = x10[49];
+ output[36 * outstride] = x10[9];
+ output[37 * outstride] = x10[41];
+ output[38 * outstride] = x10[25];
+ output[39 * outstride] = x10[57];
+ output[40 * outstride] = x10[5];
+ output[41 * outstride] = x10[37];
+ output[42 * outstride] = x10[21];
+ output[43 * outstride] = x10[53];
+ output[44 * outstride] = x10[13];
+ output[45 * outstride] = x10[45];
+ output[46 * outstride] = x10[29];
+ output[47 * outstride] = x10[61];
+ output[48 * outstride] = x10[3];
+ output[49 * outstride] = x10[35];
+ output[50 * outstride] = x10[19];
+ output[51 * outstride] = x10[51];
+ output[52 * outstride] = x10[11];
+ output[53 * outstride] = x10[43];
+ output[54 * outstride] = x10[27];
+ output[55 * outstride] = x10[59];
+ output[56 * outstride] = x10[7];
+ output[57 * outstride] = x10[39];
+ output[58 * outstride] = x10[23];
+ output[59 * outstride] = x10[55];
+ output[60 * outstride] = x10[15];
+ output[61 * outstride] = x10[47];
+ output[62 * outstride] = x10[31];
+ output[63 * outstride] = x10[63];
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
new file mode 100644
index 0000000000..592462e20d
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_avx2.c
@@ -0,0 +1,2068 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/encoder/x86/av1_fwd_txfm_avx2.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_avx2.h"
+
+static INLINE void fdct16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {  // reads input[0..15], writes output[0..15]; NOTE(review): name suggests a 16-point forward DCT - confirm
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 2^(cos_bit - 1) in every 32-bit lane; handed with cos_bit to each btf_16_w16_avx2 call (presumably the rounding term)
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+
+ // stage 1: input[i] and input[15 - i] go through btf_16_adds_subs_out_avx2, which fills x1[i] and x1[15 - i] (i = 0..7)
+ __m256i x1[16];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[15], input[0], input[15]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[14], input[1], input[14]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[13], input[2], input[13]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[12], input[3], input[12]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[11], input[4], input[11]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[10], input[5], input[10]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[9], input[6], input[9]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[8], input[7], input[8]);
+
+ // stage 2: every later stage updates x1 in place, so statement order matters; adds_subs on (0,7) (1,6) (2,5) (3,4), w16 on (10,13) (11,12); x1[8], x1[9], x1[14], x1[15] pass through
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 3: adds_subs on (0,3) (1,2), w16 on (5,6), adds_subs on (8,11) (9,10) (15,12) (14,13) in that argument order; x1[4] and x1[7] pass through
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+
+ // stage 4: w16 on (0,1) (2,3), adds_subs on (4,5) (7,6), w16 on (9,14) (10,13); x1[8], x1[11], x1[12], x1[15] pass through
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+
+ // stage 5: w16 on (4,7) (5,6), adds_subs on (8,9) (11,10) (12,13) (15,14); x1[0..3] are no longer modified after stage 4
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+
+ // stage 6: w16 on (i, 23 - i) for i = 8..11; x1[0..7] pass through
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+
+ // stage 7: reorder only - output[k] = x1[r], where r is k with its 4 index bits reversed (e.g. 1 -> 8, 3 -> 12)
+ output[0] = x1[0];
+ output[1] = x1[8];
+ output[2] = x1[4];
+ output[3] = x1[12];
+ output[4] = x1[2];
+ output[5] = x1[10];
+ output[6] = x1[6];
+ output[7] = x1[14];
+ output[8] = x1[1];
+ output[9] = x1[9];
+ output[10] = x1[5];
+ output[11] = x1[13];
+ output[12] = x1[3];
+ output[13] = x1[11];
+ output[14] = x1[7];
+ output[15] = x1[15];
+}
+
+static INLINE void fdct16x32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {  // reads input[0..31], writes output[0..31]; NOTE(review): name suggests a 32-point forward DCT - confirm
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 2^(cos_bit - 1) in every 32-bit lane; handed with cos_bit to each btf_16_w16_avx2 call (presumably the rounding term)
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+
+ // stage 1: input[i] and input[31 - i] go through btf_16_adds_subs_out_avx2, which fills x1[i] and x1[31 - i] (i = 0..15)
+ __m256i x1[32];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: every later stage updates x1 in place, so statement order matters; adds_subs on (i, 15 - i) for i = 0..7, w16 on (20,27) (21,26) (22,25) (23,24); x1[16..19] and x1[28..31] pass through
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: adds_subs on (i, 7 - i) for i = 0..3, w16 on (10,13) (11,12), adds_subs on (16,23) (17,22) (18,21) (19,20) and (31,24) (30,25) (29,26) (28,27); x1[8], x1[9], x1[14], x1[15] pass through
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+
+ // stage 4: adds_subs on (0,3) (1,2), w16 on (5,6), adds_subs on (8,11) (9,10) (15,12) (14,13), w16 on (18,29) (19,28) (20,27) (21,26); x1[4], x1[7], x1[16], x1[17], x1[22..25], x1[30], x1[31] pass through
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: w16 on (0,1) (2,3), adds_subs on (4,5) (7,6), w16 on (9,14) (10,13), adds_subs on (16,19) (17,18) (23,20) (22,21) (24,27) (25,26) (31,28) (30,29); x1[8], x1[11], x1[12], x1[15] pass through
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+
+ // stage 6: w16 on (4,7) (5,6), adds_subs on (8,9) (11,10) (12,13) (15,14), w16 on (17,30) (18,29) (21,26) (22,25); x1[0..3] are no longer modified after stage 5
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: w16 on (i, 23 - i) for i = 8..11, adds_subs on (16,17) (19,18) (20,21) (23,22) (24,25) (27,26) (28,29) (31,30); x1[0..7] pass through
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+
+ // stage 8: w16 on (i, 47 - i) for i = 16..23; x1[0..15] pass through
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: reorder only - output[k] = x1[r], where r is k with its 5 index bits reversed (e.g. 1 -> 16, 3 -> 24)
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void fdct16x64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32_p32 = pair_set_w16_epi16(-cospi[32], cospi[32]);
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_m16_p48 = pair_set_w16_epi16(-cospi[16], cospi[48]);
+ __m256i cospi_p48_p16 = pair_set_w16_epi16(cospi[48], cospi[16]);
+ __m256i cospi_m48_m16 = pair_set_w16_epi16(-cospi[48], -cospi[16]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_m08_p56 = pair_set_w16_epi16(-cospi[8], cospi[56]);
+ __m256i cospi_p56_p08 = pair_set_w16_epi16(cospi[56], cospi[8]);
+ __m256i cospi_m56_m08 = pair_set_w16_epi16(-cospi[56], -cospi[8]);
+ __m256i cospi_m40_p24 = pair_set_w16_epi16(-cospi[40], cospi[24]);
+ __m256i cospi_p24_p40 = pair_set_w16_epi16(cospi[24], cospi[40]);
+ __m256i cospi_m24_m40 = pair_set_w16_epi16(-cospi[24], -cospi[40]);
+ __m256i cospi_p60_p04 = pair_set_w16_epi16(cospi[60], cospi[4]);
+ __m256i cospi_m04_p60 = pair_set_w16_epi16(-cospi[4], cospi[60]);
+ __m256i cospi_p28_p36 = pair_set_w16_epi16(cospi[28], cospi[36]);
+ __m256i cospi_m36_p28 = pair_set_w16_epi16(-cospi[36], cospi[28]);
+ __m256i cospi_p44_p20 = pair_set_w16_epi16(cospi[44], cospi[20]);
+ __m256i cospi_m20_p44 = pair_set_w16_epi16(-cospi[20], cospi[44]);
+ __m256i cospi_p12_p52 = pair_set_w16_epi16(cospi[12], cospi[52]);
+ __m256i cospi_m52_p12 = pair_set_w16_epi16(-cospi[52], cospi[12]);
+ __m256i cospi_m60_m04 = pair_set_w16_epi16(-cospi[60], -cospi[4]);
+ __m256i cospi_m28_m36 = pair_set_w16_epi16(-cospi[28], -cospi[36]);
+ __m256i cospi_m44_m20 = pair_set_w16_epi16(-cospi[44], -cospi[20]);
+ __m256i cospi_m12_m52 = pair_set_w16_epi16(-cospi[12], -cospi[52]);
+ __m256i cospi_p62_p02 = pair_set_w16_epi16(cospi[62], cospi[2]);
+ __m256i cospi_m02_p62 = pair_set_w16_epi16(-cospi[2], cospi[62]);
+ __m256i cospi_p30_p34 = pair_set_w16_epi16(cospi[30], cospi[34]);
+ __m256i cospi_m34_p30 = pair_set_w16_epi16(-cospi[34], cospi[30]);
+ __m256i cospi_p46_p18 = pair_set_w16_epi16(cospi[46], cospi[18]);
+ __m256i cospi_m18_p46 = pair_set_w16_epi16(-cospi[18], cospi[46]);
+ __m256i cospi_p14_p50 = pair_set_w16_epi16(cospi[14], cospi[50]);
+ __m256i cospi_m50_p14 = pair_set_w16_epi16(-cospi[50], cospi[14]);
+ __m256i cospi_p54_p10 = pair_set_w16_epi16(cospi[54], cospi[10]);
+ __m256i cospi_m10_p54 = pair_set_w16_epi16(-cospi[10], cospi[54]);
+ __m256i cospi_p22_p42 = pair_set_w16_epi16(cospi[22], cospi[42]);
+ __m256i cospi_m42_p22 = pair_set_w16_epi16(-cospi[42], cospi[22]);
+ __m256i cospi_p38_p26 = pair_set_w16_epi16(cospi[38], cospi[26]);
+ __m256i cospi_m26_p38 = pair_set_w16_epi16(-cospi[26], cospi[38]);
+ __m256i cospi_p06_p58 = pair_set_w16_epi16(cospi[6], cospi[58]);
+ __m256i cospi_m58_p06 = pair_set_w16_epi16(-cospi[58], cospi[6]);
+ __m256i cospi_p63_p01 = pair_set_w16_epi16(cospi[63], cospi[1]);
+ __m256i cospi_m01_p63 = pair_set_w16_epi16(-cospi[1], cospi[63]);
+ __m256i cospi_p31_p33 = pair_set_w16_epi16(cospi[31], cospi[33]);
+ __m256i cospi_m33_p31 = pair_set_w16_epi16(-cospi[33], cospi[31]);
+ __m256i cospi_p47_p17 = pair_set_w16_epi16(cospi[47], cospi[17]);
+ __m256i cospi_m17_p47 = pair_set_w16_epi16(-cospi[17], cospi[47]);
+ __m256i cospi_p15_p49 = pair_set_w16_epi16(cospi[15], cospi[49]);
+ __m256i cospi_m49_p15 = pair_set_w16_epi16(-cospi[49], cospi[15]);
+ __m256i cospi_p55_p09 = pair_set_w16_epi16(cospi[55], cospi[9]);
+ __m256i cospi_m09_p55 = pair_set_w16_epi16(-cospi[9], cospi[55]);
+ __m256i cospi_p23_p41 = pair_set_w16_epi16(cospi[23], cospi[41]);
+ __m256i cospi_m41_p23 = pair_set_w16_epi16(-cospi[41], cospi[23]);
+ __m256i cospi_p39_p25 = pair_set_w16_epi16(cospi[39], cospi[25]);
+ __m256i cospi_m25_p39 = pair_set_w16_epi16(-cospi[25], cospi[39]);
+ __m256i cospi_p07_p57 = pair_set_w16_epi16(cospi[7], cospi[57]);
+ __m256i cospi_m57_p07 = pair_set_w16_epi16(-cospi[57], cospi[7]);
+ __m256i cospi_p59_p05 = pair_set_w16_epi16(cospi[59], cospi[5]);
+ __m256i cospi_m05_p59 = pair_set_w16_epi16(-cospi[5], cospi[59]);
+ __m256i cospi_p27_p37 = pair_set_w16_epi16(cospi[27], cospi[37]);
+ __m256i cospi_m37_p27 = pair_set_w16_epi16(-cospi[37], cospi[27]);
+ __m256i cospi_p43_p21 = pair_set_w16_epi16(cospi[43], cospi[21]);
+ __m256i cospi_m21_p43 = pair_set_w16_epi16(-cospi[21], cospi[43]);
+ __m256i cospi_p11_p53 = pair_set_w16_epi16(cospi[11], cospi[53]);
+ __m256i cospi_m53_p11 = pair_set_w16_epi16(-cospi[53], cospi[11]);
+ __m256i cospi_p51_p13 = pair_set_w16_epi16(cospi[51], cospi[13]);
+ __m256i cospi_m13_p51 = pair_set_w16_epi16(-cospi[13], cospi[51]);
+ __m256i cospi_p19_p45 = pair_set_w16_epi16(cospi[19], cospi[45]);
+ __m256i cospi_m45_p19 = pair_set_w16_epi16(-cospi[45], cospi[19]);
+ __m256i cospi_p35_p29 = pair_set_w16_epi16(cospi[35], cospi[29]);
+ __m256i cospi_m29_p35 = pair_set_w16_epi16(-cospi[29], cospi[35]);
+ __m256i cospi_p03_p61 = pair_set_w16_epi16(cospi[3], cospi[61]);
+ __m256i cospi_m61_p03 = pair_set_w16_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_16_adds_subs_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_16_adds_subs_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_16_adds_subs_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_16_adds_subs_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_16_adds_subs_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_16_adds_subs_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_16_adds_subs_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_16_adds_subs_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_16_adds_subs_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_16_adds_subs_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_16_adds_subs_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_16_adds_subs_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_16_adds_subs_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_16_adds_subs_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_16_adds_subs_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_16_adds_subs_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_16_adds_subs_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_16_adds_subs_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_16_adds_subs_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_16_adds_subs_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_16_adds_subs_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_16_adds_subs_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_16_adds_subs_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_16_adds_subs_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_16_adds_subs_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_16_adds_subs_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_16_adds_subs_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_16_adds_subs_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_16_adds_subs_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_16_adds_subs_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_16_adds_subs_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_16_adds_subs_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_16_adds_subs_avx2(&x1[0], &x1[31]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[30]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[16]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_16_adds_subs_avx2(&x1[0], &x1[15]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[8]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[47]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[37], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[48]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[58], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_16_adds_subs_avx2(&x1[0], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[4]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[23]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[18], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[24]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[29], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[27]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_16_adds_subs_avx2(&x1[0], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[2]);
+ btf_16_w16_avx2(cospi_m32_p32, cospi_p32_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[14], &x1[13]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[39]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[34], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[40]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[45], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[55]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[50], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[56]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[61], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p48_p16, cospi_m16_p48, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[4], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[6]);
+ btf_16_w16_avx2(cospi_m16_p48, cospi_p48_p16, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_m16, cospi_m16_p48, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[19]);
+ btf_16_adds_subs_avx2(&x1[17], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[20]);
+ btf_16_adds_subs_avx2(&x1[22], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[27]);
+ btf_16_adds_subs_avx2(&x1[25], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[28]);
+ btf_16_adds_subs_avx2(&x1[30], &x1[29]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_16_w16_avx2(cospi_p56_p08, cospi_m08_p56, &x1[4], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p24_p40, cospi_m40_p24, &x1[5], &x1[6], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[8], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[15], &x1[14]);
+ btf_16_w16_avx2(cospi_m08_p56, cospi_p56_p08, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_m08, cospi_m08_p56, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m40_p24, cospi_p24_p40, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_m40, cospi_m40_p24, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[35]);
+ btf_16_adds_subs_avx2(&x1[33], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[36]);
+ btf_16_adds_subs_avx2(&x1[38], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[43]);
+ btf_16_adds_subs_avx2(&x1[41], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[44]);
+ btf_16_adds_subs_avx2(&x1[46], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[51]);
+ btf_16_adds_subs_avx2(&x1[49], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[52]);
+ btf_16_adds_subs_avx2(&x1[54], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[59]);
+ btf_16_adds_subs_avx2(&x1[57], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[60]);
+ btf_16_adds_subs_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_16_w16_avx2(cospi_p60_p04, cospi_m04_p60, &x1[8], &x1[15], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p28_p36, cospi_m36_p28, &x1[9], &x1[14], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p44_p20, cospi_m20_p44, &x1[10], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p12_p52, cospi_m52_p12, &x1[11], &x1[12], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[16], &x1[17]);
+ btf_16_adds_subs_avx2(&x1[19], &x1[18]);
+ btf_16_adds_subs_avx2(&x1[20], &x1[21]);
+ btf_16_adds_subs_avx2(&x1[23], &x1[22]);
+ btf_16_adds_subs_avx2(&x1[24], &x1[25]);
+ btf_16_adds_subs_avx2(&x1[27], &x1[26]);
+ btf_16_adds_subs_avx2(&x1[28], &x1[29]);
+ btf_16_adds_subs_avx2(&x1[31], &x1[30]);
+ btf_16_w16_avx2(cospi_m04_p60, cospi_p60_p04, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m60_m04, cospi_m04_p60, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m36_p28, cospi_p28_p36, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m28_m36, cospi_m36_p28, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m20_p44, cospi_p44_p20, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m44_m20, cospi_m20_p44, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m52_p12, cospi_p12_p52, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m12_m52, cospi_m52_p12, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_16_w16_avx2(cospi_p62_p02, cospi_m02_p62, &x1[16], &x1[31], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p30_p34, cospi_m34_p30, &x1[17], &x1[30], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p46_p18, cospi_m18_p46, &x1[18], &x1[29], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p14_p50, cospi_m50_p14, &x1[19], &x1[28], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p54_p10, cospi_m10_p54, &x1[20], &x1[27], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p22_p42, cospi_m42_p22, &x1[21], &x1[26], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p38_p26, cospi_m26_p38, &x1[22], &x1[25], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p06_p58, cospi_m58_p06, &x1[23], &x1[24], _r, cos_bit);
+ btf_16_adds_subs_avx2(&x1[32], &x1[33]);
+ btf_16_adds_subs_avx2(&x1[35], &x1[34]);
+ btf_16_adds_subs_avx2(&x1[36], &x1[37]);
+ btf_16_adds_subs_avx2(&x1[39], &x1[38]);
+ btf_16_adds_subs_avx2(&x1[40], &x1[41]);
+ btf_16_adds_subs_avx2(&x1[43], &x1[42]);
+ btf_16_adds_subs_avx2(&x1[44], &x1[45]);
+ btf_16_adds_subs_avx2(&x1[47], &x1[46]);
+ btf_16_adds_subs_avx2(&x1[48], &x1[49]);
+ btf_16_adds_subs_avx2(&x1[51], &x1[50]);
+ btf_16_adds_subs_avx2(&x1[52], &x1[53]);
+ btf_16_adds_subs_avx2(&x1[55], &x1[54]);
+ btf_16_adds_subs_avx2(&x1[56], &x1[57]);
+ btf_16_adds_subs_avx2(&x1[59], &x1[58]);
+ btf_16_adds_subs_avx2(&x1[60], &x1[61]);
+ btf_16_adds_subs_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_16_w16_avx2(cospi_p63_p01, cospi_m01_p63, &x1[32], &x1[63], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p31_p33, cospi_m33_p31, &x1[33], &x1[62], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p47_p17, cospi_m17_p47, &x1[34], &x1[61], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p15_p49, cospi_m49_p15, &x1[35], &x1[60], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p55_p09, cospi_m09_p55, &x1[36], &x1[59], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p23_p41, cospi_m41_p23, &x1[37], &x1[58], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p39_p25, cospi_m25_p39, &x1[38], &x1[57], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p07_p57, cospi_m57_p07, &x1[39], &x1[56], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p59_p05, cospi_m05_p59, &x1[40], &x1[55], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p27_p37, cospi_m37_p27, &x1[41], &x1[54], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p43_p21, cospi_m21_p43, &x1[42], &x1[53], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p11_p53, cospi_m53_p11, &x1[43], &x1[52], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p51_p13, cospi_m13_p51, &x1[44], &x1[51], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p19_p45, cospi_m45_p19, &x1[45], &x1[50], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p35_p29, cospi_m29_p35, &x1[46], &x1[49], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p03_p61, cospi_m61_p03, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+static INLINE void av1_fdct32_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {  // Runs a fixed 9-stage butterfly network over 32 __m256i vectors (a 32-point forward DCT, going by the function name): reads input[0..31], writes output[0..31].
+ __m256i x1[32];  // Scratch vectors: filled from input[] in stage 1, passed by pointer to the helpers in stages 2-8 (presumably updated in place), copied to output[] in stage 9.
+ const int32_t *cospi = cospi_arr(cos_bit);  // Coefficient table returned by cospi_arr() for this cos_bit; only even indices 2..62 are read below.
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));  // 1 << (cos_bit - 1) in every 32-bit lane; presumably the rounding bias the btf_32_avx2_type* helpers add before shifting by cos_bit (helper bodies not visible here). NOTE(review): the shift requires cos_bit >= 1.
+ // stage 0: nothing to do; stage 1 reads input[] directly.
+ // stage 1: combine input[i] with input[31 - i] for i = 0..15 into x1[i] and x1[31 - i] (presumably sum and difference; helper body not shown here). All of input[] is consumed in this stage, before anything is written to output[].
+ btf_32_add_sub_out_avx2(&x1[0], &x1[31], input[0], input[31]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[30], input[1], input[30]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[29], input[2], input[29]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[28], input[3], input[28]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[27], input[4], input[27]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[26], input[5], input[26]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[25], input[6], input[25]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[24], input[7], input[24]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[23], input[8], input[23]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[22], input[9], input[22]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[21], input[10], input[21]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[20], input[11], input[20]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[19], input[12], input[19]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[18], input[13], input[18]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[17], input[14], input[17]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[16], input[15], input[16]);
+
+ // stage 2: add/sub on pairs (i, 15 - i) of x1[0..15]; type0 butterflies with (-cospi[32], cospi[32]) on (20,27) (21,26) (22,25) (23,24). x1[16..19] and x1[28..31] are not touched in this stage.
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 3: add/sub on (i, 7 - i), on (16 + i, 23 - i) and on (31 - i, 24 + i) for i = 0..3, in that argument order; type0 butterflies with (-cospi[32], cospi[32]) on (10,13) and (11,12).
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+
+ // stage 4: add/sub on (0,3) (1,2) (8,11) (9,10) (15,12) (14,13); type0 butterflies on (5,6) with (-cospi[32], cospi[32]), on (18,29) (19,28) with (-cospi[16], cospi[48]), and on (20,27) (21,26) with (-cospi[48], -cospi[16]).
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0(-cospi[32], cospi[32], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[21], &x1[26], _r, cos_bit);
+
+ // stage 5: x1[0..3] get their last butterflies here (type0 on (0,1), type1 on (2,3)); add/sub continues on x1[4..7] and x1[16..31]; type0 butterflies on (9,14) and (10,13).
+ btf_32_avx2_type0(cospi[32], cospi[32], &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1(cospi[48], cospi[16], &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0(-cospi[16], cospi[48], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[48], -cospi[16], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+
+ // stage 6: last butterflies for x1[4..7] (type1 on (4,7) and (5,6)); add/sub on x1[8..15]; type0 butterflies on (17,30) (18,29) (21,26) (22,25).
+ btf_32_avx2_type1(cospi[56], cospi[8], &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1(cospi[24], cospi[40], &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0(-cospi[8], cospi[56], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[56], -cospi[8], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[40], cospi[24], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0(-cospi[24], -cospi[40], &x1[22], &x1[25], _r, cos_bit);
+
+ // stage 7: last butterflies for x1[8..15] (type1 on (8 + i, 15 - i), i = 0..3); add/sub on adjacent pairs of x1[16..31].
+ btf_32_avx2_type1(cospi[60], cospi[4], &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1(cospi[28], cospi[36], &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1(cospi[44], cospi[20], &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1(cospi[12], cospi[52], &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+
+ // stage 8: last butterflies for x1[16..31]: type1 on (16 + i, 31 - i), i = 0..7. As in every type1 call of this function, the two cospi indices of each call sum to 64.
+ btf_32_avx2_type1(cospi[62], cospi[2], &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1(cospi[30], cospi[34], &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1(cospi[46], cospi[18], &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1(cospi[14], cospi[50], &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1(cospi[54], cospi[10], &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1(cospi[22], cospi[42], &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1(cospi[38], cospi[26], &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1(cospi[6], cospi[58], &x1[23], &x1[24], _r, cos_bit);
+
+ // stage 9: reorder only, no arithmetic: output[i] = x1[r], where r is i with its 5 index bits reversed (e.g. output[1] = x1[16], output[3] = x1[24]).
+ output[0] = x1[0];
+ output[1] = x1[16];
+ output[2] = x1[8];
+ output[3] = x1[24];
+ output[4] = x1[4];
+ output[5] = x1[20];
+ output[6] = x1[12];
+ output[7] = x1[28];
+ output[8] = x1[2];
+ output[9] = x1[18];
+ output[10] = x1[10];
+ output[11] = x1[26];
+ output[12] = x1[6];
+ output[13] = x1[22];
+ output[14] = x1[14];
+ output[15] = x1[30];
+ output[16] = x1[1];
+ output[17] = x1[17];
+ output[18] = x1[9];
+ output[19] = x1[25];
+ output[20] = x1[5];
+ output[21] = x1[21];
+ output[22] = x1[13];
+ output[23] = x1[29];
+ output[24] = x1[3];
+ output[25] = x1[19];
+ output[26] = x1[11];
+ output[27] = x1[27];
+ output[28] = x1[7];
+ output[29] = x1[23];
+ output[30] = x1[15];
+ output[31] = x1[31];
+}
+
+static INLINE void av1_fdct64_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_m32 = _mm256_set1_epi32(-cospi[32]);
+ __m256i cospi_p32 = _mm256_set1_epi32(cospi[32]);
+ __m256i cospi_m16 = _mm256_set1_epi32(-cospi[16]);
+ __m256i cospi_p48 = _mm256_set1_epi32(cospi[48]);
+ __m256i cospi_m48 = _mm256_set1_epi32(-cospi[48]);
+ __m256i cospi_p16 = _mm256_set1_epi32(cospi[16]);
+ __m256i cospi_m08 = _mm256_set1_epi32(-cospi[8]);
+ __m256i cospi_p56 = _mm256_set1_epi32(cospi[56]);
+ __m256i cospi_m56 = _mm256_set1_epi32(-cospi[56]);
+ __m256i cospi_m40 = _mm256_set1_epi32(-cospi[40]);
+ __m256i cospi_p24 = _mm256_set1_epi32(cospi[24]);
+ __m256i cospi_m24 = _mm256_set1_epi32(-cospi[24]);
+ __m256i cospi_p08 = _mm256_set1_epi32(cospi[8]);
+ __m256i cospi_p40 = _mm256_set1_epi32(cospi[40]);
+ __m256i cospi_p60 = _mm256_set1_epi32(cospi[60]);
+ __m256i cospi_p04 = _mm256_set1_epi32(cospi[4]);
+ __m256i cospi_p28 = _mm256_set1_epi32(cospi[28]);
+ __m256i cospi_p36 = _mm256_set1_epi32(cospi[36]);
+ __m256i cospi_p44 = _mm256_set1_epi32(cospi[44]);
+ __m256i cospi_p20 = _mm256_set1_epi32(cospi[20]);
+ __m256i cospi_p12 = _mm256_set1_epi32(cospi[12]);
+ __m256i cospi_p52 = _mm256_set1_epi32(cospi[52]);
+ __m256i cospi_m04 = _mm256_set1_epi32(-cospi[4]);
+ __m256i cospi_m60 = _mm256_set1_epi32(-cospi[60]);
+ __m256i cospi_m36 = _mm256_set1_epi32(-cospi[36]);
+ __m256i cospi_m28 = _mm256_set1_epi32(-cospi[28]);
+ __m256i cospi_m20 = _mm256_set1_epi32(-cospi[20]);
+ __m256i cospi_m44 = _mm256_set1_epi32(-cospi[44]);
+ __m256i cospi_m52 = _mm256_set1_epi32(-cospi[52]);
+ __m256i cospi_m12 = _mm256_set1_epi32(-cospi[12]);
+ __m256i cospi_p62 = _mm256_set1_epi32(cospi[62]);
+ __m256i cospi_p02 = _mm256_set1_epi32(cospi[2]);
+ __m256i cospi_p30 = _mm256_set1_epi32(cospi[30]);
+ __m256i cospi_p34 = _mm256_set1_epi32(cospi[34]);
+ __m256i cospi_p46 = _mm256_set1_epi32(cospi[46]);
+ __m256i cospi_p18 = _mm256_set1_epi32(cospi[18]);
+ __m256i cospi_p14 = _mm256_set1_epi32(cospi[14]);
+ __m256i cospi_p50 = _mm256_set1_epi32(cospi[50]);
+ __m256i cospi_p54 = _mm256_set1_epi32(cospi[54]);
+ __m256i cospi_p10 = _mm256_set1_epi32(cospi[10]);
+ __m256i cospi_p22 = _mm256_set1_epi32(cospi[22]);
+ __m256i cospi_p42 = _mm256_set1_epi32(cospi[42]);
+ __m256i cospi_p38 = _mm256_set1_epi32(cospi[38]);
+ __m256i cospi_p26 = _mm256_set1_epi32(cospi[26]);
+ __m256i cospi_p06 = _mm256_set1_epi32(cospi[6]);
+ __m256i cospi_p58 = _mm256_set1_epi32(cospi[58]);
+ __m256i cospi_p63 = _mm256_set1_epi32(cospi[63]);
+ __m256i cospi_p01 = _mm256_set1_epi32(cospi[1]);
+ __m256i cospi_p31 = _mm256_set1_epi32(cospi[31]);
+ __m256i cospi_p33 = _mm256_set1_epi32(cospi[33]);
+ __m256i cospi_p47 = _mm256_set1_epi32(cospi[47]);
+ __m256i cospi_p17 = _mm256_set1_epi32(cospi[17]);
+ __m256i cospi_p15 = _mm256_set1_epi32(cospi[15]);
+ __m256i cospi_p49 = _mm256_set1_epi32(cospi[49]);
+ __m256i cospi_p55 = _mm256_set1_epi32(cospi[55]);
+ __m256i cospi_p09 = _mm256_set1_epi32(cospi[9]);
+ __m256i cospi_p23 = _mm256_set1_epi32(cospi[23]);
+ __m256i cospi_p41 = _mm256_set1_epi32(cospi[41]);
+ __m256i cospi_p39 = _mm256_set1_epi32(cospi[39]);
+ __m256i cospi_p25 = _mm256_set1_epi32(cospi[25]);
+ __m256i cospi_p07 = _mm256_set1_epi32(cospi[7]);
+ __m256i cospi_p57 = _mm256_set1_epi32(cospi[57]);
+ __m256i cospi_p59 = _mm256_set1_epi32(cospi[59]);
+ __m256i cospi_p05 = _mm256_set1_epi32(cospi[5]);
+ __m256i cospi_p27 = _mm256_set1_epi32(cospi[27]);
+ __m256i cospi_p37 = _mm256_set1_epi32(cospi[37]);
+ __m256i cospi_p43 = _mm256_set1_epi32(cospi[43]);
+ __m256i cospi_p21 = _mm256_set1_epi32(cospi[21]);
+ __m256i cospi_p11 = _mm256_set1_epi32(cospi[11]);
+ __m256i cospi_p53 = _mm256_set1_epi32(cospi[53]);
+ __m256i cospi_p51 = _mm256_set1_epi32(cospi[51]);
+ __m256i cospi_p13 = _mm256_set1_epi32(cospi[13]);
+ __m256i cospi_p19 = _mm256_set1_epi32(cospi[19]);
+ __m256i cospi_p45 = _mm256_set1_epi32(cospi[45]);
+ __m256i cospi_p35 = _mm256_set1_epi32(cospi[35]);
+ __m256i cospi_p29 = _mm256_set1_epi32(cospi[29]);
+ __m256i cospi_p03 = _mm256_set1_epi32(cospi[3]);
+ __m256i cospi_p61 = _mm256_set1_epi32(cospi[61]);
+
+ // stage 1
+ __m256i x1[64];
+ btf_32_add_sub_out_avx2(&x1[0], &x1[63], input[0], input[63]);
+ btf_32_add_sub_out_avx2(&x1[1], &x1[62], input[1], input[62]);
+ btf_32_add_sub_out_avx2(&x1[2], &x1[61], input[2], input[61]);
+ btf_32_add_sub_out_avx2(&x1[3], &x1[60], input[3], input[60]);
+ btf_32_add_sub_out_avx2(&x1[4], &x1[59], input[4], input[59]);
+ btf_32_add_sub_out_avx2(&x1[5], &x1[58], input[5], input[58]);
+ btf_32_add_sub_out_avx2(&x1[6], &x1[57], input[6], input[57]);
+ btf_32_add_sub_out_avx2(&x1[7], &x1[56], input[7], input[56]);
+ btf_32_add_sub_out_avx2(&x1[8], &x1[55], input[8], input[55]);
+ btf_32_add_sub_out_avx2(&x1[9], &x1[54], input[9], input[54]);
+ btf_32_add_sub_out_avx2(&x1[10], &x1[53], input[10], input[53]);
+ btf_32_add_sub_out_avx2(&x1[11], &x1[52], input[11], input[52]);
+ btf_32_add_sub_out_avx2(&x1[12], &x1[51], input[12], input[51]);
+ btf_32_add_sub_out_avx2(&x1[13], &x1[50], input[13], input[50]);
+ btf_32_add_sub_out_avx2(&x1[14], &x1[49], input[14], input[49]);
+ btf_32_add_sub_out_avx2(&x1[15], &x1[48], input[15], input[48]);
+ btf_32_add_sub_out_avx2(&x1[16], &x1[47], input[16], input[47]);
+ btf_32_add_sub_out_avx2(&x1[17], &x1[46], input[17], input[46]);
+ btf_32_add_sub_out_avx2(&x1[18], &x1[45], input[18], input[45]);
+ btf_32_add_sub_out_avx2(&x1[19], &x1[44], input[19], input[44]);
+ btf_32_add_sub_out_avx2(&x1[20], &x1[43], input[20], input[43]);
+ btf_32_add_sub_out_avx2(&x1[21], &x1[42], input[21], input[42]);
+ btf_32_add_sub_out_avx2(&x1[22], &x1[41], input[22], input[41]);
+ btf_32_add_sub_out_avx2(&x1[23], &x1[40], input[23], input[40]);
+ btf_32_add_sub_out_avx2(&x1[24], &x1[39], input[24], input[39]);
+ btf_32_add_sub_out_avx2(&x1[25], &x1[38], input[25], input[38]);
+ btf_32_add_sub_out_avx2(&x1[26], &x1[37], input[26], input[37]);
+ btf_32_add_sub_out_avx2(&x1[27], &x1[36], input[27], input[36]);
+ btf_32_add_sub_out_avx2(&x1[28], &x1[35], input[28], input[35]);
+ btf_32_add_sub_out_avx2(&x1[29], &x1[34], input[29], input[34]);
+ btf_32_add_sub_out_avx2(&x1[30], &x1[33], input[30], input[33]);
+ btf_32_add_sub_out_avx2(&x1[31], &x1[32], input[31], input[32]);
+
+ // stage 2
+ btf_32_add_sub_avx2(&x1[0], &x1[31]);
+ btf_32_add_sub_avx2(&x1[1], &x1[30]);
+ btf_32_add_sub_avx2(&x1[2], &x1[29]);
+ btf_32_add_sub_avx2(&x1[3], &x1[28]);
+ btf_32_add_sub_avx2(&x1[4], &x1[27]);
+ btf_32_add_sub_avx2(&x1[5], &x1[26]);
+ btf_32_add_sub_avx2(&x1[6], &x1[25]);
+ btf_32_add_sub_avx2(&x1[7], &x1[24]);
+ btf_32_add_sub_avx2(&x1[8], &x1[23]);
+ btf_32_add_sub_avx2(&x1[9], &x1[22]);
+ btf_32_add_sub_avx2(&x1[10], &x1[21]);
+ btf_32_add_sub_avx2(&x1[11], &x1[20]);
+ btf_32_add_sub_avx2(&x1[12], &x1[19]);
+ btf_32_add_sub_avx2(&x1[13], &x1[18]);
+ btf_32_add_sub_avx2(&x1[14], &x1[17]);
+ btf_32_add_sub_avx2(&x1[15], &x1[16]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 3
+ btf_32_add_sub_avx2(&x1[0], &x1[15]);
+ btf_32_add_sub_avx2(&x1[1], &x1[14]);
+ btf_32_add_sub_avx2(&x1[2], &x1[13]);
+ btf_32_add_sub_avx2(&x1[3], &x1[12]);
+ btf_32_add_sub_avx2(&x1[4], &x1[11]);
+ btf_32_add_sub_avx2(&x1[5], &x1[10]);
+ btf_32_add_sub_avx2(&x1[6], &x1[9]);
+ btf_32_add_sub_avx2(&x1[7], &x1[8]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[47]);
+ btf_32_add_sub_avx2(&x1[33], &x1[46]);
+ btf_32_add_sub_avx2(&x1[34], &x1[45]);
+ btf_32_add_sub_avx2(&x1[35], &x1[44]);
+ btf_32_add_sub_avx2(&x1[36], &x1[43]);
+ btf_32_add_sub_avx2(&x1[37], &x1[42]);
+ btf_32_add_sub_avx2(&x1[38], &x1[41]);
+ btf_32_add_sub_avx2(&x1[39], &x1[40]);
+ btf_32_add_sub_avx2(&x1[63], &x1[48]);
+ btf_32_add_sub_avx2(&x1[62], &x1[49]);
+ btf_32_add_sub_avx2(&x1[61], &x1[50]);
+ btf_32_add_sub_avx2(&x1[60], &x1[51]);
+ btf_32_add_sub_avx2(&x1[59], &x1[52]);
+ btf_32_add_sub_avx2(&x1[58], &x1[53]);
+ btf_32_add_sub_avx2(&x1[57], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[55]);
+
+ // stage 4
+ btf_32_add_sub_avx2(&x1[0], &x1[7]);
+ btf_32_add_sub_avx2(&x1[1], &x1[6]);
+ btf_32_add_sub_avx2(&x1[2], &x1[5]);
+ btf_32_add_sub_avx2(&x1[3], &x1[4]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[23]);
+ btf_32_add_sub_avx2(&x1[17], &x1[22]);
+ btf_32_add_sub_avx2(&x1[18], &x1[21]);
+ btf_32_add_sub_avx2(&x1[19], &x1[20]);
+ btf_32_add_sub_avx2(&x1[31], &x1[24]);
+ btf_32_add_sub_avx2(&x1[30], &x1[25]);
+ btf_32_add_sub_avx2(&x1[29], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[27]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[43], &x1[52], _r, cos_bit);
+
+ // stage 5
+ btf_32_add_sub_avx2(&x1[0], &x1[3]);
+ btf_32_add_sub_avx2(&x1[1], &x1[2]);
+ btf_32_avx2_type0_new(cospi_m32, cospi_p32, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[11]);
+ btf_32_add_sub_avx2(&x1[9], &x1[10]);
+ btf_32_add_sub_avx2(&x1[15], &x1[12]);
+ btf_32_add_sub_avx2(&x1[14], &x1[13]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[39]);
+ btf_32_add_sub_avx2(&x1[33], &x1[38]);
+ btf_32_add_sub_avx2(&x1[34], &x1[37]);
+ btf_32_add_sub_avx2(&x1[35], &x1[36]);
+ btf_32_add_sub_avx2(&x1[47], &x1[40]);
+ btf_32_add_sub_avx2(&x1[46], &x1[41]);
+ btf_32_add_sub_avx2(&x1[45], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[43]);
+ btf_32_add_sub_avx2(&x1[48], &x1[55]);
+ btf_32_add_sub_avx2(&x1[49], &x1[54]);
+ btf_32_add_sub_avx2(&x1[50], &x1[53]);
+ btf_32_add_sub_avx2(&x1[51], &x1[52]);
+ btf_32_add_sub_avx2(&x1[63], &x1[56]);
+ btf_32_add_sub_avx2(&x1[62], &x1[57]);
+ btf_32_add_sub_avx2(&x1[61], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[59]);
+
+ // stage 6
+ btf_32_avx2_type0_new(cospi_p32, cospi_p32, &x1[0], &x1[1], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p48, cospi_p16, &x1[2], &x1[3], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[4], &x1[5]);
+ btf_32_add_sub_avx2(&x1[7], &x1[6]);
+ btf_32_avx2_type0_new(cospi_m16, cospi_p48, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m48, cospi_m16, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[19]);
+ btf_32_add_sub_avx2(&x1[17], &x1[18]);
+ btf_32_add_sub_avx2(&x1[23], &x1[20]);
+ btf_32_add_sub_avx2(&x1[22], &x1[21]);
+ btf_32_add_sub_avx2(&x1[24], &x1[27]);
+ btf_32_add_sub_avx2(&x1[25], &x1[26]);
+ btf_32_add_sub_avx2(&x1[31], &x1[28]);
+ btf_32_add_sub_avx2(&x1[30], &x1[29]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[45], &x1[50], _r, cos_bit);
+
+ // stage 7
+ btf_32_avx2_type1_new(cospi_p56, cospi_p08, &x1[4], &x1[7], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p24, cospi_p40, &x1[5], &x1[6], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[8], &x1[9]);
+ btf_32_add_sub_avx2(&x1[11], &x1[10]);
+ btf_32_add_sub_avx2(&x1[12], &x1[13]);
+ btf_32_add_sub_avx2(&x1[15], &x1[14]);
+ btf_32_avx2_type0_new(cospi_m08, cospi_p56, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m56, cospi_m08, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m40, cospi_p24, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m24, cospi_m40, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[35]);
+ btf_32_add_sub_avx2(&x1[33], &x1[34]);
+ btf_32_add_sub_avx2(&x1[39], &x1[36]);
+ btf_32_add_sub_avx2(&x1[38], &x1[37]);
+ btf_32_add_sub_avx2(&x1[40], &x1[43]);
+ btf_32_add_sub_avx2(&x1[41], &x1[42]);
+ btf_32_add_sub_avx2(&x1[47], &x1[44]);
+ btf_32_add_sub_avx2(&x1[46], &x1[45]);
+ btf_32_add_sub_avx2(&x1[48], &x1[51]);
+ btf_32_add_sub_avx2(&x1[49], &x1[50]);
+ btf_32_add_sub_avx2(&x1[55], &x1[52]);
+ btf_32_add_sub_avx2(&x1[54], &x1[53]);
+ btf_32_add_sub_avx2(&x1[56], &x1[59]);
+ btf_32_add_sub_avx2(&x1[57], &x1[58]);
+ btf_32_add_sub_avx2(&x1[63], &x1[60]);
+ btf_32_add_sub_avx2(&x1[62], &x1[61]);
+
+ // stage 8
+ btf_32_avx2_type1_new(cospi_p60, cospi_p04, &x1[8], &x1[15], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p28, cospi_p36, &x1[9], &x1[14], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p44, cospi_p20, &x1[10], &x1[13], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p12, cospi_p52, &x1[11], &x1[12], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[16], &x1[17]);
+ btf_32_add_sub_avx2(&x1[19], &x1[18]);
+ btf_32_add_sub_avx2(&x1[20], &x1[21]);
+ btf_32_add_sub_avx2(&x1[23], &x1[22]);
+ btf_32_add_sub_avx2(&x1[24], &x1[25]);
+ btf_32_add_sub_avx2(&x1[27], &x1[26]);
+ btf_32_add_sub_avx2(&x1[28], &x1[29]);
+ btf_32_add_sub_avx2(&x1[31], &x1[30]);
+ btf_32_avx2_type0_new(cospi_m04, cospi_p60, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m60, cospi_m04, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m36, cospi_p28, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m28, cospi_m36, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m20, cospi_p44, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m44, cospi_m20, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m52, cospi_p12, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type0_new(cospi_m12, cospi_m52, &x1[46], &x1[49], _r, cos_bit);
+
+ // stage 9
+ btf_32_avx2_type1_new(cospi_p62, cospi_p02, &x1[16], &x1[31], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p30, cospi_p34, &x1[17], &x1[30], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p46, cospi_p18, &x1[18], &x1[29], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p14, cospi_p50, &x1[19], &x1[28], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p54, cospi_p10, &x1[20], &x1[27], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p22, cospi_p42, &x1[21], &x1[26], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p38, cospi_p26, &x1[22], &x1[25], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p06, cospi_p58, &x1[23], &x1[24], _r, cos_bit);
+ btf_32_add_sub_avx2(&x1[32], &x1[33]);
+ btf_32_add_sub_avx2(&x1[35], &x1[34]);
+ btf_32_add_sub_avx2(&x1[36], &x1[37]);
+ btf_32_add_sub_avx2(&x1[39], &x1[38]);
+ btf_32_add_sub_avx2(&x1[40], &x1[41]);
+ btf_32_add_sub_avx2(&x1[43], &x1[42]);
+ btf_32_add_sub_avx2(&x1[44], &x1[45]);
+ btf_32_add_sub_avx2(&x1[47], &x1[46]);
+ btf_32_add_sub_avx2(&x1[48], &x1[49]);
+ btf_32_add_sub_avx2(&x1[51], &x1[50]);
+ btf_32_add_sub_avx2(&x1[52], &x1[53]);
+ btf_32_add_sub_avx2(&x1[55], &x1[54]);
+ btf_32_add_sub_avx2(&x1[56], &x1[57]);
+ btf_32_add_sub_avx2(&x1[59], &x1[58]);
+ btf_32_add_sub_avx2(&x1[60], &x1[61]);
+ btf_32_add_sub_avx2(&x1[63], &x1[62]);
+
+ // stage 10
+ btf_32_avx2_type1_new(cospi_p63, cospi_p01, &x1[32], &x1[63], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p31, cospi_p33, &x1[33], &x1[62], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p47, cospi_p17, &x1[34], &x1[61], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p15, cospi_p49, &x1[35], &x1[60], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p55, cospi_p09, &x1[36], &x1[59], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p23, cospi_p41, &x1[37], &x1[58], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p39, cospi_p25, &x1[38], &x1[57], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p07, cospi_p57, &x1[39], &x1[56], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p59, cospi_p05, &x1[40], &x1[55], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p27, cospi_p37, &x1[41], &x1[54], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p43, cospi_p21, &x1[42], &x1[53], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p11, cospi_p53, &x1[43], &x1[52], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p51, cospi_p13, &x1[44], &x1[51], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p19, cospi_p45, &x1[45], &x1[50], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p35, cospi_p29, &x1[46], &x1[49], _r, cos_bit);
+ btf_32_avx2_type1_new(cospi_p03, cospi_p61, &x1[47], &x1[48], _r, cos_bit);
+
+ // stage 11
+ output[0] = x1[0];
+ output[1] = x1[32];
+ output[2] = x1[16];
+ output[3] = x1[48];
+ output[4] = x1[8];
+ output[5] = x1[40];
+ output[6] = x1[24];
+ output[7] = x1[56];
+ output[8] = x1[4];
+ output[9] = x1[36];
+ output[10] = x1[20];
+ output[11] = x1[52];
+ output[12] = x1[12];
+ output[13] = x1[44];
+ output[14] = x1[28];
+ output[15] = x1[60];
+ output[16] = x1[2];
+ output[17] = x1[34];
+ output[18] = x1[18];
+ output[19] = x1[50];
+ output[20] = x1[10];
+ output[21] = x1[42];
+ output[22] = x1[26];
+ output[23] = x1[58];
+ output[24] = x1[6];
+ output[25] = x1[38];
+ output[26] = x1[22];
+ output[27] = x1[54];
+ output[28] = x1[14];
+ output[29] = x1[46];
+ output[30] = x1[30];
+ output[31] = x1[62];
+ output[32] = x1[1];
+ output[33] = x1[33];
+ output[34] = x1[17];
+ output[35] = x1[49];
+ output[36] = x1[9];
+ output[37] = x1[41];
+ output[38] = x1[25];
+ output[39] = x1[57];
+ output[40] = x1[5];
+ output[41] = x1[37];
+ output[42] = x1[21];
+ output[43] = x1[53];
+ output[44] = x1[13];
+ output[45] = x1[45];
+ output[46] = x1[29];
+ output[47] = x1[61];
+ output[48] = x1[3];
+ output[49] = x1[35];
+ output[50] = x1[19];
+ output[51] = x1[51];
+ output[52] = x1[11];
+ output[53] = x1[43];
+ output[54] = x1[27];
+ output[55] = x1[59];
+ output[56] = x1[7];
+ output[57] = x1[39];
+ output[58] = x1[23];
+ output[59] = x1[55];
+ output[60] = x1[15];
+ output[61] = x1[47];
+ output[62] = x1[31];
+ output[63] = x1[63];
+}
+
+/* 16-point forward ADST (per the function name) evaluated independently in
+ * every 16-bit lane of 16 __m256i rows.
+ *
+ * input   - 16 rows; all of them are copied into the local x1[] in stage 1
+ *           before anything is written to output.
+ * output  - 16 rows, written only in stage 9. Because of the above, passing
+ *           the same array as input is fine (the 2-D drivers in this file
+ *           call the 1-D kernels in place).
+ * cos_bit - selects the cosine table through cospi_arr() and fixes the
+ *           rounding constant 1 << (cos_bit - 1) handed to every
+ *           btf_16_w16_avx2 call.
+ *
+ * NOTE(review): btf_16_w16_avx2 and btf_16_adds_subs_avx2 are defined outside
+ * this view; the stage notes below describe only which elements and which
+ * constants each call receives.
+ */ static INLINE void fadst16x16_new_avx2(const __m256i *input, __m256i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m256i __zero = _mm256_setzero_si256();
+ const __m256i _r = _mm256_set1_epi32(1 << (cos_bit - 1));
+
+ __m256i cospi_p32_p32 = pair_set_w16_epi16(cospi[32], cospi[32]);
+ __m256i cospi_p32_m32 = pair_set_w16_epi16(cospi[32], -cospi[32]);
+ __m256i cospi_p16_p48 = pair_set_w16_epi16(cospi[16], cospi[48]);
+ __m256i cospi_p48_m16 = pair_set_w16_epi16(cospi[48], -cospi[16]);
+ __m256i cospi_m48_p16 = pair_set_w16_epi16(-cospi[48], cospi[16]);
+ __m256i cospi_p08_p56 = pair_set_w16_epi16(cospi[8], cospi[56]);
+ __m256i cospi_p56_m08 = pair_set_w16_epi16(cospi[56], -cospi[8]);
+ __m256i cospi_p40_p24 = pair_set_w16_epi16(cospi[40], cospi[24]);
+ __m256i cospi_p24_m40 = pair_set_w16_epi16(cospi[24], -cospi[40]);
+ __m256i cospi_m56_p08 = pair_set_w16_epi16(-cospi[56], cospi[8]);
+ __m256i cospi_m24_p40 = pair_set_w16_epi16(-cospi[24], cospi[40]);
+ __m256i cospi_p02_p62 = pair_set_w16_epi16(cospi[2], cospi[62]);
+ __m256i cospi_p62_m02 = pair_set_w16_epi16(cospi[62], -cospi[2]);
+ __m256i cospi_p10_p54 = pair_set_w16_epi16(cospi[10], cospi[54]);
+ __m256i cospi_p54_m10 = pair_set_w16_epi16(cospi[54], -cospi[10]);
+ __m256i cospi_p18_p46 = pair_set_w16_epi16(cospi[18], cospi[46]);
+ __m256i cospi_p46_m18 = pair_set_w16_epi16(cospi[46], -cospi[18]);
+ __m256i cospi_p26_p38 = pair_set_w16_epi16(cospi[26], cospi[38]);
+ __m256i cospi_p38_m26 = pair_set_w16_epi16(cospi[38], -cospi[26]);
+ __m256i cospi_p34_p30 = pair_set_w16_epi16(cospi[34], cospi[30]);
+ __m256i cospi_p30_m34 = pair_set_w16_epi16(cospi[30], -cospi[34]);
+ __m256i cospi_p42_p22 = pair_set_w16_epi16(cospi[42], cospi[22]);
+ __m256i cospi_p22_m42 = pair_set_w16_epi16(cospi[22], -cospi[42]);
+ __m256i cospi_p50_p14 = pair_set_w16_epi16(cospi[50], cospi[14]);
+ __m256i cospi_p14_m50 = pair_set_w16_epi16(cospi[14], -cospi[50]);
+ __m256i cospi_p58_p06 = pair_set_w16_epi16(cospi[58], cospi[6]);
+ __m256i cospi_p06_m58 = pair_set_w16_epi16(cospi[6], -cospi[58]);
+
+ // stage 1: gather the inputs into x1[] in permuted order; eight of them are negated with a saturating (0 - x)
+ __m256i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm256_subs_epi16(__zero, input[15]);
+ x1[2] = _mm256_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm256_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm256_subs_epi16(__zero, input[11]);
+ x1[8] = _mm256_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm256_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm256_subs_epi16(__zero, input[13]);
+ x1[14] = _mm256_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2: btf_16_w16_avx2 on pairs (2,3), (6,7), (10,11), (14,15) with the +/-cospi[32] weights
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p32_p32, cospi_p32_m32, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 3: btf_16_adds_subs_avx2 between elements two apart inside each group of four
+ btf_16_adds_subs_avx2(&x1[0], &x1[2]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[3]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[12], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[13], &x1[15]);
+
+ // stage 4: btf_16_w16_avx2 on pairs (4,5), (6,7), (12,13), (14,15) with the cospi[16]/cospi[48] weights
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p16_p48, cospi_p48_m16, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m48_p16, cospi_p16_p48, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 5: btf_16_adds_subs_avx2 between elements four apart inside each group of eight
+ btf_16_adds_subs_avx2(&x1[0], &x1[4]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[5]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[6]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[7]);
+ btf_16_adds_subs_avx2(&x1[8], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[9], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[10], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[11], &x1[15]);
+
+ // stage 6: btf_16_w16_avx2 on pairs (8,9) .. (14,15) with the cospi[8]/[56] and cospi[40]/[24] weights
+ btf_16_w16_avx2(cospi_p08_p56, cospi_p56_m08, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p40_p24, cospi_p24_m40, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m56_p08, cospi_p08_p56, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_m24_p40, cospi_p40_p24, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 7: btf_16_adds_subs_avx2 between elements eight apart
+ btf_16_adds_subs_avx2(&x1[0], &x1[8]);
+ btf_16_adds_subs_avx2(&x1[1], &x1[9]);
+ btf_16_adds_subs_avx2(&x1[2], &x1[10]);
+ btf_16_adds_subs_avx2(&x1[3], &x1[11]);
+ btf_16_adds_subs_avx2(&x1[4], &x1[12]);
+ btf_16_adds_subs_avx2(&x1[5], &x1[13]);
+ btf_16_adds_subs_avx2(&x1[6], &x1[14]);
+ btf_16_adds_subs_avx2(&x1[7], &x1[15]);
+
+ // stage 8: btf_16_w16_avx2 on every adjacent pair, each with its own cosine pair (cospi[2]/[62] .. cospi[58]/[6])
+ btf_16_w16_avx2(cospi_p02_p62, cospi_p62_m02, &x1[0], &x1[1], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p10_p54, cospi_p54_m10, &x1[2], &x1[3], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p18_p46, cospi_p46_m18, &x1[4], &x1[5], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p26_p38, cospi_p38_m26, &x1[6], &x1[7], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p34_p30, cospi_p30_m34, &x1[8], &x1[9], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p42_p22, cospi_p22_m42, &x1[10], &x1[11], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p50_p14, cospi_p14_m50, &x1[12], &x1[13], _r, cos_bit);
+ btf_16_w16_avx2(cospi_p58_p06, cospi_p06_m58, &x1[14], &x1[15], _r, cos_bit);
+
+ // stage 9: scatter x1[] into output[] (odd x1 indices ascending on even outputs, even x1 indices descending on odd outputs)
+ output[0] = x1[1];
+ output[1] = x1[14];
+ output[2] = x1[3];
+ output[3] = x1[12];
+ output[4] = x1[5];
+ output[5] = x1[10];
+ output[6] = x1[7];
+ output[7] = x1[8];
+ output[8] = x1[9];
+ output[9] = x1[6];
+ output[10] = x1[11];
+ output[11] = x1[4];
+ output[12] = x1[13];
+ output[13] = x1[2];
+ output[14] = x1[15];
+ output[15] = x1[0];
+}
+
+// Multiply-accumulates adjacent 16-bit pairs of |a| against the pair
+// (scale, 1 << (NewSqrt2Bits - 1)) and arithmetic-shifts each 32-bit sum right
+// by NewSqrt2Bits. The callers in this file interleave their data with ones,
+// so the second element of the pair acts as the rounding term.
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+  const int rounding = 1 << (NewSqrt2Bits - 1);
+  const __m256i scale_and_round = pair_set_w16_epi16(scale, rounding);
+  const __m256i sum = _mm256_madd_epi16(a, scale_and_round);
+  return _mm256_srai_epi32(sum, NewSqrt2Bits);
+}
+
+// Identity "transform" for 16 rows of 16-bit lanes: every lane is scaled by
+// 2 * NewSqrt2 with rounding (see scale_round_avx2) and packed back to 16 bits
+// with signed saturation. |cos_bit| is unused; it only exists so the function
+// matches the transform_1d_avx2 signature.
+static INLINE void fidentity16x16_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  const int scale = 2 * NewSqrt2;
+  const __m256i ones = _mm256_set1_epi16(1);
+
+  for (int row = 0; row < 16; ++row) {
+    // Pair every value with a 1 so the multiply-add can fold in the rounding.
+    const __m256i lo_pairs = _mm256_unpacklo_epi16(input[row], ones);
+    const __m256i hi_pairs = _mm256_unpackhi_epi16(input[row], ones);
+    const __m256i lo_scaled = scale_round_avx2(lo_pairs, scale);
+    const __m256i hi_scaled = scale_round_avx2(hi_pairs, scale);
+    output[row] = _mm256_packs_epi32(lo_scaled, hi_scaled);
+  }
+}
+
+// Identity "transform" for 32 rows of 16-bit lanes: every lane is shifted left
+// by two bits (multiplied by 4). |cos_bit| is unused; it only exists so the
+// function matches the transform_1d_avx2 signature.
+static INLINE void fidentity16x32_new_avx2(const __m256i *input,
+                                           __m256i *output, int8_t cos_bit) {
+  (void)cos_bit;
+  int row = 0;
+  while (row < 32) {
+    output[row] = _mm256_slli_epi16(input[row], 2);
+    ++row;
+  }
+}
+
+// Applies a shift to |size| registers of 32-bit lanes, reading |input| and
+// writing |output| (the two may be the same array).
+//   bit > 0  : av1_round_shift_32_avx2(x, bit)
+//   bit <= 0 : plain left shift by -bit
+static INLINE void av1_round_shift_array_32_avx2(__m256i *input,
+                                                 __m256i *output,
+                                                 const int size,
+                                                 const int bit) {
+  if (bit <= 0) {
+    for (int k = 0; k < size; k++) {
+      output[k] = _mm256_slli_epi32(input[k], -bit);
+    }
+    return;
+  }
+
+  for (int k = 0; k < size; k++) {
+    output[k] = av1_round_shift_32_avx2(input[k], bit);
+  }
+}
+
+// Same shift rule as av1_round_shift_array_32_avx2 (round-shift right when
+// bit > 0, left shift by -bit otherwise), followed by a multiplication of
+// every 32-bit lane by NewSqrt2 and a round-shift by NewSqrt2Bits.
+static INLINE void av1_round_shift_rect_array_32_avx2(__m256i *input,
+                                                      __m256i *output,
+                                                      const int size,
+                                                      const int bit) {
+  const __m256i sqrt2 = _mm256_set1_epi32(NewSqrt2);
+  const int shift_right = (bit > 0);
+
+  for (int k = 0; k < size; k++) {
+    __m256i shifted;
+    if (shift_right) {
+      shifted = av1_round_shift_32_avx2(input[k], bit);
+    } else {
+      shifted = _mm256_slli_epi32(input[k], -bit);
+    }
+    const __m256i scaled = _mm256_mullo_epi32(sqrt2, shifted);
+    output[k] = av1_round_shift_32_avx2(scaled, NewSqrt2Bits);
+  }
+}
+
+// Transposes an 8x8 block of 32-bit values held in inputA[0..7] (one row per
+// register) and writes result row k to output[k * stride], where |stride| is
+// counted in __m256i elements. All eight inputs are consumed before the first
+// output register is written.
+static INLINE void transpose_32_8x8_avx2(int stride, const __m256i *inputA,
+                                         __m256i *output) {
+  __m256i pass1[8];
+  __m256i pass2[8];
+
+  // First interleave: rows (0,2) and (1,3) of each group of four rows.
+  for (int half = 0; half < 2; ++half) {
+    const __m256i *const rows = inputA + 4 * half;
+    __m256i *const dst = pass1 + 4 * half;
+    dst[0] = _mm256_unpacklo_epi32(rows[0], rows[2]);
+    dst[1] = _mm256_unpackhi_epi32(rows[0], rows[2]);
+    dst[2] = _mm256_unpacklo_epi32(rows[1], rows[3]);
+    dst[3] = _mm256_unpackhi_epi32(rows[1], rows[3]);
+  }
+
+  // Second interleave: same pairing applied to the first-pass results.
+  for (int half = 0; half < 2; ++half) {
+    const __m256i *const src = pass1 + 4 * half;
+    __m256i *const dst = pass2 + 4 * half;
+    dst[0] = _mm256_unpacklo_epi32(src[0], src[2]);
+    dst[1] = _mm256_unpackhi_epi32(src[0], src[2]);
+    dst[2] = _mm256_unpacklo_epi32(src[1], src[3]);
+    dst[3] = _mm256_unpackhi_epi32(src[1], src[3]);
+  }
+
+  // Join the 128-bit halves: 0x20 takes both low halves (rows 0-3), 0x31 takes
+  // both high halves (rows 4-7).
+  for (int k = 0; k < 4; ++k) {
+    output[k * stride] =
+        _mm256_permute2x128_si256(pass2[k], pass2[k + 4], 0x20);
+  }
+  for (int k = 0; k < 4; ++k) {
+    output[(k + 4) * stride] =
+        _mm256_permute2x128_si256(pass2[k], pass2[k + 4], 0x31);
+  }
+}
+
+// Writes |out_size| rows of 16 values each. Every row comes from one register
+// of |in| whose 16-bit lanes are sign-extended to 32 bits; consecutive rows are
+// |stride| int32 elements apart in |out|. The stores are aligned 256-bit
+// stores, so each destination row must be 32-byte aligned.
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in,
+                                                        int32_t *out,
+                                                        const int stride,
+                                                        const int out_size) {
+  int32_t *dst = out;
+  for (int row = 0; row < out_size; ++row) {
+    const __m128i lower8 = _mm256_castsi256_si128(in[row]);
+    _mm256_store_si256((__m256i *)dst, _mm256_cvtepi16_epi32(lower8));
+    const __m128i upper8 = _mm256_extracti128_si256(in[row], 1);
+    _mm256_store_si256((__m256i *)(dst + 8), _mm256_cvtepi16_epi32(upper8));
+    dst += stride;
+  }
+}
+
+// Scales the 16 16-bit lanes of |a| by NewSqrt2 with rounding (see
+// scale_round_avx2) and stores the 16 resulting 32-bit values at b[0..15]
+// using two aligned 256-bit stores.
+static INLINE void store_rect_16bit_to_32bit_avx2(const __m256i a,
+                                                  int32_t *const b) {
+  const __m256i ones = _mm256_set1_epi16(1);
+  // 0xd8 orders the 64-bit quarters as 0, 2, 1, 3 so that the per-128-bit-lane
+  // unpacks below yield elements 0-7 and 8-15 in their natural order.
+  const __m256i reordered = _mm256_permute4x64_epi64(a, 0xd8);
+  const __m256i first8 = _mm256_unpacklo_epi16(reordered, ones);
+  const __m256i last8 = _mm256_unpackhi_epi16(reordered, ones);
+  _mm256_store_si256((__m256i *)b, scale_round_avx2(first8, NewSqrt2));
+  _mm256_store_si256((__m256i *)(b + 8), scale_round_avx2(last8, NewSqrt2));
+}
+
+// Row-by-row wrapper around store_rect_16bit_to_32bit_avx2: register in[r] is
+// written to out + r * stride for r in [0, out_size).
+static INLINE void store_rect_buffer_16bit_to_32bit_w16_avx2(
+    const __m256i *const in, int32_t *const out, const int stride,
+    const int out_size) {
+  for (int row = 0; row < out_size; ++row) {
+    int32_t *const dst_row = out + row * stride;
+    store_rect_16bit_to_32bit_avx2(in[row], dst_row);
+  }
+}
+
+/* Column-pass 1-D kernels, indexed by TX_TYPE, used by the drivers in this
+ * file whose column buffers hold 32 rows (the 32x32 and 16x32 drivers).
+ * Only the DCT and identity kernels are present. A NULL entry means no kernel
+ * is provided here for that type; the drivers call the selected entry without
+ * a NULL check.
+ * NOTE(review): presumably callers never route those tx_types here - confirm.
+ */ static const transform_1d_avx2 col_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fdct16x32_new_avx2, // V_DCT
+ fidentity16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+/* Row-pass 1-D kernels, indexed by TX_TYPE, used by the drivers in this file
+ * whose row buffers hold 32 entries (the 32x32 and 32x16 drivers).
+ * Compared with the matching column table, the V_DCT and H_DCT entries are
+ * swapped: V_DCT uses the identity kernel on rows, H_DCT uses the DCT.
+ * A NULL entry means no kernel is provided here for that type; the drivers
+ * call the selected entry without a NULL check.
+ * NOTE(review): presumably callers never route those tx_types here - confirm.
+ */ static const transform_1d_avx2 row_txfm16x32_arr[TX_TYPES] = {
+ fdct16x32_new_avx2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity16x32_new_avx2, // IDTX
+ fidentity16x32_new_avx2, // V_DCT
+ fdct16x32_new_avx2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+/* Column-pass 1-D kernels, indexed by TX_TYPE, used by the drivers in this
+ * file whose column buffers hold 16 rows (the 16x16 and 32x16 drivers).
+ * Every TX_TYPE has an entry. The first half of each two-part type name picks
+ * the column kernel (ADST_DCT -> ADST, DCT_ADST -> DCT); the H_* types use the
+ * identity kernel on columns. FLIPADST entries reuse the ADST kernel: the
+ * drivers handle flipping separately through get_flip_cfg().
+ */ static const transform_1d_avx2 col_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fadst16x16_new_avx2, // ADST_DCT
+ fdct16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fadst16x16_new_avx2, // FLIPADST_DCT
+ fdct16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fdct16x16_new_avx2, // V_DCT
+ fidentity16x16_new_avx2, // H_DCT
+ fadst16x16_new_avx2, // V_ADST
+ fidentity16x16_new_avx2, // H_ADST
+ fadst16x16_new_avx2, // V_FLIPADST
+ fidentity16x16_new_avx2 // H_FLIPADST
+};
+
+/* Row-pass 1-D kernels, indexed by TX_TYPE, used by the drivers in this file
+ * whose row buffers hold 16 entries (the 16x16 and 16x32 drivers).
+ * Every TX_TYPE has an entry. The second half of each two-part type name picks
+ * the row kernel (ADST_DCT -> DCT, DCT_ADST -> ADST); the V_* types use the
+ * identity kernel on rows. FLIPADST entries reuse the ADST kernel: the drivers
+ * handle flipping separately through get_flip_cfg().
+ */ static const transform_1d_avx2 row_txfm16x16_arr[TX_TYPES] = {
+ fdct16x16_new_avx2, // DCT_DCT
+ fdct16x16_new_avx2, // ADST_DCT
+ fadst16x16_new_avx2, // DCT_ADST
+ fadst16x16_new_avx2, // ADST_ADST
+ fdct16x16_new_avx2, // FLIPADST_DCT
+ fadst16x16_new_avx2, // DCT_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_FLIPADST
+ fadst16x16_new_avx2, // ADST_FLIPADST
+ fadst16x16_new_avx2, // FLIPADST_ADST
+ fidentity16x16_new_avx2, // IDTX
+ fidentity16x16_new_avx2, // V_DCT
+ fdct16x16_new_avx2, // H_DCT
+ fidentity16x16_new_avx2, // V_ADST
+ fadst16x16_new_avx2, // H_ADST
+ fidentity16x16_new_avx2, // V_FLIPADST
+ fadst16x16_new_avx2 // H_FLIPADST
+};
+
+// 16x16 forward 2-D transform on 16-bit residuals.
+//   input   - top-left sample of the block; rows are |stride| int16 apart.
+//   output  - receives 16 rows of 16 int32 coefficients (row pitch 16).
+//   tx_type - selects the column/row kernels and the flip configuration.
+//   bd      - unused.
+// Pipeline: load (optionally flipped vertically) -> shift[0] -> column kernel
+// -> shift[1] -> transpose -> optional left/right flip -> row kernel ->
+// shift[2] -> transpose -> sign-extending store.
+static void lowbd_fwd_txfm2d_16x16_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X16;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  __m256i col_buf[16], row_buf[16];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass.
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input, stride, col_buf, height);
+  }
+  round_shift_16bit_w16_avx2(col_buf, height, shift[0]);
+  col_txfm(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit_w16_avx2(col_buf, height, shift[1]);
+  transpose_16bit_16x16_avx2(col_buf, row_buf);
+
+  // Row pass. When flipping, the flipped copy goes into col_buf, which is no
+  // longer needed at this point.
+  __m256i *rows = row_buf;
+  if (lr_flip) {
+    flip_buf_avx2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_txfm(rows, rows, cos_bit_row);
+  round_shift_16bit_w16_avx2(rows, width, shift[2]);
+  transpose_16bit_16x16_avx2(rows, rows);
+  store_buffer_16bit_to_32bit_w16_avx2(rows, output, width, 16);
+}
+
+// 32x32 forward 2-D transform on 16-bit residuals, processed as 16-lane strips.
+//   input   - top-left sample of the block; rows are |stride| int16 apart.
+//   output  - receives int32 coefficients with a row pitch of |width|.
+//   tx_type - selects the column/row kernels and the flip configuration.
+//   bd      - unused.
+// The column pass handles the two 16-column strips one after the other and
+// transposes each result as two 16x16 tiles into row_buf. The row pass then
+// handles two bands of 16 rows, storing each band as two 16x16 tiles.
+static void lowbd_fwd_txfm2d_32x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_32X32;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+  __m256i col_buf[32], row_buf[128];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass, one 16-column strip per iteration.
+  for (int strip = 0; strip < 2; strip++) {
+    const int16_t *const src = input + 16 * strip;
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip_avx2(src, stride, col_buf, height);
+    } else {
+      load_buffer_16bit_to_16bit_avx2(src, stride, col_buf, height);
+    }
+    round_shift_16bit_w16_avx2(col_buf, height, shift[0]);
+    col_txfm(col_buf, col_buf, cos_bit_col);
+    round_shift_16bit_w16_avx2(col_buf, height, shift[1]);
+
+    __m256i *const tile = row_buf + 16 * strip;
+    transpose_16bit_16x16_avx2(col_buf, tile);
+    transpose_16bit_16x16_avx2(col_buf + 16, tile + width);
+  }
+
+  // Row pass, one band of 16 rows per iteration.
+  for (int band = 0; band < 2; band++) {
+    __m256i *rows = row_buf + width * band;
+    if (lr_flip) {
+      flip_buf_avx2(rows, col_buf, width);
+      rows = col_buf;
+    }
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit_w16_avx2(rows, width, shift[2]);
+
+    int32_t *const dst = output + 16 * width * band;
+    transpose_16bit_16x16_avx2(rows, rows);
+    store_buffer_16bit_to_32bit_w16_avx2(rows, dst, width, 16);
+    transpose_16bit_16x16_avx2(rows + 16, rows + 16);
+    store_buffer_16bit_to_32bit_w16_avx2(rows + 16, dst + 16, width, 16);
+  }
+}
+
+/* 64x64 forward 2-D DCT on 16-bit residuals; DCT_DCT is the only accepted
+ * tx_type (asserted). bd is unused.
+ *
+ * Column pass (16-bit): each 16-column strip is loaded, shifted by shift[0],
+ * run through fdct16x64_new_avx2 and shifted by shift[1]. Only the first
+ * AOMMIN(2, height_div16) 16-row tiles of each strip are transposed into buf1;
+ * the remaining column-transform outputs are not used.
+ *
+ * Row pass (32-bit): for each kept band of 16 rows, the 16-bit values are
+ * sign-extended into bufA (lower 8 lanes of every register) and bufB (upper 8
+ * lanes), run through av1_fdct64_new_avx2, and only the first 32 results of
+ * each are shifted and written out. Output rows are 32 int32 apart and each
+ * band covers 16 rows, so this function writes a 32x32 coefficient block.
+ *
+ * NOTE(review): transpose_32_8x8_avx2 writes through a __m256i pointer derived
+ * from output, so output presumably must be 32-byte aligned - confirm.
+ */ static void lowbd_fwd_txfm2d_64x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;  // otherwise referenced only by the assert below
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ for (int i = 0; i < width_div16; i++) {  // column pass: one 16-column strip per iteration
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {  // keep at most the first two 16-row tiles
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {  // row pass: one kept band of 16 rows per iteration
+ __m256i bufA[64];
+ __m256i bufB[64];
+ __m128i *buf = (__m128i *)(buf1 + width * i);  // view each 256-bit register as two 128-bit halves
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);  // lower 8 lanes of register j, sign-extended
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);  // upper 8 lanes of register j, sign-extended
+ }
+ av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ av1_round_shift_array_32_avx2(bufA, bufA, 32, -shift[2]);  // only the first 32 results are processed further
+ av1_round_shift_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 16 * 32 * i;  // band i starts 16 rows of 32 int32 further on
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);  // columns 8*j .. 8*j+7 of the band
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);  // stride 4 registers = 32 int32 per output row
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);  // bufB supplies the rows 8 further down
+ }
+ }
+}
+
+// 16x32 (16 wide, 32 high) forward 2-D transform on 16-bit residuals.
+//   input   - top-left sample of the block; rows are |stride| int16 apart.
+//   output  - receives int32 coefficients with a row pitch of |width|.
+//   tx_type - selects the column/row kernels and the flip configuration.
+//   bd      - unused.
+// One column pass over all 32 rows, then two row passes of 16 rows each. The
+// results go through the "rect" store, which additionally scales every
+// coefficient by NewSqrt2 (see store_rect_16bit_to_32bit_avx2).
+static void lowbd_fwd_txfm2d_16x32_avx2(const int16_t *input, int32_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const TX_SIZE tx_size = TX_16X32;
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+  const transform_1d_avx2 row_txfm = row_txfm16x16_arr[tx_type];
+  __m256i col_buf[32], row_buf[32];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass over the whole block.
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip_avx2(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_avx2(input, stride, col_buf, height);
+  }
+  round_shift_16bit_w16_avx2(col_buf, height, shift[0]);
+  col_txfm(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit_w16_avx2(col_buf, height, shift[1]);
+  transpose_16bit_16x16_avx2(col_buf, row_buf);
+  transpose_16bit_16x16_avx2(col_buf + 16, row_buf + 16);
+
+  // Row pass, one band of 16 rows per iteration.
+  for (int band = 0; band < 2; band++) {
+    __m256i *rows = row_buf + width * band;
+    if (lr_flip) {
+      flip_buf_avx2(rows, col_buf, width);
+      rows = col_buf;
+    }
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit_w16_avx2(rows, width, shift[2]);
+    transpose_16bit_16x16_avx2(rows, rows);
+
+    int32_t *const dst = output + 16 * width * band;
+    store_rect_buffer_16bit_to_32bit_w16_avx2(rows, dst, width, 16);
+  }
+}
+
+// Low-bitdepth forward 2D transform for TX_32X16 (AVX2).
+// The column pass is run separately on the left and right 16-column halves of
+// the input; the row pass then runs once over all 32 registers. `bd` is
+// ignored.
+static void lowbd_fwd_txfm2d_32x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m256i buf0[32], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+ const int txw_idx = get_txw_idx(TX_32X16);
+ const int txh_idx = get_txh_idx(TX_32X16);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = 32;
+ const int height = 16;
+ const transform_1d_avx2 col_txfm = col_txfm16x16_arr[tx_type];
+ const transform_1d_avx2 row_txfm = row_txfm16x32_arr[tx_type];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ // Column pass: half i reads input columns starting at 16 * i and its
+ // transposed 16x16 tile is written to buf1 + 16 * i.
+ for (int i = 0; i < 2; i++) {
+ if (ud_flip) {
+ load_buffer_16bit_to_16bit_flip_avx2(input + 16 * i, stride, buf0,
+ height);
+ } else {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ }
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ transpose_16bit_16x16_avx2(buf0, buf1 + 0 * width + 16 * i);
+ }
+
+ // Row pass over `width` registers; with lr_flip the flipped copy lives in
+ // buf0, which the column pass no longer needs.
+ __m256i *buf;
+ if (lr_flip) {
+ buf = buf0;
+ flip_buf_avx2(buf1, buf, width);
+ } else {
+ buf = buf1;
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ // First 16 registers -> output columns starting at 0.
+ transpose_16bit_16x16_avx2(buf, buf);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf, output, width, 16);
+
+ // Remaining 16 registers -> output columns starting at 16.
+ transpose_16bit_16x16_avx2(buf + 16, buf + 16);
+ store_rect_buffer_16bit_to_32bit_w16_avx2(buf + 16, output + 16, width, 16);
+}
+
+// Low-bitdepth forward 2D transform for TX_64X32 (AVX2).
+// Only DCT_DCT is supported: the row pass is hard-wired to
+// av1_fdct64_new_avx2 and no flip handling is performed. The column pass runs
+// on 16-bit lanes; the row pass widens to 32-bit lanes. Only the first 32
+// outputs of each 64-point row transform are shifted and stored. `bd` is
+// ignored.
+static void lowbd_fwd_txfm2d_64x32_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ // Check the precondition before tx_type is used to pick col_txfm, as the
+ // sibling 64-point functions do.
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X32;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = col_txfm16x32_arr[tx_type];
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass, 16 input columns per iteration; each 16x16 tile is
+ // transposed into buf1 at row-band j, column-group i.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[64];
+ __m256i bufB[64];
+ // View each __m256i as two __m128i halves: the low half is sign-extended
+ // into bufA, the high half into bufB.
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct64_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct64_new_avx2(bufB, bufB, cos_bit_row);
+ // Only the first 32 registers are rounded and stored.
+ av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+// Low-bitdepth forward 2D transform for TX_32X64 (AVX2); DCT_DCT only.
+// The column pass uses fdct16x64_new_avx2 on 16-bit lanes, but only the first
+// two 16-row bands of its output are carried into the row pass. The row pass
+// widens to 32-bit lanes and uses av1_fdct32_new_avx2. `bd` is ignored.
+static void lowbd_fwd_txfm2d_32x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m256i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass, 16 input columns per iteration.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ // Keep at most two 16-row bands of the column output.
+ for (int j = 0; j < AOMMIN(2, height_div16); ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i bufA[32];
+ __m256i bufB[32];
+ // View each __m256i as two __m128i halves: the low half is sign-extended
+ // into bufA, the high half into bufB.
+ __m128i *buf = (__m128i *)(buf1 + width * i);
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm256_cvtepi16_epi32(buf[j * 2]);
+ bufB[j] = _mm256_cvtepi16_epi32(buf[j * 2 + 1]);
+ }
+ av1_fdct32_new_avx2(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_avx2(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_avx2(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_rect_array_32_avx2(bufB, bufB, 32, -shift[2]);
+
+ // Band i starts 16 * 32 coefficients into `output`; bufB's tiles are
+ // written 8 * 4 __m256i after bufA's.
+ int32_t *output8 = output + 16 * 32 * i;
+ for (int j = 0; j < 4; ++j) {
+ __m256i *out = (__m256i *)(output8 + 8 * j);
+ transpose_32_8x8_avx2(4, bufA + 8 * j, out);
+ transpose_32_8x8_avx2(4, bufB + 8 * j, out + 8 * 4);
+ }
+ }
+}
+
+// Low-bitdepth forward 2D transform for TX_16X64 (AVX2); DCT_DCT only.
+// Column pass: fdct16x64_new_avx2; row pass: fdct16x16_new_avx2, both on
+// 16-bit lanes. Only the first two 16-row bands of coefficients are produced;
+// the rest of the 16x64 output is cleared to zero. `bd` is ignored.
+static void lowbd_fwd_txfm2d_16x64_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_16X64;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x64_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x16_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass, 16 input columns per iteration.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass. Bands 2 and 3 would be stored at output + 16 * 32 and beyond,
+ // which the memset below overwrites with zeros, so only bands 0 and 1 are
+ // transformed.
+ for (int i = 0; i < AOMMIN(2, height_div16); i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ int32_t *output16 = output + 16 * width * i;
+ for (int j = 0; j < width_div16; ++j) {
+ __m256i *buf16 = buf + 16 * j;
+ transpose_16bit_16x16_avx2(buf16, buf16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, width, 16);
+ }
+ }
+ // Zero out the bottom 16x32 area.
+ memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+// Low-bitdepth forward 2D transform for TX_64X16 (AVX2); DCT_DCT only.
+// Column pass: fdct16x16_new_avx2; row pass: fdct16x64_new_avx2, both on
+// 16-bit lanes. Only the first two 16-register groups of each row-transform
+// result are stored, at an output stride of 32. `bd` is ignored.
+static void lowbd_fwd_txfm2d_64x16_avx2(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X16;
+ __m256i buf0[64], buf1[64];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_avx2 col_txfm = fdct16x16_new_avx2;
+ const transform_1d_avx2 row_txfm = fdct16x64_new_avx2;
+ const int width_div16 = (width >> 4);
+ const int height_div16 = (height >> 4);
+
+ // Column pass, 16 input columns per iteration; tiles are transposed into
+ // buf1 at row-band j, column-group i.
+ for (int i = 0; i < width_div16; i++) {
+ load_buffer_16bit_to_16bit_avx2(input + 16 * i, stride, buf0, height);
+ round_shift_16bit_w16_avx2(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit_w16_avx2(buf0, height, shift[1]);
+ for (int j = 0; j < height_div16; ++j) {
+ transpose_16bit_16x16_avx2(buf0 + j * 16, buf1 + j * width + 16 * i);
+ }
+ }
+
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < height_div16; i++) {
+ __m256i *buf = buf1 + width * i;
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit_w16_avx2(buf, width, shift[2]);
+ int32_t *output16 = output + 16 * 32 * i;
+ // Store only groups 0 and 1 (16 registers each) of the row result.
+ for (int j = 0; j < 2; ++j) {
+ __m256i *buf16 = buf + 16 * j;
+ transpose_16bit_16x16_avx2(buf16, buf16);
+ store_buffer_16bit_to_32bit_w16_avx2(buf16, output16 + 16 * j, 32, 16);
+ }
+ }
+}
+
+// Per-transform-size dispatch table for av1_lowbd_fwd_txfm_avx2, indexed by
+// txfm_param->tx_size. Entries named *_sse2 reuse the SSE2 implementations;
+// entries named lowbd_*_avx2 are the static functions defined in this file.
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ lowbd_fwd_txfm2d_16x16_avx2, // 16x16 transform
+ lowbd_fwd_txfm2d_32x32_avx2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_avx2, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ lowbd_fwd_txfm2d_16x32_avx2, // 16x32 transform
+ lowbd_fwd_txfm2d_32x16_avx2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_avx2, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_avx2, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ lowbd_fwd_txfm2d_16x64_avx2, // 16x64 transform
+ lowbd_fwd_txfm2d_64x16_avx2, // 64x16 transform
+};
+
+// Entry point for the low-bitdepth forward transform (AVX2 build).
+// Dispatches on txfm_param->tx_size through fwd_txfm2d_func_ls. Lossless 4x4
+// blocks, and any size whose table entry is NULL, are handed to
+// av1_lowbd_fwd_txfm_c instead.
+void av1_lowbd_fwd_txfm_avx2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  const FwdTxfm2dFunc simd_txfm = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  const int is_lossless_4x4 =
+      txfm_param->lossless && (txfm_param->tx_size == TX_4X4);
+
+  if (simd_txfm != NULL && !is_lossless_4x4) {
+    simd_txfm(src_diff, coeff, diff_stride, txfm_param->tx_type,
+              txfm_param->bd);
+    return;
+  }
+  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000000..8ec0256eb8
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm2d_sse4.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/av1_rtcd.h"
+
+#include "av1/common/enums.h"
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// Copies a txfm1d_size x txfm1d_size block of int16_t samples, whose rows are
+// `stride` elements apart in `input`, into the densely packed int32_t array
+// `output` (row pitch txfm1d_size), widening each sample.
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  for (int row = 0; row < txfm1d_size; ++row) {
+    const int16_t *src_row = input + row * stride;
+    int32_t *dst_row = output + row * txfm1d_size;
+    for (int col = 0; col < txfm1d_size; ++col) {
+      dst_row[col] = (int32_t)src_row[col];
+    }
+  }
+}
+
+// Signature shared by the whole-block 1D transform wrappers below
+// (fdct32_new_sse4_1 / fdct64_new_sse4_1). Both wrappers ignore stage_range.
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+// Applies av1_fdct32_new_sse4_1 to every vector column of a 32x32 block of
+// 32-bit values. The block is stored as 32 rows of 8 __m128i each; for each of
+// the 8 vector columns the 32 row entries are gathered into a contiguous
+// scratch array, transformed, and scattered back to `output` at the same
+// positions. stage_range is unused.
+static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int vecs_per_row = 32 / 4;
+  __m128i gathered[32];
+  __m128i transformed[32];
+
+  for (int c = 0; c < vecs_per_row; ++c) {
+    for (int k = 0; k < 32; ++k) {
+      gathered[k] = input[k * vecs_per_row + c];
+    }
+    av1_fdct32_new_sse4_1(gathered, transformed, cos_bit);
+    for (int k = 0; k < 32; ++k) {
+      output[k * vecs_per_row + c] = transformed[k];
+    }
+  }
+}
+
+// Applies av1_fdct64_new_sse4_1 to each of the 16 vector columns of a 64x64
+// block of 32-bit values, passing 16 as both the input and the output stride
+// arguments. stage_range is unused.
+static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+                              const int8_t cos_bit, const int8_t *stage_range) {
+  (void)stage_range;
+  const int vecs_per_row = 64 / 4;
+  for (int c = 0; c < vecs_per_row; ++c) {
+    av1_fdct64_new_sse4_1(input + c, output + c, cos_bit, vecs_per_row,
+                          vecs_per_row);
+  }
+}
+
+// Maps a 1D transform type to its SSE4.1 whole-block wrapper. Only
+// TXFM_TYPE_DCT32 and TXFM_TYPE_DCT64 are handled; any other value trips the
+// assert and, when asserts are compiled out, yields NULL.
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  if (txfm_type == TXFM_TYPE_DCT32) return fdct32_new_sse4_1;
+  if (txfm_type == TXFM_TYPE_DCT64) return fdct64_new_sse4_1;
+  assert(0);
+  return NULL;
+}
+
+// Generic square forward 2D transform on 32-bit lanes (SSE4.1).
+// `output` and `txfm_buf` are used as ping-pong buffers: each stage reads one
+// and writes the other, so the call order below must not be changed. Both
+// must hold txfm_size * txfm_size int32_t values and are accessed as __m128i.
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+ const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ // TODO(sarahparker) This does not currently support rectangular transforms
+ // and will break without splitting txfm_size out into row and col size.
+ // Rectangular transforms use c code only, so it should be ok for now.
+ // It will be corrected when there are sse implementations for rectangular
+ // transforms.
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t *stage_range_row = cfg->stage_range_row;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+ int num_per_128 = 4;
+ // Number of __m128i vectors in the whole block.
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+ // Widen the strided 16-bit input into txfm_buf, then:
+ // shift[0] (buf -> out), column transform (out -> buf),
+ // shift[1] (buf -> out), transpose (out -> buf),
+ // row transform (buf -> out), shift[2] (out -> buf),
+ // transpose (buf -> out), leaving the result in `output`.
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+ txfm_size);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+ txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32(txfm_size, buf_128, out_128);
+}
+
+// 64x64 variant of fwd_txfm2d_sse4_1. Differences visible here: the input is
+// widened straight into `output` and no shift[0] stage is applied; the row
+// pass calls av1_fdct64_new_sse4_1 directly on only the first half of the
+// vector columns with a halved output stride; and the final transpose is
+// transpose_32x32. `output` and `txfm_buf` alternate as source/destination.
+static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
+ int32_t *output, const int stride,
+ const TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf) {
+ assert(cfg->tx_size < TX_SIZES);
+ const int txfm_size = tx_size_wide[cfg->tx_size];
+ const int8_t *shift = cfg->shift;
+ const int8_t *stage_range_col = cfg->stage_range_col;
+ const int8_t cos_bit_col = cfg->cos_bit_col;
+ const int8_t cos_bit_row = cfg->cos_bit_row;
+ const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+ __m128i *buf_128 = (__m128i *)txfm_buf;
+ __m128i *out_128 = (__m128i *)output;
+
+ const int num_per_128 = 4;
+ int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+ // Number of __m128i vectors per row of the block.
+ int col_num = txfm_size / num_per_128;
+
+ int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
+ txfm_size);
+ /*col wise transform*/
+ txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+ av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+ transpose_32(txfm_size, out_128, buf_128);
+
+ /*row wise transform*/
+ // Input stride is col_num; output stride is col_num >> 1.
+ for (int col = 0; col < (col_num >> 1); col++) {
+ av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
+ col_num, (col_num >> 1));
+ }
+
+ // From here on only (col_num / 2) * (txfm_size / 2) vectors are processed.
+ txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
+ av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+ transpose_32x32(buf_128, out_128);
+}
+
+// Public 32x32 forward 2D transform (SSE4.1). Fetches the TX_32X32
+// configuration for tx_type and runs fwd_txfm2d_sse4_1 with a 16-byte aligned
+// stack scratch buffer of 32 * 32 int32_t values. `bd` is ignored.
+void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  DECLARE_ALIGNED(16, int32_t, scratch[32 * 32]);
+  TXFM_2D_FLIP_CFG txfm_cfg;
+
+  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &txfm_cfg);
+  fwd_txfm2d_sse4_1(input, output, stride, &txfm_cfg, scratch);
+}
+
+// Public 64x64 forward 2D transform (SSE4.1). Fetches the TX_64X64
+// configuration for tx_type and runs fwd_txfm2d_64x64_sse4_1 with a 16-byte
+// aligned stack scratch buffer of 64 * 64 int32_t values. `bd` is ignored.
+void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  DECLARE_ALIGNED(16, int32_t, scratch[64 * 64]);
+  TXFM_2D_FLIP_CFG txfm_cfg;
+
+  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &txfm_cfg);
+  fwd_txfm2d_64x64_sse4_1(input, output, stride, &txfm_cfg, scratch);
+}
+
+// Transposes two 4x4 tiles of 32-bit values. The four vectors at inputA are
+// transposed into output[0..3 * stride]; the four vectors at inputB are
+// transposed into output[4 * stride..7 * stride]. `stride` is in __m128i
+// units. inputA is fully read and its results written before inputB is read.
+static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
+                                      const __m128i *inputB, __m128i *output) {
+  const __m128i *const tiles[2] = { inputA, inputB };
+
+  for (int t = 0; t < 2; ++t) {
+    const __m128i *src = tiles[t];
+    __m128i *dst = output + 4 * t * stride;
+
+    // Interleave rows 0/2 and rows 1/3 ...
+    const __m128i r02_lo = _mm_unpacklo_epi32(src[0], src[2]);
+    const __m128i r02_hi = _mm_unpackhi_epi32(src[0], src[2]);
+    const __m128i r13_lo = _mm_unpacklo_epi32(src[1], src[3]);
+    const __m128i r13_hi = _mm_unpackhi_epi32(src[1], src[3]);
+
+    // ... then interleave those pairs to form the transposed rows.
+    dst[0 * stride] = _mm_unpacklo_epi32(r02_lo, r13_lo);
+    dst[1 * stride] = _mm_unpackhi_epi32(r02_lo, r13_lo);
+    dst[2 * stride] = _mm_unpacklo_epi32(r02_hi, r13_hi);
+    dst[3 * stride] = _mm_unpackhi_epi32(r02_hi, r13_hi);
+  }
+}
+
+// Low-bitdepth forward 2D transform for TX_64X64 (SSE2 column pass, SSE4.1
+// row pass); DCT_DCT only. The column pass uses fdct8x64_new_sse2 on 16-bit
+// lanes and keeps at most four 8-row bands of its output; the row pass widens
+// to 32-bit lanes, runs av1_fdct64_new_sse4_1 and stores only the first 32
+// outputs per row. `bd` is ignored.
+static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X64;
+ __m128i buf0[64], buf1[512];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ // Column pass, 8 input columns per iteration.
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ // Sign-extend the low four int16 of each register into bufA and the high
+ // four (moved down with unpackhi_epi64) into bufB.
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ // Only the first 32 registers are rounded and stored.
+ av1_round_shift_array_32_sse4_1(bufA, bufA, 32, -shift[2]);
+ av1_round_shift_array_32_sse4_1(bufB, bufB, 32, -shift[2]);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+// Low-bitdepth forward 2D transform for TX_64X32 (SSE2 column pass, SSE4.1
+// row pass). Only DCT_DCT is supported: the row pass is hard-wired to
+// av1_fdct64_new_sse4_1 and no flip handling is performed. Only the first 32
+// outputs of each 64-point row transform are shifted and stored. `bd` is
+// ignored.
+static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ // Check the precondition before tx_type is used to pick col_txfm, as the
+ // sibling 64-point functions do.
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_64X32;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ // Column pass, 8 input columns per iteration; each 8x8 tile is transposed
+ // into buf1 at row-band j, column-group i.
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[64];
+ __m128i bufB[64];
+ __m128i *buf = buf1 + width * i;
+ // Sign-extend the low four int16 of each register into bufA and the high
+ // four (moved down with unpackhi_epi64) into bufB.
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
+ av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
+ // Only the first 32 registers are rounded and stored.
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < width_div8; ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+// Low-bitdepth forward 2D transform for TX_32X64 (SSE2 column pass, SSE4.1
+// row pass); DCT_DCT only. The column pass uses fdct8x64_new_sse2 on 16-bit
+// lanes and keeps at most four 8-row bands of its output; the row pass widens
+// to 32-bit lanes and runs av1_fdct32_new_sse4_1. `bd` is ignored.
+static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ (void)tx_type;
+ assert(tx_type == DCT_DCT);
+ const TX_SIZE tx_size = TX_32X64;
+ __m128i buf0[64], buf1[256];
+ const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+ const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+ const int width = tx_size_wide[tx_size];
+ const int height = tx_size_high[tx_size];
+ const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+ const int width_div8 = (width >> 3);
+ const int height_div8 = (height >> 3);
+
+ // Column pass, 8 input columns per iteration.
+ for (int i = 0; i < width_div8; i++) {
+ load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+ round_shift_16bit(buf0, height, shift[0]);
+ col_txfm(buf0, buf0, cos_bit_col);
+ round_shift_16bit(buf0, height, shift[1]);
+ for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
+ transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
+ }
+ }
+
+ // Row pass, one band of `width` registers per iteration.
+ for (int i = 0; i < AOMMIN(4, height_div8); i++) {
+ __m128i bufA[32];
+ __m128i bufB[32];
+ __m128i *buf = buf1 + width * i;
+ // Sign-extend the low four int16 of each register into bufA and the high
+ // four (moved down with unpackhi_epi64) into bufB.
+ for (int j = 0; j < width; ++j) {
+ bufA[j] = _mm_cvtepi16_epi32(buf[j]);
+ bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
+ }
+ av1_fdct32_new_sse4_1(bufA, bufA, cos_bit_row);
+ av1_fdct32_new_sse4_1(bufB, bufB, cos_bit_row);
+ av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
+ av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);
+
+ int32_t *output8 = output + 8 * 32 * i;
+ for (int j = 0; j < (32 / 4); ++j) {
+ __m128i *out = (__m128i *)(output8 + 4 * j);
+ transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
+ }
+ }
+}
+
+// Per-transform-size dispatch table for av1_lowbd_fwd_txfm_sse4_1, indexed by
+// txfm_param->tx_size. Only the 64x64, 32x64 and 64x32 entries are the static
+// SSE4.1 functions from this file; every other entry reuses an *_sse2
+// implementation.
+static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
+ av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform
+ av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform
+ av1_lowbd_fwd_txfm2d_16x16_sse2, // 16x16 transform
+ av1_lowbd_fwd_txfm2d_32x32_sse2, // 32x32 transform
+ lowbd_fwd_txfm2d_64x64_sse4_1, // 64x64 transform
+ av1_lowbd_fwd_txfm2d_4x8_sse2, // 4x8 transform
+ av1_lowbd_fwd_txfm2d_8x4_sse2, // 8x4 transform
+ av1_lowbd_fwd_txfm2d_8x16_sse2, // 8x16 transform
+ av1_lowbd_fwd_txfm2d_16x8_sse2, // 16x8 transform
+ av1_lowbd_fwd_txfm2d_16x32_sse2, // 16x32 transform
+ av1_lowbd_fwd_txfm2d_32x16_sse2, // 32x16 transform
+ lowbd_fwd_txfm2d_32x64_sse4_1, // 32x64 transform
+ lowbd_fwd_txfm2d_64x32_sse4_1, // 64x32 transform
+ av1_lowbd_fwd_txfm2d_4x16_sse2, // 4x16 transform
+ av1_lowbd_fwd_txfm2d_16x4_sse2, // 16x4 transform
+ av1_lowbd_fwd_txfm2d_8x32_sse2, // 8x32 transform
+ av1_lowbd_fwd_txfm2d_32x8_sse2, // 32x8 transform
+ av1_lowbd_fwd_txfm2d_16x64_sse2, // 16x64 transform
+ av1_lowbd_fwd_txfm2d_64x16_sse2, // 64x16 transform
+};
+
+// Entry point for the low-bitdepth forward transform (SSE4.1 build).
+// Dispatches on txfm_param->tx_size through fwd_txfm2d_func_ls. Lossless 4x4
+// blocks, and any size whose table entry is NULL, are handed to
+// av1_lowbd_fwd_txfm_c instead.
+void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
+                               int diff_stride, TxfmParam *txfm_param) {
+  const FwdTxfm2dFunc simd_txfm = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  const int is_lossless_4x4 =
+      txfm_param->lossless && (txfm_param->tx_size == TX_4X4);
+
+  if (simd_txfm != NULL && !is_lossless_4x4) {
+    simd_txfm(src_diff, coeff, diff_stride, txfm_param->tx_type,
+              txfm_param->bd);
+    return;
+  }
+  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
new file mode 100644
index 0000000000..38707137c4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_avx2.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
+#include <immintrin.h>
+
+// Adds 1 << (bit - 1) to every 32-bit lane of `vec`, then shifts each lane
+// right arithmetically by `bit`.
+// NOTE(review): bit must be >= 1, otherwise the shift count (bit - 1) is
+// negative; confirm that every caller guards this.
+static INLINE __m256i av1_round_shift_32_avx2(__m256i vec, int bit) {
+  const __m256i half = _mm256_set1_epi32(1 << (bit - 1));
+  return _mm256_srai_epi32(_mm256_add_epi32(vec, half), bit);
+}
+
+// In-place butterfly on 32-bit lanes, scalar weights:
+//   *in0 <- (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   *in1 <- (in0 * w1 - in1 * w0 + _r) >> cos_bit
+// Both results are computed from the values *in0 / *in1 held on entry; _r is
+// the offset added before the arithmetic shift.
+static INLINE void btf_32_avx2_type0(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+  const __m256i vw0 = _mm256_set1_epi32(w0);
+  const __m256i vw1 = _mm256_set1_epi32(w1);
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, vw0),
+                                       _mm256_mullo_epi32(b, vw1));
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, vw1),
+                                        _mm256_mullo_epi32(b, vw0));
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// In-place butterfly on 32-bit lanes, scalar weights:
+//   *in0 <- (in0 * w0 + in1 * w1 + _r) >> cos_bit
+//   *in1 <- (in1 * w0 - in0 * w1 + _r) >> cos_bit
+// Both results are computed from the values *in0 / *in1 held on entry; _r is
+// the offset added before the arithmetic shift.
+static INLINE void btf_32_avx2_type1(const int32_t w0, const int32_t w1,
+                                     __m256i *in0, __m256i *in1,
+                                     const __m256i _r, const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+  const __m256i vw0 = _mm256_set1_epi32(w0);
+  const __m256i vw1 = _mm256_set1_epi32(w1);
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, vw0),
+                                       _mm256_mullo_epi32(b, vw1));
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, vw0),
+                                        _mm256_mullo_epi32(a, vw1));
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Same butterfly as btf_32_avx2_type0, but the weights arrive already
+// broadcast into vectors:
+//   *in0 <- (in0 * ww0 + in1 * ww1 + _r) >> cos_bit
+//   *in1 <- (in0 * ww1 - in1 * ww0 + _r) >> cos_bit
+static INLINE void btf_32_avx2_type0_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(a, ww1),
+                                        _mm256_mullo_epi32(b, ww0));
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+// Same butterfly as btf_32_avx2_type1, but the weights arrive already
+// broadcast into vectors:
+//   *in0 <- (in0 * ww0 + in1 * ww1 + _r) >> cos_bit
+//   *in1 <- (in1 * ww0 - in0 * ww1 + _r) >> cos_bit
+static INLINE void btf_32_avx2_type1_new(const __m256i ww0, const __m256i ww1,
+                                         __m256i *in0, __m256i *in1,
+                                         const __m256i _r,
+                                         const int32_t cos_bit) {
+  const __m256i a = *in0;
+  const __m256i b = *in1;
+
+  const __m256i sum = _mm256_add_epi32(_mm256_mullo_epi32(a, ww0),
+                                       _mm256_mullo_epi32(b, ww1));
+  *in0 = _mm256_srai_epi32(_mm256_add_epi32(sum, _r), cos_bit);
+
+  const __m256i diff = _mm256_sub_epi32(_mm256_mullo_epi32(b, ww0),
+                                        _mm256_mullo_epi32(a, ww1));
+  *in1 = _mm256_srai_epi32(_mm256_add_epi32(diff, _r), cos_bit);
+}
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_AVX2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
new file mode 100644
index 0000000000..6aae7ce1e4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -0,0 +1,2889 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "av1/common/x86/av1_txfm_sse2.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
+
+// TODO(linfengz): refine fdct4x8 and fadst4x8 optimization (if possible).
+
+// 4-point forward DCT butterfly for a 4x4 block held in input[0..3].
+// Only the low 64 bits (four 16-bit lanes) of each input vector are read.
+// Results are rounded by 1 << (cos_bit - 1), shifted right by cos_bit and
+// packed back to 16 bits with signed saturation. output[0] and output[1]
+// each carry two result rows (low / high 64 bits); output[2] and output[3]
+// are the high halves of output[0] and output[1] moved down to the low half.
+static void fdct4x4_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i w_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  const __m128i w_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  const __m128i w_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+  const __m128i w_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+  const __m128i round_offset = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  // Interleave rows 0/1 and rows 3/2 so that one add / sub yields the
+  // (row0 +/- row3, row1 +/- row2) pairs consumed by madd below.
+  const __m128i rows01 = _mm_unpacklo_epi16(input[0], input[1]);
+  const __m128i rows32 = _mm_unpacklo_epi16(input[3], input[2]);
+  const __m128i pair_sum = _mm_add_epi16(rows01, rows32);
+  const __m128i pair_diff = _mm_sub_epi16(rows01, rows32);
+
+  // 32-bit accumulators; the trailing note names the output row each feeds.
+  __m128i acc[4];
+  acc[0] = _mm_madd_epi16(pair_sum, w_p32_p32);   // -> output[0]
+  acc[1] = _mm_madd_epi16(pair_sum, w_p32_m32);   // -> output[2]
+  acc[2] = _mm_madd_epi16(pair_diff, w_p16_p48);  // -> output[1]
+  acc[3] = _mm_madd_epi16(pair_diff, w_p48_m16);  // -> output[3]
+
+  // Round to nearest, then drop the cos_bit fractional bits.
+  for (int i = 0; i < 4; ++i) {
+    acc[i] = _mm_srai_epi32(_mm_add_epi32(acc[i], round_offset), cos_bit);
+  }
+
+  output[0] = _mm_packs_epi32(acc[0], acc[1]);
+  output[1] = _mm_packs_epi32(acc[2], acc[3]);
+  // Expose the rows packed into the high halves as vectors of their own.
+  output[2] = _mm_srli_si128(output[0], 8);
+  output[3] = _mm_srli_si128(output[1], 8);
+}
+
+// 4-point forward DCT butterfly applied lane-wise to input[0..3] (eight
+// 16-bit lanes per vector); the four results are written to output[0..3].
+// cos_bit is passed to cospi_arr() to pick the cosine table and sets the
+// rounding constant 1 << (cos_bit - 1).
+// NOTE(review): `__rounding` is never named again in this body, so
+// btf_16_sse2 is presumably a macro that picks up `__rounding` and `cos_bit`
+// from the enclosing scope - confirm before renaming either.
+static void fdct8x4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed 16-bit weight pairs handed to btf_16_sse2.
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+
+ // stage 1: fold input[i] with input[3 - i] using saturating add / subtract
+ __m128i x1[4];
+ x1[0] = _mm_adds_epi16(input[0], input[3]);
+ x1[3] = _mm_subs_epi16(input[0], input[3]);
+ x1[1] = _mm_adds_epi16(input[1], input[2]);
+ x1[2] = _mm_subs_epi16(input[1], input[2]);
+
+ // stage 2: weighted butterflies on the sum pair and on the difference pair
+ __m128i x2[4];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[0], x1[1], x2[0], x2[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x1[2], x1[3], x2[2], x2[3]);
+
+ // stage 3: permute the stage-2 results into output order (0, 2, 1, 3)
+ output[0] = x2[0];
+ output[1] = x2[2];
+ output[2] = x2[1];
+ output[3] = x2[3];
+}
+
+// 8-point forward DCT butterfly network over input[0..7], written to
+// output[0..7]. The stage layout mirrors fdct8x8_new_sse2, but every
+// weighted butterfly goes through btf_16_w4_sse2, which takes its operands
+// by address together with an explicit rounding vector and cos_bit.
+// NOTE(review): the "w4" helper presumably processes only the low four
+// 16-bit lanes of each vector - confirm against its definition.
+static void fdct4x8_new_sse2(const __m128i *input, __m128i *output,
+                             int8_t cos_bit) {
+  const int32_t *cospi = cospi_arr(cos_bit);
+  const __m128i round_offset = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  // Packed 16-bit weight pairs, passed to btf_16_w4_sse2 by address.
+  __m128i w_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+  __m128i w_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+  __m128i w_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+  __m128i w_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+  __m128i w_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+  __m128i w_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+  __m128i w_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+  __m128i w_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+  __m128i w_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+  // stage 1: fold input[i] with input[7 - i] (saturating add / subtract).
+  __m128i stage1[8];
+  for (int i = 0; i < 4; ++i) {
+    stage1[i] = _mm_adds_epi16(input[i], input[7 - i]);
+    stage1[7 - i] = _mm_subs_epi16(input[i], input[7 - i]);
+  }
+
+  // stage 2: fold the sums again; butterfly the two middle differences and
+  // carry the outer differences through unchanged.
+  __m128i stage2[8];
+  for (int i = 0; i < 2; ++i) {
+    stage2[i] = _mm_adds_epi16(stage1[i], stage1[3 - i]);
+    stage2[3 - i] = _mm_subs_epi16(stage1[i], stage1[3 - i]);
+  }
+  stage2[4] = stage1[4];
+  btf_16_w4_sse2(&w_m32_p32, &w_p32_p32, round_offset, cos_bit, &stage1[5],
+                 &stage1[6], &stage2[5], &stage2[6]);
+  stage2[7] = stage1[7];
+
+  // stage 3: finish the even half; add / subtract within the odd half.
+  __m128i stage3[8];
+  btf_16_w4_sse2(&w_p32_p32, &w_p32_m32, round_offset, cos_bit, &stage2[0],
+                 &stage2[1], &stage3[0], &stage3[1]);
+  btf_16_w4_sse2(&w_p48_p16, &w_m16_p48, round_offset, cos_bit, &stage2[2],
+                 &stage2[3], &stage3[2], &stage3[3]);
+  stage3[4] = _mm_adds_epi16(stage2[4], stage2[5]);
+  stage3[5] = _mm_subs_epi16(stage2[4], stage2[5]);
+  stage3[6] = _mm_subs_epi16(stage2[7], stage2[6]);
+  stage3[7] = _mm_adds_epi16(stage2[7], stage2[6]);
+
+  // stage 4: weighted butterflies on the odd half; the even half is final.
+  __m128i odd4, odd5, odd6, odd7;
+  btf_16_w4_sse2(&w_p56_p08, &w_m08_p56, round_offset, cos_bit, &stage3[4],
+                 &stage3[7], &odd4, &odd7);
+  btf_16_w4_sse2(&w_p24_p40, &w_m40_p24, round_offset, cos_bit, &stage3[5],
+                 &stage3[6], &odd5, &odd6);
+
+  // stage 5: permute into output order (0, 4, 2, 6, 1, 5, 3, 7).
+  output[0] = stage3[0];
+  output[1] = odd4;
+  output[2] = stage3[2];
+  output[3] = odd6;
+  output[4] = stage3[1];
+  output[5] = odd5;
+  output[6] = stage3[3];
+  output[7] = odd7;
+}
+
+// 8-point forward DCT butterfly network applied lane-wise to input[0..7]
+// (eight 16-bit lanes per vector); results are written to output[0..7].
+// cos_bit is passed to cospi_arr() to pick the cosine table and sets the
+// rounding constant 1 << (cos_bit - 1).
+// NOTE(review): `__rounding` is never named again in this body, so
+// btf_16_sse2 is presumably a macro that picks up `__rounding` and `cos_bit`
+// from the enclosing scope - confirm before renaming either.
+static void fdct8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed 16-bit weight pairs handed to btf_16_sse2.
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+
+ // stage 1: fold input[i] with input[7 - i] using saturating add / subtract
+ __m128i x1[8];
+ x1[0] = _mm_adds_epi16(input[0], input[7]);
+ x1[7] = _mm_subs_epi16(input[0], input[7]);
+ x1[1] = _mm_adds_epi16(input[1], input[6]);
+ x1[6] = _mm_subs_epi16(input[1], input[6]);
+ x1[2] = _mm_adds_epi16(input[2], input[5]);
+ x1[5] = _mm_subs_epi16(input[2], input[5]);
+ x1[3] = _mm_adds_epi16(input[3], input[4]);
+ x1[4] = _mm_subs_epi16(input[3], input[4]);
+
+ // stage 2: fold the sums again; butterfly the two middle differences and
+ // carry the outer differences (4 and 7) through unchanged
+ __m128i x2[8];
+ x2[0] = _mm_adds_epi16(x1[0], x1[3]);
+ x2[3] = _mm_subs_epi16(x1[0], x1[3]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[2]);
+ x2[2] = _mm_subs_epi16(x1[1], x1[2]);
+ x2[4] = x1[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[5], x1[6], x2[5], x2[6]);
+ x2[7] = x1[7];
+
+ // stage 3: finish the even half; add / subtract within the odd half
+ __m128i x3[8];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x2[0], x2[1], x3[0], x3[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x2[2], x2[3], x3[2], x3[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[4], x2[5]);
+ x3[6] = _mm_subs_epi16(x2[7], x2[6]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[6]);
+
+ // stage 4: even half passes through; weighted butterflies on the odd half
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x3[4], x3[7], x4[4], x4[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x3[5], x3[6], x4[5], x4[6]);
+
+ // stage 5: permute into output order (0, 4, 2, 6, 1, 5, 3, 7)
+ output[0] = x4[0];
+ output[1] = x4[4];
+ output[2] = x4[2];
+ output[3] = x4[6];
+ output[4] = x4[1];
+ output[5] = x4[5];
+ output[6] = x4[3];
+ output[7] = x4[7];
+}
+
+// 16-point forward DCT butterfly network applied lane-wise to input[0..15]
+// (eight 16-bit lanes per vector); results are written to output[0..15].
+// cos_bit is passed to cospi_arr() to pick the cosine table and sets the
+// rounding constant 1 << (cos_bit - 1).
+// NOTE(review): `__rounding` is never named again in this body, so
+// btf_16_sse2 is presumably a macro that picks up `__rounding` and `cos_bit`
+// from the enclosing scope - confirm before renaming either.
+static void fdct8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed 16-bit weight pairs handed to btf_16_sse2.
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+
+ // stage 1: fold input[i] with input[15 - i] using saturating add / subtract
+ __m128i x1[16];
+ x1[0] = _mm_adds_epi16(input[0], input[15]);
+ x1[15] = _mm_subs_epi16(input[0], input[15]);
+ x1[1] = _mm_adds_epi16(input[1], input[14]);
+ x1[14] = _mm_subs_epi16(input[1], input[14]);
+ x1[2] = _mm_adds_epi16(input[2], input[13]);
+ x1[13] = _mm_subs_epi16(input[2], input[13]);
+ x1[3] = _mm_adds_epi16(input[3], input[12]);
+ x1[12] = _mm_subs_epi16(input[3], input[12]);
+ x1[4] = _mm_adds_epi16(input[4], input[11]);
+ x1[11] = _mm_subs_epi16(input[4], input[11]);
+ x1[5] = _mm_adds_epi16(input[5], input[10]);
+ x1[10] = _mm_subs_epi16(input[5], input[10]);
+ x1[6] = _mm_adds_epi16(input[6], input[9]);
+ x1[9] = _mm_subs_epi16(input[6], input[9]);
+ x1[7] = _mm_adds_epi16(input[7], input[8]);
+ x1[8] = _mm_subs_epi16(input[7], input[8]);
+
+ // stage 2: fold x1[0..7]; butterfly the middle of x1[8..15] (10..13) and
+ // carry 8, 9, 14, 15 through unchanged
+ __m128i x2[16];
+ x2[0] = _mm_adds_epi16(x1[0], x1[7]);
+ x2[7] = _mm_subs_epi16(x1[0], x1[7]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[6]);
+ x2[6] = _mm_subs_epi16(x1[1], x1[6]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[5]);
+ x2[5] = _mm_subs_epi16(x1[2], x1[5]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[4]);
+ x2[4] = _mm_subs_epi16(x1[3], x1[4]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[10], x1[13], x2[10], x2[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[11], x1[12], x2[11], x2[12]);
+ x2[14] = x1[14];
+ x2[15] = x1[15];
+
+ // stage 3: fold x2[0..3]; butterfly 5/6; add / subtract within x2[8..15]
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[0], x2[3]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[1], x2[2]);
+ x3[4] = x2[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[5], x2[6], x3[5], x3[6]);
+ x3[7] = x2[7];
+ x3[8] = _mm_adds_epi16(x2[8], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[8], x2[11]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[9], x2[10]);
+ x3[12] = _mm_subs_epi16(x2[15], x2[12]);
+ x3[15] = _mm_adds_epi16(x2[15], x2[12]);
+ x3[13] = _mm_subs_epi16(x2[14], x2[13]);
+ x3[14] = _mm_adds_epi16(x2[14], x2[13]);
+
+ // stage 4: finish x3[0..3]; add / subtract within 4..7; butterfly the
+ // 9/14 and 10/13 pairs; 8, 11, 12, 15 pass through
+ __m128i x4[16];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x3[0], x3[1], x4[0], x4[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x3[2], x3[3], x4[2], x4[3]);
+ x4[4] = _mm_adds_epi16(x3[4], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[4], x3[5]);
+ x4[6] = _mm_subs_epi16(x3[7], x3[6]);
+ x4[7] = _mm_adds_epi16(x3[7], x3[6]);
+ x4[8] = x3[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[9], x3[14], x4[9], x4[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[10], x3[13], x4[10], x4[13]);
+ x4[11] = x3[11];
+ x4[12] = x3[12];
+ x4[15] = x3[15];
+
+ // stage 5: 0..3 pass through; butterfly 4/7 and 5/6; add / subtract
+ // within 8..15
+ __m128i x5[16];
+ x5[0] = x4[0];
+ x5[1] = x4[1];
+ x5[2] = x4[2];
+ x5[3] = x4[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x4[4], x4[7], x5[4], x5[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x4[5], x4[6], x5[5], x5[6]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[9]);
+ x5[9] = _mm_subs_epi16(x4[8], x4[9]);
+ x5[10] = _mm_subs_epi16(x4[11], x4[10]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[10]);
+ x5[12] = _mm_adds_epi16(x4[12], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[12], x4[13]);
+ x5[14] = _mm_subs_epi16(x4[15], x4[14]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[14]);
+
+ // stage 6: 0..7 pass through; weighted butterflies on the (8 + k, 15 - k)
+ // pairs
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x5[8], x5[15], x6[8], x6[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x5[10], x5[13], x6[10], x6[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x5[11], x5[12], x6[11], x6[12]);
+
+ // stage 7: permute into output order; output[i] takes x6[j] where j is
+ // the 4-bit bit-reversal of i
+ output[0] = x6[0];
+ output[1] = x6[8];
+ output[2] = x6[4];
+ output[3] = x6[12];
+ output[4] = x6[2];
+ output[5] = x6[10];
+ output[6] = x6[6];
+ output[7] = x6[14];
+ output[8] = x6[1];
+ output[9] = x6[9];
+ output[10] = x6[5];
+ output[11] = x6[13];
+ output[12] = x6[3];
+ output[13] = x6[11];
+ output[14] = x6[7];
+ output[15] = x6[15];
+}
+
+// 32-point forward DCT butterfly network applied lane-wise to input[0..31]
+// (eight 16-bit lanes per vector); results are written to output[0..31].
+// Unlike the smaller fdct kernels in this file, this function is not
+// `static`, so it has external linkage.
+// cos_bit is passed to cospi_arr() to pick the cosine table and sets the
+// rounding constant 1 << (cos_bit - 1).
+// NOTE(review): `__rounding` is never named again in this body, so
+// btf_16_sse2 is presumably a macro that picks up `__rounding` and `cos_bit`
+// from the enclosing scope - confirm before renaming either.
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed 16-bit weight pairs handed to btf_16_sse2.
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+
+ // stage 1: fold input[i] with input[31 - i] using saturating add / subtract
+ __m128i x1[32];
+ x1[0] = _mm_adds_epi16(input[0], input[31]);
+ x1[31] = _mm_subs_epi16(input[0], input[31]);
+ x1[1] = _mm_adds_epi16(input[1], input[30]);
+ x1[30] = _mm_subs_epi16(input[1], input[30]);
+ x1[2] = _mm_adds_epi16(input[2], input[29]);
+ x1[29] = _mm_subs_epi16(input[2], input[29]);
+ x1[3] = _mm_adds_epi16(input[3], input[28]);
+ x1[28] = _mm_subs_epi16(input[3], input[28]);
+ x1[4] = _mm_adds_epi16(input[4], input[27]);
+ x1[27] = _mm_subs_epi16(input[4], input[27]);
+ x1[5] = _mm_adds_epi16(input[5], input[26]);
+ x1[26] = _mm_subs_epi16(input[5], input[26]);
+ x1[6] = _mm_adds_epi16(input[6], input[25]);
+ x1[25] = _mm_subs_epi16(input[6], input[25]);
+ x1[7] = _mm_adds_epi16(input[7], input[24]);
+ x1[24] = _mm_subs_epi16(input[7], input[24]);
+ x1[8] = _mm_adds_epi16(input[8], input[23]);
+ x1[23] = _mm_subs_epi16(input[8], input[23]);
+ x1[9] = _mm_adds_epi16(input[9], input[22]);
+ x1[22] = _mm_subs_epi16(input[9], input[22]);
+ x1[10] = _mm_adds_epi16(input[10], input[21]);
+ x1[21] = _mm_subs_epi16(input[10], input[21]);
+ x1[11] = _mm_adds_epi16(input[11], input[20]);
+ x1[20] = _mm_subs_epi16(input[11], input[20]);
+ x1[12] = _mm_adds_epi16(input[12], input[19]);
+ x1[19] = _mm_subs_epi16(input[12], input[19]);
+ x1[13] = _mm_adds_epi16(input[13], input[18]);
+ x1[18] = _mm_subs_epi16(input[13], input[18]);
+ x1[14] = _mm_adds_epi16(input[14], input[17]);
+ x1[17] = _mm_subs_epi16(input[14], input[17]);
+ x1[15] = _mm_adds_epi16(input[15], input[16]);
+ x1[16] = _mm_subs_epi16(input[15], input[16]);
+
+ // stage 2: fold x1[0..15]; butterfly the middle of x1[16..31] (20..27) and
+ // carry 16..19 and 28..31 through unchanged
+ __m128i x2[32];
+ x2[0] = _mm_adds_epi16(x1[0], x1[15]);
+ x2[15] = _mm_subs_epi16(x1[0], x1[15]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[14]);
+ x2[14] = _mm_subs_epi16(x1[1], x1[14]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[13]);
+ x2[13] = _mm_subs_epi16(x1[2], x1[13]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[12]);
+ x2[12] = _mm_subs_epi16(x1[3], x1[12]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[11]);
+ x2[11] = _mm_subs_epi16(x1[4], x1[11]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[10]);
+ x2[10] = _mm_subs_epi16(x1[5], x1[10]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[9]);
+ x2[9] = _mm_subs_epi16(x1[6], x1[9]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[8]);
+ x2[8] = _mm_subs_epi16(x1[7], x1[8]);
+ x2[16] = x1[16];
+ x2[17] = x1[17];
+ x2[18] = x1[18];
+ x2[19] = x1[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[20], x1[27], x2[20], x2[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[21], x1[26], x2[21], x2[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[22], x1[25], x2[22], x2[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[23], x1[24], x2[23], x2[24]);
+ x2[28] = x1[28];
+ x2[29] = x1[29];
+ x2[30] = x1[30];
+ x2[31] = x1[31];
+
+ // stage 3: fold x2[0..7]; butterfly 10..13; add / subtract within 16..31
+ __m128i x3[32];
+ x3[0] = _mm_adds_epi16(x2[0], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[0], x2[7]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[1], x2[6]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[5]);
+ x3[5] = _mm_subs_epi16(x2[2], x2[5]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[4]);
+ x3[4] = _mm_subs_epi16(x2[3], x2[4]);
+ x3[8] = x2[8];
+ x3[9] = x2[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[10], x2[13], x3[10], x3[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[11], x2[12], x3[11], x3[12]);
+ x3[14] = x2[14];
+ x3[15] = x2[15];
+ x3[16] = _mm_adds_epi16(x2[16], x2[23]);
+ x3[23] = _mm_subs_epi16(x2[16], x2[23]);
+ x3[17] = _mm_adds_epi16(x2[17], x2[22]);
+ x3[22] = _mm_subs_epi16(x2[17], x2[22]);
+ x3[18] = _mm_adds_epi16(x2[18], x2[21]);
+ x3[21] = _mm_subs_epi16(x2[18], x2[21]);
+ x3[19] = _mm_adds_epi16(x2[19], x2[20]);
+ x3[20] = _mm_subs_epi16(x2[19], x2[20]);
+ x3[24] = _mm_subs_epi16(x2[31], x2[24]);
+ x3[31] = _mm_adds_epi16(x2[31], x2[24]);
+ x3[25] = _mm_subs_epi16(x2[30], x2[25]);
+ x3[30] = _mm_adds_epi16(x2[30], x2[25]);
+ x3[26] = _mm_subs_epi16(x2[29], x2[26]);
+ x3[29] = _mm_adds_epi16(x2[29], x2[26]);
+ x3[27] = _mm_subs_epi16(x2[28], x2[27]);
+ x3[28] = _mm_adds_epi16(x2[28], x2[27]);
+
+ // stage 4: fold x3[0..3]; butterfly 5/6; add / subtract within 8..15;
+ // butterfly 18..21 against 26..29; the rest of 16..31 passes through
+ __m128i x4[32];
+ x4[0] = _mm_adds_epi16(x3[0], x3[3]);
+ x4[3] = _mm_subs_epi16(x3[0], x3[3]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[2]);
+ x4[2] = _mm_subs_epi16(x3[1], x3[2]);
+ x4[4] = x3[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[5], x3[6], x4[5], x4[6]);
+ x4[7] = x3[7];
+ x4[8] = _mm_adds_epi16(x3[8], x3[11]);
+ x4[11] = _mm_subs_epi16(x3[8], x3[11]);
+ x4[9] = _mm_adds_epi16(x3[9], x3[10]);
+ x4[10] = _mm_subs_epi16(x3[9], x3[10]);
+ x4[12] = _mm_subs_epi16(x3[15], x3[12]);
+ x4[15] = _mm_adds_epi16(x3[15], x3[12]);
+ x4[13] = _mm_subs_epi16(x3[14], x3[13]);
+ x4[14] = _mm_adds_epi16(x3[14], x3[13]);
+ x4[16] = x3[16];
+ x4[17] = x3[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[18], x3[29], x4[18], x4[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[19], x3[28], x4[19], x4[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[20], x3[27], x4[20], x4[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[21], x3[26], x4[21], x4[26]);
+ x4[22] = x3[22];
+ x4[23] = x3[23];
+ x4[24] = x3[24];
+ x4[25] = x3[25];
+ x4[30] = x3[30];
+ x4[31] = x3[31];
+
+ // stage 5: finish x4[0..3]; add / subtract within 4..7; butterfly 9/14
+ // and 10/13; add / subtract within 16..31
+ __m128i x5[32];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x4[0], x4[1], x5[0], x5[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x4[2], x4[3], x5[2], x5[3]);
+ x5[4] = _mm_adds_epi16(x4[4], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[4], x4[5]);
+ x5[6] = _mm_subs_epi16(x4[7], x4[6]);
+ x5[7] = _mm_adds_epi16(x4[7], x4[6]);
+ x5[8] = x4[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[9], x4[14], x5[9], x5[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[10], x4[13], x5[10], x5[13]);
+ x5[11] = x4[11];
+ x5[12] = x4[12];
+ x5[15] = x4[15];
+ x5[16] = _mm_adds_epi16(x4[16], x4[19]);
+ x5[19] = _mm_subs_epi16(x4[16], x4[19]);
+ x5[17] = _mm_adds_epi16(x4[17], x4[18]);
+ x5[18] = _mm_subs_epi16(x4[17], x4[18]);
+ x5[20] = _mm_subs_epi16(x4[23], x4[20]);
+ x5[23] = _mm_adds_epi16(x4[23], x4[20]);
+ x5[21] = _mm_subs_epi16(x4[22], x4[21]);
+ x5[22] = _mm_adds_epi16(x4[22], x4[21]);
+ x5[24] = _mm_adds_epi16(x4[24], x4[27]);
+ x5[27] = _mm_subs_epi16(x4[24], x4[27]);
+ x5[25] = _mm_adds_epi16(x4[25], x4[26]);
+ x5[26] = _mm_subs_epi16(x4[25], x4[26]);
+ x5[28] = _mm_subs_epi16(x4[31], x4[28]);
+ x5[31] = _mm_adds_epi16(x4[31], x4[28]);
+ x5[29] = _mm_subs_epi16(x4[30], x4[29]);
+ x5[30] = _mm_adds_epi16(x4[30], x4[29]);
+
+ // stage 6: 0..3 pass through; butterfly 4/7 and 5/6; add / subtract
+ // within 8..15; butterfly 17/30, 18/29, 21/26, 22/25
+ __m128i x6[32];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x5[4], x5[7], x6[4], x6[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x5[5], x5[6], x6[5], x6[6]);
+ x6[8] = _mm_adds_epi16(x5[8], x5[9]);
+ x6[9] = _mm_subs_epi16(x5[8], x5[9]);
+ x6[10] = _mm_subs_epi16(x5[11], x5[10]);
+ x6[11] = _mm_adds_epi16(x5[11], x5[10]);
+ x6[12] = _mm_adds_epi16(x5[12], x5[13]);
+ x6[13] = _mm_subs_epi16(x5[12], x5[13]);
+ x6[14] = _mm_subs_epi16(x5[15], x5[14]);
+ x6[15] = _mm_adds_epi16(x5[15], x5[14]);
+ x6[16] = x5[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[17], x5[30], x6[17], x6[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[18], x5[29], x6[18], x6[29]);
+ x6[19] = x5[19];
+ x6[20] = x5[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[21], x5[26], x6[21], x6[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[22], x5[25], x6[22], x6[25]);
+ x6[23] = x5[23];
+ x6[24] = x5[24];
+ x6[27] = x5[27];
+ x6[28] = x5[28];
+ x6[31] = x5[31];
+
+ // stage 7: 0..7 pass through; weighted butterflies on the (8 + k, 15 - k)
+ // pairs; add / subtract within 16..31
+ __m128i x7[32];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ x7[4] = x6[4];
+ x7[5] = x6[5];
+ x7[6] = x6[6];
+ x7[7] = x6[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x6[8], x6[15], x7[8], x7[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x6[9], x6[14], x7[9], x7[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x6[10], x6[13], x7[10], x7[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x6[11], x6[12], x7[11], x7[12]);
+ x7[16] = _mm_adds_epi16(x6[16], x6[17]);
+ x7[17] = _mm_subs_epi16(x6[16], x6[17]);
+ x7[18] = _mm_subs_epi16(x6[19], x6[18]);
+ x7[19] = _mm_adds_epi16(x6[19], x6[18]);
+ x7[20] = _mm_adds_epi16(x6[20], x6[21]);
+ x7[21] = _mm_subs_epi16(x6[20], x6[21]);
+ x7[22] = _mm_subs_epi16(x6[23], x6[22]);
+ x7[23] = _mm_adds_epi16(x6[23], x6[22]);
+ x7[24] = _mm_adds_epi16(x6[24], x6[25]);
+ x7[25] = _mm_subs_epi16(x6[24], x6[25]);
+ x7[26] = _mm_subs_epi16(x6[27], x6[26]);
+ x7[27] = _mm_adds_epi16(x6[27], x6[26]);
+ x7[28] = _mm_adds_epi16(x6[28], x6[29]);
+ x7[29] = _mm_subs_epi16(x6[28], x6[29]);
+ x7[30] = _mm_subs_epi16(x6[31], x6[30]);
+ x7[31] = _mm_adds_epi16(x6[31], x6[30]);
+
+ // stage 8: 0..15 pass through; weighted butterflies on the
+ // (16 + k, 31 - k) pairs
+ __m128i x8[32];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ x8[8] = x7[8];
+ x8[9] = x7[9];
+ x8[10] = x7[10];
+ x8[11] = x7[11];
+ x8[12] = x7[12];
+ x8[13] = x7[13];
+ x8[14] = x7[14];
+ x8[15] = x7[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x7[16], x7[31], x8[16], x8[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x7[17], x7[30], x8[17], x8[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x7[18], x7[29], x8[18], x8[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x7[19], x7[28], x8[19], x8[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x7[20], x7[27], x8[20], x8[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x7[21], x7[26], x8[21], x8[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x7[22], x7[25], x8[22], x8[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x7[23], x7[24], x8[23], x8[24]);
+
+ // stage 9: permute into output order; output[i] takes x8[j] where j is
+ // the 5-bit bit-reversal of i
+ output[0] = x8[0];
+ output[1] = x8[16];
+ output[2] = x8[8];
+ output[3] = x8[24];
+ output[4] = x8[4];
+ output[5] = x8[20];
+ output[6] = x8[12];
+ output[7] = x8[28];
+ output[8] = x8[2];
+ output[9] = x8[18];
+ output[10] = x8[10];
+ output[11] = x8[26];
+ output[12] = x8[6];
+ output[13] = x8[22];
+ output[14] = x8[14];
+ output[15] = x8[30];
+ output[16] = x8[1];
+ output[17] = x8[17];
+ output[18] = x8[9];
+ output[19] = x8[25];
+ output[20] = x8[5];
+ output[21] = x8[21];
+ output[22] = x8[13];
+ output[23] = x8[29];
+ output[24] = x8[3];
+ output[25] = x8[19];
+ output[26] = x8[11];
+ output[27] = x8[27];
+ output[28] = x8[7];
+ output[29] = x8[23];
+ output[30] = x8[15];
+ output[31] = x8[31];
+}
+
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ __m128i cospi_m32_p32 = pair_set_epi16(-cospi[32], cospi[32]);
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_m16_p48 = pair_set_epi16(-cospi[16], cospi[48]);
+ __m128i cospi_p48_p16 = pair_set_epi16(cospi[48], cospi[16]);
+ __m128i cospi_m48_m16 = pair_set_epi16(-cospi[48], -cospi[16]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_m08_p56 = pair_set_epi16(-cospi[8], cospi[56]);
+ __m128i cospi_p56_p08 = pair_set_epi16(cospi[56], cospi[8]);
+ __m128i cospi_m56_m08 = pair_set_epi16(-cospi[56], -cospi[8]);
+ __m128i cospi_m40_p24 = pair_set_epi16(-cospi[40], cospi[24]);
+ __m128i cospi_p24_p40 = pair_set_epi16(cospi[24], cospi[40]);
+ __m128i cospi_m24_m40 = pair_set_epi16(-cospi[24], -cospi[40]);
+ __m128i cospi_p60_p04 = pair_set_epi16(cospi[60], cospi[4]);
+ __m128i cospi_m04_p60 = pair_set_epi16(-cospi[4], cospi[60]);
+ __m128i cospi_p28_p36 = pair_set_epi16(cospi[28], cospi[36]);
+ __m128i cospi_m36_p28 = pair_set_epi16(-cospi[36], cospi[28]);
+ __m128i cospi_p44_p20 = pair_set_epi16(cospi[44], cospi[20]);
+ __m128i cospi_m20_p44 = pair_set_epi16(-cospi[20], cospi[44]);
+ __m128i cospi_p12_p52 = pair_set_epi16(cospi[12], cospi[52]);
+ __m128i cospi_m52_p12 = pair_set_epi16(-cospi[52], cospi[12]);
+ __m128i cospi_m60_m04 = pair_set_epi16(-cospi[60], -cospi[4]);
+ __m128i cospi_m28_m36 = pair_set_epi16(-cospi[28], -cospi[36]);
+ __m128i cospi_m44_m20 = pair_set_epi16(-cospi[44], -cospi[20]);
+ __m128i cospi_m12_m52 = pair_set_epi16(-cospi[12], -cospi[52]);
+ __m128i cospi_p62_p02 = pair_set_epi16(cospi[62], cospi[2]);
+ __m128i cospi_m02_p62 = pair_set_epi16(-cospi[2], cospi[62]);
+ __m128i cospi_p30_p34 = pair_set_epi16(cospi[30], cospi[34]);
+ __m128i cospi_m34_p30 = pair_set_epi16(-cospi[34], cospi[30]);
+ __m128i cospi_p46_p18 = pair_set_epi16(cospi[46], cospi[18]);
+ __m128i cospi_m18_p46 = pair_set_epi16(-cospi[18], cospi[46]);
+ __m128i cospi_p14_p50 = pair_set_epi16(cospi[14], cospi[50]);
+ __m128i cospi_m50_p14 = pair_set_epi16(-cospi[50], cospi[14]);
+ __m128i cospi_p54_p10 = pair_set_epi16(cospi[54], cospi[10]);
+ __m128i cospi_m10_p54 = pair_set_epi16(-cospi[10], cospi[54]);
+ __m128i cospi_p22_p42 = pair_set_epi16(cospi[22], cospi[42]);
+ __m128i cospi_m42_p22 = pair_set_epi16(-cospi[42], cospi[22]);
+ __m128i cospi_p38_p26 = pair_set_epi16(cospi[38], cospi[26]);
+ __m128i cospi_m26_p38 = pair_set_epi16(-cospi[26], cospi[38]);
+ __m128i cospi_p06_p58 = pair_set_epi16(cospi[6], cospi[58]);
+ __m128i cospi_m58_p06 = pair_set_epi16(-cospi[58], cospi[6]);
+ __m128i cospi_p63_p01 = pair_set_epi16(cospi[63], cospi[1]);
+ __m128i cospi_m01_p63 = pair_set_epi16(-cospi[1], cospi[63]);
+ __m128i cospi_p31_p33 = pair_set_epi16(cospi[31], cospi[33]);
+ __m128i cospi_m33_p31 = pair_set_epi16(-cospi[33], cospi[31]);
+ __m128i cospi_p47_p17 = pair_set_epi16(cospi[47], cospi[17]);
+ __m128i cospi_m17_p47 = pair_set_epi16(-cospi[17], cospi[47]);
+ __m128i cospi_p15_p49 = pair_set_epi16(cospi[15], cospi[49]);
+ __m128i cospi_m49_p15 = pair_set_epi16(-cospi[49], cospi[15]);
+ __m128i cospi_p55_p09 = pair_set_epi16(cospi[55], cospi[9]);
+ __m128i cospi_m09_p55 = pair_set_epi16(-cospi[9], cospi[55]);
+ __m128i cospi_p23_p41 = pair_set_epi16(cospi[23], cospi[41]);
+ __m128i cospi_m41_p23 = pair_set_epi16(-cospi[41], cospi[23]);
+ __m128i cospi_p39_p25 = pair_set_epi16(cospi[39], cospi[25]);
+ __m128i cospi_m25_p39 = pair_set_epi16(-cospi[25], cospi[39]);
+ __m128i cospi_p07_p57 = pair_set_epi16(cospi[7], cospi[57]);
+ __m128i cospi_m57_p07 = pair_set_epi16(-cospi[57], cospi[7]);
+ __m128i cospi_p59_p05 = pair_set_epi16(cospi[59], cospi[5]);
+ __m128i cospi_m05_p59 = pair_set_epi16(-cospi[5], cospi[59]);
+ __m128i cospi_p27_p37 = pair_set_epi16(cospi[27], cospi[37]);
+ __m128i cospi_m37_p27 = pair_set_epi16(-cospi[37], cospi[27]);
+ __m128i cospi_p43_p21 = pair_set_epi16(cospi[43], cospi[21]);
+ __m128i cospi_m21_p43 = pair_set_epi16(-cospi[21], cospi[43]);
+ __m128i cospi_p11_p53 = pair_set_epi16(cospi[11], cospi[53]);
+ __m128i cospi_m53_p11 = pair_set_epi16(-cospi[53], cospi[11]);
+ __m128i cospi_p51_p13 = pair_set_epi16(cospi[51], cospi[13]);
+ __m128i cospi_m13_p51 = pair_set_epi16(-cospi[13], cospi[51]);
+ __m128i cospi_p19_p45 = pair_set_epi16(cospi[19], cospi[45]);
+ __m128i cospi_m45_p19 = pair_set_epi16(-cospi[45], cospi[19]);
+ __m128i cospi_p35_p29 = pair_set_epi16(cospi[35], cospi[29]);
+ __m128i cospi_m29_p35 = pair_set_epi16(-cospi[29], cospi[35]);
+ __m128i cospi_p03_p61 = pair_set_epi16(cospi[3], cospi[61]);
+ __m128i cospi_m61_p03 = pair_set_epi16(-cospi[61], cospi[3]);
+
+ // stage 1
+ __m128i x1[64];
+ x1[0] = _mm_adds_epi16(input[0], input[63]);
+ x1[63] = _mm_subs_epi16(input[0], input[63]);
+ x1[1] = _mm_adds_epi16(input[1], input[62]);
+ x1[62] = _mm_subs_epi16(input[1], input[62]);
+ x1[2] = _mm_adds_epi16(input[2], input[61]);
+ x1[61] = _mm_subs_epi16(input[2], input[61]);
+ x1[3] = _mm_adds_epi16(input[3], input[60]);
+ x1[60] = _mm_subs_epi16(input[3], input[60]);
+ x1[4] = _mm_adds_epi16(input[4], input[59]);
+ x1[59] = _mm_subs_epi16(input[4], input[59]);
+ x1[5] = _mm_adds_epi16(input[5], input[58]);
+ x1[58] = _mm_subs_epi16(input[5], input[58]);
+ x1[6] = _mm_adds_epi16(input[6], input[57]);
+ x1[57] = _mm_subs_epi16(input[6], input[57]);
+ x1[7] = _mm_adds_epi16(input[7], input[56]);
+ x1[56] = _mm_subs_epi16(input[7], input[56]);
+ x1[8] = _mm_adds_epi16(input[8], input[55]);
+ x1[55] = _mm_subs_epi16(input[8], input[55]);
+ x1[9] = _mm_adds_epi16(input[9], input[54]);
+ x1[54] = _mm_subs_epi16(input[9], input[54]);
+ x1[10] = _mm_adds_epi16(input[10], input[53]);
+ x1[53] = _mm_subs_epi16(input[10], input[53]);
+ x1[11] = _mm_adds_epi16(input[11], input[52]);
+ x1[52] = _mm_subs_epi16(input[11], input[52]);
+ x1[12] = _mm_adds_epi16(input[12], input[51]);
+ x1[51] = _mm_subs_epi16(input[12], input[51]);
+ x1[13] = _mm_adds_epi16(input[13], input[50]);
+ x1[50] = _mm_subs_epi16(input[13], input[50]);
+ x1[14] = _mm_adds_epi16(input[14], input[49]);
+ x1[49] = _mm_subs_epi16(input[14], input[49]);
+ x1[15] = _mm_adds_epi16(input[15], input[48]);
+ x1[48] = _mm_subs_epi16(input[15], input[48]);
+ x1[16] = _mm_adds_epi16(input[16], input[47]);
+ x1[47] = _mm_subs_epi16(input[16], input[47]);
+ x1[17] = _mm_adds_epi16(input[17], input[46]);
+ x1[46] = _mm_subs_epi16(input[17], input[46]);
+ x1[18] = _mm_adds_epi16(input[18], input[45]);
+ x1[45] = _mm_subs_epi16(input[18], input[45]);
+ x1[19] = _mm_adds_epi16(input[19], input[44]);
+ x1[44] = _mm_subs_epi16(input[19], input[44]);
+ x1[20] = _mm_adds_epi16(input[20], input[43]);
+ x1[43] = _mm_subs_epi16(input[20], input[43]);
+ x1[21] = _mm_adds_epi16(input[21], input[42]);
+ x1[42] = _mm_subs_epi16(input[21], input[42]);
+ x1[22] = _mm_adds_epi16(input[22], input[41]);
+ x1[41] = _mm_subs_epi16(input[22], input[41]);
+ x1[23] = _mm_adds_epi16(input[23], input[40]);
+ x1[40] = _mm_subs_epi16(input[23], input[40]);
+ x1[24] = _mm_adds_epi16(input[24], input[39]);
+ x1[39] = _mm_subs_epi16(input[24], input[39]);
+ x1[25] = _mm_adds_epi16(input[25], input[38]);
+ x1[38] = _mm_subs_epi16(input[25], input[38]);
+ x1[26] = _mm_adds_epi16(input[26], input[37]);
+ x1[37] = _mm_subs_epi16(input[26], input[37]);
+ x1[27] = _mm_adds_epi16(input[27], input[36]);
+ x1[36] = _mm_subs_epi16(input[27], input[36]);
+ x1[28] = _mm_adds_epi16(input[28], input[35]);
+ x1[35] = _mm_subs_epi16(input[28], input[35]);
+ x1[29] = _mm_adds_epi16(input[29], input[34]);
+ x1[34] = _mm_subs_epi16(input[29], input[34]);
+ x1[30] = _mm_adds_epi16(input[30], input[33]);
+ x1[33] = _mm_subs_epi16(input[30], input[33]);
+ x1[31] = _mm_adds_epi16(input[31], input[32]);
+ x1[32] = _mm_subs_epi16(input[31], input[32]);
+
+ // stage 2
+ __m128i x2[64];
+ x2[0] = _mm_adds_epi16(x1[0], x1[31]);
+ x2[31] = _mm_subs_epi16(x1[0], x1[31]);
+ x2[1] = _mm_adds_epi16(x1[1], x1[30]);
+ x2[30] = _mm_subs_epi16(x1[1], x1[30]);
+ x2[2] = _mm_adds_epi16(x1[2], x1[29]);
+ x2[29] = _mm_subs_epi16(x1[2], x1[29]);
+ x2[3] = _mm_adds_epi16(x1[3], x1[28]);
+ x2[28] = _mm_subs_epi16(x1[3], x1[28]);
+ x2[4] = _mm_adds_epi16(x1[4], x1[27]);
+ x2[27] = _mm_subs_epi16(x1[4], x1[27]);
+ x2[5] = _mm_adds_epi16(x1[5], x1[26]);
+ x2[26] = _mm_subs_epi16(x1[5], x1[26]);
+ x2[6] = _mm_adds_epi16(x1[6], x1[25]);
+ x2[25] = _mm_subs_epi16(x1[6], x1[25]);
+ x2[7] = _mm_adds_epi16(x1[7], x1[24]);
+ x2[24] = _mm_subs_epi16(x1[7], x1[24]);
+ x2[8] = _mm_adds_epi16(x1[8], x1[23]);
+ x2[23] = _mm_subs_epi16(x1[8], x1[23]);
+ x2[9] = _mm_adds_epi16(x1[9], x1[22]);
+ x2[22] = _mm_subs_epi16(x1[9], x1[22]);
+ x2[10] = _mm_adds_epi16(x1[10], x1[21]);
+ x2[21] = _mm_subs_epi16(x1[10], x1[21]);
+ x2[11] = _mm_adds_epi16(x1[11], x1[20]);
+ x2[20] = _mm_subs_epi16(x1[11], x1[20]);
+ x2[12] = _mm_adds_epi16(x1[12], x1[19]);
+ x2[19] = _mm_subs_epi16(x1[12], x1[19]);
+ x2[13] = _mm_adds_epi16(x1[13], x1[18]);
+ x2[18] = _mm_subs_epi16(x1[13], x1[18]);
+ x2[14] = _mm_adds_epi16(x1[14], x1[17]);
+ x2[17] = _mm_subs_epi16(x1[14], x1[17]);
+ x2[15] = _mm_adds_epi16(x1[15], x1[16]);
+ x2[16] = _mm_subs_epi16(x1[15], x1[16]);
+ x2[32] = x1[32];
+ x2[33] = x1[33];
+ x2[34] = x1[34];
+ x2[35] = x1[35];
+ x2[36] = x1[36];
+ x2[37] = x1[37];
+ x2[38] = x1[38];
+ x2[39] = x1[39];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[40], x1[55], x2[40], x2[55]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[41], x1[54], x2[41], x2[54]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[42], x1[53], x2[42], x2[53]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[43], x1[52], x2[43], x2[52]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[44], x1[51], x2[44], x2[51]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[45], x1[50], x2[45], x2[50]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[46], x1[49], x2[46], x2[49]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x1[47], x1[48], x2[47], x2[48]);
+ x2[56] = x1[56];
+ x2[57] = x1[57];
+ x2[58] = x1[58];
+ x2[59] = x1[59];
+ x2[60] = x1[60];
+ x2[61] = x1[61];
+ x2[62] = x1[62];
+ x2[63] = x1[63];
+
+ // stage 3
+ __m128i x3[64];
+ x3[0] = _mm_adds_epi16(x2[0], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[0], x2[15]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[1], x2[14]);
+ x3[2] = _mm_adds_epi16(x2[2], x2[13]);
+ x3[13] = _mm_subs_epi16(x2[2], x2[13]);
+ x3[3] = _mm_adds_epi16(x2[3], x2[12]);
+ x3[12] = _mm_subs_epi16(x2[3], x2[12]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[4], x2[11]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[5], x2[10]);
+ x3[6] = _mm_adds_epi16(x2[6], x2[9]);
+ x3[9] = _mm_subs_epi16(x2[6], x2[9]);
+ x3[7] = _mm_adds_epi16(x2[7], x2[8]);
+ x3[8] = _mm_subs_epi16(x2[7], x2[8]);
+ x3[16] = x2[16];
+ x3[17] = x2[17];
+ x3[18] = x2[18];
+ x3[19] = x2[19];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[20], x2[27], x3[20], x3[27]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[21], x2[26], x3[21], x3[26]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[22], x2[25], x3[22], x3[25]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x2[23], x2[24], x3[23], x3[24]);
+ x3[28] = x2[28];
+ x3[29] = x2[29];
+ x3[30] = x2[30];
+ x3[31] = x2[31];
+ x3[32] = _mm_adds_epi16(x2[32], x2[47]);
+ x3[47] = _mm_subs_epi16(x2[32], x2[47]);
+ x3[33] = _mm_adds_epi16(x2[33], x2[46]);
+ x3[46] = _mm_subs_epi16(x2[33], x2[46]);
+ x3[34] = _mm_adds_epi16(x2[34], x2[45]);
+ x3[45] = _mm_subs_epi16(x2[34], x2[45]);
+ x3[35] = _mm_adds_epi16(x2[35], x2[44]);
+ x3[44] = _mm_subs_epi16(x2[35], x2[44]);
+ x3[36] = _mm_adds_epi16(x2[36], x2[43]);
+ x3[43] = _mm_subs_epi16(x2[36], x2[43]);
+ x3[37] = _mm_adds_epi16(x2[37], x2[42]);
+ x3[42] = _mm_subs_epi16(x2[37], x2[42]);
+ x3[38] = _mm_adds_epi16(x2[38], x2[41]);
+ x3[41] = _mm_subs_epi16(x2[38], x2[41]);
+ x3[39] = _mm_adds_epi16(x2[39], x2[40]);
+ x3[40] = _mm_subs_epi16(x2[39], x2[40]);
+ x3[48] = _mm_subs_epi16(x2[63], x2[48]);
+ x3[63] = _mm_adds_epi16(x2[63], x2[48]);
+ x3[49] = _mm_subs_epi16(x2[62], x2[49]);
+ x3[62] = _mm_adds_epi16(x2[62], x2[49]);
+ x3[50] = _mm_subs_epi16(x2[61], x2[50]);
+ x3[61] = _mm_adds_epi16(x2[61], x2[50]);
+ x3[51] = _mm_subs_epi16(x2[60], x2[51]);
+ x3[60] = _mm_adds_epi16(x2[60], x2[51]);
+ x3[52] = _mm_subs_epi16(x2[59], x2[52]);
+ x3[59] = _mm_adds_epi16(x2[59], x2[52]);
+ x3[53] = _mm_subs_epi16(x2[58], x2[53]);
+ x3[58] = _mm_adds_epi16(x2[58], x2[53]);
+ x3[54] = _mm_subs_epi16(x2[57], x2[54]);
+ x3[57] = _mm_adds_epi16(x2[57], x2[54]);
+ x3[55] = _mm_subs_epi16(x2[56], x2[55]);
+ x3[56] = _mm_adds_epi16(x2[56], x2[55]);
+
+ // stage 4
+ __m128i x4[64];
+ x4[0] = _mm_adds_epi16(x3[0], x3[7]);
+ x4[7] = _mm_subs_epi16(x3[0], x3[7]);
+ x4[1] = _mm_adds_epi16(x3[1], x3[6]);
+ x4[6] = _mm_subs_epi16(x3[1], x3[6]);
+ x4[2] = _mm_adds_epi16(x3[2], x3[5]);
+ x4[5] = _mm_subs_epi16(x3[2], x3[5]);
+ x4[3] = _mm_adds_epi16(x3[3], x3[4]);
+ x4[4] = _mm_subs_epi16(x3[3], x3[4]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[10], x3[13], x4[10], x4[13]);
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x3[11], x3[12], x4[11], x4[12]);
+ x4[14] = x3[14];
+ x4[15] = x3[15];
+ x4[16] = _mm_adds_epi16(x3[16], x3[23]);
+ x4[23] = _mm_subs_epi16(x3[16], x3[23]);
+ x4[17] = _mm_adds_epi16(x3[17], x3[22]);
+ x4[22] = _mm_subs_epi16(x3[17], x3[22]);
+ x4[18] = _mm_adds_epi16(x3[18], x3[21]);
+ x4[21] = _mm_subs_epi16(x3[18], x3[21]);
+ x4[19] = _mm_adds_epi16(x3[19], x3[20]);
+ x4[20] = _mm_subs_epi16(x3[19], x3[20]);
+ x4[24] = _mm_subs_epi16(x3[31], x3[24]);
+ x4[31] = _mm_adds_epi16(x3[31], x3[24]);
+ x4[25] = _mm_subs_epi16(x3[30], x3[25]);
+ x4[30] = _mm_adds_epi16(x3[30], x3[25]);
+ x4[26] = _mm_subs_epi16(x3[29], x3[26]);
+ x4[29] = _mm_adds_epi16(x3[29], x3[26]);
+ x4[27] = _mm_subs_epi16(x3[28], x3[27]);
+ x4[28] = _mm_adds_epi16(x3[28], x3[27]);
+ x4[32] = x3[32];
+ x4[33] = x3[33];
+ x4[34] = x3[34];
+ x4[35] = x3[35];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[36], x3[59], x4[36], x4[59]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[37], x3[58], x4[37], x4[58]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[38], x3[57], x4[38], x4[57]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x3[39], x3[56], x4[39], x4[56]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[40], x3[55], x4[40], x4[55]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[41], x3[54], x4[41], x4[54]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[42], x3[53], x4[42], x4[53]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x3[43], x3[52], x4[43], x4[52]);
+ x4[44] = x3[44];
+ x4[45] = x3[45];
+ x4[46] = x3[46];
+ x4[47] = x3[47];
+ x4[48] = x3[48];
+ x4[49] = x3[49];
+ x4[50] = x3[50];
+ x4[51] = x3[51];
+ x4[60] = x3[60];
+ x4[61] = x3[61];
+ x4[62] = x3[62];
+ x4[63] = x3[63];
+
+ // stage 5
+ __m128i x5[64];
+ x5[0] = _mm_adds_epi16(x4[0], x4[3]);
+ x5[3] = _mm_subs_epi16(x4[0], x4[3]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[2]);
+ x5[2] = _mm_subs_epi16(x4[1], x4[2]);
+ x5[4] = x4[4];
+ btf_16_sse2(cospi_m32_p32, cospi_p32_p32, x4[5], x4[6], x5[5], x5[6]);
+ x5[7] = x4[7];
+ x5[8] = _mm_adds_epi16(x4[8], x4[11]);
+ x5[11] = _mm_subs_epi16(x4[8], x4[11]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[10]);
+ x5[10] = _mm_subs_epi16(x4[9], x4[10]);
+ x5[12] = _mm_subs_epi16(x4[15], x4[12]);
+ x5[15] = _mm_adds_epi16(x4[15], x4[12]);
+ x5[13] = _mm_subs_epi16(x4[14], x4[13]);
+ x5[14] = _mm_adds_epi16(x4[14], x4[13]);
+ x5[16] = x4[16];
+ x5[17] = x4[17];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[18], x4[29], x5[18], x5[29]);
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x4[19], x4[28], x5[19], x5[28]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[20], x4[27], x5[20], x5[27]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x4[21], x4[26], x5[21], x5[26]);
+ x5[22] = x4[22];
+ x5[23] = x4[23];
+ x5[24] = x4[24];
+ x5[25] = x4[25];
+ x5[30] = x4[30];
+ x5[31] = x4[31];
+ x5[32] = _mm_adds_epi16(x4[32], x4[39]);
+ x5[39] = _mm_subs_epi16(x4[32], x4[39]);
+ x5[33] = _mm_adds_epi16(x4[33], x4[38]);
+ x5[38] = _mm_subs_epi16(x4[33], x4[38]);
+ x5[34] = _mm_adds_epi16(x4[34], x4[37]);
+ x5[37] = _mm_subs_epi16(x4[34], x4[37]);
+ x5[35] = _mm_adds_epi16(x4[35], x4[36]);
+ x5[36] = _mm_subs_epi16(x4[35], x4[36]);
+ x5[40] = _mm_subs_epi16(x4[47], x4[40]);
+ x5[47] = _mm_adds_epi16(x4[47], x4[40]);
+ x5[41] = _mm_subs_epi16(x4[46], x4[41]);
+ x5[46] = _mm_adds_epi16(x4[46], x4[41]);
+ x5[42] = _mm_subs_epi16(x4[45], x4[42]);
+ x5[45] = _mm_adds_epi16(x4[45], x4[42]);
+ x5[43] = _mm_subs_epi16(x4[44], x4[43]);
+ x5[44] = _mm_adds_epi16(x4[44], x4[43]);
+ x5[48] = _mm_adds_epi16(x4[48], x4[55]);
+ x5[55] = _mm_subs_epi16(x4[48], x4[55]);
+ x5[49] = _mm_adds_epi16(x4[49], x4[54]);
+ x5[54] = _mm_subs_epi16(x4[49], x4[54]);
+ x5[50] = _mm_adds_epi16(x4[50], x4[53]);
+ x5[53] = _mm_subs_epi16(x4[50], x4[53]);
+ x5[51] = _mm_adds_epi16(x4[51], x4[52]);
+ x5[52] = _mm_subs_epi16(x4[51], x4[52]);
+ x5[56] = _mm_subs_epi16(x4[63], x4[56]);
+ x5[63] = _mm_adds_epi16(x4[63], x4[56]);
+ x5[57] = _mm_subs_epi16(x4[62], x4[57]);
+ x5[62] = _mm_adds_epi16(x4[62], x4[57]);
+ x5[58] = _mm_subs_epi16(x4[61], x4[58]);
+ x5[61] = _mm_adds_epi16(x4[61], x4[58]);
+ x5[59] = _mm_subs_epi16(x4[60], x4[59]);
+ x5[60] = _mm_adds_epi16(x4[60], x4[59]);
+
+ // stage 6
+ __m128i x6[64];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p48_p16, cospi_m16_p48, x5[2], x5[3], x6[2], x6[3]);
+ x6[4] = _mm_adds_epi16(x5[4], x5[5]);
+ x6[5] = _mm_subs_epi16(x5[4], x5[5]);
+ x6[6] = _mm_subs_epi16(x5[7], x5[6]);
+ x6[7] = _mm_adds_epi16(x5[7], x5[6]);
+ x6[8] = x5[8];
+ btf_16_sse2(cospi_m16_p48, cospi_p48_p16, x5[9], x5[14], x6[9], x6[14]);
+ btf_16_sse2(cospi_m48_m16, cospi_m16_p48, x5[10], x5[13], x6[10], x6[13]);
+ x6[11] = x5[11];
+ x6[12] = x5[12];
+ x6[15] = x5[15];
+ x6[16] = _mm_adds_epi16(x5[16], x5[19]);
+ x6[19] = _mm_subs_epi16(x5[16], x5[19]);
+ x6[17] = _mm_adds_epi16(x5[17], x5[18]);
+ x6[18] = _mm_subs_epi16(x5[17], x5[18]);
+ x6[20] = _mm_subs_epi16(x5[23], x5[20]);
+ x6[23] = _mm_adds_epi16(x5[23], x5[20]);
+ x6[21] = _mm_subs_epi16(x5[22], x5[21]);
+ x6[22] = _mm_adds_epi16(x5[22], x5[21]);
+ x6[24] = _mm_adds_epi16(x5[24], x5[27]);
+ x6[27] = _mm_subs_epi16(x5[24], x5[27]);
+ x6[25] = _mm_adds_epi16(x5[25], x5[26]);
+ x6[26] = _mm_subs_epi16(x5[25], x5[26]);
+ x6[28] = _mm_subs_epi16(x5[31], x5[28]);
+ x6[31] = _mm_adds_epi16(x5[31], x5[28]);
+ x6[29] = _mm_subs_epi16(x5[30], x5[29]);
+ x6[30] = _mm_adds_epi16(x5[30], x5[29]);
+ x6[32] = x5[32];
+ x6[33] = x5[33];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[34], x5[61], x6[34], x6[61]);
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x5[35], x5[60], x6[35], x6[60]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[36], x5[59], x6[36], x6[59]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x5[37], x5[58], x6[37], x6[58]);
+ x6[38] = x5[38];
+ x6[39] = x5[39];
+ x6[40] = x5[40];
+ x6[41] = x5[41];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[42], x5[53], x6[42], x6[53]);
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x5[43], x5[52], x6[43], x6[52]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[44], x5[51], x6[44], x6[51]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x5[45], x5[50], x6[45], x6[50]);
+ x6[46] = x5[46];
+ x6[47] = x5[47];
+ x6[48] = x5[48];
+ x6[49] = x5[49];
+ x6[54] = x5[54];
+ x6[55] = x5[55];
+ x6[56] = x5[56];
+ x6[57] = x5[57];
+ x6[62] = x5[62];
+ x6[63] = x5[63];
+
+ // stage 7
+ __m128i x7[64];
+ x7[0] = x6[0];
+ x7[1] = x6[1];
+ x7[2] = x6[2];
+ x7[3] = x6[3];
+ btf_16_sse2(cospi_p56_p08, cospi_m08_p56, x6[4], x6[7], x7[4], x7[7]);
+ btf_16_sse2(cospi_p24_p40, cospi_m40_p24, x6[5], x6[6], x7[5], x7[6]);
+ x7[8] = _mm_adds_epi16(x6[8], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[8], x6[9]);
+ x7[10] = _mm_subs_epi16(x6[11], x6[10]);
+ x7[11] = _mm_adds_epi16(x6[11], x6[10]);
+ x7[12] = _mm_adds_epi16(x6[12], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[12], x6[13]);
+ x7[14] = _mm_subs_epi16(x6[15], x6[14]);
+ x7[15] = _mm_adds_epi16(x6[15], x6[14]);
+ x7[16] = x6[16];
+ btf_16_sse2(cospi_m08_p56, cospi_p56_p08, x6[17], x6[30], x7[17], x7[30]);
+ btf_16_sse2(cospi_m56_m08, cospi_m08_p56, x6[18], x6[29], x7[18], x7[29]);
+ x7[19] = x6[19];
+ x7[20] = x6[20];
+ btf_16_sse2(cospi_m40_p24, cospi_p24_p40, x6[21], x6[26], x7[21], x7[26]);
+ btf_16_sse2(cospi_m24_m40, cospi_m40_p24, x6[22], x6[25], x7[22], x7[25]);
+ x7[23] = x6[23];
+ x7[24] = x6[24];
+ x7[27] = x6[27];
+ x7[28] = x6[28];
+ x7[31] = x6[31];
+ x7[32] = _mm_adds_epi16(x6[32], x6[35]);
+ x7[35] = _mm_subs_epi16(x6[32], x6[35]);
+ x7[33] = _mm_adds_epi16(x6[33], x6[34]);
+ x7[34] = _mm_subs_epi16(x6[33], x6[34]);
+ x7[36] = _mm_subs_epi16(x6[39], x6[36]);
+ x7[39] = _mm_adds_epi16(x6[39], x6[36]);
+ x7[37] = _mm_subs_epi16(x6[38], x6[37]);
+ x7[38] = _mm_adds_epi16(x6[38], x6[37]);
+ x7[40] = _mm_adds_epi16(x6[40], x6[43]);
+ x7[43] = _mm_subs_epi16(x6[40], x6[43]);
+ x7[41] = _mm_adds_epi16(x6[41], x6[42]);
+ x7[42] = _mm_subs_epi16(x6[41], x6[42]);
+ x7[44] = _mm_subs_epi16(x6[47], x6[44]);
+ x7[47] = _mm_adds_epi16(x6[47], x6[44]);
+ x7[45] = _mm_subs_epi16(x6[46], x6[45]);
+ x7[46] = _mm_adds_epi16(x6[46], x6[45]);
+ x7[48] = _mm_adds_epi16(x6[48], x6[51]);
+ x7[51] = _mm_subs_epi16(x6[48], x6[51]);
+ x7[49] = _mm_adds_epi16(x6[49], x6[50]);
+ x7[50] = _mm_subs_epi16(x6[49], x6[50]);
+ x7[52] = _mm_subs_epi16(x6[55], x6[52]);
+ x7[55] = _mm_adds_epi16(x6[55], x6[52]);
+ x7[53] = _mm_subs_epi16(x6[54], x6[53]);
+ x7[54] = _mm_adds_epi16(x6[54], x6[53]);
+ x7[56] = _mm_adds_epi16(x6[56], x6[59]);
+ x7[59] = _mm_subs_epi16(x6[56], x6[59]);
+ x7[57] = _mm_adds_epi16(x6[57], x6[58]);
+ x7[58] = _mm_subs_epi16(x6[57], x6[58]);
+ x7[60] = _mm_subs_epi16(x6[63], x6[60]);
+ x7[63] = _mm_adds_epi16(x6[63], x6[60]);
+ x7[61] = _mm_subs_epi16(x6[62], x6[61]);
+ x7[62] = _mm_adds_epi16(x6[62], x6[61]);
+
+ // stage 8
+ __m128i x8[64];
+ x8[0] = x7[0];
+ x8[1] = x7[1];
+ x8[2] = x7[2];
+ x8[3] = x7[3];
+ x8[4] = x7[4];
+ x8[5] = x7[5];
+ x8[6] = x7[6];
+ x8[7] = x7[7];
+ btf_16_sse2(cospi_p60_p04, cospi_m04_p60, x7[8], x7[15], x8[8], x8[15]);
+ btf_16_sse2(cospi_p28_p36, cospi_m36_p28, x7[9], x7[14], x8[9], x8[14]);
+ btf_16_sse2(cospi_p44_p20, cospi_m20_p44, x7[10], x7[13], x8[10], x8[13]);
+ btf_16_sse2(cospi_p12_p52, cospi_m52_p12, x7[11], x7[12], x8[11], x8[12]);
+ x8[16] = _mm_adds_epi16(x7[16], x7[17]);
+ x8[17] = _mm_subs_epi16(x7[16], x7[17]);
+ x8[18] = _mm_subs_epi16(x7[19], x7[18]);
+ x8[19] = _mm_adds_epi16(x7[19], x7[18]);
+ x8[20] = _mm_adds_epi16(x7[20], x7[21]);
+ x8[21] = _mm_subs_epi16(x7[20], x7[21]);
+ x8[22] = _mm_subs_epi16(x7[23], x7[22]);
+ x8[23] = _mm_adds_epi16(x7[23], x7[22]);
+ x8[24] = _mm_adds_epi16(x7[24], x7[25]);
+ x8[25] = _mm_subs_epi16(x7[24], x7[25]);
+ x8[26] = _mm_subs_epi16(x7[27], x7[26]);
+ x8[27] = _mm_adds_epi16(x7[27], x7[26]);
+ x8[28] = _mm_adds_epi16(x7[28], x7[29]);
+ x8[29] = _mm_subs_epi16(x7[28], x7[29]);
+ x8[30] = _mm_subs_epi16(x7[31], x7[30]);
+ x8[31] = _mm_adds_epi16(x7[31], x7[30]);
+ x8[32] = x7[32];
+ btf_16_sse2(cospi_m04_p60, cospi_p60_p04, x7[33], x7[62], x8[33], x8[62]);
+ btf_16_sse2(cospi_m60_m04, cospi_m04_p60, x7[34], x7[61], x8[34], x8[61]);
+ x8[35] = x7[35];
+ x8[36] = x7[36];
+ btf_16_sse2(cospi_m36_p28, cospi_p28_p36, x7[37], x7[58], x8[37], x8[58]);
+ btf_16_sse2(cospi_m28_m36, cospi_m36_p28, x7[38], x7[57], x8[38], x8[57]);
+ x8[39] = x7[39];
+ x8[40] = x7[40];
+ btf_16_sse2(cospi_m20_p44, cospi_p44_p20, x7[41], x7[54], x8[41], x8[54]);
+ btf_16_sse2(cospi_m44_m20, cospi_m20_p44, x7[42], x7[53], x8[42], x8[53]);
+ x8[43] = x7[43];
+ x8[44] = x7[44];
+ btf_16_sse2(cospi_m52_p12, cospi_p12_p52, x7[45], x7[50], x8[45], x8[50]);
+ btf_16_sse2(cospi_m12_m52, cospi_m52_p12, x7[46], x7[49], x8[46], x8[49]);
+ x8[47] = x7[47];
+ x8[48] = x7[48];
+ x8[51] = x7[51];
+ x8[52] = x7[52];
+ x8[55] = x7[55];
+ x8[56] = x7[56];
+ x8[59] = x7[59];
+ x8[60] = x7[60];
+ x8[63] = x7[63];
+
+ // stage 9
+ __m128i x9[64];
+ x9[0] = x8[0];
+ x9[1] = x8[1];
+ x9[2] = x8[2];
+ x9[3] = x8[3];
+ x9[4] = x8[4];
+ x9[5] = x8[5];
+ x9[6] = x8[6];
+ x9[7] = x8[7];
+ x9[8] = x8[8];
+ x9[9] = x8[9];
+ x9[10] = x8[10];
+ x9[11] = x8[11];
+ x9[12] = x8[12];
+ x9[13] = x8[13];
+ x9[14] = x8[14];
+ x9[15] = x8[15];
+ btf_16_sse2(cospi_p62_p02, cospi_m02_p62, x8[16], x8[31], x9[16], x9[31]);
+ btf_16_sse2(cospi_p30_p34, cospi_m34_p30, x8[17], x8[30], x9[17], x9[30]);
+ btf_16_sse2(cospi_p46_p18, cospi_m18_p46, x8[18], x8[29], x9[18], x9[29]);
+ btf_16_sse2(cospi_p14_p50, cospi_m50_p14, x8[19], x8[28], x9[19], x9[28]);
+ btf_16_sse2(cospi_p54_p10, cospi_m10_p54, x8[20], x8[27], x9[20], x9[27]);
+ btf_16_sse2(cospi_p22_p42, cospi_m42_p22, x8[21], x8[26], x9[21], x9[26]);
+ btf_16_sse2(cospi_p38_p26, cospi_m26_p38, x8[22], x8[25], x9[22], x9[25]);
+ btf_16_sse2(cospi_p06_p58, cospi_m58_p06, x8[23], x8[24], x9[23], x9[24]);
+ x9[32] = _mm_adds_epi16(x8[32], x8[33]);
+ x9[33] = _mm_subs_epi16(x8[32], x8[33]);
+ x9[34] = _mm_subs_epi16(x8[35], x8[34]);
+ x9[35] = _mm_adds_epi16(x8[35], x8[34]);
+ x9[36] = _mm_adds_epi16(x8[36], x8[37]);
+ x9[37] = _mm_subs_epi16(x8[36], x8[37]);
+ x9[38] = _mm_subs_epi16(x8[39], x8[38]);
+ x9[39] = _mm_adds_epi16(x8[39], x8[38]);
+ x9[40] = _mm_adds_epi16(x8[40], x8[41]);
+ x9[41] = _mm_subs_epi16(x8[40], x8[41]);
+ x9[42] = _mm_subs_epi16(x8[43], x8[42]);
+ x9[43] = _mm_adds_epi16(x8[43], x8[42]);
+ x9[44] = _mm_adds_epi16(x8[44], x8[45]);
+ x9[45] = _mm_subs_epi16(x8[44], x8[45]);
+ x9[46] = _mm_subs_epi16(x8[47], x8[46]);
+ x9[47] = _mm_adds_epi16(x8[47], x8[46]);
+ x9[48] = _mm_adds_epi16(x8[48], x8[49]);
+ x9[49] = _mm_subs_epi16(x8[48], x8[49]);
+ x9[50] = _mm_subs_epi16(x8[51], x8[50]);
+ x9[51] = _mm_adds_epi16(x8[51], x8[50]);
+ x9[52] = _mm_adds_epi16(x8[52], x8[53]);
+ x9[53] = _mm_subs_epi16(x8[52], x8[53]);
+ x9[54] = _mm_subs_epi16(x8[55], x8[54]);
+ x9[55] = _mm_adds_epi16(x8[55], x8[54]);
+ x9[56] = _mm_adds_epi16(x8[56], x8[57]);
+ x9[57] = _mm_subs_epi16(x8[56], x8[57]);
+ x9[58] = _mm_subs_epi16(x8[59], x8[58]);
+ x9[59] = _mm_adds_epi16(x8[59], x8[58]);
+ x9[60] = _mm_adds_epi16(x8[60], x8[61]);
+ x9[61] = _mm_subs_epi16(x8[60], x8[61]);
+ x9[62] = _mm_subs_epi16(x8[63], x8[62]);
+ x9[63] = _mm_adds_epi16(x8[63], x8[62]);
+
+ // stage 10
+ __m128i x10[64];
+ x10[0] = x9[0];
+ x10[1] = x9[1];
+ x10[2] = x9[2];
+ x10[3] = x9[3];
+ x10[4] = x9[4];
+ x10[5] = x9[5];
+ x10[6] = x9[6];
+ x10[7] = x9[7];
+ x10[8] = x9[8];
+ x10[9] = x9[9];
+ x10[10] = x9[10];
+ x10[11] = x9[11];
+ x10[12] = x9[12];
+ x10[13] = x9[13];
+ x10[14] = x9[14];
+ x10[15] = x9[15];
+ x10[16] = x9[16];
+ x10[17] = x9[17];
+ x10[18] = x9[18];
+ x10[19] = x9[19];
+ x10[20] = x9[20];
+ x10[21] = x9[21];
+ x10[22] = x9[22];
+ x10[23] = x9[23];
+ x10[24] = x9[24];
+ x10[25] = x9[25];
+ x10[26] = x9[26];
+ x10[27] = x9[27];
+ x10[28] = x9[28];
+ x10[29] = x9[29];
+ x10[30] = x9[30];
+ x10[31] = x9[31];
+ btf_16_sse2(cospi_p63_p01, cospi_m01_p63, x9[32], x9[63], x10[32], x10[63]);
+ btf_16_sse2(cospi_p31_p33, cospi_m33_p31, x9[33], x9[62], x10[33], x10[62]);
+ btf_16_sse2(cospi_p47_p17, cospi_m17_p47, x9[34], x9[61], x10[34], x10[61]);
+ btf_16_sse2(cospi_p15_p49, cospi_m49_p15, x9[35], x9[60], x10[35], x10[60]);
+ btf_16_sse2(cospi_p55_p09, cospi_m09_p55, x9[36], x9[59], x10[36], x10[59]);
+ btf_16_sse2(cospi_p23_p41, cospi_m41_p23, x9[37], x9[58], x10[37], x10[58]);
+ btf_16_sse2(cospi_p39_p25, cospi_m25_p39, x9[38], x9[57], x10[38], x10[57]);
+ btf_16_sse2(cospi_p07_p57, cospi_m57_p07, x9[39], x9[56], x10[39], x10[56]);
+ btf_16_sse2(cospi_p59_p05, cospi_m05_p59, x9[40], x9[55], x10[40], x10[55]);
+ btf_16_sse2(cospi_p27_p37, cospi_m37_p27, x9[41], x9[54], x10[41], x10[54]);
+ btf_16_sse2(cospi_p43_p21, cospi_m21_p43, x9[42], x9[53], x10[42], x10[53]);
+ btf_16_sse2(cospi_p11_p53, cospi_m53_p11, x9[43], x9[52], x10[43], x10[52]);
+ btf_16_sse2(cospi_p51_p13, cospi_m13_p51, x9[44], x9[51], x10[44], x10[51]);
+ btf_16_sse2(cospi_p19_p45, cospi_m45_p19, x9[45], x9[50], x10[45], x10[50]);
+ btf_16_sse2(cospi_p35_p29, cospi_m29_p35, x9[46], x9[49], x10[46], x10[49]);
+ btf_16_sse2(cospi_p03_p61, cospi_m61_p03, x9[47], x9[48], x10[47], x10[48]);
+
+ // stage 11
+ output[0] = x10[0];
+ output[1] = x10[32];
+ output[2] = x10[16];
+ output[3] = x10[48];
+ output[4] = x10[8];
+ output[5] = x10[40];
+ output[6] = x10[24];
+ output[7] = x10[56];
+ output[8] = x10[4];
+ output[9] = x10[36];
+ output[10] = x10[20];
+ output[11] = x10[52];
+ output[12] = x10[12];
+ output[13] = x10[44];
+ output[14] = x10[28];
+ output[15] = x10[60];
+ output[16] = x10[2];
+ output[17] = x10[34];
+ output[18] = x10[18];
+ output[19] = x10[50];
+ output[20] = x10[10];
+ output[21] = x10[42];
+ output[22] = x10[26];
+ output[23] = x10[58];
+ output[24] = x10[6];
+ output[25] = x10[38];
+ output[26] = x10[22];
+ output[27] = x10[54];
+ output[28] = x10[14];
+ output[29] = x10[46];
+ output[30] = x10[30];
+ output[31] = x10[62];
+ output[32] = x10[1];
+ output[33] = x10[33];
+ output[34] = x10[17];
+ output[35] = x10[49];
+ output[36] = x10[9];
+ output[37] = x10[41];
+ output[38] = x10[25];
+ output[39] = x10[57];
+ output[40] = x10[5];
+ output[41] = x10[37];
+ output[42] = x10[21];
+ output[43] = x10[53];
+ output[44] = x10[13];
+ output[45] = x10[45];
+ output[46] = x10[29];
+ output[47] = x10[61];
+ output[48] = x10[3];
+ output[49] = x10[35];
+ output[50] = x10[19];
+ output[51] = x10[51];
+ output[52] = x10[11];
+ output[53] = x10[43];
+ output[54] = x10[27];
+ output[55] = x10[59];
+ output[56] = x10[7];
+ output[57] = x10[39];
+ output[58] = x10[23];
+ output[59] = x10[55];
+ output[60] = x10[15];
+ output[61] = x10[47];
+ output[62] = x10[31];
+ output[63] = x10[63];
+}
+
+// 4-point forward ADST over the low four 16-bit lanes of input[0..3].
+//
+// input   : four vectors; only their low 64 bits are consumed (every unpack
+//           below is the "lo" variant).
+// output  : four vectors. output[0] and output[1] hold results 0/2 and 1/3 in
+//           their low/high halves; output[2] and output[3] are those high
+//           halves shifted down to the low 64 bits.
+// cos_bit : selects the sine table via sinpi_arr() and is the rounding right
+//           shift applied to the 32-bit sums; must be >= 1 so that
+//           1 << (cos_bit - 1) is a valid shift.
+static void fadst4x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i round_offset = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  // Coefficient pairs consumed by _mm_madd_epi16.
+  const __m128i k_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i k_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i k_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i k_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i k_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+
+  // Read every input row before any output is stored.
+  const __m128i row0 = input[0];
+  const __m128i row1 = input[1];
+  const __m128i row2 = input[2];
+  const __m128i row3 = input[3];
+  // Plain (wrapping) 16-bit add, not a saturating one.
+  const __m128i row01 = _mm_add_epi16(row0, row1);
+
+  // Interleave 16-bit lanes so each madd produces one 32-bit sum per column.
+  // Pairing with zero turns the madd into a single widening multiply.
+  const __m128i in01 = _mm_unpacklo_epi16(row0, row1);
+  const __m128i in23 = _mm_unpacklo_epi16(row2, row3);
+  const __m128i sum01 = _mm_unpacklo_epi16(row01, zero);
+  const __m128i in2 = _mm_unpacklo_epi16(row2, zero);
+  const __m128i in3 = _mm_unpacklo_epi16(row3, zero);
+
+  const __m128i t0 = _mm_madd_epi16(in01, k_p01_p02);   // rows 0,1 x (s1, s2)
+  const __m128i t1 = _mm_madd_epi16(in23, k_p03_p04);   // rows 2,3 x (s3, s4)
+  const __m128i t2 = _mm_madd_epi16(sum01, k_p03_p03);  // (row0 + row1) x s3
+  const __m128i t3 = _mm_madd_epi16(in01, k_p04_m01);   // rows 0,1 x (s4, -s1)
+  const __m128i t4 = _mm_madd_epi16(in23, k_m03_p02);   // rows 2,3 x (-s3, s2)
+  const __m128i t5 = _mm_madd_epi16(in2, k_p03_p03);    // row2 x s3
+  const __m128i t6 = _mm_madd_epi16(in3, k_p03_p03);    // row3 x s3
+
+  // Combine the partial products into the four unrounded results.
+  const __m128i y0 = _mm_add_epi32(t0, t1);
+  const __m128i y1 = _mm_sub_epi32(t2, t6);
+  const __m128i y2 = _mm_add_epi32(t3, t4);
+  // 3 * t5, formed as (t5 << 2) - t5.
+  const __m128i t5x3 = _mm_sub_epi32(_mm_slli_epi32(t5, 2), t5);
+  const __m128i y3 = _mm_add_epi32(_mm_sub_epi32(y2, y0), t5x3);
+
+  // Round to nearest and drop cos_bit fractional bits.
+  const __m128i res0 = _mm_srai_epi32(_mm_add_epi32(y0, round_offset), cos_bit);
+  const __m128i res1 = _mm_srai_epi32(_mm_add_epi32(y1, round_offset), cos_bit);
+  const __m128i res2 = _mm_srai_epi32(_mm_add_epi32(y2, round_offset), cos_bit);
+  const __m128i res3 = _mm_srai_epi32(_mm_add_epi32(y3, round_offset), cos_bit);
+
+  // Saturating pack to 16 bits: results 0/1 land in the low halves and
+  // results 2/3 in the high halves, which are then shifted down by 8 bytes.
+  output[0] = _mm_packs_epi32(res0, res2);
+  output[1] = _mm_packs_epi32(res1, res3);
+  output[2] = _mm_srli_si128(output[0], 8);
+  output[3] = _mm_srli_si128(output[1], 8);
+}
+
+// Seven-stage butterfly network over eight __m128i vectors of 16-bit lanes;
+// by its name and coefficient set this appears to be the 8-point forward ADST
+// for 4-lane-wide blocks -- TODO confirm against the scalar reference.
+//
+// input   : eight vectors, input[0..7]; all are read in stage 1.
+// output  : eight vectors, output[0..7]; written only in stage 7.
+// cos_bit : selects the cosine table via cospi_arr() and is forwarded to
+//           btf_16_w4_sse2 together with the rounding constant
+//           1 << (cos_bit - 1); must be >= 1 for that shift to be valid.
+//
+// NOTE(review): btf_16_w4_sse2 is defined elsewhere. From its arguments it
+// presumably applies the two weight pairs to (in0, in1), rounds, shifts by
+// cos_bit and processes only the low four lanes ("w4") -- verify.
+static void fadst4x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Weight pairs handed to btf_16_w4_sse2; the names encode sign (p/m) and
+ // cospi index of each half of the pair.
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1: reorder the inputs as 0,7,3,4,1,6,2,5 and negate rows 7, 3, 1
+ // and 5 with a saturating subtract from zero.
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2: elements 0,1,4,5 pass through; pairs (2,3) and (6,7) go through
+ // the cospi[32] butterfly.
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[2],
+ &x1[3], &x2[2], &x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_w4_sse2(&cospi_p32_p32, &cospi_p32_m32, __rounding, cos_bit, &x1[6],
+ &x1[7], &x2[6], &x2[7]);
+
+ // stage 3: saturating add/sub butterflies at distance 2 within each group
+ // of four.
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4: elements 0..3 pass through; pairs (4,5) and (6,7) go through the
+ // cospi[16]/cospi[48] butterflies.
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_w4_sse2(&cospi_p16_p48, &cospi_p48_m16, __rounding, cos_bit, &x3[4],
+ &x3[5], &x4[4], &x4[5]);
+ btf_16_w4_sse2(&cospi_m48_p16, &cospi_p16_p48, __rounding, cos_bit, &x3[6],
+ &x3[7], &x4[6], &x4[7]);
+
+ // stage 5: saturating add/sub butterflies at distance 4 (i with i + 4).
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6: final butterflies on adjacent pairs, each with its own
+ // coefficient pair (4/60, 20/44, 36/28, 52/12).
+ __m128i x6[8];
+ btf_16_w4_sse2(&cospi_p04_p60, &cospi_p60_m04, __rounding, cos_bit, &x5[0],
+ &x5[1], &x6[0], &x6[1]);
+ btf_16_w4_sse2(&cospi_p20_p44, &cospi_p44_m20, __rounding, cos_bit, &x5[2],
+ &x5[3], &x6[2], &x6[3]);
+ btf_16_w4_sse2(&cospi_p36_p28, &cospi_p28_m36, __rounding, cos_bit, &x5[4],
+ &x5[5], &x6[4], &x6[5]);
+ btf_16_w4_sse2(&cospi_p52_p12, &cospi_p12_m52, __rounding, cos_bit, &x5[6],
+ &x5[7], &x6[6], &x6[7]);
+
+ // stage 7: permute the stage-6 results into output order.
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+// 4-point forward ADST over all eight 16-bit lanes of input[0..3].
+//
+// The arithmetic is the same for the low and high four lanes, so the two
+// halves are processed by one loop body and packed back together at the end.
+//
+// input   : four vectors, fully consumed.
+// output  : four vectors, output[k] holding result k for all eight lanes.
+// cos_bit : selects the sine table via sinpi_arr() and is the rounding right
+//           shift applied to the 32-bit sums; must be >= 1 so that
+//           1 << (cos_bit - 1) is a valid shift.
+static void fadst8x4_new_sse2(const __m128i *input, __m128i *output,
+                              int8_t cos_bit) {
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i round_offset = _mm_set1_epi32(1 << (cos_bit - 1));
+
+  // Coefficient pairs consumed by _mm_madd_epi16.
+  const __m128i k_p01_p02 = pair_set_epi16(sinpi[1], sinpi[2]);
+  const __m128i k_p04_m01 = pair_set_epi16(sinpi[4], -sinpi[1]);
+  const __m128i k_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  const __m128i k_m03_p02 = pair_set_epi16(-sinpi[3], sinpi[2]);
+  const __m128i k_p03_p03 = _mm_set1_epi16((int16_t)sinpi[3]);
+
+  // Read every input row before any output is stored.
+  const __m128i row0 = input[0];
+  const __m128i row1 = input[1];
+  const __m128i row2 = input[2];
+  const __m128i row3 = input[3];
+  // Plain (wrapping) 16-bit add, not a saturating one.
+  const __m128i row01 = _mm_add_epi16(row0, row1);
+
+  // res[k][0] / res[k][1]: result k for the low / high four lanes.
+  __m128i res[4][2];
+
+  for (int half = 0; half < 2; ++half) {
+    // Interleave 16-bit lanes so each madd produces one 32-bit sum per
+    // column. Pairing with zero makes the madd a single widening multiply.
+    const __m128i in01 = half ? _mm_unpackhi_epi16(row0, row1)
+                              : _mm_unpacklo_epi16(row0, row1);
+    const __m128i in23 = half ? _mm_unpackhi_epi16(row2, row3)
+                              : _mm_unpacklo_epi16(row2, row3);
+    const __m128i sum01 = half ? _mm_unpackhi_epi16(row01, zero)
+                               : _mm_unpacklo_epi16(row01, zero);
+    const __m128i in2 = half ? _mm_unpackhi_epi16(row2, zero)
+                             : _mm_unpacklo_epi16(row2, zero);
+    const __m128i in3 = half ? _mm_unpackhi_epi16(row3, zero)
+                             : _mm_unpacklo_epi16(row3, zero);
+
+    const __m128i t0 = _mm_madd_epi16(in01, k_p01_p02);   // rows 0,1 x (s1, s2)
+    const __m128i t1 = _mm_madd_epi16(in23, k_p03_p04);   // rows 2,3 x (s3, s4)
+    const __m128i t2 = _mm_madd_epi16(sum01, k_p03_p03);  // (row0 + row1) x s3
+    const __m128i t3 = _mm_madd_epi16(in01, k_p04_m01);   // rows 0,1 x (s4, -s1)
+    const __m128i t4 = _mm_madd_epi16(in23, k_m03_p02);   // rows 2,3 x (-s3, s2)
+    const __m128i t5 = _mm_madd_epi16(in2, k_p03_p03);    // row2 x s3
+    const __m128i t6 = _mm_madd_epi16(in3, k_p03_p03);    // row3 x s3
+
+    // Combine the partial products into the four unrounded results.
+    const __m128i y0 = _mm_add_epi32(t0, t1);
+    const __m128i y1 = _mm_sub_epi32(t2, t6);
+    const __m128i y2 = _mm_add_epi32(t3, t4);
+    // 3 * t5, formed as (t5 << 2) - t5.
+    const __m128i t5x3 = _mm_sub_epi32(_mm_slli_epi32(t5, 2), t5);
+    const __m128i y3 = _mm_add_epi32(_mm_sub_epi32(y2, y0), t5x3);
+
+    // Round to nearest and drop cos_bit fractional bits.
+    res[0][half] = _mm_srai_epi32(_mm_add_epi32(y0, round_offset), cos_bit);
+    res[1][half] = _mm_srai_epi32(_mm_add_epi32(y1, round_offset), cos_bit);
+    res[2][half] = _mm_srai_epi32(_mm_add_epi32(y2, round_offset), cos_bit);
+    res[3][half] = _mm_srai_epi32(_mm_add_epi32(y3, round_offset), cos_bit);
+  }
+
+  // Saturating pack back to 16 bits, low half first.
+  output[0] = _mm_packs_epi32(res[0][0], res[0][1]);
+  output[1] = _mm_packs_epi32(res[1][0], res[1][1]);
+  output[2] = _mm_packs_epi32(res[2][0], res[2][1]);
+  output[3] = _mm_packs_epi32(res[3][0], res[3][1]);
+}
+
+// 8-point "fadst" kernel on packed 16-bit lanes (SSE2). It is the entry placed
+// in the ADST/FLIPADST slots of the *8x8 kernel tables in this file.
+// Reads input[0..7] and writes output[0..7]; cos_bit selects the cosine table
+// returned by cospi_arr().
+// NOTE(review): __rounding and cos_bit are never referenced by name after the
+// prologue; presumably btf_16_sse2 picks them up from the enclosing scope --
+// confirm against its definition before renaming or removing either.
+static void fadst8x8_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed (a, b) coefficient pairs consumed by the btf_16_sse2 butterflies.
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p04_p60 = pair_set_epi16(cospi[4], cospi[60]);
+ __m128i cospi_p60_m04 = pair_set_epi16(cospi[60], -cospi[4]);
+ __m128i cospi_p20_p44 = pair_set_epi16(cospi[20], cospi[44]);
+ __m128i cospi_p44_m20 = pair_set_epi16(cospi[44], -cospi[20]);
+ __m128i cospi_p36_p28 = pair_set_epi16(cospi[36], cospi[28]);
+ __m128i cospi_p28_m36 = pair_set_epi16(cospi[28], -cospi[36]);
+ __m128i cospi_p52_p12 = pair_set_epi16(cospi[52], cospi[12]);
+ __m128i cospi_p12_m52 = pair_set_epi16(cospi[12], -cospi[52]);
+
+ // stage 1
+ // Input permutation; selected rows are negated via saturating 0 - x.
+ __m128i x1[8];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[7]);
+ x1[2] = _mm_subs_epi16(__zero, input[3]);
+ x1[3] = input[4];
+ x1[4] = _mm_subs_epi16(__zero, input[1]);
+ x1[5] = input[6];
+ x1[6] = input[2];
+ x1[7] = _mm_subs_epi16(__zero, input[5]);
+
+ // stage 2
+ // Pairs (2,3) and (6,7) go through a cospi[32] butterfly; the rest pass
+ // through unchanged.
+ __m128i x2[8];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+
+ // stage 3
+ // Saturating sum/difference of element k with element k + 2.
+ __m128i x3[8];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+
+ // stage 4
+ // Only the upper half (4..7) is rotated.
+ __m128i x4[8];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+
+ // stage 5
+ // Saturating sum/difference of element k with element k + 4.
+ __m128i x5[8];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+
+ // stage 6
+ // Final butterflies, one per adjacent pair.
+ __m128i x6[8];
+ btf_16_sse2(cospi_p04_p60, cospi_p60_m04, x5[0], x5[1], x6[0], x6[1]);
+ btf_16_sse2(cospi_p20_p44, cospi_p44_m20, x5[2], x5[3], x6[2], x6[3]);
+ btf_16_sse2(cospi_p36_p28, cospi_p28_m36, x5[4], x5[5], x6[4], x6[5]);
+ btf_16_sse2(cospi_p52_p12, cospi_p12_m52, x5[6], x5[7], x6[6], x6[7]);
+
+ // stage 7
+ // Output permutation.
+ output[0] = x6[1];
+ output[1] = x6[6];
+ output[2] = x6[3];
+ output[3] = x6[4];
+ output[4] = x6[5];
+ output[5] = x6[2];
+ output[6] = x6[7];
+ output[7] = x6[0];
+}
+
+// 16-point "fadst" kernel on packed 16-bit lanes (SSE2). It is the entry
+// placed in the ADST/FLIPADST slots of the *8x16 kernel tables in this file.
+// Reads input[0..15] and writes output[0..15]; cos_bit selects the cosine
+// table returned by cospi_arr().
+// NOTE(review): __rounding and cos_bit are never referenced by name after the
+// prologue; presumably btf_16_sse2 picks them up from the enclosing scope --
+// confirm against its definition before renaming or removing either.
+static void fadst8x16_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ const int32_t *cospi = cospi_arr(cos_bit);
+ const __m128i __zero = _mm_setzero_si128();
+ const __m128i __rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+
+ // Packed (a, b) coefficient pairs consumed by the btf_16_sse2 butterflies.
+ __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
+ __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
+ __m128i cospi_p16_p48 = pair_set_epi16(cospi[16], cospi[48]);
+ __m128i cospi_p48_m16 = pair_set_epi16(cospi[48], -cospi[16]);
+ __m128i cospi_m48_p16 = pair_set_epi16(-cospi[48], cospi[16]);
+ __m128i cospi_p08_p56 = pair_set_epi16(cospi[8], cospi[56]);
+ __m128i cospi_p56_m08 = pair_set_epi16(cospi[56], -cospi[8]);
+ __m128i cospi_p40_p24 = pair_set_epi16(cospi[40], cospi[24]);
+ __m128i cospi_p24_m40 = pair_set_epi16(cospi[24], -cospi[40]);
+ __m128i cospi_m56_p08 = pair_set_epi16(-cospi[56], cospi[8]);
+ __m128i cospi_m24_p40 = pair_set_epi16(-cospi[24], cospi[40]);
+ __m128i cospi_p02_p62 = pair_set_epi16(cospi[2], cospi[62]);
+ __m128i cospi_p62_m02 = pair_set_epi16(cospi[62], -cospi[2]);
+ __m128i cospi_p10_p54 = pair_set_epi16(cospi[10], cospi[54]);
+ __m128i cospi_p54_m10 = pair_set_epi16(cospi[54], -cospi[10]);
+ __m128i cospi_p18_p46 = pair_set_epi16(cospi[18], cospi[46]);
+ __m128i cospi_p46_m18 = pair_set_epi16(cospi[46], -cospi[18]);
+ __m128i cospi_p26_p38 = pair_set_epi16(cospi[26], cospi[38]);
+ __m128i cospi_p38_m26 = pair_set_epi16(cospi[38], -cospi[26]);
+ __m128i cospi_p34_p30 = pair_set_epi16(cospi[34], cospi[30]);
+ __m128i cospi_p30_m34 = pair_set_epi16(cospi[30], -cospi[34]);
+ __m128i cospi_p42_p22 = pair_set_epi16(cospi[42], cospi[22]);
+ __m128i cospi_p22_m42 = pair_set_epi16(cospi[22], -cospi[42]);
+ __m128i cospi_p50_p14 = pair_set_epi16(cospi[50], cospi[14]);
+ __m128i cospi_p14_m50 = pair_set_epi16(cospi[14], -cospi[50]);
+ __m128i cospi_p58_p06 = pair_set_epi16(cospi[58], cospi[6]);
+ __m128i cospi_p06_m58 = pair_set_epi16(cospi[6], -cospi[58]);
+
+ // stage 1
+ // Input permutation; selected rows are negated via saturating 0 - x.
+ __m128i x1[16];
+ x1[0] = input[0];
+ x1[1] = _mm_subs_epi16(__zero, input[15]);
+ x1[2] = _mm_subs_epi16(__zero, input[7]);
+ x1[3] = input[8];
+ x1[4] = _mm_subs_epi16(__zero, input[3]);
+ x1[5] = input[12];
+ x1[6] = input[4];
+ x1[7] = _mm_subs_epi16(__zero, input[11]);
+ x1[8] = _mm_subs_epi16(__zero, input[1]);
+ x1[9] = input[14];
+ x1[10] = input[6];
+ x1[11] = _mm_subs_epi16(__zero, input[9]);
+ x1[12] = input[2];
+ x1[13] = _mm_subs_epi16(__zero, input[13]);
+ x1[14] = _mm_subs_epi16(__zero, input[5]);
+ x1[15] = input[10];
+
+ // stage 2
+ // In every group of four, the last pair goes through a cospi[32] butterfly;
+ // the first pair passes through unchanged.
+ __m128i x2[16];
+ x2[0] = x1[0];
+ x2[1] = x1[1];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[2], x1[3], x2[2], x2[3]);
+ x2[4] = x1[4];
+ x2[5] = x1[5];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[6], x1[7], x2[6], x2[7]);
+ x2[8] = x1[8];
+ x2[9] = x1[9];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[10], x1[11], x2[10], x2[11]);
+ x2[12] = x1[12];
+ x2[13] = x1[13];
+ btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x1[14], x1[15], x2[14], x2[15]);
+
+ // stage 3
+ // Saturating sum/difference of element k with element k + 2.
+ __m128i x3[16];
+ x3[0] = _mm_adds_epi16(x2[0], x2[2]);
+ x3[2] = _mm_subs_epi16(x2[0], x2[2]);
+ x3[1] = _mm_adds_epi16(x2[1], x2[3]);
+ x3[3] = _mm_subs_epi16(x2[1], x2[3]);
+ x3[4] = _mm_adds_epi16(x2[4], x2[6]);
+ x3[6] = _mm_subs_epi16(x2[4], x2[6]);
+ x3[5] = _mm_adds_epi16(x2[5], x2[7]);
+ x3[7] = _mm_subs_epi16(x2[5], x2[7]);
+ x3[8] = _mm_adds_epi16(x2[8], x2[10]);
+ x3[10] = _mm_subs_epi16(x2[8], x2[10]);
+ x3[9] = _mm_adds_epi16(x2[9], x2[11]);
+ x3[11] = _mm_subs_epi16(x2[9], x2[11]);
+ x3[12] = _mm_adds_epi16(x2[12], x2[14]);
+ x3[14] = _mm_subs_epi16(x2[12], x2[14]);
+ x3[13] = _mm_adds_epi16(x2[13], x2[15]);
+ x3[15] = _mm_subs_epi16(x2[13], x2[15]);
+
+ // stage 4
+ // In each group of eight, only the upper four elements are rotated.
+ __m128i x4[16];
+ x4[0] = x3[0];
+ x4[1] = x3[1];
+ x4[2] = x3[2];
+ x4[3] = x3[3];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[4], x3[5], x4[4], x4[5]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[6], x3[7], x4[6], x4[7]);
+ x4[8] = x3[8];
+ x4[9] = x3[9];
+ x4[10] = x3[10];
+ x4[11] = x3[11];
+ btf_16_sse2(cospi_p16_p48, cospi_p48_m16, x3[12], x3[13], x4[12], x4[13]);
+ btf_16_sse2(cospi_m48_p16, cospi_p16_p48, x3[14], x3[15], x4[14], x4[15]);
+
+ // stage 5
+ // Saturating sum/difference of element k with element k + 4.
+ __m128i x5[16];
+ x5[0] = _mm_adds_epi16(x4[0], x4[4]);
+ x5[4] = _mm_subs_epi16(x4[0], x4[4]);
+ x5[1] = _mm_adds_epi16(x4[1], x4[5]);
+ x5[5] = _mm_subs_epi16(x4[1], x4[5]);
+ x5[2] = _mm_adds_epi16(x4[2], x4[6]);
+ x5[6] = _mm_subs_epi16(x4[2], x4[6]);
+ x5[3] = _mm_adds_epi16(x4[3], x4[7]);
+ x5[7] = _mm_subs_epi16(x4[3], x4[7]);
+ x5[8] = _mm_adds_epi16(x4[8], x4[12]);
+ x5[12] = _mm_subs_epi16(x4[8], x4[12]);
+ x5[9] = _mm_adds_epi16(x4[9], x4[13]);
+ x5[13] = _mm_subs_epi16(x4[9], x4[13]);
+ x5[10] = _mm_adds_epi16(x4[10], x4[14]);
+ x5[14] = _mm_subs_epi16(x4[10], x4[14]);
+ x5[11] = _mm_adds_epi16(x4[11], x4[15]);
+ x5[15] = _mm_subs_epi16(x4[11], x4[15]);
+
+ // stage 6
+ // Lower half passes through; upper half (8..15) is rotated pairwise.
+ __m128i x6[16];
+ x6[0] = x5[0];
+ x6[1] = x5[1];
+ x6[2] = x5[2];
+ x6[3] = x5[3];
+ x6[4] = x5[4];
+ x6[5] = x5[5];
+ x6[6] = x5[6];
+ x6[7] = x5[7];
+ btf_16_sse2(cospi_p08_p56, cospi_p56_m08, x5[8], x5[9], x6[8], x6[9]);
+ btf_16_sse2(cospi_p40_p24, cospi_p24_m40, x5[10], x5[11], x6[10], x6[11]);
+ btf_16_sse2(cospi_m56_p08, cospi_p08_p56, x5[12], x5[13], x6[12], x6[13]);
+ btf_16_sse2(cospi_m24_p40, cospi_p40_p24, x5[14], x5[15], x6[14], x6[15]);
+
+ // stage 7
+ // Saturating sum/difference of element k with element k + 8.
+ __m128i x7[16];
+ x7[0] = _mm_adds_epi16(x6[0], x6[8]);
+ x7[8] = _mm_subs_epi16(x6[0], x6[8]);
+ x7[1] = _mm_adds_epi16(x6[1], x6[9]);
+ x7[9] = _mm_subs_epi16(x6[1], x6[9]);
+ x7[2] = _mm_adds_epi16(x6[2], x6[10]);
+ x7[10] = _mm_subs_epi16(x6[2], x6[10]);
+ x7[3] = _mm_adds_epi16(x6[3], x6[11]);
+ x7[11] = _mm_subs_epi16(x6[3], x6[11]);
+ x7[4] = _mm_adds_epi16(x6[4], x6[12]);
+ x7[12] = _mm_subs_epi16(x6[4], x6[12]);
+ x7[5] = _mm_adds_epi16(x6[5], x6[13]);
+ x7[13] = _mm_subs_epi16(x6[5], x6[13]);
+ x7[6] = _mm_adds_epi16(x6[6], x6[14]);
+ x7[14] = _mm_subs_epi16(x6[6], x6[14]);
+ x7[7] = _mm_adds_epi16(x6[7], x6[15]);
+ x7[15] = _mm_subs_epi16(x6[7], x6[15]);
+
+ // stage 8
+ // Final butterflies, one per adjacent pair.
+ __m128i x8[16];
+ btf_16_sse2(cospi_p02_p62, cospi_p62_m02, x7[0], x7[1], x8[0], x8[1]);
+ btf_16_sse2(cospi_p10_p54, cospi_p54_m10, x7[2], x7[3], x8[2], x8[3]);
+ btf_16_sse2(cospi_p18_p46, cospi_p46_m18, x7[4], x7[5], x8[4], x8[5]);
+ btf_16_sse2(cospi_p26_p38, cospi_p38_m26, x7[6], x7[7], x8[6], x8[7]);
+ btf_16_sse2(cospi_p34_p30, cospi_p30_m34, x7[8], x7[9], x8[8], x8[9]);
+ btf_16_sse2(cospi_p42_p22, cospi_p22_m42, x7[10], x7[11], x8[10], x8[11]);
+ btf_16_sse2(cospi_p50_p14, cospi_p14_m50, x7[12], x7[13], x8[12], x8[13]);
+ btf_16_sse2(cospi_p58_p06, cospi_p06_m58, x7[14], x7[15], x8[14], x8[15]);
+
+ // stage 9
+ // Output permutation.
+ output[0] = x8[1];
+ output[1] = x8[14];
+ output[2] = x8[3];
+ output[3] = x8[12];
+ output[4] = x8[5];
+ output[5] = x8[10];
+ output[6] = x8[7];
+ output[7] = x8[8];
+ output[8] = x8[9];
+ output[9] = x8[6];
+ output[10] = x8[11];
+ output[11] = x8[4];
+ output[12] = x8[13];
+ output[13] = x8[2];
+ output[14] = x8[15];
+ output[15] = x8[0];
+}
+
+// Column-pass 1-D kernels for av1_lowbd_fwd_txfm2d_4x4_sse2, looked up with
+// the tx_type argument. Entries are positional: the trailing comment on each
+// row names the transform type that slot is meant for, so the order must stay
+// in step with the TX_TYPE enumeration.
+static const transform_1d_sse2 col_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fadst4x4_new_sse2, // ADST_DCT
+ fdct4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fadst4x4_new_sse2, // FLIPADST_DCT
+ fdct4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fdct4x4_new_sse2, // V_DCT
+ fidentity4x4_new_sse2, // H_DCT
+ fadst4x4_new_sse2, // V_ADST
+ fidentity4x4_new_sse2, // H_ADST
+ fadst4x4_new_sse2, // V_FLIPADST
+ fidentity4x4_new_sse2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for av1_lowbd_fwd_txfm2d_4x4_sse2, looked up with the
+// tx_type argument. Entries are positional: the trailing comment on each row
+// names the transform type that slot is meant for, so the order must stay in
+// step with the TX_TYPE enumeration.
+static const transform_1d_sse2 row_txfm4x4_arr[TX_TYPES] = {
+ fdct4x4_new_sse2, // DCT_DCT
+ fdct4x4_new_sse2, // ADST_DCT
+ fadst4x4_new_sse2, // DCT_ADST
+ fadst4x4_new_sse2, // ADST_ADST
+ fdct4x4_new_sse2, // FLIPADST_DCT
+ fadst4x4_new_sse2, // DCT_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_FLIPADST
+ fadst4x4_new_sse2, // ADST_FLIPADST
+ fadst4x4_new_sse2, // FLIPADST_ADST
+ fidentity4x4_new_sse2, // IDTX
+ fidentity4x4_new_sse2, // V_DCT
+ fdct4x4_new_sse2, // H_DCT
+ fidentity4x4_new_sse2, // V_ADST
+ fadst4x4_new_sse2, // H_ADST
+ fidentity4x4_new_sse2, // V_FLIPADST
+ fadst4x4_new_sse2 // H_FLIPADST
+};
+
+// Column-pass 1-D kernels used by av1_lowbd_fwd_txfm2d_4x8_sse2, looked up
+// with the tx_type argument. Entries are positional; the trailing comment on
+// each row names the transform type that slot is meant for.
+// Note the identity slots use fidentity8x8_new_sse2 rather than a 4x8 variant.
+static const transform_1d_sse2 col_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fadst4x8_new_sse2, // ADST_DCT
+ fdct4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fadst4x8_new_sse2, // FLIPADST_DCT
+ fdct4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct4x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst4x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst4x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels used by the 4x8 and 4x16 2-D transforms in this file,
+// looked up with the tx_type argument. Entries are positional; the trailing
+// comment on each row names the transform type that slot is meant for.
+static const transform_1d_sse2 row_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fdct8x4_new_sse2, // ADST_DCT
+ fadst8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fdct8x4_new_sse2, // FLIPADST_DCT
+ fadst8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fidentity8x4_new_sse2, // V_DCT
+ fdct8x4_new_sse2, // H_DCT
+ fidentity8x4_new_sse2, // V_ADST
+ fadst8x4_new_sse2, // H_ADST
+ fidentity8x4_new_sse2, // V_FLIPADST
+ fadst8x4_new_sse2 // H_FLIPADST
+};
+
+// Column-pass 1-D kernels used by the 8x4 and 16x4 2-D transforms in this
+// file, looked up with the tx_type argument. Entries are positional; the
+// trailing comment on each row names the transform type that slot is meant
+// for.
+static const transform_1d_sse2 col_txfm8x4_arr[TX_TYPES] = {
+ fdct8x4_new_sse2, // DCT_DCT
+ fadst8x4_new_sse2, // ADST_DCT
+ fdct8x4_new_sse2, // DCT_ADST
+ fadst8x4_new_sse2, // ADST_ADST
+ fadst8x4_new_sse2, // FLIPADST_DCT
+ fdct8x4_new_sse2, // DCT_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_FLIPADST
+ fadst8x4_new_sse2, // ADST_FLIPADST
+ fadst8x4_new_sse2, // FLIPADST_ADST
+ fidentity8x4_new_sse2, // IDTX
+ fdct8x4_new_sse2, // V_DCT
+ fidentity8x4_new_sse2, // H_DCT
+ fadst8x4_new_sse2, // V_ADST
+ fidentity8x4_new_sse2, // H_ADST
+ fadst8x4_new_sse2, // V_FLIPADST
+ fidentity8x4_new_sse2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels used by av1_lowbd_fwd_txfm2d_8x4_sse2, looked up with
+// the tx_type argument. Entries are positional; the trailing comment on each
+// row names the transform type that slot is meant for.
+// Note the identity slots use fidentity8x8_new_sse2 rather than a 4x8 variant.
+static const transform_1d_sse2 row_txfm4x8_arr[TX_TYPES] = {
+ fdct4x8_new_sse2, // DCT_DCT
+ fdct4x8_new_sse2, // ADST_DCT
+ fadst4x8_new_sse2, // DCT_ADST
+ fadst4x8_new_sse2, // ADST_ADST
+ fdct4x8_new_sse2, // FLIPADST_DCT
+ fadst4x8_new_sse2, // DCT_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_FLIPADST
+ fadst4x8_new_sse2, // ADST_FLIPADST
+ fadst4x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct4x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst4x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst4x8_new_sse2 // H_FLIPADST
+};
+
+// Column-pass 1-D kernels used by the 8x8, 16x8 and 32x8 2-D transforms in
+// this file, looked up with the tx_type argument. Entries are positional; the
+// trailing comment on each row names the transform type that slot is meant
+// for.
+static const transform_1d_sse2 col_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fadst8x8_new_sse2, // ADST_DCT
+ fdct8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fadst8x8_new_sse2, // FLIPADST_DCT
+ fdct8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fdct8x8_new_sse2, // V_DCT
+ fidentity8x8_new_sse2, // H_DCT
+ fadst8x8_new_sse2, // V_ADST
+ fidentity8x8_new_sse2, // H_ADST
+ fadst8x8_new_sse2, // V_FLIPADST
+ fidentity8x8_new_sse2, // H_FLIPADST
+};
+
+// Row-pass 1-D kernels used by the 8x8, 8x16 and 8x32 2-D transforms in this
+// file, looked up with the tx_type argument. Entries are positional; the
+// trailing comment on each row names the transform type that slot is meant
+// for.
+static const transform_1d_sse2 row_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_new_sse2, // DCT_DCT
+ fdct8x8_new_sse2, // ADST_DCT
+ fadst8x8_new_sse2, // DCT_ADST
+ fadst8x8_new_sse2, // ADST_ADST
+ fdct8x8_new_sse2, // FLIPADST_DCT
+ fadst8x8_new_sse2, // DCT_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_FLIPADST
+ fadst8x8_new_sse2, // ADST_FLIPADST
+ fadst8x8_new_sse2, // FLIPADST_ADST
+ fidentity8x8_new_sse2, // IDTX
+ fidentity8x8_new_sse2, // V_DCT
+ fdct8x8_new_sse2, // H_DCT
+ fidentity8x8_new_sse2, // V_ADST
+ fadst8x8_new_sse2, // H_ADST
+ fidentity8x8_new_sse2, // V_FLIPADST
+ fadst8x8_new_sse2 // H_FLIPADST
+};
+
+// Column-pass 1-D kernels used by the 4x16, 8x16 and 16x16 2-D transforms in
+// this file, looked up with the tx_type argument. Entries are positional; the
+// trailing comment on each row names the transform type that slot is meant
+// for.
+static const transform_1d_sse2 col_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fadst8x16_new_sse2, // ADST_DCT
+ fdct8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fadst8x16_new_sse2, // FLIPADST_DCT
+ fdct8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fdct8x16_new_sse2, // V_DCT
+ fidentity8x16_new_sse2, // H_DCT
+ fadst8x16_new_sse2, // V_ADST
+ fidentity8x16_new_sse2, // H_ADST
+ fadst8x16_new_sse2, // V_FLIPADST
+ fidentity8x16_new_sse2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels used by the 16x4, 16x8, 16x16 and 16x32 2-D transforms
+// in this file, looked up with the tx_type argument. Entries are positional;
+// the trailing comment on each row names the transform type that slot is
+// meant for.
+static const transform_1d_sse2 row_txfm8x16_arr[TX_TYPES] = {
+ fdct8x16_new_sse2, // DCT_DCT
+ fdct8x16_new_sse2, // ADST_DCT
+ fadst8x16_new_sse2, // DCT_ADST
+ fadst8x16_new_sse2, // ADST_ADST
+ fdct8x16_new_sse2, // FLIPADST_DCT
+ fadst8x16_new_sse2, // DCT_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_FLIPADST
+ fadst8x16_new_sse2, // ADST_FLIPADST
+ fadst8x16_new_sse2, // FLIPADST_ADST
+ fidentity8x16_new_sse2, // IDTX
+ fidentity8x16_new_sse2, // V_DCT
+ fdct8x16_new_sse2, // H_DCT
+ fidentity8x16_new_sse2, // V_ADST
+ fadst8x16_new_sse2, // H_ADST
+ fidentity8x16_new_sse2, // V_FLIPADST
+ fadst8x16_new_sse2 // H_FLIPADST
+};
+
+// Row-pass 1-D kernels for 32-wide rows (selected by
+// av1_lowbd_fwd_txfm2d_32x8_sse2), looked up with the tx_type argument.
+// Only the DCT and identity slots are populated; every other slot is NULL, so
+// a caller must test the looked-up pointer before calling through it.
+// Entries are positional; the trailing comment on each row names the
+// transform type that slot is meant for.
+static const transform_1d_sse2 row_txfm8x32_arr[TX_TYPES] = {
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fidentity8x32_new_sse2, // V_DCT
+ fdct8x32_new_sse2, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Low-bitdepth forward 2-D 4x4 transform (SSE2).
+//
+// Pipeline: load `height` rows from `input` (flipped loader when the type
+// asks for an up/down flip) -> round shift[0] -> column kernel -> round
+// shift[1] -> 4x4 transpose -> optional left/right flip -> row kernel ->
+// round shift[2] -> 4x4 transpose -> store as 32-bit values in `output`.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the 4x4 column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 4;
+  const int height = 4;
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_4X4];
+  const transform_1d_sse2 col_kernel = col_txfm4x4_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm4x4_arr[tx_type];
+  __m128i col_buf[4];
+  __m128i row_buf[4];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit_w4(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  transpose_16bit_4x4(col_buf, row_buf);
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // col_buf, which is no longer needed, and the row pass runs there.
+  __m128i *rows = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_kernel(rows, rows, cos_bit_row);
+  round_shift_16bit(rows, width, shifts[2]);
+  transpose_16bit_4x4(rows, rows);
+  store_buffer_16bit_to_32bit_w4(rows, output, width, height);
+}
+
+// Low-bitdepth forward 2-D 4x8 transform (SSE2).
+//
+// Pipeline: load `height` rows from `input` (flipped loader when the type
+// asks for an up/down flip) -> round shift[0] -> column kernel -> round
+// shift[1] -> 4x8 transpose -> optional left/right flip -> row kernel ->
+// round shift[2] -> 8x4 transpose -> "rect" store as 32-bit values.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  // Only `bd` is unused: `stride` is passed to both loaders below, so it
+  // must not be marked as unused.
+  (void)bd;
+  __m128i buf0[8], buf1[8], *buf;
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X8];
+  const int txw_idx = get_txw_idx(TX_4X8);
+  const int txh_idx = get_txh_idx(TX_4X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 4;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm4x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x4_arr[tx_type];
+  int ud_flip, lr_flip;
+
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass.
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, buf0, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_4x8(buf0, buf1);
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // buf0, which is no longer needed, and the row pass runs there.
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x4(buf, buf);
+  store_rect_buffer_16bit_to_32bit_w4(buf, output, width, height);
+}
+
+// Low-bitdepth forward 2-D 4x16 transform (SSE2).
+//
+// The column pass runs once over all 16 rows; the result is transposed as
+// two 4x8 tiles, and the row pass then handles each 8-register tile in
+// turn, storing 8 output rows per iteration.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 4;
+  const int height = 16;
+  const int txw_idx = get_txw_idx(TX_4X16);
+  const int txh_idx = get_txh_idx(TX_4X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_4X16];
+  const transform_1d_sse2 col_kernel = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x4_arr[tx_type];
+  __m128i col_buf[16];
+  __m128i row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit_w4(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_w4_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  transpose_16bit_4x8(col_buf, row_buf);
+  transpose_16bit_4x8(col_buf + 8, row_buf + 8);
+
+  // Row pass, one 8-register tile at a time.
+  for (int tile = 0; tile < 2; ++tile) {
+    __m128i *const src = row_buf + 8 * tile;
+    __m128i *rows = src;
+    if (flip_lr) {
+      // Flipped data goes to col_buf, which is free after the column pass.
+      flip_buf_sse2(src, col_buf, width);
+      rows = col_buf;
+    }
+    row_kernel(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shifts[2]);
+    transpose_16bit_8x4(rows, rows);
+    store_buffer_16bit_to_32bit_w4(rows, output + 8 * width * tile, width, 8);
+  }
+}
+
+// Low-bitdepth forward 2-D 8x4 transform (SSE2).
+//
+// Pipeline: load `height` rows from `input` (flipped loader when the type
+// asks for an up/down flip) -> round shift[0] -> column kernel -> round
+// shift[1] -> 8x8 transpose -> optional left/right flip -> row kernel ->
+// round shift[2] -> 8x8 transpose -> "rect" store as 32-bit values.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 8;
+  const int height = 4;
+  const int txw_idx = get_txw_idx(TX_8X4);
+  const int txh_idx = get_txh_idx(TX_8X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_8X4];
+  const transform_1d_sse2 col_kernel = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm4x8_arr[tx_type];
+  __m128i col_buf[8];
+  __m128i row_buf[8];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  transpose_16bit_8x8(col_buf, row_buf);
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // col_buf, which is no longer needed, and the row pass runs there.
+  __m128i *rows = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_kernel(rows, rows, cos_bit_row);
+  round_shift_16bit(rows, width, shifts[2]);
+  transpose_16bit_8x8(rows, rows);
+  store_rect_buffer_16bit_to_32bit_w8(rows, output, width, height);
+}
+
+// Low-bitdepth forward 2-D 8x8 transform (SSE2).
+//
+// Pipeline: load `height` rows from `input` (flipped loader when the type
+// asks for an up/down flip) -> round shift[0] -> column kernel -> round
+// shift[1] -> 8x8 transpose -> optional left/right flip -> row kernel ->
+// round shift[2] -> 8x8 transpose -> store as 32-bit values in `output`.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the 8x8 column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
+                                   int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 8;
+  const int height = 8;
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_8X8];
+  const transform_1d_sse2 col_kernel = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x8_arr[tx_type];
+  __m128i col_buf[8];
+  __m128i row_buf[8];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  transpose_16bit_8x8(col_buf, row_buf);
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // col_buf, which is no longer needed, and the row pass runs there.
+  __m128i *rows = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_kernel(rows, rows, cos_bit_row);
+  round_shift_16bit(rows, width, shifts[2]);
+  transpose_16bit_8x8(rows, rows);
+  store_buffer_16bit_to_32bit_w8(rows, output, width, height);
+}
+
+// Low-bitdepth forward 2-D 8x16 transform (SSE2).
+//
+// The column pass runs once over all 16 rows; the result is transposed as
+// two 8x8 tiles, and the row pass then handles each tile in turn, writing
+// 8 output rows per iteration through the "rect" store helper.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 8;
+  const int height = 16;
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_8X16];
+  const transform_1d_sse2 col_kernel = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x8_arr[tx_type];
+  __m128i col_buf[16];
+  __m128i row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  transpose_16bit_8x8(col_buf, row_buf);
+  transpose_16bit_8x8(col_buf + 8, row_buf + 8);
+
+  // Row pass, one 8x8 tile at a time.
+  for (int tile = 0; tile < 2; ++tile) {
+    __m128i *const src = row_buf + width * tile;
+    __m128i *rows = src;
+    if (flip_lr) {
+      // Flipped data goes to col_buf, which is free after the column pass.
+      flip_buf_sse2(src, col_buf, width);
+      rows = col_buf;
+    }
+    row_kernel(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shifts[2]);
+    transpose_16bit_8x8(rows, rows);
+    store_rect_buffer_16bit_to_32bit_w8(rows, output + 8 * width * tile, width,
+                                        8);
+  }
+}
+
+// Low-bitdepth forward 2-D 8x32 transform (SSE2).
+//
+// The column pass runs once over all 32 rows; the result is transposed as
+// four 8x8 tiles, and the row pass then handles each tile in turn, writing
+// 8 output rows per iteration.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+//
+// NOTE(review): the column kernel comes from col_txfm8x32_arr and is called
+// without a NULL test, whereas av1_lowbd_fwd_txfm2d_16x32_sse2 guards a
+// lookup in the same table. Presumably callers only pass transform types
+// with a populated slot -- confirm against the table and the call sites.
+void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 8;
+  const int height = 32;
+  const int txw_idx = get_txw_idx(TX_8X32);
+  const int txh_idx = get_txh_idx(TX_8X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_8X32];
+  const transform_1d_sse2 col_kernel = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x8_arr[tx_type];
+  __m128i col_buf[32];
+  __m128i row_buf[32];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass.
+  if (!flip_ud) {
+    load_buffer_16bit_to_16bit(input, stride, col_buf, height);
+  } else {
+    load_buffer_16bit_to_16bit_flip(input, stride, col_buf, height);
+  }
+  round_shift_16bit(col_buf, height, shifts[0]);
+  col_kernel(col_buf, col_buf, cos_bit_col);
+  round_shift_16bit(col_buf, height, shifts[1]);
+  for (int tile = 0; tile < 4; ++tile) {
+    transpose_16bit_8x8(col_buf + 8 * tile, row_buf + 8 * tile);
+  }
+
+  // Row pass, one 8x8 tile at a time.
+  for (int tile = 0; tile < 4; ++tile) {
+    __m128i *const src = row_buf + width * tile;
+    __m128i *rows = src;
+    if (flip_lr) {
+      // Flipped data goes to col_buf, which is free after the column pass.
+      flip_buf_sse2(src, col_buf, width);
+      rows = col_buf;
+    }
+    row_kernel(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shifts[2]);
+    transpose_16bit_8x8(rows, rows);
+    store_buffer_16bit_to_32bit_w8(rows, output + 8 * width * tile, width, 8);
+  }
+}
+
+// Low-bitdepth forward 2-D 16x4 transform (SSE2).
+//
+// The 16-wide input is processed as two 8-column strips: each strip goes
+// through the column pass and is transposed (8x4) into its own half of the
+// row buffer. A single 16-point row pass follows, after which each half is
+// transposed back (4x8) and stored.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 16;
+  const int height = 4;
+  const int txw_idx = get_txw_idx(TX_16X4);
+  const int txh_idx = get_txh_idx(TX_16X4);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_16X4];
+  const transform_1d_sse2 col_kernel = col_txfm8x4_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16];
+  __m128i row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, one 8-column strip at a time.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, height);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, height);
+    }
+    round_shift_16bit(col_buf, height, shifts[0]);
+    col_kernel(col_buf, col_buf, cos_bit_col);
+    round_shift_16bit(col_buf, height, shifts[1]);
+    transpose_16bit_8x4(col_buf, row_buf + 8 * strip);
+  }
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // col_buf, which is no longer needed, and the row pass runs there.
+  __m128i *rows = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_kernel(rows, rows, cos_bit_row);
+  round_shift_16bit(rows, width, shifts[2]);
+
+  // Transpose and store each 8-register half; the second half lands 8
+  // elements further into `output`.
+  for (int half = 0; half < 2; ++half) {
+    __m128i *const part = rows + 8 * half;
+    transpose_16bit_4x8(part, part);
+    store_buffer_16bit_to_32bit_w8(part, output + 8 * half, width, height);
+  }
+}
+
+// Low-bitdepth forward 2-D 16x8 transform (SSE2).
+//
+// The 16-wide input is processed as two 8-column strips: each strip goes
+// through the column pass and is transposed (8x8) into its own half of the
+// row buffer. A single 16-point row pass follows, after which each half is
+// transposed back (8x8) and written through the "rect" store helper.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 16;
+  const int height = 8;
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_16X8];
+  const transform_1d_sse2 col_kernel = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16];
+  __m128i row_buf[16];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, one 8-column strip at a time.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, height);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, height);
+    }
+    round_shift_16bit(col_buf, height, shifts[0]);
+    col_kernel(col_buf, col_buf, cos_bit_col);
+    round_shift_16bit(col_buf, height, shifts[1]);
+    transpose_16bit_8x8(col_buf, row_buf + 8 * strip);
+  }
+
+  // Row pass. When flipping left/right the flipped data is written into
+  // col_buf, which is no longer needed, and the row pass runs there.
+  __m128i *rows = row_buf;
+  if (flip_lr) {
+    flip_buf_sse2(row_buf, col_buf, width);
+    rows = col_buf;
+  }
+  row_kernel(rows, rows, cos_bit_row);
+  round_shift_16bit(rows, width, shifts[2]);
+
+  // Transpose and store each 8-register half; the second half lands 8
+  // elements further into `output`.
+  for (int half = 0; half < 2; ++half) {
+    __m128i *const part = rows + 8 * half;
+    transpose_16bit_8x8(part, part);
+    store_rect_buffer_16bit_to_32bit_w8(part, output + 8 * half, width,
+                                        height);
+  }
+}
+
+// Low-bitdepth forward 2-D 16x16 transform (SSE2).
+//
+// Column pass: the input is handled as two 8-column strips. Each strip is
+// transformed over all 16 rows and then transposed as two 8x8 tiles into
+// the 32-register row buffer (tile t of strip s lands at t * width + 8 * s).
+// Row pass: each group of 16 registers is transformed, then its two halves
+// are transposed back and stored, 8 output rows per group.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : ignored.
+void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  const int width = 16;
+  const int height = 16;
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_16X16];
+  const transform_1d_sse2 col_kernel = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x16_arr[tx_type];
+  __m128i col_buf[16];
+  __m128i row_buf[32];
+  int flip_ud, flip_lr;
+
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, one 8-column strip at a time.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, height);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, height);
+    }
+    round_shift_16bit(col_buf, height, shifts[0]);
+    col_kernel(col_buf, col_buf, cos_bit_col);
+    round_shift_16bit(col_buf, height, shifts[1]);
+    for (int tile = 0; tile < 2; ++tile) {
+      transpose_16bit_8x8(col_buf + 8 * tile,
+                          row_buf + tile * width + 8 * strip);
+    }
+  }
+
+  // Row pass, one group of 16 registers at a time.
+  for (int group = 0; group < 2; ++group) {
+    __m128i *const src = row_buf + width * group;
+    int32_t *const dst = output + 8 * width * group;
+    __m128i *rows = src;
+    if (flip_lr) {
+      // Flipped data goes to col_buf, which is free after the column pass.
+      flip_buf_sse2(src, col_buf, width);
+      rows = col_buf;
+    }
+    row_kernel(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shifts[2]);
+    for (int half = 0; half < 2; ++half) {
+      __m128i *const part = rows + 8 * half;
+      transpose_16bit_8x8(part, part);
+      store_buffer_16bit_to_32bit_w8(part, dst + 8 * half, width, 8);
+    }
+  }
+}
+
+// Low-bitdepth forward 2-D 16x32 transform (SSE2).
+//
+// If either looked-up kernel is NULL for `tx_type`, the call is forwarded
+// unchanged (including `bd`) to av1_fwd_txfm2d_16x32_c. Otherwise:
+// Column pass: the input is handled as two 8-column strips. Each strip is
+// transformed over all 32 rows and then transposed as four 8x8 tiles into
+// the 64-register row buffer (tile t of strip s lands at t * width + 8 * s).
+// Row pass: each group of 16 registers is transformed, then its two halves
+// are transposed back and written through the "rect" store helper, 8 output
+// rows per group.
+//
+// input   : 16-bit source samples, `stride` elements between rows.
+// output  : 32-bit destination buffer.
+// tx_type : indexes the column/row kernel tables and the flip config.
+// bd      : only used by the C fallback.
+void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i col_buf[32];
+  __m128i row_buf[64];
+  const int8_t *shifts = fwd_txfm_shift_ls[TX_16X32];
+  const int txw_idx = get_txw_idx(TX_16X32);
+  const int txh_idx = get_txh_idx(TX_16X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 16;
+  const int height = 32;
+  const transform_1d_sse2 col_kernel = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_kernel = row_txfm8x16_arr[tx_type];
+
+  // No SSE2 kernel for this transform type: use the C implementation.
+  if (col_kernel == NULL || row_kernel == NULL) {
+    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int flip_ud, flip_lr;
+  get_flip_cfg(tx_type, &flip_ud, &flip_lr);
+
+  // Column pass, one 8-column strip at a time.
+  for (int strip = 0; strip < 2; ++strip) {
+    const int16_t *const src = input + 8 * strip;
+    if (!flip_ud) {
+      load_buffer_16bit_to_16bit(src, stride, col_buf, height);
+    } else {
+      load_buffer_16bit_to_16bit_flip(src, stride, col_buf, height);
+    }
+    round_shift_16bit(col_buf, height, shifts[0]);
+    col_kernel(col_buf, col_buf, cos_bit_col);
+    round_shift_16bit(col_buf, height, shifts[1]);
+    for (int tile = 0; tile < 4; ++tile) {
+      transpose_16bit_8x8(col_buf + tile * 8,
+                          row_buf + tile * width + 8 * strip);
+    }
+  }
+
+  // Row pass, one group of 16 registers at a time.
+  for (int group = 0; group < 4; ++group) {
+    __m128i *const src = row_buf + width * group;
+    int32_t *const dst = output + 8 * width * group;
+    __m128i *rows = src;
+    if (flip_lr) {
+      // Flipped data goes to col_buf, which is free after the column pass.
+      flip_buf_sse2(src, col_buf, width);
+      rows = col_buf;
+    }
+    row_kernel(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shifts[2]);
+    for (int half = 0; half < 2; ++half) {
+      __m128i *const part = rows + 8 * half;
+      transpose_16bit_8x8(part, part);
+      store_rect_buffer_16bit_to_32bit_w8(part, dst + 8 * half, width, 8);
+    }
+  }
+}
+
+// Forward 2D transform of a 32x8 block (SSE2, low bit depth). Transform types
+// without an SSE2 kernel in either pass are handed to av1_fwd_txfm2d_32x8_c().
+void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
+                                    int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X8];
+  const int txw_idx = get_txw_idx(TX_32X8);
+  const int txh_idx = get_txh_idx(TX_32X8);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 8;
+  const transform_1d_sse2 col_txfm = col_txfm8x8_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  // A NULL table entry means there is no SSE2 kernel for this tx_type. The
+  // fallback must be the C transform of this same block size: a routine for
+  // another size (such as 32x16) would process a block of the wrong height.
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass: four 8-column strips, each transposed into its slot of buf1.
+  for (int i = 0; i < 4; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x8(buf0, buf1 + 8 * i);
+  }
+
+  // Row pass: height is 8, so buf1 holds a single slice of `width` vectors.
+  __m128i *buf;
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+
+  // Transpose each 8x8 tile in place, then store it widened to 32 bits.
+  for (int j = 0; j < 4; j++) {
+    transpose_16bit_8x8(buf + 8 * j, buf + 8 * j);
+    store_buffer_16bit_to_32bit_w8(buf + 8 * j, output + 8 * j, width, height);
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[64];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X16];
+  const int txw_idx = get_txw_idx(TX_32X16);
+  const int txh_idx = get_txh_idx(TX_32X16);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 16;
+  const transform_1d_sse2 col_txfm = col_txfm8x16_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  // Without SSE2 kernels for both passes, defer to the C implementation.
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass over the four 8-column strips of the input.
+  for (int strip = 0; strip < 4; strip++) {
+    const int16_t *const src = input + 8 * strip;
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(src, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(src, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    // Transpose the two 8x8 tiles of the strip into successive buf1 slices.
+    for (int blk = 0; blk < 2; blk++) {
+      transpose_16bit_8x8(buf0 + 8 * blk, buf1 + blk * width + 8 * strip);
+    }
+  }
+
+  // Row pass over each `width`-vector slice of buf1.
+  for (int grp = 0; grp < 2; grp++) {
+    __m128i *rows = buf1 + width * grp;
+    if (lr_flip) {
+      flip_buf_sse2(rows, buf0, width);
+      rows = buf0;
+    }
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shift[2]);
+    int32_t *const dst = output + 8 * width * grp;
+    // In-place 8x8 transposes, each followed by the rectangular store.
+    for (int blk = 0; blk < 4; blk++) {
+      __m128i *const tile = rows + 8 * blk;
+      transpose_16bit_8x8(tile, tile);
+      store_rect_buffer_16bit_to_32bit_w8(tile, dst + 8 * blk, width, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf0[32], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_32X32];
+  const int txw_idx = get_txw_idx(TX_32X32);
+  const int txh_idx = get_txh_idx(TX_32X32);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = 32;
+  const int height = 32;
+  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
+  const transform_1d_sse2 row_txfm = row_txfm8x32_arr[tx_type];
+
+  // Either pass lacking an SSE2 kernel (NULL entry) sends us to the C code.
+  if (col_txfm == NULL || row_txfm == NULL) {
+    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
+    return;
+  }
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass over the four 8-column strips of the input.
+  for (int strip = 0; strip < 4; strip++) {
+    const int16_t *const src = input + 8 * strip;
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(src, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(src, stride, buf0, height);
+    }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    // Transpose the four 8x8 tiles of the strip into successive buf1 slices.
+    for (int blk = 0; blk < 4; blk++) {
+      transpose_16bit_8x8(buf0 + 8 * blk, buf1 + blk * width + 8 * strip);
+    }
+  }
+
+  // Row pass over each `width`-vector slice of buf1. With lr_flip set, the
+  // slice is first mirrored into buf0 and transformed there instead.
+  for (int grp = 0; grp < 4; grp++) {
+    __m128i *rows = buf1 + width * grp;
+    if (lr_flip) {
+      flip_buf_sse2(rows, buf0, width);
+      rows = buf0;
+    }
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shift[2]);
+    int32_t *const dst = output + 8 * width * grp;
+    // In-place 8x8 transposes, each followed by the widening store.
+    for (int blk = 0; blk < 4; blk++) {
+      __m128i *const tile = rows + 8 * blk;
+      transpose_16bit_8x8(tile, tile);
+      store_buffer_16bit_to_32bit_w8(tile, dst + 8 * blk, width, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_64X16;
+  __m128i buf0[64], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x16_new_sse2;
+  const transform_1d_sse2 row_txfm = fdct8x64_new_sse2;
+  const int num_strips = width >> 3;   // 8-column strips across the block
+  const int num_groups = height >> 3;  // 8-row groups down the block
+
+  // Column pass: DCT each strip, then transpose its 8x8 tiles into buf1.
+  for (int strip = 0; strip < num_strips; strip++) {
+    load_buffer_16bit_to_16bit(input + 8 * strip, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int blk = 0; blk < num_groups; ++blk)
+      transpose_16bit_8x8(buf0 + blk * 8, buf1 + blk * width + 8 * strip);
+  }
+
+  // Row pass. Only four 8x8 tiles per slice are stored, using 32 (not width).
+  for (int grp = 0; grp < num_groups; grp++) {
+    __m128i *const rows = buf1 + width * grp;
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shift[2]);
+    for (int j = 0; j < 4; ++j) {
+      __m128i *const tile = rows + 8 * j;
+      transpose_16bit_8x8(tile, tile);
+      store_buffer_16bit_to_32bit_w8(tile, output + 8 * (32 * grp + j), 32, 8);
+    }
+  }
+}
+
+void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
+                                     int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  (void)tx_type;
+  assert(tx_type == DCT_DCT);
+  const TX_SIZE tx_size = TX_16X64;
+  __m128i buf0[64], buf1[128];
+  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
+  const int width = tx_size_wide[tx_size];
+  const int height = tx_size_high[tx_size];
+  const transform_1d_sse2 col_txfm = fdct8x64_new_sse2;
+  const transform_1d_sse2 row_txfm = fdct8x16_new_sse2;
+  const int num_strips = width >> 3;   // 8-column strips across the block
+  const int num_groups = height >> 3;  // 8-row groups down the block
+
+  // Column pass: DCT each strip, then transpose its 8x8 tiles into buf1.
+  for (int strip = 0; strip < num_strips; strip++) {
+    load_buffer_16bit_to_16bit(input + 8 * strip, stride, buf0, height);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    for (int blk = 0; blk < num_groups; ++blk)
+      transpose_16bit_8x8(buf0 + blk * 8, buf1 + blk * width + 8 * strip);
+  }
+
+  // Row pass: at most the first four groups are transformed and stored.
+  for (int grp = 0; grp < AOMMIN(4, num_groups); grp++) {
+    __m128i *const rows = buf1 + width * grp;
+    row_txfm(rows, rows, cos_bit_row);
+    round_shift_16bit(rows, width, shift[2]);
+    int32_t *const dst = output + 8 * width * grp;
+    for (int j = 0; j < num_strips; ++j) {
+      transpose_16bit_8x8(rows + 8 * j, rows + 8 * j);
+      store_buffer_16bit_to_32bit_w8(rows + 8 * j, dst + 8 * j, width, 8);
+    }
+  }
+  // Zero out the bottom 16x32 area.
+  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
+}
+
+static const FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {  // by TX_SIZE
+  av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
+  av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
+  av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
+  av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
+  NULL,                             // 64x64 transform (NULL: C path is used)
+  av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
+  av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
+  av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
+  av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
+  av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
+  av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
+  NULL,                             // 32x64 transform (NULL: C path is used)
+  NULL,                             // 64x32 transform (NULL: C path is used)
+  av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
+  av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
+  av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
+  av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
+  av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
+  av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
+};
+
+void av1_lowbd_fwd_txfm_sse2(const int16_t *src_diff, tran_low_t *coeff,
+                             int diff_stride, TxfmParam *txfm_param) {
+  const FwdTxfm2dFunc sse2_fn = fwd_txfm2d_func_ls[txfm_param->tx_size];
+  const int lossless4x4 = txfm_param->lossless && txfm_param->tx_size == TX_4X4;
+  // No SSE2 entry in the table, or a lossless 4x4 block: take the C path.
+  if (sse2_fn == NULL || lossless4x4) {
+    av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
+  } else {
+    sse2_fn(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd);
+  }
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
new file mode 100644
index 0000000000..99a6b90829
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_fwd_txfm_sse2.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+#define AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fdct8x32_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+void fdct8x64_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit);
+
+static INLINE void fidentity4x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;  // Unused by this kernel.
+  const __m128i ones = _mm_set1_epi16(1);
+  // Low four 16-bit lanes: interleave with 1, scale_round_sse2(), repack.
+  for (int row = 0; row < 4; ++row) {
+    const __m128i paired = _mm_unpacklo_epi16(input[row], ones);
+    const __m128i scaled = scale_round_sse2(paired, NewSqrt2);
+    output[row] = _mm_packs_epi32(scaled, scaled);
+  }
+}
+
+static INLINE void fidentity8x4_new_sse2(const __m128i *const input,
+                                         __m128i *const output,
+                                         const int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  // All eight lanes: interleave with 1, scale_round_sse2(), saturating pack.
+  for (int row = 0; row < 4; ++row) {
+    const __m128i src = input[row];
+    const __m128i lo = scale_round_sse2(_mm_unpacklo_epi16(src, one), NewSqrt2);
+    const __m128i hi = scale_round_sse2(_mm_unpackhi_epi16(src, one), NewSqrt2);
+    output[row] = _mm_packs_epi32(lo, hi);
+  }
+}
+
+// 8-point identity kernel applied to eight vectors of 16-bit lanes. Each
+// output vector is the saturating sum of the input vector with itself, i.e.
+// the input scaled by 2. `input` and `output` may be the same array, since
+// element i is read before it is written. `cos_bit` is ignored.
+static INLINE void fidentity8x8_new_sse2(const __m128i *input, __m128i *output,
+                                         int8_t cos_bit) {
+  (void)cos_bit;
+
+  // x + x with signed saturation, one vector at a time.
+  for (int i = 0; i < 8; ++i) {
+    output[i] = _mm_adds_epi16(input[i], input[i]);
+  }
+}
+
+static INLINE void fidentity8x16_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i one = _mm_set1_epi16(1);
+
+  // Sixteen vectors; each half is passed to scale_round_sse2(2 * NewSqrt2).
+  for (int i = 0; i < 16; ++i) {
+    const __m128i lo32 = _mm_unpacklo_epi16(input[i], one);
+    const __m128i hi32 = _mm_unpackhi_epi16(input[i], one);
+    output[i] = _mm_packs_epi32(scale_round_sse2(lo32, 2 * NewSqrt2),
+                                scale_round_sse2(hi32, 2 * NewSqrt2));
+  }
+}
+
+static INLINE void fidentity8x32_new_sse2(const __m128i *input, __m128i *output,
+                                          int8_t cos_bit) {
+  (void)cos_bit;
+  // Shift every 16-bit lane of the 32 vectors left by two bits (scale by 4).
+  for (const __m128i *const end = input + 32; input != end; ++input, ++output)
+    *output = _mm_slli_epi16(*input, 2);
+}
+
+static const transform_1d_sse2 col_txfm8x32_arr[TX_TYPES] = { // by TX_TYPE
+ fdct8x32_new_sse2, // DCT_DCT
+ NULL, // ADST_DCT (NULL: no SSE2 kernel in this table)
+ NULL, // DCT_ADST
+ NULL, // ADST_ADST
+ NULL, // FLIPADST_DCT
+ NULL, // DCT_FLIPADST
+ NULL, // FLIPADST_FLIPADST
+ NULL, // ADST_FLIPADST
+ NULL, // FLIPADST_ADST
+ fidentity8x32_new_sse2, // IDTX
+ fdct8x32_new_sse2, // V_DCT: DCT in the column pass
+ fidentity8x32_new_sse2, // H_DCT: identity in the column pass
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
new file mode 100644
index 0000000000..b58911fcb2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_avx2.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  // Zero-extends the eight 16-bit values of *p to 32 bits (0-3 low, 4-7 high).
+  const __m128i src = *p, zeros = _mm_setzero_si128();
+  const __m256i lo = _mm256_castsi128_si256(_mm_unpacklo_epi16(src, zeros));
+  *qp = _mm256_insertf128_si256(lo, _mm_unpackhi_epi16(src, zeros), 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+  // Copy the upper 128-bit lane of each of the three vectors into both lanes.
+  for (int k = 0; k < 3; ++k)
+    qp[k] = _mm256_permute2x128_si256(qp[k], qp[k], 0x11);
+}
+
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+                           const int16_t *dequant_ptr, int log_scale,
+                           __m256i *qp) {
+  const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+  const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+  __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+  if (log_scale != 0) {
+    // mulhrs by 2^(15 - log_scale) is a rounded right shift by log_scale.
+    round = _mm_mulhrs_epi16(round, _mm_set1_epi16(1 << (15 - log_scale)));
+  }
+  // qp[0] = round, qp[1] = quant, qp[2] = dequant, each widened to 32 bits.
+  init_one_qp(&round, qp + 0);
+  init_one_qp(&quant, qp + 1);
+  init_one_qp(&dequant, qp + 2);
+}
+
+static INLINE void quantize(const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, int log_scale,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi32(*c);
+ __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); // |c| + round (qp[0])
+ // Multiply by quant (qp[1]) as 64-bit products: even 32-bit lanes first, then the odd lanes moved down by 32 bits.
+ __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
+ __m256i q_hi = _mm256_srli_epi64(q, 32);
+ const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
+ q_hi = _mm256_mul_epi32(q_hi, qp_hi);
+ q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
+ q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
+ q_hi = _mm256_slli_epi64(q_hi, 32); // back to the odd 32-bit lanes
+ q = _mm256_or_si256(q_lo, q_hi); // NOTE(review): assumes each shifted even-lane product fits in 32 bits
+ const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
+ const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); // dequant > |c| << (1 + log_scale)
+ q = _mm256_andnot_si256(mask, q); // zero the lanes selected by mask
+ // Dequantize (q * qp[2]) >> log_scale, then give q and dq the sign of the input coefficient.
+ __m256i dq = _mm256_mullo_epi32(q, qp[2]);
+ dq = _mm256_srai_epi32(dq, log_scale);
+ q = _mm256_sign_epi32(q, *c);
+ dq = _mm256_sign_epi32(dq, *c);
+ // Write eight 32-bit results to each output.
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+ // Load eight 16-bit iscan entries and zero-extend them to 32 bits, keeping their order.
+ const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
+ const __m128i zr = _mm_setzero_si128();
+ const __m128i lo = _mm_unpacklo_epi16(isc, zr);
+ const __m128i hi = _mm_unpackhi_epi16(isc, zr);
+ const __m256i iscan =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+ // Lanes with dq != 0 contribute iscan + 1 (nz is -1 there); all other lanes contribute 0.
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
+ const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
+ __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
+ cur_eob = _mm256_and_si256(cur_eob, nz);
+ *eob = _mm256_max_epi32(cur_eob, *eob); // running per-lane maximum
+}
+
+// Quantizes high-bitdepth coefficients eight at a time, writing qcoeff_ptr and
+// dqcoeff_ptr and storing the end-of-block position in *eob_ptr. One group of
+// eight is always processed before n_coeffs is tested, so n_coeffs is
+// presumably a positive multiple of 8 (assumption - confirm against callers).
+void av1_highbd_quantize_fp_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan, int log_scale) {
+  (void)scan;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 8;  // Coefficients handled per quantize() call.
+  __m256i qp[3];
+  __m256i eob = _mm256_setzero_si256();
+
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, qp);
+
+  // The first group uses the parameters exactly as init_qp() built them;
+  // update_qp() then replicates the upper lane of each parameter vector, and
+  // those values are used for every later group.
+  __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+  quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+  update_qp(qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan += step;
+    coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+    quantize(qp, &coeff, iscan, log_scale, qcoeff_ptr, dqcoeff_ptr, &eob);
+  }
+
+  // Horizontal maximum of the per-lane candidates. The lanes are 32 bits wide
+  // but the fold uses 16-bit max; NOTE(review): that matches a 32-bit max only
+  // while every candidate fits in a non-negative int16.
+  __m256i shuf = _mm256_shuffle_epi32(eob, 0xe);
+  eob = _mm256_max_epi16(eob, shuf);
+  shuf = _mm256_shufflelo_epi16(eob, 0xe);
+  eob = _mm256_max_epi16(eob, shuf);
+  shuf = _mm256_shufflelo_epi16(eob, 1);
+  eob = _mm256_max_epi16(eob, shuf);
+  const __m128i final_eob = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                          _mm256_extractf128_si256(eob, 1));
+  *eob_ptr = _mm_extract_epi16(final_eob, 0);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
new file mode 100644
index 0000000000..40b3b460b6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_highbd_quantize_sse4.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <stdint.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Coefficient quantization phase 1
+// param[0-3] : rounding/quan/dequan constants and the dequant compare threshold
+static INLINE void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
+ const int shift, const int scale,
+ __m128i *qcoeff, __m128i *dquan,
+ __m128i *sign) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ // *sign becomes -1 for negative coefficients and +1 otherwise; *coeff is replaced by its absolute value.
+ *sign = _mm_cmplt_epi32(*coeff, zero);
+ *sign = _mm_or_si128(*sign, one);
+ *coeff = _mm_abs_epi32(*coeff);
+ // Add rounding, then widen to 64-bit lanes: qcoeff[0] = elements 0-1, qcoeff[1] = elements 2-3.
+ qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
+ qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
+ qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
+ // Only qcoeff[0] is multiplied here; quantize_coeff_phase2() performs the same steps on qcoeff[1].
+ qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
+ qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
+ dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
+ dquan[0] = _mm_srli_epi64(dquan[0], scale);
+ const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
+ qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]); // mask of lanes that phase 2 clears to zero
+}
+
+// Coefficient quantization phase 2
+static INLINE void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
+ const __m128i *sign,
+ const __m128i *param, const int shift,
+ const int scale, tran_low_t *qAddr,
+ tran_low_t *dqAddr) {
+ __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0); // keeps 32-bit elements 2 and 3
+ __m128i mask0H = _mm_set_epi32(0, 0, -1, -1); // keeps 32-bit elements 0 and 1
+ // Quantize/dequantize the pair left in qcoeff[1] by phase 1 (elements 2-3).
+ qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
+ qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
+ dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
+ dquan[1] = _mm_srli_epi64(dquan[1], scale);
+
+ // combine L&H
+ qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8); // low halves of both 64-bit lanes -> elements 0,1
+ qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d); // low halves of both 64-bit lanes -> elements 2,3
+ // Drop the other two elements of each vector so the pair can be merged with OR.
+ qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
+ qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
+ // Same shuffle-and-mask sequence for the dequantized values.
+ dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
+ dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
+
+ dquan[0] = _mm_and_si128(dquan[0], mask0H);
+ dquan[1] = _mm_and_si128(dquan[1], mask0L);
+ // After the OR, element k of each vector belongs to coefficient k of the group of four.
+ qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
+ dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
+ // Restore the signs recorded in phase 1, then clear the lanes flagged in qcoeff[2].
+ qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
+ dquan[0] = _mm_sign_epi32(dquan[0], *sign);
+ qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
+ dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
+ _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
+ _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
+}
+
+static INLINE void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
+                            __m128i *eob) {
+  const __m128i zeros = _mm_setzero_si128();
+  const __m128i q_lo = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
+  const __m128i q_hi = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));
+
+  // Comparing with zero twice yields all-ones exactly where qcoeff != 0.
+  const __m128i nz_lo = _mm_cmpeq_epi32(_mm_cmpeq_epi32(q_lo, zeros), zeros);
+  const __m128i nz_hi = _mm_cmpeq_epi32(_mm_cmpeq_epi32(q_hi, zeros), zeros);
+  // Narrow the eight 32-bit flags to 16 bits (-1 stays -1, 0 stays 0).
+  const __m128i nz16 = _mm_packs_epi32(nz_lo, nz_hi);
+
+  // Non-zero lanes become iscan + 1 (subtracting -1); the rest become 0.
+  const __m128i pos = _mm_loadu_si128((__m128i const *)iscan);
+  const __m128i cand = _mm_and_si128(_mm_sub_epi16(pos, nz16), nz16);
+  // Keep the running per-lane maximum.
+  *eob = _mm_max_epi16(*eob, cand);
+}
+
+static INLINE uint16_t get_accumulated_eob(__m128i *eob) {
+  // Horizontal signed 16-bit maximum of the eight lanes of *eob, computed by
+  // folding the vector onto itself three times. *eob is overwritten with the
+  // folded vector, which holds the maximum in lane 0.
+  __m128i acc = *eob;
+  acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));    // lanes 4-7 -> 0-3
+  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));  // lanes 2-3 -> 0-1
+  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));  // lane 1 -> 0
+  *eob = acc;
+  const uint16_t eob_value = (uint16_t)_mm_extract_epi16(acc, 0);
+  return eob_value;
+}
+
+void av1_highbd_quantize_fp_sse4_1(
+ const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, int log_scale) {
+ __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
+ __m128i eob = _mm_setzero_si128();
+ const tran_low_t *src = coeff_ptr;
+ tran_low_t *quanAddr = qcoeff_ptr;
+ tran_low_t *dquanAddr = dqcoeff_ptr;
+ const int shift = 16 - log_scale;
+ const int coeff_stride = 4; // coefficients per 128-bit vector (assumes 32-bit tran_low_t - TODO confirm)
+ const int quan_stride = coeff_stride;
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)scan;
+ // Clear both output arrays before any results are stored.
+ memset(quanAddr, 0, count * sizeof(quanAddr[0]));
+ memset(dquanAddr, 0, count * sizeof(dquanAddr[0]));
+ // The first eight coefficients are processed before `count` is tested.
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+ const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+ // Element 0 gets the index-0 constants and the rest the index-1 constants (presumably xx_set_64_from_32i takes (high, low) - confirm in synonyms.h).
+ qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
+ qparam[1] = xx_set_64_from_32i(quant_ptr[1], quant_ptr[0]);
+ qparam[2] = xx_set_64_from_32i(dequant_ptr[1], dequant_ptr[0]);
+ qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
+ dequant_ptr[0]);
+
+ // DC and first 3 AC
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+
+ // update round/quan/dquan for AC
+ qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
+ qparam[1] = xx_set1_64_from_32i(quant_ptr[1]);
+ qparam[2] = xx_set1_64_from_32i(dequant_ptr[1]);
+ qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr, dquanAddr);
+
+ // next 4 AC
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
+ quanAddr + quan_stride, dquanAddr + quan_stride);
+ // Fold the eight results just stored into the running per-lane eob maximum.
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+
+ // loop for the rest of AC
+ while (count > 0) {
+ src += coeff_stride << 1; // advance all four cursors by eight coefficients
+ quanAddr += quan_stride << 1;
+ dquanAddr += quan_stride << 1;
+ iscan += quan_stride << 1;
+
+ coeff[0] = _mm_loadu_si128((__m128i const *)src);
+ coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
+ // First four coefficients of this group.
+ quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr, dquanAddr);
+ // Last four coefficients of this group.
+ quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
+ &coeff_sign);
+ quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
+ log_scale, quanAddr + quan_stride,
+ dquanAddr + quan_stride);
+
+ find_eob(quanAddr, iscan, &eob);
+
+ count -= 8;
+ }
+ *eob_ptr = get_accumulated_eob(&eob); // horizontal maximum across the lanes of eob
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
new file mode 100644
index 0000000000..df22aaba7c
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_avx2.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Loads one group of coefficients from `coeff` into *c.
+// When tran_low_t is 4 bytes wide, two 256-bit loads (16 values) are narrowed
+// to 16-bit lanes with signed saturation; otherwise a single 256-bit load is
+// used as-is.
+static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
+  const __m256i *const src = (const __m256i *)coeff;
+
+  if (sizeof(tran_low_t) != 4) {
+    *c = _mm256_loadu_si256(src);
+    return;
+  }
+
+  const __m256i first8 = _mm256_loadu_si256(src);
+  const __m256i next8 = _mm256_loadu_si256(src + 1);
+  // The pack works per 128-bit lane, which interleaves the two inputs;
+  // 0xD8 reorders the 64-bit quarters as 0,2,1,3 to restore memory order.
+  const __m256i packed = _mm256_packs_epi32(first8, next8);
+  *c = _mm256_permute4x64_epi64(packed, 0xD8);
+}
+
+// Clears one group of output coefficients at `qcoeff`: 32 bytes always, and
+// a second 32 bytes when tran_low_t is 4 bytes wide.
+static INLINE void write_zero(tran_low_t *qcoeff) {
+  __m256i *const dst = (__m256i *)qcoeff;
+  const __m256i zeros = _mm256_setzero_si256();
+
+  _mm256_storeu_si256(dst, zeros);
+  if (sizeof(tran_low_t) == 4) {
+    _mm256_storeu_si256(dst + 1, zeros);
+  }
+}
+
+// Expands a 128-bit parameter vector to 256 bits: the low half is *p
+// unchanged, the high half is the upper 64 bits of *p repeated twice.
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+  const __m128i table = *p;
+  const __m128i upper_repeated = _mm_unpackhi_epi64(table, table);
+  const __m256i widened = _mm256_castsi128_si256(table);
+  *qp = _mm256_insertf128_si256(widened, upper_repeated, 1);
+}
+
+// Builds the parameter vectors used for the first group of coefficients.
+//   qp[0] - rounding offsets, qp[1] - quantizer multipliers,
+//   qp[2] - dequantizer multipliers, *thr - dequant >> (1 + log_scale).
+// Eight 16-bit entries are loaded from each table and widened to 256 bits by
+// init_one_qp(). Presumably entry 0 is the DC value and the rest are AC
+// values - TODO confirm against the table layout used by the callers.
+// For log_scale > 0 the rounding offsets become
+// (round + (1 << (log_scale - 1))) >> log_scale; for log_scale == 1 the
+// quantizer multipliers are additionally shifted left by one.
+static INLINE void init_qp(const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *dequant_ptr, int log_scale,
+ __m256i *thr, __m256i *qp) {
+ __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+
+ if (log_scale > 0) {
+ // Round-to-nearest division of the rounding offsets by 2^log_scale.
+ const __m128i rnd = _mm_set1_epi16((int16_t)1 << (log_scale - 1));
+ round = _mm_add_epi16(round, rnd);
+ round = _mm_srai_epi16(round, log_scale);
+ }
+
+ init_one_qp(&round, &qp[0]);
+ init_one_qp(&quant, &qp[1]);
+
+ // Only the log_scale == 1 variant pre-scales the quantizer here.
+ if (log_scale == 1) {
+ qp[1] = _mm256_slli_epi16(qp[1], log_scale);
+ }
+
+ init_one_qp(&dequant, &qp[2]);
+ // Threshold below which a whole group is written as zeros by the callers'
+ // quantize helpers.
+ *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+// Switches the parameter vectors to their high 128-bit half: each of
+// qp[0..2] gets its upper lane copied into both lanes, and the threshold is
+// recomputed from the updated dequantizer.
+static INLINE void update_qp(int log_scale, __m256i *thr, __m256i *qp) {
+  for (int i = 0; i < 3; ++i) {
+    const __m256i current = qp[i];
+    qp[i] = _mm256_permute2x128_si256(current, current, 0x11);
+  }
+  *thr = _mm256_srai_epi16(qp[2], 1 + log_scale);
+}
+
+// store_quan(q, addr): sign-extends the sixteen 16-bit lanes of q to 32 bits
+// and stores them, in lane order, as two unaligned 256-bit writes starting at
+// addr. Arguments are parenthesized so that expression arguments (for
+// example `ptr + 16`) bind correctly under the pointer cast. Note that q and
+// addr are each expanded more than once, so they must be side-effect free.
+#define store_quan(q, addr) \
+  do { \
+    __m256i sign_bits = _mm256_srai_epi16((q), 15); \
+    __m256i y0 = _mm256_unpacklo_epi16((q), sign_bits); \
+    __m256i y1 = _mm256_unpackhi_epi16((q), sign_bits); \
+    __m256i x0 = _mm256_permute2x128_si256(y0, y1, 0x20); \
+    __m256i x1 = _mm256_permute2x128_si256(y0, y1, 0x31); \
+    _mm256_storeu_si256((__m256i *)(addr), x0); \
+    _mm256_storeu_si256((__m256i *)(addr) + 1, x1); \
+  } while (0)
+
+// store_two_quan(q, addr1, dq, addr2): writes q to addr1 and dq to addr2.
+// With a 4-byte tran_low_t the values are widened via store_quan(); otherwise
+// each vector is stored directly with one unaligned 256-bit write.
+#define store_two_quan(q, addr1, dq, addr2) \
+  do { \
+    if (sizeof(tran_low_t) == 4) { \
+      store_quan((q), (addr1)); \
+      store_quan((dq), (addr2)); \
+    } else { \
+      _mm256_storeu_si256((__m256i *)(addr1), (q)); \
+      _mm256_storeu_si256((__m256i *)(addr2), (dq)); \
+    } \
+  } while (0)
+
+// Reduces the sixteen 16-bit lanes of `eob` to their maximum.
+// The two 128-bit halves are folded with a signed max; the result is then
+// subtracted from INT16_MAX so that the horizontal-minimum instruction can
+// be used to locate the largest original value.
+static INLINE uint16_t quant_gather_eob(__m256i eob) {
+  const __m128i ceiling = _mm_set1_epi16(INT16_MAX);
+  const __m128i folded = _mm_max_epi16(_mm256_castsi256_si128(eob),
+                                       _mm256_extractf128_si256(eob, 1));
+  const __m128i inverted = _mm_subs_epu16(ceiling, folded);
+  const __m128i smallest = _mm_minpos_epu16(inverted);
+  return INT16_MAX - _mm_extract_epi16(smallest, 0);
+}
+
+// Quantizes one group of sixteen 16-bit coefficients held in *c.
+//   thr       - per-lane threshold; if no |coeff| is >= its threshold the
+//               whole group is written as zeros.
+//   qp        - qp[0] rounding, qp[1] quantizer, qp[2] dequantizer.
+//   iscan_ptr - sixteen 16-bit scan positions matching the lanes of *c.
+//   qcoeff / dqcoeff - outputs for the quantized and dequantized values.
+//   eob       - running per-lane maximum of (scan position + 1) over lanes
+//               whose dequantized value is non-zero.
+static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff,
+ tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ // mask = (abs_coeff >= thr), built from > and == because there is no
+ // 16-bit >= compare.
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ // q = sign(c) * (((|c| + round) * quant) >> 16), with a saturating add.
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ q = _mm256_mulhi_epi16(q, qp[1]);
+ q = _mm256_sign_epi16(q, *c);
+ // dq keeps the low 16 bits of q * dequant.
+ const __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ // nzero_coeff is -1 in non-zero lanes, so the subtraction yields
+ // iscan + 1 there; the AND clears lanes whose dq is zero.
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+// AVX2 "fp" quantizer entry point (log_scale 0).
+// Processes coeff_ptr in groups of 16: the first group uses the parameters
+// as loaded by init_qp(), every later group uses the parameters selected by
+// update_qp(). Writes qcoeff_ptr / dqcoeff_ptr and stores the end-of-block
+// position in *eob_ptr. scan_ptr, zbin_ptr and quant_shift_ptr are unused.
+// The first group is always processed, regardless of n_coeffs.
+void av1_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+  const int log_scale = 0;
+
+  __m256i qp[3];
+  __m256i thr;
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  __m256i eob = _mm256_setzero_si256();
+  __m256i coeff;
+
+  // First group, quantized with the initial parameter set.
+  read_coeff(coeff_ptr, &coeff);
+  quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  // All remaining groups share the parameter set produced here.
+  update_qp(log_scale, &thr, qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+
+    read_coeff(coeff_ptr, &coeff);
+    quantize(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
+
+// Quantizes one group of sixteen 16-bit coefficients for the log_scale == 1
+// path. Same contract as quantize(), with two differences visible below: the
+// quantizer multiply is unsigned (qp[1] was doubled in init_qp()), and the
+// dequantized magnitude is halved before the sign is re-applied.
+static INLINE void quantize_32x32(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ // mask = (abs_coeff >= thr).
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ // |q| = ((|c| + round) * quant) >> 16, treating both factors as unsigned.
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ q = _mm256_mulhi_epu16(q, qp[1]);
+
+ // |dq| = (low 16 bits of |q| * dequant) >> 1, as a logical shift.
+ __m256i dq = _mm256_mullo_epi16(q, qp[2]);
+ dq = _mm256_srli_epi16(dq, 1);
+
+ // Signs are restored only after the magnitude arithmetic is done.
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ // iscan + 1 in lanes with a non-zero dq, 0 elsewhere.
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+// AVX2 "fp" quantizer entry point using log_scale 1 and quantize_32x32().
+// Processes coeff_ptr in groups of 16: the first group uses the parameters
+// as loaded by init_qp(), every later group uses the parameters selected by
+// update_qp(). Writes qcoeff_ptr / dqcoeff_ptr and stores the end-of-block
+// position in *eob_ptr. scan_ptr, zbin_ptr and quant_shift_ptr are unused.
+// The first group is always processed, regardless of n_coeffs.
+void av1_quantize_fp_32x32_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+  const int log_scale = 1;
+
+  __m256i qp[3];
+  __m256i thr;
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  __m256i eob = _mm256_setzero_si256();
+  __m256i coeff;
+
+  // First group, quantized with the initial parameter set.
+  read_coeff(coeff_ptr, &coeff);
+  quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  // All remaining groups share the parameter set produced here.
+  update_qp(log_scale, &thr, qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+
+    read_coeff(coeff_ptr, &coeff);
+    quantize_32x32(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
+
+// Quantizes one group of sixteen 16-bit coefficients for the log_scale == 2
+// path. Same contract as quantize(); the scaling is done by recombining the
+// high and low halves of the 32-bit products instead of pre-scaling qp[1].
+static INLINE void quantize_64x64(const __m256i *thr, const __m256i *qp,
+ __m256i *c, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i abs_coeff = _mm256_abs_epi16(*c);
+ // mask = (abs_coeff >= thr).
+ __m256i mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ mask = _mm256_or_si256(mask, _mm256_cmpeq_epi16(abs_coeff, *thr));
+ const int nzflag = _mm256_movemask_epi8(mask);
+
+ if (nzflag) {
+ __m256i q = _mm256_adds_epi16(abs_coeff, qp[0]);
+ // (hi << 2) | (lo >> 14) is the 32-bit product shifted right by 14,
+ // truncated to 16 bits.
+ __m256i qh = _mm256_mulhi_epi16(q, qp[1]);
+ __m256i ql = _mm256_mullo_epi16(q, qp[1]);
+ qh = _mm256_slli_epi16(qh, 2);
+ ql = _mm256_srli_epi16(ql, 14);
+ q = _mm256_or_si256(qh, ql);
+ // (hi << 14) | (lo >> 2) is the 32-bit product |q| * dequant shifted
+ // right by 2, truncated to 16 bits.
+ const __m256i dqh = _mm256_slli_epi16(_mm256_mulhi_epi16(q, qp[2]), 14);
+ const __m256i dql = _mm256_srli_epi16(_mm256_mullo_epi16(q, qp[2]), 2);
+ __m256i dq = _mm256_or_si256(dqh, dql);
+
+ // Signs are restored only after the magnitude arithmetic is done.
+ q = _mm256_sign_epi16(q, *c);
+ dq = _mm256_sign_epi16(dq, *c);
+
+ store_two_quan(q, qcoeff, dq, dqcoeff);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i iscan = _mm256_loadu_si256((const __m256i *)iscan_ptr);
+ const __m256i zero_coeff = _mm256_cmpeq_epi16(dq, zero);
+ const __m256i nzero_coeff = _mm256_cmpeq_epi16(zero_coeff, zero);
+ // iscan + 1 in lanes with a non-zero dq, 0 elsewhere.
+ __m256i cur_eob = _mm256_sub_epi16(iscan, nzero_coeff);
+ cur_eob = _mm256_and_si256(cur_eob, nzero_coeff);
+ *eob = _mm256_max_epi16(*eob, cur_eob);
+ } else {
+ write_zero(qcoeff);
+ write_zero(dqcoeff);
+ }
+}
+
+// AVX2 "fp" quantizer entry point using log_scale 2 and quantize_64x64().
+// Processes coeff_ptr in groups of 16: the first group uses the parameters
+// as loaded by init_qp(), every later group uses the parameters selected by
+// update_qp(). Writes qcoeff_ptr / dqcoeff_ptr and stores the end-of-block
+// position in *eob_ptr. scan_ptr, zbin_ptr and quant_shift_ptr are unused.
+// The first group is always processed, regardless of n_coeffs.
+void av1_quantize_fp_64x64_avx2(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+  const unsigned int step = 16;
+  const int log_scale = 2;
+
+  __m256i qp[3];
+  __m256i thr;
+  init_qp(round_ptr, quant_ptr, dequant_ptr, log_scale, &thr, qp);
+
+  __m256i eob = _mm256_setzero_si256();
+  __m256i coeff;
+
+  // First group, quantized with the initial parameter set.
+  read_coeff(coeff_ptr, &coeff);
+  quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+  // All remaining groups share the parameter set produced here.
+  update_qp(log_scale, &thr, qp);
+
+  for (n_coeffs -= step; n_coeffs > 0; n_coeffs -= step) {
+    coeff_ptr += step;
+    qcoeff_ptr += step;
+    dqcoeff_ptr += step;
+    iscan_ptr += step;
+
+    read_coeff(coeff_ptr, &coeff);
+    quantize_64x64(&thr, qp, &coeff, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, &eob);
+  }
+
+  *eob_ptr = quant_gather_eob(eob);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
new file mode 100644
index 0000000000..b07e7717f3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_sse2.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Loads one group of coefficients starting at coeff + offset into *c0 / *c1
+// using aligned 128-bit loads. With a 4-byte tran_low_t, four loads
+// (16 values) are narrowed to 16-bit lanes with signed saturation; otherwise
+// two loads are used as-is.
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m128i *c0, __m128i *c1) {
+  const __m128i *const src = (const __m128i *)(coeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    *c0 = _mm_load_si128(src);
+    *c1 = _mm_load_si128(src + 1);
+    return;
+  }
+
+  const __m128i w0 = _mm_load_si128(src);
+  const __m128i w1 = _mm_load_si128(src + 1);
+  const __m128i w2 = _mm_load_si128(src + 2);
+  const __m128i w3 = _mm_load_si128(src + 3);
+  *c0 = _mm_packs_epi32(w0, w1);
+  *c1 = _mm_packs_epi32(w2, w3);
+}
+
+// Stores the sixteen 16-bit values in *qc0 / *qc1 to qcoeff + offset using
+// aligned 128-bit stores. With a 4-byte tran_low_t each value is first
+// sign-extended to 32 bits; otherwise the vectors are stored unchanged.
+static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
+                                tran_low_t *qcoeff, intptr_t offset) {
+  __m128i *const dst = (__m128i *)(qcoeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    _mm_store_si128(dst, *qc0);
+    _mm_store_si128(dst + 1, *qc1);
+    return;
+  }
+
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i *const halves[2] = { qc0, qc1 };
+  for (int i = 0; i < 2; ++i) {
+    const __m128i values = *halves[i];
+    // All-ones in negative lanes: interleaving it above each value yields
+    // the 32-bit sign extension.
+    const __m128i sign_bits = _mm_cmplt_epi16(values, zero);
+    _mm_store_si128(dst + 2 * i, _mm_unpacklo_epi16(values, sign_bits));
+    _mm_store_si128(dst + 2 * i + 1, _mm_unpackhi_epi16(values, sign_bits));
+  }
+}
+
+// Clears one group of output coefficients at qcoeff + offset with aligned
+// 128-bit stores: four stores when tran_low_t is 4 bytes wide, two otherwise.
+static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
+  __m128i *const dst = (__m128i *)(qcoeff + offset);
+  const __m128i zeros = _mm_setzero_si128();
+  const int num_stores = (sizeof(tran_low_t) == 4) ? 4 : 2;
+
+  for (int i = 0; i < num_stores; ++i) {
+    _mm_store_si128(dst + i, zeros);
+  }
+}
+
+// Quantizes and dequantizes the 16 coefficients located at index n_coeffs
+// relative to coeff_ptr (the caller passes end-of-array pointers together
+// with a negative index).
+//   round0/quant0/dequant0/thr0 apply to the first 8 coefficients of the
+//   group, round1/quant1/dequant1/thr1 to the second 8.
+//   If no |coeff| is >= its threshold the group is written as zeros.
+//   *eob is updated with the per-lane maximum of (iscan + 1) over lanes
+//   whose dequantized value is non-zero.
+static INLINE void quantize(const int16_t *iscan_ptr,
+                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const __m128i *round0, const __m128i *round1,
+                            const __m128i *quant0, const __m128i *quant1,
+                            const __m128i *dequant0, const __m128i *dequant1,
+                            const __m128i *thr0, const __m128i *thr1,
+                            __m128i *eob) {
+  __m128i coeff0, coeff1;
+  // Load the 16 coefficients of this group. The caller uses this helper both
+  // for the first group and for the AC-only groups that follow.
+  read_coeff(coeff_ptr, n_coeffs, &coeff0, &coeff1);
+
+  // Poor man's sign extract
+  const __m128i coeff0_sign = _mm_srai_epi16(coeff0, 15);
+  const __m128i coeff1_sign = _mm_srai_epi16(coeff1, 15);
+  // abs(x) computed as (x ^ sign) - sign.
+  __m128i qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+  __m128i qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+  qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+  qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+  // mask = (abs >= thr), built from > and == compares.
+  const __m128i mask0 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff0, *thr0),
+                                     _mm_cmpeq_epi16(qcoeff0, *thr0));
+  const __m128i mask1 = _mm_or_si128(_mm_cmpgt_epi16(qcoeff1, *thr1),
+                                     _mm_cmpeq_epi16(qcoeff1, *thr1));
+  // _mm_movemask_epi8() returns up to 16 significant bits in an int; keep the
+  // result as int so a set bit 15 never goes through a narrowing conversion.
+  const int nzflag = _mm_movemask_epi8(mask0) | _mm_movemask_epi8(mask1);
+
+  if (nzflag) {
+    qcoeff0 = _mm_adds_epi16(qcoeff0, *round0);
+    qcoeff1 = _mm_adds_epi16(qcoeff1, *round1);
+    const __m128i qtmp0 = _mm_mulhi_epi16(qcoeff0, *quant0);
+    const __m128i qtmp1 = _mm_mulhi_epi16(qcoeff1, *quant1);
+
+    // Reinsert signs
+    qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+    qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+    qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+    qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+    write_qcoeff(&qcoeff0, &qcoeff1, qcoeff_ptr, n_coeffs);
+
+    // Dequantize: low 16 bits of qcoeff * dequant.
+    coeff0 = _mm_mullo_epi16(qcoeff0, *dequant0);
+    coeff1 = _mm_mullo_epi16(qcoeff1, *dequant1);
+
+    write_qcoeff(&coeff0, &coeff1, dqcoeff_ptr, n_coeffs);
+
+    const __m128i zero = _mm_setzero_si128();
+    // Scan for eob
+    const __m128i zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+    const __m128i zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+    const __m128i nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+    const __m128i nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+    const __m128i iscan0 =
+        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+    const __m128i iscan1 =
+        _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+    // Add one to convert from indices to counts
+    const __m128i iscan0_nz = _mm_sub_epi16(iscan0, nzero_coeff0);
+    const __m128i iscan1_nz = _mm_sub_epi16(iscan1, nzero_coeff1);
+    const __m128i eob0 = _mm_and_si128(iscan0_nz, nzero_coeff0);
+    const __m128i eob1 = _mm_and_si128(iscan1_nz, nzero_coeff1);
+    const __m128i eob2 = _mm_max_epi16(eob0, eob1);
+    *eob = _mm_max_epi16(*eob, eob2);
+  } else {
+    write_zero(qcoeff_ptr, n_coeffs);
+    write_zero(dqcoeff_ptr, n_coeffs);
+  }
+}
+
+// SSE2 "fp" quantizer entry point.
+// Processes coeff_ptr in groups of 16. The first group uses the parameter
+// tables as loaded for its first 8 coefficients and their upper halves for
+// the second 8; all later groups use the upper halves only. Writes
+// qcoeff_ptr / dqcoeff_ptr and stores the end-of-block position in *eob_ptr.
+// scan_ptr, zbin_ptr and quant_shift_ptr are unused. The parameter tables
+// are read with aligned 128-bit loads.
+void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+  (void)scan_ptr;
+  (void)zbin_ptr;
+  (void)quant_shift_ptr;
+
+  // Move every array pointer to its end and index with a negative offset
+  // that counts up towards zero.
+  coeff_ptr += n_coeffs;
+  iscan_ptr += n_coeffs;
+  qcoeff_ptr += n_coeffs;
+  dqcoeff_ptr += n_coeffs;
+  n_coeffs = -n_coeffs;
+
+  const __m128i round_first = _mm_load_si128((const __m128i *)round_ptr);
+  const __m128i round_rest = _mm_unpackhi_epi64(round_first, round_first);
+  const __m128i quant_first = _mm_load_si128((const __m128i *)quant_ptr);
+  const __m128i quant_rest = _mm_unpackhi_epi64(quant_first, quant_first);
+  const __m128i dequant_first = _mm_load_si128((const __m128i *)dequant_ptr);
+  const __m128i dequant_rest =
+      _mm_unpackhi_epi64(dequant_first, dequant_first);
+  const __m128i thr_first = _mm_srai_epi16(dequant_first, 1);
+  const __m128i thr_rest = _mm_srai_epi16(dequant_rest, 1);
+  __m128i eob = _mm_setzero_si128();
+
+  // First group of 16.
+  quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr,
+           &round_first, &round_rest, &quant_first, &quant_rest,
+           &dequant_first, &dequant_rest, &thr_first, &thr_rest, &eob);
+
+  // Remaining groups use the "rest" parameters for all 16 lanes.
+  for (n_coeffs += 8 * 2; n_coeffs < 0; n_coeffs += 8 * 2) {
+    quantize(iscan_ptr, coeff_ptr, n_coeffs, qcoeff_ptr, dqcoeff_ptr,
+             &round_rest, &round_rest, &quant_rest, &quant_rest,
+             &dequant_rest, &dequant_rest, &thr_rest, &thr_rest, &eob);
+  }
+
+  // Horizontal max over the eight 16-bit lanes of eob.
+  __m128i folded = _mm_max_epi16(eob, _mm_shuffle_epi32(eob, 0xe));
+  folded = _mm_max_epi16(folded, _mm_shufflelo_epi16(folded, 0xe));
+  folded = _mm_max_epi16(folded, _mm_shufflelo_epi16(folded, 0x1));
+  *eob_ptr = _mm_extract_epi16(folded, 1);
+}
diff --git a/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..ad4ae274e2
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_quantize_ssse3_x86_64.asm
@@ -0,0 +1,204 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+; QUANTIZE_FP <variant>, <num_gprs>
+; Expands to the routine quantize_<variant> (cglobal adds the configured
+; prefix/suffix). %1 selects the variant: for fp_32x32 the rounding values
+; are halved with rounding, the quantizer is doubled and the dequantized
+; magnitude is halved. %2 is passed to cglobal as the GPR count.
+; Flow:
+;   - if the `skip` argument is non-zero, zero qcoeff/dqcoeff and store
+;     eob = 0 (.blank);
+;   - otherwise quantize the first 16 coefficients with the tables as loaded,
+;     then loop over the rest (.ac_only_loop) with the upper halves of the
+;     tables; groups where no |coeff| exceeds m0 are zero-filled (.skip_iter);
+;   - finally reduce the per-lane eob candidates in m8 to one 16-bit value.
+; NOTE(review): the argument list names a `skip` argument between ncoeff and
+; zbin; confirm it matches the C prototype this symbol is bound to.
+%macro QUANTIZE_FP 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+ cmp dword skipm, 0
+ jne .blank
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, dequantmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, fp_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m1, m5
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [r2q] ; m3 = dequant
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+%ifidn %1, fp_32x32
+ psllw m2, 1
+%endif
+ pxor m5, m5 ; m5 = dedicated zero
+
+ ; point every array at its end and index with a negative, rising ncoeffq
+ lea coeffq, [ coeffq+ncoeffq*2]
+ lea r5q, [ r5q+ncoeffq*2]
+ lea r3q, [ r3q+ncoeffq*2]
+ lea r4q, [r4q+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m8
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; r4[i] = r3[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+ psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
+%endif
+ mova [r4q+ncoeffq*2+ 0], m8
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
+ mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+
+ ; skip the whole group when no abs(c[i]) is greater than the threshold m0
+ pcmpgtw m7, m6, m0
+ pcmpgtw m12, m11, m0
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+
+ or r6, r2
+ jz .skip_iter
+
+ pcmpeqw m7, m7
+
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ mova [r3q+ncoeffq*2+ 0], m14
+ mova [r3q+ncoeffq*2+16], m13
+%ifidn %1, fp_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; r4[i] = r3[i] * q
+ pmullw m13, m3 ; r4[i] = r3[i] * q
+%ifidn %1, fp_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+ mova [r4q+ncoeffq*2+ 0], m14
+ mova [r4q+ncoeffq*2+16], m13
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m7 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+ jmp .accumulate_eob
+.skip_iter:
+ mova [r3q+ncoeffq*2+ 0], m5
+ mova [r3q+ncoeffq*2+16], m5
+ mova [r4q+ncoeffq*2+ 0], m5
+ mova [r4q+ncoeffq*2+16], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ ; eob is a 16-bit location (the .blank path stores `word [r3q]`), so store
+ ; only the low word instead of the full register.
+ mov [r2], r6w
+ RET
+
+ ; skip-block, i.e. just write all zeroes
+.blank:
+ mov r0, dqcoeffmp
+ movifnidn ncoeffq, ncoeffmp
+ mov r2, qcoeffmp
+ mov r3, eobmp
+
+ lea r0q, [r0q+ncoeffq*2]
+ lea r2q, [r2q+ncoeffq*2]
+ neg ncoeffq
+ pxor m7, m7
+.blank_loop:
+ mova [r0q+ncoeffq*2+ 0], m7
+ mova [r0q+ncoeffq*2+16], m7
+ mova [r2q+ncoeffq*2+ 0], m7
+ mova [r2q+ncoeffq*2+16], m7
+ add ncoeffq, mmsize
+ jl .blank_loop
+ mov word [r3q], 0
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FP fp, 7
+QUANTIZE_FP fp_32x32, 7
diff --git a/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
new file mode 100644
index 0000000000..faa2a232a3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_ssim_opt_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
+; Inputs:  xmm3 = source samples as 16-bit words, xmm4 = reference samples
+;          as 16-bit words.
+; Updates: xmm15 += xmm3 and xmm14 += xmm4 (unsigned saturating word adds);
+;          xmm13 += pairwise sums of xmm3*xmm3, xmm12 += pairwise sums of
+;          xmm4*xmm4, xmm11 += pairwise sums of xmm3*xmm4 (dword adds).
+; Clobbers: xmm1, xmm2, xmm3.
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+; The four dwords of %1 are zero-extended to qwords (interleaved with xmm0)
+; and added together; the total ends up in the low qword of %1 and the high
+; qword is taken from xmm0.
+; Requires xmm0 to hold zero (both routines below clear it). Clobbers xmm2.
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with 16-bit words
+; Widens the eight 16-bit words of %1 to dwords (interleaving with xmm0),
+; adds the low and high halves, then finishes the reduction with
+; SUM_ACROSS_Q so the total ends up in the low qword of %1.
+; Requires xmm0 to hold zero. Clobbers xmm1 and xmm2.
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void ssim_parms_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+; av1_ssim_parms_16x16_sse2
+; Walks 16 rows of 16 bytes from s (arg 0, row step arg 1) and r (arg 2, row
+; step arg 3) and accumulates sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr
+; via TABULATE_SSIM. The low 32 bits of each total are stored through the
+; pointers in args 4..8 with movd.
+; NOTE(review): only 32 bits are written per result, so the pointee type is
+; presumably 32 bits wide - confirm against the C prototype.
+; NOTE(review): the row steps are loaded as full registers with `mov`;
+; confirm how arg(1)/arg(3) are widened if they are declared as int.
+global sym(av1_ssim_parms_16x16_sse2) PRIVATE
+sym(av1_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ ; xmm0 stays zero for the unpack-based widening below
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ ; TABULATE_SSIM clobbers xmm3, so reload both halves from the saved copies
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ ; reduce each accumulator to a single total in its low lanes
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void ssim_parms_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; unsigned long *sum_s,
+; unsigned long *sum_r,
+; unsigned long *sum_sq_s,
+; unsigned long *sum_sq_r,
+; unsigned long *sum_sxr);
+;
+; TODO: Use parm passing through structure, probably don't need the pxors
+; ( calling app will initialize to 0 ) could easily fit everything in sse2
+; without too much hassle, and can probably do better estimates with psadw
+; or pavgb At this point this is just meant to be first pass for calculating
+; all the parms needed for 16x16 ssim so we can play with dssim as distortion
+; in mode selection code.
+; av1_ssim_parms_8x8_sse2
+; Walks 8 rows of 8 bytes from s (arg 0, row step arg 1) and r (arg 2, row
+; step arg 3) and accumulates sum_s, sum_r, sum_sq_s, sum_sq_r and sum_sxr
+; via TABULATE_SSIM. The low 32 bits of each total are stored through the
+; pointers in args 4..8 with movd.
+; NOTE(review): only 32 bits are written per result, so the pointee type is
+; presumably 32 bits wide - confirm against the C prototype.
+; NOTE(review): the row steps are loaded as full registers with `mov`;
+; confirm how arg(1)/arg(3) are widened if they are declared as int.
+global sym(av1_ssim_parms_8x8_sse2) PRIVATE
+sym(av1_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ ; xmm0 stays zero for the unpack-based widening below
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ ; reduce each accumulator to a single total in its low lanes
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
new file mode 100644
index 0000000000..6df2a8bdbb
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/av1_txfm1d_sse4.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+#define AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/av1_txfm_sse4.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void av1_fdct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fdct32_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit);
+void av1_fdct64_new_sse4_1(const __m128i *input, __m128i *output,
+ int8_t cos_bit, const int instride,
+ const int outstride);
+
+void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_fadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_idct4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct32_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_idct64_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+
+void av1_iadst4_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst8_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+void av1_iadst16_new_sse4_1(const __m128i *input, __m128i *output,
+ const int8_t cos_bit, const int8_t *stage_range);
+static INLINE void transpose_32_4x4(int stride, const __m128i *input,  // 4x4 transpose of 32-bit lanes
+ __m128i *output) {  // rows a, b, c, d are input[0..3 * stride]; output rows use the same stride
+ __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);  // a0 c0 a1 c1
+ __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);  // a2 c2 a3 c3
+ __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);  // b0 d0 b1 d1
+ __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);  // b2 d2 b3 d3
+
+ output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);  // a0 b0 c0 d0
+ output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);  // a1 b1 c1 d1
+ output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);  // a2 b2 c2 d2
+ output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);  // a3 b3 c3 d3
+}
+
+// The entire input block can be represented by a grid of 4x4 blocks.
+// Each 4x4 block can be represented by 4 vertical __m128i.
+// We first transpose each 4x4 block internally,
+// then transpose the grid.
+static INLINE void transpose_32(int txfm_size, const __m128i *input,
+ __m128i *output) {
+ const int num_per_128 = 4;  // 32-bit lanes per __m128i
+ const int row_size = txfm_size;
+ const int col_size = txfm_size / num_per_128;  // __m128i vectors per row
+ int r, c;
+
+ // transpose each 4x4 block internally and store it at its mirrored grid slot
+ for (r = 0; r < row_size; r += 4) {
+ for (c = 0; c < col_size; c++) {
+ transpose_32_4x4(col_size, &input[r * col_size + c],  // block at rows r..r+3, vector column c
+ &output[c * 4 * col_size + r / 4]);  // written to rows 4c..4c+3, vector column r/4
+ }
+ }
+}
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ const __m128i ww0 = _mm_set1_epi32(w0); \
+ const __m128i ww1 = _mm_set1_epi32(w1); \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = av1_round_shift_32_sse4_1(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = av1_round_shift_32_sse4_1(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+ do { \
+ btf_32_sse4_1_type0(w1, w0, in1, in0, out0, out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_type0_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ const __m128i in0_w0 = _mm_mullo_epi32(in0, ww0); \
+ const __m128i in1_w1 = _mm_mullo_epi32(in1, ww1); \
+ out0 = _mm_add_epi32(in0_w0, in1_w1); \
+ out0 = _mm_add_epi32(out0, r); \
+ out0 = _mm_srai_epi32(out0, bit); \
+ const __m128i in0_w1 = _mm_mullo_epi32(in0, ww1); \
+ const __m128i in1_w0 = _mm_mullo_epi32(in1, ww0); \
+ out1 = _mm_sub_epi32(in0_w1, in1_w0); \
+ out1 = _mm_add_epi32(out1, r); \
+ out1 = _mm_srai_epi32(out1, bit); \
+ } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_type1_sse4_1_new(ww0, ww1, in0, in1, out0, out1, r, bit) \
+ do { \
+ btf_32_type0_sse4_1_new(ww1, ww0, in1, in0, out0, out1, r, bit); \
+ } while (0)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AV1_ENCODER_X86_AV1_TXFM1D_SSE4_H_
diff --git a/third_party/aom/av1/encoder/x86/corner_match_sse4.c b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
new file mode 100644
index 0000000000..93f37b71d3
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/corner_match_sse4.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "av1/encoder/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, byte_mask[16]) = {
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0
+};
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(im1, im2) * MATCH_SZ * stddev(im1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double compute_cross_correlation_sse4_1(unsigned char *im1, int stride1, int x1,
+ int y1, unsigned char *im2, int stride2,
+ int x2, int y2) {
+ int i;
+ // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0,
+ // 2)
+ __m128i sum1_vec = _mm_setzero_si128();
+ __m128i sum2_vec = _mm_setzero_si128();
+ // 4 32-bit partial sums of squares
+ __m128i sumsq2_vec = _mm_setzero_si128();
+ __m128i cross_vec = _mm_setzero_si128();
+
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);  // keeps bytes 0..12, clears bytes 13..15
+ const __m128i zero = _mm_setzero_si128();
+
+ im1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);  // presumably the top-left pixel of the window around (x1, y1)
+ im2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ const __m128i v1 =  // NOTE(review): the 16-byte load covers 3 bytes past the 13-pixel row; they are masked off but must be readable -- confirm buffers are padded
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&im1[i * stride1]), mask);
+ const __m128i v2 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&im2[i * stride2]), mask);
+
+ // Using the 'sad' intrinsic here is a bit faster than adding
+ // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit
+ // conversion step later, for a net speedup of ~10%
+ sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero));
+ sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero));
+
+ const __m128i v1_l = _mm_cvtepu8_epi16(v1);  // bytes 0..7 widened to 16 bits
+ const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8));  // bytes 8..15 widened to 16 bits
+ const __m128i v2_l = _mm_cvtepu8_epi16(v2);
+ const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8));
+
+ sumsq2_vec = _mm_add_epi32(
+ sumsq2_vec,
+ _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r)));
+ cross_vec = _mm_add_epi32(
+ cross_vec,
+ _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r)));
+ }
+
+ // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec,
+ // cross_vec)
+ // as holding 4 32-bit elements each, which we want to sum horizontally.
+ // We do this by transposing and then summing vertically.
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec);
+ __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec);
+
+ __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3);
+
+ __m128i res =  // lanes: 0 = sum1, 1 = sum2, 2 = sumsq2, 3 = cross
+ _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7));
+
+ int sum1 = _mm_extract_epi32(res, 0);
+ int sum2 = _mm_extract_epi32(res, 1);
+ int sumsq2 = _mm_extract_epi32(res, 2);
+ int cross = _mm_extract_epi32(res, 3);
+
+ int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);  // NOTE(review): var2 == 0 (constant im2 window) gives a non-finite result -- confirm callers tolerate it
+}
diff --git a/third_party/aom/av1/encoder/x86/dct_sse2.asm b/third_party/aom/av1/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000000..b185548184
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/dct_sse2.asm
@@ -0,0 +1,82 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro TRANSFORM_COLS 0
+ paddw m0, m1
+ movq m4, m0
+ psubw m3, m2
+ psubw m4, m3
+ psraw m4, 1
+ movq m5, m4
+ psubw m5, m1 ;b1
+ psubw m4, m2 ;c1
+ psubw m0, m4
+ paddw m3, m5
+ ; m0 a0
+ SWAP 1, 4 ; m1 c1
+ SWAP 2, 3 ; m2 d1
+ SWAP 3, 5 ; m3 b1
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ ; 00 01 02 03
+ ; 10 11 12 13
+ ; 20 21 22 23
+ ; 30 31 32 33
+ punpcklwd m0, m1 ; 00 10 01 11 02 12 03 13
+ punpcklwd m2, m3 ; 20 30 21 31 22 32 23 33
+ mova m1, m0
+ punpckldq m0, m2 ; 00 10 20 30 01 11 21 31
+ punpckhdq m1, m2 ; 02 12 22 32 03 13 23 33
+%endmacro
+
+INIT_XMM sse2
+cglobal fwht4x4, 3, 4, 8, input, output, stride
+ lea r3q, [inputq + strideq*4]
+ movq m0, [inputq] ;a1
+ movq m1, [inputq + strideq*2] ;b1
+ movq m2, [r3q] ;c1
+ movq m3, [r3q + strideq*2] ;d1
+
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ SWAP 1, 2
+ psrldq m1, m0, 8
+ psrldq m3, m2, 8
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+
+ psllw m0, 2
+ psllw m1, 2
+
+ ; sign extension
+ mova m2, m0
+ mova m3, m1
+ punpcklwd m0, m0
+ punpcklwd m1, m1
+ punpckhwd m2, m2
+ punpckhwd m3, m3
+ psrad m0, 16
+ psrad m1, 16
+ psrad m2, 16
+ psrad m3, 16
+ mova [outputq], m0
+ mova [outputq + 16], m2
+ mova [outputq + 32], m1
+ mova [outputq + 48], m3
+
+ RET
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_avx2.c b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
new file mode 100644
index 0000000000..7642f57d18
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+#include <immintrin.h> /* AVX2 */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width,  // writes |coeff| packed to one byte each into levels[] and zeroes the padding around it
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;  // bytes per levels[] row, including right-hand padding
+ const __m256i y_zeros = _mm256_setzero_si256();
+
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;  // top padding sits directly before levels[0]
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ yy_storeu_256(pre_buf, y_zeros);  // NOTE(review): 32-byte stores, so the last one may spill into levels[]; presumably harmless since levels[] is filled below -- confirm
+ pre_buf += 32;
+ } while (pre_buf < pre_buf_end);
+
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride;
+ uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31));  // start early by rounding the length up to 32 so the stores end exactly at bottom_buf_end
+
+ do {
+ yy_storeu_256(bottom_buf, y_zeros);
+ bottom_buf += 32;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {  // 4 rows (16 coefficients) per iteration
+ do {
+ const __m256i c0 = yy_loadu_256(cf);  // rows 0 and 1
+ const __m256i c1 = yy_loadu_256(cf + 8);  // rows 2 and 3
+ const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1));
+ const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros);  // signed-saturating pack to bytes, upper half of each lane zero
+ const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8);
+ const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8);  // each row is now 4 level bytes followed by 4 zero bytes
+ yy_storeu_256(ls, res);
+ ls += 32;
+ cf += 16;
+ i += 4;
+ } while (i < height);
+ } else if (width == 8) {  // 4 rows (32 coefficients) per iteration
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);  // undoes the per-lane interleave of the packs: bytes are back in coefficient order
+ const __m128i res0 = _mm256_castsi256_si128(res);  // rows 0 and 1
+ const __m128i res1 = _mm256_extracti128_si256(res, 1);  // rows 2 and 3
+ xx_storel_64(ls, res0);
+ *(int32_t *)(ls + width) = 0;  // zero 4 bytes of padding after the row
+ xx_storel_64(ls + stride, _mm_srli_si128(res0, 8));
+ *(int32_t *)(ls + width + stride) = 0;
+ xx_storel_64(ls + stride * 2, res1);
+ *(int32_t *)(ls + width + stride * 2) = 0;
+ xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8));
+ *(int32_t *)(ls + width + stride * 3) = 0;
+ cf += 32;
+ ls += stride << 2;
+ i += 4;
+ } while (i < height);
+ } else if (width == 16) {  // 2 rows (32 coefficients) per iteration
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);  // low lane = row 0, high lane = row 1
+ xx_storeu_128(ls, _mm256_castsi256_si128(res));
+ xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1));
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;  // zero 4 bytes of padding after each row
+ *(int32_t *)(ls + stride + width) = 0;
+ ls += stride << 1;
+ i += 2;
+ } while (i < height);
+ } else {  // one row of 32 coefficients per iteration; presumably width == 32 here -- confirm
+ do {
+ const __m256i coeffA = yy_loadu_256(cf);
+ const __m256i coeffB = yy_loadu_256(cf + 8);
+ const __m256i coeffC = yy_loadu_256(cf + 16);
+ const __m256i coeffD = yy_loadu_256(cf + 24);
+ const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB);
+ const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD);
+ const __m256i absAB = _mm256_abs_epi16(coeffAB);
+ const __m256i absCD = _mm256_abs_epi16(coeffCD);
+ const __m256i absABCD = _mm256_packs_epi16(absAB, absCD);
+ const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8);
+ const __m256i res = _mm256_shuffle_epi32(res_, 0xd8);
+ yy_storeu_256(ls, res);
+ cf += 32;
+ *(int32_t *)(ls + width) = 0;  // zero 4 bytes of padding after the row
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse2.c b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
new file mode 100644
index 0000000000..dedb4d02f6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse2.c
@@ -0,0 +1,505 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+
+static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,  // gathers the five neighbour tiles used for context counting
+ const int stride,
+ const ptrdiff_t *const offsets,  // three extra neighbour offsets chosen by the caller
+ __m128i *const level) {  // receives five registers, level[0..4]
+ level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);  // shifted one column right; helper presumably packs 4 rows of 4 bytes -- confirm
+ level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);  // shifted one row down
+ level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,  // gathers the five neighbour tiles used for context counting
+ const int stride,
+ const ptrdiff_t *const offsets,  // three extra neighbour offsets chosen by the caller
+ __m128i *const level) {  // receives five registers, level[0..4]
+ level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);  // shifted one column right; helper presumably packs 2 rows of 8 bytes -- confirm
+ level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);  // shifted one row down
+ level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
+ level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
+ level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
+}
+
+static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,  // gathers five 16-byte neighbour rows used for context counting
+ const int stride,
+ const ptrdiff_t *const offsets,  // three extra neighbour offsets chosen by the caller
+ __m128i *const level) {  // receives five registers, level[0..4]
+ level[0] = _mm_loadu_si128((__m128i *)(src + 1));  // 16 bytes starting one column right
+ level[1] = _mm_loadu_si128((__m128i *)(src + stride));  // 16 bytes starting one row down
+ level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
+ level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
+ level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
+}
+
+static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {  // per byte: min(4, (sum of the five levels, each clamped to 3, + 1) >> 1)
+ const __m128i const_3 = _mm_set1_epi8(3);
+ const __m128i const_4 = _mm_set1_epi8(4);
+ __m128i count;
+
+ count = _mm_min_epu8(level[0], const_3);
+ level[1] = _mm_min_epu8(level[1], const_3);  // level[1..4] are clamped in place, so the caller's array is modified
+ level[2] = _mm_min_epu8(level[2], const_3);
+ level[3] = _mm_min_epu8(level[3], const_3);
+ level[4] = _mm_min_epu8(level[4], const_3);
+ count = _mm_add_epi8(count, level[1]);  // the running sum is at most 5 * 3 = 15, so the byte add cannot wrap
+ count = _mm_add_epi8(count, level[2]);
+ count = _mm_add_epi8(count, level[3]);
+ count = _mm_add_epi8(count, level[4]);
+ count = _mm_avg_epu8(count, _mm_setzero_si128());  // rounding average with zero == (count + 1) >> 1
+ count = _mm_min_epu8(count, const_4);
+ return count;
+}
+
+static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *const coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(21);
+ __m128i pos_to_offset =
+ (height == 4)
+ ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
+ : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
+ 21, 21);
+ __m128i count;
+ __m128i level[5];
+ int8_t *cc = coeff_contexts;
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ cc += 16;
+ row -= 4;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 4 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(height % 4));
+
+ do {
+ load_levels_4x4x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 4 * stride;
+ coeff_contexts += 16;
+ row -= 4;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+ __m128i pos_to_offset[3];
+
+ assert(!(height % 2));
+
+ if (height == 8) {
+ pos_to_offset[0] =
+ _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ } else if (height < 8) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
+ 21, 21, 21, 21, 21);
+ } else {
+ pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11);
+ pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
+ 21, 21, 21, 21, 21);
+ }
+ pos_to_offset[2] = _mm_set1_epi8(21);
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += 2 * stride;
+ cc += 16;
+ row -= 2;
+ } while (row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
+ const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = 8 + TX_PAD_HOR;
+ const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
+ int row = height;
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(height % 2));
+
+ do {
+ load_levels_8x2x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 2 * stride;
+ coeff_contexts += 16;
+ row -= 2;
+ } while (row);
+}
+
+static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
+ const int real_width,
+ const int real_height,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ int8_t *cc = coeff_contexts;
+ int row = height;
+ __m128i pos_to_offset[5];
+ __m128i pos_to_offset_large[3];
+ __m128i count;
+ __m128i level[5];
+
+ assert(!(width % 16));
+
+ pos_to_offset_large[2] = _mm_set1_epi8(21);
+ if (real_width == real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
+ pos_to_offset_large[2];
+ } else if (real_width > real_height) {
+ pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
+ 16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
+ pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
+ } else { // real_width < real_height
+ pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
+ pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
+ 21, 21, 21, 21, 21);
+ pos_to_offset[4] = pos_to_offset_large[2];
+ pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
+ }
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)cc, count);
+ levels += 16;
+ cc += 16;
+ w -= 16;
+ pos_to_offset[0] = pos_to_offset_large[0];
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ pos_to_offset[2] = pos_to_offset[3];
+ pos_to_offset[3] = pos_to_offset[4];
+ pos_to_offset_large[0] = pos_to_offset_large[1];
+ pos_to_offset_large[1] = pos_to_offset_large[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+
+ coeff_contexts[0] = 0;
+}
+
+static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i pos_to_offset_large =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ do {
+ __m128i pos_to_offset =
+ _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
+ SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ pos_to_offset = pos_to_offset_large;
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
+ const int width, const int height,
+ const ptrdiff_t *const offsets,
+ int8_t *coeff_contexts) {
+ const int stride = width + TX_PAD_HOR;
+ __m128i pos_to_offset[3];
+ __m128i count;
+ __m128i level[5];
+ int row = height;
+
+ assert(!(width % 16));
+
+ pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
+ pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
+ pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
+
+ do {
+ int w = width;
+
+ do {
+ load_levels_16x1x5_sse2(levels, stride, offsets, level);
+ count = get_coeff_contexts_kernel_sse2(level);
+ count = _mm_add_epi8(count, pos_to_offset[0]);
+ _mm_store_si128((__m128i *)coeff_contexts, count);
+ levels += 16;
+ coeff_contexts += 16;
+ w -= 16;
+ } while (w);
+
+ pos_to_offset[0] = pos_to_offset[1];
+ pos_to_offset[1] = pos_to_offset[2];
+ levels += TX_PAD_HOR;
+ } while (--row);
+}
+
+// Note: levels[] must be in the range [0, 127], inclusive.
+void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
+ const int16_t *const scan, const uint16_t eob,
+ const TX_SIZE tx_size,
+ const TX_CLASS tx_class,
+ int8_t *const coeff_contexts) {
+ const int last_idx = eob - 1;  // NOTE(review): eob == 0 gives last_idx == -1 and reads scan[-1] below; presumably callers pass eob >= 1 -- confirm
+ if (!last_idx) {  // only one coefficient: its context is 0 and nothing else is written
+ coeff_contexts[0] = 0;
+ return;
+ }
+
+ const int real_width = tx_size_wide[tx_size];
+ const int real_height = tx_size_high[tx_size];
+ const int width = get_txb_wide(tx_size);
+ const int height = get_txb_high(tx_size);
+ const int stride = width + TX_PAD_HOR;
+ ptrdiff_t offsets[3];  // three neighbour offsets; the loaders add the +1 and +stride neighbours themselves
+
+ /* coeff_contexts must be 16 byte aligned. */
+ assert(!((intptr_t)coeff_contexts & 0xf));
+
+ if (tx_class == TX_CLASS_2D) {
+ offsets[0] = 0 * stride + 2;  // same row, two columns right
+ offsets[1] = 1 * stride + 1;  // one row down, one column right
+ offsets[2] = 2 * stride + 0;  // two rows down, same column
+
+ if (width == 4) {
+ get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
+ } else if (width == 16) {  // NOTE(review): this branch makes the same call as the final else below
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
+ offsets, coeff_contexts);
+ }
+ } else if (tx_class == TX_CLASS_HORIZ) {
+ offsets[0] = 2;  // same row, columns +2, +3 and +4
+ offsets[1] = 3;
+ offsets[2] = 4;
+ if (width == 4) {
+ get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_hor(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ } else { // TX_CLASS_VERT
+ offsets[0] = 2 * stride;  // same column, rows +2, +3 and +4
+ offsets[1] = 3 * stride;
+ offsets[2] = 4 * stride;
+ if (width == 4) {
+ get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else if (width == 8) {
+ get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
+ } else {
+ get_16n_coeff_contexts_ver(levels, width, height, offsets,
+ coeff_contexts);
+ }
+ }
+
+ const int bwl = get_txb_bwl(tx_size);
+ const int pos = scan[last_idx];  // position of the last coefficient in scan order
+ if (last_idx <= (height << bwl) / 8)  // overwrite that position's context with 1, 2 or 3 depending on how far into the scan it lies
+ coeff_contexts[pos] = 1;
+ else if (last_idx <= (height << bwl) / 4)
+ coeff_contexts[pos] = 2;
+ else
+ coeff_contexts[pos] = 3;
+}
diff --git a/third_party/aom/av1/encoder/x86/encodetxb_sse4.c b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
new file mode 100644
index 0000000000..5e0687cd38
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/encodetxb_sse4.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "aom/aom_integer.h"
+#include "av1/common/onyxc_int.h"
+#include "av1/common/txb_common.h"
+#include "aom_dsp/x86/synonyms.h"
+
+/*
+ * Fills the padded `levels` buffer with the absolute value of every entry of
+ * `coeff`, narrowed to one byte per coefficient.
+ *
+ * coeff  - `height` rows of `width` contiguous coefficients (read four at a
+ *          time as 32-bit lanes).
+ * width  - transform block width; 4 and 8 have dedicated paths, anything else
+ *          is processed 16 coefficients at a time.
+ * height - number of rows.
+ * levels - points at the first data row of the output. Rows are
+ *          `width + TX_PAD_HOR` bytes apart, TX_PAD_TOP rows before it and
+ *          TX_PAD_BOTTOM rows after the data are zeroed as well.
+ */
+void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
+ const int height, uint8_t *const levels) {
+ const int stride = width + TX_PAD_HOR;
+ const __m128i zeros = _mm_setzero_si128();
+
+ // Zero the padding rows that precede `levels`. Stores are 16 bytes wide and
+ // the loop is do/while, so the final store may run past pre_buf_end into the
+ // first data rows; every byte of those rows is rewritten further down.
+ const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride;
+ uint8_t *pre_buf = levels - TX_PAD_TOP * stride;
+ uint8_t *pre_buf_end = pre_buf + pre_len;
+ do {
+ _mm_storeu_si128((__m128i *)(pre_buf), zeros);
+ pre_buf += 16;
+ } while (pre_buf < pre_buf_end);
+
+ // Zero the padding rows that follow the data. As above, the last 16-byte
+ // store may extend past bottom_buf_end.
+ // NOTE(review): assumes the allocation has up to 15 bytes of slack after the
+ // bottom padding - confirm against the caller's buffer size.
+ const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
+ uint8_t *bottom_buf = levels + stride * height;
+ uint8_t *bottom_buf_end = bottom_buf + bottom_len;
+ do {
+ _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
+ bottom_buf += 16;
+ } while (bottom_buf < bottom_buf_end);
+
+ int i = 0;
+ uint8_t *ls = levels;
+ const tran_low_t *cf = coeff;
+ if (width == 4) {
+ // Two rows (8 coefficients) per iteration.
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ // 32 -> 16 bit with signed saturation, then |x|, then 16 -> 8 bit with
+ // signed saturation.
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ // Interleave each 4-byte row with 4 zero bytes: row0, 0, row1, 0. This
+ // lands row1 at ls + stride only when stride is 8, i.e. it relies on
+ // TX_PAD_HOR (defined elsewhere) being 4.
+ const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
+ xx_storeu_128(ls, lsAB);
+ ls += (stride << 1);
+ cf += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 8) {
+ // One row per iteration. The 16-byte store writes 8 levels followed by 8
+ // zero bytes: the zeros cover the row's horizontal padding and spill into
+ // the next row, which the next iteration (or the bottom padding) overwrites.
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
+ xx_storeu_128(ls, absAB8);
+ ls += stride;
+ cf += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ // Wider blocks: 16 coefficients per inner iteration, so `width` is expected
+ // to be a multiple of 16 here.
+ do {
+ int j = 0;
+ do {
+ const __m128i coeffA = xx_loadu_128(cf);
+ const __m128i coeffB = xx_loadu_128(cf + 4);
+ const __m128i coeffC = xx_loadu_128(cf + 8);
+ const __m128i coeffD = xx_loadu_128(cf + 12);
+ const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
+ const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
+ const __m128i absAB = _mm_abs_epi16(coeffAB);
+ const __m128i absCD = _mm_abs_epi16(coeffCD);
+ const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
+ xx_storeu_128(ls + j, absABCD);
+ j += 16;
+ cf += 16;
+ } while (j < width);
+ // Clear the 4 bytes of horizontal padding that follow the row with a single
+ // 32-bit store.
+ *(int32_t *)(ls + width) = 0;
+ ls += stride;
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/av1/encoder/x86/error_intrin_avx2.c b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
new file mode 100644
index 0000000000..7d4f695854
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_intrin_avx2.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+// Loads 16 coefficients starting at coeff[offset] into *c as sixteen 16-bit
+// lanes. When tran_low_t is 32 bits wide the values are narrowed with signed
+// saturation; otherwise they are loaded as they are.
+static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
+                              __m256i *c) {
+  const __m256i *const src = (const __m256i *)(coeff + offset);
+
+  if (sizeof(tran_low_t) != 4) {
+    *c = _mm256_loadu_si256(src);
+    return;
+  }
+
+  const __m256i first8 = _mm256_loadu_si256(src);
+  const __m256i second8 = _mm256_loadu_si256(src + 1);
+  // The pack works within each 128-bit lane, which leaves the 64-bit groups
+  // ordered 0,2,1,3; the 0xD8 permute restores memory order.
+  *c = _mm256_permute4x64_epi64(_mm256_packs_epi32(first8, second8), 0xD8);
+}
+
+// Returns the sum of squared differences between `coeff` and `dqcoeff` over
+// `block_size` coefficients and stores the sum of squared `coeff` values in
+// *ssz. Coefficients are consumed 16 at a time.
+int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
+                             intptr_t block_size, int64_t *ssz) {
+  const __m256i zero = _mm256_setzero_si256();
+  // Four 64-bit partial sums each: squared error and squared coefficients.
+  __m256i err_acc = zero;
+  __m256i sq_acc = zero;
+  int idx;
+
+  for (idx = 0; idx < block_size; idx += 16) {
+    __m256i c16, dq16;
+    read_coeff(coeff, idx, &c16);
+    read_coeff(dqcoeff, idx, &dq16);
+
+    // madd squares the 16-bit lanes and adds neighbouring pairs into 32 bits.
+    const __m256i delta = _mm256_sub_epi16(dq16, c16);
+    const __m256i delta_sq = _mm256_madd_epi16(delta, delta);
+    const __m256i coeff_sq = _mm256_madd_epi16(c16, c16);
+
+    // Interleaving with zero widens every 32-bit sum to 64 bits before it is
+    // accumulated.
+    err_acc = _mm256_add_epi64(err_acc, _mm256_unpacklo_epi32(delta_sq, zero));
+    sq_acc = _mm256_add_epi64(sq_acc, _mm256_unpacklo_epi32(coeff_sq, zero));
+    err_acc = _mm256_add_epi64(err_acc, _mm256_unpackhi_epi32(delta_sq, zero));
+    sq_acc = _mm256_add_epi64(sq_acc, _mm256_unpackhi_epi32(coeff_sq, zero));
+  }
+
+  // Fold the upper 64 bits of each 128-bit lane onto the lower 64 bits.
+  err_acc = _mm256_add_epi64(err_acc, _mm256_srli_si256(err_acc, 8));
+  sq_acc = _mm256_add_epi64(sq_acc, _mm256_srli_si256(sq_acc, 8));
+
+  // Fold the two 128-bit lanes together; the total ends up in the low 64 bits.
+  const __m128i err_total = _mm_add_epi64(_mm256_castsi256_si128(err_acc),
+                                          _mm256_extractf128_si256(err_acc, 1));
+  const __m128i sq_total = _mm_add_epi64(_mm256_castsi256_si128(sq_acc),
+                                         _mm256_extractf128_si256(sq_acc, 1));
+
+  int64_t sse;
+  _mm_storel_epi64((__m128i *)(&sse), err_total);
+  _mm_storel_epi64((__m128i *)(ssz), sq_total);
+  _mm256_zeroupper();
+  return sse;
+}
diff --git a/third_party/aom/av1/encoder/x86/error_sse2.asm b/third_party/aom/av1/encoder/x86/error_sse2.asm
new file mode 100644
index 0000000000..72e9e22b18
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/error_sse2.asm
@@ -0,0 +1,79 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%define private_prefix av1
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t av1_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
+; int64_t *ssz)
+
+; Accumulates two 64-bit sums over `size` 16-bit coefficients: the squared
+; difference between dqc and uqc (returned) and the squared uqc values
+; (stored through ssz).
+; NOTE(review): mova is used for every load, so both arrays presumably have to
+; be mmsize-aligned - confirm against the callers.
+INIT_XMM sse2
+cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
+ pxor m4, m4 ; sse accumulator
+ pxor m6, m6 ; ssz accumulator
+ pxor m5, m5 ; dedicated zero register
+ ; Point both arrays at their ends and negate the count, so that sizeq runs
+ ; from -size up to 0 and the loop can terminate on the sign flag alone.
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ ; Each pass consumes 2*mmsize bytes (mmsize 16-bit coefficients) per array.
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ ; m0/m1: pairwise sums of squared differences, m2/m3: of squared coefficients
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ pmaddwd m2, m2
+ pmaddwd m3, m3
+ ; accumulate in 64bit
+ ; (interleaving with the zero register m5 widens each 32-bit sum to 64 bits)
+ punpckldq m7, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m7
+ punpckldq m7, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m7
+ punpckldq m7, m2, m5
+ paddq m4, m1
+ punpckhdq m2, m5
+ paddq m6, m7
+ punpckldq m7, m3, m5
+ paddq m6, m2
+ punpckhdq m3, m5
+ paddq m6, m7
+ paddq m6, m3
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ ; (m5 is reused as scratch from here on; it is no longer zero)
+ movhlps m5, m4
+ movhlps m7, m6
+ paddq m4, m5
+ paddq m6, m7
+%if ARCH_X86_64
+ ; 64-bit: the sse total is returned in rax, the ssz total is stored via sszq
+ movq rax, m4
+ movq [sszq], m6
+%else
+ ; 32-bit: fetch the ssz pointer from its stack slot, store the ssz total, then
+ ; return the 64-bit sse total split across edx:eax
+ mov eax, sszm
+ pshufd m5, m4, 0x1
+ movq [eax], m6
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
diff --git a/third_party/aom/av1/encoder/x86/hash_sse42.c b/third_party/aom/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000000..65fa463117
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte-boundary alignment issues
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+// Folds the bytes at `buf` into `crc`, sizeof(type) bytes per step, using the
+// CRC operation `op(crc, value)`. `buf` is advanced and `len` is decremented
+// in place; the loop stops as soon as fewer than sizeof(type) bytes remain.
+// `crc`, `buf` and `len` must be modifiable lvalues. The body is wrapped in
+// do { } while (0) so the macro expands to a single statement and stays safe
+// inside unbraced if/else; the source is read through a const pointer.
+#define CALC_CRC(op, crc, type, buf, len)      \
+  do {                                         \
+    while ((len) >= sizeof(type)) {            \
+      (crc) = op((crc), *(const type *)(buf)); \
+      (len) -= sizeof(type);                   \
+      (buf) += sizeof(type);                   \
+    }                                          \
+  } while (0)
+
+/**
+ * Calculates the 32-bit CRC of the `len` bytes starting at `p` with the
+ * SSE4.2 CRC32 instructions (polynomial is 0x11EDC6F41).
+ * The running value starts at 0xFFFFFFFF and is inverted before it is
+ * returned. `crc_calculator` is ignored by this implementation.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc32c_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                     size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Consume single bytes until buf reaches an ALIGN_SIZE boundary or the
+  // input runs out.
+  while (len > 0 && ((intptr_t)buf & ALIGN_MASK) != 0) {
+    crc = _mm_crc32_u8(crc, *buf);
+    len--;
+    buf++;
+  }
+
+  // Widest step first; every narrower step mops up what the previous one left.
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+
+  crc ^= 0xFFFFFFFF;
+  return crc;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
new file mode 100644
index 0000000000..777304ace7
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_block_error_intrin_sse2.c
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "av1/common/common.h"
+
+// Returns the sum of squared differences between `coeff` and `dqcoeff` over
+// `block_size` coefficients and stores the sum of squared `coeff` values in
+// *ssz. Both sums are rounded and scaled down by 2 * (bps - 8) bits.
+// Coefficients are handled 8 at a time with aligned loads: groups whose
+// values all lie in [-0x4000, 0x3fff] take the 16-bit SIMD path, any other
+// group is summed in scalar 64-bit arithmetic.
+int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
+                                    intptr_t block_size, int64_t *ssz,
+                                    int bps) {
+  const int shift = 2 * (bps - 8);
+  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
+  // Bounds of the range that is safe for the 16-bit path.
+  const __m128i upper = _mm_set1_epi32(0x3fff);
+  const __m128i lower = _mm_set1_epi32(0xffffc000);
+  int64_t error = 0, sqcoeff = 0;
+  int i;
+
+  for (i = 0; i < block_size; i += 8) {
+    // Load the data into xmm registers
+    const __m128i c_lo = _mm_load_si128((__m128i *)(coeff + i));
+    const __m128i c_hi = _mm_load_si128((__m128i *)(coeff + i + 4));
+    const __m128i dq_lo = _mm_load_si128((__m128i *)(dqcoeff + i));
+    const __m128i dq_hi = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
+
+    // Per-lane mask of values that fall outside [lower, upper].
+    const __m128i bad_c_lo = _mm_xor_si128(_mm_cmpgt_epi32(c_lo, upper),
+                                           _mm_cmplt_epi32(c_lo, lower));
+    const __m128i bad_c_hi = _mm_xor_si128(_mm_cmpgt_epi32(c_hi, upper),
+                                           _mm_cmplt_epi32(c_hi, lower));
+    const __m128i bad_dq_lo = _mm_xor_si128(_mm_cmpgt_epi32(dq_lo, upper),
+                                            _mm_cmplt_epi32(dq_lo, lower));
+    const __m128i bad_dq_hi = _mm_xor_si128(_mm_cmpgt_epi32(dq_hi, upper),
+                                            _mm_cmplt_epi32(dq_hi, lower));
+    const int any_bad = _mm_movemask_epi8(_mm_or_si128(
+        _mm_or_si128(bad_c_lo, bad_c_hi), _mm_or_si128(bad_dq_lo, bad_dq_hi)));
+
+    if (any_bad) {
+      // At least one value is too large for the 16-bit path: go scalar.
+      int j;
+      for (j = 0; j < 8; j++) {
+        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
+        error += diff * diff;
+        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
+      }
+      continue;
+    }
+
+    uint32_t lanes[4];
+    const __m128i c16 = _mm_packs_epi32(c_lo, c_hi);
+    const __m128i dq16 = _mm_packs_epi32(dq_lo, dq_hi);
+    const __m128i diff16 = _mm_sub_epi16(c16, dq16);
+
+    // madd squares the 16-bit lanes and adds neighbouring pairs into 32 bits.
+    _mm_storeu_si128((__m128i *)lanes, _mm_madd_epi16(diff16, diff16));
+    error = error + lanes[0] + lanes[1] + lanes[2] + lanes[3];
+    _mm_storeu_si128((__m128i *)lanes, _mm_madd_epi16(c16, c16));
+    sqcoeff += lanes[0] + lanes[1] + lanes[2] + lanes[3];
+  }
+  assert(error >= 0 && sqcoeff >= 0);
+  error = (error + rounding) >> shift;
+  sqcoeff = (sqcoeff + rounding) >> shift;
+
+  *ssz = sqcoeff;
+  return error;
+}
diff --git a/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000000..535485ae8e
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1783 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/av1_rtcd.h"
+
+#include "av1/common/av1_txfm.h"
+#include "av1/common/x86/highbd_txfm_utility_sse4.h"
+#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#include "av1/encoder/x86/av1_txfm1d_sse4.h"
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// Loads a 4x4 block of 16-bit samples into in[0..3] (one row per register),
+// widens every sample to 32 bits and shifts it left by `shift`.
+// flipud reverses the order of the rows, fliplr reverses the samples within
+// each row.
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  int r;
+
+  // Fetch the four rows (64 bits each), bottom-up when flipud is set.
+  for (r = 0; r < 4; ++r) {
+    const int src_row = flipud ? 3 - r : r;
+    in[r] = _mm_loadl_epi64((const __m128i *)(input + src_row * stride));
+  }
+
+  for (r = 0; r < 4; ++r) {
+    __m128i row = in[r];
+    // 0x1b reverses the four low 16-bit lanes.
+    if (fliplr) row = _mm_shufflelo_epi16(row, 0x1b);
+    in[r] = _mm_slli_epi32(_mm_cvtepi16_epi32(row), shift);
+  }
+}
+
+// In-place 4-point forward DCT over in[0..3], followed by a 4x4 transpose.
+// Every register is processed lane-wise, so four independent 4-point
+// transforms run in parallel; because of the trailing transpose, calling the
+// function twice applies the transform along both dimensions.
+// `bit` selects the cosine table (via cospi_arr) and is also the rounding
+// shift applied after each multiply-add.
+//
+// We only use stage-2 bit;
+// shift[0] is used in load_buffer_4x4()
+// shift[1] is used in txfm_func_col()
+// shift[2] is used in txfm_func_row()
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i s0, s1, s2, s3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ // Butterfly stage: sums and differences of the mirrored inputs.
+ s0 = _mm_add_epi32(in[0], in[3]);
+ s1 = _mm_add_epi32(in[1], in[2]);
+ s2 = _mm_sub_epi32(in[1], in[2]);
+ s3 = _mm_sub_epi32(in[0], in[3]);
+
+ // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+ // u0 = round((s0 + s1) * cospi32), u2 = round((s0 - s1) * cospi32)
+ u0 = _mm_mullo_epi32(s0, cospi32);
+ u1 = _mm_mullo_epi32(s1, cospi32);
+ u2 = _mm_add_epi32(u0, u1);
+ v0 = _mm_sub_epi32(u0, u1);
+
+ u3 = _mm_add_epi32(u2, rnding);
+ v1 = _mm_add_epi32(v0, rnding);
+
+ u0 = _mm_srai_epi32(u3, bit);
+ u2 = _mm_srai_epi32(v1, bit);
+
+ // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+ // u1 = round(s2 * cospi48 + s3 * cospi16)
+ v0 = _mm_mullo_epi32(s2, cospi48);
+ v1 = _mm_mullo_epi32(s3, cospi16);
+ v2 = _mm_add_epi32(v0, v1);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u1 = _mm_srai_epi32(v3, bit);
+
+ // u3 = round(s3 * cospi48 - s2 * cospi16)
+ v0 = _mm_mullo_epi32(s2, cospi16);
+ v1 = _mm_mullo_epi32(s3, cospi48);
+ v2 = _mm_sub_epi32(v1, v0);
+
+ v3 = _mm_add_epi32(v2, rnding);
+ u3 = _mm_srai_epi32(v3, bit);
+
+ // Note: shift[1] and shift[2] are zeros
+
+ // Transpose 4x4 32-bit
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+// Stores the four registers res[0..3] to `output` as 16 consecutive 32-bit
+// values. The stores are aligned, so `output` must be 16-byte aligned.
+static INLINE void write_buffer_4x4(__m128i *res, int32_t *output) {
+  int row;
+  for (row = 0; row < 4; ++row) {
+    _mm_store_si128((__m128i *)(output + row * 4), res[row]);
+  }
+}
+
+// In-place 4-point forward ADST over in[0..3], followed by a 4x4 transpose.
+// Every register is processed lane-wise, so four independent transforms run
+// in parallel. `bit` selects the sine table (via sinpi_arr) and is also the
+// rounding shift applied to the four results.
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+ const int32_t *sinpi = sinpi_arr(bit);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ const __m128i sinpi1 = _mm_set1_epi32((int)sinpi[1]);
+ const __m128i sinpi2 = _mm_set1_epi32((int)sinpi[2]);
+ const __m128i sinpi3 = _mm_set1_epi32((int)sinpi[3]);
+ const __m128i sinpi4 = _mm_set1_epi32((int)sinpi[4]);
+ __m128i t;
+ __m128i s0, s1, s2, s3, s4, s5, s6, s7;
+ __m128i x0, x1, x2, x3;
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+
+ // Individual products; s7 = in[0] + in[1] - in[3] is multiplied later.
+ s0 = _mm_mullo_epi32(in[0], sinpi1);
+ s1 = _mm_mullo_epi32(in[0], sinpi4);
+ s2 = _mm_mullo_epi32(in[1], sinpi2);
+ s3 = _mm_mullo_epi32(in[1], sinpi1);
+ s4 = _mm_mullo_epi32(in[2], sinpi3);
+ s5 = _mm_mullo_epi32(in[3], sinpi4);
+ s6 = _mm_mullo_epi32(in[3], sinpi2);
+ t = _mm_add_epi32(in[0], in[1]);
+ s7 = _mm_sub_epi32(t, in[3]);
+
+ // x0 = in0*sinpi1 + in1*sinpi2 + in3*sinpi4
+ // x1 = (in0 + in1 - in3) * sinpi3
+ // x2 = in0*sinpi4 - in1*sinpi1 + in3*sinpi2
+ // x3 = in2*sinpi3
+ t = _mm_add_epi32(s0, s2);
+ x0 = _mm_add_epi32(t, s5);
+ x1 = _mm_mullo_epi32(s7, sinpi3);
+ t = _mm_sub_epi32(s1, s3);
+ x2 = _mm_add_epi32(t, s6);
+ x3 = s4;
+
+ // Outputs before rounding: x0 + x3, x1, x2 - x3, x2 - x0 + x3.
+ s0 = _mm_add_epi32(x0, x3);
+ s1 = x1;
+ s2 = _mm_sub_epi32(x2, x3);
+ t = _mm_sub_epi32(x2, x0);
+ s3 = _mm_add_epi32(t, x3);
+
+ // Round and shift every output down by `bit`.
+ u0 = _mm_add_epi32(s0, rnding);
+ u0 = _mm_srai_epi32(u0, bit);
+
+ u1 = _mm_add_epi32(s1, rnding);
+ u1 = _mm_srai_epi32(u1, bit);
+
+ u2 = _mm_add_epi32(s2, rnding);
+ u2 = _mm_srai_epi32(u2, bit);
+
+ u3 = _mm_add_epi32(s3, rnding);
+ u3 = _mm_srai_epi32(u3, bit);
+
+ // Transpose the 4x4 block of 32-bit results back into in[0..3].
+ v0 = _mm_unpacklo_epi32(u0, u1);
+ v1 = _mm_unpackhi_epi32(u0, u1);
+ v2 = _mm_unpacklo_epi32(u2, u3);
+ v3 = _mm_unpackhi_epi32(u2, u3);
+
+ in[0] = _mm_unpacklo_epi64(v0, v2);
+ in[1] = _mm_unpackhi_epi64(v0, v2);
+ in[2] = _mm_unpacklo_epi64(v1, v3);
+ in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+// Forward 4x4 2D transform of the 16-bit block at `input` (rows
+// `input_stride` samples apart); the 16 results are written to `coeff`.
+// Nine DCT / ADST / FLIPADST combinations are handled. Any other tx_type
+// trips the assert and, when asserts are compiled out, leaves `coeff`
+// untouched. `bd` is unused.
+void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                               int input_stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  // The FLIPADST variants are realised as an ADST on input that is flipped
+  // while it is loaded.
+  int flip_ud = 0, flip_lr = 0;
+  int first_is_adst = 0, second_is_adst = 0;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case ADST_DCT: first_is_adst = 1; break;
+    case DCT_ADST: second_is_adst = 1; break;
+    case ADST_ADST:
+      first_is_adst = 1;
+      second_is_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      flip_ud = 1;
+      first_is_adst = 1;
+      break;
+    case DCT_FLIPADST:
+      flip_lr = 1;
+      second_is_adst = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      flip_ud = 1;
+      flip_lr = 1;
+      first_is_adst = 1;
+      second_is_adst = 1;
+      break;
+    case ADST_FLIPADST:
+      flip_lr = 1;
+      first_is_adst = 1;
+      second_is_adst = 1;
+      break;
+    case FLIPADST_ADST:
+      flip_ud = 1;
+      first_is_adst = 1;
+      second_is_adst = 1;
+      break;
+    default: assert(0); return;
+  }
+
+  const int8_t *shift = fwd_txfm_shift_ls[TX_4X4];
+  const int txw_idx = get_txw_idx(TX_4X4);
+  const int txh_idx = get_txh_idx(TX_4X4);
+  const int col_bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  const int row_bit = fwd_cos_bit_row[txw_idx][txh_idx];
+  __m128i in[4];
+
+  load_buffer_4x4(input, in, input_stride, flip_ud, flip_lr, shift[0]);
+
+  // First pass uses the column cosine bit, second pass the row cosine bit;
+  // each kernel transposes its result, so the passes cover both dimensions.
+  if (first_is_adst) {
+    fadst4x4_sse4_1(in, col_bit);
+  } else {
+    fdct4x4_sse4_1(in, col_bit);
+  }
+  if (second_is_adst) {
+    fadst4x4_sse4_1(in, row_bit);
+  } else {
+    fdct4x4_sse4_1(in, row_bit);
+  }
+
+  write_buffer_4x4(in, coeff);
+}
+
+// Loads an 8x8 block of 16-bit samples, widens every sample to 32 bits and
+// shifts it left by `shift`. On return in[2 * r] holds samples 0..3 and
+// in[2 * r + 1] samples 4..7 of row r, so `in` must provide 16 registers.
+// flipud reverses the order of the rows; fliplr is applied per row through
+// mm_reverse_epi16 (defined elsewhere - presumably reverses the eight 16-bit
+// lanes). The row loads are aligned, so every row of `input` must start on a
+// 16-byte boundary.
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+ int stride, int flipud, int fliplr,
+ int shift) {
+ __m128i u;
+ // Stage 1: one full row of eight 16-bit samples per register, in[0..7].
+ if (!flipud) {
+ in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ } else {
+ in[0] = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ in[1] = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ in[2] = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ in[3] = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ in[4] = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ in[6] = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ in[7] = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ }
+
+ if (fliplr) {
+ in[0] = mm_reverse_epi16(in[0]);
+ in[1] = mm_reverse_epi16(in[1]);
+ in[2] = mm_reverse_epi16(in[2]);
+ in[3] = mm_reverse_epi16(in[3]);
+ in[4] = mm_reverse_epi16(in[4]);
+ in[5] = mm_reverse_epi16(in[5]);
+ in[6] = mm_reverse_epi16(in[6]);
+ in[7] = mm_reverse_epi16(in[7]);
+ }
+
+ // Stage 2: widen in place. Each 16-bit row expands into two 32-bit
+ // registers, so the work runs back to front: rows 4..7 go to in[8..15]
+ // first, then rows 3, 2, 1, 0 are expanded into the slots freed below them.
+ // Every source register is therefore read before it is overwritten.
+ u = _mm_unpackhi_epi64(in[4], in[4]);
+ in[8] = _mm_cvtepi16_epi32(in[4]);
+ in[9] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[5], in[5]);
+ in[10] = _mm_cvtepi16_epi32(in[5]);
+ in[11] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[6], in[6]);
+ in[12] = _mm_cvtepi16_epi32(in[6]);
+ in[13] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[7], in[7]);
+ in[14] = _mm_cvtepi16_epi32(in[7]);
+ in[15] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[3], in[3]);
+ in[6] = _mm_cvtepi16_epi32(in[3]);
+ in[7] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[2], in[2]);
+ in[4] = _mm_cvtepi16_epi32(in[2]);
+ in[5] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[1], in[1]);
+ in[2] = _mm_cvtepi16_epi32(in[1]);
+ in[3] = _mm_cvtepi16_epi32(u);
+
+ u = _mm_unpackhi_epi64(in[0], in[0]);
+ in[0] = _mm_cvtepi16_epi32(in[0]);
+ in[1] = _mm_cvtepi16_epi32(u);
+
+ // Stage 3: apply the left shift to all 16 registers.
+ in[0] = _mm_slli_epi32(in[0], shift);
+ in[1] = _mm_slli_epi32(in[1], shift);
+ in[2] = _mm_slli_epi32(in[2], shift);
+ in[3] = _mm_slli_epi32(in[3], shift);
+ in[4] = _mm_slli_epi32(in[4], shift);
+ in[5] = _mm_slli_epi32(in[5], shift);
+ in[6] = _mm_slli_epi32(in[6], shift);
+ in[7] = _mm_slli_epi32(in[7], shift);
+
+ in[8] = _mm_slli_epi32(in[8], shift);
+ in[9] = _mm_slli_epi32(in[9], shift);
+ in[10] = _mm_slli_epi32(in[10], shift);
+ in[11] = _mm_slli_epi32(in[11], shift);
+ in[12] = _mm_slli_epi32(in[12], shift);
+ in[13] = _mm_slli_epi32(in[13], shift);
+ in[14] = _mm_slli_epi32(in[14], shift);
+ in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+// Rounds and arithmetically shifts right by `shift` every 32-bit lane of
+// in[0..15]: x -> (x + (1 << (shift - 1))) >> shift.
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+  const __m128i half = _mm_set1_epi32(1 << (shift - 1));
+  int k;
+
+  for (k = 0; k < 16; ++k) {
+    in[k] = _mm_srai_epi32(_mm_add_epi32(in[k], half), shift);
+  }
+}
+
+// Stores the sixteen registers res[0..15] to `output` as 64 consecutive
+// 32-bit values. The stores are aligned, so `output` must be 16-byte aligned.
+static INLINE void write_buffer_8x8(const __m128i *res, int32_t *output) {
+  int k;
+  for (k = 0; k < 16; ++k) {
+    _mm_store_si128((__m128i *)(output + k * 4), res[k]);
+  }
+}
+
+// Writes res[0..15] as eight rows of eight 32-bit values, with consecutive
+// rows `stride` elements apart: res[2 * r] supplies columns 0..3 of row r and
+// res[2 * r + 1] columns 4..7. Unaligned stores are used.
+static INLINE void write_buffer_16x8(const __m128i *res, int32_t *output,
+                                     const int stride) {
+  int r;
+  for (r = 0; r < 8; ++r) {
+    int32_t *const row = output + (stride * r);
+    _mm_storeu_si128((__m128i *)(row), res[2 * r]);
+    _mm_storeu_si128((__m128i *)(row + 4), res[2 * r + 1]);
+  }
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+ __m128i u[8], v[8];
+
+ // Even 8 points 0, 2, ..., 14
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[0], in[14]);
+ v[7] = _mm_sub_epi32(in[0], in[14]); // v[7]
+ u[1] = _mm_add_epi32(in[2], in[12]);
+ u[6] = _mm_sub_epi32(in[2], in[12]);
+ u[2] = _mm_add_epi32(in[4], in[10]);
+ u[5] = _mm_sub_epi32(in[4], in[10]);
+ u[3] = _mm_add_epi32(in[6], in[8]);
+ v[4] = _mm_sub_epi32(in[6], in[8]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[2] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[14] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[10] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[6] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[0] = u[0]; // buf0[0]
+ out[8] = u[1]; // buf0[1]
+ out[4] = u[2]; // buf0[2]
+ out[12] = u[3]; // buf0[3]
+
+ // Odd 8 points: 1, 3, ..., 15
+ // stage 0
+ // stage 1
+ u[0] = _mm_add_epi32(in[1], in[15]);
+ v[7] = _mm_sub_epi32(in[1], in[15]); // v[7]
+ u[1] = _mm_add_epi32(in[3], in[13]);
+ u[6] = _mm_sub_epi32(in[3], in[13]);
+ u[2] = _mm_add_epi32(in[5], in[11]);
+ u[5] = _mm_sub_epi32(in[5], in[11]);
+ u[3] = _mm_add_epi32(in[7], in[9]);
+ v[4] = _mm_sub_epi32(in[7], in[9]); // v[4]
+
+ // stage 2
+ v[0] = _mm_add_epi32(u[0], u[3]);
+ v[3] = _mm_sub_epi32(u[0], u[3]);
+ v[1] = _mm_add_epi32(u[1], u[2]);
+ v[2] = _mm_sub_epi32(u[1], u[2]);
+
+ v[5] = _mm_mullo_epi32(u[5], cospim32);
+ v[6] = _mm_mullo_epi32(u[6], cospi32);
+ v[5] = _mm_add_epi32(v[5], v[6]);
+ v[5] = _mm_add_epi32(v[5], rnding);
+ v[5] = _mm_srai_epi32(v[5], bit);
+
+ u[0] = _mm_mullo_epi32(u[5], cospi32);
+ v[6] = _mm_mullo_epi32(u[6], cospim32);
+ v[6] = _mm_sub_epi32(u[0], v[6]);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ // stage 3
+ // type 0
+ v[0] = _mm_mullo_epi32(v[0], cospi32);
+ v[1] = _mm_mullo_epi32(v[1], cospi32);
+ u[0] = _mm_add_epi32(v[0], v[1]);
+ u[0] = _mm_add_epi32(u[0], rnding);
+ u[0] = _mm_srai_epi32(u[0], bit);
+
+ u[1] = _mm_sub_epi32(v[0], v[1]);
+ u[1] = _mm_add_epi32(u[1], rnding);
+ u[1] = _mm_srai_epi32(u[1], bit);
+
+ // type 1
+ v[0] = _mm_mullo_epi32(v[2], cospi48);
+ v[1] = _mm_mullo_epi32(v[3], cospi16);
+ u[2] = _mm_add_epi32(v[0], v[1]);
+ u[2] = _mm_add_epi32(u[2], rnding);
+ u[2] = _mm_srai_epi32(u[2], bit);
+
+ v[0] = _mm_mullo_epi32(v[2], cospi16);
+ v[1] = _mm_mullo_epi32(v[3], cospi48);
+ u[3] = _mm_sub_epi32(v[1], v[0]);
+ u[3] = _mm_add_epi32(u[3], rnding);
+ u[3] = _mm_srai_epi32(u[3], bit);
+
+ u[4] = _mm_add_epi32(v[4], v[5]);
+ u[5] = _mm_sub_epi32(v[4], v[5]);
+ u[6] = _mm_sub_epi32(v[7], v[6]);
+ u[7] = _mm_add_epi32(v[7], v[6]);
+
+ // stage 4
+ // stage 5
+ v[0] = _mm_mullo_epi32(u[4], cospi56);
+ v[1] = _mm_mullo_epi32(u[7], cospi8);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[3] = _mm_srai_epi32(v[0], bit); // buf0[4]
+
+ v[0] = _mm_mullo_epi32(u[4], cospi8);
+ v[1] = _mm_mullo_epi32(u[7], cospi56);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[15] = _mm_srai_epi32(v[0], bit); // buf0[7]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi24);
+ v[1] = _mm_mullo_epi32(u[6], cospi40);
+ v[0] = _mm_add_epi32(v[0], v[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[11] = _mm_srai_epi32(v[0], bit); // buf0[5]
+
+ v[0] = _mm_mullo_epi32(u[5], cospi40);
+ v[1] = _mm_mullo_epi32(u[6], cospi24);
+ v[0] = _mm_sub_epi32(v[1], v[0]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ out[7] = _mm_srai_epi32(v[0], bit); // buf0[6]
+
+ out[1] = u[0]; // buf0[0]
+ out[9] = u[1]; // buf0[1]
+ out[5] = u[2]; // buf0[2]
+ out[13] = u[3]; // buf0[3]
+}
+/* Forward 8-point ADST (per the function name) over an 8x8 block of 32-bit
+ * values, written with SSE4.1 integer intrinsics.
+ *
+ * Layout, as fixed by the indexing below: in[] and out[] each hold 16
+ * vectors, and row r of the block is the pair in[2 * r], in[2 * r + 1].
+ * The transform runs down the rows, independently in every 32-bit lane;
+ * the loop body runs once for the even vectors (col = 0) and once for the
+ * odd vectors (col = 1).
+ *
+ * in      - 16 input vectors; only read.
+ * out     - 16 output vectors; every element is written.
+ * bit     - selects the cosine table through cospi_arr(bit) and is the
+ *           arithmetic right shift applied after each multiply step, with
+ *           1 << (bit - 1) added first.
+ * col_num - unused; explicitly discarded.
+ */
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ (void)(col_num);
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // rounding offset for >> bit
+ const __m128i zero = _mm_setzero_si128(); // used to negate via 0 - x
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x, y;
+ int col;
+
+ // Note:
+ // Even column: 0, 2, ..., 14
+ // Odd column: 1, 3, ..., 15
+ // one even column plus one odd column constructs one row (8 coeffs)
+ // total we have 8 rows (8x8).
+ for (col = 0; col < 2; ++col) {
+ // stage 0
+ // stage 1: gather the eight rows in permuted order; rows 7, 3, 1, 5 negated
+ u0 = in[2 * 0 + col];
+ u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+ u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+ u3 = in[2 * 4 + col];
+ u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+ u5 = in[2 * 6 + col];
+ u6 = in[2 * 2 + col];
+ u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+ // stage 2: (u2, u3) and (u6, u7) become rounded cospi[32]-scaled sum/diff
+ v0 = u0;
+ v1 = u1;
+
+ x = _mm_mullo_epi32(u2, cospi32);
+ y = _mm_mullo_epi32(u3, cospi32);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ v3 = _mm_sub_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ v4 = u4;
+ v5 = u5;
+
+ x = _mm_mullo_epi32(u6, cospi32);
+ y = _mm_mullo_epi32(u7, cospi32);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ v7 = _mm_sub_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 3: add/subtract elements two apart within each group of four
+ u0 = _mm_add_epi32(v0, v2);
+ u1 = _mm_add_epi32(v1, v3);
+ u2 = _mm_sub_epi32(v0, v2);
+ u3 = _mm_sub_epi32(v1, v3);
+ u4 = _mm_add_epi32(v4, v6);
+ u5 = _mm_add_epi32(v5, v7);
+ u6 = _mm_sub_epi32(v4, v6);
+ u7 = _mm_sub_epi32(v5, v7);
+
+ // stage 4: u0..u3 pass through; (u4, u5), (u6, u7) mixed with cospi[16]/[48]
+ v0 = u0;
+ v1 = u1;
+ v2 = u2;
+ v3 = u3;
+
+ x = _mm_mullo_epi32(u4, cospi16);
+ y = _mm_mullo_epi32(u5, cospi48);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi48);
+ y = _mm_mullo_epi32(u5, cospim16);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospim48);
+ y = _mm_mullo_epi32(u7, cospi16);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi16);
+ y = _mm_mullo_epi32(u7, cospi48);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 5: add/subtract elements four apart
+ u0 = _mm_add_epi32(v0, v4);
+ u1 = _mm_add_epi32(v1, v5);
+ u2 = _mm_add_epi32(v2, v6);
+ u3 = _mm_add_epi32(v3, v7);
+ u4 = _mm_sub_epi32(v0, v4);
+ u5 = _mm_sub_epi32(v1, v5);
+ u6 = _mm_sub_epi32(v2, v6);
+ u7 = _mm_sub_epi32(v3, v7);
+
+ // stage 6: each adjacent pair mixed with its own pair of cosine constants
+ x = _mm_mullo_epi32(u0, cospi4);
+ y = _mm_mullo_epi32(u1, cospi60);
+ v0 = _mm_add_epi32(x, y);
+ v0 = _mm_add_epi32(v0, rnding);
+ v0 = _mm_srai_epi32(v0, bit);
+
+ x = _mm_mullo_epi32(u0, cospi60);
+ y = _mm_mullo_epi32(u1, cospim4);
+ v1 = _mm_add_epi32(x, y);
+ v1 = _mm_add_epi32(v1, rnding);
+ v1 = _mm_srai_epi32(v1, bit);
+
+ x = _mm_mullo_epi32(u2, cospi20);
+ y = _mm_mullo_epi32(u3, cospi44);
+ v2 = _mm_add_epi32(x, y);
+ v2 = _mm_add_epi32(v2, rnding);
+ v2 = _mm_srai_epi32(v2, bit);
+
+ x = _mm_mullo_epi32(u2, cospi44);
+ y = _mm_mullo_epi32(u3, cospim20);
+ v3 = _mm_add_epi32(x, y);
+ v3 = _mm_add_epi32(v3, rnding);
+ v3 = _mm_srai_epi32(v3, bit);
+
+ x = _mm_mullo_epi32(u4, cospi36);
+ y = _mm_mullo_epi32(u5, cospi28);
+ v4 = _mm_add_epi32(x, y);
+ v4 = _mm_add_epi32(v4, rnding);
+ v4 = _mm_srai_epi32(v4, bit);
+
+ x = _mm_mullo_epi32(u4, cospi28);
+ y = _mm_mullo_epi32(u5, cospim36);
+ v5 = _mm_add_epi32(x, y);
+ v5 = _mm_add_epi32(v5, rnding);
+ v5 = _mm_srai_epi32(v5, bit);
+
+ x = _mm_mullo_epi32(u6, cospi52);
+ y = _mm_mullo_epi32(u7, cospi12);
+ v6 = _mm_add_epi32(x, y);
+ v6 = _mm_add_epi32(v6, rnding);
+ v6 = _mm_srai_epi32(v6, bit);
+
+ x = _mm_mullo_epi32(u6, cospi12);
+ y = _mm_mullo_epi32(u7, cospim52);
+ v7 = _mm_add_epi32(x, y);
+ v7 = _mm_add_epi32(v7, rnding);
+ v7 = _mm_srai_epi32(v7, bit);
+
+ // stage 7: store the results to the output rows in permuted order
+ out[2 * 0 + col] = v1;
+ out[2 * 1 + col] = v6;
+ out[2 * 2 + col] = v3;
+ out[2 * 3 + col] = v4;
+ out[2 * 4 + col] = v5;
+ out[2 * 5 + col] = v2;
+ out[2 * 6 + col] = v7;
+ out[2 * 7 + col] = v0;
+ }
+}
+
+// 2-D forward transform of one 8x8 block (SSE4.1).
+//
+// input   - 16-bit samples; consecutive rows are `stride` elements apart.
+// coeff   - destination handed to write_buffer_8x8().
+// stride  - row pitch of `input`, in elements.
+// tx_type - picks DCT or ADST for each of the two passes and whether the
+//           block is flipped while loading. Types not listed in the switch
+//           hit assert(0) and nothing is written to `coeff`.
+// bd      - unused.
+void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
+                               TX_TYPE tx_type, int bd) {
+  __m128i in[16], out[16];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X8];
+  const int txw_idx = get_txw_idx(TX_8X8);
+  const int txh_idx = get_txh_idx(TX_8X8);
+  // Per-type configuration; the initial values describe DCT_DCT.
+  int col_adst = 0;  // first pass: fadst8x8 instead of fdct8x8
+  int row_adst = 0;  // second pass: fadst8x8 instead of fdct8x8
+  int flipud = 0;
+  int fliplr = 0;
+  (void)bd;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case ADST_DCT: col_adst = 1; break;
+    case DCT_ADST: row_adst = 1; break;
+    case ADST_ADST:
+      col_adst = 1;
+      row_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      col_adst = 1;
+      flipud = 1;
+      break;
+    case DCT_FLIPADST:
+      row_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      col_adst = 1;
+      row_adst = 1;
+      flipud = 1;
+      fliplr = 1;
+      break;
+    case ADST_FLIPADST:
+      col_adst = 1;
+      row_adst = 1;
+      fliplr = 1;
+      break;
+    case FLIPADST_ADST:
+      col_adst = 1;
+      row_adst = 1;
+      flipud = 1;
+      break;
+    default:
+      // Unsupported type: same as the per-case layout, no work is done.
+      assert(0);
+      return;
+  }
+
+  load_buffer_8x8(input, in, stride, flipud, fliplr, shift[0]);
+
+  // First pass, then the intermediate rounding shift.
+  if (col_adst) {
+    fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+  } else {
+    fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 0);
+  }
+  col_txfm_8x8_rounding(out, -shift[1]);
+  transpose_8x8(out, in);
+
+  // Second pass on the transposed data, then transpose back and store.
+  if (row_adst) {
+    fadst8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+  } else {
+    fdct8x8_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], 0);
+  }
+  transpose_8x8(out, in);
+  write_buffer_8x8(in, coeff);
+}
+
+// Hybrid Transform 16x16
+
+// Re-pack four 16-vector groups stored back to back in `in` (64 vectors)
+// into 16 rows of four vectors each in `out`.
+//
+// Each output row takes two vectors from one group followed by the two
+// vectors at the same position in the group 16 vectors later.
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+  int row;
+
+  for (row = 0; row < 16; ++row) {
+    // Rows 0..7 read from in[2 * row]; rows 8..15 start 16 vectors further
+    // on, skipping the group that already supplied the right half of the
+    // upper rows.
+    const int src = 2 * row + (row < 8 ? 0 : 16);
+    __m128i *const dst = out + 4 * row;
+
+    dst[0] = in[src];
+    dst[1] = in[src + 1];
+    dst[2] = in[src + 16];
+    dst[3] = in[src + 17];
+  }
+}
+
+// Load a 16x16 block of 16-bit samples as four 8x8 quadrants and re-pack
+// them into `out` (64 vectors) with convert_8x8_to_16x16().
+//
+// Flipping is done in two parts: the quadrants trade places here, and the
+// same flipud/fliplr flags are forwarded to load_buffer_8x8() for each
+// quadrant.
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  __m128i in[64];
+  // quadrant[r][c]: r = 0 upper / 1 lower rows, c = 0 left / 1 right columns.
+  const int16_t *const quadrant[2][2] = {
+    { input, input + 8 },
+    { input + 8 * stride, input + 8 * stride + 8 },
+  };
+  // Which source quadrant ends up in the top-left slot after flipping.
+  const int r = flipud ? 1 : 0;
+  const int c = fliplr ? 1 : 0;
+
+  // first 8 columns
+  load_buffer_8x8(quadrant[r][c], &in[0], stride, flipud, fliplr, shift);
+  load_buffer_8x8(quadrant[!r][c], &in[32], stride, flipud, fliplr, shift);
+
+  // second 8 columns
+  load_buffer_8x8(quadrant[r][!c], &in[16], stride, flipud, fliplr, shift);
+  load_buffer_8x8(quadrant[!r][!c], &in[48], stride, flipud, fliplr, shift);
+
+  convert_8x8_to_16x16(in, out);
+}
+
+// Load an 8-wide, 16-tall block of 16-bit samples as two stacked 8x8
+// halves, the first stored at `out` and the second at `out + 16`.
+//
+// With flipud set the two halves trade places; both flags are also passed
+// on to load_buffer_8x8() for each half.
+static INLINE void load_buffer_8x16(const int16_t *input, __m128i *out,
+                                    int stride, int flipud, int fliplr,
+                                    int shift) {
+  const int16_t *const upper = input;
+  const int16_t *const lower = input + 8 * stride;
+  const int16_t *const first = flipud ? lower : upper;
+  const int16_t *const second = flipud ? upper : lower;
+
+  load_buffer_8x8(first, out, stride, flipud, fliplr, shift);
+  load_buffer_8x8(second, out + 16, stride, flipud, fliplr, shift);
+}
+/* Forward 16-point DCT (per the function name) applied down the 16 rows of
+ * a buffer whose rows are col_num vectors wide, using SSE4.1 intrinsics.
+ *
+ * The element at row r, vector column c is in[r * col_num + c]. Each loop
+ * iteration transforms one vector column, handling its 32-bit lanes
+ * independently.
+ *
+ * in      - input vectors, indexed up to 15 * col_num + (col_num - 1);
+ *           only read.
+ * out     - output vectors in the same row/column layout; the row that
+ *           receives each result is set by the final block of stores.
+ * bit     - selects the cosine table through cospi_arr(bit) and is the
+ *           arithmetic right shift applied after each multiply step, with
+ *           1 << (bit - 1) added first.
+ * col_num - number of vector columns to process, also the row pitch.
+ */
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int col_num) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+ const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+ const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+ const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+ const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+ const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+ const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+ const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // rounding offset for >> bit
+ __m128i u[16], v[16], x;
+ int col;
+
+ // Calculate the column 0, 1, 2, 3
+ for (col = 0; col < col_num; ++col) {
+ // stage 0
+ // stage 1: sum and difference of mirrored row pairs (r, 15 - r)
+ u[0] = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+ u[1] = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+ u[2] = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+ u[3] = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+ u[4] = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+ u[5] = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+ u[6] = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[9] = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+ u[7] = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+ u[8] = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+ // stage 2: mirrored sum/diff on u[0..7]; u[10..13] scaled by +/-cospi[32]
+ v[0] = _mm_add_epi32(u[0], u[7]);
+ v[7] = _mm_sub_epi32(u[0], u[7]);
+ v[1] = _mm_add_epi32(u[1], u[6]);
+ v[6] = _mm_sub_epi32(u[1], u[6]);
+ v[2] = _mm_add_epi32(u[2], u[5]);
+ v[5] = _mm_sub_epi32(u[2], u[5]);
+ v[3] = _mm_add_epi32(u[3], u[4]);
+ v[4] = _mm_sub_epi32(u[3], u[4]);
+ v[8] = u[8];
+ v[9] = u[9];
+
+ v[10] = _mm_mullo_epi32(u[10], cospim32);
+ x = _mm_mullo_epi32(u[13], cospi32);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi32);
+ x = _mm_mullo_epi32(u[13], cospim32);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospim32);
+ x = _mm_mullo_epi32(u[12], cospi32);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi32);
+ x = _mm_mullo_epi32(u[12], cospim32);
+ v[12] = _mm_sub_epi32(v[12], x);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+ v[14] = u[14];
+ v[15] = u[15];
+
+ // stage 3: sum/diff on v[0..3] and v[8..15]; v[5], v[6] scaled by +/-cospi[32]
+ u[0] = _mm_add_epi32(v[0], v[3]);
+ u[3] = _mm_sub_epi32(v[0], v[3]);
+ u[1] = _mm_add_epi32(v[1], v[2]);
+ u[2] = _mm_sub_epi32(v[1], v[2]);
+ u[4] = v[4];
+
+ u[5] = _mm_mullo_epi32(v[5], cospim32);
+ x = _mm_mullo_epi32(v[6], cospi32);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi32);
+ x = _mm_mullo_epi32(v[6], cospim32);
+ u[6] = _mm_sub_epi32(u[6], x);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[7] = v[7];
+ u[8] = _mm_add_epi32(v[8], v[11]);
+ u[11] = _mm_sub_epi32(v[8], v[11]);
+ u[9] = _mm_add_epi32(v[9], v[10]);
+ u[10] = _mm_sub_epi32(v[9], v[10]);
+ u[12] = _mm_sub_epi32(v[15], v[12]);
+ u[15] = _mm_add_epi32(v[15], v[12]);
+ u[13] = _mm_sub_epi32(v[14], v[13]);
+ u[14] = _mm_add_epi32(v[14], v[13]);
+
+ // stage 4: note u[0] and u[1] are overwritten with their scaled values first
+ u[0] = _mm_mullo_epi32(u[0], cospi32);
+ u[1] = _mm_mullo_epi32(u[1], cospi32);
+ v[0] = _mm_add_epi32(u[0], u[1]);
+ v[0] = _mm_add_epi32(v[0], rnding);
+ v[0] = _mm_srai_epi32(v[0], bit);
+
+ v[1] = _mm_sub_epi32(u[0], u[1]);
+ v[1] = _mm_add_epi32(v[1], rnding);
+ v[1] = _mm_srai_epi32(v[1], bit);
+
+ v[2] = _mm_mullo_epi32(u[2], cospi48);
+ x = _mm_mullo_epi32(u[3], cospi16);
+ v[2] = _mm_add_epi32(v[2], x);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_mullo_epi32(u[2], cospi16);
+ x = _mm_mullo_epi32(u[3], cospi48);
+ v[3] = _mm_sub_epi32(x, v[3]);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = _mm_add_epi32(u[4], u[5]);
+ v[5] = _mm_sub_epi32(u[4], u[5]);
+ v[6] = _mm_sub_epi32(u[7], u[6]);
+ v[7] = _mm_add_epi32(u[7], u[6]);
+ v[8] = u[8];
+
+ v[9] = _mm_mullo_epi32(u[9], cospim16);
+ x = _mm_mullo_epi32(u[14], cospi48);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi48);
+ x = _mm_mullo_epi32(u[14], cospim16);
+ v[14] = _mm_sub_epi32(v[14], x);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospim48);
+ x = _mm_mullo_epi32(u[13], cospim16);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospim16);
+ x = _mm_mullo_epi32(u[13], cospim48);
+ v[13] = _mm_sub_epi32(v[13], x);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = u[11];
+ v[12] = u[12];
+ v[15] = u[15];
+
+ // stage 5: v[0..3] pass through; v[4..7] scaled; v[8..15] sum/diff
+ u[0] = v[0];
+ u[1] = v[1];
+ u[2] = v[2];
+ u[3] = v[3];
+
+ u[4] = _mm_mullo_epi32(v[4], cospi56);
+ x = _mm_mullo_epi32(v[7], cospi8);
+ u[4] = _mm_add_epi32(u[4], x);
+ u[4] = _mm_add_epi32(u[4], rnding);
+ u[4] = _mm_srai_epi32(u[4], bit);
+
+ u[7] = _mm_mullo_epi32(v[4], cospi8);
+ x = _mm_mullo_epi32(v[7], cospi56);
+ u[7] = _mm_sub_epi32(x, u[7]);
+ u[7] = _mm_add_epi32(u[7], rnding);
+ u[7] = _mm_srai_epi32(u[7], bit);
+
+ u[5] = _mm_mullo_epi32(v[5], cospi24);
+ x = _mm_mullo_epi32(v[6], cospi40);
+ u[5] = _mm_add_epi32(u[5], x);
+ u[5] = _mm_add_epi32(u[5], rnding);
+ u[5] = _mm_srai_epi32(u[5], bit);
+
+ u[6] = _mm_mullo_epi32(v[5], cospi40);
+ x = _mm_mullo_epi32(v[6], cospi24);
+ u[6] = _mm_sub_epi32(x, u[6]);
+ u[6] = _mm_add_epi32(u[6], rnding);
+ u[6] = _mm_srai_epi32(u[6], bit);
+
+ u[8] = _mm_add_epi32(v[8], v[9]);
+ u[9] = _mm_sub_epi32(v[8], v[9]);
+ u[10] = _mm_sub_epi32(v[11], v[10]);
+ u[11] = _mm_add_epi32(v[11], v[10]);
+ u[12] = _mm_add_epi32(v[12], v[13]);
+ u[13] = _mm_sub_epi32(v[12], v[13]);
+ u[14] = _mm_sub_epi32(v[15], v[14]);
+ u[15] = _mm_add_epi32(v[15], v[14]);
+
+ // stage 6: u[0..7] pass through; pairs (8,15) (9,14) (10,13) (11,12) scaled
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+
+ v[8] = _mm_mullo_epi32(u[8], cospi60);
+ x = _mm_mullo_epi32(u[15], cospi4);
+ v[8] = _mm_add_epi32(v[8], x);
+ v[8] = _mm_add_epi32(v[8], rnding);
+ v[8] = _mm_srai_epi32(v[8], bit);
+
+ v[15] = _mm_mullo_epi32(u[8], cospi4);
+ x = _mm_mullo_epi32(u[15], cospi60);
+ v[15] = _mm_sub_epi32(x, v[15]);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ v[9] = _mm_mullo_epi32(u[9], cospi28);
+ x = _mm_mullo_epi32(u[14], cospi36);
+ v[9] = _mm_add_epi32(v[9], x);
+ v[9] = _mm_add_epi32(v[9], rnding);
+ v[9] = _mm_srai_epi32(v[9], bit);
+
+ v[14] = _mm_mullo_epi32(u[9], cospi36);
+ x = _mm_mullo_epi32(u[14], cospi28);
+ v[14] = _mm_sub_epi32(x, v[14]);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[10] = _mm_mullo_epi32(u[10], cospi44);
+ x = _mm_mullo_epi32(u[13], cospi20);
+ v[10] = _mm_add_epi32(v[10], x);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[13] = _mm_mullo_epi32(u[10], cospi20);
+ x = _mm_mullo_epi32(u[13], cospi44);
+ v[13] = _mm_sub_epi32(x, v[13]);
+ v[13] = _mm_add_epi32(v[13], rnding);
+ v[13] = _mm_srai_epi32(v[13], bit);
+
+ v[11] = _mm_mullo_epi32(u[11], cospi12);
+ x = _mm_mullo_epi32(u[12], cospi52);
+ v[11] = _mm_add_epi32(v[11], x);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = _mm_mullo_epi32(u[11], cospi52);
+ x = _mm_mullo_epi32(u[12], cospi12);
+ v[12] = _mm_sub_epi32(x, v[12]);
+ v[12] = _mm_add_epi32(v[12], rnding);
+ v[12] = _mm_srai_epi32(v[12], bit);
+
+ out[0 * col_num + col] = v[0]; // results are stored in permuted row order
+ out[1 * col_num + col] = v[8];
+ out[2 * col_num + col] = v[4];
+ out[3 * col_num + col] = v[12];
+ out[4 * col_num + col] = v[2];
+ out[5 * col_num + col] = v[10];
+ out[6 * col_num + col] = v[6];
+ out[7 * col_num + col] = v[14];
+ out[8 * col_num + col] = v[1];
+ out[9 * col_num + col] = v[9];
+ out[10 * col_num + col] = v[5];
+ out[11 * col_num + col] = v[13];
+ out[12 * col_num + col] = v[3];
+ out[13 * col_num + col] = v[11];
+ out[14 * col_num + col] = v[7];
+ out[15 * col_num + col] = v[15];
+ }
+}
+/* Forward 16-point ADST (per the function name) applied down the 16 rows
+ * of a buffer whose rows are num_cols vectors wide, using SSE4.1
+ * intrinsics.
+ *
+ * The element at row r, vector column c is in[r * num_cols + c]. Each loop
+ * iteration transforms one vector column, handling its 32-bit lanes
+ * independently.
+ *
+ * in       - input vectors, indexed up to 15 * num_cols + (num_cols - 1);
+ *            only read.
+ * out      - output vectors in the same row/column layout; the row that
+ *            receives each result is set by the stage 9 stores.
+ * bit      - selects the cosine table through cospi_arr(bit) and is the
+ *            arithmetic right shift used in stage 2, with 1 << (bit - 1)
+ *            added first; it is also handed to half_btf_sse4_1().
+ * num_cols - number of vector columns to process, also the row pitch.
+ *
+ * NOTE(review): half_btf_sse4_1() is defined outside this block. From its
+ * arguments it presumably returns (w0 * n0 + w1 * n1 + rnding) >> bit --
+ * confirm against its definition.
+ */
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit,
+ const int num_cols) {
+ const int32_t *cospi = cospi_arr(bit);
+ const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+ const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+ const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+ const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+ const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+ const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+ const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+ const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+ const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+ const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+ const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+ const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+ const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+ const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+ const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+ const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+ const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+ const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+ const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+ const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+ const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+ const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+ const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+ const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+ const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+ const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+ const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+ const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+ const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+ const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+ const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+ const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+ const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+ const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+ const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+ const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+ const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+ const __m128i rnding = _mm_set1_epi32(1 << (bit - 1)); // rounding offset for >> bit
+ const __m128i zero = _mm_setzero_si128(); // used to negate via 0 - x
+
+ __m128i u[16], v[16], x, y;
+ int col;
+
+ for (col = 0; col < num_cols; ++col) {
+ // stage 0
+ // stage 1: gather rows in permuted order; rows 15, 7, 3, 11, 1, 9, 13, 5 negated
+ u[0] = in[0 * num_cols + col];
+ u[1] = _mm_sub_epi32(zero, in[15 * num_cols + col]);
+ u[2] = _mm_sub_epi32(zero, in[7 * num_cols + col]);
+ u[3] = in[8 * num_cols + col];
+ u[4] = _mm_sub_epi32(zero, in[3 * num_cols + col]);
+ u[5] = in[12 * num_cols + col];
+ u[6] = in[4 * num_cols + col];
+ u[7] = _mm_sub_epi32(zero, in[11 * num_cols + col]);
+ u[8] = _mm_sub_epi32(zero, in[1 * num_cols + col]);
+ u[9] = in[14 * num_cols + col];
+ u[10] = in[6 * num_cols + col];
+ u[11] = _mm_sub_epi32(zero, in[9 * num_cols + col]);
+ u[12] = in[2 * num_cols + col];
+ u[13] = _mm_sub_epi32(zero, in[13 * num_cols + col]);
+ u[14] = _mm_sub_epi32(zero, in[5 * num_cols + col]);
+ u[15] = in[10 * num_cols + col];
+
+ // stage 2: pairs (2,3) (6,7) (10,11) (14,15) -> rounded cospi[32] sum/diff
+ v[0] = u[0];
+ v[1] = u[1];
+
+ x = _mm_mullo_epi32(u[2], cospi32);
+ y = _mm_mullo_epi32(u[3], cospi32);
+ v[2] = _mm_add_epi32(x, y);
+ v[2] = _mm_add_epi32(v[2], rnding);
+ v[2] = _mm_srai_epi32(v[2], bit);
+
+ v[3] = _mm_sub_epi32(x, y);
+ v[3] = _mm_add_epi32(v[3], rnding);
+ v[3] = _mm_srai_epi32(v[3], bit);
+
+ v[4] = u[4];
+ v[5] = u[5];
+
+ x = _mm_mullo_epi32(u[6], cospi32);
+ y = _mm_mullo_epi32(u[7], cospi32);
+ v[6] = _mm_add_epi32(x, y);
+ v[6] = _mm_add_epi32(v[6], rnding);
+ v[6] = _mm_srai_epi32(v[6], bit);
+
+ v[7] = _mm_sub_epi32(x, y);
+ v[7] = _mm_add_epi32(v[7], rnding);
+ v[7] = _mm_srai_epi32(v[7], bit);
+
+ v[8] = u[8];
+ v[9] = u[9];
+
+ x = _mm_mullo_epi32(u[10], cospi32);
+ y = _mm_mullo_epi32(u[11], cospi32);
+ v[10] = _mm_add_epi32(x, y);
+ v[10] = _mm_add_epi32(v[10], rnding);
+ v[10] = _mm_srai_epi32(v[10], bit);
+
+ v[11] = _mm_sub_epi32(x, y);
+ v[11] = _mm_add_epi32(v[11], rnding);
+ v[11] = _mm_srai_epi32(v[11], bit);
+
+ v[12] = u[12];
+ v[13] = u[13];
+
+ x = _mm_mullo_epi32(u[14], cospi32);
+ y = _mm_mullo_epi32(u[15], cospi32);
+ v[14] = _mm_add_epi32(x, y);
+ v[14] = _mm_add_epi32(v[14], rnding);
+ v[14] = _mm_srai_epi32(v[14], bit);
+
+ v[15] = _mm_sub_epi32(x, y);
+ v[15] = _mm_add_epi32(v[15], rnding);
+ v[15] = _mm_srai_epi32(v[15], bit);
+
+ // stage 3: add/subtract elements two apart within each group of four
+ u[0] = _mm_add_epi32(v[0], v[2]);
+ u[1] = _mm_add_epi32(v[1], v[3]);
+ u[2] = _mm_sub_epi32(v[0], v[2]);
+ u[3] = _mm_sub_epi32(v[1], v[3]);
+ u[4] = _mm_add_epi32(v[4], v[6]);
+ u[5] = _mm_add_epi32(v[5], v[7]);
+ u[6] = _mm_sub_epi32(v[4], v[6]);
+ u[7] = _mm_sub_epi32(v[5], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[10]);
+ u[9] = _mm_add_epi32(v[9], v[11]);
+ u[10] = _mm_sub_epi32(v[8], v[10]);
+ u[11] = _mm_sub_epi32(v[9], v[11]);
+ u[12] = _mm_add_epi32(v[12], v[14]);
+ u[13] = _mm_add_epi32(v[13], v[15]);
+ u[14] = _mm_sub_epi32(v[12], v[14]);
+ u[15] = _mm_sub_epi32(v[13], v[15]);
+
+ // stage 4: half_btf with cospi[16]/[48] on 4..7 and 12..15; rest pass through
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = half_btf_sse4_1(&cospi16, &u[4], &cospi48, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi48, &u[4], &cospim16, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospim48, &u[6], &cospi16, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi16, &u[6], &cospi48, &u[7], &rnding, bit);
+ v[8] = u[8];
+ v[9] = u[9];
+ v[10] = u[10];
+ v[11] = u[11];
+ v[12] = half_btf_sse4_1(&cospi16, &u[12], &cospi48, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi48, &u[12], &cospim16, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim48, &u[14], &cospi16, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi16, &u[14], &cospi48, &u[15], &rnding, bit);
+
+ // stage 5: add/subtract elements four apart within each group of eight
+ u[0] = _mm_add_epi32(v[0], v[4]);
+ u[1] = _mm_add_epi32(v[1], v[5]);
+ u[2] = _mm_add_epi32(v[2], v[6]);
+ u[3] = _mm_add_epi32(v[3], v[7]);
+ u[4] = _mm_sub_epi32(v[0], v[4]);
+ u[5] = _mm_sub_epi32(v[1], v[5]);
+ u[6] = _mm_sub_epi32(v[2], v[6]);
+ u[7] = _mm_sub_epi32(v[3], v[7]);
+ u[8] = _mm_add_epi32(v[8], v[12]);
+ u[9] = _mm_add_epi32(v[9], v[13]);
+ u[10] = _mm_add_epi32(v[10], v[14]);
+ u[11] = _mm_add_epi32(v[11], v[15]);
+ u[12] = _mm_sub_epi32(v[8], v[12]);
+ u[13] = _mm_sub_epi32(v[9], v[13]);
+ u[14] = _mm_sub_epi32(v[10], v[14]);
+ u[15] = _mm_sub_epi32(v[11], v[15]);
+
+ // stage 6: u[0..7] pass through; half_btf with 8/56 and 40/24 on u[8..15]
+ v[0] = u[0];
+ v[1] = u[1];
+ v[2] = u[2];
+ v[3] = u[3];
+ v[4] = u[4];
+ v[5] = u[5];
+ v[6] = u[6];
+ v[7] = u[7];
+ v[8] = half_btf_sse4_1(&cospi8, &u[8], &cospi56, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi56, &u[8], &cospim8, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi40, &u[10], &cospi24, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi24, &u[10], &cospim40, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospim56, &u[12], &cospi8, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi8, &u[12], &cospi56, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospim24, &u[14], &cospi40, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi40, &u[14], &cospi24, &u[15], &rnding, bit);
+
+ // stage 7: add/subtract elements eight apart
+ u[0] = _mm_add_epi32(v[0], v[8]);
+ u[1] = _mm_add_epi32(v[1], v[9]);
+ u[2] = _mm_add_epi32(v[2], v[10]);
+ u[3] = _mm_add_epi32(v[3], v[11]);
+ u[4] = _mm_add_epi32(v[4], v[12]);
+ u[5] = _mm_add_epi32(v[5], v[13]);
+ u[6] = _mm_add_epi32(v[6], v[14]);
+ u[7] = _mm_add_epi32(v[7], v[15]);
+ u[8] = _mm_sub_epi32(v[0], v[8]);
+ u[9] = _mm_sub_epi32(v[1], v[9]);
+ u[10] = _mm_sub_epi32(v[2], v[10]);
+ u[11] = _mm_sub_epi32(v[3], v[11]);
+ u[12] = _mm_sub_epi32(v[4], v[12]);
+ u[13] = _mm_sub_epi32(v[5], v[13]);
+ u[14] = _mm_sub_epi32(v[6], v[14]);
+ u[15] = _mm_sub_epi32(v[7], v[15]);
+
+ // stage 8: half_btf on every adjacent pair, each with its own cosine pair
+ v[0] = half_btf_sse4_1(&cospi2, &u[0], &cospi62, &u[1], &rnding, bit);
+ v[1] = half_btf_sse4_1(&cospi62, &u[0], &cospim2, &u[1], &rnding, bit);
+ v[2] = half_btf_sse4_1(&cospi10, &u[2], &cospi54, &u[3], &rnding, bit);
+ v[3] = half_btf_sse4_1(&cospi54, &u[2], &cospim10, &u[3], &rnding, bit);
+ v[4] = half_btf_sse4_1(&cospi18, &u[4], &cospi46, &u[5], &rnding, bit);
+ v[5] = half_btf_sse4_1(&cospi46, &u[4], &cospim18, &u[5], &rnding, bit);
+ v[6] = half_btf_sse4_1(&cospi26, &u[6], &cospi38, &u[7], &rnding, bit);
+ v[7] = half_btf_sse4_1(&cospi38, &u[6], &cospim26, &u[7], &rnding, bit);
+ v[8] = half_btf_sse4_1(&cospi34, &u[8], &cospi30, &u[9], &rnding, bit);
+ v[9] = half_btf_sse4_1(&cospi30, &u[8], &cospim34, &u[9], &rnding, bit);
+ v[10] = half_btf_sse4_1(&cospi42, &u[10], &cospi22, &u[11], &rnding, bit);
+ v[11] = half_btf_sse4_1(&cospi22, &u[10], &cospim42, &u[11], &rnding, bit);
+ v[12] = half_btf_sse4_1(&cospi50, &u[12], &cospi14, &u[13], &rnding, bit);
+ v[13] = half_btf_sse4_1(&cospi14, &u[12], &cospim50, &u[13], &rnding, bit);
+ v[14] = half_btf_sse4_1(&cospi58, &u[14], &cospi6, &u[15], &rnding, bit);
+ v[15] = half_btf_sse4_1(&cospi6, &u[14], &cospim58, &u[15], &rnding, bit);
+
+ // stage 9: store the results to the output rows in permuted order
+ out[0 * num_cols + col] = v[1];
+ out[1 * num_cols + col] = v[14];
+ out[2 * num_cols + col] = v[3];
+ out[3 * num_cols + col] = v[12];
+ out[4 * num_cols + col] = v[5];
+ out[5 * num_cols + col] = v[10];
+ out[6 * num_cols + col] = v[7];
+ out[7 * num_cols + col] = v[8];
+ out[8 * num_cols + col] = v[9];
+ out[9 * num_cols + col] = v[6];
+ out[10 * num_cols + col] = v[11];
+ out[11 * num_cols + col] = v[4];
+ out[12 * num_cols + col] = v[13];
+ out[13 * num_cols + col] = v[2];
+ out[14 * num_cols + col] = v[15];
+ out[15 * num_cols + col] = v[0];
+ }
+}
+
+// Applies the post-column-transform rounding shift to a 16x16 block.
+// The block is handled as four 8x8 sections (starting at vector offsets
+// 0, 16, 32 and 48) rather than as four columns.
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  for (int section = 0; section < 4; ++section) {
+    col_txfm_8x8_rounding(in + 16 * section, shift);
+  }
+}
+
+// Applies the post-column-transform rounding shift to an 8x16 block by
+// running the 8x8 rounding on the two sections at vector offsets 0 and 16.
+static void col_txfm_8x16_rounding(__m128i *in, int shift) {
+  for (int section = 0; section < 2; ++section) {
+    col_txfm_8x8_rounding(in + 16 * section, shift);
+  }
+}
+
+// Writes a 16x16 block held in `in` to `output` as four 8x8 sections.
+// Section s reads from in[16 * s] and writes at output + 64 * s.
+static void write_buffer_16x16(const __m128i *in, int32_t *output) {
+  // int32 values covered by one write_buffer_8x8() call: 16 vectors x 4 lanes.
+  const int size_8x8 = 16 * 4;
+  for (int section = 0; section < 4; ++section) {
+    write_buffer_8x8(in + 16 * section, output + size_8x8 * section);
+  }
+}
+
+// 16x16 forward 2-D transform (SSE4.1).
+//
+// Pipeline, identical for every supported tx_type:
+//   load (with optional up/down and left/right flip, pre-shift shift[0])
+//   -> column 1-D transform -> rounding by -shift[1] -> transpose
+//   -> row 1-D transform -> transpose -> write to coeff.
+// Only the choice of DCT vs ADST kernel per direction and the two flip flags
+// depend on tx_type. Types other than the nine listed below hit assert(0)
+// and leave coeff untouched. `bd` is unused.
+void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                 int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[64], out[64];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X16];
+  const int txw_idx = get_txw_idx(TX_16X16);
+  const int txh_idx = get_txh_idx(TX_16X16);
+  const int col_num = 4;
+  int col_is_adst = 0;  // column pass uses fadst16x16 instead of fdct16x16
+  int row_is_adst = 0;  // row pass uses fadst16x16 instead of fdct16x16
+  int ud_flip = 0;      // 4th argument of load_buffer_16x16
+  int lr_flip = 0;      // 5th argument of load_buffer_16x16
+  (void)bd;
+
+  switch (tx_type) {
+    case DCT_DCT: break;
+    case ADST_DCT: col_is_adst = 1; break;
+    case DCT_ADST: row_is_adst = 1; break;
+    case ADST_ADST:
+      col_is_adst = 1;
+      row_is_adst = 1;
+      break;
+    case FLIPADST_DCT:
+      col_is_adst = 1;
+      ud_flip = 1;
+      break;
+    case DCT_FLIPADST:
+      row_is_adst = 1;
+      lr_flip = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      col_is_adst = 1;
+      row_is_adst = 1;
+      ud_flip = 1;
+      lr_flip = 1;
+      break;
+    case ADST_FLIPADST:
+      col_is_adst = 1;
+      row_is_adst = 1;
+      lr_flip = 1;
+      break;
+    case FLIPADST_ADST:
+      col_is_adst = 1;
+      row_is_adst = 1;
+      ud_flip = 1;
+      break;
+    default:
+      // Unsupported type: nothing is transformed or written.
+      assert(0);
+      return;
+  }
+
+  load_buffer_16x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+
+  // Column pass.
+  if (col_is_adst) {
+    fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+  } else {
+    fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+  }
+  col_txfm_16x16_rounding(out, -shift[1]);
+  transpose_16x16(out, in);
+
+  // Row pass.
+  if (row_is_adst) {
+    fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+  } else {
+    fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+  }
+  transpose_16x16(out, in);
+  write_buffer_16x16(in, coeff);
+}
+
+// Copies `size` vectors from `out` to `in` with the order of vector PAIRS
+// reversed: (out[2k], out[2k + 1]) is stored to
+// (in[size - 2 - 2k], in[size - 1 - 2k]). Note the direction: `out` is the
+// source and `in` is the destination. `size` is expected to be even.
+//
+// The even-index destination is derived from `size` (size - 2 - i) so the
+// helper is correct for any even size; for size == 32 this is the same
+// 30 - i offset the existing caller relies on.
+static INLINE void flip_buf_sse4_1(__m128i *in, __m128i *out, int size) {
+  for (int i = 0; i < size; i += 2) in[size - 2 - i] = out[i];
+  for (int i = 1; i < size; i += 2) in[size - i] = out[i];
+}
+
+// Column-direction 8-point 1-D kernels, indexed by TX_TYPE (see the per-entry
+// labels). Entries from IDTX onward are NULL, so a pointer fetched for those
+// types must not be called.
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fadst8x8_sse4_1, // ADST_DCT
+ fdct8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fadst8x8_sse4_1, // FLIPADST_DCT
+ fdct8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Row-direction 16-point 1-D kernels, indexed by TX_TYPE (see the per-entry
+// labels). Used as the row transform of the 16x8 block. Entries from IDTX
+// onward are NULL, so a pointer fetched for those types must not be called.
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fdct16x16_sse4_1, // ADST_DCT
+ fadst16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fdct16x16_sse4_1, // FLIPADST_DCT
+ fadst16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// Column-direction 16-point 1-D kernels, indexed by TX_TYPE (see the
+// per-entry labels). Used as the column transform of the 8x16 block. Entries
+// from IDTX onward are NULL, so a pointer fetched for those types must not be
+// called.
+static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
+ fdct16x16_sse4_1, // DCT_DCT
+ fadst16x16_sse4_1, // ADST_DCT
+ fdct16x16_sse4_1, // DCT_ADST
+ fadst16x16_sse4_1, // ADST_ADST
+ fadst16x16_sse4_1, // FLIPADST_DCT
+ fdct16x16_sse4_1, // DCT_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_FLIPADST
+ fadst16x16_sse4_1, // ADST_FLIPADST
+ fadst16x16_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+// Row-direction 8-point 1-D kernels, indexed by TX_TYPE (see the per-entry
+// labels). Used as the row transform of the 8x16 block. Entries from IDTX
+// onward are NULL, so a pointer fetched for those types must not be called.
+static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
+ fdct8x8_sse4_1, // DCT_DCT
+ fdct8x8_sse4_1, // ADST_DCT
+ fadst8x8_sse4_1, // DCT_ADST
+ fadst8x8_sse4_1, // ADST_ADST
+ fdct8x8_sse4_1, // FLIPADST_DCT
+ fadst8x8_sse4_1, // DCT_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_FLIPADST
+ fadst8x8_sse4_1, // ADST_FLIPADST
+ fadst8x8_sse4_1, // FLIPADST_ADST
+ NULL, // IDTX
+ NULL, // V_DCT
+ NULL, // H_DCT
+ NULL, // V_ADST
+ NULL, // H_ADST
+ NULL, // V_FLIPADST
+ NULL // H_FLIPADST
+};
+
+// 16x8 forward 2-D transform (SSE4.1).
+//
+// The column pass runs the 8-point kernel on each 8-wide half of the input;
+// the row pass runs one 16-point kernel over both halves. The kernels come
+// from col_highbd_txfm8x8_arr / row_highbd_txfm8x16_arr, so tx_type must be
+// one whose table entries are non-NULL. The same cosine bit (taken from
+// fwd_cos_bit_col) is passed to both passes. `bd` is unused.
+void av1_fwd_txfm2d_16x8_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[32], out[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_16X8];
+  const int txw_idx = get_txw_idx(TX_16X8);
+  const int txh_idx = get_txh_idx(TX_16X8);
+  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x8_arr[tx_type];
+  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x16_arr[tx_type];
+  const int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass: one 8x8 half at a time, transposed into out[16 * half].
+  // The left/right flip is not applied at load time (argument is 0); it is
+  // handled below, before the row pass.
+  for (int half = 0; half < 2; ++half) {
+    load_buffer_8x8(input + 8 * half, in, stride, ud_flip, 0, shift[0]);
+    col_txfm(in, in, bit, 0);
+    col_txfm_8x8_rounding(in, -shift[1]);
+    transpose_8x8(in, out + 16 * half);
+  }
+
+  // Row pass: reads the flipped copy in `in` when lr_flip is set, otherwise
+  // transforms `out` in place. The result always ends up in `out`.
+  __m128i *row_src = out;
+  if (lr_flip) {
+    flip_buf_sse4_1(in, out, 32);
+    row_src = in;
+  }
+  row_txfm(row_src, out, bit, 2);
+
+  // Transpose back, apply the rectangular scaling and store each half.
+  for (int half = 0; half < 2; ++half) {
+    transpose_8x8(out + 16 * half, in);
+    av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_16x8(in, coeff + 8 * half, 16);
+  }
+
+  (void)bd;
+}
+
+// 8x16 forward 2-D transform (SSE4.1).
+//
+// The column pass runs one 16-point kernel over the whole block; the row
+// pass runs the 8-point kernel on each 8x8 half. The kernels come from
+// col_highbd_txfm8x16_arr / row_highbd_txfm8x8_arr, so tx_type must be one
+// whose table entries are non-NULL. The same cosine bit (taken from
+// fwd_cos_bit_col) is passed to both passes. `bd` is unused.
+void av1_fwd_txfm2d_8x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, TX_TYPE tx_type, int bd) {
+  __m128i in[32], out[32];
+  const int8_t *shift = fwd_txfm_shift_ls[TX_8X16];
+  const int txw_idx = get_txw_idx(TX_8X16);
+  const int txh_idx = get_txh_idx(TX_8X16);
+  const fwd_transform_1d_sse4_1 col_txfm = col_highbd_txfm8x16_arr[tx_type];
+  const fwd_transform_1d_sse4_1 row_txfm = row_highbd_txfm8x8_arr[tx_type];
+  const int bit = fwd_cos_bit_col[txw_idx][txh_idx];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  // Column pass; both flips are applied while loading.
+  load_buffer_8x16(input, in, stride, ud_flip, lr_flip, shift[0]);
+  col_txfm(in, in, bit, 2);
+  col_txfm_8x16_rounding(in, -shift[1]);
+  for (int half = 0; half < 2; ++half) {
+    transpose_8x8(in + 16 * half, out + 16 * half);
+  }
+
+  // Row pass. Each half is transformed into the start of `out`; that
+  // overwrites the first half's input, which has already been consumed by
+  // the time the second half (at out + 16) is processed.
+  for (int half = 0; half < 2; ++half) {
+    row_txfm(out + 16 * half, out, bit, 0);
+    transpose_8x8(out, in);
+    av1_round_shift_rect_array_32_sse4_1(in, in, 16, -shift[2], NewSqrt2);
+    write_buffer_8x8(in, coeff + 64 * half);
+  }
+
+  (void)bd;
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_avx2.c b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
new file mode 100644
index 0000000000..06aaaa7eee
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_avx2.c
@@ -0,0 +1,403 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+// Adds eight 32-bit products to dst[0..7].
+// The 128-bit value read from `src` is byte-shuffled with *shuffle, widened
+// from unsigned 8-bit to 16-bit lanes, multiplied lane-wise with *kl and
+// summed in adjacent pairs (madd), giving eight int32 values that are
+// accumulated into dst.
+static INLINE void acc_stat_avx2(int32_t *dst, const uint8_t *src,
+                                 const __m128i *shuffle, const __m256i *kl) {
+  const __m128i shuffled = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+  const __m256i products =
+      _mm256_madd_epi16(*kl, _mm256_cvtepu8_epi16(shuffled));
+  yy_storeu_256(dst, _mm256_add_epi32(yy_loadu_256(dst), products));
+}
+
+// Accumulates 7x7 Wiener-filter statistics for one row of pixels.
+//
+// dgd      points at the top-left of the filter window for column 0 of this
+//          row; src points at the matching source row.
+// h_start, h_end  column range; columns are consumed two at a time, so
+//          column j + 1 (and the window around it) is read even when
+//          j + 1 == h_end.
+// shuffle  byte-shuffle control forwarded to acc_stat_avx2().
+// sumX     running sum of source pixels.
+// sumY     running per-tap sums of window pixels.
+// M_int    running per-tap sums of window * source products.
+// H_int    running sums of window * window products; row (l * 7 + k) holds
+//          seven groups of 8 int32, one group per window row.
+static INLINE void acc_stat_win7_one_line_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+    int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+  const int wiener_win = WIENER_WIN;
+  for (int j = h_start; j < h_end; j += 2) {
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    const uint8_t *dgd_ij = dgd + j;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Broadcast the byte pair (D1, D2) and widen it to 16-bit lanes
+        // D1, D2, D1, D2, ... The 16-bit value is assembled from the bytes
+        // already loaded rather than by dereferencing the uint8_t pointer as
+        // uint16_t, which is a misaligned, aliasing-violating access. D1 goes
+        // in the low byte, matching what the little-endian load produced.
+        const __m256i kl = _mm256_cvtepu8_epi16(
+            _mm_set1_epi16((int16_t)(D1 | (D2 << 8))));
+        // One accumulation per window row, each into its own group of 8.
+        for (int row = 0; row < wiener_win; row++) {
+          acc_stat_avx2(H_ + row * 8, dgd_ij + row * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener statistics M (7*7 entries) and H ((7*7)^2 entries) for
+// the rectangle [h_start, h_end) x [v_start, v_end) using a 7x7 window.
+//
+// Raw integer sums are gathered row by row in 32-bit accumulators and moved
+// to 64-bit accumulators after every band of at most 64 rows. The mean
+// returned by find_average() is then removed algebraically when the final
+// double-precision values are written.
+static INLINE void compute_stats_win7_opt_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  const int wiener_win = WIENER_WIN;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+  int32_t sumX = 0;
+
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+
+  for (int band = v_start; band < v_end; band += 64) {
+    const int band_end = AOMMIN(64, v_end - band) + band;
+    for (int row = band; row < band_end; row++) {
+      acc_stat_win7_one_line_avx2(
+          dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    // Drain the 32-bit accumulators into the 64-bit totals and reset them.
+    for (int k = 0; k < wiener_win; ++k) {
+      for (int l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (int k = 0; k < WIENER_WIN2; ++k) {
+      for (int l = 0; l < WIENER_WIN * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      double *H_ = H + idx0 * wiener_win2;
+      const int64_t *H_int_ = &H_int64[idx0][0];
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          // The integer table stores 8 slots per window row, hence n * 8 + m.
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+// Accumulates 5x5 (chroma) Wiener-filter statistics for one row of pixels.
+//
+// dgd      points at the top-left of the filter window for column 0 of this
+//          row; src points at the matching source row.
+// h_start, h_end  column range; columns are consumed two at a time, so
+//          column j + 1 (and the window around it) is read even when
+//          j + 1 == h_end.
+// shuffle  byte-shuffle control forwarded to acc_stat_avx2().
+// sumX     running sum of source pixels.
+// sumY     running per-tap sums of window pixels.
+// M_int    running per-tap sums of window * source products.
+// H_int    running sums of window * window products; row (l * 5 + k) holds
+//          five groups of 8 int32, one group per window row.
+static INLINE void acc_stat_win5_one_line_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+    int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+    int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+    int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  for (int j = h_start; j < h_end; j += 2) {
+    const uint8_t X1 = src[j];
+    const uint8_t X2 = src[j + 1];
+    *sumX += X1 + X2;
+    const uint8_t *dgd_ij = dgd + j;
+    for (int k = 0; k < wiener_win; k++) {
+      const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+      for (int l = 0; l < wiener_win; l++) {
+        int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+        const uint8_t D1 = dgd_ijk[l];
+        const uint8_t D2 = dgd_ijk[l + 1];
+        sumY[k][l] += D1 + D2;
+        M_int[k][l] += D1 * X1 + D2 * X2;
+
+        // Broadcast the byte pair (D1, D2) and widen it to 16-bit lanes
+        // D1, D2, D1, D2, ... The 16-bit value is assembled from the bytes
+        // already loaded rather than by dereferencing the uint8_t pointer as
+        // uint16_t, which is a misaligned, aliasing-violating access. D1 goes
+        // in the low byte, matching what the little-endian load produced.
+        const __m256i kl = _mm256_cvtepu8_epi16(
+            _mm_set1_epi16((int16_t)(D1 | (D2 << 8))));
+        // One accumulation per window row, each into its own group of 8.
+        for (int row = 0; row < wiener_win; row++) {
+          acc_stat_avx2(H_ + row * 8, dgd_ij + row * dgd_stride, shuffle, &kl);
+        }
+      }
+    }
+  }
+}
+
+// Computes the Wiener statistics M (5*5 entries) and H ((5*5)^2 entries) for
+// the rectangle [h_start, h_end) x [v_start, v_end) using a 5x5 window.
+//
+// Raw integer sums are gathered row by row in 32-bit accumulators and moved
+// to 64-bit accumulators after every band of at most 64 rows. The mean
+// returned by find_average() is then removed algebraically when the final
+// double-precision values are written.
+static INLINE void compute_stats_win5_opt_avx2(
+    const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+    int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+  const int wiener_win = WIENER_WIN_CHROMA;
+  const int wiener_win2 = wiener_win * wiener_win;
+  const int wiener_halfwin = (wiener_win >> 1);
+  const int pixel_count = (h_end - h_start) * (v_end - v_start);
+  const double avg =
+      find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+  int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+  int32_t sumX = 0;
+
+  // Top-left corner of the window centred on pixel (0, 0).
+  const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+  const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+
+  for (int band = v_start; band < v_end; band += 64) {
+    const int band_end = AOMMIN(64, v_end - band) + band;
+    for (int row = band; row < band_end; row++) {
+      acc_stat_win5_one_line_avx2(
+          dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+          dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+    }
+    // Drain the 32-bit accumulators into the 64-bit totals and reset them.
+    for (int k = 0; k < wiener_win; ++k) {
+      for (int l = 0; l < wiener_win; ++l) {
+        M_int64[k][l] += M_int32[k][l];
+        M_int32[k][l] = 0;
+      }
+    }
+    for (int k = 0; k < WIENER_WIN2_CHROMA; ++k) {
+      for (int l = 0; l < WIENER_WIN_CHROMA * 8; ++l) {
+        H_int64[k][l] += H_int32[k][l];
+        H_int32[k][l] = 0;
+      }
+    }
+  }
+
+  const double avg_square_sum = avg * avg * pixel_count;
+  for (int k = 0; k < wiener_win; k++) {
+    for (int l = 0; l < wiener_win; l++) {
+      const int32_t idx0 = l * wiener_win + k;
+      double *H_ = H + idx0 * wiener_win2;
+      const int64_t *H_int_ = &H_int64[idx0][0];
+      M[idx0] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+      for (int m = 0; m < wiener_win; m++) {
+        for (int n = 0; n < wiener_win; n++) {
+          // The integer table stores 8 slots per window row, hence n * 8 + m.
+          H_[m * wiener_win + n] = H_int_[n * 8 + m] + avg_square_sum -
+                                   avg * (sumY[k][l] + sumY[n][m]);
+        }
+      }
+    }
+  }
+}
+
+// Dispatches Wiener statistics computation by window size: the AVX2 kernels
+// handle WIENER_WIN and WIENER_WIN_CHROMA, and any other size is forwarded to
+// av1_compute_stats_c().
+void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
+                            const uint8_t *src, int h_start, int h_end,
+                            int v_start, int v_end, int dgd_stride,
+                            int src_stride, double *M, double *H) {
+  if (wiener_win == WIENER_WIN) {
+    compute_stats_win7_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+                                dgd_stride, src_stride, M, H);
+    return;
+  }
+  if (wiener_win == WIENER_WIN_CHROMA) {
+    compute_stats_win5_opt_avx2(dgd, src, h_start, h_end, v_start, v_end,
+                                dgd_stride, src_stride, M, H);
+    return;
+  }
+  av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                      dgd_stride, src_stride, M, H);
+}
+
+// Returns a vector whose 16-bit lanes alternate a, b, a, b, ...: `a` occupies
+// the low half and `b` the high half of every 32-bit element.
+static INLINE __m256i pair_set_epi16(uint16_t a, uint16_t b) {
+  const uint32_t packed = ((uint32_t)b << 16) | (uint32_t)a;
+  return _mm256_set1_epi32((int32_t)packed);
+}
+
+// Sign-extends the eight 32-bit lanes of `sum32` to 64 bits and adds them
+// into the four 64-bit lanes of `sum64`.
+static INLINE __m256i add_epi32_to_epi64_avx2(__m256i sum64, __m256i sum32) {
+  const __m256i lo = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
+  const __m256i hi =
+      _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
+  return _mm256_add_epi64(_mm256_add_epi64(sum64, lo), hi);
+}
+
+// Loads 16 consecutive int32 filter outputs and narrows them, with signed
+// saturation, to 16 int16 lanes in their original order (the permute undoes
+// the per-128-bit-lane interleave of the pack).
+static INLINE __m256i load_flt_16b_avx2(const int32_t *flt) {
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(yy_loadu_256(flt), yy_loadu_256(flt + 8)), 0xd8);
+}
+
+// Given the un-rounded projections v0/v1 for 16 pixels, rounds and shifts
+// them by SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS, adds the degraded pixels `d0`,
+// subtracts the source pixels `s0`, and returns the squared errors summed in
+// adjacent pairs (eight int32 lanes).
+static INLINE __m256i proj_sq_err_avx2(__m256i v0, __m256i v1, __m256i d0,
+                                       __m256i s0) {
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
+  const __m256i vr0 = _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
+  const __m256i vr1 = _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
+  const __m256i e0 = _mm256_sub_epi16(
+      _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
+  return _mm256_madd_epi16(e0, e0);
+}
+
+// Returns the sum of squared differences between the source `src8` and the
+// self-guided projection of the degraded image `dat8`, over a
+// width x height region.
+//
+// flt0/flt1 are the two filter outputs and xq[0]/xq[1] their projection
+// weights. params->r[0] / params->r[1] select which filters take part:
+//   both > 0    : both filters are projected,
+//   only one >0 : that filter alone is projected,
+//   neither     : plain SSD between dat8 and src8.
+// Each row is processed 16 pixels at a time with AVX2; the remaining
+// (width % 16) pixels are handled by scalar code and summed into `err`.
+int64_t av1_lowbd_pixel_proj_error_avx2(
+    const uint8_t *src8, int width, int height, int src_stride,
+    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+  int i, j, k;
+  const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+  __m256i sum64 = _mm256_setzero_si256();
+  const uint8_t *src = src8;
+  const uint8_t *dat = dat8;
+  int64_t err = 0;
+
+  if (params->r[0] > 0 && params->r[1] > 0) {
+    const __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+    for (i = 0; i < height; ++i) {
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
+        const __m256i flt0_0_sub_u =
+            _mm256_sub_epi16(load_flt_16b_avx2(flt0 + j), u0);
+        const __m256i flt1_0_sub_u =
+            _mm256_sub_epi16(load_flt_16b_avx2(flt1 + j), u0);
+        // Each madd lane is xq[0] * (flt0 - u) + xq[1] * (flt1 - u).
+        const __m256i v0 = _mm256_madd_epi16(
+            xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        const __m256i v1 = _mm256_madd_epi16(
+            xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+        sum32 = _mm256_add_epi32(sum32, proj_sq_err_avx2(v0, v1, d0, s0));
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt0 += flt0_stride;
+      flt1 += flt1_stride;
+      sum64 = add_epi32_to_epi64_avx2(sum64, sum32);
+    }
+  } else if (params->r[0] > 0 || params->r[1] > 0) {
+    // Exactly one filter is active here; pick its buffer, stride and weight.
+    const int idx = (params->r[0] > 0) ? 0 : 1;
+    const int32_t *flt = idx ? flt1 : flt0;
+    const int flt_stride = idx ? flt1_stride : flt0_stride;
+    const int xq_i = xq[idx];
+    // Lane pair (xq, -xq * 2^RST_BITS) is multiplied with (flt, dat), giving
+    // xq * (flt - (dat << RST_BITS)). The scale uses a multiply, not a left
+    // shift, because xq may be negative and shifting a negative int is
+    // undefined behavior.
+    const __m256i xq_coeff =
+        pair_set_epi16(xq_i, (-xq_i * (1 << SGRPROJ_RST_BITS)));
+    for (i = 0; i < height; ++i) {
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i flt_16b = load_flt_16b_avx2(flt + j);
+        const __m256i v0 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
+        const __m256i v1 =
+            _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
+        sum32 = _mm256_add_epi32(sum32, proj_sq_err_avx2(v0, v1, d0, s0));
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+        int32_t v = xq_i * (flt[k] - u);
+        const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      flt += flt_stride;
+      sum64 = add_epi32_to_epi64_avx2(sum64, sum32);
+    }
+  } else {
+    for (i = 0; i < height; ++i) {
+      // Widen once per row, like the projected branches, so the 32-bit lanes
+      // never have to hold more than one row of squared differences.
+      __m256i sum32 = _mm256_setzero_si256();
+      for (j = 0; j <= width - 16; j += 16) {
+        const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
+        const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
+        const __m256i diff0 = _mm256_sub_epi16(d0, s0);
+        const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
+        sum32 = _mm256_add_epi32(sum32, err0);
+      }
+      for (k = j; k < width; ++k) {
+        const int32_t e = (int32_t)(dat[k]) - src[k];
+        err += e * e;
+      }
+      dat += dat_stride;
+      src += src_stride;
+      sum64 = add_epi32_to_epi64_avx2(sum64, sum32);
+    }
+  }
+
+  // Horizontal reduction of the four 64-bit lanes, plus the scalar tails.
+  int64_t sum[4];
+  yy_storeu_256(sum, sum64);
+  err += sum[0] + sum[1] + sum[2] + sum[3];
+  return err;
+}
diff --git a/third_party/aom/av1/encoder/x86/pickrst_sse4.c b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
new file mode 100644
index 0000000000..04e4d1afc4
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/pickrst_sse4.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/av1_rtcd.h"
+#include "av1/common/restoration.h"
+#include "av1/encoder/pickrst.h"
+
+ /* Adds eight 32-bit multiply-add results to dst[0..7].
+  *
+  * Sixteen bytes are loaded from src and permuted with *shuffle. Each 8-byte
+  * half of the permuted vector is zero-extended to 16-bit lanes and
+  * multiply-added (adjacent lanes summed) against *kl: the low half updates
+  * dst[0..3], the high half dst[4..7].
+  */
+ static INLINE void acc_stat_sse41(int32_t *dst, const uint8_t *src,
+                                   const __m128i *shuffle, const __m128i *kl) {
+   const __m128i permuted = _mm_shuffle_epi8(xx_loadu_128(src), *shuffle);
+   const __m128i lo_w = _mm_cvtepu8_epi16(permuted);
+   const __m128i hi_w = _mm_cvtepu8_epi16(_mm_srli_si128(permuted, 8));
+   const __m128i acc_lo =
+       _mm_add_epi32(xx_loadu_128(dst), _mm_madd_epi16(*kl, lo_w));
+   const __m128i acc_hi =
+       _mm_add_epi32(xx_loadu_128(dst + 4), _mm_madd_epi16(*kl, hi_w));
+   xx_storeu_128(dst, acc_lo);
+   xx_storeu_128(dst + 4, acc_hi);
+ }
+
+ /* Accumulates the 7x7 Wiener statistics contributed by one row of pixels.
+  *
+  * dgd        points at the top-left tap of the window for column 0 of this row.
+  * src        points at column 0 of the matching source row.
+  * h_start    first column to process (inclusive).
+  * h_end      last column to process (exclusive).
+  * dgd_stride distance in bytes between consecutive dgd rows.
+  * shuffle    byte permutation forwarded to acc_stat_sse41.
+  * sumX       running sum of source pixels.
+  * sumY       running per-tap sums of dgd pixels.
+  * M_int      running per-tap sums of dgd * src products.
+  * H_int      running per-tap-pair sums, updated through acc_stat_sse41.
+  *
+  * Columns are consumed two at a time. When (h_end - h_start) is odd the last
+  * step has only one real column; its missing partner is replaced by zero so
+  * that nothing at column h_end is read from src or added to the sums.
+  */
+ static INLINE void acc_stat_win7_one_line_sse4_1(
+     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+     int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+     int32_t sumY[WIENER_WIN][WIENER_WIN], int32_t M_int[WIENER_WIN][WIENER_WIN],
+     int32_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
+   const int wiener_win = 7;
+   int j, k, l;
+   for (j = h_start; j < h_end; j += 2) {
+     /* Zero unless this step is the single leftover column of an odd width. */
+     const int has_pair = (j + 1 < h_end);
+     const uint8_t *dgd_ij = dgd + j;
+     const uint8_t X1 = src[j];
+     const uint8_t X2 = has_pair ? src[j + 1] : 0;
+     *sumX += X1 + X2;
+     for (k = 0; k < wiener_win; k++) {
+       const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+       for (l = 0; l < wiener_win; l++) {
+         int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+         const uint8_t D1 = dgd_ijk[l];
+         const uint8_t D2 = has_pair ? dgd_ijk[l + 1] : 0;
+         sumY[k][l] += D1 + D2;
+         M_int[k][l] += D1 * X1 + D2 * X2;
+
+         /* 16-bit lanes D1, D2, D1, D2, ...: the same vector the original
+          * built by widening a 16-bit load of the two bytes, without the
+          * unaligned type-punned read. A zero D2 cancels the second product
+          * of every multiply-add pair in acc_stat_sse41.
+          * NOTE(review): this relies on *shuffle pairing each pixel with its
+          * right-hand neighbour - confirm against g_shuffle_stats_data. */
+         const __m128i kl =
+             _mm_set1_epi32((int32_t)((uint32_t)D1 | ((uint32_t)D2 << 16)));
+         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle, &kl);
+       }
+     }
+   }
+ }
+
+ /* Computes the Wiener statistics M and H for a 7x7 window over the rectangle
+  * [h_start, h_end) x [v_start, v_end).
+  *
+  * Rows are accumulated into 32-bit integer sums in strips of at most 64 rows;
+  * after each strip the 32-bit sums are folded into 64-bit totals and cleared.
+  * The totals are then written to M and H as doubles, corrected with the
+  * average returned by find_average().
+  */
+ static INLINE void compute_stats_win7_opt_sse4_1(
+     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+     int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+   const int wiener_win = WIENER_WIN;
+   const int wiener_win2 = wiener_win * wiener_win;
+   const int wiener_halfwin = (wiener_win >> 1);
+   const int pixel_count = (h_end - h_start) * (v_end - v_start);
+   const double avg =
+       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+   int32_t M_int32[WIENER_WIN][WIENER_WIN] = { { 0 } };
+   int64_t M_int64[WIENER_WIN][WIENER_WIN] = { { 0 } };
+   int32_t H_int32[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+   int64_t H_int64[WIENER_WIN2][WIENER_WIN * 8] = { { 0 } };
+   int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
+   int32_t sumX = 0;
+
+   /* Step back half a window in both directions to reach the top-left tap. */
+   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+
+   int row, strip, r, c;
+   for (strip = v_start; strip < v_end; strip += 64) {
+     const int strip_end = AOMMIN(64, v_end - strip) + strip;
+     for (row = strip; row < strip_end; row++) {
+       acc_stat_win7_one_line_sse4_1(
+           dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+           dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+     }
+     /* Fold this strip's 32-bit sums into the 64-bit totals and reset them. */
+     for (r = 0; r < wiener_win; ++r) {
+       for (c = 0; c < wiener_win; ++c) {
+         M_int64[r][c] += M_int32[r][c];
+         M_int32[r][c] = 0;
+       }
+     }
+     for (r = 0; r < WIENER_WIN2; ++r) {
+       for (c = 0; c < WIENER_WIN * 8; ++c) {
+         H_int64[r][c] += H_int32[r][c];
+         H_int32[r][c] = 0;
+       }
+     }
+   }
+
+   const double avg_square_sum = avg * avg * pixel_count;
+   int k, l, m, n;
+   for (k = 0; k < wiener_win; k++) {
+     for (l = 0; l < wiener_win; l++) {
+       /* Outputs are indexed with l as the major index; sums use [k][l]. */
+       const int32_t out_idx = l * wiener_win + k;
+       const int64_t *h_row = &H_int64[out_idx][0];
+       double *h_out = H + out_idx * wiener_win2;
+       M[out_idx] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+       for (m = 0; m < wiener_win; m++) {
+         for (n = 0; n < wiener_win; n++) {
+           h_out[m * wiener_win + n] = h_row[n * 8 + m] + avg_square_sum -
+                                       avg * (sumY[k][l] + sumY[n][m]);
+         }
+       }
+     }
+   }
+ }
+
+ /* Accumulates the 5x5 (chroma) Wiener statistics contributed by one row.
+  *
+  * dgd        points at the top-left tap of the window for column 0 of this row.
+  * src        points at column 0 of the matching source row.
+  * h_start    first column to process (inclusive).
+  * h_end      last column to process (exclusive).
+  * dgd_stride distance in bytes between consecutive dgd rows.
+  * shuffle    byte permutation forwarded to acc_stat_sse41.
+  * sumX       running sum of source pixels.
+  * sumY       running per-tap sums of dgd pixels.
+  * M_int      running per-tap sums of dgd * src products.
+  * H_int      running per-tap-pair sums, updated through acc_stat_sse41.
+  *
+  * Columns are consumed two at a time. When (h_end - h_start) is odd the last
+  * step has only one real column; its missing partner is replaced by zero so
+  * that nothing at column h_end is read from src or added to the sums.
+  */
+ static INLINE void acc_stat_win5_one_line_sse4_1(
+     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end,
+     int dgd_stride, const __m128i *shuffle, int32_t *sumX,
+     int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+     int32_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
+     int32_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
+   const int wiener_win = WIENER_WIN_CHROMA;
+   int j, k, l;
+   for (j = h_start; j < h_end; j += 2) {
+     /* Zero unless this step is the single leftover column of an odd width. */
+     const int has_pair = (j + 1 < h_end);
+     const uint8_t *dgd_ij = dgd + j;
+     const uint8_t X1 = src[j];
+     const uint8_t X2 = has_pair ? src[j + 1] : 0;
+     *sumX += X1 + X2;
+     for (k = 0; k < wiener_win; k++) {
+       const uint8_t *dgd_ijk = dgd_ij + k * dgd_stride;
+       for (l = 0; l < wiener_win; l++) {
+         int32_t *H_ = &H_int[(l * wiener_win + k)][0];
+         const uint8_t D1 = dgd_ijk[l];
+         const uint8_t D2 = has_pair ? dgd_ijk[l + 1] : 0;
+         sumY[k][l] += D1 + D2;
+         M_int[k][l] += D1 * X1 + D2 * X2;
+
+         /* 16-bit lanes D1, D2, D1, D2, ...: the same vector the original
+          * built by widening a 16-bit load of the two bytes, without the
+          * unaligned type-punned read. A zero D2 cancels the second product
+          * of every multiply-add pair in acc_stat_sse41.
+          * NOTE(review): this relies on *shuffle pairing each pixel with its
+          * right-hand neighbour - confirm against g_shuffle_stats_data. */
+         const __m128i kl =
+             _mm_set1_epi32((int32_t)((uint32_t)D1 | ((uint32_t)D2 << 16)));
+         acc_stat_sse41(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle, &kl);
+         acc_stat_sse41(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle, &kl);
+       }
+     }
+   }
+ }
+
+ /* Computes the Wiener statistics M and H for a 5x5 (chroma) window over the
+  * rectangle [h_start, h_end) x [v_start, v_end).
+  *
+  * Rows are accumulated into 32-bit integer sums in strips of at most 64 rows;
+  * after each strip the 32-bit sums are folded into 64-bit totals and cleared.
+  * The totals are then written to M and H as doubles, corrected with the
+  * average returned by find_average().
+  */
+ static INLINE void compute_stats_win5_opt_sse4_1(
+     const uint8_t *dgd, const uint8_t *src, int h_start, int h_end, int v_start,
+     int v_end, int dgd_stride, int src_stride, double *M, double *H) {
+   const int wiener_win = WIENER_WIN_CHROMA;
+   const int wiener_win2 = wiener_win * wiener_win;
+   const int wiener_halfwin = (wiener_win >> 1);
+   const int pixel_count = (h_end - h_start) * (v_end - v_start);
+   const double avg =
+       find_average(dgd, h_start, h_end, v_start, v_end, dgd_stride);
+
+   int32_t M_int32[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+   int64_t M_int64[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+   int32_t H_int32[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+   int64_t H_int64[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8] = { { 0 } };
+   int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
+   int32_t sumX = 0;
+
+   /* Step back half a window in both directions to reach the top-left tap. */
+   const uint8_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;
+   const __m128i shuffle = xx_loadu_128(g_shuffle_stats_data);
+
+   int row, strip, r, c;
+   for (strip = v_start; strip < v_end; strip += 64) {
+     const int strip_end = AOMMIN(64, v_end - strip) + strip;
+     for (row = strip; row < strip_end; row++) {
+       acc_stat_win5_one_line_sse4_1(
+           dgd_win + row * dgd_stride, src + row * src_stride, h_start, h_end,
+           dgd_stride, &shuffle, &sumX, sumY, M_int32, H_int32);
+     }
+     /* Fold this strip's 32-bit sums into the 64-bit totals and reset them. */
+     for (r = 0; r < wiener_win; ++r) {
+       for (c = 0; c < wiener_win; ++c) {
+         M_int64[r][c] += M_int32[r][c];
+         M_int32[r][c] = 0;
+       }
+     }
+     for (r = 0; r < WIENER_WIN_CHROMA * WIENER_WIN_CHROMA; ++r) {
+       for (c = 0; c < WIENER_WIN_CHROMA * 8; ++c) {
+         H_int64[r][c] += H_int32[r][c];
+         H_int32[r][c] = 0;
+       }
+     }
+   }
+
+   const double avg_square_sum = avg * avg * pixel_count;
+   int k, l, m, n;
+   for (k = 0; k < wiener_win; k++) {
+     for (l = 0; l < wiener_win; l++) {
+       /* Outputs are indexed with l as the major index; sums use [k][l]. */
+       const int32_t out_idx = l * wiener_win + k;
+       const int64_t *h_row = &H_int64[out_idx][0];
+       double *h_out = H + out_idx * wiener_win2;
+       M[out_idx] = M_int64[k][l] + avg_square_sum - avg * (sumX + sumY[k][l]);
+       for (m = 0; m < wiener_win; m++) {
+         for (n = 0; n < wiener_win; n++) {
+           h_out[m * wiener_win + n] = h_row[n * 8 + m] + avg_square_sum -
+                                       avg * (sumY[k][l] + sumY[n][m]);
+         }
+       }
+     }
+   }
+ }
+ /* SSE4.1 entry point for Wiener statistics: dispatches on the window size.
+  * WIENER_WIN and WIENER_WIN_CHROMA use the specialised kernels; every other
+  * size is forwarded unchanged to av1_compute_stats_c.
+  */
+ void av1_compute_stats_sse4_1(int wiener_win, const uint8_t *dgd,
+                               const uint8_t *src, int h_start, int h_end,
+                               int v_start, int v_end, int dgd_stride,
+                               int src_stride, double *M, double *H) {
+   if (wiener_win == WIENER_WIN) {
+     compute_stats_win7_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                   dgd_stride, src_stride, M, H);
+     return;
+   }
+   if (wiener_win == WIENER_WIN_CHROMA) {
+     compute_stats_win5_opt_sse4_1(dgd, src, h_start, h_end, v_start, v_end,
+                                   dgd_stride, src_stride, M, H);
+     return;
+   }
+   av1_compute_stats_c(wiener_win, dgd, src, h_start, h_end, v_start, v_end,
+                       dgd_stride, src_stride, M, H);
+ }
+
+ /* Returns a vector whose 16-bit lanes alternate a, b, a, b, ... (a in the
+  * low half of every 32-bit lane, b in the high half). */
+ static INLINE __m128i pair_set_epi16(uint16_t a, uint16_t b) {
+   const uint32_t packed = ((uint32_t)b << 16) | a;
+   return _mm_set1_epi32((int32_t)packed);
+ }
+
+ /* Sign-extends the four 32-bit lanes of v32 to 64 bits and adds them to the
+  * two 64-bit lanes of acc64 (lanes 0/2 into lane 0, lanes 1/3 into lane 1). */
+ static INLINE __m128i proj_err_widen_add_sse4_1(__m128i acc64, __m128i v32) {
+   const __m128i lo = _mm_cvtepi32_epi64(v32);
+   const __m128i hi = _mm_cvtepi32_epi64(_mm_srli_si128(v32, 8));
+   return _mm_add_epi64(acc64, _mm_add_epi64(lo, hi));
+ }
+
+ /* Turns eight 32-bit projection terms (v0: pixels 0-3, v1: pixels 4-7) into
+  * squared errors: rounds and shifts each term, adds the degraded pixel d0,
+  * subtracts the source pixel s0 and returns the pairwise sums of squares as
+  * four 32-bit lanes. */
+ static INLINE __m128i proj_err_sq_sse4_1(__m128i v0, __m128i v1, __m128i d0,
+                                          __m128i s0, __m128i rounding,
+                                          int shift) {
+   const __m128i vr0 = _mm_srai_epi32(_mm_add_epi32(v0, rounding), shift);
+   const __m128i vr1 = _mm_srai_epi32(_mm_add_epi32(v1, rounding), shift);
+   const __m128i e0 =
+       _mm_sub_epi16(_mm_add_epi16(_mm_packs_epi32(vr0, vr1), d0), s0);
+   return _mm_madd_epi16(e0, e0);
+ }
+
+ /* Returns the sum of squared differences between src8 and the self-guided
+  * projection of dat8, over a width x height block.
+  *
+  * Per pixel, with u = dat << SGRPROJ_RST_BITS:
+  *   v = xq[0] * (flt0 - u) + xq[1] * (flt1 - u)   (only the terms whose
+  *                                                  params->r[] entry is > 0)
+  *   e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + dat - src
+  * and the result is the sum of e * e. When neither r[0] nor r[1] is positive
+  * the result is the plain sum of (dat - src)^2.
+  *
+  * Full groups of 8 pixels (16 in the unfiltered case) are handled with
+  * vectors, including the final group of a row; any remainder is handled by
+  * the scalar tail. Vector sums are widened to 64 bits once per row in every
+  * branch so that 32-bit lanes only ever hold one row's worth of error.
+  *
+  * The vector path narrows the filter outputs to 16 bits with saturation.
+  * NOTE(review): assumes flt0/flt1 values fit in int16 - confirm with callers.
+  */
+ int64_t av1_lowbd_pixel_proj_error_sse4_1(
+     const uint8_t *src8, int width, int height, int src_stride,
+     const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
+     int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
+   int i, j, k;
+   const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
+   const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+   __m128i sum64 = _mm_setzero_si128();
+   const uint8_t *src = src8;
+   const uint8_t *dat = dat8;
+   int64_t err = 0;
+   if (params->r[0] > 0 && params->r[1] > 0) {
+     /* Both filters active: lanes alternate xq[0], xq[1]. */
+     const __m128i xq_coeff = pair_set_epi16(xq[0], xq[1]);
+     for (i = 0; i < height; ++i) {
+       __m128i sum32 = _mm_setzero_si128();
+       for (j = 0; j <= width - 8; j += 8) {
+         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+         const __m128i flt0_16b =
+             _mm_packs_epi32(xx_loadu_128(flt0 + j), xx_loadu_128(flt0 + j + 4));
+         const __m128i flt1_16b =
+             _mm_packs_epi32(xx_loadu_128(flt1 + j), xx_loadu_128(flt1 + j + 4));
+         const __m128i u0 = _mm_slli_epi16(d0, SGRPROJ_RST_BITS);
+         const __m128i flt0_0_sub_u = _mm_sub_epi16(flt0_16b, u0);
+         const __m128i flt1_0_sub_u = _mm_sub_epi16(flt1_16b, u0);
+         const __m128i v0 = _mm_madd_epi16(
+             xq_coeff, _mm_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
+         const __m128i v1 = _mm_madd_epi16(
+             xq_coeff, _mm_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
+         sum32 = _mm_add_epi32(
+             sum32, proj_err_sq_sse4_1(v0, v1, d0, s0, rounding, shift));
+       }
+       for (k = j; k < width; ++k) {
+         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+         int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
+         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+         err += e * e;
+       }
+       dat += dat_stride;
+       src += src_stride;
+       flt0 += flt0_stride;
+       flt1 += flt1_stride;
+       sum64 = proj_err_widen_add_sse4_1(sum64, sum32);
+     }
+   } else if (params->r[0] > 0 || params->r[1] > 0) {
+     /* Exactly one filter active; r[0] takes precedence as in the original
+      * branch order. The two cases differ only in which filter is read. */
+     const int use_flt0 = params->r[0] > 0;
+     const int32_t *flt = use_flt0 ? flt0 : flt1;
+     const int flt_stride = use_flt0 ? flt0_stride : flt1_stride;
+     const int xq_one = use_flt0 ? xq[0] : xq[1];
+     /* Lanes alternate xq, -(xq * 2^SGRPROJ_RST_BITS), so a multiply-add
+      * against (flt, dat) pairs yields xq * (flt - u) directly. The multiply
+      * avoids left-shifting a negative xq. */
+     const __m128i xq_coeff =
+         pair_set_epi16(xq_one, -(xq_one * (1 << SGRPROJ_RST_BITS)));
+     for (i = 0; i < height; ++i) {
+       __m128i sum32 = _mm_setzero_si128();
+       for (j = 0; j <= width - 8; j += 8) {
+         const __m128i d0 = _mm_cvtepu8_epi16(xx_loadl_64(dat + j));
+         const __m128i s0 = _mm_cvtepu8_epi16(xx_loadl_64(src + j));
+         const __m128i flt_16b =
+             _mm_packs_epi32(xx_loadu_128(flt + j), xx_loadu_128(flt + j + 4));
+         const __m128i v0 =
+             _mm_madd_epi16(xq_coeff, _mm_unpacklo_epi16(flt_16b, d0));
+         const __m128i v1 =
+             _mm_madd_epi16(xq_coeff, _mm_unpackhi_epi16(flt_16b, d0));
+         sum32 = _mm_add_epi32(
+             sum32, proj_err_sq_sse4_1(v0, v1, d0, s0, rounding, shift));
+       }
+       for (k = j; k < width; ++k) {
+         const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
+         int32_t v = xq_one * (flt[k] - u);
+         const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
+         err += e * e;
+       }
+       dat += dat_stride;
+       src += src_stride;
+       flt += flt_stride;
+       sum64 = proj_err_widen_add_sse4_1(sum64, sum32);
+     }
+   } else {
+     /* No filter active: plain sum of squared differences. */
+     for (i = 0; i < height; ++i) {
+       __m128i sum32 = _mm_setzero_si128();
+       for (j = 0; j <= width - 16; j += 16) {
+         const __m128i d = xx_loadu_128(dat + j);
+         const __m128i s = xx_loadu_128(src + j);
+         const __m128i d0 = _mm_cvtepu8_epi16(d);
+         const __m128i d1 = _mm_cvtepu8_epi16(_mm_srli_si128(d, 8));
+         const __m128i s0 = _mm_cvtepu8_epi16(s);
+         const __m128i s1 = _mm_cvtepu8_epi16(_mm_srli_si128(s, 8));
+         const __m128i diff0 = _mm_sub_epi16(d0, s0);
+         const __m128i diff1 = _mm_sub_epi16(d1, s1);
+         const __m128i err0 = _mm_madd_epi16(diff0, diff0);
+         const __m128i err1 = _mm_madd_epi16(diff1, diff1);
+         sum32 = _mm_add_epi32(sum32, err0);
+         sum32 = _mm_add_epi32(sum32, err1);
+       }
+       for (k = j; k < width; ++k) {
+         const int32_t e = (int32_t)(dat[k]) - src[k];
+         err += e * e;
+       }
+       dat += dat_stride;
+       src += src_stride;
+       sum64 = proj_err_widen_add_sse4_1(sum64, sum32);
+     }
+   }
+   int64_t sum[2];
+   xx_storeu_128(sum, sum64);
+   err += sum[0] + sum[1];
+   return err;
+ }
diff --git a/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 0000000000..30983d1c10
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,217 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+; void av1_temporal_filter_apply_sse2 | arg
+; (unsigned char *frame1, | 0 (source rows; pointer advances by stride)
+; unsigned int stride, | 1 (added to the frame1 pointer after each row load)
+; unsigned char *frame2, | 2 (predictor; read 16 contiguous bytes per pass)
+; unsigned int block_width, | 3 (tested against 8 and 16 to pick the load path)
+; unsigned int block_height, | 4
+; int strength, | 5 (shift count applied to the modifier)
+; int filter_weight, | 6 (multiplier applied to the modifier)
+; unsigned int *accumulator, | 7 (16 dwords updated per pass)
+; unsigned short *count) | 8 (16 words updated per pass)
+global sym(av1_temporal_filter_apply_sse2) PRIVATE
+sym(av1_temporal_filter_apply_sse2):
+
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ALIGN_STACK 16, rax
+ %define block_width 0
+ %define block_height 16
+ %define strength 32
+ %define filter_weight 48
+ %define rounding_bit 64
+ %define rbp_backup 80
+ %define stack_size 96
+ sub rsp, stack_size
+ mov [rsp + rbp_backup], rbp
+ ; end prolog (the %defines above are offsets from rsp, spaced 16 bytes apart)
+
+ mov edx, arg(3) ; block_width
+ mov [rsp + block_width], rdx
+ mov edx, arg(4) ; block_height
+ mov [rsp + block_height], rdx
+ movd xmm6, arg(5) ; strength
+ movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+ ; calculate the rounding bit outside the loop
+ ; 0x8000 >> (16 - strength)
+ mov rdx, 16
+ sub rdx, arg(5) ; 16 - strength
+ movq xmm4, rdx ; can't use rdx w/ shift
+ movdqa xmm5, [GLOBAL(_const_top_bit)] ; eight words of 0x8000
+ psrlw xmm5, xmm4 ; every word becomes 0x8000 >> (16 - strength)
+ movdqa [rsp + rounding_bit], xmm5
+
+ mov rsi, arg(0) ; src/frame1
+ mov rdx, arg(2) ; predictor frame
+ mov rdi, arg(7) ; accumulator
+ mov rax, arg(8) ; count
+
+ ; dup the filter weight and store for later
+ movd xmm0, arg(6) ; filter_weight
+ pshuflw xmm0, xmm0, 0 ; copy word 0 into words 0-3
+ punpcklwd xmm0, xmm0 ; spread the low four words across all eight
+ movdqa [rsp + filter_weight], xmm0
+
+ mov rbp, arg(1) ; stride
+ pxor xmm7, xmm7 ; zero for extraction
+
+ mov rcx, [rsp + block_width] ; rcx = block_width
+ imul rcx, [rsp + block_height] ; rcx = block_width * block_height
+ add rcx, rdx ; rcx = predictor end pointer; the loop exits when rdx reaches it
+ cmp dword ptr [rsp + block_width], 8
+ jne .temporal_filter_apply_load_16 ; any width other than 8 takes the 16-wide load
+
+.temporal_filter_apply_load_8:
+ movq xmm0, [rsi] ; first row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ movq xmm1, [rsi] ; second row
+ lea rsi, [rsi + rbp] ; += stride
+ punpcklbw xmm1, xmm7 ; src[ 8-15]
+ jmp .temporal_filter_apply_load_finished
+
+.temporal_filter_apply_load_16:
+ movdqa xmm0, [rsi] ; src (frame1), aligned 16-byte load
+ lea rsi, [rsi + rbp] ; += stride
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; src[ 0- 7]
+ punpckhbw xmm1, xmm7 ; src[ 8-15]
+
+.temporal_filter_apply_load_finished:
+ movdqa xmm2, [rdx] ; predictor (frame2), aligned 16-byte load
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm3, xmm7 ; pred[ 8-15]
+
+ ; modifier = src_byte - pixel_value
+ psubw xmm0, xmm2 ; src - pred[ 0- 7]
+ psubw xmm1, xmm3 ; src - pred[ 8-15]
+
+ ; modifier *= modifier
+ pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
+ pmullw xmm1, xmm1 ; modifier[ 8-15]^2
+
+ ; modifier *= 3
+ pmullw xmm0, [GLOBAL(_const_3w)]
+ pmullw xmm1, [GLOBAL(_const_3w)]
+
+ ; modifier += 0x8000 >> (16 - strength)
+ paddw xmm0, [rsp + rounding_bit]
+ paddw xmm1, [rsp + rounding_bit]
+
+ ; modifier >>= strength
+ psrlw xmm0, [rsp + strength]
+ psrlw xmm1, [rsp + strength]
+
+ ; modifier = 16 - modifier
+ ; saturation takes care of modifier > 16
+ movdqa xmm3, [GLOBAL(_const_16w)]
+ movdqa xmm2, [GLOBAL(_const_16w)]
+ psubusw xmm3, xmm1 ; xmm3 = modifier for pixels 8-15
+ psubusw xmm2, xmm0 ; xmm2 = modifier for pixels 0-7
+
+ ; modifier *= filter_weight
+ pmullw xmm2, [rsp + filter_weight]
+ pmullw xmm3, [rsp + filter_weight]
+
+ ; count
+ movdqa xmm4, [rax]
+ movdqa xmm5, [rax+16]
+ ; += modifier
+ paddw xmm4, xmm2
+ paddw xmm5, xmm3
+ ; write back
+ movdqa [rax], xmm4
+ movdqa [rax+16], xmm5
+ lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
+
+ ; load and extract the predictor up to shorts
+ pxor xmm7, xmm7 ; zero for extraction
+ movdqa xmm0, [rdx]
+ lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
+ movdqa xmm1, xmm0
+ punpcklbw xmm0, xmm7 ; pred[ 0- 7]
+ punpckhbw xmm1, xmm7 ; pred[ 8-15]
+
+ ; modifier *= pixel_value
+ pmullw xmm0, xmm2
+ pmullw xmm1, xmm3
+
+ ; expand to double words
+ movdqa xmm2, xmm0
+ punpcklwd xmm0, xmm7 ; [ 0- 3]
+ punpckhwd xmm2, xmm7 ; [ 4- 7]
+ movdqa xmm3, xmm1
+ punpcklwd xmm1, xmm7 ; [ 8-11]
+ punpckhwd xmm3, xmm7 ; [12-15]
+
+ ; accumulator
+ movdqa xmm4, [rdi]
+ movdqa xmm5, [rdi+16]
+ movdqa xmm6, [rdi+32]
+ movdqa xmm7, [rdi+48] ; xmm7 stops being zero here; it is cleared again before looping
+ ; += modifier
+ paddd xmm4, xmm0
+ paddd xmm5, xmm2
+ paddd xmm6, xmm1
+ paddd xmm7, xmm3
+ ; write back
+ movdqa [rdi], xmm4
+ movdqa [rdi+16], xmm5
+ movdqa [rdi+32], xmm6
+ movdqa [rdi+48], xmm7
+ lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+ cmp rdx, rcx ; consumed the whole predictor block?
+ je .temporal_filter_apply_epilog
+ pxor xmm7, xmm7 ; zero for extraction
+ cmp dword ptr [rsp + block_width], 16
+ je .temporal_filter_apply_load_16
+ jmp .temporal_filter_apply_load_8 ; any width other than 16 loops through the 8-wide load
+
+.temporal_filter_apply_epilog:
+ ; begin epilog
+ mov rbp, [rsp + rbp_backup] ; restore the frame pointer that was reused for stride
+ add rsp, stack_size
+ pop rsp ; NOTE(review): presumably undoes ALIGN_STACK (macro body not visible here)
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+_const_3w: ; eight words of 3 (modifier scale)
+ times 8 dw 3
+align 16
+_const_top_bit: ; eight words of 0x8000 (shifted down to form the rounding term)
+ times 8 dw 1<<15
+align 16
+_const_16w: ; eight words of 16 (minuend of the saturating subtract)
+ times 8 dw 16
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
new file mode 100644
index 0000000000..2a792f14e6
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_avx2.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * AVX2 version; see av1_wedge_sse_from_residuals_c.
+ *
+ * For each of the N elements this forms t = d * m + r1 * MAX_MASK_VALUE,
+ * saturates t to 16 bits and accumulates t * t in 64-bit lanes. The total is
+ * returned rounded and shifted right by 2 * WEDGE_WEIGHT_BITS.
+ * Sixteen elements are processed per iteration; N must be a multiple of 64.
+ */
+ uint64_t av1_wedge_sse_from_residuals_avx2(const int16_t *r1, const int16_t *d,
+                                            const uint8_t *m, int N) {
+   const __m256i v_mask_max_w = _mm256_set1_epi16(MAX_MASK_VALUE);
+   /* Keeps only the low 32 bits of each 64-bit lane. */
+   const __m256i v_zext_q = yy_set1_64_from_32i(0xffffffff);
+   /* The loop indexes backwards from the end of each array so that a single
+    * counter running from -N up to 0 serves all three. */
+   const int16_t *const r1_end = r1 + N;
+   const int16_t *const d_end = d + N;
+   const uint8_t *const m_end = m + N;
+   __m256i v_acc_q = _mm256_setzero_si256();
+   uint64_t csse;
+   int n = -N;
+
+   assert(N % 64 == 0);
+
+   do {
+     const __m256i v_r_w = _mm256_lddqu_si256((__m256i *)(r1_end + n));
+     const __m256i v_d_w = _mm256_lddqu_si256((__m256i *)(d_end + n));
+     const __m256i v_m_w =
+         _mm256_cvtepu8_epi16(_mm_lddqu_si128((__m128i *)(m_end + n)));
+
+     /* Interleave (d, r1) with (m, MAX_MASK_VALUE) so that one multiply-add
+      * yields d * m + r1 * MAX_MASK_VALUE per 32-bit lane. */
+     const __m256i v_t_lo_d =
+         _mm256_madd_epi16(_mm256_unpacklo_epi16(v_d_w, v_r_w),
+                           _mm256_unpacklo_epi16(v_m_w, v_mask_max_w));
+     const __m256i v_t_hi_d =
+         _mm256_madd_epi16(_mm256_unpackhi_epi16(v_d_w, v_r_w),
+                           _mm256_unpackhi_epi16(v_m_w, v_mask_max_w));
+
+     /* Saturate to 16 bits, then square and add adjacent pairs. */
+     const __m256i v_t_w = _mm256_packs_epi32(v_t_lo_d, v_t_hi_d);
+     const __m256i v_sq_d = _mm256_madd_epi16(v_t_w, v_t_w);
+
+     /* Zero-extend the even and odd 32-bit lanes to 64 bits and add them. */
+     const __m256i v_even_q = _mm256_and_si256(v_sq_d, v_zext_q);
+     const __m256i v_odd_q = _mm256_srli_epi64(v_sq_d, 32);
+     v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_add_epi64(v_even_q, v_odd_q));
+
+     n += 16;
+   } while (n != 0);
+
+   /* Horizontal sum of the four 64-bit lanes into lane 0. */
+   v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_si256(v_acc_q, 8));
+   const __m128i v_total_q = _mm_add_epi64(_mm256_castsi256_si128(v_acc_q),
+                                           _mm256_extracti128_si256(v_acc_q, 1));
+ #if ARCH_X86_64
+   csse = (uint64_t)_mm_extract_epi64(v_total_q, 0);
+ #else
+   xx_storel_64(&csse, v_total_q);
+ #endif
+
+   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+ }
+
+/**
+ * AVX2 version; see av1_wedge_sign_from_residuals_c.
+ *
+ * Accumulates the sum of ds[i] * m[i] over N elements and returns 1 when that
+ * sum is greater than limit, 0 otherwise. Sixty-four elements are processed
+ * per iteration; N must be a non-zero multiple of 64 and below 8192.
+ */
+ int av1_wedge_sign_from_residuals_avx2(const int16_t *ds, const uint8_t *m,
+                                        int N, int64_t limit) {
+   int64_t acc;
+   __m256i v_acc0_d = _mm256_setzero_si256();
+
+   // Input size limited to 8192 by the use of 32 bit accumulators and m
+   // being between [0, 64]. Overflow might happen at larger sizes,
+   // though it is practically impossible on real video input.
+   assert(N < 8192);
+   assert(N % 64 == 0);
+
+   do {
+     const __m256i v_m01_b = _mm256_lddqu_si256((__m256i *)(m));
+     const __m256i v_m23_b = _mm256_lddqu_si256((__m256i *)(m + 32));
+
+     const __m256i v_d0_w = _mm256_lddqu_si256((__m256i *)(ds));
+     const __m256i v_d1_w = _mm256_lddqu_si256((__m256i *)(ds + 16));
+     const __m256i v_d2_w = _mm256_lddqu_si256((__m256i *)(ds + 32));
+     const __m256i v_d3_w = _mm256_lddqu_si256((__m256i *)(ds + 48));
+
+     // Zero-extend each group of 16 mask bytes to 16-bit lanes.
+     const __m256i v_m0_w =
+         _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m01_b));
+     const __m256i v_m1_w =
+         _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m01_b, 1));
+     const __m256i v_m2_w =
+         _mm256_cvtepu8_epi16(_mm256_castsi256_si128(v_m23_b));
+     const __m256i v_m3_w =
+         _mm256_cvtepu8_epi16(_mm256_extracti128_si256(v_m23_b, 1));
+
+     const __m256i v_p0_d = _mm256_madd_epi16(v_d0_w, v_m0_w);
+     const __m256i v_p1_d = _mm256_madd_epi16(v_d1_w, v_m1_w);
+     const __m256i v_p2_d = _mm256_madd_epi16(v_d2_w, v_m2_w);
+     const __m256i v_p3_d = _mm256_madd_epi16(v_d3_w, v_m3_w);
+
+     const __m256i v_p01_d = _mm256_add_epi32(v_p0_d, v_p1_d);
+     const __m256i v_p23_d = _mm256_add_epi32(v_p2_d, v_p3_d);
+
+     const __m256i v_p0123_d = _mm256_add_epi32(v_p01_d, v_p23_d);
+
+     v_acc0_d = _mm256_add_epi32(v_acc0_d, v_p0123_d);
+
+     ds += 64;
+     m += 64;
+
+     N -= 64;
+   } while (N);
+
+   // Sign-extend the 32-bit partial sums to 64 bits by interleaving each lane
+   // with a copy of its sign, then add the two interleaved halves.
+   __m256i v_sign_d = _mm256_srai_epi32(v_acc0_d, 31);
+   v_acc0_d = _mm256_add_epi64(_mm256_unpacklo_epi32(v_acc0_d, v_sign_d),
+                               _mm256_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+   __m256i v_acc_q = _mm256_add_epi64(v_acc0_d, _mm256_srli_si256(v_acc0_d, 8));
+
+   __m128i v_acc_q_0 = _mm256_castsi256_si128(v_acc_q);
+   __m128i v_acc_q_1 = _mm256_extracti128_si256(v_acc_q, 1);
+   v_acc_q_0 = _mm_add_epi64(v_acc_q_0, v_acc_q_1);
+
+ #if ARCH_X86_64
+   // The sum is signed, so keep it signed: routing a negative value through
+   // uint64_t and back to int64_t is an implementation-defined conversion.
+   acc = (int64_t)_mm_extract_epi64(v_acc_q_0, 0);
+ #else
+   xx_storel_64(&acc, v_acc_q_0);
+ #endif
+
+   return acc > limit;
+ }
+
+/**
+ * AVX2 version; see av1_wedge_compute_delta_squares_c.
+ *
+ * Writes d[i] = a[i] * a[i] - b[i] * b[i], saturated to 16 bits, for N
+ * elements. Sixty-four elements are processed per outer iteration; N must be
+ * a non-zero multiple of 64. No alignment is required of a, b or d: loads and
+ * stores are all unaligned.
+ */
+ void av1_wedge_compute_delta_squares_avx2(int16_t *d, const int16_t *a,
+                                           const int16_t *b, int N) {
+   // Every 32-bit lane holds the 16-bit pair (+1, -1): applied with
+   // _mm256_sign_epi16 to interleaved (a, b) pairs it keeps a and negates b.
+   const __m256i v_neg_w = _mm256_set1_epi32((int)0xffff0001);
+
+   assert(N % 64 == 0);
+
+   do {
+     int i;
+     for (i = 0; i < 64; i += 16) {
+       const __m256i v_a_w = _mm256_lddqu_si256((const __m256i *)(a + i));
+       const __m256i v_b_w = _mm256_lddqu_si256((const __m256i *)(b + i));
+
+       const __m256i v_abl_w = _mm256_unpacklo_epi16(v_a_w, v_b_w);
+       const __m256i v_abh_w = _mm256_unpackhi_epi16(v_a_w, v_b_w);
+
+       // Negate top word of pairs
+       const __m256i v_abln_w = _mm256_sign_epi16(v_abl_w, v_neg_w);
+       const __m256i v_abhn_w = _mm256_sign_epi16(v_abh_w, v_neg_w);
+
+       // (a, b) . (a, -b) = a * a - b * b in each 32-bit lane.
+       const __m256i v_rl_d = _mm256_madd_epi16(v_abl_w, v_abln_w);
+       const __m256i v_rh_d = _mm256_madd_epi16(v_abh_w, v_abhn_w);
+
+       // Saturating pack restores the original element order within each
+       // 128-bit half, undoing the unpacklo/unpackhi split.
+       const __m256i v_r_w = _mm256_packs_epi32(v_rl_d, v_rh_d);
+
+       // Unaligned store: the original aligned store faulted on any d that
+       // was not 32-byte aligned even though a and b had no such requirement.
+       _mm256_storeu_si256((__m256i *)(d + i), v_r_w);
+     }
+
+     a += 64;
+     b += 64;
+     d += 64;
+     N -= 64;
+   } while (N);
+ }
diff --git a/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000000..4d2e99f258
--- /dev/null
+++ b/third_party/aom/av1/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom/aom_integer.h"
+
+#include "av1/common/reconinter.h"
+
+#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See av1_wedge_sse_from_residuals_c
+ *
+ * For each i in [0, N) this forms t = d[i] * m[i] + r1[i] * MAX_MASK_VALUE,
+ * saturates t to the int16 range (_mm_packs_epi32) and accumulates t * t in
+ * 64-bit lanes. The value returned is
+ * ROUND_POWER_OF_TWO(total, 2 * WEDGE_WEIGHT_BITS).
+ *
+ * Sixteen elements are consumed per loop iteration. The do/while body always
+ * runs at least once, and N is asserted to be a multiple of 64.
+ */
+uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
+ const uint8_t *m, int N) {
+ int n = -N;  // negative offset that counts up towards 0
+ int n8 = n + 8;  // offset of the second group of eight 16-bit values
+
+ uint64_t csse;
+
+ const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+ const __m128i v_zext_q = xx_set1_64_from_32i(0xffffffff);  // NOTE(review): presumably a low-32-bit mask in each 64-bit lane; helper is not visible here
+
+ __m128i v_acc0_q = _mm_setzero_si128();
+
+ assert(N % 64 == 0);
+
+ r1 += N;  // move each pointer N elements forward so that ptr + n starts at the original base
+ d += N;
+ m += N;
+
+ do {
+ const __m128i v_r0_w = xx_load_128(r1 + n);
+ const __m128i v_r1_w = xx_load_128(r1 + n8);
+ const __m128i v_d0_w = xx_load_128(d + n);
+ const __m128i v_d1_w = xx_load_128(d + n8);
+ const __m128i v_m01_b = xx_load_128(m + n);  // 16 mask bytes, one per element handled this iteration
+
+ const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);  // interleave as (d, r1) word pairs
+ const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+ const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+ const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());  // widen mask bytes to 16 bits
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+ const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);  // interleave as (m, MAX_MASK_VALUE) word pairs
+ const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+ const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+ const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+ const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);  // d * m + r1 * MAX_MASK_VALUE per 32-bit lane
+ const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+ const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+ const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+ const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);  // signed saturation to 16 bits
+ const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+ const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);  // sum of two adjacent squares per 32-bit lane
+ const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+ const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+ _mm_srli_epi64(v_sq0_d, 32));  // add even and odd 32-bit lanes as 64-bit values
+ const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+ _mm_srli_epi64(v_sq1_d, 32));
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+ n8 += 16;
+ n += 16;
+ } while (n);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));  // fold the upper 64-bit lane into the lower one
+
+#if ARCH_X86_64
+ csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+ xx_storel_64(&csse, v_acc0_q);
+#endif
+
+ return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See av1_wedge_sign_from_residuals_c
+ *
+ * Accumulates the sum over i in [0, N) of ds[i] * m[i] and returns 1 when
+ * that sum is strictly greater than limit, 0 otherwise.
+ *
+ * 64 elements are consumed per loop iteration. The do/while body always runs
+ * at least once, so N has to be a positive multiple of 64.
+ */
+int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
+ int N, int64_t limit) {
+ int64_t acc;
+
+ __m128i v_sign_d;
+ __m128i v_acc0_d = _mm_setzero_si128();
+ __m128i v_acc1_d = _mm_setzero_si128();
+ __m128i v_acc_q;
+
+ // Input size limited to 8192 by the use of 32 bit accumulators and m
+ // being between [0, 64]. Overflow might happen at larger sizes,
+ // though it is practically impossible on real video input.
+ assert(N < 8192);
+ assert(N % 64 == 0);
+
+ do {
+ // 64 mask bytes and the matching 64 16-bit residual differences.
+ const __m128i v_m01_b = xx_load_128(m);
+ const __m128i v_m23_b = xx_load_128(m + 16);
+ const __m128i v_m45_b = xx_load_128(m + 32);
+ const __m128i v_m67_b = xx_load_128(m + 48);
+
+ const __m128i v_d0_w = xx_load_128(ds);
+ const __m128i v_d1_w = xx_load_128(ds + 8);
+ const __m128i v_d2_w = xx_load_128(ds + 16);
+ const __m128i v_d3_w = xx_load_128(ds + 24);
+ const __m128i v_d4_w = xx_load_128(ds + 32);
+ const __m128i v_d5_w = xx_load_128(ds + 40);
+ const __m128i v_d6_w = xx_load_128(ds + 48);
+ const __m128i v_d7_w = xx_load_128(ds + 56);
+
+ // Zero-extend the mask bytes to 16 bits so they can feed _mm_madd_epi16.
+ const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+ const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+ const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+ const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+ const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+ // Each 32-bit lane holds ds[2k] * m[2k] + ds[2k + 1] * m[2k + 1].
+ const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+ const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+ const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+ const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+ const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+ const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+ const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+ const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+ const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+ const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+ const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+ const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+ const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+ const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+ v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+ v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+ ds += 64;
+ m += 64;
+
+ N -= 64;
+ } while (N);
+
+ // Sign-extend the 32-bit partial sums to 64 bits: the compare yields
+ // all-ones for negative lanes, which becomes the upper half after the
+ // interleave.
+ v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+ v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+ v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+ v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+ _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+ v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+ // Fold the upper 64-bit lane into the lower one.
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+ // The total is signed; keep it signed instead of passing through uint64_t.
+ acc = (int64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+ xx_storel_64(&acc, v_acc_q);
+#endif
+
+ return acc > limit;
+}
+
+// Masked negate of 16-bit lanes: a lane whose mask is all-ones becomes
+// (~v) + 1, i.e. -v; a lane whose mask is zero is returned unchanged.
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+ const __m128i v_flipped_w = _mm_xor_si128(v_v_w, v_mask_w);
+ return _mm_sub_epi16(v_flipped_w, v_mask_w);
+}
+
+/**
+ * See av1_wedge_compute_delta_squares_c
+ *
+ * For each i in [0, N) stores a[i] * a[i] - b[i] * b[i], saturated to the
+ * int16 range by _mm_packs_epi32, into d[i].
+ *
+ * 32 elements are consumed per loop iteration. The do/while body always runs
+ * at least once, and N is asserted to be a multiple of 64.
+ */
+void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
+ const int16_t *b, int N) {
+ // All-ones in the upper word of every 32-bit pair, zero in the lower word.
+ // -1 is used rather than 0xffff so the constant fits the short parameter.
+ const __m128i v_neg_w = _mm_set_epi16(-1, 0, -1, 0, -1, 0, -1, 0);
+
+ assert(N % 64 == 0);
+
+ do {
+ const __m128i v_a0_w = xx_load_128(a);
+ const __m128i v_b0_w = xx_load_128(b);
+ const __m128i v_a1_w = xx_load_128(a + 8);
+ const __m128i v_b1_w = xx_load_128(b + 8);
+ const __m128i v_a2_w = xx_load_128(a + 16);
+ const __m128i v_b2_w = xx_load_128(b + 16);
+ const __m128i v_a3_w = xx_load_128(a + 24);
+ const __m128i v_b3_w = xx_load_128(b + 24);
+
+ // Interleave as (a, b) word pairs: a in the lower word, b in the upper.
+ const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+ const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+ const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+ const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+ const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+ // Negate top word of pairs, giving (a, -b). The negation is a 16-bit
+ // wrapping one, so b == INT16_MIN stays INT16_MIN.
+ const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+ const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+ const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+ const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+ const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+ const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+ const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+ const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+ // (a, b) madd (a, -b) = a * a - b * b in each 32-bit lane.
+ const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+ const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+ const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+ const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+ const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+ const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+ const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+ const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+ // Narrow back to 16 bits with signed saturation.
+ const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+ const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+ const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+ const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+ xx_store_128(d, v_r0_w);
+ xx_store_128(d + 8, v_r1_w);
+ xx_store_128(d + 16, v_r2_w);
+ xx_store_128(d + 24, v_r3_w);
+
+ a += 32;
+ b += 32;
+ d += 32;
+ N -= 32;
+ } while (N);
+}
diff --git a/third_party/aom/av1/exports_com b/third_party/aom/av1/exports_com
new file mode 100644
index 0000000000..5c8e0e09d1
--- /dev/null
+++ b/third_party/aom/av1/exports_com
@@ -0,0 +1,2 @@
+text aom_read_obu_header_and_size
+text av1_resize_frame420
diff --git a/third_party/aom/av1/exports_dec b/third_party/aom/av1/exports_dec
new file mode 100644
index 0000000000..daabf6766d
--- /dev/null
+++ b/third_party/aom/av1/exports_dec
@@ -0,0 +1,3 @@
+data aom_codec_av1_dx_algo
+text aom_codec_av1_dx
+text av1_add_film_grain
diff --git a/third_party/aom/av1/exports_enc b/third_party/aom/av1/exports_enc
new file mode 100644
index 0000000000..dc4a9eae79
--- /dev/null
+++ b/third_party/aom/av1/exports_enc
@@ -0,0 +1,2 @@
+data aom_codec_av1_cx_algo
+text aom_codec_av1_cx
diff --git a/third_party/aom/av1/exports_test b/third_party/aom/av1/exports_test
new file mode 100644
index 0000000000..dab3775750
--- /dev/null
+++ b/third_party/aom/av1/exports_test
@@ -0,0 +1,2 @@
+text av1_get_fwd_txfm_cfg
+text av1_rtcd